{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--- Processed Data ---\n",
      "                                                 text  \\\n",
      "0  Natural Language Processing is an exciting fie...   \n",
      "1      Machine learning helps in speech recognition.   \n",
      "2  Deep learning advances AI technologies signifi...   \n",
      "3   TF-IDF is useful for text representation in NLP.   \n",
      "4    Lemmatization reduces words to their base form.   \n",
      "\n",
      "                                      processed_text          label  \\\n",
      "0      natural language processing exciting field ai             AI   \n",
      "1           machine learning help speech recognition             ML   \n",
      "2  deep learning advance ai technology significantly             DL   \n",
      "3              tf idf useful text representation nlp            NLP   \n",
      "4               lemmatization reduces word base form  Preprocessing   \n",
      "\n",
      "   label_encoded  \n",
      "0              0  \n",
      "1              2  \n",
      "2              1  \n",
      "3              3  \n",
      "4              4  \n",
      "\n",
      "--- TF-IDF Representation ---\n",
      "     advance        ai      base      deep  exciting     field      form  \\\n",
      "0  0.000000  0.339393  0.000000  0.000000  0.420669  0.420669  0.000000   \n",
      "1  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
      "2  0.434297  0.350388  0.000000  0.434297  0.000000  0.000000  0.000000   \n",
      "3  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   \n",
      "4  0.000000  0.000000  0.447214  0.000000  0.000000  0.000000  0.447214   \n",
      "\n",
      "       help       idf  language  ...  recognition   reduces  representation  \\\n",
      "0  0.000000  0.000000  0.420669  ...     0.000000  0.000000        0.000000   \n",
      "1  0.463693  0.000000  0.000000  ...     0.463693  0.000000        0.000000   \n",
      "2  0.000000  0.000000  0.000000  ...     0.000000  0.000000        0.000000   \n",
      "3  0.000000  0.408248  0.000000  ...     0.000000  0.000000        0.408248   \n",
      "4  0.000000  0.000000  0.000000  ...     0.000000  0.447214        0.000000   \n",
      "\n",
      "   significantly    speech  technology      text        tf    useful      word  \n",
      "0       0.000000  0.000000    0.000000  0.000000  0.000000  0.000000  0.000000  \n",
      "1       0.000000  0.463693    0.000000  0.000000  0.000000  0.000000  0.000000  \n",
      "2       0.434297  0.000000    0.434297  0.000000  0.000000  0.000000  0.000000  \n",
      "3       0.000000  0.000000    0.000000  0.408248  0.408248  0.408248  0.000000  \n",
      "4       0.000000  0.000000    0.000000  0.000000  0.000000  0.000000  0.447214  \n",
      "\n",
      "[5 rows x 26 columns]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     C:\\Users\\ameyp\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     C:\\Users\\ameyp\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to\n",
      "[nltk_data]     C:\\Users\\ameyp\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd, re, nltk, pickle\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Download required resources\n",
    "for res in [\"punkt\", \"stopwords\", \"wordnet\"]:\n",
    "    nltk.download(res)\n",
    "\n",
    "# Sample data\n",
    "df = pd.DataFrame({\n",
    "    \"text\": [\n",
    "        \"Natural Language Processing is an exciting field of AI!\",\n",
    "        \"Machine learning helps in speech recognition.\",\n",
    "        \"Deep learning advances AI technologies significantly.\",\n",
    "        \"TF-IDF is useful for text representation in NLP.\",\n",
    "        \"Lemmatization reduces words to their base form.\"\n",
    "    ],\n",
    "    \"label\": [\"AI\", \"ML\", \"DL\", \"NLP\", \"Preprocessing\"]\n",
    "})\n",
    "\n",
    "# Text cleaning and preprocessing\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "def preprocess(text):\n",
    "    text = re.sub(r\"\\W\", \" \", text.lower())\n",
    "    tokens = word_tokenize(text)\n",
    "    return \" \".join(lemmatizer.lemmatize(w) for w in tokens if w not in stop_words)\n",
    "\n",
    "df[\"processed_text\"] = df[\"text\"].apply(preprocess)\n",
    "\n",
    "# Encode labels\n",
    "label_encoder = LabelEncoder()\n",
    "df[\"label_encoded\"] = label_encoder.fit_transform(df[\"label\"])\n",
    "\n",
    "# TF-IDF vectorization\n",
    "tfidf = TfidfVectorizer()\n",
    "X_tfidf = tfidf.fit_transform(df[\"processed_text\"])\n",
    "tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())\n",
    "\n",
    "# Save data and models\n",
    "df.to_csv(\"processed_text_data.csv\", index=False)\n",
    "tfidf_df.to_csv(\"tfidf_representation.csv\", index=False)\n",
    "with open(\"label_encoder.pkl\", \"wb\") as f: pickle.dump(label_encoder, f)\n",
    "with open(\"tfidf_vectorizer.pkl\", \"wb\") as f: pickle.dump(tfidf, f)\n",
    "\n",
    "# Display results\n",
    "print(\"\\n--- Processed Data ---\\n\", df[[\"text\", \"processed_text\", \"label\", \"label_encoded\"]])\n",
    "print(\"\\n--- TF-IDF Representation ---\\n\", tfidf_df.head())\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
