{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--- Bag-of-Words ---\n",
      "\n",
      "Count Occurrence:\n",
      "    ai  amazing  and  are  capture  deep  embeddings  helps  idf  in  ...  \\\n",
      "0   0        1    0    0        0     0           0      0    0   0  ...   \n",
      "1   1        0    1    1        0     1           0      0    0   0  ...   \n",
      "2   0        0    0    0        0     1           0      1    0   1  ...   \n",
      "3   0        0    0    0        1     0           1      0    0   0  ...   \n",
      "4   0        0    0    0        0     0           0      1    1   1  ...   \n",
      "\n",
      "   parts  processing  representation  semantic  tasks  text  tf  translation  \\\n",
      "0      0           1               0         0      0     0   0            0   \n",
      "1      1           0               0         0      0     0   0            0   \n",
      "2      0           0               0         0      1     0   0            1   \n",
      "3      0           0               0         1      0     0   0            0   \n",
      "4      0           0               1         0      0     1   1            0   \n",
      "\n",
      "   word  words  \n",
      "0     0      0  \n",
      "1     0      0  \n",
      "2     0      0  \n",
      "3     1      1  \n",
      "4     0      0  \n",
      "\n",
      "[5 rows x 29 columns]\n",
      "\n",
      "Normalized Count:\n",
      "          ai  amazing       and       are   capture      deep  embeddings  \\\n",
      "0  0.000000      0.2  0.000000  0.000000  0.000000  0.000000    0.000000   \n",
      "1  0.111111      0.0  0.111111  0.111111  0.000000  0.111111    0.000000   \n",
      "2  0.000000      0.0  0.000000  0.000000  0.000000  0.125000    0.000000   \n",
      "3  0.000000      0.0  0.000000  0.000000  0.142857  0.000000    0.142857   \n",
      "4  0.000000      0.0  0.000000  0.000000  0.000000  0.000000    0.000000   \n",
      "\n",
      "      helps       idf        in  ...     parts  processing  representation  \\\n",
      "0  0.000000  0.000000  0.000000  ...  0.000000         0.2        0.000000   \n",
      "1  0.000000  0.000000  0.000000  ...  0.111111         0.0        0.000000   \n",
      "2  0.125000  0.000000  0.125000  ...  0.000000         0.0        0.000000   \n",
      "3  0.000000  0.000000  0.000000  ...  0.000000         0.0        0.000000   \n",
      "4  0.166667  0.166667  0.166667  ...  0.000000         0.0        0.166667   \n",
      "\n",
      "   semantic  tasks      text        tf  translation      word     words  \n",
      "0  0.000000  0.000  0.000000  0.000000        0.000  0.000000  0.000000  \n",
      "1  0.000000  0.000  0.000000  0.000000        0.000  0.000000  0.000000  \n",
      "2  0.000000  0.125  0.000000  0.000000        0.125  0.000000  0.000000  \n",
      "3  0.142857  0.000  0.000000  0.000000        0.000  0.142857  0.142857  \n",
      "4  0.000000  0.000  0.166667  0.166667        0.000  0.000000  0.000000  \n",
      "\n",
      "[5 rows x 29 columns]\n",
      "\n",
      "--- TF-IDF ---\n",
      "         ai   amazing       and       are   capture      deep  embeddings  \\\n",
      "0  0.000000  0.447214  0.000000  0.000000  0.000000  0.000000    0.000000   \n",
      "1  0.335097  0.000000  0.335097  0.335097  0.000000  0.270354    0.000000   \n",
      "2  0.000000  0.000000  0.000000  0.000000  0.000000  0.313957    0.000000   \n",
      "3  0.000000  0.000000  0.000000  0.000000  0.387757  0.000000    0.387757   \n",
      "4  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000    0.000000   \n",
      "\n",
      "      helps       idf        in  ...     parts  processing  representation  \\\n",
      "0  0.000000  0.000000  0.000000  ...  0.000000    0.447214        0.000000   \n",
      "1  0.000000  0.000000  0.000000  ...  0.335097    0.000000        0.000000   \n",
      "2  0.313957  0.000000  0.313957  ...  0.000000    0.000000        0.000000   \n",
      "3  0.000000  0.000000  0.000000  ...  0.000000    0.000000        0.000000   \n",
      "4  0.350388  0.434297  0.350388  ...  0.000000    0.000000        0.434297   \n",
      "\n",
      "   semantic     tasks      text        tf  translation      word     words  \n",
      "0  0.000000  0.000000  0.000000  0.000000     0.000000  0.000000  0.000000  \n",
      "1  0.000000  0.000000  0.000000  0.000000     0.000000  0.000000  0.000000  \n",
      "2  0.000000  0.389141  0.000000  0.000000     0.389141  0.000000  0.000000  \n",
      "3  0.387757  0.000000  0.000000  0.000000     0.000000  0.387757  0.387757  \n",
      "4  0.000000  0.000000  0.434297  0.434297     0.000000  0.000000  0.000000  \n",
      "\n",
      "[5 rows x 29 columns]\n",
      "\n",
      "--- Word2Vec Embeddings ---\n",
      "Vector for 'learning':\n",
      "[ 0.07380505 -0.01533471 -0.04536613  0.06554051 -0.0486016  -0.01816018\n",
      "  0.0287658   0.00991874 -0.08285215 -0.09448818]\n",
      "\n",
      "Vector for 'deep':\n",
      "[-0.07510868 -0.00931038  0.09536546 -0.07319412 -0.02332759 -0.01937097\n",
      "  0.08076666 -0.05932511  0.00046473 -0.04753933]\n",
      "\n",
      "Vector for 'natural':\n",
      "[ 0.01218818 -0.08458325 -0.08223945 -0.00231016  0.01237288 -0.05743381\n",
      " -0.04725274 -0.07346074  0.08328615  0.00121298]\n",
      "\n",
      "Vector for 'text':\n",
      "[-0.08534335  0.03207107 -0.04637997 -0.05088955  0.03589618  0.05370339\n",
      "  0.07769515 -0.05766506  0.07433361  0.06625496]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "from gensim.models import Word2Vec\n",
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "# Sample dataset\n",
    "docs = [\n",
    "    \"Natural Language Processing is amazing.\",\n",
    "    \"Machine learning and deep learning are parts of AI.\",\n",
    "    \"Deep learning helps in NLP tasks like translation.\",\n",
    "    \"Word embeddings capture semantic meaning of words.\",\n",
    "    \"TF-IDF helps in text representation.\"\n",
    "]\n",
    "\n",
    "# 1. BoW (Raw + Normalized)\n",
    "print(\"\\n--- Bag-of-Words ---\")\n",
    "vectorizer = CountVectorizer()\n",
    "X_bow = vectorizer.fit_transform(docs).toarray()\n",
    "cols = vectorizer.get_feature_names_out()\n",
    "\n",
    "for label, matrix in [(\"Count Occurrence\", X_bow), \n",
    "                      (\"Normalized Count\", X_bow / X_bow.sum(axis=1, keepdims=True))]:\n",
    "    print(f\"\\n{label}:\\n\", pd.DataFrame(matrix, columns=cols))\n",
    "\n",
    "# 2. TF-IDF\n",
    "print(\"\\n--- TF-IDF ---\")\n",
    "X_tfidf = TfidfVectorizer().fit_transform(docs).toarray()\n",
    "print(pd.DataFrame(X_tfidf, columns=TfidfVectorizer().fit(docs).get_feature_names_out()))\n",
    "\n",
    "# 3. Word2Vec Embeddings\n",
    "print(\"\\n--- Word2Vec Embeddings ---\")\n",
    "tokens = [word_tokenize(doc.lower()) for doc in docs]\n",
    "model = Word2Vec(sentences=tokens, vector_size=10, window=5, min_count=1, workers=4)\n",
    "\n",
    "for word in [\"learning\", \"deep\", \"natural\", \"text\"]:\n",
    "    print(f\"Vector for '{word}':\\n{model.wv[word]}\\n\" if word in model.wv else f\"'{word}' not in vocabulary.\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "      "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
