{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to\n",
      "[nltk_data]     C:\\Users\\ameyp\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to\n",
      "[nltk_data]     C:\\Users\\ameyp\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n",
      "[nltk_data] Downloading package omw-1.4 to\n",
      "[nltk_data]     C:\\Users\\ameyp\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package omw-1.4 is already up-to-date!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--- Tokenization ---\n",
      "Whitespace Tokenization: ['Hello', 'there!', \"I'm\", 'testing', 'various', 'tokenization', 'methods:', 'whitespace,', 'punctuation-based,', 'treebank,', 'tweet', '&', 'MWE.']\n",
      "Punctuation-based Tokenization: ['Hello', 'there', 'I', 'm', 'testing', 'various', 'tokenization', 'methods', 'whitespace', 'punctuation', 'based', 'treebank', 'tweet', 'MWE']\n",
      "Treebank Tokenization: ['Hello', 'there', '!', 'I', \"'m\", 'testing', 'various', 'tokenization', 'methods', ':', 'whitespace', ',', 'punctuation-based', ',', 'treebank', ',', 'tweet', '&', 'MWE', '.']\n",
      "Tweet Tokenization: ['Hello', 'there', '!', \"I'm\", 'testing', 'various', 'tokenization', 'methods', ':', 'whitespace', ',', 'punctuation-based', ',', 'treebank', ',', 'tweet', '&', 'MWE', '.']\n",
      "MWE Tokenization: ['Hello', 'there', '!', 'I', \"'m\", 'testing_various', 'tokenization_methods', ':', 'whitespace', ',', 'punctuation-based', ',', 'treebank', ',', 'tweet', '&', 'MWE', '.']\n",
      "\n",
      "--- Stemming ---\n",
      "Porter Stemming: ['hello', 'there', 'i', 'm', 'test', 'variou', 'token', 'method', 'whitespac', 'punctuat', 'base', 'treebank', 'tweet', 'mwe']\n",
      "Snowball Stemming: ['hello', 'there', 'i', 'm', 'test', 'various', 'token', 'method', 'whitespac', 'punctuat', 'base', 'treebank', 'tweet', 'mwe']\n",
      "\n",
      "--- Lemmatization ---\n",
      "Lemmatization: ['Hello', 'there', 'I', 'm', 'testing', 'various', 'tokenization', 'method', 'whitespace', 'punctuation', 'based', 'treebank', 'tweet', 'MWE']\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "from nltk.tokenize import word_tokenize, TreebankWordTokenizer, TweetTokenizer, MWETokenizer, WhitespaceTokenizer, RegexpTokenizer\n",
    "from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer\n",
    "\n",
    "# Download necessary resources\n",
    "nltk.download('punkt')\n",
    "nltk.download('wordnet')\n",
    "nltk.download('omw-1.4')\n",
    "\n",
    "text = \"Hello there! I'm testing various tokenization methods: whitespace, punctuation-based, treebank, tweet & MWE.\"\n",
    "\n",
    "# Tokenizers\n",
    "tokenizers = {\n",
    "    \"Whitespace\": WhitespaceTokenizer(),\n",
    "    \"Punctuation-based\": RegexpTokenizer(r'\\w+'),\n",
    "    \"Treebank\": TreebankWordTokenizer(),\n",
    "    \"Tweet\": TweetTokenizer(),\n",
    "    \"MWE\": MWETokenizer([('testing', 'various'), ('tokenization', 'methods')])\n",
    "}\n",
    "\n",
    "print(\"\\n--- Tokenization ---\")\n",
    "tokens = {}\n",
    "for name, tokenizer in tokenizers.items():\n",
    "    if name == \"MWE\":\n",
    "        tokens[name] = tokenizer.tokenize(word_tokenize(text))\n",
    "    else:\n",
    "        tokens[name] = tokenizer.tokenize(text)\n",
    "    print(f\"{name} Tokenization: {tokens[name]}\")\n",
    "\n",
    "# Use punctuation tokens for stemming and lemmatization\n",
    "base_tokens = tokens[\"Punctuation-based\"]\n",
    "\n",
    "print(\"\\n--- Stemming ---\")\n",
    "for stemmer_name, stemmer in {\n",
    "    \"Porter\": PorterStemmer(),\n",
    "    \"Snowball\": SnowballStemmer(\"english\")\n",
    "}.items():\n",
    "    stems = [stemmer.stem(w) for w in base_tokens]\n",
    "    print(f\"{stemmer_name} Stemming: {stems}\")\n",
    "\n",
    "print(\"\\n--- Lemmatization ---\")\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "lemmatized = [lemmatizer.lemmatize(w) for w in base_tokens]\n",
    "print(f\"Lemmatization: {lemmatized}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}