{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "Assignment 3: Write a program to recognize a document is positive or negative based on polarity\n",
        "words using suitable classification method."
      ],
      "metadata": {
        "id": "93rFRa0BNhai"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import nltk\n",
        "\n",
        "# Check if punkt is installed\n",
        "try:\n",
        "    nltk.data.find('tokenizers/punkt')\n",
        "    print(\"Punkt tokenizer is already installed!\")\n",
        "except LookupError:\n",
        "    print(\"Punkt tokenizer is missing. Downloading it now.\")\n",
        "    nltk.download('punkt')\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "__ArjvaFOFg2",
        "outputId": "6c7b8a42-f772-4589-f58d-31840760e1a1"
      },
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Punkt tokenizer is already installed!\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import nltk\n",
        "\n",
        "# Set the NLTK data path to a specific folder\n",
        "nltk.data.path.append(\"/root/nltk_data\")\n",
        "\n",
        "# Try downloading punkt again\n",
        "nltk.download('punkt')\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1dC12VgJOG_t",
        "outputId": "ec1d2995-b089-40d3-df9e-5041dd6b308c"
      },
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Package punkt is already up-to-date!\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {},
          "execution_count": 23
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install nltk\n",
        "import nltk\n",
        "\n",
        "# Download 'punkt' and 'punkt_tab' datasets\n",
        "nltk.download('punkt')\n",
        "nltk.download('punkt_tab')\n",
        "\n",
        "\n",
        "# ... (rest of the code remains the same)\n",
        "\n",
        "import pandas as pd\n",
        "import nltk\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.metrics import classification_report, accuracy_score\n",
        "\n",
        "# Sample dataset with polarity words (positive/negative sentences)\n",
        "data = {\n",
        "    'text': [\n",
        "        \"I love this product, it's amazing!\",\n",
        "        \"I hate this product, it's terrible.\",\n",
        "        \"The movie was awesome, I enjoyed it!\",\n",
        "        \"It was the worst movie I've ever seen.\",\n",
        "        \"Such a great experience, I'm so happy!\",\n",
        "        \"I will never buy from this store again, very disappointed.\",\n",
        "        \"Fantastic service, I am really pleased.\",\n",
        "        \"Horrible, worst service ever.\",\n",
        "        \"I am thrilled with the results, totally recommend!\",\n",
        "        \"It was an awful experience, very frustrating.\"\n",
        "    ],\n",
        "    'label': [\n",
        "        'positive', 'negative', 'positive', 'negative', 'positive',\n",
        "        'negative', 'positive', 'negative', 'positive', 'negative'\n",
        "    ]\n",
        "}\n",
        "\n",
        "# Create DataFrame\n",
        "df = pd.DataFrame(data)\n",
        "\n",
        "# Step 1: Define polarity words (positive and negative)\n",
        "positive_words = ['love', 'awesome', 'great', 'happy', 'enjoyed', 'fantastic', 'pleased', 'thrilled', 'recommend']\n",
        "negative_words = ['hate', 'terrible', 'worst', 'never', 'disappointed', 'horrible', 'awful', 'frustrating']\n",
        "\n",
        "# Step 2: Function to calculate polarity score for each document\n",
        "def polarity_score(text):\n",
        "    words = nltk.word_tokenize(text.lower())  # Tokenize the text\n",
        "    positive_count = sum(1 for word in words if word in positive_words)\n",
        "    negative_count = sum(1 for word in words if word in negative_words)\n",
        "\n",
        "    # Classify based on word counts\n",
        "    if positive_count > negative_count:\n",
        "        return 'positive'\n",
        "    elif negative_count > positive_count:\n",
        "        return 'negative'\n",
        "    else:\n",
        "        return 'neutral'\n",
        "\n",
        "# Step 3: Apply polarity_score function to the dataset\n",
        "df['predicted_label'] = df['text'].apply(polarity_score)\n",
        "\n",
        "# Step 4: Train a classification model (Naive Bayes for simplicity)\n",
        "X = df['text']\n",
        "y = df['label']\n",
        "\n",
        "# Vectorize the text data\n",
        "vectorizer = CountVectorizer(stop_words='english')\n",
        "X_vec = vectorizer.fit_transform(X)\n",
        "\n",
        "# Train-test split\n",
        "X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.3, random_state=42)\n",
        "\n",
        "# Train Naive Bayes classifier\n",
        "classifier = MultinomialNB()\n",
        "classifier.fit(X_train, y_train)\n",
        "\n",
        "# Step 5: Predict on the test set\n",
        "y_pred = classifier.predict(X_test)\n",
        "\n",
        "# Step 6: Evaluate the classifier\n",
        "print(\"Accuracy:\", accuracy_score(y_test, y_pred))\n",
        "print(\"Classification Report:\")\n",
        "print(classification_report(y_test, y_pred))\n",
        "\n",
        "# Example of classifying a new document\n",
        "new_document = \"I absolutely love this new phone, it's perfect!\"\n",
        "new_document_vec = vectorizer.transform([new_document])\n",
        "prediction = classifier.predict(new_document_vec)\n",
        "print(f\"The sentiment of the document is: {prediction[0]}\")\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "OKeImZ4dNitU",
        "outputId": "37e7aa1b-0cad-4f28-fffa-7385a3d9d22c"
      },
      "execution_count": 27,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (3.9.1)\n",
            "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk) (8.1.8)\n",
            "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk) (1.4.2)\n",
            "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk) (2024.11.6)\n",
            "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk) (4.67.1)\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Package punkt is already up-to-date!\n",
            "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
            "[nltk_data]   Unzipping tokenizers/punkt_tab.zip.\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Accuracy: 0.0\n",
            "Classification Report:\n",
            "              precision    recall  f1-score   support\n",
            "\n",
            "    negative       0.00      0.00      0.00       2.0\n",
            "    positive       0.00      0.00      0.00       1.0\n",
            "\n",
            "    accuracy                           0.00       3.0\n",
            "   macro avg       0.00      0.00      0.00       3.0\n",
            "weighted avg       0.00      0.00      0.00       3.0\n",
            "\n",
            "The sentiment of the document is: positive\n"
          ]
        }
      ]
    }
  ]
}