#######LAB ASSIGNMENT 1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Preprocessing walkthrough on the Titanic data: impute, one-hot encode, split, scale.
# Load dataset (replace with actual path or URL)
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv')
print(df.head())
# Impute numeric columns with their column mean BEFORE deriving X: df.drop() returns a
# new frame, so filling df after X has been carved out would leave the NaNs inside X.
df.fillna(df.mean(numeric_only=True), inplace=True)
print(df.isnull().sum())
# In seaborn's titanic.csv, 'alive' is the yes/no spelling of 'survived'; dropping it
# keeps the target from leaking into the feature matrix.
X = df.drop(['survived', 'alive'], axis=1)
y = df['survived']
# One-hot encode the text columns; drop_first removes one redundant dummy per column.
categorical_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
print(X.head())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the scaler on the training split only and reuse its statistics on the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("X_train_scaled sample:\n", X_train[:5])
print("y_train sample:\n", y_train[:5])




#######LAB ASSIGNMENT 2
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
import graphviz
# Pass/fail classification on student marks: logistic regression, ROC curve, AUC, accuracy.
df = pd.read_csv('student_marksheet.csv')
# Replace missing values in subject marks with the mean of each column
df.fillna(df.mean(numeric_only=True), inplace=True)
# Verify that there are no missing values.
# Printed explicitly: a bare expression displays nothing when this runs as a script.
print(df.isnull().sum())
# Create Binary Target Variable
# Define a pass threshold for 'Total Marks' to classify as Pass (1) or Fail (0)
pass_threshold = 340
df['Pass/Fail'] = (df['Total Marks'] > pass_threshold).astype(int)
# Check the distribution of the target variable
print(df['Pass/Fail'].value_counts())
# Split Data into Features and Target
# Select features and target
features = ['Subject1 Marks', 'Subject2 Marks', 'Subject3 Marks', 'Subject4 Marks']
X = df[features]
y = df['Pass/Fail']
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Train a Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)
# Predict Probabilities, Generate ROC Curve, and Display AUC
# Column 1 of predict_proba is the probability of the positive class (Pass = 1)
y_prob = model.predict_proba(X_test)[:, 1]
# Calculate ROC curve and AUC (the threshold array is not needed here)
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
# Plot the ROC curve against the diagonal of a random classifier
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid()
plt.show()
# Print the AUC value
print(f'Area Under the Curve (AUC): {roc_auc:.2f}')
# Calculate Model Accuracy
# Predict class labels
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy:.2f}')




########LAB ASSIGNMENT 3
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, roc_curve, auc
# Decision-tree classifier on the diabetes data, tuned with a cross-validated grid search.
# Load the dataset
df = pd.read_csv("diabetes.csv")
# Features and target
X = df[["Glucose", "BloodPressure", "BMI", "Age"]]
y = df["Outcome"]
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Hyper-parameter grid to search
param_grid = {
    'max_depth': [3, 5, 10, 12],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
}
# Initialize the Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)
# 5-fold grid search scored by accuracy, using all available cores
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
# GridSearchCV (refit=True by default) has already refit a clone of clf with the best
# parameters on the whole training split, so reuse it instead of training a second time.
best_params = grid_search.best_params_
best_clf = grid_search.best_estimator_
# Evaluate the model on the held-out split
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Best Parameters: {best_params}")
print(f"Test Accuracy: {accuracy:.4f}")
# Plot the decision tree.
# feature_names is passed as a plain list: some scikit-learn releases validate this
# argument as a list and reject a pandas Index.
plt.figure(figsize=(12, 8))
plot_tree(best_clf, feature_names=list(X.columns), class_names=["No Diabetes", "Diabetes"], filled=True, rounded=True)
plt.show()
# Compute ROC curve and AUC from the positive-class probabilities
y_score = best_clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# Plot ROC curve against the diagonal of a random classifier
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()




########LAB ASSIGNMENT 4
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import SVC
# RBF-kernel SVM on the breast-cancer data, drawn as a 2-D decision boundary.
cancer = load_breast_cancer()
# Keep only the first two feature columns so the boundary can be plotted in a plane.
X, y = cancer.data[:, :2], cancer.target
x_axis_name, y_axis_name = cancer.feature_names[:2]
# Build and train the model in one step (fit returns the estimator itself).
svm = SVC(kernel="rbf", gamma=0.5, C=1.0).fit(X, y)
# Shade the plane by the class the trained model predicts at each point.
boundary_options = {
    "response_method": "predict",
    "cmap": plt.cm.Spectral,
    "alpha": 0.8,
    "xlabel": x_axis_name,
    "ylabel": y_axis_name,
}
DecisionBoundaryDisplay.from_estimator(svm, X, **boundary_options)
# Overlay the samples, coloured by their true label.
plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolors="k")
plt.show()




########LAB ASSIGNMENT 5
# Step 1: Importing Libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Random-forest classification of the Iris dataset with a held-out evaluation.
# Step 2: Load the Iris dataset
iris = load_iris()
data = pd.DataFrame(data=iris.data, columns=iris.feature_names)
data['target'] = iris.target
# Step 3: Displaying first 5 rows (printed: a bare expression shows nothing in a script)
print(data.head())
# Step 4: Independent (X) and Dependent (y) variables
X = data.iloc[:, :-1]  # All feature columns
y = data.iloc[:, -1]   # Target column
# Step 5: Check and handle missing values.
# Reassign rather than fill in place: X is a slice of `data`, and an in-place fill on a
# slice can trigger pandas' chained-assignment warning.
X = X.fillna(X.mean())
# Step 6: Splitting dataset.
# The split comes before scaling so the test rows cannot influence the scaler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Step 7: Feature scaling -- statistics are learned from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Step 8: Creating Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=42)
# Step 9: Train the model
rf_model.fit(X_train, y_train)
# Step 10: Predict on test data
y_pred = rf_model.predict(X_test)
# Step 11: Performance Analysis
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))




########LAB ASSIGNMENT 6
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN
from sklearn.metrics import silhouette_score
# Compare K-Means, Spectral Clustering and DBSCAN on a synthetic 4-blob dataset.
# Generating synthetic dataset
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=42)
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Visualize data
plt.scatter(X_scaled[:, 0], X_scaled[:, 1])
plt.title("Sample Clustering Dataset")
plt.show()
# K-Means clustering
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)
# Visualization
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=kmeans_labels, cmap='viridis')
plt.title("K-Means Clustering")
plt.show()
# Evaluation
print("K-Means Silhouette Score:", silhouette_score(X_scaled, kmeans_labels))
# Spectral clustering
spectral = SpectralClustering(n_clusters=4, affinity='nearest_neighbors', random_state=42)
spectral_labels = spectral.fit_predict(X_scaled)
# Visualization
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=spectral_labels, cmap='plasma')
plt.title("Spectral Clustering")
plt.show()
# Evaluation
print("Spectral Clustering Silhouette Score:", silhouette_score(X_scaled, spectral_labels))
# DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)
# Visualization
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=dbscan_labels, cmap='rainbow')
plt.title("DBSCAN Clustering")
plt.show()
# Evaluation: DBSCAN marks noise points with the label -1, which are excluded here.
# silhouette_score needs at least two distinct labels, so guard against the case where
# DBSCAN merges everything into one cluster or labels every point as noise.
clustered = dbscan_labels != -1
n_dbscan_clusters = len(set(dbscan_labels[clustered]))
if n_dbscan_clusters >= 2:
    print("DBSCAN Silhouette Score (excluding noise):", silhouette_score(X_scaled[clustered], dbscan_labels[clustered]))
else:
    print(f"DBSCAN found {n_dbscan_clusters} cluster(s); silhouette score needs at least 2, skipping.")




########LAB ASSIGNMENT 7
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report
# Graduate-admission study: linear, logistic, ridge and lasso models, followed by an
# under-/over-fitting sweep over polynomial degree.
from sklearn.preprocessing import PolynomialFeatures


def _print_regression_metrics(heading, actual, predicted):
    """Print *heading*, then the MSE and R2 of *predicted* measured against *actual*."""
    print(heading)
    print("MSE:", mean_squared_error(actual, predicted))
    print("R2 Score:", r2_score(actual, predicted))


# Load Dataset (Graduate Admission Prediction)
url = "https://raw.githubusercontent.com/divyansha1115/Graduate-Admission-Prediction/master/Admission_Predict.csv"
data = pd.read_csv(url)
data.head()

# Data Preprocessing: drop the row identifier, then separate features from the target.
data = data.drop(columns=['Serial No.'])
target_column = "Chance of Admit "  # the trailing space is part of the column key used here
X = data.drop(target_column, axis=1)
y = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Linear Regression
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
y_pred_lin = lin_model.predict(X_test)
_print_regression_metrics("Linear Regression Performance:", y_test, y_pred_lin)

# Linear Regression Visualization - Predicted vs Actual
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred_lin, color='dodgerblue', alpha=0.7)
plt.plot([0, 1], [0, 1], 'r--')  # reference line: a perfect prediction falls on it
plt.xlabel("Actual Chance of Admit")
plt.ylabel("Predicted Chance of Admit")
plt.title("Linear Regression: Actual vs Predicted")
plt.grid(True)
plt.show()

# Convert target into binary classification: 1 when the chance reaches the cutoff, else 0.
admit_cutoff = 0.75
y_class, y_train_class, y_test_class = (
    np.where(part >= admit_cutoff, 1, 0) for part in (y, y_train, y_test)
)

# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train_class)
y_pred_class = log_model.predict(X_test)

# Confusion Matrix Heatmap
class_labels = ['Not Admitted', 'Admitted']
cm = confusion_matrix(y_test_class, y_pred_class)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title("Logistic Regression - Confusion Matrix")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.show()

# Ridge Regression (L2 Regularization)
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
_print_regression_metrics("Ridge Regression Performance:", y_test, y_pred_ridge)

# Lasso Regression (L1 Regularization)
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
_print_regression_metrics("Lasso Regression Performance:", y_test, y_pred_lasso)

# Visualizing Underfitting / Overfitting using Polynomial Degree:
# refit a linear model on polynomial expansions of degree 1..9 and record R2 per split.
degrees = range(1, 10)
train_score, test_score = [], []
for degree in degrees:
    expander = PolynomialFeatures(degree=degree)
    X_poly_train = expander.fit_transform(X_train)
    X_poly_test = expander.transform(X_test)
    model = LinearRegression()
    model.fit(X_poly_train, y_train)
    train_score.append(model.score(X_poly_train, y_train))
    test_score.append(model.score(X_poly_test, y_test))

plt.figure(figsize=(8,6))
plt.plot(degrees, train_score, label='Train Score')
plt.plot(degrees, test_score, label='Test Score')
plt.xlabel('Polynomial Degree')
plt.ylabel('R2 Score')
plt.title('Underfitting vs Overfitting Visualization')
plt.legend()
plt.grid(True)
plt.show()
