Project Code.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import subprocess
# Install xlrd (required by pandas to read legacy .xls files); invoke pip via the running interpreter
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'xlrd'])
# %matplotlib inline
from scipy.stats import zscore # Outliers
from sklearn.preprocessing import StandardScaler # normalization
from imblearn.over_sampling import RandomOverSampler # Class Imbalance
from sklearn.linear_model import Lasso # Feature Selection
from sklearn.decomposition import PCA # Dimensionality Reduction
from sklearn.model_selection import train_test_split # split train and test data
# models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV # Hyperparameter tuning
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold # Model Evaluation
from sklearn.metrics import balanced_accuracy_score # Model Evaluation
import warnings
import joblib
# Suppress all warnings
warnings.filterwarnings("ignore")
# Data Loading
# pd.read_excel already returns a DataFrame
df_raw = pd.read_excel("TrainDataset2023.xls")
""" DATA PREPARATION """
## Drop unwanted columns (the patient ID is an identifier, not a predictor)
df_raw = df_raw.drop('ID', axis=1)
## Handling missing values
# The dataset encodes missing values as 999; convert them to NaN first
df_raw.replace(999, np.nan, inplace=True)
# Impute remaining NaNs with the rounded column mean
df_raw.fillna(df_raw.mean().round().astype(int), inplace=True)
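# Equivalent sketch (an alternative, not used in this pipeline): sklearn's
# SimpleImputer performs the same mean imputation and can be refit and
# reused on unseen data at inference time.
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy='mean')
# df_raw[df_raw.columns] = imputer.fit_transform(df_raw)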
## Handling outliers using Z-score
# The first two columns are the outcomes; z-score only the feature columns
columns_to_zscore = df_raw.columns[2:]
# Calculate z-scores for the selected columns
z_scores = df_raw[columns_to_zscore].apply(zscore)
# Mask values with |z| >= 3 to NaN; they are re-imputed after scaling.
# The boolean mask covers only the feature columns, so the outcome columns
# are restored from df_raw on the next line.
df_zscore = df_raw[(z_scores > -3) & (z_scores < 3)].copy()
df_zscore[['pCR (outcome)', 'RelapseFreeSurvival (outcome)']] = df_raw[['pCR (outcome)', 'RelapseFreeSurvival (outcome)']]
df = df_zscore
df.head()
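# Alternative sketch (not the approach used above): drop entire rows containing
# any |z| >= 3 value instead of masking individual cells to NaN.
# df_no_outliers = df_raw[(z_scores.abs() < 3).all(axis=1)]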
## Handling categorical variables
# One-hot encode the categorical variables (an optional variant follows below)
categorical_cols = ['ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'Proliferation', 'HistologyType', 'LNStatus', 'TumourStage']
df = pd.get_dummies(df, columns=categorical_cols)
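# Optional variant (a sketch, not used here): drop_first=True removes one dummy
# per variable to avoid perfect collinearity among the encoded columns.
# df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)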
## Scaling / standardization of the data
scaler = StandardScaler()
# Standardize every column except the two outcomes (this includes the one-hot columns)
df[df.columns[2:]] = scaler.fit_transform(df[df.columns[2:]])
# Re-impute the NaNs introduced by the z-score masking, now on the scaled data
df.fillna(df.mean().round().astype(int), inplace=True)
## Splitting data into X, y vectors
data = df.drop(['pCR (outcome)', 'RelapseFreeSurvival (outcome)'], axis=1)
X = data.values
y = df['pCR (outcome)'].values
## Handling Class Imbalances
# OverSampling
# Create a RandomOverSampler instance
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
# Fit and apply the random over-sampling
X_resampled, y_resampled = ros.fit_resample(X, y)
# Convert the resampled dataset into a DataFrame
df_resampled = pd.DataFrame(X_resampled, columns=data.columns)
df_resampled['pCR (outcome)'] = y_resampled
df = df_resampled
X,y=X_resampled, y_resampled
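# Caveat (a sketch of an alternative, not the approach used above): oversampling
# before the train/test split can leak duplicated minority samples into the test
# set and inflate test scores. Splitting first avoids that:
# X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# X_tr, y_tr = ros.fit_resample(X_tr, y_tr)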
""" FEATURE ENGINEERING """
"""Feature Selection"""
# Using Embedded method LASSO
# Assuming 'X' is your feature matrix and 'y' is your target variable
model = Lasso(alpha=0.01) # You may need to tune the alpha parameter
model.fit(X, y)
# Access coefficients and identify non-zero coefficients as selected features
selected_feature_names = df.columns[np.where(model.coef_ != 0)]
# Reframing the data with selected columns
y = df['pCR (outcome)'].values
df=df[selected_feature_names]
X = df.values
FS_list = list(selected_feature_names)
FS_df = pd.DataFrame({'Selected Columns': FS_list})
FS_df.to_csv('selectedColumnsFS_PCR.csv')
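# Sketch (an alternative, not used above): LassoCV chooses alpha by
# cross-validation instead of fixing alpha=0.01 by hand; it would run on the
# full feature matrix before the columns are reduced.
# from sklearn.linear_model import LassoCV
# lasso_cv = LassoCV(cv=5, random_state=42).fit(X, y)
# print('CV-selected alpha:', lasso_cv.alpha_)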
# """ Dimensionality Reduction """
# Using PCA
# Assuming 'X' is your feature matrix
n_components = 10 # Choose the number of components
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)
X=X_pca
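# Optional diagnostic (an addition, not in the original pipeline): check how much
# variance the kept components retain; PCA(n_components=0.95) is a common
# alternative that keeps just enough components for 95% of the variance.
print('Cumulative explained variance:', pca.explained_variance_ratio_.cumsum()[-1])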
# Splitting data into train and test sets (stratified to preserve the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
""" MODEL SELECTION """
model = DecisionTreeClassifier(random_state=42)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best'],
    'max_depth': [30, 45, 50],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2', None]  # 'auto' was removed in scikit-learn 1.3
}
""" HYPERPARAMETER TUNING """
# Create the GridSearchCV object, scored by balanced accuracy
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='balanced_accuracy', n_jobs=-1)
# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)
# Report the best hyperparameters and the corresponding CV score
print("Best Hyperparameters DecisionTreeClassifier:", grid_search.best_params_)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best CV balanced accuracy:", best_score)
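# Quick baseline comparison (an addition, not part of the original tuning): the
# AdaBoostClassifier and LogisticRegression imports above are exercised here
# with the same balanced-accuracy metric, for context only.
for name, clf in [('AdaBoost', AdaBoostClassifier(random_state=42)),
                  ('LogisticRegression', LogisticRegression(max_iter=1000))]:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='balanced_accuracy')
    print(f'{name} CV balanced accuracy: {scores.mean():.4f}')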
""" MODEL EVALUATION - Kfold CROSS VALIDATION """
print("MODEL EVALUATION - Kfold CROSS VALIDATION - DecisionTreeClassifier")
# Define the classifier with the tuned hyperparameters
classifier = DecisionTreeClassifier(random_state=42, **best_params)
# Define the number of folds for cross-validation
num_folds = 5
# Lists to store training and validation accuracies for each fold
train_scores = []
cv_scores = []
# Perform stratified k-fold cross-validation on the training split only,
# so the held-out test set stays unseen
cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
# Iterate over folds
for i, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train), 1):
    # Fold-local names so the X_train/X_test hold-out split is not overwritten
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    # Fit the model on the fold's training data
    classifier.fit(X_tr, y_tr)
    # Training balanced accuracy
    train_preds = classifier.predict(X_tr)
    train_scores.append(balanced_accuracy_score(y_tr, train_preds))
    # Validation balanced accuracy
    cv_preds = classifier.predict(X_val)
    cv_scores.append(balanced_accuracy_score(y_val, cv_preds))
# Plot training vs. validation balanced accuracy per fold
plt.figure(figsize=(8, 6))
# Training accuracies as a line
plt.plot(range(1, num_folds + 1), train_scores, marker='o', linestyle='-', color='green', label='Training Accuracy')
# Validation accuracies as a line
plt.plot(range(1, num_folds + 1), cv_scores, marker='o', linestyle='-', color='blue', alpha=0.7, label='Validation Accuracy')
plt.xlabel('Fold Number')
plt.ylabel('Balanced Accuracy')
plt.title('Training and Validation Performance-DecisionTreeClassifier')
plt.legend()
plt.show()
# Refit on the full training split, then evaluate once on the held-out test set
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
bal_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f'Test Balanced Accuracy Decision Tree: {bal_accuracy:.4f}')
# Persist the trained model
joblib.dump(classifier, 'PCR_model_C4_29.pkl')
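# Sketch (an addition; these extra filenames are illustrative): the fitted scaler
# and PCA transform must be re-applied identically at inference time, so
# persisting them alongside the model keeps the pipeline reproducible.
joblib.dump(scaler, 'PCR_scaler.pkl')
joblib.dump(pca, 'PCR_pca.pkl')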