Project Report.


Project Code.



                import sys
                import subprocess
                # Install xlrd (required by pandas for legacy .xls files) into the running interpreter's environment
                subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'xlrd'])

                import numpy as np
                import pandas as pd
                import matplotlib.pyplot as plt
                import seaborn as sns
                
                # %matplotlib inline
                from scipy.stats import zscore # Outliers
                from sklearn.preprocessing import StandardScaler # normalization
                from imblearn.over_sampling import RandomOverSampler # Class Imbalance
                from sklearn.linear_model import Lasso # Feature Selection
                from sklearn.decomposition import PCA # Dimensionality Reduction
                from sklearn.model_selection import train_test_split # split train and test data
                # models
                from sklearn.tree import DecisionTreeClassifier
                from sklearn.ensemble import AdaBoostClassifier
                from sklearn.linear_model import LogisticRegression
                
                from sklearn.model_selection import GridSearchCV # Hyperparameter tuning
                from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold # Model Evaluation
                from sklearn.metrics import balanced_accuracy_score # Model Evaluation
                import warnings
                import joblib
                
                # Suppress all warnings
                warnings.filterwarnings("ignore")
                # Data Loading
                df_raw = pd.read_excel("TrainDataset2023.xls")  # read_excel already returns a DataFrame
                
                """ DATA PREPARATION """
                
                
                ## drop unwanted columns
                df_raw = df_raw.drop('ID', axis=1)
                
                ## Handling missing values
                # Missing values appear to be encoded as 999; replace them with NaN, then impute with rounded column means
                df_raw.replace(999, np.nan, inplace=True)
                df_raw.fillna(df_raw.mean().round().astype(int), inplace=True)
                
                ## Handling outliers using Z-score

                columns_to_zscore = df_raw.columns[2:]
                # Calculate z-scores for the feature columns and mask values beyond +/-3
                # standard deviations as NaN (imputed again after scaling, below)
                z_scores = df_raw[columns_to_zscore].apply(zscore)
                df_zscore = df_raw.copy()
                df_zscore[columns_to_zscore] = df_raw[columns_to_zscore].where(z_scores.abs() < 3)
                df = df_zscore
                df_zscore.head()
                
                ## Handling categorical variables
                # Categorical variables: 'ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade',
                # 'Proliferation', 'HistologyType', 'LNStatus', 'TumourStage'
                # Converting categorical variables to one-hot encoded vectors
                
                # Perform one-hot encoding for multiple columns
                df = pd.get_dummies(df, columns=['ER', 'PgR', 'HER2','TrippleNegative','ChemoGrade','Proliferation','HistologyType','LNStatus','TumourStage'])
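                # Note (not in the original pipeline): get_dummies keeps every category level;
                # passing drop_first=True would drop one level per variable and avoid
                # perfectly collinear dummy columns.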
                
                ## Scaling / Normalization of data

                scaler = StandardScaler()
                df[df.columns[2:]] = scaler.fit_transform(df[df.columns[2:]])

                # Impute the NaNs left by the outlier mask above (rounded column means)
                df.fillna(df.mean().round().astype(int), inplace=True)
                
                ## Splitting data into X, y vectors

                data = df.drop(['pCR (outcome)', 'RelapseFreeSurvival (outcome)'], axis=1)
                X = data.values
                y = df['pCR (outcome)'].values
                
                ## Handling Class Imbalances
                
                # OverSampling
                
                # Create a RandomOverSampler instance
                ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
                
                # Fit and apply the random over-sampling
                X_resampled, y_resampled = ros.fit_resample(X, y)
                
                # Convert the resampled dataset into a DataFrame
                df_resampled = pd.DataFrame(X_resampled, columns=data.columns)
                df_resampled['pCR (outcome)'] = y_resampled
                
                df = df_resampled
                
                X,y=X_resampled, y_resampled
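                # Note: oversampling before the train/test split lets duplicated minority
                # samples land in both the training and test sets, which can inflate the
                # held-out score; resampling only the training fold would avoid this.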
                
                """ FEATURE ENGINEERING """
                """Feature Selection"""
                
                # Using the embedded method LASSO

                # Fit LASSO on the resampled data; the alpha parameter may need tuning
                model = Lasso(alpha=0.01)
                model.fit(X, y)
                
                # Identify features with non-zero coefficients as the selected features.
                # Index into the feature columns ('data'), not 'df', which also holds the outcome column.
                selected_feature_names = data.columns[model.coef_ != 0]
                
                # Reframe the data with the selected columns only
                y = df['pCR (outcome)'].values
                df = df[selected_feature_names]
                X = df.values
                
                FS_list = list(selected_feature_names)
                FS_df = pd.DataFrame({'Selected Columns': FS_list})
                FS_df.to_csv('selectedColumnsFS_PCR.csv')
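                # A possible alternative (not used here): sklearn.linear_model.LassoCV
                # selects alpha by cross-validation instead of the fixed 0.01 above.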
                
                
                # """ Dimensionality Reduction """
                
                # Using PCA

                # Project the selected features onto the top principal components
                n_components = 10  # number of components retained
                pca = PCA(n_components=n_components)
                X_pca = pca.fit_transform(X)
                X = X_pca
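                # Optional check: how much of the variance the retained components explain
                print("Total explained variance ratio:", pca.explained_variance_ratio_.sum())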
                
                # Splitting data into Train and Test data
                
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                
                
                """ MODEL SELECTION """
                model = DecisionTreeClassifier()
                
                param_grid = {
                    'criterion': ['gini', 'entropy'],
                    'splitter': ['best'],
                    'max_depth': [30, 45, 50],
                    'min_samples_split': [2, 4],
                    'min_samples_leaf': [1, 2],
                    'max_features': ['sqrt', None]  # 'auto' is no longer accepted by recent scikit-learn
                }
                
                """ HYPERPARAMETER TUNING """
                
                # Define the classifier
                classifier = model
                
                # Create the GridSearchCV object
                grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='balanced_accuracy', n_jobs=-1)
                
                # Fit the GridSearchCV object to the training data
                grid_search.fit(X_train, y_train)
                
                # Print the best hyperparameters found
                print("Best Hyperparameters DecisionTreeClassifier:", grid_search.best_params_)
                best_params = grid_search.best_params_
                best_score = grid_search.best_score_
                print("Best CV balanced accuracy:", best_score)
                
                
                """ MODEL EVALUATION - Kfold CROSS VALIDATION """
                print("MODEL EVALUATION - Kfold CROSS VALIDATION - DecisionTreeClassifier")
                # Define the classifier
                classifier = DecisionTreeClassifier(**best_params)
                
                # Define the number of folds for cross-validation
                num_folds = 5
                
                # Lists to store training and validation accuracies for each fold
                train_scores = []
                cv_scores = []
                
                # Perform k-fold cross-validation
                cv = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
                
                # Iterate over folds (fold-local names avoid overwriting the earlier X_train/X_test split)
                for i, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
                    # Split the data into training and validation sets
                    X_tr, X_val = X[train_idx], X[val_idx]
                    y_tr, y_val = y[train_idx], y[val_idx]

                    # Fit the model on the training data
                    classifier.fit(X_tr, y_tr)

                    # Get the training balanced accuracy
                    train_preds = classifier.predict(X_tr)
                    train_accuracy = balanced_accuracy_score(y_tr, train_preds)
                    train_scores.append(train_accuracy)

                    # Get the validation balanced accuracy
                    cv_preds = classifier.predict(X_val)
                    cv_accuracy = balanced_accuracy_score(y_val, cv_preds)
                    cv_scores.append(cv_accuracy)
                
                # Plot the comparison line chart
                plt.figure(figsize=(8, 6))
                
                # Plot the training accuracies as a line
                plt.plot(range(1, num_folds + 1), train_scores, marker='o', linestyle='-', color='green', label='Training Accuracy')
                
                # Plot the validation accuracies as a second line
                plt.plot(range(1, num_folds + 1), cv_scores, color='blue', alpha=0.7, label='Validation Accuracy')
                
                plt.xlabel('Fold Number')
                plt.ylabel('Balanced Accuracy')
                plt.title('Training and Validation Performance-DecisionTreeClassifier')
                plt.legend()
                plt.show()
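                # Summarize cross-validation performance across folds
                print(f'Mean CV balanced accuracy: {np.mean(cv_scores):.4f} '
                      f'(std {np.std(cv_scores):.4f})')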
                # Refit on the full training split before the held-out evaluation, since the
                # cross-validation loop above fit the classifier on folds drawn from all of X
                classifier.fit(X_train, y_train)
                y_pred = classifier.predict(X_test)
                bal_accuracy = balanced_accuracy_score(y_test, y_pred)
                print(f'Test Balanced Accuracy Decision Tree: {bal_accuracy:.4f}')
                joblib.dump(classifier, 'PCR_model_C4_29.pkl')
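
                # A minimal sketch of reusing the saved model (assumes 'X_new' is a hypothetical
                # feature matrix that has been passed through the same preprocessing, LASSO
                # selection, and PCA transform as the training data):
                # loaded_model = joblib.load('PCR_model_C4_29.pkl')
                # predictions = loaded_model.predict(X_new)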