# loading the test and train datasets that were shuffled and split in the 
# previous R script
test_70original <- read.csv("test_df_70.csv")
train_70original <- read.csv("train_df_70.csv")

sprintf("train dataset is %s by %s", nrow(train_70original), ncol(train_70original))
## [1] "train dataset is 759 by 20568"
sprintf("test dataset is %s by %s", nrow(test_70original), ncol(test_70original))
## [1] "test dataset is 323 by 20568"
# drop columns with more than 15% NAs (based on the train set) and impute the median 
# for columns with less than 15% NAs

t70_dropped <- train_70original %>% dplyr::select(where(~mean(is.na(.)) < 0.15))
dropped_col_indices_70 <- which(!names(train_70original) %in% names(t70_dropped))
test_70 <- test_70original[, -dropped_col_indices_70]

# median imputation! (looping over list(t70_dropped, test_70) would only modify copies,
# so impute each data frame through a helper function and reassign the result)
impute_median <- function(dataset) {
  for (col_name in colnames(dataset)) {
    if (is.numeric(dataset[[col_name]])) {
      dataset[[col_name]][is.na(dataset[[col_name]])] <- median(dataset[[col_name]], na.rm = TRUE)
    }
  }
  dataset
}
t70_dropped <- impute_median(t70_dropped)
test_70 <- impute_median(test_70)
# Clean the data by using caret's nearZeroVar to drop any mRNA columns that have 
# near zero variance! (note that the mRNA data starts at the 37th column of the dataset)
train_70_nzv_cols <- nearZeroVar((t70_dropped[, -c(1:36)]))
train_70_nzv <- t70_dropped[,-train_70_nzv_cols]

# apply the same column drops to the test dataset! 
test_70 <- test_70[,-train_70_nzv_cols]

dim(train_70_nzv)
## [1]   759 19107
# establishing the outcome as a 0 or 1 factor so we can perform a Wilcoxon test 
# on each column, comparing values in the group 0 outcome to values in the group 1 
# outcome. If there is no significant difference between the groups, the column is dropped

train_70_nzv <- train_70_nzv %>% filter(DSS_STATUS != "") # if the outcome is 
# missing, the patient record cannot be used
train_70_nzv$outcome <- factor(train_70_nzv$DSS_STATUS)
train_70_nzv$outcome <- ifelse(train_70_nzv$outcome == "0:ALIVE OR DEAD TUMOR FREE", 0, 1)

test_70$outcome <- factor(test_70$DSS_STATUS)
test_70$outcome <- ifelse(test_70$outcome == "0:ALIVE OR DEAD TUMOR FREE", 0, 1)
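# quick optional sanity check: cross-tabulate the original DSS_STATUS label against the
# recoded 0/1 outcome to confirm the mapping above
table(train_70_nzv$DSS_STATUS, train_70_nzv$outcome, useNA = "ifany")
table(test_70$DSS_STATUS, test_70$outcome, useNA = "ifany")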

gene_importance <- data.frame("Transcript ID" = character(),
                              "P-val" = numeric(), stringsAsFactors = FALSE)

dropped_col_indices_70 <- c()
# head(train_70_nzv[, 37:ncol(train_70_nzv)])
for (i in 37:ncol(train_70_nzv)) {
  # only test numeric columns, and skip the outcome column itself
  if (is.numeric(train_70_nzv[, i]) && colnames(train_70_nzv)[i] != "outcome") {
    # perform the Wilcoxon test comparing the two outcome groups
    alive <- train_70_nzv[train_70_nzv$outcome == 0, i]
    dead <- train_70_nzv[train_70_nzv$outcome == 1, i]
    wilcox_result <- wilcox.test(alive, dead)
    # drop the column if the p-value is not significant
    if (!is.nan(wilcox_result$p.value)) {
      if (wilcox_result$p.value > 0.05) {
        dropped_col_indices_70 <- c(dropped_col_indices_70, i)
      } else {
        row <- data.frame(Transcript.ID = colnames(train_70_nzv)[i],
                          P.val = wilcox_result$p.value)
        gene_importance <- rbind(gene_importance, row)
      }
    }
  }
}
test_70 <- test_70[,-dropped_col_indices_70]
train_70_nzv_filtered <- train_70_nzv[, -dropped_col_indices_70]
# head(train_70_nzv_filtered)
colnames(gene_importance) <- c("gene_ID", "P.val")
#write.csv(gene_importance, file = "~/Desktop/new_quantified_variance_genes_2.csv", row.names = FALSE)
#write.csv(train_70_nzv_filtered, file = "~/Desktop/train_70.csv", row.names = FALSE)
#write.csv(test_70, file = "~/Desktop/test_70.csv", row.names = FALSE)

train_df <- train_70_nzv_filtered
test_df <- test_70
quantified_variance_genes <- gene_importance
# head(train_df)
# head(test_df)
# head(quantified_variance_genes)
# scatterplot for the two most significant genes, colored by mortality
most_critical_genes <- gene_importance[order(gene_importance$`P.val`), ][c(1,2), ]
sub_matrix <- train_70_nzv_filtered[, c("DSS_STATUS", most_critical_genes$gene_ID[1], most_critical_genes$gene_ID[2])]
# head(sub_matrix)
sub_matrix <- sub_matrix[!rowSums(is.na(sub_matrix)),] #drop NAs 
ggplot(sub_matrix, aes(x = !!sym(most_critical_genes$gene_ID[1]), 
                       y = !!sym(most_critical_genes$gene_ID[2]), 
                       colour = DSS_STATUS)) + 
  geom_point(position = position_dodge(width = .3))
## Warning: `position_dodge()` requires non-overlapping x intervals

# Histogram of mortality vs alive patients in dataset
ggplot(train_70_nzv_filtered, aes(x=DSS_STATUS, fill=DSS_STATUS)) + geom_bar() + 
  scale_x_discrete(guide = guide_axis(angle = 60))

# Show the distribution of age by outcome
ggplot(train_70_nzv, aes(x=DSS_STATUS, y=AGE, fill=DSS_STATUS)) + geom_violin() +
  scale_x_discrete(guide = guide_axis(angle = 60))

# stacked bar plot showing mortality by cancer stage 
ggplot(train_70_nzv, aes(x=AJCC_PATHOLOGIC_TUMOR_STAGE, fill=DSS_STATUS)) + geom_bar() + scale_x_discrete(guide = guide_axis(angle = 60))

mRNA_data <- train_70_nzv_filtered
mRNA_data <- mRNA_data %>% filter(SUBTYPE != "")
mRNA_data_numerics <- mRNA_data[, -c(1:28)] # drop the clinical columns, keeping the mRNA values (and the numeric outcome)

distance_matrix <- dist(mRNA_data_numerics, method = "euclidean")
hclust_results <- hclust(distance_matrix, method = "ward.D2")

# Create dendrogram object
BRCA_dendro <- as.dendrogram(hclust_results)
mRNA_data$SUBTYPE <- as.factor(mRNA_data$SUBTYPE)
subtype <- mRNA_data$SUBTYPE
cols <- rainbow_hcl(length(unique(subtype))) # one color per subtype level
col_subtype <- cols[subtype] # assign each sample the color of its subtype
col_subtype <- col_subtype[order.dendrogram(BRCA_dendro)] 

dend <- BRCA_dendro %>% 
  set("leaves_pch", 7) %>%
  set("leaves_cex", 2) %>%
  set("leaves_col", col_subtype) %>%
  set("labels", c()) %>%
  plot(main = "Dendrogram Colored by Subtype")
## Warning in `labels<-.dendrogram`(dend, value = value, ...): The lengths of the
## new labels is shorter than the number of leaves in the dendrogram - labels are
## recycled.
## Warning in rep(new_labels, length.out = leaves_length): 'x' is NULL so the
## result will be NULL
legend("topright", legend = levels(subtype), fill = cols, cex = 0.7)

Now, the same dendrogram, but coloring the leaves by mortality outcome!

mRNA_data$outcome <- as.factor(mRNA_data$outcome)
outcomes <- mRNA_data$outcome
cols <- rainbow_hcl(length(unique(outcomes))) # one color per outcome level
col_outcome <- cols[outcomes] # assign each sample the color of its outcome
col_outcome <- col_outcome[order.dendrogram(BRCA_dendro)] 
dend <- BRCA_dendro %>% 
  set("leaves_pch", 7) %>%
  set("leaves_cex", 2) %>%
  set("leaves_col", col_outcome) %>%
  set("labels", c()) %>%
  plot(main = "Dendrogram Colored by Mortality Outcome")
## Warning in `labels<-.dendrogram`(dend, value = value, ...): The lengths of the
## new labels is shorter than the number of leaves in the dendrogram - labels are
## recycled.
## Warning in rep(new_labels, length.out = leaves_length): 'x' is NULL so the
## result will be NULL
legend("topright", legend = levels(outcomes), fill = cols, cex = 0.7)

# Let's look at the first 40 columns of the data...
colnames(train_df)[1:40]
##  [1] "SUBTYPE"                                   
##  [2] "CANCER_TYPE_ACRONYM"                       
##  [3] "OTHER_PATIENT_ID"                          
##  [4] "AGE"                                       
##  [5] "SEX"                                       
##  [6] "AJCC_PATHOLOGIC_TUMOR_STAGE"               
##  [7] "DAYS_TO_BIRTH"                             
##  [8] "DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS"      
##  [9] "ETHNICITY"                                 
## [10] "FORM_COMPLETION_DATE"                      
## [11] "HISTORY_NEOADJUVANT_TRTYN"                 
## [12] "INFORMED_CONSENT_VERIFIED"                 
## [13] "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"   
## [14] "PATH_M_STAGE"                              
## [15] "PATH_N_STAGE"                              
## [16] "PATH_T_STAGE"                              
## [17] "PRIMARY_LYMPH_NODE_PRESENTATION_ASSESSMENT"
## [18] "PRIOR_DX"                                  
## [19] "RACE"                                      
## [20] "RADIATION_THERAPY"                         
## [21] "IN_PANCANPATHWAYS_FREEZE"                  
## [22] "OS_STATUS"                                 
## [23] "OS_MONTHS"                                 
## [24] "DSS_STATUS"                                
## [25] "DSS_MONTHS"                                
## [26] "DFS_STATUS"                                
## [27] "DFS_MONTHS"                                
## [28] "PFS_STATUS"                                
## [29] "X100130426"                                
## [30] "X100133144"                                
## [31] "UBE2Q2P2"                                  
## [32] "HMGB1P1"                                   
## [33] "X10431"                                    
## [34] "X155060"                                   
## [35] "SSX9P"                                     
## [36] "X317712"                                   
## [37] "EFCAB8"                                    
## [38] "X553137"                                   
## [39] "X729884"                                   
## [40] "EFCAB12"
# columns 29 onward are the mRNA data
train_data = train_df[,29:2311] # only the numerical mRNA features + the outcome as the last column
test_data = test_df[,29:2310] # only the numerical mRNA features
test_outcome = test_df[,2311]
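# a name-based alternative to the hard-coded indices above (a sketch: it assumes, as in
# the column listing above, that the clinical block ends at PFS_STATUS; the _byname
# objects are my own names and should match train_data/test_data)
first_mrna_col <- which(colnames(train_df) == "PFS_STATUS") + 1
train_data_byname <- train_df[, first_mrna_col:ncol(train_df)]       # mRNA features + outcome
test_data_byname  <- test_df[, first_mrna_col:(ncol(test_df) - 1)]   # mRNA features only
test_outcome_byname <- test_df[, ncol(test_df)]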
# A basic function to print the confusion matrix and its respective metrics
print_metrics <- function(Actual_Values, Predicted_Values) {
  # Confusion Matrix
  cm <- table(Predicted_Values, Actual_Values)
  print(cm)
  # Calculate accuracy
  accuracy <- sum(diag(cm)) / sum(cm)*100
  cat("Accuracy: ",  round(accuracy, 1), "%\n")
  
  # Calculate precision
  precision <- cm[4] / (cm[2] + cm[4])*100
  cat("Precision: ",  round(precision, 1), "%\n")
  
  # Calculate sensitivity
  recall <- cm[4] / (cm[3] + cm[4])*100
  cat("Sensitivity: ",  round(recall, 1), "%\n")
  
  # Calculate specificity
  specificity <- cm[1] / (cm[2] + cm[1])*100
  cat("Specificity: ",  round(specificity, 1), "%\n")
  return(cm)
}
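# a tiny illustrative call of print_metrics on made-up toy labels (both vectors are
# hypothetical, just to show the expected input format and output)
toy_actual    <- factor(c(0, 0, 0, 1, 1, 0, 1, 0))
toy_predicted <- factor(c(0, 0, 1, 1, 0, 0, 1, 0))
toy_cm <- print_metrics(toy_actual, toy_predicted)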
# Cross Validation Function for SVM
svm_cross_validation <- function(input_data, C, K) {
  folds <- createFolds(input_data$outcome, k = K)

  outcomecol = ncol(input_data)
  mean_accuracy = list()
  mean_precision = list()
  mean_sensitivity = list()
  mean_specificity = list()
  
  for (c in C) {
    cat('c is', c)
    acc = list()
    prec = list()
    sens = list()
    spec = list()
    for (k in 1:length(folds)) {
      train_train <- input_data[-folds[[k]], ] 
      validation <- input_data[folds[[k]], -outcomecol]
      validation_outcome = input_data[folds[[k]], outcomecol]
      
      cv_svm_model <- svm(outcome ~ ., data = train_train, kernel = "linear", 
                     type = 'C-classification', cost = c, class.weights="inverse")
      cv_svm_pred <- predict(cv_svm_model, validation)
      cv_svm_cm <- table(validation_outcome[1:length(cv_svm_pred)], cv_svm_pred)
      accuracy <- sum(diag(cv_svm_cm)) / sum(cv_svm_cm)
      precision <- cv_svm_cm[4] / (cv_svm_cm[3] + cv_svm_cm[4])
      sensitivity <- cv_svm_cm[4] / (cv_svm_cm[2] + cv_svm_cm[4])
      specificity <- cv_svm_cm[1] / (cv_svm_cm[3] + cv_svm_cm[1])
      acc <- c(acc, accuracy)
      prec <- c(prec, precision)
      sens <- c(sens, sensitivity)
      spec <- c(spec, specificity)
    }
    mean_accuracy <- c(mean_accuracy, mean(unlist(acc)))
    mean_precision <- c(mean_precision, mean(unlist(prec)))
    mean_sensitivity <- c(mean_sensitivity, mean(unlist(sens)))
    mean_specificity <- c(mean_specificity, mean(unlist(spec)))
  }
  # plotting the SVM cross validation results
  print(mean_sensitivity)
  plot(log10(unlist(C)), mean_accuracy, type = "b", frame = FALSE, pch = 19, 
       col = "red", xlab = "Log(C)", ylab = "", ylim = c(0, 1), xlim = c(-3, 2))
  lines(log10(unlist(C)), mean_sensitivity, type = "b", frame = FALSE, pch = 19, 
       col = "blue") 
  lines(log10(unlist(C)), mean_specificity, type = "b", frame = FALSE, pch = 19, 
       col = "green")
  legend("bottomleft", legend = c("Accuracy", "Sensitivity", "Specificity"), 
         col = c("red", "blue", "green"), lty = 1)
}
# Cross Validation for SVM on the train dataset with all remaining mRNA features

C = list(0.001, 0.01, 0.1, 1, 10)
svm_cross_validation(train_data, C, 5)
## c is 0.001
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## c is 0.01
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## c is 0.1
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## c is 1
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## c is 10
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## [[1]]
## [1] 0.2935065
## 
## [[2]]
## [1] 0.6353463
## 
## [[3]]
## [1] 0.4252165
## 
## [[4]]
## [1] 0.2205844
## 
## [[5]]
## [1] 0.2205844
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

# Running SVM with optimal cost value
svm_model <- svm(outcome ~ ., data = train_data, kernel = "linear",
                 type = 'C-classification', cost = 0.01)
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
svm_pred <- predict(svm_model, test_data)
svm_cm = print_metrics(test_outcome, svm_pred)
##                 Actual_Values
## Predicted_Values   0   1
##                0 283  29
##                1  10   1
## Accuracy:  87.9 %
## Precision:  9.1 %
## Sensitivity:  3.3 %
## Specificity:  96.6 %
# note: outcome is numeric here, so randomForest fits a regression forest (hence the warning below)
rf_model <- randomForest(outcome ~ ., data = train_data, na.action=na.roughfix)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
rf_pred <- predict(rf_model, test_data)
rf_pred_binary <- ifelse(rf_pred >= 0.5, 1, 0)
rf_pred_factor <- factor(rf_pred_binary, levels = c(0, 1))
rf_cm = print_metrics(test_outcome, rf_pred_factor)
##                 Actual_Values
## Predicted_Values   0   1
##                0 292  30
##                1   1   0
## Accuracy:  90.4 %
## Precision:  0 %
## Sensitivity:  0 %
## Specificity:  99.7 %
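# The warning above shows randomForest fit a regression because outcome is numeric.
# A rough sketch of the classification variant (the _cls names are my own): converting
# the outcome to a factor makes randomForest grow a classification forest, and predict()
# then returns class labels that can be fed straight into print_metrics.
train_data_cls <- train_data
train_data_cls$outcome <- factor(train_data_cls$outcome, levels = c(0, 1))
rf_model_cls <- randomForest(outcome ~ ., data = train_data_cls, na.action = na.roughfix)
rf_pred_cls <- predict(rf_model_cls, test_data)
rf_cm_cls <- print_metrics(test_outcome, rf_pred_cls)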
# Feature selection based on Wilcoxon Test p-values

# There are 444 features with p-values smaller than 0.005 
significant_features <- quantified_variance_genes %>% filter(P.val< 0.005) 
significant_features <- data.frame(significant_features[-c(17, 303),1])
colnames(significant_features) <- c("gene")

train_data_filtered <- train_df %>% dplyr::select(all_of(significant_features$gene))
test_data_filtered <- test_df %>% dplyr::select(all_of(significant_features$gene))
train_data_filtered$outcome = train_df$outcome
# Running SVM with filtered features
svm_model <- svm(outcome ~ ., data = train_data_filtered, kernel = "linear",
                 type = 'C-classification', cost = 0.01)

svm_pred <- predict(svm_model, test_data_filtered)
svm_cm = print_metrics(test_outcome, svm_pred)
##                 Actual_Values
## Predicted_Values   0   1
##                0 281  30
##                1  12   0
## Accuracy:  87 %
## Precision:  0 %
## Sensitivity:  0 %
## Specificity:  95.9 %
# Upsampling the minority (+) class with SMOTE
outcomecol = ncol(train_data_filtered)
upsample_data <- SMOTE(train_data_filtered[,-outcomecol], 
                       train_data_filtered[,outcomecol], 
                       K = 7, 
                       dup_size=3)
upsampled_train = upsample_data$data
colnames(upsampled_train)[outcomecol] <- "outcome"

print("The data before upsampling:")
## [1] "The data before upsampling:"
table(train_data_filtered$outcome)
## 
##   0   1 
## 686  59
print("The data after upsampling:")
## [1] "The data after upsampling:"
table(upsampled_train$outcome)
## 
##   0   1 
## 686 236
# Cross Validation to find the best C (cost) for SVM
C = list(0.001, 0.01, 0.1, 1, 10, 100)
svm_cross_validation(upsampled_train, C, 5)
## c is 0.001c is 0.01c is 0.1c is 1c is 10c is 100[[1]]
## [1] 0
## 
## [[2]]
## [1] 0.6609929
## 
## [[3]]
## [1] 0.7840426
## 
## [[4]]
## [1] 0.9702128
## 
## [[5]]
## [1] 0.9914894
## 
## [[6]]
## [1] 0.9914894
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

# using one of the best-performing C values from the cross validation above (C = 1),
# training on the whole upsampled train dataset and testing on the test data:
optimal_svm_model <- svm(outcome ~ ., data = upsampled_train, kernel = "linear", 
                   type = 'C-classification', cost = 1, class.weights="inverse")
optimal_svm_pred <- predict(optimal_svm_model, test_data_filtered)
optimal_svm_cm <- print_metrics(test_outcome, optimal_svm_pred)
##                 Actual_Values
## Predicted_Values   0   1
##                0 256  27
##                1  37   3
## Accuracy:  80.2 %
## Precision:  7.5 %
## Sensitivity:  10 %
## Specificity:  87.4 %
# the results show that while the cross-validation sensitivity and accuracy were
# very high, the tuned SVM did not perform well on the test dataset.
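# One likely contributor to this gap: SMOTE was applied to the whole train set before
# the folds were created, so synthetic points derived from validation-fold patients can
# leak into the training folds and inflate the CV estimates. A rough sketch of one way
# to avoid this (the function and variable names here are my own), oversampling only
# the training portion of each fold:
smote_inside_cv <- function(input_data, cost, K = 5) {
  folds <- createFolds(input_data$outcome, k = K)
  outcomecol <- ncol(input_data)
  sens <- c()
  for (k in seq_along(folds)) {
    train_fold <- input_data[-folds[[k]], ]
    valid_fold <- input_data[folds[[k]], ]
    # oversample only the training fold, leaving the validation fold untouched
    sm <- SMOTE(train_fold[, -outcomecol], train_fold[, outcomecol], K = 7, dup_size = 3)
    train_up <- sm$data
    colnames(train_up)[ncol(train_up)] <- "outcome"
    train_up$outcome <- factor(train_up$outcome)
    fit <- svm(outcome ~ ., data = train_up, kernel = "linear",
               type = 'C-classification', cost = cost, class.weights = "inverse")
    pred <- predict(fit, valid_fold[, -outcomecol])
    cm <- table(valid_fold[, outcomecol], pred)
    sens <- c(sens, cm[4] / (cm[2] + cm[4]))
  }
  mean(sens)
}
# e.g. smote_inside_cv(train_data_filtered, cost = 1)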
# fitting the SVM on the whole upsampled train dataset for each C, and validating on the test dataset
folds <- createFolds(upsampled_train$outcome, k = 5)

outcomecol = ncol(upsampled_train)
mean_accuracy = list()
mean_precision = list()
mean_sensitivity = list()
mean_specificity = list()

C = list(0.001, 0.01, 0.05, 0.1, 1, 10)

for (c in C) {
  cat('c is', c)
  acc = list()
  prec = list()
  sens = list()
  spec = list()
  for (k in 1:length(folds)) {
    cv_svm_model <- svm(outcome ~ ., data = upsampled_train, kernel = "linear", 
                   type = 'C-classification', cost = c, class.weights="inverse")
    cv_svm_pred <- predict(cv_svm_model, test_data_filtered)
    cv_svm_cm <- table(test_outcome[1:length(cv_svm_pred)], cv_svm_pred)
    accuracy <- sum(diag(cv_svm_cm)) / sum(cv_svm_cm)
    precision <- cv_svm_cm[4] / (cv_svm_cm[3] + cv_svm_cm[4])
    sensitivity <- cv_svm_cm[4] / (cv_svm_cm[2] + cv_svm_cm[4])
    specificity <- cv_svm_cm[1] / (cv_svm_cm[3] + cv_svm_cm[1])
    acc <- c(acc, accuracy)
    prec <- c(prec, precision)
    sens <- c(sens, sensitivity)
    spec <- c(spec, specificity)
  }
  mean_accuracy <- c(mean_accuracy, mean(unlist(acc)))
  mean_precision <- c(mean_precision, mean(unlist(prec)))
  mean_sensitivity <- c(mean_sensitivity, mean(unlist(sens)))
  mean_specificity <- c(mean_specificity, mean(unlist(spec)))
}
## c is 0.001c is 0.01c is 0.05c is 0.1c is 1c is 10
# plotting the SVM cross validation results
print(mean_sensitivity)
## [[1]]
## [1] 0
## 
## [[2]]
## [1] 0.3333333
## 
## [[3]]
## [1] 0.2666667
## 
## [[4]]
## [1] 0.2333333
## 
## [[5]]
## [1] 0.1
## 
## [[6]]
## [1] 0.06666667
plot(log10(unlist(C)), mean_accuracy, type = "b", frame = FALSE, pch = 19, 
     col = "red", xlab = "Log(C)", ylab="", ylim = c(0, 1), xlim = c(-3, 1))
lines(log10(unlist(C)), mean_sensitivity, type = "b", frame = FALSE, pch = 19, 
     col = "blue") 
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
lines(log10(unlist(C)), mean_specificity, type = "b", frame = FALSE, pch = 19, 
     col = "green")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
legend("bottomleft", legend = c("Accuracy", "Sensitivity", "Specificity"), 
       col = c("red", "blue", "green"), lty = 1)

# Based on the results above, the best C is 0.05:
optimal_svm_model <- svm(outcome ~ ., data = upsampled_train, kernel = "linear", 
                   type = 'C-classification', cost = 0.05, class.weights="inverse")
optimal_svm_pred <- predict(optimal_svm_model, test_data_filtered)
optimal_svm_cm <- print_metrics(test_outcome, optimal_svm_pred)
##                 Actual_Values
## Predicted_Values   0   1
##                0 208  22
##                1  85   8
## Accuracy:  66.9 %
## Precision:  8.6 %
## Sensitivity:  26.7 %
## Specificity:  71 %
# Cross validation for Random Forest
repeat_cv <- trainControl(method='repeatedcv', number=5, repeats=3)
forest <- train(outcome ~ ., 
                data=upsampled_train, 
                method='rf', 
                trControl=repeat_cv)

forest$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.45%
## Confusion matrix:
##     0   1 class.error
## 0 686   0   0.0000000
## 1  41 195   0.1737288
# The results of random forest on train/validation
optimal_rf_cm_train = forest$finalModel$confusion

optimal_rf_cm_train
##     0   1 class.error
## 0 686   0   0.0000000
## 1  41 195   0.1737288
# Compute metrics from the OOB confusion matrix (drop the class.error column first so 
# only the counts remain; rows are actual classes, columns are predicted classes)
optimal_rf_cm_train <- optimal_rf_cm_train[, 1:2]
# Calculate accuracy
accuracy <- sum(diag(optimal_rf_cm_train)) / sum(optimal_rf_cm_train)*100
cat("Accuracy: ",  round(accuracy, 1), "%\n")
## Accuracy:  95.6 %
# Calculate precision
precision <- optimal_rf_cm_train[4] / 
  (optimal_rf_cm_train[3] + optimal_rf_cm_train[4])*100
cat("Precision: ",  round(precision, 1), "%\n")
## Precision:  100 %
# Calculate sensitivity
recall <- optimal_rf_cm_train[4] /
  (optimal_rf_cm_train[2] + optimal_rf_cm_train[4])*100
cat("Sensitivity: ",  round(recall, 1), "%\n")
## Sensitivity:  82.6 %
# Calculate specificity (TN / (TN + FP))
specificity <- optimal_rf_cm_train[1] / 
  (optimal_rf_cm_train[3] + optimal_rf_cm_train[1])*100
cat("Specificity: ",  round(specificity, 1), "%\n")
## Specificity:  100 %
# using the optimal RF configuration on test dataset
optimal_rf_pred <- predict(object=forest, newdata=test_data_filtered)
optimal_rf_cm = print_metrics(test_outcome, optimal_rf_pred)
##                 Actual_Values
## Predicted_Values   0   1
##                0 293  30
##                1   0   0
## Accuracy:  90.7 %
## Precision:  NaN %
## Sensitivity:  0 %
## Specificity:  100 %
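# an optional follow-up sketch (the 0.3 cutoff is arbitrary, chosen only for illustration):
# predicting class probabilities and lowering the cutoff below 0.5 trades some specificity
# for sensitivity on the minority class
rf_probs <- predict(forest, newdata = test_data_filtered, type = "prob")
rf_pred_lowcut <- factor(ifelse(rf_probs[, 2] > 0.3, 1, 0), levels = c(0, 1)) # column 2 = P(outcome 1)
rf_cm_lowcut <- print_metrics(test_outcome, rf_pred_lowcut)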
var_imp = data.frame(forest$finalModel$importance)
var_imp_df <- data.frame(variable = row.names(var_imp), importance = var_imp[, 1])
var_imp2 = as_tibble(var_imp_df[var_imp_df$importance>2,])
# Create a plot of variable importance
var_imp2 %>% arrange(importance) %>% 
  ggplot(aes(x=reorder(variable, importance), y=importance)) + 
  geom_bar(stat='identity') + 
  coord_flip() + 
  xlab('Variables') +
  labs(title='Random forest variable importance') + 
  theme_minimal() + 
  theme(axis.text = element_text(size = 10), 
        axis.title = element_text(size = 15), 
        plot.title = element_text(size = 20))

# using the most important RF features to train the SVM again
var_imp_ind = data.frame(var_imp_df[var_imp_df$importance>1,])
train_data_filtered_rf <- upsampled_train %>% dplyr::select(all_of(var_imp_ind$variable))
test_data_filtered_rf <- test_data_filtered %>% dplyr::select(all_of(var_imp_ind$variable))
train_data_filtered_rf$outcome = upsampled_train$outcome
# fitting the SVM on the RF-selected feature set for each C, and validating on the test dataset
folds <- createFolds(train_data_filtered_rf$outcome, k = 5)

outcomecol = ncol(train_data_filtered_rf)
mean_accuracy = list()
mean_precision = list()
mean_sensitivity = list()
mean_specificity = list()

C = list(0.01, 0.02, 0.03, 0.04, 0.05, 0.1)

for (c in C) {
  cat('c is', c)
  acc = list()
  prec = list()
  sens = list()
  spec = list()
  for (k in 1:length(folds)) {
    cv_svm_model <- svm(outcome ~ ., data = train_data_filtered_rf, kernel = "linear", 
                   type = 'C-classification', cost = c, class.weights="inverse")
    cv_svm_pred <- predict(cv_svm_model, test_data_filtered_rf)
    cv_svm_cm <- table(test_outcome[1:length(cv_svm_pred)], cv_svm_pred)
    accuracy <- sum(diag(cv_svm_cm)) / sum(cv_svm_cm)
    precision <- cv_svm_cm[4] / (cv_svm_cm[3] + cv_svm_cm[4])
    sensitivity <- cv_svm_cm[4] / (cv_svm_cm[2] + cv_svm_cm[4])
    specificity <- cv_svm_cm[1] / (cv_svm_cm[3] + cv_svm_cm[1])
    acc <- c(acc, accuracy)
    prec <- c(prec, precision)
    sens <- c(sens, sensitivity)
    spec <- c(spec, specificity)
  }
  mean_accuracy <- c(mean_accuracy, mean(unlist(acc)))
  mean_precision <- c(mean_precision, mean(unlist(prec)))
  mean_sensitivity <- c(mean_sensitivity, mean(unlist(sens)))
  mean_specificity <- c(mean_specificity, mean(unlist(spec)))
}
## c is 0.01c is 0.02c is 0.03c is 0.04c is 0.05c is 0.1
# plotting the SVM cross validation results
print(mean_sensitivity)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 0.9666667
## 
## [[3]]
## [1] 0.5
## 
## [[4]]
## [1] 0.4
## 
## [[5]]
## [1] 0.4
## 
## [[6]]
## [1] 0.3666667
plot(log10(unlist(C)), mean_accuracy, type = "b", frame = FALSE, pch = 19, 
     col = "red", xlab = "Log(C)", ylab="", ylim = c(0, 1), xlim = c(-2, -1))
lines(log10(unlist(C)), mean_sensitivity, type = "b", frame = FALSE, pch = 19, 
     col = "blue") 
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
lines(log10(unlist(C)), mean_specificity, type = "b", frame = FALSE, pch = 19, 
     col = "green")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
legend("bottomleft", legend = c("Accuracy", "Sensitivity", "Specificity"), 
       col = c("red", "blue", "green"), lty = 1)

# Cross Validation for SVM on the new filtered dataset (rf important features) 

C = list(0.001, 0.01, 0.1, 1, 10, 100)
svm_cross_validation(train_data_filtered_rf, C, 5)
## c is 0.001c is 0.01c is 0.1c is 1c is 10c is 100[[1]]
## [1] 0
## 
## [[2]]
## [1] 0.2
## 
## [[3]]
## [1] 0.7968972
## 
## [[4]]
## [1] 0.864539
## 
## [[5]]
## [1] 0.9407801
## 
## [[6]]
## [1] 0.970656
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

# Based on the results above, the best C is 100:
optimal_rf_svm_model <- svm(outcome ~ ., data = train_data_filtered_rf, kernel = "linear", 
                   type = 'C-classification', cost = 100, class.weights="inverse")
optimal_rf_svm_pred <- predict(optimal_rf_svm_model, test_data_filtered_rf)
optimal_rf_svm_cm <- print_metrics(test_outcome, optimal_rf_svm_pred)
##                 Actual_Values
## Predicted_Values   0   1
##                0 235  25
##                1  58   5
## Accuracy:  74.3 %
## Precision:  7.9 %
## Sensitivity:  16.7 %
## Specificity:  80.2 %
# But from domain knowledge, C = 100 is very high and likely to overfit, so I set C = 0.03,
# as it performed well on the test set before:
optimal_rf_svm_model2 <- svm(outcome ~ ., data = train_data_filtered_rf, kernel = "linear", 
                   type = 'C-classification', cost = 0.03, class.weights="inverse")
optimal_rf_svm_pred2 <- predict(optimal_rf_svm_model2, test_data_filtered_rf)
optimal_rf_svm_cm2 <- print_metrics(test_outcome, optimal_rf_svm_pred2)
##                 Actual_Values
## Predicted_Values   0   1
##                0 174  15
##                1 119  15
## Accuracy:  58.5 %
## Precision:  11.2 %
## Sensitivity:  50 %
## Specificity:  59.4 %
# Applying an LDA model on the original data
# (to see how different numbers of features, selected by variance, affect the model)

# Storing initial train_data under a different name
model_data <- train_data

# defining vector of feature subset sizes
feature_sizes <- c(1000, 900, 800, 700, 600, 500, 400, 300, 200, 100)

# initializing vectors to store performance metrics
accuracy_values <- vector("numeric", length(feature_sizes))
sensitivity_values <- vector("numeric", length(feature_sizes))
specificity_values <- vector("numeric", length(feature_sizes))

# performing 5-fold cross-validation
folds <- createFolds(model_data$outcome, k = 5, list = TRUE, returnTrain = FALSE)

# looping over feature subset sizes
for (i in seq_along(feature_sizes)) {
  size <- feature_sizes[i]
  
  # initializing vectors to store performance metrics for the current fold
  fold_accuracy_values <- vector("numeric", length(folds))
  fold_sensitivity_values <- vector("numeric", length(folds))
  fold_specificity_values <- vector("numeric", length(folds))

  # looping over the folds
  for (j in seq_along(folds)) {
    # sub-setting the dataframe with the current fold
    model_data_subset <- model_data[-folds[[j]], ]
    # computing variance for each feature
    variances <- apply(model_data_subset, 2, var)
    # sorting variances in decreasing order and selecting top features
    top_features <- names(sort(variances, decreasing = TRUE)[1:size])
    # sub-setting the dataframe with top features and response variable
    model_data_subset <- model_data_subset[c(top_features, "outcome")]
    
    # fitting LDA model on current fold
    lda_model <- lda(outcome ~ ., data = model_data_subset)
    
    # making predictions on current fold
    test_preds_LDA <- predict(lda_model, newdata = model_data[folds[[j]], top_features])

    # computing performance metrics for the current fold
    cm <- confusionMatrix(table(model_data[folds[[j]], "outcome"], test_preds_LDA$class))
    dimnames(cm$table) <- list("Actual Values" = c("0", "1"),
                               "Predicted Values" = c("0", "1"))
    accuracy <- sum(diag(cm$table)) / sum(cm$table)
    sensitivity <- cm$table[2,2] / sum(cm$table[2,])
    specificity <- cm$table[1,1] / sum(cm$table[1,])
    
    # storing performance metrics for the current fold
    fold_accuracy_values[j] <- accuracy
    fold_sensitivity_values[j] <- sensitivity
    fold_specificity_values[j] <- specificity
  }

  # computing mean performance metrics across the folds
  accuracy_values[i] <- mean(fold_accuracy_values)
  sensitivity_values[i] <- mean(fold_sensitivity_values)
  specificity_values[i] <- mean(fold_specificity_values)

  # printing performance metrics
  cat("Performance metrics for", size, "features (averaged across 5 folds):\n")
  cat("Accuracy:", round(accuracy_values[i], 3), "\n")
  cat("Sensitivity:", round(sensitivity_values[i], 3), "\n")
  cat("Specificity:", round(specificity_values[i], 3), "\n\n")
}
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 1000 features (averaged across 5 folds):
## Accuracy: 0.852 
## Sensitivity: 0.196 
## Specificity: 0.913
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 900 features (averaged across 5 folds):
## Accuracy: 0.855 
## Sensitivity: 0.308 
## Specificity: 0.905
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 800 features (averaged across 5 folds):
## Accuracy: 0.817 
## Sensitivity: 0.329 
## Specificity: 0.862
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 700 features (averaged across 5 folds):
## Accuracy: 0.765 
## Sensitivity: 0.542 
## Specificity: 0.792
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 600 features (averaged across 5 folds):
## Accuracy: 0.547 
## Sensitivity: 0.5 
## Specificity: 0.552
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 500 features (averaged across 5 folds):
## Accuracy: 0.712 
## Sensitivity: 0.423 
## Specificity: 0.741
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 400 features (averaged across 5 folds):
## Accuracy: 0.823 
## Sensitivity: 0.426 
## Specificity: 0.858
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 300 features (averaged across 5 folds):
## Accuracy: 0.855 
## Sensitivity: 0.33 
## Specificity: 0.903
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 200 features (averaged across 5 folds):
## Accuracy: 0.887 
## Sensitivity: 0.301 
## Specificity: 0.941
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 100 features (averaged across 5 folds):
## Accuracy: 0.888 
## Sensitivity: 0.152 
## Specificity: 0.954
# Storing the final results for plotting
cat("Accuracy values:", paste(round(accuracy_values, 3), collapse = ", "), "\n")
## Accuracy values: 0.852, 0.855, 0.817, 0.765, 0.547, 0.712, 0.823, 0.855, 0.887, 0.888
cat("Sensitivity values:", paste(round(sensitivity_values, 3), collapse = ", "), "\n")
## Sensitivity values: 0.196, 0.308, 0.329, 0.542, 0.5, 0.423, 0.426, 0.33, 0.301, 0.152
cat("Specificity values:", paste(round(specificity_values, 3), collapse = ", "), "\n")
## Specificity values: 0.913, 0.905, 0.862, 0.792, 0.552, 0.741, 0.858, 0.903, 0.941, 0.954
# Plotting the model performance for the 5-fold CV
X <- feature_sizes
plot(X, accuracy_values, type = "b", pch = 19,
     col = "red", xlab = "Number of features", ylim = c(0, 1), xlim = c(0, 1000), ylab=" ")
lines(X, sensitivity_values, type = "b", pch = 19,
     col = "blue")
lines(X, specificity_values, type = "b", pch = 19,
     col = "green")
legend("bottom", legend = c("Accuracy", "Sensitivity", "Specificity"), col = c("red", "blue", "green"), lty = 1)

# Specifying the hold-out test data set
holdtest_data <- test_data
holdtest_data_outcome <- test_outcome

accuracy_values_test <- vector("numeric", length(feature_sizes))
sensitivity_values_test <- vector("numeric", length(feature_sizes))
specificity_values_test <- vector("numeric", length(feature_sizes))

# Looping over the feature subset sizes as before, but evaluating on the hold-out test set
for (i in seq_along(feature_sizes)) {
  size <- feature_sizes[i]
  variances <- apply(model_data, 2, var)
  top_features <- names(sort(variances, decreasing = TRUE)[1:size])
  model_data_subset <- model_data[c(top_features, "outcome")]
  
  folds <- createFolds(model_data_subset$outcome, k = 5, returnTrain = TRUE)
  accs <- c()
  sens <- c()
  specs <- c()
  
  for (j in seq_along(folds)) {
    train_LDA <- model_data_subset[folds[[j]], ]
    lda_model <- lda(outcome ~ ., data = train_LDA)
    
    # making predictions on hold-out test data
    holdtest_preds_LDA <- predict(lda_model, newdata = holdtest_data)
    
    cm <- confusionMatrix(table(holdtest_data_outcome, holdtest_preds_LDA$class))
    dimnames(cm$table) <- list("Actual Values" = c("0", "1"),
                               " Predicted Values" = c("0", "1"))

    accuracy_test <- sum(diag(cm$table)) / sum(cm$table)
    sensitivity_test <- cm$table[2,2] / sum(cm$table[2,])
    specificity_test <- cm$table[1,1] / sum(cm$table[1,])
    accs <- c(accs, accuracy_test)
    sens <- c(sens, sensitivity_test)
    specs <- c(specs, specificity_test)

  }
  
  # Computing average performance metrics over the 5 folds
  accuracy_test <- mean(accs)
  sensitivity_test <- mean(sens)
  specificity_test <- mean(specs)
  
  # storing performance metrics for the cross-validation dataset
  accuracy_values_test[i] <- accuracy_test
  sensitivity_values_test[i] <- sensitivity_test
  specificity_values_test[i] <- specificity_test
  
  # print average performance metrics over the 5 folds
  cat("Average performance metrics for", size, "features (on hold-out test dataset):\n")
  cat("Accuracy:", round(accuracy_test, 3), "\n")
  cat("Sensitivity:", round(sensitivity_test, 3), "\n")
  cat("Specificity:", round(specificity_test, 3), "\n\n")
  
}
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 1000 features (on hold-out test dataset):
## Accuracy: 0.85 
## Sensitivity: 0.187 
## Specificity: 0.917
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 900 features (on hold-out test dataset):
## Accuracy: 0.832 
## Sensitivity: 0.153 
## Specificity: 0.902
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 800 features (on hold-out test dataset):
## Accuracy: 0.805 
## Sensitivity: 0.227 
## Specificity: 0.864
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 700 features (on hold-out test dataset):
## Accuracy: 0.776 
## Sensitivity: 0.28 
## Specificity: 0.827
## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear

## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 600 features (on hold-out test dataset):
## Accuracy: 0.579 
## Sensitivity: 0.467 
## Specificity: 0.59 
## 
## Average performance metrics for 500 features (on hold-out test dataset):
## Accuracy: 0.768 
## Sensitivity: 0.327 
## Specificity: 0.814 
## 
## Average performance metrics for 400 features (on hold-out test dataset):
## Accuracy: 0.817 
## Sensitivity: 0.233 
## Specificity: 0.876 
## 
## Average performance metrics for 300 features (on hold-out test dataset):
## Accuracy: 0.838 
## Sensitivity: 0.18 
## Specificity: 0.906 
## 
## Average performance metrics for 200 features (on hold-out test dataset):
## Accuracy: 0.852 
## Sensitivity: 0.133 
## Specificity: 0.926 
## 
## Average performance metrics for 100 features (on hold-out test dataset):
## Accuracy: 0.871 
## Sensitivity: 0.04 
## Specificity: 0.956
# Storing the final results for different feature sizes in the hold-out test dataset
cat("Accuracy values (test):", paste(round(accuracy_values_test, 3), collapse = ", "), "\n")
## Accuracy values (test): 0.85, 0.832, 0.805, 0.776, 0.579, 0.768, 0.817, 0.838, 0.852, 0.871
cat("Sensitivity values (test):", paste(round(sensitivity_values_test, 3), collapse = ", "), "\n")
## Sensitivity values (test): 0.187, 0.153, 0.227, 0.28, 0.467, 0.327, 0.233, 0.18, 0.133, 0.04
cat("Specificity values (test):", paste(round(specificity_values_test, 3), collapse = ", "), "\n")
## Specificity values (test): 0.917, 0.902, 0.864, 0.827, 0.59, 0.814, 0.876, 0.906, 0.926, 0.956
# Plotting the performance for the hold-out test data
plot(X, accuracy_values_test, type = "b", pch = 19,
     col = "red", xlab = "Number of features", ylim = c(0, 1), xlim = c(0, 1000), ylab=" ")
lines(X, sensitivity_values_test, type = "b", pch = 19,
     col = "blue")
lines(X, specificity_values_test, type = "b", pch = 19,
     col = "green")
legend("left", legend = c("Accuracy", "Sensitivity", "Specificity"), col = c("red", "blue", "green"), lty = 1)

# Trying to see how under-sampling the majority class may influence the LDA model
# with the 100 most variable features (using only 100 features since the sample
# size is reduced due to under-sampling)

# creating indices for the minority class ('1') and majority class ('0')
idx_train_1 <- which(train_df$outcome == 1)
idx_train_0 <- which(train_df$outcome == 0)
idx_test_1 <- which(test_df$outcome == 1)
idx_test_0 <- which(test_df$outcome == 0)

# randomly sampling '0' indices to match the number of '1' indices
idx_train_0_undersampled <- sample(idx_train_0, length(idx_train_1))
idx_test_0_undersampled <- sample(idx_test_0, length(idx_test_1))
# combining the minority and undersampled majority class indices
idx_train_undersampled <- c(idx_train_1, idx_train_0_undersampled)
idx_test_undersampled <- c(idx_test_1, idx_test_0_undersampled)
# creating the undersampled dataframe
train_undersampled <- train_df[idx_train_undersampled, ]
test_undersampled <- test_df[idx_test_undersampled, ]

print("The train data before under-sampling:")
## [1] "The train data before under-sampling:"
table(train_df$outcome)
## 
##   0   1 
## 686  59
print("The train data after under-sampling:")
## [1] "The train data after under-sampling:"
table(train_undersampled$outcome)
## 
##  0  1 
## 59 59
print("The test data before under-sampling:")
## [1] "The test data before under-sampling:"
table(test_df$outcome)
## 
##   0   1 
## 293  30
print("The test data after under-sampling:")
## [1] "The test data after under-sampling:"
table(test_undersampled$outcome)
## 
##  0  1 
## 30 30
train_data_undersampled = train_undersampled[,30:2311] # only the numerical mRNA features + the outcome as the last column 
test_data_undersampled = test_undersampled[,30:2310] # only the numerical mRNA features 
test_outcome_undersampled = test_undersampled[,2311]

# building the model with the 100 features of largest variance
features <- 100
variances <- apply(train_data_undersampled, 2, var)
top_100_features <- names(sort(variances, decreasing = TRUE)[1:features])
train_data_undersampled_subset <- train_data_undersampled[c(top_100_features, "outcome")]
test_data_undersampled_subset <- test_data_undersampled[c(top_100_features)]
lda_model_undersampled <- lda(outcome ~ ., data = train_data_undersampled_subset)
undersampled_preds_LDA <- predict(lda_model_undersampled, newdata = test_data_undersampled_subset)
  
# computing performance metrics
cm <- confusionMatrix(table(test_outcome_undersampled, undersampled_preds_LDA$class))
dimnames(cm$table) <- list("Actual Values" = c("0", "1"),
                           " Predicted Values" = c("0", "1"))

accuracy_undersampled <- sum(diag(cm$table)) / sum(cm$table)
sensitivity_undersampled <- cm$table[2,2] / sum(cm$table[2,])
specificity_undersampled <- cm$table[1,1] / sum(cm$table[1,])

cat("Confusion matrix for 100 features (in undersampled dataset):\n")
## Confusion matrix for 100 features (in undersampled dataset):
print(cm$table)
##               Predicted Values
## Actual Values  0  1
##             0 15 15
##             1 13 17
cat("\n")
cat("Performance metrics for 100 features (in undersampled dataset):\n")
## Performance metrics for 100 features (in undersampled dataset):
cat("Accuracy in undersampled data:", round(accuracy_undersampled, 3), "\n")
## Accuracy in undersampled data: 0.533
cat("Sensitivity in undersampled data:", round(sensitivity_undersampled, 3), "\n")
## Sensitivity in undersampled data: 0.567
cat("Specificity in undersampled data:", round(specificity_undersampled, 3), "\n\n")
## Specificity in undersampled data: 0.5