# loading the test and train datasets that were shuffled and split in the
# previous R script
test_70original <- read.csv("test_df_70.csv")
train_70original <- read.csv("train_df_70.csv")
sprintf("train dataset is %s by %s", nrow(train_70original), ncol(train_70original))
## [1] "train dataset is 759 by 20568"
sprintf("test dataset is %s by %s", nrow(test_70original), ncol(test_70original))
## [1] "test dataset is 323 by 20568"
# drop columns with more than 15% NAs (judged on the train set) and impute the
# column median for the remaining NAs
t70_dropped <- train_70original %>% dplyr::select(where(~mean(is.na(.)) < 0.15))
dropped_col_indices_70 <- which(!names(train_70original) %in% names(t70_dropped))
test_70 <- test_70original[, -dropped_col_indices_70]
# median imputation! (wrapped in a function and reassigned, since modifying the
# loop variable of a for loop in R only changes a copy)
impute_median <- function(dataset) {
  for (col_name in colnames(dataset)) {
    if (is.numeric(dataset[[col_name]])) {
      dataset[[col_name]][is.na(dataset[[col_name]])] <- median(dataset[[col_name]], na.rm = TRUE)
    }
  }
  dataset
}
t70_dropped <- impute_median(t70_dropped)
test_70 <- impute_median(test_70)
# Clean the data by using caret's nearZeroVar to drop any mRNA columns that have
# near-zero variance! (the mRNA data starts at column 37, so the indices returned
# for the subset are shifted back up by 36 before dropping)
train_70_nzv_cols <- nearZeroVar(t70_dropped[, -c(1:36)]) + 36
train_70_nzv <- t70_dropped[, -train_70_nzv_cols]
# apply the same columns to the test dataset!
test_70 <- test_70[, -train_70_nzv_cols]
dim(train_70_nzv)
## [1] 759 19107
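# Not part of the original pipeline: a quick sketch for inspecting *why* columns
# were flagged, using caret's saveMetrics option on the same subset
nzv_metrics <- nearZeroVar(t70_dropped[, -c(1:36)], saveMetrics = TRUE)
head(nzv_metrics[nzv_metrics$nzv, ])  # freqRatio / percentUnique of the flagged genes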
# establishing the outcome as a 0/1 variable so we can perform a Wilcoxon test on
# each column, comparing values in the group-0 outcome to values in the group-1
# outcome. If there is no significant difference, the column should be dropped
train_70_nzv <- train_70_nzv %>% filter(DSS_STATUS != "") # if the outcome is missing,
# the patient record is of no use
train_70_nzv$outcome <- factor(train_70_nzv$DSS_STATUS)
train_70_nzv$outcome <- ifelse(train_70_nzv$outcome == "0:ALIVE OR DEAD TUMOR FREE", 0, 1)
test_70$outcome <- factor(test_70$DSS_STATUS)
test_70$outcome <- ifelse(test_70$outcome == "0:ALIVE OR DEAD TUMOR FREE", 0, 1)
gene_importance <- data.frame(Transcript.ID = character(),
                              P.val = numeric(), stringsAsFactors = FALSE)
dropped_col_indices_70 <- c()
# head(train_70_nzv[, 37:ncol(train_70_nzv)])
for (i in 37:ncol(train_70_nzv)) {
  # only test numeric mRNA columns, and skip the outcome column itself
  if (is.numeric(train_70_nzv[, i]) && colnames(train_70_nzv)[i] != "outcome") {
    # perform Wilcoxon test between the two outcome groups
    alive <- train_70_nzv[train_70_nzv$outcome == 0, i]
    dead <- train_70_nzv[train_70_nzv$outcome == 1, i]
    wilcox_result <- wilcox.test(alive, dead)
    # drop the column if the p-value is not significant
    if (!is.nan(wilcox_result$p.value)) {
      if (wilcox_result$p.value > 0.05) {
        dropped_col_indices_70 <- c(dropped_col_indices_70, i)
      } else {
        gene_importance <- rbind(gene_importance,
                                 data.frame(Transcript.ID = colnames(train_70_nzv)[i],
                                            P.val = wilcox_result$p.value))
      }
    }
  }
}
test_70 <- test_70[,-dropped_col_indices_70]
train_70_nzv_filtered <- train_70_nzv[, -dropped_col_indices_70]
# head(train_70_nzv_filtered)
colnames(gene_importance) <- c("gene_ID", "P.val")
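# Not part of the original pipeline: the 0.05 cutoff above uses raw p-values; a
# minimal sketch of a Benjamini-Hochberg (FDR) adjustment for comparison
gene_importance$P.adj <- p.adjust(as.numeric(gene_importance$P.val), method = "BH")
head(gene_importance[order(gene_importance$P.adj), ])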
#write.csv(gene_importance, file = "~/Desktop/new_quantified_variance_genes_2.csv", row.names = FALSE)
#write.csv(train_70_nzv_filtered, file = "~/Desktop/train_70.csv", row.names = FALSE)
#write.csv(test_70, file = "~/Desktop/test_70.csv", row.names = FALSE)
train_df <- train_70_nzv_filtered
test_df <- test_70
quantified_variance_genes <- gene_importance
# head(train_df)
# head(test_df)
# head(quantified_variance_genes)
# scatterplot for the two most significant genes, colored by mortality
most_critical_genes <- gene_importance[order(gene_importance$`P.val`), ][c(1,2), ]
sub_matrix <- train_70_nzv_filtered[, c("DSS_STATUS", most_critical_genes$gene_ID[1], most_critical_genes$gene_ID[2])]
# head(sub_matrix)
sub_matrix <- sub_matrix[!rowSums(is.na(sub_matrix)),] #drop NAs
ggplot(sub_matrix, aes(x=!!sym(most_critical_genes$gene_ID[1]), y=!!sym(most_critical_genes$gene_ID[2]), colour = DSS_STATUS)) + geom_point(position = position_dodge(width = .3))
## Warning: `position_dodge()` requires non-overlapping x intervals
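# Not part of the original pipeline: position_dodge() expects a discrete x axis,
# hence the warning above; a jittered sketch of the same scatterplot avoids it
ggplot(sub_matrix, aes(x = !!sym(most_critical_genes$gene_ID[1]),
                       y = !!sym(most_critical_genes$gene_ID[2]),
                       colour = DSS_STATUS)) +
  geom_jitter(width = 0.3, height = 0)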
# Bar chart of deceased vs. alive patients in the dataset
ggplot(train_70_nzv_filtered, aes(x=DSS_STATUS, fill=DSS_STATUS)) + geom_bar() +
scale_x_discrete(guide = guide_axis(angle = 60))
# Show the distribution of age by survival status
ggplot(train_70_nzv, aes(x=DSS_STATUS, y=AGE, fill=DSS_STATUS)) + geom_violin() +
scale_x_discrete(guide = guide_axis(angle = 60))
# stacked bar plot showing mortality by cancer stage
ggplot(train_70_nzv, aes(x=AJCC_PATHOLOGIC_TUMOR_STAGE, fill=DSS_STATUS)) + geom_bar() + scale_x_discrete(guide = guide_axis(angle = 60))
mRNA_data <- train_70_nzv_filtered
mRNA_data <- mRNA_data %>% filter(SUBTYPE != "")
mRNA_data_numerics <- mRNA_data[, -c(1:28)]
distance_matrix <- dist(mRNA_data_numerics, method = "euclidean")
hclust_results <- hclust(distance_matrix, method = "ward.D2")
# Create dendrogram object
BRCA_dendro <- as.dendrogram(hclust_results)
mRNA_data$SUBTYPE <- as.factor(mRNA_data$SUBTYPE)
subtype <- mRNA_data$SUBTYPE
cols <- rainbow_hcl(length(unique(subtype))) # one color per subtype
col_subtype <- cols[subtype] # assign the selected colors to the subtype groups
col_subtype <- col_subtype[order.dendrogram(BRCA_dendro)]
dend <- BRCA_dendro %>%
set("leaves_pch", 7) %>%
set("leaves_cex", 2) %>%
set("leaves_col", col_subtype) %>%
set("labels", c()) %>%
plot(main = "Dendrogram Colored by Subtype")
## Warning in `labels<-.dendrogram`(dend, value = value, ...): The lengths of the
## new labels is shorter than the number of leaves in the dendrogram - labels are
## recycled.
## Warning in rep(new_labels, length.out = leaves_length): 'x' is NULL so the
## result will be NULL
legend("topright", legend = levels(subtype), fill = cols, cex = 0.7)
Now, the same thing but labeling by mortality!
mRNA_data$outcome <- as.factor(mRNA_data$outcome)
outcomes <- mRNA_data$outcome
cols <- rainbow_hcl(length(unique(outcomes))) # one color per outcome level
col_outcome <- cols[outcomes] # assign the selected colors to the outcome groups
col_outcome <- col_outcome[order.dendrogram(BRCA_dendro)]
dend <- BRCA_dendro %>%
set("leaves_pch", 7) %>%
set("leaves_cex", 2) %>%
set("leaves_col", col_outcome) %>%
set("labels", c()) %>%
plot(main = "Dendrogram Colored by Mortality Outcome")
## Warning in `labels<-.dendrogram`(dend, value = value, ...): The lengths of the
## new labels is shorter than the number of leaves in the dendrogram - labels are
## recycled.
## Warning in rep(new_labels, length.out = leaves_length): 'x' is NULL so the
## result will be NULL
legend("topright", legend = levels(outcomes), fill = cols, cex = 0.7)
# Let's look at the first 40 columns of the data...
colnames(train_df)[1:40]
## [1] "SUBTYPE"
## [2] "CANCER_TYPE_ACRONYM"
## [3] "OTHER_PATIENT_ID"
## [4] "AGE"
## [5] "SEX"
## [6] "AJCC_PATHOLOGIC_TUMOR_STAGE"
## [7] "DAYS_TO_BIRTH"
## [8] "DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS"
## [9] "ETHNICITY"
## [10] "FORM_COMPLETION_DATE"
## [11] "HISTORY_NEOADJUVANT_TRTYN"
## [12] "INFORMED_CONSENT_VERIFIED"
## [13] "NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT"
## [14] "PATH_M_STAGE"
## [15] "PATH_N_STAGE"
## [16] "PATH_T_STAGE"
## [17] "PRIMARY_LYMPH_NODE_PRESENTATION_ASSESSMENT"
## [18] "PRIOR_DX"
## [19] "RACE"
## [20] "RADIATION_THERAPY"
## [21] "IN_PANCANPATHWAYS_FREEZE"
## [22] "OS_STATUS"
## [23] "OS_MONTHS"
## [24] "DSS_STATUS"
## [25] "DSS_MONTHS"
## [26] "DFS_STATUS"
## [27] "DFS_MONTHS"
## [28] "PFS_STATUS"
## [29] "X100130426"
## [30] "X100133144"
## [31] "UBE2Q2P2"
## [32] "HMGB1P1"
## [33] "X10431"
## [34] "X155060"
## [35] "SSX9P"
## [36] "X317712"
## [37] "EFCAB8"
## [38] "X553137"
## [39] "X729884"
## [40] "EFCAB12"
# columns 29 onward are the mRNA data
train_data = train_df[, 29:2311] # numerical mRNA features + the outcome as the last column
test_data = test_df[, 29:2310] # numerical mRNA features only
test_outcome = test_df[, 2311]
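# Not part of the original pipeline: a sketch that derives the same slices without
# hard-coding 29/2311, assuming PFS_STATUS is the last clinical column and outcome
# is the last column (as in the listing above)
mrna_start <- which(colnames(train_df) == "PFS_STATUS") + 1
outcome_col <- which(colnames(train_df) == "outcome")
train_data <- train_df[, mrna_start:ncol(train_df)]
test_data <- test_df[, mrna_start:(outcome_col - 1)]
test_outcome <- test_df[, outcome_col]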
# A basic function to print the confusion matrix and its respective metrics
print_metrics <- function(Actual_Values, Predicted_Values) {
  # Confusion Matrix
  cm <- table(Predicted_Values, Actual_Values)
  print(cm)
  # Calculate accuracy
  accuracy <- sum(diag(cm)) / sum(cm) * 100
  cat("Accuracy: ", round(accuracy, 1), "%\n")
  # Calculate precision
  precision <- cm[4] / (cm[2] + cm[4]) * 100
  cat("Precision: ", round(precision, 1), "%\n")
  # Calculate sensitivity
  recall <- cm[4] / (cm[3] + cm[4]) * 100
  cat("Sensitivity: ", round(recall, 1), "%\n")
  # Calculate specificity
  specificity <- cm[1] / (cm[2] + cm[1]) * 100
  cat("Specificity: ", round(specificity, 1), "%\n")
  return(cm)
}
# Cross-validation function for SVM
svm_cross_validation <- function(input_data, C, K) {
  folds <- createFolds(input_data$outcome, k = K)
  outcomecol = ncol(input_data)
  mean_accuracy = list()
  mean_precision = list()
  mean_sensitivity = list()
  mean_specificity = list()
  for (c in C) {
    cat('c is', c)
    acc = list()
    prec = list()
    sens = list()
    spec = list()
    for (k in 1:length(folds)) {
      train_train <- input_data[-folds[[k]], ]
      validation <- input_data[folds[[k]], -outcomecol]
      validation_outcome = input_data[folds[[k]], outcomecol]
      cv_svm_model <- svm(outcome ~ ., data = train_train, kernel = "linear",
                          type = 'C-classification', cost = c, class.weights = "inverse")
      cv_svm_pred <- predict(cv_svm_model, validation)
      cv_svm_cm <- table(validation_outcome[1:length(cv_svm_pred)], cv_svm_pred)
      accuracy <- sum(diag(cv_svm_cm)) / sum(cv_svm_cm)
      precision <- cv_svm_cm[4] / (cv_svm_cm[3] + cv_svm_cm[4])
      sensitivity <- cv_svm_cm[4] / (cv_svm_cm[2] + cv_svm_cm[4])
      specificity <- cv_svm_cm[1] / (cv_svm_cm[3] + cv_svm_cm[1])
      acc <- c(acc, accuracy)
      prec <- c(prec, precision)
      sens <- c(sens, sensitivity)
      spec <- c(spec, specificity)
    }
    mean_accuracy <- c(mean_accuracy, mean(unlist(acc)))
    mean_precision <- c(mean_precision, mean(unlist(prec)))
    mean_sensitivity <- c(mean_sensitivity, mean(unlist(sens)))
    mean_specificity <- c(mean_specificity, mean(unlist(spec)))
  }
  # plotting the SVM cross-validation results
  print(mean_sensitivity)
  plot(log10(unlist(C)), mean_accuracy, type = "b", frame = FALSE, pch = 19,
       col = "red", xlab = "Log(C)", ylab = "", ylim = c(0, 1), xlim = c(-3, 2))
  lines(log10(unlist(C)), mean_sensitivity, type = "b", frame = FALSE, pch = 19,
        col = "blue")
  lines(log10(unlist(C)), mean_specificity, type = "b", frame = FALSE, pch = 19,
        col = "green")
  legend("bottomleft", legend = c("Accuracy", "Sensitivity", "Specificity"),
         col = c("red", "blue", "green"), lty = 1)
}
# Cross-validation for SVM on the train dataset (all retained mRNA features)
C = list(0.001, 0.01, 0.1, 1, 10)
svm_cross_validation(train_data, C, 5)
## c is 0.001
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## c is 0.01
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## c is 0.1
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## c is 1
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## c is 10
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'MRGPRG' and 'OR5W2' and 'RBMY1J' and 'RBMY2FP' constant. Cannot
## scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
## [[1]]
## [1] 0.2935065
##
## [[2]]
## [1] 0.6353463
##
## [[3]]
## [1] 0.4252165
##
## [[4]]
## [1] 0.2205844
##
## [[5]]
## [1] 0.2205844
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
# Running SVM with optimal cost value
svm_model <- svm(outcome ~ ., data = train_data, kernel = "linear",
type = 'C-classification', cost = 0.01)
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'OR5W2' constant. Cannot scale data.
svm_pred <- predict(svm_model, test_data)
svm_cm = print_metrics(test_outcome, svm_pred)
## Actual_Values
## Predicted_Values 0 1
## 0 283 29
## 1 10 1
## Accuracy: 87.9 %
## Precision: 9.1 %
## Sensitivity: 3.3 %
## Specificity: 96.6 %
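# Not part of the original pipeline: a cross-check of the hand-computed metrics
# with caret's confusionMatrix (treating "1", the deceased class, as positive)
confusionMatrix(factor(svm_pred, levels = c(0, 1)),
                factor(test_outcome, levels = c(0, 1)), positive = "1")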
rf_model <- randomForest(outcome ~ ., data = train_data, na.action=na.roughfix)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
rf_pred <- predict(rf_model, test_data)
rf_pred_binary <- ifelse(rf_pred >= 0.5, 1, 0)
rf_pred_factor <- factor(rf_pred_binary, levels = c(0, 1))
rf_cm = print_metrics(test_outcome, rf_pred_factor)
## Actual_Values
## Predicted_Values 0 1
## 0 292 30
## 1 1 0
## Accuracy: 90.4 %
## Precision: 0 %
## Sensitivity: 0 %
## Specificity: 99.7 %
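# Not part of the original pipeline: the regression warning above comes from the
# numeric 0/1 outcome; a sketch of the classification variant of the same model
rf_class_model <- randomForest(as.factor(outcome) ~ ., data = train_data,
                               na.action = na.roughfix)
rf_class_pred <- predict(rf_class_model, test_data)  # already a 0/1 factor
rf_class_cm <- print_metrics(test_outcome, rf_class_pred)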
# Feature selection based on Wilcoxon test p-values
# There are 444 features with p-values smaller than 0.005
significant_features <- quantified_variance_genes %>% filter(P.val < 0.005)
significant_features <- data.frame(significant_features[-c(17, 303),1])
colnames(significant_features) <- c("gene")
train_data_filtered <- train_df %>% dplyr::select(significant_features$gene)
test_data_filtered <- test_df %>% dplyr::select(significant_features$gene)
train_data_filtered$outcome = train_df$outcome
# Running SVM with filtered features
svm_model <- svm(outcome ~ ., data = train_data_filtered, kernel = "linear",
type = 'C-classification', cost = 0.01)
svm_pred <- predict(svm_model, test_data_filtered)
svm_cm = print_metrics(test_outcome, svm_pred)
## Actual_Values
## Predicted_Values 0 1
## 0 281 30
## 1 12 0
## Accuracy: 87 %
## Precision: 0 %
## Sensitivity: 0 %
## Specificity: 95.9 %
# Upsampling the minority (+) class
outcomecol = ncol(train_data_filtered)
upsample_data <- SMOTE(train_data_filtered[,-outcomecol],
train_data_filtered[,outcomecol],
K = 7,
dup_size=3)
upsampled_train = upsample_data$data
colnames(upsampled_train)[outcomecol] <- "outcome"
print("The data before upsampling:")
## [1] "The data before upsampling:"
table(train_data_filtered$outcome)
##
## 0 1
## 686 59
print("The data after upsampling:")
## [1] "The data after upsampling:"
table(upsampled_train$outcome)
##
## 0 1
## 686 236
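# Not part of the original pipeline: a sketch of simple random oversampling with
# caret's upSample as an alternative to SMOTE
upsampled_simple <- upSample(x = train_data_filtered[, -outcomecol],
                             y = factor(train_data_filtered[, outcomecol]),
                             yname = "outcome")
table(upsampled_simple$outcome)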
# Cross Validation to find the best C (cost) for SVM
C = list(0.001, 0.01, 0.1, 1, 10, 100)
svm_cross_validation(upsampled_train, C, 5)
## c is 0.001c is 0.01c is 0.1c is 1c is 10c is 100[[1]]
## [1] 0
##
## [[2]]
## [1] 0.6609929
##
## [[3]]
## [1] 0.7840426
##
## [[4]]
## [1] 0.9702128
##
## [[5]]
## [1] 0.9914894
##
## [[6]]
## [1] 0.9914894
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
# training on the whole upsampled train dataset with one of the top C values from
# the cross-validation (cost = 1) and testing on the test data:
optimal_svm_model <- svm(outcome ~ ., data = upsampled_train, kernel = "linear",
type = 'C-classification', cost = 1, class.weights="inverse")
optimal_svm_pred <- predict(optimal_svm_model, test_data_filtered)
optimal_svm_cm <- print_metrics(test_outcome, optimal_svm_pred)
## Actual_Values
## Predicted_Values 0 1
## 0 256 27
## 1 37 3
## Accuracy: 80.2 %
## Precision: 7.5 %
## Sensitivity: 10 %
## Specificity: 87.4 %
# the results show that while the cross-validation sensitivity and accuracy were
# high, the tuned SVM did not perform well on the test dataset.
# sweeping C values: training on the whole upsampled train dataset and validating
# on the held-out test dataset (the folds are created but each iteration fits on
# the full data, so the five repeats per C are identical)
folds <- createFolds(upsampled_train$outcome, k = 5)
outcomecol = ncol(upsampled_train)
mean_accuracy = list()
mean_precision = list()
mean_sensitivity = list()
mean_specificity = list()
C = list(0.001, 0.01, 0.05, 0.1, 1, 10)
for (c in C) {
  cat('c is', c)
  acc = list()
  prec = list()
  sens = list()
  spec = list()
  for (k in 1:length(folds)) {
    cv_svm_model <- svm(outcome ~ ., data = upsampled_train, kernel = "linear",
                        type = 'C-classification', cost = c, class.weights = "inverse")
    cv_svm_pred <- predict(cv_svm_model, test_data_filtered)
    cv_svm_cm <- table(test_outcome[1:length(cv_svm_pred)], cv_svm_pred)
    accuracy <- sum(diag(cv_svm_cm)) / sum(cv_svm_cm)
    precision <- cv_svm_cm[4] / (cv_svm_cm[3] + cv_svm_cm[4])
    sensitivity <- cv_svm_cm[4] / (cv_svm_cm[2] + cv_svm_cm[4])
    specificity <- cv_svm_cm[1] / (cv_svm_cm[3] + cv_svm_cm[1])
    acc <- c(acc, accuracy)
    prec <- c(prec, precision)
    sens <- c(sens, sensitivity)
    spec <- c(spec, specificity)
  }
  mean_accuracy <- c(mean_accuracy, mean(unlist(acc)))
  mean_precision <- c(mean_precision, mean(unlist(prec)))
  mean_sensitivity <- c(mean_sensitivity, mean(unlist(sens)))
  mean_specificity <- c(mean_specificity, mean(unlist(spec)))
}
## c is 0.001c is 0.01c is 0.05c is 0.1c is 1c is 10
# plotting the SVM cross validation results
print(mean_sensitivity)
## [[1]]
## [1] 0
##
## [[2]]
## [1] 0.3333333
##
## [[3]]
## [1] 0.2666667
##
## [[4]]
## [1] 0.2333333
##
## [[5]]
## [1] 0.1
##
## [[6]]
## [1] 0.06666667
plot(log10(unlist(C)), mean_accuracy, type = "b", frame = FALSE, pch = 19,
col = "red", xlab = "Log(C)", ylab="", ylim = c(0, 1), xlim = c(-3, 1))
lines(log10(unlist(C)), mean_sensitivity, type = "b", frame = FALSE, pch = 19,
col = "blue")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
lines(log10(unlist(C)), mean_specificity, type = "b", frame = FALSE, pch = 19,
col = "green")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
legend("bottomleft", legend = c("Accuracy", "Sensitivity", "Specificity"),
col = c("red", "blue", "green"), lty = 1)
# Based on the results above, the best C is 0.05:
optimal_svm_model <- svm(outcome ~ ., data = upsampled_train, kernel = "linear",
type = 'C-classification', cost = 0.05, class.weights="inverse")
optimal_svm_pred <- predict(optimal_svm_model, test_data_filtered)
optimal_svm_cm <- print_metrics(test_outcome, optimal_svm_pred)
## Actual_Values
## Predicted_Values 0 1
## 0 208 22
## 1 85 8
## Accuracy: 66.9 %
## Precision: 8.6 %
## Sensitivity: 26.7 %
## Specificity: 71 %
# Cross-validation for Random Forest
repeat_cv <- trainControl(method='repeatedcv', number=5, repeats=3)
forest <- train(outcome ~ .,
data=upsampled_train,
method='rf',
trControl=repeat_cv)
forest$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.45%
## Confusion matrix:
## 0 1 class.error
## 0 686 0 0.0000000
## 1 41 195 0.1737288
# The results of random forest on train/validation
optimal_rf_cm_train = forest$finalModel$confusion
optimal_rf_cm_train
## 0 1 class.error
## 0 686 0 0.0000000
## 1 41 195 0.1737288
# Calculate accuracy from the training confusion matrix
accuracy <- sum(diag(optimal_rf_cm_train)) / sum(optimal_rf_cm_train)*100
cat("Accuracy: ", round(accuracy, 1), "%\n")
## Accuracy: 95.5 %
# Calculate precision
precision <- optimal_rf_cm_train[4] /
(optimal_rf_cm_train[3] + optimal_rf_cm_train[4])*100
cat("Precision: ", round(precision, 1), "%\n")
## Precision: 100 %
# Calculate sensitivity
recall <- optimal_rf_cm_train[4] /
(optimal_rf_cm_train[2] + optimal_rf_cm_train[4])*100
cat("Sensitivity: ", round(recall, 1), "%\n")
## Sensitivity: 82.6 %
# Calculate specificity
specificity <- optimal_rf_cm_train[1] /
(optimal_rf_cm_train[2] + optimal_rf_cm_train[1])*100
cat("Specificity: ", round(specificity, 1), "%\n")
## Specificity: 94.4 %
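# Not part of the original pipeline: because $confusion also carries a class.error
# column, positional indexing is easy to get wrong; a sketch that reads the same
# matrix by name (rows = actual, columns = predicted)
TN <- optimal_rf_cm_train["0", "0"]; FP <- optimal_rf_cm_train["0", "1"]
FN <- optimal_rf_cm_train["1", "0"]; TP <- optimal_rf_cm_train["1", "1"]
cat("Specificity (TN / (TN + FP)):", round(TN / (TN + FP) * 100, 1), "%\n")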
# using the optimal RF configuration on test dataset
optimal_rf_pred <- predict(object=forest, newdata=test_data_filtered)
optimal_rf_cm = print_metrics(test_outcome, optimal_rf_pred)
## Actual_Values
## Predicted_Values 0 1
## 0 293 30
## 1 0 0
## Accuracy: 90.7 %
## Precision: NaN %
## Sensitivity: 0 %
## Specificity: 100 %
var_imp = data.frame(forest$finalModel$importance)
var_imp_df <- data.frame(variable = row.names(var_imp), importance = var_imp[, 1])
var_imp2 = as_tibble(var_imp_df[var_imp_df$importance>2,])
# Create a plot of variable importance
var_imp2 %>% arrange(importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_bar(stat='identity') +
coord_flip() +
xlab('Variables') +
labs(title='Random forest variable importance') +
theme_minimal() +
theme(axis.text = element_text(size = 10),
axis.title = element_text(size = 15),
plot.title = element_text(size = 20))
# using the most important RF features to train the SVM again
var_imp_ind = data.frame(var_imp_df[var_imp_df$importance>1,])
train_data_filtered_rf <- upsampled_train %>% dplyr::select(var_imp_ind$variable)
test_data_filtered_rf <- test_data_filtered %>% dplyr::select(var_imp_ind$variable)
train_data_filtered_rf$outcome = upsampled_train$outcome
# sweeping C values again: training on the whole RF-filtered train dataset and
# validating on the held-out test dataset
folds <- createFolds(train_data_filtered_rf$outcome, k = 5)
outcomecol = ncol(train_data_filtered_rf)
mean_accuracy = list()
mean_precision = list()
mean_sensitivity = list()
mean_specificity = list()
C = list(0.01, 0.02, 0.03, 0.04, 0.05, 0.1)
for (c in C) {
  cat('c is', c)
  acc = list()
  prec = list()
  sens = list()
  spec = list()
  for (k in 1:length(folds)) {
    cv_svm_model <- svm(outcome ~ ., data = train_data_filtered_rf, kernel = "linear",
                        type = 'C-classification', cost = c, class.weights = "inverse")
    cv_svm_pred <- predict(cv_svm_model, test_data_filtered_rf)
    cv_svm_cm <- table(test_outcome[1:length(cv_svm_pred)], cv_svm_pred)
    accuracy <- sum(diag(cv_svm_cm)) / sum(cv_svm_cm)
    precision <- cv_svm_cm[4] / (cv_svm_cm[3] + cv_svm_cm[4])
    sensitivity <- cv_svm_cm[4] / (cv_svm_cm[2] + cv_svm_cm[4])
    specificity <- cv_svm_cm[1] / (cv_svm_cm[3] + cv_svm_cm[1])
    acc <- c(acc, accuracy)
    prec <- c(prec, precision)
    sens <- c(sens, sensitivity)
    spec <- c(spec, specificity)
  }
  mean_accuracy <- c(mean_accuracy, mean(unlist(acc)))
  mean_precision <- c(mean_precision, mean(unlist(prec)))
  mean_sensitivity <- c(mean_sensitivity, mean(unlist(sens)))
  mean_specificity <- c(mean_specificity, mean(unlist(spec)))
}
## c is 0.01c is 0.02c is 0.03c is 0.04c is 0.05c is 0.1
# plotting the SVM cross validation results
print(mean_sensitivity)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 0.9666667
##
## [[3]]
## [1] 0.5
##
## [[4]]
## [1] 0.4
##
## [[5]]
## [1] 0.4
##
## [[6]]
## [1] 0.3666667
plot(log10(unlist(C)), mean_accuracy, type = "b", frame = FALSE, pch = 19,
col = "red", xlab = "Log(C)", ylab="", ylim = c(0, 1), xlim = c(-2, -1))
lines(log10(unlist(C)), mean_sensitivity, type = "b", frame = FALSE, pch = 19,
col = "blue")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
lines(log10(unlist(C)), mean_specificity, type = "b", frame = FALSE, pch = 19,
col = "green")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
legend("bottomleft", legend = c("Accuracy", "Sensitivity", "Specificity"),
col = c("red", "blue", "green"), lty = 1)
# Cross Validation for SVM on the new filtered dataset (rf important features)
C = list(0.001, 0.01, 0.1, 1, 10, 100)
svm_cross_validation(train_data_filtered_rf, C, 5)
## c is 0.001c is 0.01c is 0.1c is 1c is 10c is 100[[1]]
## [1] 0
##
## [[2]]
## [1] 0.2
##
## [[3]]
## [1] 0.7968972
##
## [[4]]
## [1] 0.864539
##
## [[5]]
## [1] 0.9407801
##
## [[6]]
## [1] 0.970656
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
# Based on the results above, the best C is 100:
optimal_rf_svm_model <- svm(outcome ~ ., data = train_data_filtered_rf, kernel = "linear",
type = 'C-classification', cost = 100, class.weights="inverse")
optimal_rf_svm_pred <- predict(optimal_rf_svm_model, test_data_filtered_rf)
optimal_rf_svm_cm <- print_metrics(test_outcome, optimal_rf_svm_pred)
## Actual_Values
## Predicted_Values 0 1
## 0 235 25
## 1 58 5
## Accuracy: 74.3 %
## Precision: 7.9 %
## Sensitivity: 16.7 %
## Specificity: 80.2 %
# In practice, though, C = 100 is very high and prone to overfitting, so C = 0.03
# is also tried, since it performed well on the test set earlier:
optimal_rf_svm_model2 <- svm(outcome ~ ., data = train_data_filtered_rf, kernel = "linear",
type = 'C-classification', cost = 0.03, class.weights="inverse")
optimal_rf_svm_pred2 <- predict(optimal_rf_svm_model2, test_data_filtered_rf)
optimal_rf_svm_cm2 <- print_metrics(test_outcome, optimal_rf_svm_pred2)
## Actual_Values
## Predicted_Values 0 1
## 0 174 15
## 1 119 15
## Accuracy: 58.5 %
## Precision: 11.2 %
## Sensitivity: 50 %
## Specificity: 59.4 %
# Applying an LDA model on the original data
# (to see how different numbers of top-variance features affect the model)
# Storing initial train_data under a different name
model_data <- train_data
# defining vector of feature subset sizes
feature_sizes <- c(1000, 900, 800, 700, 600, 500, 400, 300, 200, 100)
# initializing vectors to store performance metrics
accuracy_values <- vector("numeric", length(feature_sizes))
sensitivity_values <- vector("numeric", length(feature_sizes))
specificity_values <- vector("numeric", length(feature_sizes))
# performing 5-fold cross-validation
folds <- createFolds(model_data$outcome, k = 5, list = TRUE, returnTrain = FALSE)
# looping over feature subset sizes
for (i in seq_along(feature_sizes)) {
  size <- feature_sizes[i]
  # initializing vectors to store performance metrics for the current fold
  fold_accuracy_values <- vector("numeric", length(folds))
  fold_sensitivity_values <- vector("numeric", length(folds))
  fold_specificity_values <- vector("numeric", length(folds))
  # looping over the folds
  for (j in seq_along(folds)) {
    # sub-setting the dataframe with the current fold
    model_data_subset <- model_data[-folds[[j]], ]
    # computing variance for each feature
    variances <- apply(model_data_subset, 2, var)
    # sorting variances in decreasing order and selecting top features
    top_features <- names(sort(variances, decreasing = TRUE)[1:size])
    # sub-setting the dataframe with top features and response variable
    model_data_subset <- model_data_subset[c(top_features, "outcome")]
    # fitting LDA model on current fold
    lda_model <- lda(outcome ~ ., data = model_data_subset)
    # making predictions on current fold
    test_preds_LDA <- predict(lda_model, newdata = model_data[folds[[j]], top_features])
    # computing performance metrics for the current fold
    cm <- confusionMatrix(table(model_data[folds[[j]], "outcome"], test_preds_LDA$class))
    dimnames(cm$table) <- list("Actual Values" = c("0", "1"),
                               "Predicted Values" = c("0", "1"))
    accuracy <- sum(diag(cm$table)) / sum(cm$table)
    sensitivity <- cm$table[2, 2] / sum(cm$table[2, ])
    specificity <- cm$table[1, 1] / sum(cm$table[1, ])
    # storing performance metrics for the current fold
    fold_accuracy_values[j] <- accuracy
    fold_sensitivity_values[j] <- sensitivity
    fold_specificity_values[j] <- specificity
  }
  # computing mean performance metrics across the folds
  accuracy_values[i] <- mean(fold_accuracy_values)
  sensitivity_values[i] <- mean(fold_sensitivity_values)
  specificity_values[i] <- mean(fold_specificity_values)
  # printing performance metrics
  cat("Performance metrics for", size, "features (averaged across 5 folds):\n")
  cat("Accuracy:", round(accuracy_values[i], 3), "\n")
  cat("Sensitivity:", round(sensitivity_values[i], 3), "\n")
  cat("Specificity:", round(specificity_values[i], 3), "\n\n")
}
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 1000 features (averaged across 5 folds):
## Accuracy: 0.852
## Sensitivity: 0.196
## Specificity: 0.913
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 900 features (averaged across 5 folds):
## Accuracy: 0.855
## Sensitivity: 0.308
## Specificity: 0.905
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 800 features (averaged across 5 folds):
## Accuracy: 0.817
## Sensitivity: 0.329
## Specificity: 0.862
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 700 features (averaged across 5 folds):
## Accuracy: 0.765
## Sensitivity: 0.542
## Specificity: 0.792
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in lda.default(x, grouping, ...): variables are collinear
## Performance metrics for 600 features (averaged across 5 folds):
## Accuracy: 0.547
## Sensitivity: 0.5
## Specificity: 0.552
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 500 features (averaged across 5 folds):
## Accuracy: 0.712
## Sensitivity: 0.423
## Specificity: 0.741
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 400 features (averaged across 5 folds):
## Accuracy: 0.823
## Sensitivity: 0.426
## Specificity: 0.858
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 300 features (averaged across 5 folds):
## Accuracy: 0.855
## Sensitivity: 0.33
## Specificity: 0.903
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 200 features (averaged across 5 folds):
## Accuracy: 0.887
## Sensitivity: 0.301
## Specificity: 0.941
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Performance metrics for 100 features (averaged across 5 folds):
## Accuracy: 0.888
## Sensitivity: 0.152
## Specificity: 0.954
# Storing the final results for plotting
cat("Accuracy values:", paste(round(accuracy_values, 3), collapse = ", "), "\n")
## Accuracy values: 0.852, 0.855, 0.817, 0.765, 0.547, 0.712, 0.823, 0.855, 0.887, 0.888
cat("Sensitivity values:", paste(round(sensitivity_values, 3), collapse = ", "), "\n")
## Sensitivity values: 0.196, 0.308, 0.329, 0.542, 0.5, 0.423, 0.426, 0.33, 0.301, 0.152
cat("Specificity values:", paste(round(specificity_values, 3), collapse = ", "), "\n")
## Specificity values: 0.913, 0.905, 0.862, 0.792, 0.552, 0.741, 0.858, 0.903, 0.941, 0.954
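# Not part of the original pipeline: collecting the averaged CV metrics into one
# data frame makes them easier to reuse for plotting or reporting
lda_cv_results <- data.frame(features = feature_sizes,
                             accuracy = accuracy_values,
                             sensitivity = sensitivity_values,
                             specificity = specificity_values)
lda_cv_results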
# Plotting the model performance for the 5-fold CV
X <- feature_sizes
plot(X, accuracy_values, type = "b", pch = 19,
col = "red", xlab = "Number of features", ylim = c(0, 1), xlim = c(0, 1000), ylab=" ")
lines(X, sensitivity_values, type = "b", pch = 19,
col = "blue")
lines(X, specificity_values, type = "b", pch = 19,
col = "green")
legend("bottom", legend = c("Accuracy", "Sensitivity", "Specificity"), col = c("red", "blue", "green"), lty = 1)
# Specifying the hold-out test data set
holdtest_data <- test_data
holdtest_data_outcome <- test_outcome
accuracy_values_test <- vector("numeric", length(feature_sizes))
sensitivity_values_test <- vector("numeric", length(feature_sizes))
specificity_values_test <- vector("numeric", length(feature_sizes))
# Looping over feature subset sizes as before
for (i in seq_along(feature_sizes)) {
  size <- feature_sizes[i]
  variances <- apply(model_data, 2, var)
  top_features <- names(sort(variances, decreasing = TRUE)[1:size])
  model_data_subset <- model_data[c(top_features, "outcome")]
  folds <- createFolds(model_data_subset$outcome, k = 5, returnTrain = TRUE)
  accs <- c()
  sens <- c()
  specs <- c()
  for (j in seq_along(folds)) {
    train_LDA <- model_data_subset[folds[[j]], ]
    lda_model <- lda(outcome ~ ., data = train_LDA)
    # making predictions on the hold-out test data
    holdtest_preds_LDA <- predict(lda_model, newdata = holdtest_data)
    cm <- confusionMatrix(table(holdtest_data_outcome, holdtest_preds_LDA$class))
    dimnames(cm$table) <- list("Actual Values" = c("0", "1"),
                               " Predicted Values" = c("0", "1"))
    accuracy_test <- sum(diag(cm$table)) / sum(cm$table)
    sensitivity_test <- cm$table[2, 2] / sum(cm$table[2, ])
    specificity_test <- cm$table[1, 1] / sum(cm$table[1, ])
    accs <- c(accs, accuracy_test)
    sens <- c(sens, sensitivity_test)
    specs <- c(specs, specificity_test)
  }
  # Computing average performance metrics over the 5 folds
  accuracy_test <- mean(accs)
  sensitivity_test <- mean(sens)
  specificity_test <- mean(specs)
  # storing the performance metrics for this feature size
  accuracy_values_test[i] <- accuracy_test
  sensitivity_values_test[i] <- sensitivity_test
  specificity_values_test[i] <- specificity_test
  # print average performance metrics over the 5 folds
  cat("Average performance metrics for", size, "features (on hold-out test dataset):\n")
  cat("Accuracy:", round(accuracy_test, 3), "\n")
  cat("Sensitivity:", round(sensitivity_test, 3), "\n")
  cat("Specificity:", round(specificity_test, 3), "\n\n")
}
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 1000 features (on hold-out test dataset):
## Accuracy: 0.85
## Sensitivity: 0.187
## Specificity: 0.917
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 900 features (on hold-out test dataset):
## Accuracy: 0.832
## Sensitivity: 0.153
## Specificity: 0.902
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 800 features (on hold-out test dataset):
## Accuracy: 0.805
## Sensitivity: 0.227
## Specificity: 0.864
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 700 features (on hold-out test dataset):
## Accuracy: 0.776
## Sensitivity: 0.28
## Specificity: 0.827
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Average performance metrics for 600 features (on hold-out test dataset):
## Accuracy: 0.579
## Sensitivity: 0.467
## Specificity: 0.59
##
## Average performance metrics for 500 features (on hold-out test dataset):
## Accuracy: 0.768
## Sensitivity: 0.327
## Specificity: 0.814
##
## Average performance metrics for 400 features (on hold-out test dataset):
## Accuracy: 0.817
## Sensitivity: 0.233
## Specificity: 0.876
##
## Average performance metrics for 300 features (on hold-out test dataset):
## Accuracy: 0.838
## Sensitivity: 0.18
## Specificity: 0.906
##
## Average performance metrics for 200 features (on hold-out test dataset):
## Accuracy: 0.852
## Sensitivity: 0.133
## Specificity: 0.926
##
## Average performance metrics for 100 features (on hold-out test dataset):
## Accuracy: 0.871
## Sensitivity: 0.04
## Specificity: 0.956
# Storing the final results for different feature sizes in the hold-out test dataset
cat("Accuracy values (test):", paste(round(accuracy_values_test, 3), collapse = ", "), "\n")
## Accuracy values (test): 0.85, 0.832, 0.805, 0.776, 0.579, 0.768, 0.817, 0.838, 0.852, 0.871
cat("Sensitivity values (test):", paste(round(sensitivity_values_test, 3), collapse = ", "), "\n")
## Sensitivity values (test): 0.187, 0.153, 0.227, 0.28, 0.467, 0.327, 0.233, 0.18, 0.133, 0.04
cat("Specificity values (test):", paste(round(specificity_values_test, 3), collapse = ", "), "\n")
## Specificity values (test): 0.917, 0.902, 0.864, 0.827, 0.59, 0.814, 0.876, 0.906, 0.926, 0.956
# Plotting the performance for the hold-out test data
plot(X, accuracy_values_test, type = "b", pch = 19,
col = "red", xlab = "Number of features", ylim = c(0, 1), xlim = c(0, 1000), ylab=" ")
lines(X, sensitivity_values_test, type = "b", pch = 19,
col = "blue")
lines(X, specificity_values_test, type = "b", pch = 19,
col = "green")
legend("left", legend = c("Accuracy", "Sensitivity", "Specificity"), col = c("red", "blue", "green"), lty = 1)
# Trying to see how under-sampling the majority class may influence the LDA model
# with the 100 most variable features (using only 100 features since the sample
# size is reduced due to under-sampling)
# creating indices for the minority class ('1') and majority class ('0')
idx_train_1 <- which(train_df$outcome == 1)
idx_train_0 <- which(train_df$outcome == 0)
idx_test_1 <- which(test_df$outcome == 1)
idx_test_0 <- which(test_df$outcome == 0)
# randomly sampling '0' indices to match the number of '1' indices
idx_train_0_undersampled <- sample(idx_train_0, length(idx_train_1))
idx_test_0_undersampled <- sample(idx_test_0, length(idx_test_1))
# combining the minority and undersampled majority class indices
idx_train_undersampled <- c(idx_train_1, idx_train_0_undersampled)
idx_test_undersampled <- c(idx_test_1, idx_test_0_undersampled)
# creating the undersampled dataframe
train_undersampled <- train_df[idx_train_undersampled, ]
test_undersampled <- test_df[idx_test_undersampled, ]
print("The train data before under-sampling:")
## [1] "The train data before under-sampling:"
table(train_df$outcome)
##
## 0 1
## 686 59
print("The train data after under-sampling:")
## [1] "The train data after under-sampling:"
table(train_undersampled$outcome)
##
## 0 1
## 59 59
print("The test data before under-sampling:")
## [1] "The test data before under-sampling:"
table(test_df$outcome)
##
## 0 1
## 293 30
print("The test data after under-sampling:")
## [1] "The test data after under-sampling:"
table(test_undersampled$outcome)
##
## 0 1
## 30 30
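# Not part of the original pipeline: sample() is random, so the undersampled class
# composition changes between runs; for reproducibility the sampling step above
# could be preceded by a fixed (arbitrary) seed, e.g.:
# set.seed(123)
# idx_train_0_undersampled <- sample(idx_train_0, length(idx_train_1))
# idx_test_0_undersampled <- sample(idx_test_0, length(idx_test_1))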
train_data_undersampled = train_undersampled[,30:2311] # only the numerical mRNA features + the output as the last variable
test_data_undersampled = test_undersampled[,30:2310] # only the numerical mRNA features
test_outcome_undersampled = test_undersampled[,2311]
# building the model with the 100 largest-variance features
features <- 100
variances <- apply(train_data_undersampled, 2, var)
top_100_features <- names(sort(variances, decreasing = TRUE)[1:features])
train_data_undersampled_subset <- train_data_undersampled[c(top_100_features, "outcome")]
test_data_undersampled_subset <- test_data_undersampled[c(top_100_features)]
lda_model_undersampled <- lda(outcome ~ ., data = train_data_undersampled_subset)
undersampled_preds_LDA <- predict(lda_model_undersampled, newdata = test_data_undersampled_subset)
# computing performance metrics
cm <- confusionMatrix(table(test_outcome_undersampled, undersampled_preds_LDA$class))
dimnames(cm$table) <- list("Actual Values" = c("0", "1"),
" Predicted Values" = c("0", "1"))
accuracy_undersampled <- sum(diag(cm$table)) / sum(cm$table)
sensitivity_undersampled <- cm$table[2,2] / sum(cm$table[2,])
specificity_undersampled <- cm$table[1,1] / sum(cm$table[1,])
cat("Confusion matrix for 100 features (in undersampled dataset):\n")
## Confusion matrix for 100 features (in undersampled dataset):
print(cm$table)
## Predicted Values
## Actual Values 0 1
## 0 15 15
## 1 13 17
cat("\n")
cat("Performance metrics for 100 features (in undersampled dataset):\n")
## Performance metrics for 100 features (in undersampled dataset):
cat("Accuracy in undersampled data:", round(accuracy_undersampled, 3), "\n")
## Accuracy in undersampled data: 0.533
cat("Sensitivity in undersampled data:", round(sensitivity_undersampled, 3), "\n")
## Sensitivity in undersampled data: 0.567
cat("Specificity in undersampled data:", round(specificity_undersampled, 3), "\n\n")
## Specificity in undersampled data: 0.5