Problem #1

# insert code
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(class)     # kNN
library(e1071)     # Naive Bayes
library(pROC)      # ROC + AUC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)     # confusion matrix + metrics
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
## 
##     element
## Loading required package: lattice
data <- read.csv("~/Downloads/kits21_kidney_dataset_NI_augmented.csv")

continuous_data <- data %>% select(age_at_nephrectomy, body_mass_index, radiographic_size)

continuous_data_summary <- continuous_data %>% summarise(
  mean_age = mean(age_at_nephrectomy, na.rm = TRUE),
  sd_age = sd(age_at_nephrectomy, na.rm = TRUE),
  max_age = max(age_at_nephrectomy, na.rm = TRUE),
  min_age = min(age_at_nephrectomy, na.rm = TRUE),
  range_age = max(age_at_nephrectomy, na.rm = TRUE) - min(age_at_nephrectomy, na.rm = TRUE),
  
  mean_bmi = mean(body_mass_index, na.rm = TRUE),
  sd_bmi = sd(body_mass_index, na.rm = TRUE),
  max_bmi = max(body_mass_index, na.rm = TRUE),
  min_bmi = min(body_mass_index, na.rm = TRUE),
  range_bmi = max(body_mass_index, na.rm = TRUE) - min(body_mass_index, na.rm = TRUE),
  
  mean_size = mean(radiographic_size, na.rm = TRUE),
  sd_size = sd(radiographic_size, na.rm = TRUE),
  max_size = max(radiographic_size, na.rm = TRUE),
  min_size = min(radiographic_size, na.rm = TRUE),
  range_size = max(radiographic_size, na.rm = TRUE) - min(radiographic_size, na.rm = TRUE),
  
)

#Continuous data summary
continuous_data_summary
##   mean_age   sd_age max_age min_age range_age mean_bmi   sd_bmi max_bmi min_bmi
## 1 58.35238 14.38798      90       1        89 31.17205 6.463543   49.61    16.2
##   range_bmi mean_size  sd_size max_size min_size range_size
## 1     33.41  4.771667 3.101886     16.2      1.2         15
categorical_data <- data %>% select(gender, smoking_history)

#Gender summary

categorical_data %>% count(gender) %>% mutate(Percent = round(n / sum(n) * 100, 2))
##   gender   n Percent
## 1 female  87   41.43
## 2   male 123   58.57
#Smoking summary

categorical_data %>% count(smoking_history) %>% mutate(Percent = round(n / sum(n) * 100, 2))
##   smoking_history   n Percent
## 1  current_smoker  34   16.19
## 2    never_smoked 100   47.62
## 3 previous_smoker  76   36.19
# insert code
set.seed(42)

train_idx <- sample(1:nrow(data), size = 0.8 * nrow(data))

train_set <- data[train_idx, ]
test_set  <- data[-train_idx, ]


model <- glm(
  vital_days_after_surgery ~ age_at_nephrectomy + gender + body_mass_index +
    smoking_history + radiographic_size + surgery_type +
    surgical_approach + TumorVolume + KidneyVolume,
  data = train_set,
  family = gaussian()
)

summary(model)
## 
## Call:
## glm(formula = vital_days_after_surgery ~ age_at_nephrectomy + 
##     gender + body_mass_index + smoking_history + radiographic_size + 
##     surgery_type + surgical_approach + TumorVolume + KidneyVolume, 
##     family = gaussian(), data = train_set)
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                       1.209e+03  5.194e+02   2.328  0.02121 * 
## age_at_nephrectomy               -7.032e+00  4.492e+00  -1.566  0.11949   
## gendermale                       -6.843e+01  1.409e+02  -0.486  0.62788   
## body_mass_index                   1.231e+00  1.082e+01   0.114  0.90958   
## smoking_historynever_smoked       1.772e+02  1.815e+02   0.977  0.33028   
## smoking_historyprevious_smoker    5.384e+02  1.950e+02   2.761  0.00645 **
## radiographic_size                -7.061e-01  4.258e+01  -0.017  0.98679   
## surgery_typeopen                  1.020e+02  2.355e+02   0.433  0.66563   
## surgery_typerobotic              -5.787e+01  2.129e+02  -0.272  0.78617   
## surgical_approachTransperitoneal  1.735e+02  1.854e+02   0.936  0.35063   
## TumorVolume                       6.664e-04  7.987e-04   0.834  0.40537   
## KidneyVolume                     -6.399e-04  7.062e-04  -0.906  0.36628   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 595290)
## 
##     Null deviance: 101591646  on 167  degrees of freedom
## Residual deviance:  92865235  on 156  degrees of freedom
## AIC: 2724.2
## 
## Number of Fisher Scoring iterations: 2
confint.default(model)
##                                          2.5 %       97.5 %
## (Intercept)                       1.910469e+02 2.227097e+03
## age_at_nephrectomy               -1.583585e+01 1.771789e+00
## gendermale                       -3.445731e+02 2.077180e+02
## body_mass_index                  -1.997135e+01 2.243235e+01
## smoking_historynever_smoked      -1.784428e+02 5.328767e+02
## smoking_historyprevious_smoker    1.562662e+02 9.205747e+02
## radiographic_size                -8.416836e+01 8.275617e+01
## surgery_typeopen                 -3.595544e+02 5.634581e+02
## surgery_typerobotic              -4.752317e+02 3.594969e+02
## surgical_approachTransperitoneal -1.897594e+02 5.368061e+02
## TumorVolume                      -8.990429e-04 2.231797e-03
## KidneyVolume                     -2.024009e-03 7.442365e-04
model_summary <- summary(model)

coef_table <- model_summary$coefficients

ci <- confint.default(model)

results <- as.data.frame(cbind(coef_table, ci))

colnames(results) <- c("Estimate", "Std_Error", "t_value", "p_value", "CI_lower", "CI_upper")

results$Significant <- results$p_value < 0.05

print(results)
##                                       Estimate    Std_Error     t_value
## (Intercept)                       1.209072e+03 5.194101e+02  2.32777907
## age_at_nephrectomy               -7.032033e+00 4.491828e+00 -1.56551686
## gendermale                       -6.842758e+01 1.408932e+02 -0.48566995
## body_mass_index                   1.230503e+00 1.081747e+01  0.11375145
## smoking_historynever_smoked       1.772170e+02 1.814624e+02  0.97660428
## smoking_historyprevious_smoker    5.384205e+02 1.949802e+02  2.76141031
## radiographic_size                -7.060941e-01 4.258357e+01 -0.01658137
## surgery_typeopen                  1.019518e+02 2.354667e+02  0.43297777
## surgery_typerobotic              -5.786740e+01 2.129449e+02 -0.27174825
## surgical_approachTransperitoneal  1.735234e+02 1.853517e+02  0.93618412
## TumorVolume                       6.663769e-04 7.986982e-04  0.83432876
## KidneyVolume                     -6.398864e-04 7.061981e-04 -0.90610043
##                                     p_value      CI_lower     CI_upper
## (Intercept)                      0.02120849  1.910469e+02 2.227097e+03
## age_at_nephrectomy               0.11948804 -1.583585e+01 1.771789e+00
## gendermale                       0.62788263 -3.445731e+02 2.077180e+02
## body_mass_index                  0.90958109 -1.997135e+01 2.243235e+01
## smoking_historynever_smoked      0.33027710 -1.784428e+02 5.328767e+02
## smoking_historyprevious_smoker   0.00644616  1.562662e+02 9.205747e+02
## radiographic_size                0.98679177 -8.416836e+01 8.275617e+01
## surgery_typeopen                 0.66562896 -3.595544e+02 5.634581e+02
## surgery_typerobotic              0.78617490 -4.752317e+02 3.594969e+02
## surgical_approachTransperitoneal 0.35062540 -1.897594e+02 5.368061e+02
## TumorVolume                      0.40537155 -8.990429e-04 2.231797e-03
## KidneyVolume                     0.36628008 -2.024009e-03 7.442365e-04
##                                  Significant
## (Intercept)                             TRUE
## age_at_nephrectomy                     FALSE
## gendermale                             FALSE
## body_mass_index                        FALSE
## smoking_historynever_smoked            FALSE
## smoking_historyprevious_smoker          TRUE
## radiographic_size                      FALSE
## surgery_typeopen                       FALSE
## surgery_typerobotic                    FALSE
## surgical_approachTransperitoneal       FALSE
## TumorVolume                            FALSE
## KidneyVolume                           FALSE
predictions <- predict(model, newdata = test_set)

actual <- test_set$vital_days_after_surgery

correlation <- cor(predictions, actual, use = "complete.obs")
correlation
## [1] 0.2515751
rmse <- sqrt(mean((predictions - actual)^2, na.rm = TRUE))
rmse
## [1] 753.2357
r_squared <- correlation^2
r_squared
## [1] 0.06329005
reduced_model <- glm(
  vital_days_after_surgery ~ age_at_nephrectomy + gender + body_mass_index +
    smoking_history + radiographic_size + surgery_type +
    surgical_approach,
  data = train_set,
  family = gaussian()
)

AIC_full <- AIC(model)
AIC_reduced <- AIC(reduced_model)

AIC_full
## [1] 2724.176
AIC_reduced
## [1] 2721.107
colnames(data)
##  [1] "X"                                                       
##  [2] "case_id"                                                 
##  [3] "age_at_nephrectomy"                                      
##  [4] "gender"                                                  
##  [5] "body_mass_index"                                         
##  [6] "smoking_history"                                         
##  [7] "age_when_quit_smoking"                                   
##  [8] "pack_years"                                              
##  [9] "chewing_tobacco_use"                                     
## [10] "alcohol_use"                                             
## [11] "intraoperative_complications.blood_transfusion"          
## [12] "intraoperative_complications.injury_to_surrounding_organ"
## [13] "intraoperative_complications.cardiac_event"              
## [14] "hospitalization"                                         
## [15] "ischemia_time"                                           
## [16] "radiographic_size"                                       
## [17] "pathologic_size"                                         
## [18] "malignant"                                               
## [19] "pathology_t_stage"                                       
## [20] "pathology_n_stage"                                       
## [21] "pathology_m_stage"                                       
## [22] "tumor_histologic_subtype"                                
## [23] "tumor_necrosis"                                          
## [24] "tumor_isup_grade"                                        
## [25] "clavien_surgical_complications"                          
## [26] "er_visit"                                                
## [27] "readmission"                                             
## [28] "estimated_blood_loss"                                    
## [29] "surgery_type"                                            
## [30] "surgical_procedure"                                      
## [31] "surgical_approach"                                       
## [32] "operative_time"                                          
## [33] "cytoreductive"                                           
## [34] "positive_resection_margins"                              
## [35] "last_preop_egfr.value"                                   
## [36] "last_preop_egfr.days_before_nephrectomy"                 
## [37] "first_postop_egfr.value"                                 
## [38] "first_postop_egfr.days_before_nephrectomy"               
## [39] "last_postop_egfr.value"                                  
## [40] "last_postop_egfr.days_before_nephrectomy"                
## [41] "vital_status"                                            
## [42] "vital_days_after_surgery"                                
## [43] "voxel_spacing.x_spacing"                                 
## [44] "voxel_spacing.y_spacing"                                 
## [45] "voxel_spacing.z_spacing"                                 
## [46] "KidneyVolume"                                            
## [47] "TumorVolume"
data$malignant <- as.factor(data$malignant)

train_set$malignant <- data$malignant[train_idx]
test_set$malignant  <- data$malignant[-train_idx]

features <- c("age_at_nephrectomy", "body_mass_index", "radiographic_size",
              "TumorVolume", "KidneyVolume")

train_x <- train_set[, features]
test_x  <- test_set[, features]

train_y <- factor(train_set$malignant, levels = c("false", "true"))
test_y  <- factor(test_set$malignant, levels = c("false", "true"))

normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

train_x <- as.data.frame(lapply(train_x, normalize))
test_x  <- as.data.frame(lapply(test_x, normalize))


k_values <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
accuracy_results <- c()

for (k in k_values) {
  pred <- knn(train = train_x, test = test_x, cl = train_y, k = k)
  acc <- mean(pred == test_y)
  accuracy_results <- c(accuracy_results, acc)
}

data.frame(k = k_values, Accuracy = accuracy_results)
##     k  Accuracy
## 1   1 0.8333333
## 2   2 0.8809524
## 3   3 0.9047619
## 4   4 0.9047619
## 5   5 0.9047619
## 6   6 0.9047619
## 7   7 0.9047619
## 8   8 0.9047619
## 9   9 0.9047619
## 10 10 0.9047619
best_k <- k_values[which.max(accuracy_results)]
best_k
## [1] 3
knn_pred <- knn(train = train_x, test = test_x, cl = train_y, k = best_k)

nb_model <- naiveBayes(malignant ~ ., data = train_set[, c(features, "malignant")])

nb_pred <- predict(nb_model, test_set[, features])
nb_pred <- factor(nb_pred, levels = levels(test_y))


conf_knn <- confusionMatrix(knn_pred, test_y, positive = "true")
conf_knn
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction false true
##      false     0    0
##      true      4   38
##                                           
##                Accuracy : 0.9048          
##                  95% CI : (0.7738, 0.9734)
##     No Information Rate : 0.9048          
##     P-Value [Acc > NIR] : 0.6290          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.1336          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9048          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9048          
##          Detection Rate : 0.9048          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : true            
## 
conf_nb <- confusionMatrix(nb_pred, test_y, positive = "true")
conf_nb
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction false true
##      false     0    0
##      true      4   38
##                                           
##                Accuracy : 0.9048          
##                  95% CI : (0.7738, 0.9734)
##     No Information Rate : 0.9048          
##     P-Value [Acc > NIR] : 0.6290          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.1336          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9048          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9048          
##          Detection Rate : 0.9048          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : true            
## 
knn_prob <- attr(knn(train_x, test_x, train_y, k = best_k, prob = TRUE), "prob")

# Convert to probability of "malignant"
knn_prob <- ifelse(knn_pred == "true", knn_prob, 1 - knn_prob)

roc_knn <- roc(test_y, knn_prob)
## Setting levels: control = false, case = true
## Setting direction: controls < cases
plot(roc_knn)

auc(roc_knn)
## Area under the curve: 0.4934
nb_prob <- predict(nb_model, test_set[, features], type = "raw")[, "true"]

roc_nb <- roc(test_y, nb_prob)
## Setting levels: control = false, case = true
## Setting direction: controls < cases
plot(roc_nb, col = "blue")

auc(roc_nb)
## Area under the curve: 0.5658
# kNN
cm <- conf_knn$table
TP_knn <- cm["true", "true"]
TN_knn <- cm["false", "false"]
FP_knn <- cm["true", "false"]
FN_knn <- cm["false", "true"]

# Naive Bayes
cm <- conf_nb$table
TP_nb <- cm["true", "true"]
TN_nb <- cm["false", "false"]
FP_nb <- cm["true", "false"]
FN_nb <- cm["false", "true"]

#the following shows the labeling of True positives, negatives, etc for the KNN model
cm_knn <- conf_knn$table
cm_knn
##           Reference
## Prediction false true
##      false     0    0
##      true      4   38
#the following shows the labeling of True positives, negatives, etc for the Naive Bayes model
cm_nb <- conf_nb$table
cm_nb
##           Reference
## Prediction false true
##      false     0    0
##      true      4   38
# ---- kNN Results ----
cat("k-NN Confusion Matrix:\n")
## k-NN Confusion Matrix:
print(cm_knn)
##           Reference
## Prediction false true
##      false     0    0
##      true      4   38
cat("\n k-NN Classification Breakdown:\n")
## 
##  k-NN Classification Breakdown:
cat("True Positives (TP):", TP_knn, "\n")
## True Positives (TP): 38
cat("True Negatives (TN):", TN_knn, "\n")
## True Negatives (TN): 0
cat("False Positives (FP):", FP_knn, "\n")
## False Positives (FP): 4
cat("False Negatives (FN):", FN_knn, "\n")
## False Negatives (FN): 0
# ---- Naive Bayes Results ----
cat("\n\nNaive Bayes Confusion Matrix:\n")
## 
## 
## Naive Bayes Confusion Matrix:
print(cm_nb)
##           Reference
## Prediction false true
##      false     0    0
##      true      4   38
cat("\n Naive Bayes Classification Breakdown:\n")
## 
##  Naive Bayes Classification Breakdown:
cat("True Positives (TP):", TP_nb, "\n")
## True Positives (TP): 38
cat("True Negatives (TN):", TN_nb, "\n")
## True Negatives (TN): 0
cat("False Positives (FP):", FP_nb, "\n")
## False Positives (FP): 4
cat("False Negatives (FN):", FN_nb, "\n")
## False Negatives (FN): 0