Problem #1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
##
## element
## Loading required package: lattice
data <- read.csv("~/Downloads/kits21_kidney_dataset_NI_augmented.csv")
continuous_data <- data %>% select(age_at_nephrectomy, body_mass_index, radiographic_size)
continuous_data_summary <- continuous_data %>% summarise(
mean_age = mean(age_at_nephrectomy, na.rm = TRUE),
sd_age = sd(age_at_nephrectomy, na.rm = TRUE),
max_age = max(age_at_nephrectomy, na.rm = TRUE),
min_age = min(age_at_nephrectomy, na.rm = TRUE),
range_age = max(age_at_nephrectomy, na.rm = TRUE) - min(age_at_nephrectomy, na.rm = TRUE),
mean_bmi = mean(body_mass_index, na.rm = TRUE),
sd_bmi = sd(body_mass_index, na.rm = TRUE),
max_bmi = max(body_mass_index, na.rm = TRUE),
min_bmi = min(body_mass_index, na.rm = TRUE),
range_bmi = max(body_mass_index, na.rm = TRUE) - min(body_mass_index, na.rm = TRUE),
mean_size = mean(radiographic_size, na.rm = TRUE),
sd_size = sd(radiographic_size, na.rm = TRUE),
max_size = max(radiographic_size, na.rm = TRUE),
min_size = min(radiographic_size, na.rm = TRUE),
range_size = max(radiographic_size, na.rm = TRUE) - min(radiographic_size, na.rm = TRUE),
)
#Continuous data summary
continuous_data_summary## mean_age sd_age max_age min_age range_age mean_bmi sd_bmi max_bmi min_bmi
## 1 58.35238 14.38798 90 1 89 31.17205 6.463543 49.61 16.2
## range_bmi mean_size sd_size max_size min_size range_size
## 1 33.41 4.771667 3.101886 16.2 1.2 15
categorical_data <- data %>% select(gender, smoking_history)
#Gender summary
categorical_data %>% count(gender) %>% mutate(Percent = round(n / sum(n) * 100, 2))## gender n Percent
## 1 female 87 41.43
## 2 male 123 58.57
#Smoking summary
categorical_data %>% count(smoking_history) %>% mutate(Percent = round(n / sum(n) * 100, 2))## smoking_history n Percent
## 1 current_smoker 34 16.19
## 2 never_smoked 100 47.62
## 3 previous_smoker 76 36.19
set.seed(42)
train_idx <- sample(1:nrow(data), size = 0.8 * nrow(data))
train_set <- data[train_idx, ]
test_set <- data[-train_idx, ]
model <- glm(
vital_days_after_surgery ~ age_at_nephrectomy + gender + body_mass_index +
smoking_history + radiographic_size + surgery_type +
surgical_approach + TumorVolume + KidneyVolume,
data = train_set,
family = gaussian()
)
summary(model)##
## Call:
## glm(formula = vital_days_after_surgery ~ age_at_nephrectomy +
## gender + body_mass_index + smoking_history + radiographic_size +
## surgery_type + surgical_approach + TumorVolume + KidneyVolume,
## family = gaussian(), data = train_set)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.209e+03 5.194e+02 2.328 0.02121 *
## age_at_nephrectomy -7.032e+00 4.492e+00 -1.566 0.11949
## gendermale -6.843e+01 1.409e+02 -0.486 0.62788
## body_mass_index 1.231e+00 1.082e+01 0.114 0.90958
## smoking_historynever_smoked 1.772e+02 1.815e+02 0.977 0.33028
## smoking_historyprevious_smoker 5.384e+02 1.950e+02 2.761 0.00645 **
## radiographic_size -7.061e-01 4.258e+01 -0.017 0.98679
## surgery_typeopen 1.020e+02 2.355e+02 0.433 0.66563
## surgery_typerobotic -5.787e+01 2.129e+02 -0.272 0.78617
## surgical_approachTransperitoneal 1.735e+02 1.854e+02 0.936 0.35063
## TumorVolume 6.664e-04 7.987e-04 0.834 0.40537
## KidneyVolume -6.399e-04 7.062e-04 -0.906 0.36628
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 595290)
##
## Null deviance: 101591646 on 167 degrees of freedom
## Residual deviance: 92865235 on 156 degrees of freedom
## AIC: 2724.2
##
## Number of Fisher Scoring iterations: 2
## 2.5 % 97.5 %
## (Intercept) 1.910469e+02 2.227097e+03
## age_at_nephrectomy -1.583585e+01 1.771789e+00
## gendermale -3.445731e+02 2.077180e+02
## body_mass_index -1.997135e+01 2.243235e+01
## smoking_historynever_smoked -1.784428e+02 5.328767e+02
## smoking_historyprevious_smoker 1.562662e+02 9.205747e+02
## radiographic_size -8.416836e+01 8.275617e+01
## surgery_typeopen -3.595544e+02 5.634581e+02
## surgery_typerobotic -4.752317e+02 3.594969e+02
## surgical_approachTransperitoneal -1.897594e+02 5.368061e+02
## TumorVolume -8.990429e-04 2.231797e-03
## KidneyVolume -2.024009e-03 7.442365e-04
model_summary <- summary(model)
coef_table <- model_summary$coefficients
ci <- confint.default(model)
results <- as.data.frame(cbind(coef_table, ci))
colnames(results) <- c("Estimate", "Std_Error", "t_value", "p_value", "CI_lower", "CI_upper")
results$Significant <- results$p_value < 0.05
print(results)## Estimate Std_Error t_value
## (Intercept) 1.209072e+03 5.194101e+02 2.32777907
## age_at_nephrectomy -7.032033e+00 4.491828e+00 -1.56551686
## gendermale -6.842758e+01 1.408932e+02 -0.48566995
## body_mass_index 1.230503e+00 1.081747e+01 0.11375145
## smoking_historynever_smoked 1.772170e+02 1.814624e+02 0.97660428
## smoking_historyprevious_smoker 5.384205e+02 1.949802e+02 2.76141031
## radiographic_size -7.060941e-01 4.258357e+01 -0.01658137
## surgery_typeopen 1.019518e+02 2.354667e+02 0.43297777
## surgery_typerobotic -5.786740e+01 2.129449e+02 -0.27174825
## surgical_approachTransperitoneal 1.735234e+02 1.853517e+02 0.93618412
## TumorVolume 6.663769e-04 7.986982e-04 0.83432876
## KidneyVolume -6.398864e-04 7.061981e-04 -0.90610043
## p_value CI_lower CI_upper
## (Intercept) 0.02120849 1.910469e+02 2.227097e+03
## age_at_nephrectomy 0.11948804 -1.583585e+01 1.771789e+00
## gendermale 0.62788263 -3.445731e+02 2.077180e+02
## body_mass_index 0.90958109 -1.997135e+01 2.243235e+01
## smoking_historynever_smoked 0.33027710 -1.784428e+02 5.328767e+02
## smoking_historyprevious_smoker 0.00644616 1.562662e+02 9.205747e+02
## radiographic_size 0.98679177 -8.416836e+01 8.275617e+01
## surgery_typeopen 0.66562896 -3.595544e+02 5.634581e+02
## surgery_typerobotic 0.78617490 -4.752317e+02 3.594969e+02
## surgical_approachTransperitoneal 0.35062540 -1.897594e+02 5.368061e+02
## TumorVolume 0.40537155 -8.990429e-04 2.231797e-03
## KidneyVolume 0.36628008 -2.024009e-03 7.442365e-04
## Significant
## (Intercept) TRUE
## age_at_nephrectomy FALSE
## gendermale FALSE
## body_mass_index FALSE
## smoking_historynever_smoked FALSE
## smoking_historyprevious_smoker TRUE
## radiographic_size FALSE
## surgery_typeopen FALSE
## surgery_typerobotic FALSE
## surgical_approachTransperitoneal FALSE
## TumorVolume FALSE
## KidneyVolume FALSE
predictions <- predict(model, newdata = test_set)
actual <- test_set$vital_days_after_surgery
correlation <- cor(predictions, actual, use = "complete.obs")
correlation## [1] 0.2515751
## [1] 753.2357
## [1] 0.06329005
reduced_model <- glm(
vital_days_after_surgery ~ age_at_nephrectomy + gender + body_mass_index +
smoking_history + radiographic_size + surgery_type +
surgical_approach,
data = train_set,
family = gaussian()
)
AIC_full <- AIC(model)
AIC_reduced <- AIC(reduced_model)
AIC_full## [1] 2724.176
## [1] 2721.107
## [1] "X"
## [2] "case_id"
## [3] "age_at_nephrectomy"
## [4] "gender"
## [5] "body_mass_index"
## [6] "smoking_history"
## [7] "age_when_quit_smoking"
## [8] "pack_years"
## [9] "chewing_tobacco_use"
## [10] "alcohol_use"
## [11] "intraoperative_complications.blood_transfusion"
## [12] "intraoperative_complications.injury_to_surrounding_organ"
## [13] "intraoperative_complications.cardiac_event"
## [14] "hospitalization"
## [15] "ischemia_time"
## [16] "radiographic_size"
## [17] "pathologic_size"
## [18] "malignant"
## [19] "pathology_t_stage"
## [20] "pathology_n_stage"
## [21] "pathology_m_stage"
## [22] "tumor_histologic_subtype"
## [23] "tumor_necrosis"
## [24] "tumor_isup_grade"
## [25] "clavien_surgical_complications"
## [26] "er_visit"
## [27] "readmission"
## [28] "estimated_blood_loss"
## [29] "surgery_type"
## [30] "surgical_procedure"
## [31] "surgical_approach"
## [32] "operative_time"
## [33] "cytoreductive"
## [34] "positive_resection_margins"
## [35] "last_preop_egfr.value"
## [36] "last_preop_egfr.days_before_nephrectomy"
## [37] "first_postop_egfr.value"
## [38] "first_postop_egfr.days_before_nephrectomy"
## [39] "last_postop_egfr.value"
## [40] "last_postop_egfr.days_before_nephrectomy"
## [41] "vital_status"
## [42] "vital_days_after_surgery"
## [43] "voxel_spacing.x_spacing"
## [44] "voxel_spacing.y_spacing"
## [45] "voxel_spacing.z_spacing"
## [46] "KidneyVolume"
## [47] "TumorVolume"
data$malignant <- as.factor(data$malignant)
train_set$malignant <- data$malignant[train_idx]
test_set$malignant <- data$malignant[-train_idx]
features <- c("age_at_nephrectomy", "body_mass_index", "radiographic_size",
"TumorVolume", "KidneyVolume")
train_x <- train_set[, features]
test_x <- test_set[, features]
train_y <- factor(train_set$malignant, levels = c("false", "true"))
test_y <- factor(test_set$malignant, levels = c("false", "true"))
normalize <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
train_x <- as.data.frame(lapply(train_x, normalize))
test_x <- as.data.frame(lapply(test_x, normalize))
k_values <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
accuracy_results <- c()
for (k in k_values) {
pred <- knn(train = train_x, test = test_x, cl = train_y, k = k)
acc <- mean(pred == test_y)
accuracy_results <- c(accuracy_results, acc)
}
data.frame(k = k_values, Accuracy = accuracy_results)## k Accuracy
## 1 1 0.8333333
## 2 2 0.8809524
## 3 3 0.9047619
## 4 4 0.9047619
## 5 5 0.9047619
## 6 6 0.9047619
## 7 7 0.9047619
## 8 8 0.9047619
## 9 9 0.9047619
## 10 10 0.9047619
## [1] 3
knn_pred <- knn(train = train_x, test = test_x, cl = train_y, k = best_k)
nb_model <- naiveBayes(malignant ~ ., data = train_set[, c(features, "malignant")])
nb_pred <- predict(nb_model, test_set[, features])
nb_pred <- factor(nb_pred, levels = levels(test_y))
conf_knn <- confusionMatrix(knn_pred, test_y, positive = "true")
conf_knn## Confusion Matrix and Statistics
##
## Reference
## Prediction false true
## false 0 0
## true 4 38
##
## Accuracy : 0.9048
## 95% CI : (0.7738, 0.9734)
## No Information Rate : 0.9048
## P-Value [Acc > NIR] : 0.6290
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9048
## Neg Pred Value : NaN
## Prevalence : 0.9048
## Detection Rate : 0.9048
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : true
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction false true
## false 0 0
## true 4 38
##
## Accuracy : 0.9048
## 95% CI : (0.7738, 0.9734)
## No Information Rate : 0.9048
## P-Value [Acc > NIR] : 0.6290
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9048
## Neg Pred Value : NaN
## Prevalence : 0.9048
## Detection Rate : 0.9048
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : true
##
knn_prob <- attr(knn(train_x, test_x, train_y, k = best_k, prob = TRUE), "prob")
# Convert to probability of "malignant"
knn_prob <- ifelse(knn_pred == "true", knn_prob, 1 - knn_prob)
roc_knn <- roc(test_y, knn_prob)## Setting levels: control = false, case = true
## Setting direction: controls < cases
## Area under the curve: 0.4934
nb_prob <- predict(nb_model, test_set[, features], type = "raw")[, "true"]
roc_nb <- roc(test_y, nb_prob)## Setting levels: control = false, case = true
## Setting direction: controls < cases
## Area under the curve: 0.5658
# kNN
cm <- conf_knn$table
TP_knn <- cm["true", "true"]
TN_knn <- cm["false", "false"]
FP_knn <- cm["true", "false"]
FN_knn <- cm["false", "true"]
# Naive Bayes
cm <- conf_nb$table
TP_nb <- cm["true", "true"]
TN_nb <- cm["false", "false"]
FP_nb <- cm["true", "false"]
FN_nb <- cm["false", "true"]
#the following shows the labeling of True positives, negatives, etc for the KNN model
cm_knn <- conf_knn$table
cm_knn## Reference
## Prediction false true
## false 0 0
## true 4 38
#the following shows the labeling of True positives, negatives, etc for the Naive Bayes model
cm_nb <- conf_nb$table
cm_nb## Reference
## Prediction false true
## false 0 0
## true 4 38
## k-NN Confusion Matrix:
## Reference
## Prediction false true
## false 0 0
## true 4 38
##
## k-NN Classification Breakdown:
## True Positives (TP): 38
## True Negatives (TN): 0
## False Positives (FP): 4
## False Negatives (FN): 0
##
##
## Naive Bayes Confusion Matrix:
## Reference
## Prediction false true
## false 0 0
## true 4 38
##
## Naive Bayes Classification Breakdown:
## True Positives (TP): 38
## True Negatives (TN): 0
## False Positives (FP): 4
## False Negatives (FN): 0