KNN & Classification Tree

Can we accuratley predict hamstring quadricep ratio in cancer patients as a means of accurate physical assesment using dual x-ray absorptiometry scans?

#Load Libraries
library(dplyr)
library(ggplot2)
library(lubridate)
library(caret)
library(gains)
library(pROC)
library(readxl)
library(stringr)
library(rpart)
library(rpart.plot)
library(forecast)

#DATA MANAGEMENT________________________________________________________________

MMF_biodex_raw<- read.csv("/Volumes/Kindl Portable /MMF/MMF Data/BIODEX Data Exports/MMF Data 9:6:22/MMF 9-6-22t.csv")
MMF_Baseline_DEXA <- read_excel("/Volumes/Kindl Portable /MMF/MMF Data/DEXA Data/MMF Baseline Data Pilot and Cohort1_tu.xlsx", 
                                sheet = "DXA_WbodyComposition Baseline")

#CLEAN AND CALCULATE HAMSTRING QUADRACEPT RATIO FROM BIODEX DATA
MMF_Filtered <- MMF_biodex_raw %>% filter(!LAST %in% c("AWB Test MCW","E4V_05_T1","GZ","LO","WP","TU","Greving","LA","M"))

#Change Test Date from CHR to Date and drop hours/minutes
MMF_Filtered$TEST.DATE <- format(as.POSIXct(MMF_Filtered$TEST.DATE,format = '%m/%d/%Y %H:%M'),format = '%m/%d/%Y')

MMF_Filtered$TEST.DATE<-mdy(MMF_Filtered$TEST.DATE) 

#Correcting miss labeled participants
MMF_Filtered$LAST <- ifelse(MMF_Filtered$LAST == "3010" & MMF_Filtered$TEST.DATE == "2020-10-09",
                            "3011"
                            ,ifelse(MMF_Filtered$LAST == "3010A",
                                    "3010",
                                    ifelse(MMF_Filtered$LAST == "MMF 1023" & MMF_Filtered$TEST.DATE == "2019-12-14",
                                           "MMF 1046",
                                           ifelse(MMF_Filtered$LAST == "MMF 1023A",
                                                  "MMF 1023",
                                                  ifelse(MMF_Filtered$LAST == "30088888","3008",
                                                         MMF_Filtered$LAST
                                                  )))))

#Remove data where peak torque was not recorded

MMF_Filtered <-MMF_Filtered %>% filter(PEAKTQ.AWY != 0| PEAKTQ.TWD != 0)

#Remove data for warm up sets

MMF_Filtered<-MMF_Filtered%>%filter(SET..%in% c(2,4,6))

#Remove un-used columns

MMF_Filtered <- subset(MMF_Filtered, select = -c(2,5,6,7,8,9,10,11,12,57))

#Make copy of filtered data

MMF <- MMF_Filtered
#Make new variable that is calculation of hamstring to quadriceps ratio
MMF$HM_QD_RATIO <- (MMF$PEAKTQ.BW.TWD/MMF$PEAKTQ.BW.AWY)

#ADD HAMSTRING QUADRACEPT RATIO TO DEXA DATA SET:
names(MMF_Baseline_DEXA)[2]<- "LAST"

#remove white spaces in "LAST' so data sets can combine
MMF$LAST<-str_replace_all(MMF$LAST," ","") 

#Merge data sets
combineddata <- merge(MMF,MMF_Baseline_DEXA,by=c("LAST"))

#Drop some more columns that we dont need
combineddata <-subset(combineddata, select = -c(50:53,90:95))

#NOW DATA SETS ARE COMBINED BUT FOR KNN AND DECISION TREE WE ONLY WANT DEXA DATA SINCE USING BIODEX DATA TO PREDICT THE HAMSTRING TO QUADRACEPT RATIO IS SELF FUFILLING:
LAST_RATIO <- select(MMF,LAST,HM_QD_RATIO)
DXA_RATIO <- merge(LAST_RATIO,MMF_Baseline_DEXA,by=c("LAST"))
DXA_RATIO <- subset(DXA_RATIO, select = -c(3:8,37:42))
DXA_RATIO$HM_QD_RATIO <- ifelse(DXA_RATIO$HM_QD_RATIO >.50,"1","0")

DXADATA <- DXA_RATIO

DXADATA$HM_QD_RATIO <-as.numeric(DXADATA$HM_QD_RATIO)
DXADATA <- subset(DXADATA, select = -c(1,31:36))

#CLEAN UP OUR WORKING DIRECTORY SO WE ONLY HAVE THE DATASET WE'LL USE FOR KNN AND DECISION TREE
rm(combineddata,DXA_RATIO,LAST_RATIO,MMF,MMF_Baseline_DEXA,MMF_biodex_raw,MMF_Filtered)

#KNN:___________________________________________________________________________

#Scale the data 
Data1 <- scale(DXADATA[1:29])

#Scaled data to data frame
Data1<-as.data.frame(Data1)
Data1$HM_QD_RATIO<- as.factor(DXADATA$HM_QD_RATIO)

#Partition the data
#Hamstring Quadriceps Ratio as a factor is our target variable
set.seed(1)
myIndex<- createDataPartition(Data1$HM_QD_RATIO, p=0.7, list=FALSE)
trainSet <- Data1[myIndex,]
validationSet <- Data1[-myIndex,]

#Train control 
myCtrl <- trainControl(method="cv", number=10)
#Cross fold validation to 10
myGrid <- expand.grid(.k=c(1:10))

set.seed(1)
KNN_fit <- train(HM_QD_RATIO ~ ., data=trainSet, method = "knn", trControl=myCtrl, tuneGrid = myGrid)
KNN_fit

## k-Nearest Neighbors 
## 
## 157 samples
##  28 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 142, 141, 141, 141, 142, 141, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa       
##    1  0.5725000   0.121666003
##    2  0.5329167   0.037759763
##    3  0.5275000   0.033315285
##    4  0.5083333   0.001008097
##    5  0.5075000   0.004868379
##    6  0.5200000   0.035461097
##    7  0.5316667   0.063493173
##    8  0.5079167   0.026788245
##    9  0.5079167   0.020373706
##   10  0.4766667  -0.045368350
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.

KNN_Class <- predict(KNN_fit, newdata = validationSet)
confusionMatrix(KNN_Class, validationSet$HM_QD_RATIO, positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 27 16
##          1 10 13
##                                           
##                Accuracy : 0.6061          
##                  95% CI : (0.4781, 0.7242)
##     No Information Rate : 0.5606          
##     P-Value [Acc > NIR] : 0.2688          
##                                           
##                   Kappa : 0.1821          
##                                           
##  Mcnemar's Test P-Value : 0.3268          
##                                           
##             Sensitivity : 0.4483          
##             Specificity : 0.7297          
##          Pos Pred Value : 0.5652          
##          Neg Pred Value : 0.6279          
##              Prevalence : 0.4394          
##          Detection Rate : 0.1970          
##    Detection Prevalence : 0.3485          
##       Balanced Accuracy : 0.5890          
##                                           
##        'Positive' Class : 1               
##

KNN_Class_prob <- predict(KNN_fit, newdata = validationSet, type='prob')
KNN_Class_prob

##            0         1
## 1  0.5000000 0.5000000
## 2  0.5000000 0.5000000
## 3  0.5000000 0.5000000
## 4  1.0000000 0.0000000
## 5  1.0000000 0.0000000
## 6  1.0000000 0.0000000
## 7  1.0000000 0.0000000
## 8  0.1666667 0.8333333
## 9  0.1666667 0.8333333
## 10 0.6666667 0.3333333
## 11 0.6666667 0.3333333
## 12 1.0000000 0.0000000
## 13 1.0000000 0.0000000
## 14 1.0000000 0.0000000
## 15 0.2500000 0.7500000
## 16 0.0000000 1.0000000
## 17 0.6000000 0.4000000
## 18 0.6000000 0.4000000
## 19 0.8333333 0.1666667
## 20 0.6666667 0.3333333
## 21 0.6666667 0.3333333
## 22 0.3333333 0.6666667
## 23 0.3333333 0.6666667
## 24 0.3333333 0.6666667
## 25 0.7500000 0.2500000
## 26 0.5555556 0.4444444
## 27 0.5555556 0.4444444
## 28 0.5555556 0.4444444
## 29 0.5555556 0.4444444
## 30 0.5555556 0.4444444
## 31 0.5555556 0.4444444
## 32 0.5555556 0.4444444
## 33 0.5555556 0.4444444
## 34 0.5555556 0.4444444
## 35 0.5000000 0.5000000
## 36 0.5000000 0.5000000
## 37 0.5000000 0.5000000
## 38 0.4000000 0.6000000
## 39 0.2500000 0.7500000
## 40 0.5000000 0.5000000
## 41 0.5000000 0.5000000
## 42 0.5000000 0.5000000
## 43 0.3750000 0.6250000
## 44 0.7142857 0.2857143
## 45 0.7142857 0.2857143
## 46 0.5000000 0.5000000
## 47 0.5000000 0.5000000
## 48 0.5000000 0.5000000
## 49 0.6666667 0.3333333
## 50 0.6666667 0.3333333
## 51 0.6666667 0.3333333
## 52 0.0000000 1.0000000
## 53 0.0000000 1.0000000
## 54 0.7142857 0.2857143
## 55 0.7142857 0.2857143
## 56 1.0000000 0.0000000
## 57 0.2857143 0.7142857
## 58 0.2857143 0.7142857
## 59 0.0000000 1.0000000
## 60 1.0000000 0.0000000
## 61 1.0000000 0.0000000
## 62 1.0000000 0.0000000
## 63 1.0000000 0.0000000
## 64 0.5000000 0.5000000
## 65 0.5000000 0.5000000
## 66 0.5000000 0.5000000

confusionMatrix(as.factor(ifelse(KNN_Class_prob[,2]>0.50, '1', '0')), 
                validationSet$HM_QD_RATIO, positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 30 21
##          1  7  8
##                                           
##                Accuracy : 0.5758          
##                  95% CI : (0.4479, 0.6966)
##     No Information Rate : 0.5606          
##     P-Value [Acc > NIR] : 0.45269         
##                                           
##                   Kappa : 0.0914          
##                                           
##  Mcnemar's Test P-Value : 0.01402         
##                                           
##             Sensitivity : 0.2759          
##             Specificity : 0.8108          
##          Pos Pred Value : 0.5333          
##          Neg Pred Value : 0.5882          
##              Prevalence : 0.4394          
##          Detection Rate : 0.1212          
##    Detection Prevalence : 0.2273          
##       Balanced Accuracy : 0.5433          
##                                           
##        'Positive' Class : 1               
##

validationSet$HM_QD_RATIO <- as.numeric(as.character(validationSet$HM_QD_RATIO))
gains_table <- gains(validationSet$HM_QD_RATIO, KNN_Class_prob[,2])
gains_table

## Depth                            Cume   Cume Pct                     Mean
##  of           Cume     Mean      Mean   of Total    Lift   Cume     Model
## File     N      N      Resp      Resp      Resp    Index   Lift     Score
## -------------------------------------------------------------------------
##    9     6      6      0.50      0.50      10.3%     114    114      0.94
##   20     7     13      0.43      0.46      20.7%      98    105      0.70
##   45    17     30      0.53      0.50      51.7%     120    114      0.51
##   59     9     39      0.11      0.41      55.2%      25     93      0.44
##   73     9     48      0.33      0.40      65.5%      76     90      0.35
##   79     4     52      0.75      0.42      75.9%     171     96      0.29
##  100    14     66      0.50      0.44     100.0%     114    100      0.03
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA
##   NA    NA     NA        NA        NA        NA%      NA     NA        NA

##Cumulative lift chart
plot(c(0, gains_table$cume.pct.of.total*sum(validationSet$HM_QD_RATIO))~c(0, gains_table$cume.obs), 
     xlab = "# of cases", 
     ylab = "Cumulative", 
     main="Cumulative Lift Chart", 
     type="l")
lines(c(0, sum(validationSet$HM_QD_RATIO))~c(0, dim(validationSet)[1]), 
      col="red", 
      lty=2)

##Decile-wise lift
barplot(gains_table$mean.resp/mean(validationSet$HM_QD_RATIO), 
        names.arg=gains_table$depth, 
        xlab="Percentile", 
        ylab="Lift", 
        ylim=c(0,3), 
        main="Decile-Wise Lift Chart")

##ROC Curve
roc_object <- roc(validationSet$HM_QD_RATIO, KNN_Class_prob[,2])
plot.roc(roc_object)

auc(roc_object)

## Area under the curve: 0.5149

#CLASSIFICATION TREE:___________________________________________________________

TreeData <- DXADATA
#Create categorical outcome w/factor
TreeData$HM_QD_RATIO <- as.factor(TreeData$HM_QD_RATIO)

data <- TreeData

set.seed(1)
myIndex <- createDataPartition(data$HM_QD_RATIO, p=0.8, list=FALSE)
trainSet <- data[myIndex,]
validationSet <- data[-myIndex,]

#Default Tree
set.seed(1)
default_tree <- rpart(HM_QD_RATIO ~., 
                      data = trainSet, 
                      method = "class")
summary(default_tree)

## Call:
## rpart(formula = HM_QD_RATIO ~ ., data = trainSet, method = "class")
##   n= 179 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.09493671      0 1.0000000 1.0000000 0.08409302
## 2 0.03797468      2 0.8101266 0.9367089 0.08339824
## 3 0.02531646      3 0.7721519 0.9873418 0.08397584
## 4 0.01000000      5 0.7215190 0.9746835 0.08384783
## 
## Variable importance
## TRUNK_PFAT L_LEG_PFAT R_LEG_PFAT  TRUNK_FAT  LARM_PFAT  LARM_LEAN SUBTOT_FAT 
##         12         10         10          9          8          8          7 
##  RARM_PFAT TRUNK_MASS   LARM_FAT   HEAD_FAT  HEAD_LEAN  HEAD_MASS  LARM_MASS 
##          6          5          4          3          3          3          3 
##   RARM_FAT  RARM_MASS  HEAD_PFAT L_LEG_LEAN R_LEG_MASS 
##          2          2          2          2          1 
## 
## Node number 1: 179 observations,    complexity param=0.09493671
##   predicted class=0  expected loss=0.4413408  P(node) =1
##     class counts:   100    79
##    probabilities: 0.559 0.441 
##   left son=2 (28 obs) right son=3 (151 obs)
##   Primary splits:
##       LARM_PFAT  < 32.95919 to the right, improve=3.422366, (0 missing)
##       R_LEG_PFAT < 33.48621 to the right, improve=2.994498, (0 missing)
##       HEAD_MASS  < 5095.034 to the left,  improve=2.483308, (0 missing)
##       HEAD_LEAN  < 3858.3   to the left,  improve=2.322997, (0 missing)
##       HEAD_FAT   < 1261.939 to the left,  improve=1.963809, (0 missing)
##   Surrogate splits:
##       R_LEG_PFAT < 33.48621 to the right, agree=0.950, adj=0.679, (0 split)
##       TRUNK_PFAT < 37.54768 to the right, agree=0.944, adj=0.643, (0 split)
##       RARM_PFAT  < 29.87553 to the right, agree=0.927, adj=0.536, (0 split)
##       L_LEG_PFAT < 34.85107 to the right, agree=0.922, adj=0.500, (0 split)
##       LARM_LEAN  < 3358.73  to the left,  agree=0.911, adj=0.429, (0 split)
## 
## Node number 2: 28 observations
##   predicted class=0  expected loss=0.2142857  P(node) =0.1564246
##     class counts:    22     6
##    probabilities: 0.786 0.214 
## 
## Node number 3: 151 observations,    complexity param=0.09493671
##   predicted class=0  expected loss=0.4834437  P(node) =0.8435754
##     class counts:    78    73
##    probabilities: 0.517 0.483 
##   left son=6 (76 obs) right son=7 (75 obs)
##   Primary splits:
##       TRUNK_PFAT < 27.74399 to the left,  improve=4.048797, (0 missing)
##       HEAD_LEAN  < 3833.429 to the left,  improve=2.481704, (0 missing)
##       HEAD_MASS  < 5054.295 to the left,  improve=2.481704, (0 missing)
##       LARM_LEAN  < 4330.712 to the right, improve=2.387011, (0 missing)
##       HEAD_PFAT  < 24.81534 to the left,  improve=2.323573, (0 missing)
##   Surrogate splits:
##       TRUNK_FAT  < 12411.22 to the left,  agree=0.934, adj=0.867, (0 split)
##       SUBTOT_FAT < 25799.2  to the left,  agree=0.848, adj=0.693, (0 split)
##       L_LEG_PFAT < 26.6037  to the left,  agree=0.841, adj=0.680, (0 split)
##       R_LEG_PFAT < 26.30059 to the left,  agree=0.821, adj=0.640, (0 split)
##       TRUNK_MASS < 45188.76 to the left,  agree=0.815, adj=0.627, (0 split)
## 
## Node number 6: 76 observations,    complexity param=0.02531646
##   predicted class=0  expected loss=0.3684211  P(node) =0.424581
##     class counts:    48    28
##    probabilities: 0.632 0.368 
##   left son=12 (42 obs) right son=13 (34 obs)
##   Primary splits:
##       TRUNK_FAT  < 11216.93 to the right, improve=1.284387, (0 missing)
##       TRUNK_MASS < 43326.17 to the right, improve=1.284387, (0 missing)
##       LARM_LEAN  < 4233.126 to the right, improve=1.214575, (0 missing)
##       TRUNK_LEAN < 32041.47 to the right, improve=1.195171, (0 missing)
##       LARM_MASS  < 5725.131 to the right, improve=1.070613, (0 missing)
##   Surrogate splits:
##       LARM_FAT   < 1473.992 to the right, agree=0.868, adj=0.706, (0 split)
##       L_LEG_PFAT < 23.88228 to the right, agree=0.803, adj=0.559, (0 split)
##       SUBTOT_FAT < 21460.09 to the right, agree=0.803, adj=0.559, (0 split)
##       R_LEG_MASS < 16013.72 to the right, agree=0.789, adj=0.529, (0 split)
##       LARM_PFAT  < 20.72244 to the right, agree=0.763, adj=0.471, (0 split)
## 
## Node number 7: 75 observations,    complexity param=0.03797468
##   predicted class=1  expected loss=0.4  P(node) =0.4189944
##     class counts:    30    45
##    probabilities: 0.400 0.600 
##   left son=14 (11 obs) right son=15 (64 obs)
##   Primary splits:
##       LARM_LEAN < 4829.499 to the right, improve=1.4403410, (0 missing)
##       LARM_MASS < 7090.864 to the right, improve=1.4403410, (0 missing)
##       HEAD_PFAT < 25.38827 to the left,  improve=1.0210080, (0 missing)
##       RARM_LEAN < 3912.856 to the right, improve=0.9230769, (0 missing)
##       TRUNK_FAT < 12020.82 to the right, improve=0.9230769, (0 missing)
##   Surrogate splits:
##       LARM_MASS < 7090.864 to the right, agree=1.00, adj=1.000, (0 split)
##       LARM_FAT  < 2270.908 to the right, agree=0.96, adj=0.727, (0 split)
##       RARM_FAT  < 2332.712 to the right, agree=0.96, adj=0.727, (0 split)
##       RARM_MASS < 7592.971 to the right, agree=0.96, adj=0.727, (0 split)
##       RARM_PFAT < 31.79057 to the right, agree=0.96, adj=0.727, (0 split)
## 
## Node number 12: 42 observations
##   predicted class=0  expected loss=0.2857143  P(node) =0.2346369
##     class counts:    30    12
##    probabilities: 0.714 0.286 
## 
## Node number 13: 34 observations,    complexity param=0.02531646
##   predicted class=0  expected loss=0.4705882  P(node) =0.1899441
##     class counts:    18    16
##    probabilities: 0.529 0.471 
##   left son=26 (26 obs) right son=27 (8 obs)
##   Primary splits:
##       HEAD_FAT   < 1348.528 to the left,  improve=1.6334840, (0 missing)
##       HEAD_LEAN  < 4168.313 to the left,  improve=1.6334840, (0 missing)
##       HEAD_MASS  < 5516.551 to the left,  improve=1.6334840, (0 missing)
##       L_LEG_MASS < 14814.73 to the right, improve=1.4126050, (0 missing)
##       HEAD_PFAT  < 24.36065 to the right, improve=0.9411765, (0 missing)
##   Surrogate splits:
##       HEAD_LEAN  < 4168.313 to the left,  agree=1.000, adj=1.000, (0 split)
##       HEAD_MASS  < 5516.551 to the left,  agree=1.000, adj=1.000, (0 split)
##       HEAD_PFAT  < 24.56043 to the left,  agree=0.912, adj=0.625, (0 split)
##       LARM_LEAN  < 3396.856 to the right, agree=0.912, adj=0.625, (0 split)
##       L_LEG_LEAN < 9262.734 to the right, agree=0.912, adj=0.625, (0 split)
## 
## Node number 14: 11 observations
##   predicted class=0  expected loss=0.3636364  P(node) =0.06145251
##     class counts:     7     4
##    probabilities: 0.636 0.364 
## 
## Node number 15: 64 observations
##   predicted class=1  expected loss=0.359375  P(node) =0.3575419
##     class counts:    23    41
##    probabilities: 0.359 0.641 
## 
## Node number 26: 26 observations
##   predicted class=0  expected loss=0.3846154  P(node) =0.1452514
##     class counts:    16    10
##    probabilities: 0.615 0.385 
## 
## Node number 27: 8 observations
##   predicted class=1  expected loss=0.25  P(node) =0.04469274
##     class counts:     2     6
##    probabilities: 0.250 0.750

prp(default_tree, 
    type = 1, 
    extra = 1, 
    under = TRUE)

#Full Tree:
set.seed(1)
full_tree <- rpart(HM_QD_RATIO ~ ., 
                   data = trainSet, 
                   method = "class", 
                   cp = 0, 
                   minsplit = 2, 
                   minbucket = 1)
prp(full_tree, 
    type = 1, 
    extra = 1, 
    under = TRUE)

printcp(full_tree)

## 
## Classification tree:
## rpart(formula = HM_QD_RATIO ~ ., data = trainSet, method = "class", 
##     cp = 0, minsplit = 2, minbucket = 1)
## 
## Variables actually used in tree construction:
## [1] HEAD_FAT   L_LEG_LEAN LARM_LEAN  LARM_PFAT  TRUNK_FAT  TRUNK_PFAT
## 
## Root node error: 79/179 = 0.44134
## 
## n= 179 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.094937      0   1.00000 1.00000 0.084093
## 2 0.037975      2   0.81013 0.93671 0.083398
## 3 0.025316      3   0.77215 1.00000 0.084093
## 4 0.000000      6   0.69620 0.97468 0.083848

pruned_tree <- prune(full_tree, cp = 0.030)
prp(pruned_tree, 
    type = 1, 
    extra = 1, 
    under = TRUE)

predicted_class <- predict(pruned_tree, validationSet, type = "class")

confusionMatrix(predicted_class, validationSet$HM_QD_RATIO, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 18 13
##          1  7  6
##                                           
##                Accuracy : 0.5455          
##                  95% CI : (0.3885, 0.6961)
##     No Information Rate : 0.5682          
##     P-Value [Acc > NIR] : 0.6777          
##                                           
##                   Kappa : 0.0372          
##                                           
##  Mcnemar's Test P-Value : 0.2636          
##                                           
##             Sensitivity : 0.3158          
##             Specificity : 0.7200          
##          Pos Pred Value : 0.4615          
##          Neg Pred Value : 0.5806          
##              Prevalence : 0.4318          
##          Detection Rate : 0.1364          
##    Detection Prevalence : 0.2955          
##       Balanced Accuracy : 0.5179          
##                                           
##        'Positive' Class : 1               
##

length(which(data$HM_QD_RATIO=="1"))

## [1] 98

predicted_prob <- predict(pruned_tree, validationSet, type= 'prob')
head(predicted_prob)

##            0         1
## 3  0.3593750 0.6406250
## 7  0.3593750 0.6406250
## 8  0.6315789 0.3684211
## 9  0.6315789 0.3684211
## 10 0.6315789 0.3684211
## 15 0.3593750 0.6406250

confusionMatrix(as.factor(ifelse(predicted_prob[,2]>0.26, '1', '0')), 
                validationSet$HM_QD_RATIO, positive = '1')

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  4  2
##          1 21 17
##                                           
##                Accuracy : 0.4773          
##                  95% CI : (0.3246, 0.6331)
##     No Information Rate : 0.5682          
##     P-Value [Acc > NIR] : 0.9140112       
##                                           
##                   Kappa : 0.0489          
##                                           
##  Mcnemar's Test P-Value : 0.0001746       
##                                           
##             Sensitivity : 0.8947          
##             Specificity : 0.1600          
##          Pos Pred Value : 0.4474          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.4318          
##          Detection Rate : 0.3864          
##    Detection Prevalence : 0.8636          
##       Balanced Accuracy : 0.5274          
##                                           
##        'Positive' Class : 1               
##

validationSet$HM_QD_RATIO <- as.numeric(as.character(validationSet$HM_QD_RATIO))
gains_table <- gains(validationSet$HM_QD_RATIO, predicted_prob[,2])
gains_table

## Depth                            Cume   Cume Pct                     Mean
##  of           Cume     Mean      Mean   of Total    Lift   Cume     Model
## File     N      N      Resp      Resp      Resp    Index   Lift     Score
## -------------------------------------------------------------------------
##   30    13     13      0.46      0.46      31.6%     107    107      0.64
##   84    24     37      0.42      0.43      84.2%      96    100      0.37
##   86     1     38      1.00      0.45      89.5%     232    104      0.36
##  100     6     44      0.33      0.43     100.0%      77    100      0.21

##LIFT CHART
plot(c(0, gains_table$cume.pct.of.total*sum(validationSet$HM_QD_RATIO)) ~ c(0, gains_table$cume.obs), 
     xlab = '# of cases', 
     ylab = "Cumulative", 
     type = "l")
lines(c(0, sum(validationSet$HM_QD_RATIO))~c(0, dim(validationSet)[1]), 
      col="red", 
      lty=2)

##DECILE-WISE LIFT CHART
barplot(gains_table$mean.resp/mean(validationSet$HM_QD_RATIO), 
        names.arg=gains_table$depth, 
        xlab="Percentile", 
        ylab="Lift", 
        ylim=c(0, 3.0), 
        main="Decile-Wise Lift Chart")

##RECEIVER OPERATOR CURVE
roc_object <- roc(validationSet$HM_QD_RATIO, predicted_prob[,2])
plot.roc(roc_object)

auc(roc_object)

## Area under the curve: 0.52

KNN & Classification Tree

Zak Kindl

Can we accuratley predict hamstring quadricep ratio in cancer patients as a means of accurate physical assesment using dual x-ray absorptiometry scans?