Week 10 Lab Practice

# Week 10 Lab Exercise
#Download the LabW9 dataset from spectrum.
#Load the dataset and appropriate packages.
library(readxl)
df1 <- read_excel("labW9.xlsx",1)
View(df1)

#any(grepl("caret", installed.packages())) 
#install.packages("caret")
#install.packages("klaR")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(klaR)
## Loading required package: MASS
#Conduct data exploration and checking and cleaning if necessary
names(df1)
## [1] "Pregnancies"              "Glucose"                 
## [3] "BloodPressure"            "SkinThickness"           
## [5] "Insulin"                  "BMI"                     
## [7] "DiabetesPedigreeFunction" "Age"                     
## [9] "Outcome"
nrow(df1)
## [1] 768
ncol(df1)
## [1] 9
length(df1)
## [1] 9
str(df1)
## tibble [768 x 9] (S3: tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
summary(df1)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
head(df1)
## # A tibble: 6 x 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI DiabetesPedigre~
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>            <dbl>
## 1           6     148            72            35       0  33.6            0.627
## 2           1      85            66            29       0  26.6            0.351
## 3           8     183            64             0       0  23.3            0.672
## 4           1      89            66            23      94  28.1            0.167
## 5           0     137            40            35     168  43.1            2.29 
## 6           5     116            74             0       0  25.6            0.201
## # ... with 2 more variables: Age <dbl>, Outcome <dbl>
tail(df1)
## # A tibble: 6 x 9
##   Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI DiabetesPedigre~
##         <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>            <dbl>
## 1           9      89            62             0       0  22.5            0.142
## 2          10     101            76            48     180  32.9            0.171
## 3           2     122            70            27       0  36.8            0.34 
## 4           5     121            72            23     112  26.2            0.245
## 5           1     126            60             0       0  30.1            0.349
## 6           1      93            70            31       0  30.4            0.315
## # ... with 2 more variables: Age <dbl>, Outcome <dbl>
sum(is.na(df1)) #no missing value is found
## [1] 0
#Partition data  70/30 using any method you feel comfortable with
split=0.70  # define an 70%/30% train/test split of the dataset
inTraining <- createDataPartition(df1$Outcome, p=split, list=FALSE)
training <- df1[ inTraining,]
testing <- df1[-inTraining,]

#Check both your training and test subsets
str(training)
## tibble [538 x 9] (S3: tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:538] 6 3 2 8 10 5 0 1 3 8 ...
##  $ Glucose                 : num [1:538] 148 78 197 125 168 166 118 103 126 99 ...
##  $ BloodPressure           : num [1:538] 72 50 70 96 74 72 84 30 88 84 ...
##  $ SkinThickness           : num [1:538] 35 32 45 0 0 19 47 38 41 0 ...
##  $ Insulin                 : num [1:538] 0 88 543 0 0 175 230 83 235 0 ...
##  $ BMI                     : num [1:538] 33.6 31 30.5 0 38 25.8 45.8 43.3 39.3 35.4 ...
##  $ DiabetesPedigreeFunction: num [1:538] 0.627 0.248 0.158 0.232 0.537 0.587 0.551 0.183 0.704 0.388 ...
##  $ Age                     : num [1:538] 50 26 53 54 34 51 31 33 27 50 ...
##  $ Outcome                 : num [1:538] 1 1 1 1 1 1 1 0 0 0 ...
str(testing)
## tibble [230 x 9] (S3: tbl_df/tbl/data.frame)
##  $ Pregnancies             : num [1:230] 1 8 1 0 5 10 4 10 1 7 ...
##  $ Glucose                 : num [1:230] 85 183 89 137 116 115 110 139 189 100 ...
##  $ BloodPressure           : num [1:230] 66 64 66 40 74 0 92 80 60 0 ...
##  $ SkinThickness           : num [1:230] 29 0 23 35 0 0 0 0 23 0 ...
##  $ Insulin                 : num [1:230] 0 0 94 168 0 0 0 0 846 0 ...
##  $ BMI                     : num [1:230] 26.6 23.3 28.1 43.1 25.6 35.3 37.6 27.1 30.1 30 ...
##  $ DiabetesPedigreeFunction: num [1:230] 0.351 0.672 0.167 2.288 0.201 ...
##  $ Age                     : num [1:230] 31 32 21 33 30 29 30 57 59 32 ...
##  $ Outcome                 : num [1:230] 0 1 0 1 0 0 0 0 1 1 ...
#Check for cross validation if the model allows for it
train_control <- trainControl(method="cv", number = 10)
train_control
## $method
## [1] "cv"
## 
## $number
## [1] 10
## 
## $repeats
## [1] NA
## 
## $search
## [1] "grid"
## 
## $p
## [1] 0.75
## 
## $initialWindow
## NULL
## 
## $horizon
## [1] 1
## 
## $fixedWindow
## [1] TRUE
## 
## $skip
## [1] 0
## 
## $verboseIter
## [1] FALSE
## 
## $returnData
## [1] TRUE
## 
## $returnResamp
## [1] "final"
## 
## $savePredictions
## [1] FALSE
## 
## $classProbs
## [1] FALSE
## 
## $summaryFunction
## function (data, lev = NULL, model = NULL) 
## {
##     if (is.character(data$obs)) 
##         data$obs <- factor(data$obs, levels = lev)
##     postResample(data[, "pred"], data[, "obs"])
## }
## <bytecode: 0x000000003014b638>
## <environment: namespace:caret>
## 
## $selectionFunction
## [1] "best"
## 
## $preProcOptions
## $preProcOptions$thresh
## [1] 0.95
## 
## $preProcOptions$ICAcomp
## [1] 3
## 
## $preProcOptions$k
## [1] 5
## 
## $preProcOptions$freqCut
## [1] 19
## 
## $preProcOptions$uniqueCut
## [1] 10
## 
## $preProcOptions$cutoff
## [1] 0.9
## 
## 
## $sampling
## NULL
## 
## $index
## NULL
## 
## $indexOut
## NULL
## 
## $indexFinal
## NULL
## 
## $timingSamps
## [1] 0
## 
## $predictionBounds
## [1] FALSE FALSE
## 
## $seeds
## [1] NA
## 
## $adaptive
## $adaptive$min
## [1] 5
## 
## $adaptive$alpha
## [1] 0.05
## 
## $adaptive$method
## [1] "gls"
## 
## $adaptive$complete
## [1] TRUE
## 
## 
## $trim
## [1] FALSE
## 
## $allowParallel
## [1] TRUE
#Train your test data using KNN model
set.seed(123456) #to make sure that we get the same results

KNN <- train(factor(Outcome)~., data=df1, trControl=train_control, method="knn")
KNN
## k-Nearest Neighbors 
## 
## 768 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 691, 692, 691, 691, 691, 691, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.7201128  0.3589603
##   7  0.7278537  0.3770873
##   9  0.7395420  0.4017339
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
#Plot your model
plot(KNN) #The plot showed that when k=9, the accuracy is the highest, 0.739

#Predict using your test data onto your model
predictions<-predict(KNN, newdata = testing)
predictions  
##   [1] 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0
##  [75] 1 1 0 0 0 1 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 0 0
## [112] 0 0 1 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## [149] 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 1
## [186] 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 1 0 0 1 0 1 0 0 1 1 0
## [223] 1 0 1 0 0 0 0 1
## Levels: 0 1
table(predictions)
## predictions
##   0   1 
## 164  66
plot(predictions)

#Evaluate outcome using confusion matrix
confusion_matrix<-confusionMatrix(predictions,as.factor(testing$Outcome))
confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 134  30
##          1  14  52
##                                           
##                Accuracy : 0.8087          
##                  95% CI : (0.7518, 0.8574)
##     No Information Rate : 0.6435          
##     P-Value [Acc > NIR] : 3.116e-08       
##                                           
##                   Kappa : 0.5641          
##                                           
##  Mcnemar's Test P-Value : 0.02374         
##                                           
##             Sensitivity : 0.9054          
##             Specificity : 0.6341          
##          Pos Pred Value : 0.8171          
##          Neg Pred Value : 0.7879          
##              Prevalence : 0.6435          
##          Detection Rate : 0.5826          
##    Detection Prevalence : 0.7130          
##       Balanced Accuracy : 0.7698          
##                                           
##        'Positive' Class : 0               
## 
#Accuracy of the model is 81%.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.