R Notebook

# Attach caret (partitioning, training and evaluation helpers used below).
library(caret)
# Restore the objects saved in cdc.Rdata; the code below uses the `cdc` data frame.
load("~/Dropbox/RProjects/Module 8/cdc.Rdata")

## Loading required package: lattice

## Loading required package: ggplot2

Logistic Regression: Predicting smoke100

# Reload the data and prepare an 80/20 train/test split on the smoke100 outcome.
load("~/Dropbox/RProjects/Module 8/cdc.Rdata")
# Convert the outcome to a factor so caret fits a classification model.
cdc$smoke100 <- factor(cdc$smoke100)

# Fix the RNG seed so the same rows are selected on every run.
set.seed(2345)
# list = FALSE returns the selected row indices as a matrix instead of a list,
# so they can be used directly for row subsetting. Spelled out as FALSE because
# the shorthand `F` is an ordinary variable that can be reassigned.
cdcsample <- createDataPartition(cdc$smoke100, p = 0.8, list = FALSE)
traind <- cdc[cdcsample, ]
testd <- cdc[-cdcsample, ]

# Class counts in the training partition.
table(traind$smoke100)

## 
##    0    1 
## 8448 7553

# Class counts in the held-out test partition.
table(testd[["smoke100"]])

## 
##    0    1 
## 2111 1888

Logistic Regression Model

# Resampling scheme passed to train(): 10-fold cross-validation.
tc <- trainControl(method = "cv", number = 10)

# Binomial glm (logistic regression) of smoke100 on all other columns of traind.
modsmoke1 <- train(
  smoke100 ~ .,
  method = "glm",
  family = "binomial",
  trControl = tc,
  data = traind
)

# Coefficient estimates and deviance statistics for the fitted model.
summary(modsmoke1)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8592  -1.1018  -0.8454   1.1771   1.7207  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -2.9152179  0.4221218  -6.906 4.98e-12 ***
## `genhlthvery good`  0.3372864  0.0438739   7.688 1.50e-14 ***
## genhlthgood         0.5059594  0.0464814  10.885  < 2e-16 ***
## genhlthfair         0.6167228  0.0639113   9.650  < 2e-16 ***
## genhlthpoor         1.0282411  0.1015420  10.126  < 2e-16 ***
## exerany            -0.0374330  0.0384678  -0.973 0.330505    
## hlthplan           -0.2967520  0.0499693  -5.939 2.87e-09 ***
## height              0.0445117  0.0067662   6.579 4.75e-11 ***
## weight              0.0002635  0.0007133   0.369 0.711787    
## wtdesire           -0.0045067  0.0012277  -3.671 0.000242 ***
## age                 0.0132907  0.0010089  13.173  < 2e-16 ***
## genderf            -0.3977519  0.0506079  -7.859 3.86e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22132  on 16000  degrees of freedom
## Residual deviance: 21448  on 15989  degrees of freedom
## AIC: 21472
## 
## Number of Fisher Scoring iterations: 4

Check Logistic Performance

# Predict the class of each held-out row with the logistic model.
PredMod1 <- predict(modsmoke1, newdata = testd)
# Cross-tabulate predictions against the observed outcome. No `positive`
# argument is given, and the printed result reports level "0" as the positive class.
confusionMatrix(data = PredMod1, reference = testd$smoke100)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1457 1031
##          1  654  857
##                                          
##                Accuracy : 0.5786         
##                  95% CI : (0.5632, 0.594)
##     No Information Rate : 0.5279         
##     P-Value [Acc > NIR] : 6.308e-11      
##                                          
##                   Kappa : 0.1456         
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.6902         
##             Specificity : 0.4539         
##          Pos Pred Value : 0.5856         
##          Neg Pred Value : 0.5672         
##              Prevalence : 0.5279         
##          Detection Rate : 0.3643         
##    Detection Prevalence : 0.6222         
##       Balanced Accuracy : 0.5721         
##                                          
##        'Positive' Class : 0              
##

# Inspect the type and levels of the predicted classes.
str(PredMod1)

##  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...

# Inspect the observed test-set outcome for comparison with the predictions.
str(testd$smoke100)

##  Factor w/ 2 levels "0","1": 2 2 1 2 1 1 1 2 2 1 ...

# Count how many test rows were predicted in each class.
table(PredMod1)

## PredMod1
##    0    1 
## 2488 1511

# Variable importance scores for the logistic model's predictors.
varImp(object = modsmoke1)

## glm variable importance
## 
##                    Overall
## age                100.000
## genhlthgood         82.132
## genhlthpoor         76.205
## genhlthfair         72.482
## genderf             58.500
## `genhlthvery good`  57.158
## height              48.496
## hlthplan            43.498
## wtdesire            25.786
## exerany              4.715
## weight               0.000

k-Nearest Neighbors (kNN) Model

# Get a clean copy of cdc
load("~/Dropbox/RProjects/Module 8/cdc.Rdata")

# Dummify and factorize
# fullRank = TRUE drops one level per factor, so the dummy columns are not
# perfectly collinear. Written as TRUE because the shorthand `T` can be reassigned.
ds <- dummyVars(~ ., data = cdc, fullRank = TRUE)
cdc2 <- as.data.frame(predict(ds, cdc))
cdc2$smoke100 <- as.factor(cdc2$smoke100)

# Create traind and testd
# Same seed as the logistic-regression split, fixed for reproducibility.
set.seed(2345)
# list = FALSE returns row indices as a matrix usable for row subsetting.
cdcsample <- createDataPartition(cdc2$smoke100, p = 0.8, list = FALSE)
traind <- cdc2[cdcsample, ]
testd <- cdc2[-cdcsample, ]

# Set Structure of Train Process
# 5-fold cross-validation.
tc <- trainControl(method = "cv", number = 5)

# Create model
# Predictors are centered and scaled before fitting; tuneLength = 40 asks caret
# to evaluate 40 candidate values of k.
modk1 <- train(
  smoke100 ~ .,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 40,
  trControl = tc,
  data = traind
)

# Examine tuning results
plot(modk1)

# Dump the full structure of the fitted train object (long output).
str(modk1)

## List of 23
##  $ method      : chr "knn"
##  $ modelInfo   :List of 13
##   ..$ label     : chr "k-Nearest Neighbors"
##   ..$ library   : NULL
##   ..$ loop      : NULL
##   ..$ type      : chr [1:2] "Classification" "Regression"
##   ..$ parameters:'data.frame':   1 obs. of  3 variables:
##   .. ..$ parameter: Factor w/ 1 level "k": 1
##   .. ..$ class    : Factor w/ 1 level "numeric": 1
##   .. ..$ label    : Factor w/ 1 level "#Neighbors": 1
##   ..$ grid      :function (x, y, len = NULL, search = "grid")  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 8 26 16 19 26 19 8 16
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850> 
##   ..$ fit       :function (x, y, wts, param, lev, last, classProbs, ...)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 17 25 24 19 25 19 17 24
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850> 
##   ..$ predict   :function (modelFit, newdata, submodels = NULL)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 25 29 33 19 29 19 25 33
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850> 
##   ..$ predictors:function (x, ...)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 34 32 34 67 32 67 34 34
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850> 
##   ..$ tags      : chr "Prototype Models"
##   ..$ prob      :function (modelFit, newdata, submodels = NULL)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 36 26 37 61 26 61 36 37
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850> 
##   ..$ levels    :function (x)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 38 28 38 56 28 56 38 38
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850> 
##   ..$ sort      :function (x)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 39 26 39 54 26 54 39 39
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850> 
##  $ modelType   : chr "Classification"
##  $ results     :'data.frame':    40 obs. of  5 variables:
##   ..$ k         : int [1:40] 5 7 9 11 13 15 17 19 21 23 ...
##   ..$ Accuracy  : num [1:40] 0.552 0.554 0.561 0.568 0.567 ...
##   ..$ Kappa     : num [1:40] 0.0984 0.1024 0.1156 0.1295 0.1262 ...
##   ..$ AccuracySD: num [1:40] 0.0163 0.0087 0.0122 0.0101 0.0134 ...
##   ..$ KappaSD   : num [1:40] 0.0321 0.0177 0.0242 0.0201 0.0265 ...
##  $ pred        : NULL
##  $ bestTune    :'data.frame':    1 obs. of  1 variable:
##   ..$ k: int 57
##  $ call        : language train.formula(form = smoke100 ~ ., data = traind, method = "knn",      preProcess = c("center", "scale"), tuneLen| __truncated__
##  $ dots        : list()
##  $ metric      : chr "Accuracy"
##  $ control     :List of 27
##   ..$ method           : chr "cv"
##   ..$ number           : num 5
##   ..$ repeats          : logi NA
##   ..$ search           : chr "grid"
##   ..$ p                : num 0.75
##   ..$ initialWindow    : NULL
##   ..$ horizon          : num 1
##   ..$ fixedWindow      : logi TRUE
##   ..$ skip             : num 0
##   ..$ verboseIter      : logi FALSE
##   ..$ returnData       : logi TRUE
##   ..$ returnResamp     : chr "final"
##   ..$ savePredictions  : chr "none"
##   ..$ classProbs       : logi FALSE
##   ..$ summaryFunction  :function (data, lev = NULL, model = NULL)  
##   ..$ selectionFunction: chr "best"
##   ..$ preProcOptions   :List of 6
##   .. ..$ thresh   : num 0.95
##   .. ..$ ICAcomp  : num 3
##   .. ..$ k        : num 5
##   .. ..$ freqCut  : num 19
##   .. ..$ uniqueCut: num 10
##   .. ..$ cutoff   : num 0.9
##   ..$ sampling         : NULL
##   ..$ index            :List of 5
##   .. ..$ Fold1: int [1:12801] 1 2 3 4 6 7 8 9 10 11 ...
##   .. ..$ Fold2: int [1:12800] 1 3 4 5 6 7 8 9 11 13 ...
##   .. ..$ Fold3: int [1:12800] 1 2 3 4 5 6 7 10 12 13 ...
##   .. ..$ Fold4: int [1:12802] 2 3 4 5 7 8 9 10 11 12 ...
##   .. ..$ Fold5: int [1:12801] 1 2 5 6 8 9 10 11 12 13 ...
##   ..$ indexOut         :List of 5
##   .. ..$ Resample1: int [1:3200] 5 14 17 21 28 32 39 46 48 49 ...
##   .. ..$ Resample2: int [1:3201] 2 10 12 15 23 33 35 36 38 41 ...
##   .. ..$ Resample3: int [1:3201] 8 9 11 19 20 22 26 31 34 37 ...
##   .. ..$ Resample4: int [1:3199] 1 6 13 18 24 27 29 52 56 60 ...
##   .. ..$ Resample5: int [1:3200] 3 4 7 16 25 30 40 47 51 58 ...
##   ..$ indexFinal       : NULL
##   ..$ timingSamps      : num 0
##   ..$ predictionBounds : logi [1:2] FALSE FALSE
##   ..$ seeds            :List of 6
##   .. ..$ : int [1:40] 594834 742861 303440 919360 840091 381309 402417 510888 992173 803548 ...
##   .. ..$ : int [1:40] 751021 409982 105873 24361 904798 491722 324726 549381 118402 349022 ...
##   .. ..$ : int [1:40] 65810 839542 13074 311209 725477 508587 390979 499769 319814 221443 ...
##   .. ..$ : int [1:40] 785150 219299 276742 680260 662289 351109 351133 983325 920988 352018 ...
##   .. ..$ : int [1:40] 939 774323 309056 161854 289914 776366 522991 94367 440976 261746 ...
##   .. ..$ : int 384399
##   ..$ adaptive         :List of 4
##   .. ..$ min     : num 5
##   .. ..$ alpha   : num 0.05
##   .. ..$ method  : chr "gls"
##   .. ..$ complete: logi TRUE
##   ..$ trim             : logi FALSE
##   ..$ allowParallel    : logi TRUE
##  $ finalModel  :List of 8
##   ..$ learn      :List of 2
##   .. ..$ y: Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 2 2 2 ...
##   .. .. ..- attr(*, "names")= chr [1:16001] "1" "5" "6" "7" ...
##   .. ..$ X: num [1:16001, 1:11] -0.728 1.374 1.374 1.374 1.374 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:16001] "X1" "X5" "X6" "X7" ...
##   .. .. .. ..$ : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   ..$ k          : int 57
##   ..$ theDots    : list()
##   ..$ xNames     : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   ..$ problemType: chr "Classification"
##   ..$ tuneValue  :'data.frame':  1 obs. of  1 variable:
##   .. ..$ k: int 57
##   ..$ obsLevels  : chr [1:2] "0" "1"
##   .. ..- attr(*, "ordered")= logi FALSE
##   ..$ param      : list()
##   ..- attr(*, "class")= chr "knn3"
##  $ preProcess  :List of 22
##   ..$ dim              : int [1:2] 16001 11
##   ..$ bc               : NULL
##   ..$ yj               : NULL
##   ..$ et               : NULL
##   ..$ invHyperbolicSine: NULL
##   ..$ mean             : Named num [1:11] 0.3462 0.2864 0.1003 0.0332 0.7447 ...
##   .. ..- attr(*, "names")= chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   ..$ std              : Named num [1:11] 0.476 0.452 0.3 0.179 0.436 ...
##   .. ..- attr(*, "names")= chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   ..$ ranges           : NULL
##   ..$ rotation         : NULL
##   ..$ method           :List of 3
##   .. ..$ center: chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   .. ..$ scale : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   .. ..$ ignore: chr(0) 
##   ..$ thresh           : num 0.95
##   ..$ pcaComp          : NULL
##   ..$ numComp          : NULL
##   ..$ ica              : NULL
##   ..$ wildcards        :List of 2
##   .. ..$ PCA: chr(0) 
##   .. ..$ ICA: chr(0) 
##   ..$ k                : num 5
##   ..$ knnSummary       :function (x, ...)  
##   ..$ bagImp           : NULL
##   ..$ median           : NULL
##   ..$ data             : NULL
##   ..$ rangeBounds      : num [1:2] 0 1
##   ..$ call             : chr "scrubed"
##   ..- attr(*, "class")= chr "preProcess"
##  $ trainingData:'data.frame':    16001 obs. of  12 variables:
##   ..$ .outcome         : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 2 2 2 ...
##   ..$ genhlth.very good: num [1:16001] 0 1 1 1 1 0 0 0 0 0 ...
##   ..$ genhlth.good     : num [1:16001] 1 0 0 0 0 1 1 0 0 0 ...
##   ..$ genhlth.fair     : num [1:16001] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ genhlth.poor     : num [1:16001] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ exerany          : num [1:16001] 0 0 1 1 0 0 1 1 1 1 ...
##   ..$ hlthplan         : num [1:16001] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ height           : num [1:16001] 70 61 64 71 67 65 70 69 69 70 ...
##   ..$ weight           : num [1:16001] 175 150 114 194 170 150 180 186 168 170 ...
##   ..$ wtdesire         : num [1:16001] 175 130 114 185 160 130 170 175 148 170 ...
##   ..$ age              : num [1:16001] 77 55 55 31 45 27 44 46 62 69 ...
##   ..$ gender.f         : num [1:16001] 0 1 1 0 0 1 0 0 0 0 ...
##  $ resample    :'data.frame':    5 obs. of  3 variables:
##   ..$ Accuracy: num [1:5] 0.586 0.609 0.576 0.6 0.573
##   ..$ Kappa   : num [1:5] 0.165 0.209 0.141 0.194 0.138
##   ..$ Resample: chr [1:5] "Fold1" "Fold2" "Fold3" "Fold4" ...
##  $ resampledCM :'data.frame':    200 obs. of  6 variables:
##   ..$ cell1   : num [1:200] 1022 1013 1036 1070 1090 ...
##   ..$ cell2   : num [1:200] 668 677 654 620 600 606 607 606 586 594 ...
##   ..$ cell3   : num [1:200] 736 752 744 751 756 750 736 746 738 740 ...
##   ..$ cell4   : num [1:200] 774 758 766 759 754 760 774 764 772 770 ...
##   ..$ k       : int [1:200] 5 7 9 11 13 15 17 19 21 23 ...
##   ..$ Resample: chr [1:200] "Fold1" "Fold1" "Fold1" "Fold1" ...
##  $ perfNames   : chr [1:2] "Accuracy" "Kappa"
##  $ maximize    : logi TRUE
##  $ yLimits     : NULL
##  $ times       :List of 3
##   ..$ everything: 'proc_time' Named num [1:5] 104.63 3.21 107.95 0 0
##   .. ..- attr(*, "names")= chr [1:5] "user.self" "sys.self" "elapsed" "user.child" ...
##   ..$ final     : 'proc_time' Named num [1:5] 0.064 0.014 0.078 0 0
##   .. ..- attr(*, "names")= chr [1:5] "user.self" "sys.self" "elapsed" "user.child" ...
##   ..$ prediction: logi [1:3] NA NA NA
##  $ levels      : chr [1:2] "0" "1"
##   ..- attr(*, "ordered")= logi FALSE
##  $ terms       :Classes 'terms', 'formula'  language smoke100 ~ `genhlth.very good` + genhlth.good + genhlth.fair + genhlth.poor +      exerany + hlthplan + height + | __truncated__
##   .. ..- attr(*, "variables")= language list(smoke100, `genhlth.very good`, genhlth.good, genhlth.fair, genhlth.poor,      exerany, hlthplan, height, wei| __truncated__
##   .. ..- attr(*, "factors")= int [1:12, 1:11] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:12] "smoke100" "`genhlth.very good`" "genhlth.good" "genhlth.fair" ...
##   .. .. .. ..$ : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   .. ..- attr(*, "term.labels")= chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   .. ..- attr(*, "order")= int [1:11] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(smoke100, `genhlth.very good`, genhlth.good, genhlth.fair, genhlth.poor,      exerany, hlthplan, height, wei| __truncated__
##   .. ..- attr(*, "dataClasses")= Named chr [1:12] "factor" "numeric" "numeric" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:12] "smoke100" "genhlth.very good" "genhlth.good" "genhlth.fair" ...
##  $ coefnames   : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##  $ xlevels     : Named list()
##  - attr(*, "class")= chr [1:2] "train" "train.formula"

# Value of k chosen by cross-validation.
modk1[["bestTune"]]

##     k
## 27 57