R Notebook

# Restore the objects saved in the course .Rdata image; the code below
# expects this to define the data frame `cdc`.
load(file = "~/Dropbox/RProjects/Module 8/cdc.Rdata")
# caret provides createDataPartition(), trainControl(), train(),
# confusionMatrix(), varImp() and dummyVars() used in this notebook.
library("caret")

## Loading required package: lattice

## Loading required package: ggplot2

Logistic Regression: Predicting smoke100 (Data Preparation)

# Treat the outcome as a categorical variable so it is modelled as a class
# label rather than a number.
cdc$smoke100 <- factor(cdc$smoke100)

# Fix the RNG seed so the random partition below is reproducible.
set.seed(2345)

# Row indices for an 80% training sample drawn on the smoke100 outcome.
# list = FALSE returns the indices as a matrix instead of a list; the
# literal FALSE is used because `F` is an ordinary variable that can be
# reassigned.
cdcsample <- createDataPartition(cdc$smoke100, p = 0.8, list = FALSE)
traind <- cdc[cdcsample, ]
testd <- cdc[-cdcsample, ]

# Class counts in the training partition.
table(traind$smoke100)

## 
##    0    1 
## 8448 7553

# Class counts in the held-out test partition.
table(testd[["smoke100"]])

## 
##    0    1 
## 2111 1888

Logistic Regression Model

# Resampling scheme used while fitting: 10-fold cross-validation.
tc <- trainControl(method = "cv", number = 10)

# Binomial GLM (logistic regression) of smoke100 on every other column
# of the training partition.
modsmoke1 <- train(
  smoke100 ~ .,
  data = traind,
  method = "glm",
  family = "binomial",
  trControl = tc
)

# Coefficient table and deviance statistics of the fitted model.
summary(modsmoke1)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.866  -1.102  -0.851   1.179   1.957  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.1533555  0.4227391  -7.459 8.70e-14 ***
## `genhlthvery good`  0.3189912  0.0438159   7.280 3.33e-13 ***
## genhlthgood         0.4797927  0.0466862  10.277  < 2e-16 ***
## genhlthfair         0.5893825  0.0635359   9.276  < 2e-16 ***
## genhlthpoor         1.0468973  0.1005494  10.412  < 2e-16 ***
## exerany            -0.0238132  0.0383812  -0.620    0.535    
## hlthplan           -0.2858091  0.0502971  -5.682 1.33e-08 ***
## height              0.0490742  0.0067816   7.236 4.61e-13 ***
## weight              0.0003252  0.0007100   0.458    0.647    
## wtdesire           -0.0050493  0.0012276  -4.113 3.90e-05 ***
## age                 0.0133229  0.0010054  13.251  < 2e-16 ***
## genderf            -0.3983144  0.0506775  -7.860 3.85e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22132  on 16000  degrees of freedom
## Residual deviance: 21449  on 15989  degrees of freedom
## AIC: 21473
## 
## Number of Fisher Scoring iterations: 4

Check Logistic Performance

# Predict a class for each held-out row with the fitted logistic model,
# then cross-tabulate the predictions against the observed labels.
# NOTE(review): no `positive` level is supplied, so the report uses the
# first factor level ("0") as the positive class; pass positive = "1" if
# level "1" is the class of interest.
PredMod1 <- predict(modsmoke1, newdata = testd)
confusionMatrix(data = PredMod1, reference = testd$smoke100)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1447 1036
##          1  664  852
##                                           
##                Accuracy : 0.5749          
##                  95% CI : (0.5594, 0.5903)
##     No Information Rate : 0.5279          
##     P-Value [Acc > NIR] : 1.311e-09       
##                                           
##                   Kappa : 0.1382          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6855          
##             Specificity : 0.4513          
##          Pos Pred Value : 0.5828          
##          Neg Pred Value : 0.5620          
##              Prevalence : 0.5279          
##          Detection Rate : 0.3618          
##    Detection Prevalence : 0.6209          
##       Balanced Accuracy : 0.5684          
##                                           
##        'Positive' Class : 0               
##

# Inspect the type and levels of the predicted classes.
str(object = PredMod1)

##  Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...

# Inspect the type and levels of the observed test labels for comparison.
str(object = testd[["smoke100"]])

##  Factor w/ 2 levels "0","1": 1 1 2 2 1 2 2 1 1 2 ...

# Count how many test rows were assigned to each predicted class.
table(PredMod1)

## PredMod1
##    0    1 
## 2483 1516

# Rank the model terms by caret's variable-importance score.
varImp(object = modsmoke1)

## glm variable importance
## 
##                    Overall
## age                 100.00
## genhlthpoor          77.80
## genhlthgood          76.75
## genhlthfair          68.93
## genderf              57.86
## `genhlthvery good`   53.33
## height               52.98
## hlthplan             40.84
## wtdesire             28.57
## exerany               1.27
## weight                0.00

k-Nearest Neighbors (kNN) Model

# Get a clean copy of cdc, so that smoke100 is back in its as-saved form
# before the dummy encoding below.
load("~/Dropbox/RProjects/Module 8/cdc.Rdata")

# Dummify and factorize.
# dummyVars() builds an encoder that expands factor columns into 0/1
# indicator columns; fullRank = TRUE omits one level per factor so the
# indicators are not linearly dependent. The literals TRUE/FALSE are used
# because `T` and `F` are ordinary variables that can be reassigned.
ds <- dummyVars(~ ., data = cdc, fullRank = TRUE)
cdc2 <- as.data.frame(predict(ds, cdc))
# The outcome must be a factor for train() to treat this as classification.
cdc2$smoke100 <- as.factor(cdc2$smoke100)

# Create traind and testd: seeded 80/20 partition on the outcome.
set.seed(2345)
cdcsample <- createDataPartition(cdc2$smoke100, p = 0.8, list = FALSE)
traind <- cdc2[cdcsample, ]
testd <- cdc2[-cdcsample, ]

# Set structure of the training process: 5-fold cross-validation.
tc <- trainControl(method = "cv", number = 5)

# Create model: k-nearest neighbours on centered and scaled predictors,
# with 40 candidate values of k evaluated by the resampling scheme above.
modk1 <- train(
  smoke100 ~ .,
  data = traind,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 40,
  trControl = tc
)

# Examine tuning results: plot the fitted train object.
plot(x = modk1)

# Dump the full internal structure of the train object.
str(object = modk1)

## List of 23
##  $ method      : chr "knn"
##  $ modelInfo   :List of 13
##   ..$ label     : chr "k-Nearest Neighbors"
##   ..$ library   : NULL
##   ..$ loop      : NULL
##   ..$ type      : chr [1:2] "Classification" "Regression"
##   ..$ parameters:'data.frame':   1 obs. of  3 variables:
##   .. ..$ parameter: Factor w/ 1 level "k": 1
##   .. ..$ class    : Factor w/ 1 level "numeric": 1
##   .. ..$ label    : Factor w/ 1 level "#Neighbors": 1
##   ..$ grid      :function (x, y, len = NULL, search = "grid")  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 8 26 16 19 26 19 8 16
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f7f465ceeb0> 
##   ..$ fit       :function (x, y, wts, param, lev, last, classProbs, ...)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 17 25 24 19 25 19 17 24
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f7f465ceeb0> 
##   ..$ predict   :function (modelFit, newdata, submodels = NULL)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 25 29 33 19 29 19 25 33
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f7f465ceeb0> 
##   ..$ predictors:function (x, ...)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 34 32 34 67 32 67 34 34
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f7f465ceeb0> 
##   ..$ tags      : chr "Prototype Models"
##   ..$ prob      :function (modelFit, newdata, submodels = NULL)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 36 26 37 61 26 61 36 37
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f7f465ceeb0> 
##   ..$ levels    :function (x)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 38 28 38 56 28 56 38 38
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f7f465ceeb0> 
##   ..$ sort      :function (x)  
##   .. ..- attr(*, "srcref")= 'srcref' int [1:8] 39 26 39 54 26 54 39 39
##   .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f7f465ceeb0> 
##  $ modelType   : chr "Classification"
##  $ results     :'data.frame':    40 obs. of  5 variables:
##   ..$ k         : int [1:40] 5 7 9 11 13 15 17 19 21 23 ...
##   ..$ Accuracy  : num [1:40] 0.557 0.56 0.562 0.569 0.572 ...
##   ..$ Kappa     : num [1:40] 0.11 0.115 0.119 0.133 0.138 ...
##   ..$ AccuracySD: num [1:40] 0.00625 0.00691 0.00656 0.00543 0.00903 ...
##   ..$ KappaSD   : num [1:40] 0.0131 0.0142 0.0137 0.011 0.0181 ...
##  $ pred        : NULL
##  $ bestTune    :'data.frame':    1 obs. of  1 variable:
##   ..$ k: int 53
##  $ call        : language train.formula(form = smoke100 ~ ., data = traind, method = "knn",      preProcess = c("center", "scale"), tuneLen| __truncated__
##  $ dots        : list()
##  $ metric      : chr "Accuracy"
##  $ control     :List of 27
##   ..$ method           : chr "cv"
##   ..$ number           : num 5
##   ..$ repeats          : logi NA
##   ..$ search           : chr "grid"
##   ..$ p                : num 0.75
##   ..$ initialWindow    : NULL
##   ..$ horizon          : num 1
##   ..$ fixedWindow      : logi TRUE
##   ..$ skip             : num 0
##   ..$ verboseIter      : logi FALSE
##   ..$ returnData       : logi TRUE
##   ..$ returnResamp     : chr "final"
##   ..$ savePredictions  : chr "none"
##   ..$ classProbs       : logi FALSE
##   ..$ summaryFunction  :function (data, lev = NULL, model = NULL)  
##   ..$ selectionFunction: chr "best"
##   ..$ preProcOptions   :List of 6
##   .. ..$ thresh   : num 0.95
##   .. ..$ ICAcomp  : num 3
##   .. ..$ k        : num 5
##   .. ..$ freqCut  : num 19
##   .. ..$ uniqueCut: num 10
##   .. ..$ cutoff   : num 0.9
##   ..$ sampling         : NULL
##   ..$ index            :List of 5
##   .. ..$ Fold1: int [1:12801] 1 5 6 7 8 9 10 11 12 14 ...
##   .. ..$ Fold2: int [1:12800] 1 2 3 4 5 6 7 8 9 11 ...
##   .. ..$ Fold3: int [1:12800] 1 2 3 4 5 6 10 11 12 13 ...
##   .. ..$ Fold4: int [1:12802] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ Fold5: int [1:12801] 2 3 4 7 8 9 10 11 12 13 ...
##   ..$ indexOut         :List of 5
##   .. ..$ Resample1: int [1:3200] 2 3 4 13 15 17 19 21 25 26 ...
##   .. ..$ Resample2: int [1:3201] 10 12 16 20 35 36 48 53 60 66 ...
##   .. ..$ Resample3: int [1:3201] 7 8 9 27 28 37 38 42 43 47 ...
##   .. ..$ Resample4: int [1:3199] 11 18 22 23 32 33 40 41 46 49 ...
##   .. ..$ Resample5: int [1:3200] 1 5 6 14 24 29 30 34 45 50 ...
##   ..$ indexFinal       : NULL
##   ..$ timingSamps      : num 0
##   ..$ predictionBounds : logi [1:2] FALSE FALSE
##   ..$ seeds            :List of 6
##   .. ..$ : int [1:40] 584201 708596 921924 500185 909341 691136 904158 668331 792488 311745 ...
##   .. ..$ : int [1:40] 710116 902092 402103 938440 806844 997466 191750 583127 250591 871748 ...
##   .. ..$ : int [1:40] 799263 166522 12490 759589 866546 372306 10524 456192 386774 521737 ...
##   .. ..$ : int [1:40] 475705 552751 46672 832001 783601 692225 978331 168496 141220 21487 ...
##   .. ..$ : int [1:40] 277383 420028 523184 794690 74468 768453 615879 478539 631760 217087 ...
##   .. ..$ : int 305207
##   ..$ adaptive         :List of 4
##   .. ..$ min     : num 5
##   .. ..$ alpha   : num 0.05
##   .. ..$ method  : chr "gls"
##   .. ..$ complete: logi TRUE
##   ..$ trim             : logi FALSE
##   ..$ allowParallel    : logi TRUE
##  $ finalModel  :List of 8
##   ..$ learn      :List of 2
##   .. ..$ y: Factor w/ 2 levels "0","1": 1 2 2 1 1 1 2 1 2 2 ...
##   .. .. ..- attr(*, "names")= chr [1:16001] "1" "2" "3" "5" ...
##   .. ..$ X: num [1:16001, 1:11] -0.732 -0.732 -0.732 1.366 1.366 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:16001] "X1" "X2" "X3" "X5" ...
##   .. .. .. ..$ : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   ..$ k          : int 53
##   ..$ theDots    : list()
##   ..$ xNames     : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   ..$ problemType: chr "Classification"
##   ..$ tuneValue  :'data.frame':  1 obs. of  1 variable:
##   .. ..$ k: int 53
##   ..$ obsLevels  : chr [1:2] "0" "1"
##   .. ..- attr(*, "ordered")= logi FALSE
##   ..$ param      : list()
##   ..- attr(*, "class")= chr "knn3"
##  $ preProcess  :List of 22
##   ..$ dim              : int [1:2] 16001 11
##   ..$ bc               : NULL
##   ..$ yj               : NULL
##   ..$ et               : NULL
##   ..$ invHyperbolicSine: NULL
##   ..$ mean             : Named num [1:11] 0.349 0.2817 0.1019 0.0345 0.7433 ...
##   .. ..- attr(*, "names")= chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   ..$ std              : Named num [1:11] 0.477 0.45 0.302 0.183 0.437 ...
##   .. ..- attr(*, "names")= chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   ..$ ranges           : NULL
##   ..$ rotation         : NULL
##   ..$ method           :List of 3
##   .. ..$ center: chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   .. ..$ scale : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   .. ..$ ignore: chr(0) 
##   ..$ thresh           : num 0.95
##   ..$ pcaComp          : NULL
##   ..$ numComp          : NULL
##   ..$ ica              : NULL
##   ..$ wildcards        :List of 2
##   .. ..$ PCA: chr(0) 
##   .. ..$ ICA: chr(0) 
##   ..$ k                : num 5
##   ..$ knnSummary       :function (x, ...)  
##   ..$ bagImp           : NULL
##   ..$ median           : NULL
##   ..$ data             : NULL
##   ..$ rangeBounds      : num [1:2] 0 1
##   ..$ call             : chr "scrubed"
##   ..- attr(*, "class")= chr "preProcess"
##  $ trainingData:'data.frame':    16001 obs. of  12 variables:
##   ..$ .outcome         : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 2 1 2 2 ...
##   ..$ genhlth.very good: num [1:16001] 0 0 0 1 1 1 0 0 0 0 ...
##   ..$ genhlth.good     : num [1:16001] 1 1 1 0 0 0 1 1 0 0 ...
##   ..$ genhlth.fair     : num [1:16001] 0 0 0 0 0 0 0 0 1 0 ...
##   ..$ genhlth.poor     : num [1:16001] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ exerany          : num [1:16001] 0 0 1 0 1 0 0 1 1 1 ...
##   ..$ hlthplan         : num [1:16001] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ height           : num [1:16001] 70 64 60 61 64 67 65 70 69 70 ...
##   ..$ weight           : num [1:16001] 175 125 105 150 114 170 150 180 168 170 ...
##   ..$ wtdesire         : num [1:16001] 175 115 105 130 114 160 130 170 148 170 ...
##   ..$ age              : num [1:16001] 77 33 49 55 55 45 27 44 62 69 ...
##   ..$ gender.f         : num [1:16001] 0 1 1 1 1 0 1 0 0 0 ...
##  $ resample    :'data.frame':    5 obs. of  3 variables:
##   ..$ Accuracy: num [1:5] 0.596 0.589 0.592 0.586 0.589
##   ..$ Kappa   : num [1:5] 0.184 0.169 0.175 0.162 0.169
##   ..$ Resample: chr [1:5] "Fold1" "Fold4" "Fold5" "Fold2" ...
##  $ resampledCM :'data.frame':    200 obs. of  6 variables:
##   ..$ cell1   : num [1:200] 1016 1016 1035 1053 1058 ...
##   ..$ cell2   : num [1:200] 673 673 654 636 631 620 591 607 601 596 ...
##   ..$ cell3   : num [1:200] 775 769 770 762 771 763 748 747 763 763 ...
##   ..$ cell4   : num [1:200] 736 742 741 749 740 748 763 764 748 748 ...
##   ..$ k       : int [1:200] 5 7 9 11 13 15 17 19 21 23 ...
##   ..$ Resample: chr [1:200] "Fold1" "Fold1" "Fold1" "Fold1" ...
##  $ perfNames   : chr [1:2] "Accuracy" "Kappa"
##  $ maximize    : logi TRUE
##  $ yLimits     : NULL
##  $ times       :List of 3
##   ..$ everything: 'proc_time' Named num [1:5] 96.589 0.845 97.479 0 0
##   .. ..- attr(*, "names")= chr [1:5] "user.self" "sys.self" "elapsed" "user.child" ...
##   ..$ final     : 'proc_time' Named num [1:5] 0.053 0 0.052 0 0
##   .. ..- attr(*, "names")= chr [1:5] "user.self" "sys.self" "elapsed" "user.child" ...
##   ..$ prediction: logi [1:3] NA NA NA
##  $ levels      : chr [1:2] "0" "1"
##   ..- attr(*, "ordered")= logi FALSE
##  $ terms       :Classes 'terms', 'formula'  language smoke100 ~ `genhlth.very good` + genhlth.good + genhlth.fair + genhlth.poor +      exerany + hlthplan + height + | __truncated__
##   .. ..- attr(*, "variables")= language list(smoke100, `genhlth.very good`, genhlth.good, genhlth.fair, genhlth.poor,      exerany, hlthplan, height, wei| __truncated__
##   .. ..- attr(*, "factors")= int [1:12, 1:11] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:12] "smoke100" "`genhlth.very good`" "genhlth.good" "genhlth.fair" ...
##   .. .. .. ..$ : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   .. ..- attr(*, "term.labels")= chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##   .. ..- attr(*, "order")= int [1:11] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(smoke100, `genhlth.very good`, genhlth.good, genhlth.fair, genhlth.poor,      exerany, hlthplan, height, wei| __truncated__
##   .. ..- attr(*, "dataClasses")= Named chr [1:12] "factor" "numeric" "numeric" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:12] "smoke100" "genhlth.very good" "genhlth.good" "genhlth.fair" ...
##  $ coefnames   : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
##  $ xlevels     : Named list()
##  - attr(*, "class")= chr [1:2] "train" "train.formula"

# Tuning-parameter value selected by train() for the final model.
modk1[["bestTune"]]

##     k
## 25 53