# Restore the objects saved in cdc.Rdata into the session; the code below
# expects this to define `cdc`.
load(file = "~/Dropbox/RProjects/Module 8/cdc.Rdata")
# Attach caret, which supplies createDataPartition(), trainControl(),
# train() and dummyVars() used throughout this script.
library("caret")
## Loading required package: lattice
## Loading required package: ggplot2
Logistic Regression: Predicting smoke100
# Start this section from the copy of `cdc` stored on disk.
load("~/Dropbox/RProjects/Module 8/cdc.Rdata")

# caret treats a factor outcome as a classification target, so convert the
# smoke100 indicator before partitioning and modelling.
cdc$smoke100 <- factor(cdc$smoke100)

# Fixed seed so the random split below is reproducible.
set.seed(2345)

# 80/20 split sampled within each outcome class. `list = FALSE` (spelled out
# rather than the reassignable shorthand `F`) returns the selected row
# indices as a matrix that can be used directly for subsetting.
cdcsample <- createDataPartition(cdc$smoke100, p = 0.8, list = FALSE)
traind <- cdc[cdcsample, ]
testd <- cdc[-cdcsample, ]

# Class balance of the outcome in each partition.
table(traind$smoke100)
##
## 0 1
## 8448 7553
table(testd$smoke100)
##
## 0 1
## 2111 1888
Logistic Regression Model
# Resampling scheme for model assessment: 10-fold cross-validation.
tc <- trainControl(method = "cv", number = 10)

# Logistic regression of smoke100 on every other column of the training set.
# `family` is not consumed by train() itself; it is passed through to the
# underlying glm fit.
modsmoke1 <- train(
  smoke100 ~ .,
  data = traind,
  method = "glm",
  family = "binomial",
  trControl = tc
)

# Coefficient table and deviance summary of the fitted model.
summary(modsmoke1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8592 -1.1018 -0.8454 1.1771 1.7207
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.9152179 0.4221218 -6.906 4.98e-12 ***
## `genhlthvery good` 0.3372864 0.0438739 7.688 1.50e-14 ***
## genhlthgood 0.5059594 0.0464814 10.885 < 2e-16 ***
## genhlthfair 0.6167228 0.0639113 9.650 < 2e-16 ***
## genhlthpoor 1.0282411 0.1015420 10.126 < 2e-16 ***
## exerany -0.0374330 0.0384678 -0.973 0.330505
## hlthplan -0.2967520 0.0499693 -5.939 2.87e-09 ***
## height 0.0445117 0.0067662 6.579 4.75e-11 ***
## weight 0.0002635 0.0007133 0.369 0.711787
## wtdesire -0.0045067 0.0012277 -3.671 0.000242 ***
## age 0.0132907 0.0010089 13.173 < 2e-16 ***
## genderf -0.3977519 0.0506079 -7.859 3.86e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22132 on 16000 degrees of freedom
## Residual deviance: 21448 on 15989 degrees of freedom
## AIC: 21472
##
## Number of Fisher Scoring iterations: 4
k-Nearest Neighbors (kNN) Model
# Get a clean copy of cdc (the logistic-regression section converted
# smoke100 to a factor in place).
load("~/Dropbox/RProjects/Module 8/cdc.Rdata")

# Dummify and factorize: kNN needs purely numeric predictors, so expand the
# categorical columns into 0/1 indicators. fullRank = TRUE (spelled out
# rather than the reassignable shorthand `T`) drops one reference level per
# factor so the indicators are not redundant.
ds <- dummyVars(~ ., data = cdc, fullRank = TRUE)
cdc2 <- as.data.frame(predict(ds, cdc))
# The outcome must be a factor for caret to fit a classifier.
cdc2$smoke100 <- as.factor(cdc2$smoke100)

# Create traind and testd: same seed and 80/20 split as the logistic section.
set.seed(2345)
cdcsample <- createDataPartition(cdc2$smoke100, p = 0.8, list = FALSE)
traind <- cdc2[cdcsample, ]
testd <- cdc2[-cdcsample, ]

# Set Structure of Train Process: 5-fold cross-validation.
tc <- trainControl(method = "cv", number = 5)

# Create model. Predictors are centered and scaled so that distance
# calculations are not dominated by wide-range columns such as weight;
# tuneLength = 40 evaluates 40 candidate values of k.
modk1 <- train(
  smoke100 ~ .,
  data = traind,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 40,
  trControl = tc
)

# Examine tuning results: resampled accuracy across the candidate k values.
plot(modk1)

# Full structure of the fitted train object (long output).
str(modk1)
## List of 23
## $ method : chr "knn"
## $ modelInfo :List of 13
## ..$ label : chr "k-Nearest Neighbors"
## ..$ library : NULL
## ..$ loop : NULL
## ..$ type : chr [1:2] "Classification" "Regression"
## ..$ parameters:'data.frame': 1 obs. of 3 variables:
## .. ..$ parameter: Factor w/ 1 level "k": 1
## .. ..$ class : Factor w/ 1 level "numeric": 1
## .. ..$ label : Factor w/ 1 level "#Neighbors": 1
## ..$ grid :function (x, y, len = NULL, search = "grid")
## .. ..- attr(*, "srcref")= 'srcref' int [1:8] 8 26 16 19 26 19 8 16
## .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850>
## ..$ fit :function (x, y, wts, param, lev, last, classProbs, ...)
## .. ..- attr(*, "srcref")= 'srcref' int [1:8] 17 25 24 19 25 19 17 24
## .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850>
## ..$ predict :function (modelFit, newdata, submodels = NULL)
## .. ..- attr(*, "srcref")= 'srcref' int [1:8] 25 29 33 19 29 19 25 33
## .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850>
## ..$ predictors:function (x, ...)
## .. ..- attr(*, "srcref")= 'srcref' int [1:8] 34 32 34 67 32 67 34 34
## .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850>
## ..$ tags : chr "Prototype Models"
## ..$ prob :function (modelFit, newdata, submodels = NULL)
## .. ..- attr(*, "srcref")= 'srcref' int [1:8] 36 26 37 61 26 61 36 37
## .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850>
## ..$ levels :function (x)
## .. ..- attr(*, "srcref")= 'srcref' int [1:8] 38 28 38 56 28 56 38 38
## .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850>
## ..$ sort :function (x)
## .. ..- attr(*, "srcref")= 'srcref' int [1:8] 39 26 39 54 26 54 39 39
## .. .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x7f8bec813850>
## $ modelType : chr "Classification"
## $ results :'data.frame': 40 obs. of 5 variables:
## ..$ k : int [1:40] 5 7 9 11 13 15 17 19 21 23 ...
## ..$ Accuracy : num [1:40] 0.552 0.554 0.561 0.568 0.567 ...
## ..$ Kappa : num [1:40] 0.0984 0.1024 0.1156 0.1295 0.1262 ...
## ..$ AccuracySD: num [1:40] 0.0163 0.0087 0.0122 0.0101 0.0134 ...
## ..$ KappaSD : num [1:40] 0.0321 0.0177 0.0242 0.0201 0.0265 ...
## $ pred : NULL
## $ bestTune :'data.frame': 1 obs. of 1 variable:
## ..$ k: int 57
## $ call : language train.formula(form = smoke100 ~ ., data = traind, method = "knn", preProcess = c("center", "scale"), tuneLen| __truncated__
## $ dots : list()
## $ metric : chr "Accuracy"
## $ control :List of 27
## ..$ method : chr "cv"
## ..$ number : num 5
## ..$ repeats : logi NA
## ..$ search : chr "grid"
## ..$ p : num 0.75
## ..$ initialWindow : NULL
## ..$ horizon : num 1
## ..$ fixedWindow : logi TRUE
## ..$ skip : num 0
## ..$ verboseIter : logi FALSE
## ..$ returnData : logi TRUE
## ..$ returnResamp : chr "final"
## ..$ savePredictions : chr "none"
## ..$ classProbs : logi FALSE
## ..$ summaryFunction :function (data, lev = NULL, model = NULL)
## ..$ selectionFunction: chr "best"
## ..$ preProcOptions :List of 6
## .. ..$ thresh : num 0.95
## .. ..$ ICAcomp : num 3
## .. ..$ k : num 5
## .. ..$ freqCut : num 19
## .. ..$ uniqueCut: num 10
## .. ..$ cutoff : num 0.9
## ..$ sampling : NULL
## ..$ index :List of 5
## .. ..$ Fold1: int [1:12801] 1 2 3 4 6 7 8 9 10 11 ...
## .. ..$ Fold2: int [1:12800] 1 3 4 5 6 7 8 9 11 13 ...
## .. ..$ Fold3: int [1:12800] 1 2 3 4 5 6 7 10 12 13 ...
## .. ..$ Fold4: int [1:12802] 2 3 4 5 7 8 9 10 11 12 ...
## .. ..$ Fold5: int [1:12801] 1 2 5 6 8 9 10 11 12 13 ...
## ..$ indexOut :List of 5
## .. ..$ Resample1: int [1:3200] 5 14 17 21 28 32 39 46 48 49 ...
## .. ..$ Resample2: int [1:3201] 2 10 12 15 23 33 35 36 38 41 ...
## .. ..$ Resample3: int [1:3201] 8 9 11 19 20 22 26 31 34 37 ...
## .. ..$ Resample4: int [1:3199] 1 6 13 18 24 27 29 52 56 60 ...
## .. ..$ Resample5: int [1:3200] 3 4 7 16 25 30 40 47 51 58 ...
## ..$ indexFinal : NULL
## ..$ timingSamps : num 0
## ..$ predictionBounds : logi [1:2] FALSE FALSE
## ..$ seeds :List of 6
## .. ..$ : int [1:40] 594834 742861 303440 919360 840091 381309 402417 510888 992173 803548 ...
## .. ..$ : int [1:40] 751021 409982 105873 24361 904798 491722 324726 549381 118402 349022 ...
## .. ..$ : int [1:40] 65810 839542 13074 311209 725477 508587 390979 499769 319814 221443 ...
## .. ..$ : int [1:40] 785150 219299 276742 680260 662289 351109 351133 983325 920988 352018 ...
## .. ..$ : int [1:40] 939 774323 309056 161854 289914 776366 522991 94367 440976 261746 ...
## .. ..$ : int 384399
## ..$ adaptive :List of 4
## .. ..$ min : num 5
## .. ..$ alpha : num 0.05
## .. ..$ method : chr "gls"
## .. ..$ complete: logi TRUE
## ..$ trim : logi FALSE
## ..$ allowParallel : logi TRUE
## $ finalModel :List of 8
## ..$ learn :List of 2
## .. ..$ y: Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 2 2 2 ...
## .. .. ..- attr(*, "names")= chr [1:16001] "1" "5" "6" "7" ...
## .. ..$ X: num [1:16001, 1:11] -0.728 1.374 1.374 1.374 1.374 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:16001] "X1" "X5" "X6" "X7" ...
## .. .. .. ..$ : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
## ..$ k : int 57
## ..$ theDots : list()
## ..$ xNames : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
## ..$ problemType: chr "Classification"
## ..$ tuneValue :'data.frame': 1 obs. of 1 variable:
## .. ..$ k: int 57
## ..$ obsLevels : chr [1:2] "0" "1"
## .. ..- attr(*, "ordered")= logi FALSE
## ..$ param : list()
## ..- attr(*, "class")= chr "knn3"
## $ preProcess :List of 22
## ..$ dim : int [1:2] 16001 11
## ..$ bc : NULL
## ..$ yj : NULL
## ..$ et : NULL
## ..$ invHyperbolicSine: NULL
## ..$ mean : Named num [1:11] 0.3462 0.2864 0.1003 0.0332 0.7447 ...
## .. ..- attr(*, "names")= chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
## ..$ std : Named num [1:11] 0.476 0.452 0.3 0.179 0.436 ...
## .. ..- attr(*, "names")= chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
## ..$ ranges : NULL
## ..$ rotation : NULL
## ..$ method :List of 3
## .. ..$ center: chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
## .. ..$ scale : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
## .. ..$ ignore: chr(0)
## ..$ thresh : num 0.95
## ..$ pcaComp : NULL
## ..$ numComp : NULL
## ..$ ica : NULL
## ..$ wildcards :List of 2
## .. ..$ PCA: chr(0)
## .. ..$ ICA: chr(0)
## ..$ k : num 5
## ..$ knnSummary :function (x, ...)
## ..$ bagImp : NULL
## ..$ median : NULL
## ..$ data : NULL
## ..$ rangeBounds : num [1:2] 0 1
## ..$ call : chr "scrubed"
## ..- attr(*, "class")= chr "preProcess"
## $ trainingData:'data.frame': 16001 obs. of 12 variables:
## ..$ .outcome : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 2 2 2 ...
## ..$ genhlth.very good: num [1:16001] 0 1 1 1 1 0 0 0 0 0 ...
## ..$ genhlth.good : num [1:16001] 1 0 0 0 0 1 1 0 0 0 ...
## ..$ genhlth.fair : num [1:16001] 0 0 0 0 0 0 0 0 1 0 ...
## ..$ genhlth.poor : num [1:16001] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ exerany : num [1:16001] 0 0 1 1 0 0 1 1 1 1 ...
## ..$ hlthplan : num [1:16001] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ height : num [1:16001] 70 61 64 71 67 65 70 69 69 70 ...
## ..$ weight : num [1:16001] 175 150 114 194 170 150 180 186 168 170 ...
## ..$ wtdesire : num [1:16001] 175 130 114 185 160 130 170 175 148 170 ...
## ..$ age : num [1:16001] 77 55 55 31 45 27 44 46 62 69 ...
## ..$ gender.f : num [1:16001] 0 1 1 0 0 1 0 0 0 0 ...
## $ resample :'data.frame': 5 obs. of 3 variables:
## ..$ Accuracy: num [1:5] 0.586 0.609 0.576 0.6 0.573
## ..$ Kappa : num [1:5] 0.165 0.209 0.141 0.194 0.138
## ..$ Resample: chr [1:5] "Fold1" "Fold2" "Fold3" "Fold4" ...
## $ resampledCM :'data.frame': 200 obs. of 6 variables:
## ..$ cell1 : num [1:200] 1022 1013 1036 1070 1090 ...
## ..$ cell2 : num [1:200] 668 677 654 620 600 606 607 606 586 594 ...
## ..$ cell3 : num [1:200] 736 752 744 751 756 750 736 746 738 740 ...
## ..$ cell4 : num [1:200] 774 758 766 759 754 760 774 764 772 770 ...
## ..$ k : int [1:200] 5 7 9 11 13 15 17 19 21 23 ...
## ..$ Resample: chr [1:200] "Fold1" "Fold1" "Fold1" "Fold1" ...
## $ perfNames : chr [1:2] "Accuracy" "Kappa"
## $ maximize : logi TRUE
## $ yLimits : NULL
## $ times :List of 3
## ..$ everything: 'proc_time' Named num [1:5] 104.63 3.21 107.95 0 0
## .. ..- attr(*, "names")= chr [1:5] "user.self" "sys.self" "elapsed" "user.child" ...
## ..$ final : 'proc_time' Named num [1:5] 0.064 0.014 0.078 0 0
## .. ..- attr(*, "names")= chr [1:5] "user.self" "sys.self" "elapsed" "user.child" ...
## ..$ prediction: logi [1:3] NA NA NA
## $ levels : chr [1:2] "0" "1"
## ..- attr(*, "ordered")= logi FALSE
## $ terms :Classes 'terms', 'formula' language smoke100 ~ `genhlth.very good` + genhlth.good + genhlth.fair + genhlth.poor + exerany + hlthplan + height + | __truncated__
## .. ..- attr(*, "variables")= language list(smoke100, `genhlth.very good`, genhlth.good, genhlth.fair, genhlth.poor, exerany, hlthplan, height, wei| __truncated__
## .. ..- attr(*, "factors")= int [1:12, 1:11] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:12] "smoke100" "`genhlth.very good`" "genhlth.good" "genhlth.fair" ...
## .. .. .. ..$ : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
## .. ..- attr(*, "term.labels")= chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
## .. ..- attr(*, "order")= int [1:11] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(smoke100, `genhlth.very good`, genhlth.good, genhlth.fair, genhlth.poor, exerany, hlthplan, height, wei| __truncated__
## .. ..- attr(*, "dataClasses")= Named chr [1:12] "factor" "numeric" "numeric" "numeric" ...
## .. .. ..- attr(*, "names")= chr [1:12] "smoke100" "genhlth.very good" "genhlth.good" "genhlth.fair" ...
## $ coefnames : chr [1:11] "`genhlth.very good`" "genhlth.good" "genhlth.fair" "genhlth.poor" ...
## $ xlevels : Named list()
## - attr(*, "class")= chr [1:2] "train" "train.formula"
# Show the number of neighbours (k) selected by cross-validation.
modk1[["bestTune"]]
## k
## 27 57