library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load("train.rdata")
data<- train
data[,c(2:21,31:34)] <- lapply(c(2:21,31:34), function(a)factor(data[[a]]))
# summary(data)
data <- data[-1]
set.seed(0)
train_idx <- sample(1:nrow(data), size = 0.7 * nrow(data))
train_data <- data[train_idx, ]
test_data <- data[-train_idx, ]
model <- glm(h1n1_vaccine ~ ., data = train_data, family = "binomial")
summary(model)
##
## Call:
## glm(formula = h1n1_vaccine ~ ., family = "binomial", data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4686 -0.5138 -0.2765 -0.1445 3.1589
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.6611597 0.2570945 -18.130 < 2e-16 ***
## h1n1_concern1 0.0223710 0.0903792 0.248 0.804503
## h1n1_concern2 -0.0392892 0.0910149 -0.432 0.665975
## h1n1_concern3 -0.1987530 0.1031681 -1.926 0.054042 .
## h1n1_knowledge1 -0.0484226 0.0921482 -0.525 0.599245
## h1n1_knowledge2 0.0412901 0.0972068 0.425 0.671007
## behavioral_antiviral_meds1 0.1631733 0.1026373 1.590 0.111879
## behavioral_avoidance1 -0.0631843 0.0582705 -1.084 0.278220
## behavioral_face_mask1 0.2383210 0.0863962 2.758 0.005807 **
## behavioral_wash_hands1 -0.0077819 0.0729530 -0.107 0.915050
## behavioral_large_gatherings1 -0.2507472 0.0595166 -4.213 2.52e-05 ***
## behavioral_outside_home1 -0.0304863 0.0603161 -0.505 0.613248
## behavioral_touch_face1 0.0008523 0.0561948 0.015 0.987899
## doctor_recc_h1n11 2.2714437 0.0675161 33.643 < 2e-16 ***
## doctor_recc_h1n12 -0.7941992 0.1104525 -7.190 6.46e-13 ***
## doctor_recc_seasonal1 -1.0291488 0.0666595 -15.439 < 2e-16 ***
## doctor_recc_seasonal2 NA NA NA NA
## chronic_med_condition1 0.0512349 0.0520272 0.985 0.324737
## child_under_6_months1 0.2535883 0.0799855 3.170 0.001522 **
## health_worker1 0.6755467 0.0676711 9.983 < 2e-16 ***
## opinion_h1n1_vacc_effective3 0.5655488 0.1561244 3.622 0.000292 ***
## opinion_h1n1_vacc_effective4 0.9412257 0.1458487 6.453 1.09e-10 ***
## opinion_h1n1_vacc_effective5 1.8451998 0.1486259 12.415 < 2e-16 ***
## opinion_h1n1_risk2 0.5110326 0.0690307 7.403 1.33e-13 ***
## opinion_h1n1_risk4 1.2696159 0.0794092 15.988 < 2e-16 ***
## opinion_h1n1_risk5 1.6855449 0.1076469 15.658 < 2e-16 ***
## opinion_h1n1_sick_from_vacc2 -0.3428712 0.0617139 -5.556 2.76e-08 ***
## opinion_h1n1_sick_from_vacc4 -0.0330691 0.0701861 -0.471 0.637524
## opinion_h1n1_sick_from_vacc5 -0.0307478 0.1040346 -0.296 0.767571
## opinion_seas_vacc_effective4 -0.1118716 0.1088141 -1.028 0.303903
## opinion_seas_vacc_effective5 -0.3583658 0.1138199 -3.149 0.001641 **
## opinion_seas_risk2 0.1809568 0.0804374 2.250 0.024470 *
## opinion_seas_risk4 0.0094817 0.0861716 0.110 0.912383
## opinion_seas_risk5 -0.0459962 0.1028933 -0.447 0.654855
## opinion_seas_sick_from_vacc2 0.0037181 0.0589156 0.063 0.949680
## opinion_seas_sick_from_vacc4 -0.0870321 0.0692468 -1.257 0.208812
## opinion_seas_sick_from_vacc5 -0.2357731 0.1128045 -2.090 0.036608 *
## age_group35 - 44 Years -0.2236329 0.0859782 -2.601 0.009294 **
## age_group45 - 54 Years -0.1741991 0.0808925 -2.153 0.031282 *
## age_group55 - 64 Years 0.1223979 0.0826342 1.481 0.138553
## age_group65+ Years -0.0082435 0.0900258 -0.092 0.927041
## education12 Years 0.0733119 0.0988344 0.742 0.458229
## educationCollege Graduate 0.1292342 0.0991099 1.304 0.192251
## educationSome College 0.0315370 0.0990303 0.318 0.750138
## raceHispanic 0.3488563 0.1276015 2.734 0.006258 **
## raceOther or Multiple 0.3552191 0.1298520 2.736 0.006227 **
## raceWhite 0.2142582 0.0982327 2.181 0.029174 *
## sexMale 0.1819071 0.0485781 3.745 0.000181 ***
## income_poverty> $75,000 -0.0551555 0.0574496 -0.960 0.337021
## income_povertyBelow Poverty -0.0107401 0.0888894 -0.121 0.903829
## marital_statusNot Married -0.1110293 0.0641974 -1.729 0.083720 .
## rent_or_ownRent 0.0089923 0.0641737 0.140 0.888562
## employment_statusNot in Labor Force 0.0056487 0.0588539 0.096 0.923538
## employment_statusUnemployed 0.1006140 0.1110766 0.906 0.365037
## census_msaMSA, Principle City 0.0759100 0.0547335 1.387 0.165471
## census_msaNon-MSA 0.0889092 0.0564080 1.576 0.114984
## household_adults1 0.0248465 0.0686551 0.362 0.717424
## household_adults2 0.0726677 0.0914408 0.795 0.426790
## household_children1 0.0391762 0.0786495 0.498 0.618406
## household_children2 -0.0723237 0.0850295 -0.851 0.395007
## household_children3 -0.0710714 0.1080214 -0.658 0.510578
## seasonal_vaccine1 2.0660939 0.0589995 35.019 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19348 on 18693 degrees of freedom
## Residual deviance: 12710 on 18633 degrees of freedom
## AIC: 12832
##
## Number of Fisher Scoring iterations: 6
# 結果:
#1 doctor_recc_h1n1
#2 doctor_recc_seasonal
#3 health_worker
#4 opinion_h1n1_vacc_effective
#5 opinion_h1n1_risk
#6 seasonal_vaccine
#7 opinion_h1n1_sick_from_vacc
#8 behavioral_large_gatherings
#9 sex
#10 child_under_6_months
#11 opinion_seas_vacc_effective
#12 behavioral_face_mask
#13 race
#14 age_group
#15 opinion_seas_risk
#16 opinion_seas_sick_from_vacc
#17 h1n1_concern
#18 marital_status
#19 behavioral_antiviral_meds
#20 census_msa
#21 education
#22 behavioral_avoidance
#23 chronic_med_condition
#24 income_poverty
#25 employment_status
#26 household_children
#27 household_adults
#28 h1n1_knowledge
#29 behavioral_outside_home
#30 rent_or_own
#31 behavioral_wash_hands
#32 behavioral_touch_face
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
rf <- randomForest(h1n1_vaccine ~ ., train_data)
rf_pred_train <- predict(rf, train_data)
rf_pred_test <- predict(rf, test_data)
importance(rf)
## MeanDecreaseGini
## h1n1_concern 233.62608
## h1n1_knowledge 156.09912
## behavioral_antiviral_meds 40.73726
## behavioral_avoidance 90.29804
## behavioral_face_mask 52.06175
## behavioral_wash_hands 61.89743
## behavioral_large_gatherings 96.69375
## behavioral_outside_home 95.07615
## behavioral_touch_face 94.32057
## doctor_recc_h1n1 678.15275
## doctor_recc_seasonal 193.86674
## chronic_med_condition 98.14627
## child_under_6_months 57.74945
## health_worker 122.48378
## opinion_h1n1_vacc_effective 363.51193
## opinion_h1n1_risk 429.10475
## opinion_h1n1_sick_from_vacc 237.11170
## opinion_seas_vacc_effective 147.72297
## opinion_seas_risk 258.55647
## opinion_seas_sick_from_vacc 226.34071
## age_group 309.30970
## education 230.05704
## race 145.69007
## sex 110.02518
## income_poverty 141.08452
## marital_status 91.72238
## rent_or_own 77.56683
## employment_status 131.21883
## census_msa 206.34013
## household_adults 150.96274
## household_children 172.10172
## seasonal_vaccine 549.50472
# 結果:
#1 doctor_recc_h1n1
#2 seasonal_vaccine
#3 opinion_h1n1_risk
#4 opinion_h1n1_vacc_effective
#5 age_group
#6 opinion_seas_risk
#7 opinion_h1n1_sick_from_vacc
#8 h1n1_concern
#9 education
#10 opinion_seas_sick_from_vacc
#11 census_msa
#12 doctor_recc_seasonal
#13 household_children
#14 h1n1_knowledge
#15 household_adults
#16 opinion_seas_vacc_effective
#17 race
#18 income_poverty
#19 employment_status
#20 health_worker
#21 sex
#22 chronic_med_condition
#23 behavioral_large_gatherings
#24 behavioral_outside_home
#25 behavioral_touch_face
#26 marital_status
#27 behavioral_avoidance
#28 rent_or_own
#29 behavioral_wash_hands
#30 child_under_6_months
#31 behavioral_face_mask
#32 behavioral_antiviral_meds
# Dumy code categorical predictor variables
x <- model.matrix(h1n1_vaccine ~ ., train_data)[,-1]
# Convert the outcome (class) to a numerical variable
y <- as.numeric(train_data$h1n1_vaccine)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
glmnet(x, y, family = "binomial", alpha = 1, lambda = NULL)
##
## Call: glmnet(x = x, y = y, family = "binomial", alpha = 1, lambda = NULL)
##
## Df %Dev Lambda
## 1 0 0.00000 0.162800
## 2 2 0.02862 0.148300
## 3 2 0.06288 0.135200
## 4 2 0.09095 0.123200
## 5 2 0.11420 0.112200
## 6 2 0.13360 0.102200
## 7 3 0.15300 0.093160
## 8 3 0.17120 0.084890
## 9 3 0.18650 0.077340
## 10 3 0.19960 0.070470
## 11 4 0.21090 0.064210
## 12 4 0.22210 0.058510
## 13 4 0.23160 0.053310
## 14 6 0.24170 0.048570
## 15 6 0.25140 0.044260
## 16 6 0.25960 0.040330
## 17 6 0.26660 0.036740
## 18 6 0.27260 0.033480
## 19 6 0.27770 0.030510
## 20 6 0.28210 0.027800
## 21 6 0.28580 0.025330
## 22 6 0.28900 0.023080
## 23 6 0.29160 0.021030
## 24 7 0.29440 0.019160
## 25 7 0.29830 0.017460
## 26 7 0.30170 0.015910
## 27 11 0.30500 0.014490
## 28 11 0.30840 0.013210
## 29 12 0.31130 0.012030
## 30 13 0.31390 0.010960
## 31 15 0.31620 0.009989
## 32 18 0.31840 0.009102
## 33 20 0.32060 0.008293
## 34 21 0.32300 0.007557
## 35 23 0.32520 0.006885
## 36 25 0.32710 0.006274
## 37 25 0.32910 0.005716
## 38 27 0.33090 0.005208
## 39 27 0.33250 0.004746
## 40 27 0.33390 0.004324
## 41 26 0.33510 0.003940
## 42 28 0.33600 0.003590
## 43 30 0.33680 0.003271
## 44 31 0.33760 0.002980
## 45 32 0.33820 0.002716
## 46 33 0.33870 0.002474
## 47 35 0.33920 0.002255
## 48 35 0.33960 0.002054
## 49 37 0.34000 0.001872
## 50 38 0.34030 0.001706
## 51 43 0.34060 0.001554
## 52 44 0.34080 0.001416
## 53 46 0.34110 0.001290
## 54 47 0.34130 0.001176
## 55 47 0.34160 0.001071
## 56 47 0.34180 0.000976
## 57 47 0.34200 0.000889
## 58 48 0.34210 0.000810
## 59 48 0.34230 0.000738
## 60 48 0.34240 0.000673
## 61 50 0.34250 0.000613
## 62 52 0.34260 0.000558
## 63 52 0.34260 0.000509
## 64 52 0.34270 0.000464
## 65 52 0.34280 0.000422
## 66 53 0.34280 0.000385
## 67 54 0.34280 0.000351
## 68 54 0.34290 0.000320
## 69 54 0.34290 0.000291
## 70 54 0.34290 0.000265
## 71 55 0.34300 0.000242
## 72 56 0.34300 0.000220
## 73 56 0.34300 0.000201
## 74 56 0.34300 0.000183
## 75 56 0.34300 0.000167
## 76 58 0.34300 0.000152
# Find the best lambda using cross-validation
set.seed(123)
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
# Fit the final model on the training data
model <- glmnet(x, y, alpha = 1, family = "binomial",
lambda = cv.lasso$lambda.min)
# Display regression coefficients
coef(model)
## 62 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -4.2918160910
## h1n1_concern1 0.0397068298
## h1n1_concern2 .
## h1n1_concern3 -0.1547170522
## h1n1_knowledge1 -0.0258147848
## h1n1_knowledge2 0.0561269791
## behavioral_antiviral_meds1 0.1274422409
## behavioral_avoidance1 -0.0550508504
## behavioral_face_mask1 0.2030208300
## behavioral_wash_hands1 -0.0004918176
## behavioral_large_gatherings1 -0.2434868154
## behavioral_outside_home1 -0.0263386294
## behavioral_touch_face1 .
## doctor_recc_h1n11 2.2250658195
## doctor_recc_h1n12 -0.4061111854
## doctor_recc_seasonal1 -0.9824188151
## doctor_recc_seasonal2 -0.3381990157
## chronic_med_condition1 0.0321626339
## child_under_6_months1 0.2242859187
## health_worker1 0.6583198309
## opinion_h1n1_vacc_effective3 0.2256687133
## opinion_h1n1_vacc_effective4 0.6167767804
## opinion_h1n1_vacc_effective5 1.5076227882
## opinion_h1n1_risk2 0.4555663925
## opinion_h1n1_risk4 1.2003765342
## opinion_h1n1_risk5 1.5905801871
## opinion_h1n1_sick_from_vacc2 -0.3029451009
## opinion_h1n1_sick_from_vacc4 .
## opinion_h1n1_sick_from_vacc5 .
## opinion_seas_vacc_effective4 .
## opinion_seas_vacc_effective5 -0.2301220379
## opinion_seas_risk2 0.1607238300
## opinion_seas_risk4 .
## opinion_seas_risk5 -0.0307042198
## opinion_seas_sick_from_vacc2 .
## opinion_seas_sick_from_vacc4 -0.0750766839
## opinion_seas_sick_from_vacc5 -0.2176124948
## age_group35 - 44 Years -0.2022710831
## age_group45 - 54 Years -0.1511210240
## age_group55 - 64 Years 0.1177634296
## age_group65+ Years .
## education12 Years 0.0154925526
## educationCollege Graduate 0.0771036301
## educationSome College .
## raceHispanic 0.2284750812
## raceOther or Multiple 0.2415486185
## raceWhite 0.1253753217
## sexMale 0.1721449147
## income_poverty> $75,000 -0.0261027750
## income_povertyBelow Poverty .
## marital_statusNot Married -0.1058739529
## rent_or_ownRent .
## employment_statusNot in Labor Force .
## employment_statusUnemployed 0.0566076066
## census_msaMSA, Principle City 0.0489674024
## census_msaNon-MSA 0.0644289184
## household_adults1 .
## household_adults2 0.0314940833
## household_children1 0.0245661096
## household_children2 -0.0491030758
## household_children3 -0.0357309981
## seasonal_vaccine1 2.0274008582
# Make predictions on the test data
x.test <- model.matrix(h1n1_vaccine ~., test_data)[,-1]
probabilities <- model %>% predict(newx = x.test)
predicted.classes <- ifelse(probabilities > 0.5, "0", "1")
# Model accuracy
observed.classes <- test_data$h1n1_vaccine
mean(predicted.classes == observed.classes)
## [1] 0.1609884
library(glmnet)
set.seed(123)
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
plot(cv.lasso)
coef(cv.lasso, cv.lasso$lambda.min)
## 62 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.2891182663
## h1n1_concern1 0.0396536632
## h1n1_concern2 .
## h1n1_concern3 -0.1546387596
## h1n1_knowledge1 -0.0277052952
## h1n1_knowledge2 0.0542458946
## behavioral_antiviral_meds1 0.1276831287
## behavioral_avoidance1 -0.0549870663
## behavioral_face_mask1 0.2030913045
## behavioral_wash_hands1 -0.0003458214
## behavioral_large_gatherings1 -0.2436000010
## behavioral_outside_home1 -0.0263000863
## behavioral_touch_face1 .
## doctor_recc_h1n11 2.2256860566
## doctor_recc_h1n12 -0.5475096081
## doctor_recc_seasonal1 -0.9828377949
## doctor_recc_seasonal2 -0.1971709402
## chronic_med_condition1 0.0321340180
## child_under_6_months1 0.2243628482
## health_worker1 0.6583935172
## opinion_h1n1_vacc_effective3 0.2201165380
## opinion_h1n1_vacc_effective4 0.6117284145
## opinion_h1n1_vacc_effective5 1.5028898463
## opinion_h1n1_risk2 0.4557652122
## opinion_h1n1_risk4 1.2006770678
## opinion_h1n1_risk5 1.5910251547
## opinion_h1n1_sick_from_vacc2 -0.3029826801
## opinion_h1n1_sick_from_vacc4 .
## opinion_h1n1_sick_from_vacc5 .
## opinion_seas_vacc_effective4 .
## opinion_seas_vacc_effective5 -0.2301889533
## opinion_seas_risk2 0.1608195887
## opinion_seas_risk4 .
## opinion_seas_risk5 -0.0306915621
## opinion_seas_sick_from_vacc2 .
## opinion_seas_sick_from_vacc4 -0.0749268135
## opinion_seas_sick_from_vacc5 -0.2172780015
## age_group35 - 44 Years -0.2021337824
## age_group45 - 54 Years -0.1510577433
## age_group55 - 64 Years 0.1178112899
## age_group65+ Years .
## education12 Years 0.0158972828
## educationCollege Graduate 0.0772361159
## educationSome College .
## raceHispanic 0.2321002903
## raceOther or Multiple 0.2451764530
## raceWhite 0.1280152000
## sexMale 0.1721864558
## income_poverty> $75,000 -0.0259869989
## income_povertyBelow Poverty .
## marital_statusNot Married -0.1057994628
## rent_or_ownRent .
## employment_statusNot in Labor Force .
## employment_statusUnemployed 0.0564782461
## census_msaMSA, Principle City 0.0491123669
## census_msaNon-MSA 0.0643672885
## household_adults1 .
## household_adults2 0.0314902839
## household_children1 0.0244918652
## household_children2 -0.0491974842
## household_children3 -0.0359665084
## seasonal_vaccine1 2.0281801566
# 結果:
#1 doctor_recc_h1n1
#2 seasonal_vaccine
#3 opinion_h1n1_risk
#4 opinion_h1n1_vacc_effective
#5 doctor_recc_seasonal
#6 health_worker
#7 opinion_h1n1_sick_from_vacc
#8 race
#9 behavioral_large_gatherings
#10 opinion_seas_vacc_effective
#11 child_under_6_months
#12 opinion_seas_sick_from_vacc
#13 behavioral_face_mask
#14 age_group
#15 sex
#16 opinion_seas_risk
#17 h1n1_concern
#18 behavioral_antiviral_meds
#19 marital_status
#20 education
#21 census_msa
#22 employment_status
#23 behavioral_avoidance
#24 h1n1_knowledge
#25 household_children
#26 chronic_med_condition
#27 household_adults
#28 behavioral_outside_home
#29 income_poverty
#30 behavioral_wash_hands
p_model <- glm(h1n1_vaccine ~ ., data = train_data, family = "binomial")
train_pred_prob = predict(p_model, newdata = train_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_pred_prob = predict(p_model, newdata = test_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#畫出roc曲線
#看training和testing的AUC
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.877,cotoff = 0.222
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = 0.867,cotoff = 0.196
p_model_GLM_16 <- glm(h1n1_vaccine ~ doctor_recc_h1n1
+doctor_recc_seasonal
+health_worker
+opinion_h1n1_vacc_effective
+opinion_h1n1_risk
+seasonal_vaccine
+opinion_h1n1_sick_from_vacc
+behavioral_large_gatherings
+sex
+child_under_6_months
+opinion_seas_vacc_effective
+behavioral_face_mask
+race
+age_group
+opinion_seas_risk
+opinion_seas_sick_from_vacc,data = train_data, family = "binomial")
train_pred_prob = predict(p_model_GLM_16 , newdata = train_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_pred_prob = predict(p_model_GLM_16 , newdata = test_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.877,cotoff = 0.220
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = 0.866,cotoff = 0.193
p_model_GLM_15 <- glm(h1n1_vaccine ~ doctor_recc_h1n1
+doctor_recc_seasonal
+health_worker
+opinion_h1n1_vacc_effective
+opinion_h1n1_risk
+seasonal_vaccine
+opinion_h1n1_sick_from_vacc
+behavioral_large_gatherings
+sex
+child_under_6_months
+opinion_seas_vacc_effective
+behavioral_face_mask
+race
+age_group
+opinion_seas_risk,data = train_data, family = "binomial")
train_pred_prob = predict(p_model_GLM_15 , newdata = train_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_pred_prob = predict(p_model_GLM_15 , newdata = test_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#畫出roc曲線
#看看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.877,cotoff = 0.243
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = 0.866,cotoff = 0.209
p_model_GLM_14 <- glm(h1n1_vaccine ~ doctor_recc_h1n1
+doctor_recc_seasonal
+health_worker
+opinion_h1n1_vacc_effective
+opinion_h1n1_risk
+seasonal_vaccine
+opinion_h1n1_sick_from_vacc
+behavioral_large_gatherings
+sex
+child_under_6_months
+opinion_seas_vacc_effective
+behavioral_face_mask
+race
+age_group,data = train_data, family = "binomial")
train_pred_prob = predict(p_model_GLM_14 , newdata = train_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_pred_prob = predict(p_model_GLM_14 , newdata = test_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.876,cotoff = 0.198
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = 0.866,cotoff = 0.172
r_model_GLM_16 <- glm(h1n1_vaccine ~ doctor_recc_h1n1
+seasonal_vaccine
+opinion_h1n1_risk
+opinion_h1n1_vacc_effective
+age_group
+opinion_seas_risk
+opinion_h1n1_sick_from_vacc
+h1n1_concern
+education
+opinion_seas_sick_from_vacc
+census_msa
+doctor_recc_seasonal
+household_children
+h1n1_knowledge
+household_adults
+opinion_seas_vacc_effective,data = train_data, family = "binomial")
train_pred_prob = predict(r_model_GLM_16 , newdata = train_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_pred_prob = predict(r_model_GLM_16 , newdata = test_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.874,cotoff = 0.220
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = 0.865,cotoff = 0.231
e_model_GLM_16 <- glm(h1n1_vaccine ~ +doctor_recc_h1n1
+seasonal_vaccine
+opinion_h1n1_risk
+opinion_h1n1_vacc_effective
+doctor_recc_seasonal
+health_worker
+opinion_h1n1_sick_from_vacc
+race
+behavioral_large_gatherings
+opinion_seas_vacc_effective
+child_under_6_months
+opinion_seas_sick_from_vacc
+behavioral_face_mask
+age_group
+sex
+opinion_seas_risk,data = train_data, family = "binomial")
train_pred_prob = predict(e_model_GLM_16 , newdata = train_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_pred_prob = predict(e_model_GLM_16 , newdata = test_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.877,cotoff = 0.220
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = 0.866,cotoff = 0.193
e_model_GLM_15 <- glm(h1n1_vaccine ~ +doctor_recc_h1n1
+seasonal_vaccine
+opinion_h1n1_risk
+opinion_h1n1_vacc_effective
+doctor_recc_seasonal
+health_worker
+opinion_h1n1_sick_from_vacc
+race
+behavioral_large_gatherings
+opinion_seas_vacc_effective
+child_under_6_months
+opinion_seas_sick_from_vacc
+behavioral_face_mask
+age_group
+sex,data = train_data, family = "binomial")
train_pred_prob = predict(e_model_GLM_15 , newdata = train_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_pred_prob = predict(e_model_GLM_15 , newdata = test_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.877,cotoff = 0.199
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = .866,cotoff = 0.180
e_model_GLM_14 <- glm(h1n1_vaccine ~ +doctor_recc_h1n1
+seasonal_vaccine
+opinion_h1n1_risk
+opinion_h1n1_vacc_effective
+doctor_recc_seasonal
+health_worker
+opinion_h1n1_sick_from_vacc
+race
+behavioral_large_gatherings
+opinion_seas_vacc_effective
+child_under_6_months
+opinion_seas_sick_from_vacc
+behavioral_face_mask
+age_group,data = train_data, family = "binomial")
train_pred_prob = predict(e_model_GLM_14 , newdata = train_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
test_pred_prob = predict(e_model_GLM_14 , newdata = test_data,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.876,cotoff = 0.236
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = 0.866,cotoff = 0.189
library(rpart); library(rpart.plot);library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
p_model_CART_14 = rpart(h1n1_vaccine ~ doctor_recc_h1n1
+seasonal_vaccine
+opinion_h1n1_risk
+opinion_h1n1_vacc_effective
+age_group
+opinion_seas_risk
+opinion_h1n1_sick_from_vacc
+h1n1_concern
+education
+opinion_seas_sick_from_vacc
+census_msa
+doctor_recc_seasonal
+household_children
+h1n1_knowledge
+household_adults,train_data,na.action = na.omit,
control = rpart.control(maxdepth = 2))
p_model_CART_14
## n= 18694
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 18694 3976 0 (0.7873114 0.2126886)
## 2) doctor_recc_h1n1=0,2 14915 1950 0 (0.8692591 0.1307409) *
## 3) doctor_recc_h1n1=1 3779 1753 1 (0.4638793 0.5361207)
## 6) seasonal_vaccine=0 1260 295 0 (0.7658730 0.2341270) *
## 7) seasonal_vaccine=1 2519 788 1 (0.3128225 0.6871775) *
plot.party(as.party(p_model_CART_14))
printcp(x = p_model_CART_14)
##
## Classification tree:
## rpart(formula = h1n1_vaccine ~ doctor_recc_h1n1 + seasonal_vaccine +
## opinion_h1n1_risk + opinion_h1n1_vacc_effective + age_group +
## opinion_seas_risk + opinion_h1n1_sick_from_vacc + h1n1_concern +
## education + opinion_seas_sick_from_vacc + census_msa + doctor_recc_seasonal +
## household_children + h1n1_knowledge + household_adults, data = train_data,
## na.action = na.omit, control = rpart.control(maxdepth = 2))
##
## Variables actually used in tree construction:
## [1] doctor_recc_h1n1 seasonal_vaccine
##
## Root node error: 3976/18694 = 0.21269
##
## n= 18694
##
## CP nsplit rel error xerror xstd
## 1 0.11859 0 1.00000 1.00000 0.014072
## 2 0.01000 2 0.76283 0.76283 0.012678
plotcp(x = p_model_CART_14)
train_pred_prob = predict(p_model_CART_14 , newdata = train_data,type = "prob")[, "1"];
test_pred_prob = predict(p_model_CART_14 , newdata = test_data,type = "prob")[, "1"];
#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.708,cotoff = 0.236
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = 0.702,cotoff = 0.189
library(randomForest)
RF_rf = randomForest(h1n1_vaccine ~ doctor_recc_h1n1
+seasonal_vaccine
+opinion_h1n1_risk
+opinion_h1n1_vacc_effective
+age_group
+opinion_seas_risk
+opinion_h1n1_sick_from_vacc
+h1n1_concern
+education
+opinion_seas_sick_from_vacc
+census_msa
+doctor_recc_seasonal
+household_children
+h1n1_knowledge
+household_adults, train_data, ntree=100, mtry=2, importance=TRUE)
RF_rf
##
## Call:
## randomForest(formula = h1n1_vaccine ~ doctor_recc_h1n1 + seasonal_vaccine + opinion_h1n1_risk + opinion_h1n1_vacc_effective + age_group + opinion_seas_risk + opinion_h1n1_sick_from_vacc + h1n1_concern + education + opinion_seas_sick_from_vacc + census_msa + doctor_recc_seasonal + household_children + h1n1_knowledge + household_adults, data = train_data, ntree = 100, mtry = 2, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 15.57%
## Confusion matrix:
## 0 1 class.error
## 0 13995 723 0.04912352
## 1 2188 1788 0.55030181
plot(RF_rf)
train_pred_prob = predict(RF_rf , newdata = train_data,type = "prob")[, "1"];
test_pred_prob = predict(RF_rf , newdata = test_data,type = "prob")[, "1"];
#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(model_roc_train,print.thres = "best",
print.thres.best.method="youden", print.auc = T,col = "green")
# AUC = 0.708,cotoff = 0.236
plot.roc(model_roc_test,print.thres = "best",
print.thres.best.method="youden", print.auc = T, col = "blue")
# AUC = 0.702,cotoff = 0.189