一、資料處理

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

load("train.rdata")
data<- train
data[,c(2:21,31:34)] <- lapply(c(2:21,31:34), function(a)factor(data[[a]]))

# summary(data)
data <- data[-1]
set.seed(0)
train_idx <- sample(1:nrow(data), size = 0.7 * nrow(data))
train_data <- data[train_idx, ]
test_data <- data[-train_idx, ]

二、挑選變數

1.用p-value排列重要性(小到大)

model <- glm(h1n1_vaccine ~ ., data = train_data, family = "binomial")
summary(model)

## 
## Call:
## glm(formula = h1n1_vaccine ~ ., family = "binomial", data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4686  -0.5138  -0.2765  -0.1445   3.1589  
## 
## Coefficients: (1 not defined because of singularities)
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -4.6611597  0.2570945 -18.130  < 2e-16 ***
## h1n1_concern1                        0.0223710  0.0903792   0.248 0.804503    
## h1n1_concern2                       -0.0392892  0.0910149  -0.432 0.665975    
## h1n1_concern3                       -0.1987530  0.1031681  -1.926 0.054042 .  
## h1n1_knowledge1                     -0.0484226  0.0921482  -0.525 0.599245    
## h1n1_knowledge2                      0.0412901  0.0972068   0.425 0.671007    
## behavioral_antiviral_meds1           0.1631733  0.1026373   1.590 0.111879    
## behavioral_avoidance1               -0.0631843  0.0582705  -1.084 0.278220    
## behavioral_face_mask1                0.2383210  0.0863962   2.758 0.005807 ** 
## behavioral_wash_hands1              -0.0077819  0.0729530  -0.107 0.915050    
## behavioral_large_gatherings1        -0.2507472  0.0595166  -4.213 2.52e-05 ***
## behavioral_outside_home1            -0.0304863  0.0603161  -0.505 0.613248    
## behavioral_touch_face1               0.0008523  0.0561948   0.015 0.987899    
## doctor_recc_h1n11                    2.2714437  0.0675161  33.643  < 2e-16 ***
## doctor_recc_h1n12                   -0.7941992  0.1104525  -7.190 6.46e-13 ***
## doctor_recc_seasonal1               -1.0291488  0.0666595 -15.439  < 2e-16 ***
## doctor_recc_seasonal2                       NA         NA      NA       NA    
## chronic_med_condition1               0.0512349  0.0520272   0.985 0.324737    
## child_under_6_months1                0.2535883  0.0799855   3.170 0.001522 ** 
## health_worker1                       0.6755467  0.0676711   9.983  < 2e-16 ***
## opinion_h1n1_vacc_effective3         0.5655488  0.1561244   3.622 0.000292 ***
## opinion_h1n1_vacc_effective4         0.9412257  0.1458487   6.453 1.09e-10 ***
## opinion_h1n1_vacc_effective5         1.8451998  0.1486259  12.415  < 2e-16 ***
## opinion_h1n1_risk2                   0.5110326  0.0690307   7.403 1.33e-13 ***
## opinion_h1n1_risk4                   1.2696159  0.0794092  15.988  < 2e-16 ***
## opinion_h1n1_risk5                   1.6855449  0.1076469  15.658  < 2e-16 ***
## opinion_h1n1_sick_from_vacc2        -0.3428712  0.0617139  -5.556 2.76e-08 ***
## opinion_h1n1_sick_from_vacc4        -0.0330691  0.0701861  -0.471 0.637524    
## opinion_h1n1_sick_from_vacc5        -0.0307478  0.1040346  -0.296 0.767571    
## opinion_seas_vacc_effective4        -0.1118716  0.1088141  -1.028 0.303903    
## opinion_seas_vacc_effective5        -0.3583658  0.1138199  -3.149 0.001641 ** 
## opinion_seas_risk2                   0.1809568  0.0804374   2.250 0.024470 *  
## opinion_seas_risk4                   0.0094817  0.0861716   0.110 0.912383    
## opinion_seas_risk5                  -0.0459962  0.1028933  -0.447 0.654855    
## opinion_seas_sick_from_vacc2         0.0037181  0.0589156   0.063 0.949680    
## opinion_seas_sick_from_vacc4        -0.0870321  0.0692468  -1.257 0.208812    
## opinion_seas_sick_from_vacc5        -0.2357731  0.1128045  -2.090 0.036608 *  
## age_group35 - 44 Years              -0.2236329  0.0859782  -2.601 0.009294 ** 
## age_group45 - 54 Years              -0.1741991  0.0808925  -2.153 0.031282 *  
## age_group55 - 64 Years               0.1223979  0.0826342   1.481 0.138553    
## age_group65+ Years                  -0.0082435  0.0900258  -0.092 0.927041    
## education12 Years                    0.0733119  0.0988344   0.742 0.458229    
## educationCollege Graduate            0.1292342  0.0991099   1.304 0.192251    
## educationSome College                0.0315370  0.0990303   0.318 0.750138    
## raceHispanic                         0.3488563  0.1276015   2.734 0.006258 ** 
## raceOther or Multiple                0.3552191  0.1298520   2.736 0.006227 ** 
## raceWhite                            0.2142582  0.0982327   2.181 0.029174 *  
## sexMale                              0.1819071  0.0485781   3.745 0.000181 ***
## income_poverty> $75,000             -0.0551555  0.0574496  -0.960 0.337021    
## income_povertyBelow Poverty         -0.0107401  0.0888894  -0.121 0.903829    
## marital_statusNot Married           -0.1110293  0.0641974  -1.729 0.083720 .  
## rent_or_ownRent                      0.0089923  0.0641737   0.140 0.888562    
## employment_statusNot in Labor Force  0.0056487  0.0588539   0.096 0.923538    
## employment_statusUnemployed          0.1006140  0.1110766   0.906 0.365037    
## census_msaMSA, Principle City        0.0759100  0.0547335   1.387 0.165471    
## census_msaNon-MSA                    0.0889092  0.0564080   1.576 0.114984    
## household_adults1                    0.0248465  0.0686551   0.362 0.717424    
## household_adults2                    0.0726677  0.0914408   0.795 0.426790    
## household_children1                  0.0391762  0.0786495   0.498 0.618406    
## household_children2                 -0.0723237  0.0850295  -0.851 0.395007    
## household_children3                 -0.0710714  0.1080214  -0.658 0.510578    
## seasonal_vaccine1                    2.0660939  0.0589995  35.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19348  on 18693  degrees of freedom
## Residual deviance: 12710  on 18633  degrees of freedom
## AIC: 12832
## 
## Number of Fisher Scoring iterations: 6

# 結果：
#1 doctor_recc_h1n1
#2 doctor_recc_seasonal
#3 health_worker
#4 opinion_h1n1_vacc_effective
#5 opinion_h1n1_risk
#6 seasonal_vaccine
#7 opinion_h1n1_sick_from_vacc
#8 behavioral_large_gatherings
#9 sex
#10 child_under_6_months
#11 opinion_seas_vacc_effective
#12 behavioral_face_mask
#13 race
#14 age_group
#15 opinion_seas_risk
#16 opinion_seas_sick_from_vacc
#17 h1n1_concern
#18 marital_status
#19 behavioral_antiviral_meds
#20 census_msa
#21 education
#22 behavioral_avoidance
#23 chronic_med_condition
#24 income_poverty
#25 employment_status
#26 household_children
#27 household_adults
#28 h1n1_knowledge
#29 behavioral_outside_home
#30 rent_or_own
#31 behavioral_wash_hands
#32 behavioral_touch_face

2.用隨機森林看變數重要性

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

rf <- randomForest(h1n1_vaccine ~ ., train_data)
rf_pred_train <- predict(rf, train_data)
rf_pred_test <- predict(rf, test_data)
importance(rf)

##                             MeanDecreaseGini
## h1n1_concern                       233.62608
## h1n1_knowledge                     156.09912
## behavioral_antiviral_meds           40.73726
## behavioral_avoidance                90.29804
## behavioral_face_mask                52.06175
## behavioral_wash_hands               61.89743
## behavioral_large_gatherings         96.69375
## behavioral_outside_home             95.07615
## behavioral_touch_face               94.32057
## doctor_recc_h1n1                   678.15275
## doctor_recc_seasonal               193.86674
## chronic_med_condition               98.14627
## child_under_6_months                57.74945
## health_worker                      122.48378
## opinion_h1n1_vacc_effective        363.51193
## opinion_h1n1_risk                  429.10475
## opinion_h1n1_sick_from_vacc        237.11170
## opinion_seas_vacc_effective        147.72297
## opinion_seas_risk                  258.55647
## opinion_seas_sick_from_vacc        226.34071
## age_group                          309.30970
## education                          230.05704
## race                               145.69007
## sex                                110.02518
## income_poverty                     141.08452
## marital_status                      91.72238
## rent_or_own                         77.56683
## employment_status                  131.21883
## census_msa                         206.34013
## household_adults                   150.96274
## household_children                 172.10172
## seasonal_vaccine                   549.50472

# 結果：
#1 doctor_recc_h1n1
#2 seasonal_vaccine
#3 opinion_h1n1_risk
#4 opinion_h1n1_vacc_effective
#5 age_group
#6 opinion_seas_risk
#7 opinion_h1n1_sick_from_vacc
#8 h1n1_concern
#9 education
#10 opinion_seas_sick_from_vacc
#11 census_msa
#12 doctor_recc_seasonal
#13 household_children
#14 h1n1_knowledge
#15 household_adults
#16 opinion_seas_vacc_effective
#17 race
#18 income_poverty
#19 employment_status
#20 health_worker
#21 sex
#22 chronic_med_condition
#23 behavioral_large_gatherings
#24 behavioral_outside_home
#25 behavioral_touch_face
#26 marital_status
#27 behavioral_avoidance
#28 rent_or_own
#29 behavioral_wash_hands
#30 child_under_6_months
#31 behavioral_face_mask
#32 behavioral_antiviral_meds

3.用elastic net 挑變數

# Dumy code categorical predictor variables
x <- model.matrix(h1n1_vaccine ~ ., train_data)[,-1]
# Convert the outcome (class) to a numerical variable
y <- as.numeric(train_data$h1n1_vaccine)

library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 3.0-2

glmnet(x, y, family = "binomial", alpha = 1, lambda = NULL)

## 
## Call:  glmnet(x = x, y = y, family = "binomial", alpha = 1, lambda = NULL) 
## 
##    Df    %Dev   Lambda
## 1   0 0.00000 0.162800
## 2   2 0.02862 0.148300
## 3   2 0.06288 0.135200
## 4   2 0.09095 0.123200
## 5   2 0.11420 0.112200
## 6   2 0.13360 0.102200
## 7   3 0.15300 0.093160
## 8   3 0.17120 0.084890
## 9   3 0.18650 0.077340
## 10  3 0.19960 0.070470
## 11  4 0.21090 0.064210
## 12  4 0.22210 0.058510
## 13  4 0.23160 0.053310
## 14  6 0.24170 0.048570
## 15  6 0.25140 0.044260
## 16  6 0.25960 0.040330
## 17  6 0.26660 0.036740
## 18  6 0.27260 0.033480
## 19  6 0.27770 0.030510
## 20  6 0.28210 0.027800
## 21  6 0.28580 0.025330
## 22  6 0.28900 0.023080
## 23  6 0.29160 0.021030
## 24  7 0.29440 0.019160
## 25  7 0.29830 0.017460
## 26  7 0.30170 0.015910
## 27 11 0.30500 0.014490
## 28 11 0.30840 0.013210
## 29 12 0.31130 0.012030
## 30 13 0.31390 0.010960
## 31 15 0.31620 0.009989
## 32 18 0.31840 0.009102
## 33 20 0.32060 0.008293
## 34 21 0.32300 0.007557
## 35 23 0.32520 0.006885
## 36 25 0.32710 0.006274
## 37 25 0.32910 0.005716
## 38 27 0.33090 0.005208
## 39 27 0.33250 0.004746
## 40 27 0.33390 0.004324
## 41 26 0.33510 0.003940
## 42 28 0.33600 0.003590
## 43 30 0.33680 0.003271
## 44 31 0.33760 0.002980
## 45 32 0.33820 0.002716
## 46 33 0.33870 0.002474
## 47 35 0.33920 0.002255
## 48 35 0.33960 0.002054
## 49 37 0.34000 0.001872
## 50 38 0.34030 0.001706
## 51 43 0.34060 0.001554
## 52 44 0.34080 0.001416
## 53 46 0.34110 0.001290
## 54 47 0.34130 0.001176
## 55 47 0.34160 0.001071
## 56 47 0.34180 0.000976
## 57 47 0.34200 0.000889
## 58 48 0.34210 0.000810
## 59 48 0.34230 0.000738
## 60 48 0.34240 0.000673
## 61 50 0.34250 0.000613
## 62 52 0.34260 0.000558
## 63 52 0.34260 0.000509
## 64 52 0.34270 0.000464
## 65 52 0.34280 0.000422
## 66 53 0.34280 0.000385
## 67 54 0.34280 0.000351
## 68 54 0.34290 0.000320
## 69 54 0.34290 0.000291
## 70 54 0.34290 0.000265
## 71 55 0.34300 0.000242
## 72 56 0.34300 0.000220
## 73 56 0.34300 0.000201
## 74 56 0.34300 0.000183
## 75 56 0.34300 0.000167
## 76 58 0.34300 0.000152

# Find the best lambda using cross-validation
set.seed(123) 
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
# Fit the final model on the training data
model <- glmnet(x, y, alpha = 1, family = "binomial",
                lambda = cv.lasso$lambda.min)
# Display regression coefficients
coef(model)

## 62 x 1 sparse Matrix of class "dgCMatrix"
##                                                s0
## (Intercept)                         -4.2918160910
## h1n1_concern1                        0.0397068298
## h1n1_concern2                        .           
## h1n1_concern3                       -0.1547170522
## h1n1_knowledge1                     -0.0258147848
## h1n1_knowledge2                      0.0561269791
## behavioral_antiviral_meds1           0.1274422409
## behavioral_avoidance1               -0.0550508504
## behavioral_face_mask1                0.2030208300
## behavioral_wash_hands1              -0.0004918176
## behavioral_large_gatherings1        -0.2434868154
## behavioral_outside_home1            -0.0263386294
## behavioral_touch_face1               .           
## doctor_recc_h1n11                    2.2250658195
## doctor_recc_h1n12                   -0.4061111854
## doctor_recc_seasonal1               -0.9824188151
## doctor_recc_seasonal2               -0.3381990157
## chronic_med_condition1               0.0321626339
## child_under_6_months1                0.2242859187
## health_worker1                       0.6583198309
## opinion_h1n1_vacc_effective3         0.2256687133
## opinion_h1n1_vacc_effective4         0.6167767804
## opinion_h1n1_vacc_effective5         1.5076227882
## opinion_h1n1_risk2                   0.4555663925
## opinion_h1n1_risk4                   1.2003765342
## opinion_h1n1_risk5                   1.5905801871
## opinion_h1n1_sick_from_vacc2        -0.3029451009
## opinion_h1n1_sick_from_vacc4         .           
## opinion_h1n1_sick_from_vacc5         .           
## opinion_seas_vacc_effective4         .           
## opinion_seas_vacc_effective5        -0.2301220379
## opinion_seas_risk2                   0.1607238300
## opinion_seas_risk4                   .           
## opinion_seas_risk5                  -0.0307042198
## opinion_seas_sick_from_vacc2         .           
## opinion_seas_sick_from_vacc4        -0.0750766839
## opinion_seas_sick_from_vacc5        -0.2176124948
## age_group35 - 44 Years              -0.2022710831
## age_group45 - 54 Years              -0.1511210240
## age_group55 - 64 Years               0.1177634296
## age_group65+ Years                   .           
## education12 Years                    0.0154925526
## educationCollege Graduate            0.0771036301
## educationSome College                .           
## raceHispanic                         0.2284750812
## raceOther or Multiple                0.2415486185
## raceWhite                            0.1253753217
## sexMale                              0.1721449147
## income_poverty> $75,000             -0.0261027750
## income_povertyBelow Poverty          .           
## marital_statusNot Married           -0.1058739529
## rent_or_ownRent                      .           
## employment_statusNot in Labor Force  .           
## employment_statusUnemployed          0.0566076066
## census_msaMSA, Principle City        0.0489674024
## census_msaNon-MSA                    0.0644289184
## household_adults1                    .           
## household_adults2                    0.0314940833
## household_children1                  0.0245661096
## household_children2                 -0.0491030758
## household_children3                 -0.0357309981
## seasonal_vaccine1                    2.0274008582

# Make predictions on the test data
x.test <- model.matrix(h1n1_vaccine ~., test_data)[,-1]
probabilities <- model %>% predict(newx = x.test)
predicted.classes <- ifelse(probabilities > 0.5, "0", "1")
# Model accuracy
observed.classes <- test_data$h1n1_vaccine
mean(predicted.classes == observed.classes)

## [1] 0.1609884

library(glmnet)
set.seed(123)
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
plot(cv.lasso)

coef(cv.lasso, cv.lasso$lambda.min)

## 62 x 1 sparse Matrix of class "dgCMatrix"
##                                                 1
## (Intercept)                         -4.2891182663
## h1n1_concern1                        0.0396536632
## h1n1_concern2                        .           
## h1n1_concern3                       -0.1546387596
## h1n1_knowledge1                     -0.0277052952
## h1n1_knowledge2                      0.0542458946
## behavioral_antiviral_meds1           0.1276831287
## behavioral_avoidance1               -0.0549870663
## behavioral_face_mask1                0.2030913045
## behavioral_wash_hands1              -0.0003458214
## behavioral_large_gatherings1        -0.2436000010
## behavioral_outside_home1            -0.0263000863
## behavioral_touch_face1               .           
## doctor_recc_h1n11                    2.2256860566
## doctor_recc_h1n12                   -0.5475096081
## doctor_recc_seasonal1               -0.9828377949
## doctor_recc_seasonal2               -0.1971709402
## chronic_med_condition1               0.0321340180
## child_under_6_months1                0.2243628482
## health_worker1                       0.6583935172
## opinion_h1n1_vacc_effective3         0.2201165380
## opinion_h1n1_vacc_effective4         0.6117284145
## opinion_h1n1_vacc_effective5         1.5028898463
## opinion_h1n1_risk2                   0.4557652122
## opinion_h1n1_risk4                   1.2006770678
## opinion_h1n1_risk5                   1.5910251547
## opinion_h1n1_sick_from_vacc2        -0.3029826801
## opinion_h1n1_sick_from_vacc4         .           
## opinion_h1n1_sick_from_vacc5         .           
## opinion_seas_vacc_effective4         .           
## opinion_seas_vacc_effective5        -0.2301889533
## opinion_seas_risk2                   0.1608195887
## opinion_seas_risk4                   .           
## opinion_seas_risk5                  -0.0306915621
## opinion_seas_sick_from_vacc2         .           
## opinion_seas_sick_from_vacc4        -0.0749268135
## opinion_seas_sick_from_vacc5        -0.2172780015
## age_group35 - 44 Years              -0.2021337824
## age_group45 - 54 Years              -0.1510577433
## age_group55 - 64 Years               0.1178112899
## age_group65+ Years                   .           
## education12 Years                    0.0158972828
## educationCollege Graduate            0.0772361159
## educationSome College                .           
## raceHispanic                         0.2321002903
## raceOther or Multiple                0.2451764530
## raceWhite                            0.1280152000
## sexMale                              0.1721864558
## income_poverty> $75,000             -0.0259869989
## income_povertyBelow Poverty          .           
## marital_statusNot Married           -0.1057994628
## rent_or_ownRent                      .           
## employment_statusNot in Labor Force  .           
## employment_statusUnemployed          0.0564782461
## census_msaMSA, Principle City        0.0491123669
## census_msaNon-MSA                    0.0643672885
## household_adults1                    .           
## household_adults2                    0.0314902839
## household_children1                  0.0244918652
## household_children2                 -0.0491974842
## household_children3                 -0.0359665084
## seasonal_vaccine1                    2.0281801566

# 結果：
#1 doctor_recc_h1n1
#2 seasonal_vaccine
#3 opinion_h1n1_risk
#4 opinion_h1n1_vacc_effective
#5 doctor_recc_seasonal
#6 health_worker
#7 opinion_h1n1_sick_from_vacc
#8 race
#9 behavioral_large_gatherings
#10 opinion_seas_vacc_effective
#11 child_under_6_months
#12 opinion_seas_sick_from_vacc
#13 behavioral_face_mask
#14 age_group
#15 sex
#16 opinion_seas_risk
#17 h1n1_concern
#18 behavioral_antiviral_meds
#19 marital_status
#20 education
#21 census_msa
#22 employment_status
#23 behavioral_avoidance
#24 h1n1_knowledge
#25 household_children
#26 chronic_med_condition
#27 household_adults
#28 behavioral_outside_home
#29 income_poverty
#30 behavioral_wash_hands

三、跑羅吉斯回歸模型

1. p-value + glm

a. 全部的變數

p_model <- glm(h1n1_vaccine ~ ., data = train_data, family = "binomial")

train_pred_prob = predict(p_model, newdata = train_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_pred_prob = predict(p_model, newdata = test_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

#畫出roc曲線
#看training和testing的AUC
library("pROC")

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.877，cotoff = 0.222
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = 0.867，cotoff = 0.196

b. 16個變數

p_model_GLM_16 <- glm(h1n1_vaccine ~  doctor_recc_h1n1
                  +doctor_recc_seasonal
                  +health_worker
                  +opinion_h1n1_vacc_effective
                  +opinion_h1n1_risk
                  +seasonal_vaccine
                  +opinion_h1n1_sick_from_vacc
                  +behavioral_large_gatherings
                  +sex
                  +child_under_6_months
                  +opinion_seas_vacc_effective
                  +behavioral_face_mask
                  +race
                  +age_group
                  +opinion_seas_risk
                  +opinion_seas_sick_from_vacc,data = train_data, family = "binomial")

train_pred_prob = predict(p_model_GLM_16 , newdata = train_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_pred_prob = predict(p_model_GLM_16 , newdata = test_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.877，cotoff = 0.220
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = 0.866，cotoff = 0.193

c. 15個變數

p_model_GLM_15 <- glm(h1n1_vaccine ~  doctor_recc_h1n1
                    +doctor_recc_seasonal
                    +health_worker
                    +opinion_h1n1_vacc_effective
                    +opinion_h1n1_risk
                    +seasonal_vaccine
                    +opinion_h1n1_sick_from_vacc
                    +behavioral_large_gatherings
                    +sex
                    +child_under_6_months
                    +opinion_seas_vacc_effective
                    +behavioral_face_mask
                    +race
                    +age_group
                    +opinion_seas_risk,data = train_data, family = "binomial")


train_pred_prob = predict(p_model_GLM_15 , newdata = train_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_pred_prob = predict(p_model_GLM_15 , newdata = test_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

#畫出roc曲線
#看看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.877，cotoff = 0.243
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = 0.866，cotoff = 0.209

d. 14個變數

p_model_GLM_14 <- glm(h1n1_vaccine ~  doctor_recc_h1n1
                  +doctor_recc_seasonal
                  +health_worker
                  +opinion_h1n1_vacc_effective
                  +opinion_h1n1_risk
                  +seasonal_vaccine
                  +opinion_h1n1_sick_from_vacc
                  +behavioral_large_gatherings
                  +sex
                  +child_under_6_months
                  +opinion_seas_vacc_effective
                  +behavioral_face_mask
                  +race
                  +age_group,data = train_data, family = "binomial")

train_pred_prob = predict(p_model_GLM_14 , newdata = train_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_pred_prob = predict(p_model_GLM_14 , newdata = test_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.876，cotoff = 0.198
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = 0.866，cotoff = 0.172

2. 隨機森林 + glm

a. 16個變數

r_model_GLM_16 <- glm(h1n1_vaccine ~ doctor_recc_h1n1
                 +seasonal_vaccine
                 +opinion_h1n1_risk
                 +opinion_h1n1_vacc_effective
                 +age_group
                 +opinion_seas_risk
                 +opinion_h1n1_sick_from_vacc
                 +h1n1_concern
                 +education
                 +opinion_seas_sick_from_vacc
                 +census_msa
                 +doctor_recc_seasonal
                 +household_children
                 +h1n1_knowledge
                 +household_adults
                 +opinion_seas_vacc_effective,data = train_data, family = "binomial")

train_pred_prob = predict(r_model_GLM_16 , newdata = train_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_pred_prob = predict(r_model_GLM_16 , newdata = test_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.874，cotoff = 0.220
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = 0.865，cotoff = 0.231

3. elastic net + glm

a. 16個變數

e_model_GLM_16 <- glm(h1n1_vaccine ~ +doctor_recc_h1n1
                 +seasonal_vaccine
                 +opinion_h1n1_risk
                 +opinion_h1n1_vacc_effective
                 +doctor_recc_seasonal
                 +health_worker
                 +opinion_h1n1_sick_from_vacc
                 +race
                 +behavioral_large_gatherings
                 +opinion_seas_vacc_effective
                 +child_under_6_months
                 +opinion_seas_sick_from_vacc
                 +behavioral_face_mask
                 +age_group
                 +sex
                 +opinion_seas_risk,data = train_data, family = "binomial")

train_pred_prob = predict(e_model_GLM_16 , newdata = train_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_pred_prob = predict(e_model_GLM_16 , newdata = test_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.877，cotoff = 0.220
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = 0.866，cotoff = 0.193

b. 15個變數

e_model_GLM_15 <- glm(h1n1_vaccine ~ +doctor_recc_h1n1
                   +seasonal_vaccine
                   +opinion_h1n1_risk
                   +opinion_h1n1_vacc_effective
                   +doctor_recc_seasonal
                   +health_worker
                   +opinion_h1n1_sick_from_vacc
                   +race
                   +behavioral_large_gatherings
                   +opinion_seas_vacc_effective
                   +child_under_6_months
                   +opinion_seas_sick_from_vacc
                   +behavioral_face_mask
                   +age_group
                   +sex,data = train_data, family = "binomial")

train_pred_prob = predict(e_model_GLM_15 , newdata = train_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_pred_prob = predict(e_model_GLM_15 , newdata = test_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.877，cotoff = 0.199
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = .866，cotoff = 0.180

c. 14個變數

e_model_GLM_14 <- glm(h1n1_vaccine ~ +doctor_recc_h1n1
                 +seasonal_vaccine
                 +opinion_h1n1_risk
                 +opinion_h1n1_vacc_effective
                 +doctor_recc_seasonal
                 +health_worker
                 +opinion_h1n1_sick_from_vacc
                 +race
                 +behavioral_large_gatherings
                 +opinion_seas_vacc_effective
                 +child_under_6_months
                 +opinion_seas_sick_from_vacc
                 +behavioral_face_mask
                 +age_group,data = train_data, family = "binomial")

train_pred_prob = predict(e_model_GLM_14 , newdata = train_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

test_pred_prob = predict(e_model_GLM_14 , newdata = test_data,type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.876，cotoff = 0.236
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = 0.866，cotoff = 0.189

四、CART(Classification and Regression Tree)決策樹演算法-rpart()

1. p-value + CART

a. 16個變數

library(rpart); library(rpart.plot);library(partykit)

## Loading required package: grid

## Loading required package: libcoin

## Loading required package: mvtnorm

p_model_CART_14 = rpart(h1n1_vaccine ~ doctor_recc_h1n1
                     +seasonal_vaccine
                     +opinion_h1n1_risk
                     +opinion_h1n1_vacc_effective
                     +age_group
                     +opinion_seas_risk
                     +opinion_h1n1_sick_from_vacc
                     +h1n1_concern
                     +education
                     +opinion_seas_sick_from_vacc
                     +census_msa
                     +doctor_recc_seasonal
                     +household_children
                     +h1n1_knowledge
                     +household_adults,train_data,na.action = na.omit,
                     control = rpart.control(maxdepth = 2))
p_model_CART_14

## n= 18694 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 18694 3976 0 (0.7873114 0.2126886)  
##   2) doctor_recc_h1n1=0,2 14915 1950 0 (0.8692591 0.1307409) *
##   3) doctor_recc_h1n1=1 3779 1753 1 (0.4638793 0.5361207)  
##     6) seasonal_vaccine=0 1260  295 0 (0.7658730 0.2341270) *
##     7) seasonal_vaccine=1 2519  788 1 (0.3128225 0.6871775) *

plot.party(as.party(p_model_CART_14))

printcp(x = p_model_CART_14)

## 
## Classification tree:
## rpart(formula = h1n1_vaccine ~ doctor_recc_h1n1 + seasonal_vaccine + 
##     opinion_h1n1_risk + opinion_h1n1_vacc_effective + age_group + 
##     opinion_seas_risk + opinion_h1n1_sick_from_vacc + h1n1_concern + 
##     education + opinion_seas_sick_from_vacc + census_msa + doctor_recc_seasonal + 
##     household_children + h1n1_knowledge + household_adults, data = train_data, 
##     na.action = na.omit, control = rpart.control(maxdepth = 2))
## 
## Variables actually used in tree construction:
## [1] doctor_recc_h1n1 seasonal_vaccine
## 
## Root node error: 3976/18694 = 0.21269
## 
## n= 18694 
## 
##        CP nsplit rel error  xerror     xstd
## 1 0.11859      0   1.00000 1.00000 0.014072
## 2 0.01000      2   0.76283 0.76283 0.012678

plotcp(x = p_model_CART_14)

train_pred_prob = predict(p_model_CART_14 , newdata = train_data,type = "prob")[, "1"];
test_pred_prob = predict(p_model_CART_14 , newdata = test_data,type = "prob")[, "1"];

#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.708，cotoff = 0.236
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = 0.702，cotoff = 0.189

b. 15個變數

c. 14個變數

2. 隨機森林 + CART

a. 16個變數

b. 15個變數

c. 14個變數

3. elastic net + CART

a. 16個變數

b. 15個變數

c. 14個變數

五、跑隨機森林模型

1. p-value + 隨機森林

a. 16個變數

library(randomForest)
RF_rf = randomForest(h1n1_vaccine ~ doctor_recc_h1n1
                             +seasonal_vaccine
                             +opinion_h1n1_risk
                             +opinion_h1n1_vacc_effective
                             +age_group
                             +opinion_seas_risk
                             +opinion_h1n1_sick_from_vacc
                             +h1n1_concern
                             +education
                             +opinion_seas_sick_from_vacc
                             +census_msa
                             +doctor_recc_seasonal
                             +household_children
                             +h1n1_knowledge
                             +household_adults, train_data, ntree=100, mtry=2, importance=TRUE)


RF_rf

## 
## Call:
##  randomForest(formula = h1n1_vaccine ~ doctor_recc_h1n1 + seasonal_vaccine +      opinion_h1n1_risk + opinion_h1n1_vacc_effective + age_group +      opinion_seas_risk + opinion_h1n1_sick_from_vacc + h1n1_concern +      education + opinion_seas_sick_from_vacc + census_msa + doctor_recc_seasonal +      household_children + h1n1_knowledge + household_adults, data = train_data,      ntree = 100, mtry = 2, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 15.57%
## Confusion matrix:
##       0    1 class.error
## 0 13995  723  0.04912352
## 1  2188 1788  0.55030181

plot(RF_rf)

train_pred_prob = predict(RF_rf , newdata = train_data,type = "prob")[, "1"];
test_pred_prob = predict(RF_rf , newdata = test_data,type = "prob")[, "1"];
#畫出roc曲線
#看training和testing的AUC
library("pROC")
model_roc_train = roc(train_data$h1n1_vaccine, train_pred_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

model_roc_test = roc(test_data$h1n1_vaccine, test_pred_prob)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

plot.roc(model_roc_train,print.thres = "best",
         print.thres.best.method="youden", print.auc = T,col = "green")

# AUC = 0.708，cotoff = 0.236
plot.roc(model_roc_test,print.thres = "best",
         print.thres.best.method="youden", print.auc = T, col = "blue")

# AUC = 0.702，cotoff = 0.189

final

一、資料處理

二、挑選變數

1.用p-value排列重要性(小到大)

2.用隨機森林看變數重要性

3.用elastic net 挑變數

三、跑羅吉斯回歸模型

1. p-value + glm

a. 全部的變數

b. 16個變數

c. 15個變數

d. 14個變數

2. 隨機森林 + glm

a. 16個變數

3. elastic net + glm

a. 16個變數

b. 15個變數

c. 14個變數

四、CART(Classification and Regression Tree)決策樹演算法-rpart()

1. p-value + CART

a. 16個變數

b. 15個變數

c. 14個變數

2. 隨機森林 + CART

a. 16個變數

b. 15個變數

c. 14個變數

3. elastic net + CART

a. 16個變數

b. 15個變數

c. 14個變數

五、跑隨機森林模型

1. p-value + 隨機森林

a. 16個變數

b. 15個變數

c. 14個變數

2. 隨機森林 + 隨機森林

a. 16個變數

b. 15個變數

c. 14個變數

3. elastic net + 隨機森林

a. 16個變數

b. 15個變數

c. 14個變數

六、最終模型

1. AUC

2. Confusion matrix