library(dplyr)
library(mdscore)
## Warning: package 'MASS' was built under R version 3.4.3
library(aod)
library(tidyr)
library(ggplot2)
library(magrittr)
library(quantreg)
library(ROCR)
setwd("~/Desktop/health~/")
health1994 <- read.csv("health1994.csv",fileEncoding = "big5")
barplot
std <- function(v){
  v_bar <- mean(v)
  s <- sd(v)
  t_score <- (v - v_bar)/s
  return(t_score)
}

health1234 <- health1994
health1234[,c("AGE","EDUC","HHNINC","NEWHSAT")] <- health1234[,c("AGE","EDUC","HHNINC","NEWHSAT")] %>% apply(2,std) %>% as.data.frame()
health123 <- health1234 %>% dplyr::select(DOCTOR,AGE,EDUC,NEWHSAT,HHNINC)
df<-gather(health123,Doctor,num,AGE:HHNINC)
df$DOCTOR%<>%as.factor()
ggplot(data=df)+geom_boxplot(aes(x=Doctor,y=num,fill=DOCTOR))+ylim(-3,3)
## Warning: Removed 27 rows containing non-finite values (stat_boxplot).

train & test
index <- sample(2, nrow(health1994) ,replace = T ,prob = c(0.8,0.2))
health.train <- health1994[index == 1, ]
health.test <- health1994[index == 2, ]
model select
health.glm <- glm(DOCTOR ~ (AGE  +HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED + BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT + HHNINC) , health.train ,family = binomial(link = "logit"))
health.glm.2 <- glm(DOCTOR ~ (AGE  +HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED + BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT + HHNINC)^2 , health.train ,family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
anova(health.glm , health.glm.2 , test = "LR")
## Analysis of Deviance Table
## 
## Model 1: DOCTOR ~ (AGE + HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED + 
##     BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT + 
##     HHNINC)
## Model 2: DOCTOR ~ (AGE + HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED + 
##     BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT + 
##     HHNINC)^2
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      2726     3186.8                     
## 2      2637     3089.9 89   96.881   0.2664
summary(health.glm)
## 
## Call:
## glm(formula = DOCTOR ~ (AGE + HANDDUM + FEMALE + HHKIDS + EDUC + 
##     MARRIED + BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + 
##     NEWHSAT + HHNINC), family = binomial(link = "logit"), data = health.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5690  -1.1372   0.5971   0.9302   1.7498  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.302676   0.427057   5.392 6.97e-08 ***
## AGE          0.005344   0.004585   1.166  0.24380    
## HANDDUM      0.587508   0.181807   3.231  0.00123 ** 
## FEMALE       0.571515   0.093742   6.097 1.08e-09 ***
## HHKIDS      -0.324491   0.102899  -3.153  0.00161 ** 
## EDUC         0.001134   0.020392   0.056  0.95566    
## MARRIED      0.153606   0.115421   1.331  0.18324    
## BLUEC        0.042030   0.128722   0.327  0.74403    
## WHITEC       0.077275   0.115303   0.670  0.50274    
## SELF        -0.382236   0.201275  -1.899  0.05755 .  
## BEAMT       -0.122078   0.206824  -0.590  0.55502    
## PUBLIC      -0.051309   0.166523  -0.308  0.75799    
## ADDON       -0.067238   0.290165  -0.232  0.81675    
## NEWHSAT     -0.295713   0.023675 -12.491  < 2e-16 ***
## HHNINC      -0.207189   0.219744  -0.943  0.34575    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3536.3  on 2740  degrees of freedom
## Residual deviance: 3186.8  on 2726  degrees of freedom
## AIC: 3216.8
## 
## Number of Fisher Scoring iterations: 4
自己選
health.glm1 <- glm(DOCTOR ~ ( HANDDUM + FEMALE + HHKIDS + SELF + NEWHSAT) , health.train ,family = binomial(link = "logit"))
summary(health.glm1)
## 
## Call:
## glm(formula = DOCTOR ~ (HANDDUM + FEMALE + HHKIDS + SELF + NEWHSAT), 
##     family = binomial(link = "logit"), data = health.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5293  -1.1112   0.5947   0.8869   1.6953  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.56793    0.18496  13.883  < 2e-16 ***
## HANDDUM      0.63604    0.17770   3.579 0.000345 ***
## FEMALE       0.58920    0.08752   6.732 1.67e-11 ***
## HHKIDS      -0.30723    0.08724  -3.522 0.000429 ***
## SELF        -0.39792    0.18196  -2.187 0.028755 *  
## NEWHSAT     -0.30284    0.02342 -12.930  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3536.3  on 2740  degrees of freedom
## Residual deviance: 3193.7  on 2735  degrees of freedom
## AIC: 3205.7
## 
## Number of Fisher Scoring iterations: 4
anova(health.glm1 , test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: DOCTOR
## 
## Terms added sequentially (first to last)
## 
## 
##         Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                     2740     3536.3              
## HANDDUM  1   54.303      2739     3482.0 1.718e-13 ***
## FEMALE   1   73.791      2738     3408.2 < 2.2e-16 ***
## HHKIDS   1   16.923      2737     3391.3 3.893e-05 ***
## SELF     1    5.342      2736     3386.0   0.02082 *  
## NEWHSAT  1  192.234      2735     3193.7 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
backward
step(health.glm ,direction = "backward")
## Start:  AIC=3216.81
## DOCTOR ~ (AGE + HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED + 
##     BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT + 
##     HHNINC)
## 
##           Df Deviance    AIC
## - EDUC     1   3186.8 3214.8
## - ADDON    1   3186.9 3214.9
## - PUBLIC   1   3186.9 3214.9
## - BLUEC    1   3186.9 3214.9
## - BEAMT    1   3187.2 3215.2
## - WHITEC   1   3187.3 3215.3
## - HHNINC   1   3187.7 3215.7
## - AGE      1   3188.2 3216.2
## - MARRIED  1   3188.6 3216.6
## <none>         3186.8 3216.8
## - SELF     1   3190.4 3218.4
## - HHKIDS   1   3196.8 3224.8
## - HANDDUM  1   3198.0 3226.0
## - FEMALE   1   3224.4 3252.4
## - NEWHSAT  1   3365.3 3393.3
## 
## Step:  AIC=3214.81
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + BLUEC + 
##     WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT + HHNINC
## 
##           Df Deviance    AIC
## - ADDON    1   3186.9 3212.9
## - PUBLIC   1   3186.9 3212.9
## - BLUEC    1   3186.9 3212.9
## - BEAMT    1   3187.2 3213.2
## - WHITEC   1   3187.3 3213.3
## - HHNINC   1   3187.7 3213.7
## - AGE      1   3188.2 3214.2
## - MARRIED  1   3188.6 3214.6
## <none>         3186.8 3214.8
## - SELF     1   3190.4 3216.4
## - HHKIDS   1   3196.8 3222.8
## - HANDDUM  1   3198.0 3224.0
## - FEMALE   1   3225.1 3251.1
## - NEWHSAT  1   3365.3 3391.3
## 
## Step:  AIC=3212.86
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + BLUEC + 
##     WHITEC + SELF + BEAMT + PUBLIC + NEWHSAT + HHNINC
## 
##           Df Deviance    AIC
## - BLUEC    1   3187.0 3211.0
## - PUBLIC   1   3187.0 3211.0
## - BEAMT    1   3187.2 3211.2
## - WHITEC   1   3187.3 3211.3
## - HHNINC   1   3187.8 3211.8
## - AGE      1   3188.2 3212.2
## - MARRIED  1   3188.6 3212.6
## <none>         3186.9 3212.9
## - SELF     1   3190.5 3214.5
## - HHKIDS   1   3196.9 3220.9
## - HANDDUM  1   3198.1 3222.1
## - FEMALE   1   3225.2 3249.2
## - NEWHSAT  1   3365.5 3389.5
## 
## Step:  AIC=3210.97
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + WHITEC + 
##     SELF + BEAMT + PUBLIC + NEWHSAT + HHNINC
## 
##           Df Deviance    AIC
## - PUBLIC   1   3187.1 3209.1
## - WHITEC   1   3187.3 3209.3
## - BEAMT    1   3187.5 3209.5
## - HHNINC   1   3187.9 3209.9
## - AGE      1   3188.2 3210.2
## - MARRIED  1   3188.8 3210.8
## <none>         3187.0 3211.0
## - SELF     1   3191.4 3213.4
## - HHKIDS   1   3197.0 3219.0
## - HANDDUM  1   3198.1 3220.1
## - FEMALE   1   3227.5 3249.5
## - NEWHSAT  1   3365.5 3387.5
## 
## Step:  AIC=3209.06
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + WHITEC + 
##     SELF + BEAMT + NEWHSAT + HHNINC
## 
##           Df Deviance    AIC
## - WHITEC   1   3187.4 3207.4
## - BEAMT    1   3187.5 3207.5
## - HHNINC   1   3187.9 3207.9
## - AGE      1   3188.4 3208.4
## - MARRIED  1   3188.8 3208.8
## <none>         3187.1 3209.1
## - SELF     1   3191.4 3211.4
## - HHKIDS   1   3197.1 3217.1
## - HANDDUM  1   3198.2 3218.2
## - FEMALE   1   3227.6 3247.6
## - NEWHSAT  1   3365.6 3385.6
## 
## Step:  AIC=3207.38
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + SELF + BEAMT + 
##     NEWHSAT + HHNINC
## 
##           Df Deviance    AIC
## - BEAMT    1   3188.0 3206.0
## - HHNINC   1   3188.0 3206.0
## - AGE      1   3188.6 3206.6
## - MARRIED  1   3189.1 3207.1
## <none>         3187.4 3207.4
## - SELF     1   3192.5 3210.5
## - HHKIDS   1   3197.4 3215.4
## - HANDDUM  1   3198.4 3216.4
## - FEMALE   1   3228.2 3246.2
## - NEWHSAT  1   3365.7 3383.7
## 
## Step:  AIC=3206.04
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + SELF + NEWHSAT + 
##     HHNINC
## 
##           Df Deviance    AIC
## - HHNINC   1   3188.9 3204.9
## - AGE      1   3189.3 3205.3
## - MARRIED  1   3189.8 3205.8
## <none>         3188.0 3206.0
## - SELF     1   3192.8 3208.8
## - HHKIDS   1   3198.2 3214.2
## - HANDDUM  1   3199.1 3215.1
## - FEMALE   1   3231.0 3247.0
## - NEWHSAT  1   3366.8 3382.8
## 
## Step:  AIC=3204.9
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + SELF + NEWHSAT
## 
##           Df Deviance    AIC
## - AGE      1   3190.1 3204.1
## - MARRIED  1   3190.3 3204.3
## <none>         3188.9 3204.9
## - SELF     1   3194.0 3208.0
## - HHKIDS   1   3198.8 3212.8
## - HANDDUM  1   3200.3 3214.3
## - FEMALE   1   3232.2 3246.2
## - NEWHSAT  1   3371.3 3385.3
## 
## Step:  AIC=3204.13
## DOCTOR ~ HANDDUM + FEMALE + HHKIDS + MARRIED + SELF + NEWHSAT
## 
##           Df Deviance    AIC
## <none>         3190.1 3204.1
## - MARRIED  1   3193.7 3205.7
## - SELF     1   3195.0 3207.0
## - HANDDUM  1   3203.0 3215.0
## - HHKIDS   1   3205.8 3217.8
## - FEMALE   1   3235.1 3247.1
## - NEWHSAT  1   3379.4 3391.4
## 
## Call:  glm(formula = DOCTOR ~ HANDDUM + FEMALE + HHKIDS + MARRIED + 
##     SELF + NEWHSAT, family = binomial(link = "logit"), data = health.train)
## 
## Coefficients:
## (Intercept)      HANDDUM       FEMALE       HHKIDS      MARRIED  
##      2.4475       0.6137       0.5832      -0.3703       0.1904  
##        SELF      NEWHSAT  
##     -0.4018      -0.3005  
## 
## Degrees of Freedom: 2740 Total (i.e. Null);  2734 Residual
## Null Deviance:       3536 
## Residual Deviance: 3190  AIC: 3204
health.glm2 <- glm(DOCTOR ~ HANDDUM + FEMALE + HHKIDS + WHITEC + SELF + 
                     ADDON + NEWHSAT, family = binomial(link = "logit"), data = health.train)
summary(health.glm2)
## 
## Call:
## glm(formula = DOCTOR ~ HANDDUM + FEMALE + HHKIDS + WHITEC + SELF + 
##     ADDON + NEWHSAT, family = binomial(link = "logit"), data = health.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5252  -1.1206   0.5981   0.8915   1.6937  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.55796    0.18662  13.707  < 2e-16 ***
## HANDDUM      0.63916    0.17789   3.593 0.000327 ***
## FEMALE       0.58821    0.08766   6.710 1.94e-11 ***
## HHKIDS      -0.30638    0.08726  -3.511 0.000446 ***
## WHITEC       0.03865    0.09239   0.418 0.675665    
## SELF        -0.38188    0.18509  -2.063 0.039090 *  
## ADDON       -0.07853    0.28606  -0.275 0.783688    
## NEWHSAT     -0.30318    0.02345 -12.931  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3536.3  on 2740  degrees of freedom
## Residual deviance: 3193.5  on 2733  degrees of freedom
## AIC: 3209.5
## 
## Number of Fisher Scoring iterations: 4
anova(health.glm2,health.glm1 , test = "LR")
## Analysis of Deviance Table
## 
## Model 1: DOCTOR ~ HANDDUM + FEMALE + HHKIDS + WHITEC + SELF + ADDON + 
##     NEWHSAT
## Model 2: DOCTOR ~ (HANDDUM + FEMALE + HHKIDS + SELF + NEWHSAT)
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      2733     3193.5                     
## 2      2735     3193.7 -2 -0.23406   0.8896
train預測
glm.pre <- predict(health.glm2 ,newdata = health.train , type = "response")
glm.pre <- ifelse(glm.pre > 0.5, 1, 0)
tab.tr <- table(real.y = health.train$DOCTOR ,pred.y = glm.pre)
sum(diag(tab.tr))/sum(tab.tr)
## [1] 0.6851514
test預測
glm.pre1 <- predict(health.glm2 ,newdata = health.test , type = "response")
glm.pre1 <- ifelse(glm.pre1 > 0.5, 1, 0)
tab.te <- table(real.y = health.test$DOCTOR ,pred.y = glm.pre1)
sum(diag(tab.te))/sum(tab.te)
## [1] 0.7106918
悲劇roc
pred <- prediction(glm.pre1 , health.test$DOCTOR)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, "auc")
plot(perf, col = rainbow(5), main = "ROC curve", xlab = "1 - Specificity(FPR)", ylab = "Sensitivity(TPR)")
abline(0, 1)
text(0.5, 0.5,paste0("AUC= ",as.character(auc@y.values[[1]])))

library(ggplot2)
ggplot(data = health1994 , aes(x= as.factor(HANDDUM),fill= DOCTOR)) + geom_bar( position = "dodge") 

ggplot(data = health1994 , aes(x= as.factor(FEMALE),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(HHKIDS),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(MARRIED),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(BLUEC),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(WHITEC),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(SELF),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(BEAMT),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(PUBLIC),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(ADDON),fill= DOCTOR)) + geom_bar( position = "dodge" )