library(dplyr)
library(mdscore)
## Warning: package 'MASS' was built under R version 3.4.3
library(aod)
library(tidyr)
library(ggplot2)
library(magrittr)
library(quantreg)
library(ROCR)
setwd("~/Desktop/health~/")
health1994 <- read.csv("health1994.csv",fileEncoding = "big5")
barplot
std <- function(v){
v_bar <- mean(v)
s <- sd(v)
t_score <- (v - v_bar)/s
return(t_score)
}
health1234 <- health1994
health1234[,c("AGE","EDUC","HHNINC","NEWHSAT")] <- health1234[,c("AGE","EDUC","HHNINC","NEWHSAT")] %>% apply(2,std) %>% as.data.frame()
health123 <- health1234 %>% dplyr::select(DOCTOR,AGE,EDUC,NEWHSAT,HHNINC)
df<-gather(health123,Doctor,num,AGE:HHNINC)
df$DOCTOR%<>%as.factor()
ggplot(data=df)+geom_boxplot(aes(x=Doctor,y=num,fill=DOCTOR))+ylim(-3,3)
## Warning: Removed 27 rows containing non-finite values (stat_boxplot).

train & test
index <- sample(2, nrow(health1994) ,replace = T ,prob = c(0.8,0.2))
health.train <- health1994[index == 1, ]
health.test <- health1994[index == 2, ]
model select
health.glm <- glm(DOCTOR ~ (AGE +HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED + BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT + HHNINC) , health.train ,family = binomial(link = "logit"))
health.glm.2 <- glm(DOCTOR ~ (AGE +HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED + BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT + HHNINC)^2 , health.train ,family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
anova(health.glm , health.glm.2 , test = "LR")
## Analysis of Deviance Table
##
## Model 1: DOCTOR ~ (AGE + HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED +
## BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT +
## HHNINC)
## Model 2: DOCTOR ~ (AGE + HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED +
## BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT +
## HHNINC)^2
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2726 3186.8
## 2 2637 3089.9 89 96.881 0.2664
summary(health.glm)
##
## Call:
## glm(formula = DOCTOR ~ (AGE + HANDDUM + FEMALE + HHKIDS + EDUC +
## MARRIED + BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON +
## NEWHSAT + HHNINC), family = binomial(link = "logit"), data = health.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5690 -1.1372 0.5971 0.9302 1.7498
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.302676 0.427057 5.392 6.97e-08 ***
## AGE 0.005344 0.004585 1.166 0.24380
## HANDDUM 0.587508 0.181807 3.231 0.00123 **
## FEMALE 0.571515 0.093742 6.097 1.08e-09 ***
## HHKIDS -0.324491 0.102899 -3.153 0.00161 **
## EDUC 0.001134 0.020392 0.056 0.95566
## MARRIED 0.153606 0.115421 1.331 0.18324
## BLUEC 0.042030 0.128722 0.327 0.74403
## WHITEC 0.077275 0.115303 0.670 0.50274
## SELF -0.382236 0.201275 -1.899 0.05755 .
## BEAMT -0.122078 0.206824 -0.590 0.55502
## PUBLIC -0.051309 0.166523 -0.308 0.75799
## ADDON -0.067238 0.290165 -0.232 0.81675
## NEWHSAT -0.295713 0.023675 -12.491 < 2e-16 ***
## HHNINC -0.207189 0.219744 -0.943 0.34575
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3536.3 on 2740 degrees of freedom
## Residual deviance: 3186.8 on 2726 degrees of freedom
## AIC: 3216.8
##
## Number of Fisher Scoring iterations: 4
自己選
health.glm1 <- glm(DOCTOR ~ ( HANDDUM + FEMALE + HHKIDS + SELF + NEWHSAT) , health.train ,family = binomial(link = "logit"))
summary(health.glm1)
##
## Call:
## glm(formula = DOCTOR ~ (HANDDUM + FEMALE + HHKIDS + SELF + NEWHSAT),
## family = binomial(link = "logit"), data = health.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5293 -1.1112 0.5947 0.8869 1.6953
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.56793 0.18496 13.883 < 2e-16 ***
## HANDDUM 0.63604 0.17770 3.579 0.000345 ***
## FEMALE 0.58920 0.08752 6.732 1.67e-11 ***
## HHKIDS -0.30723 0.08724 -3.522 0.000429 ***
## SELF -0.39792 0.18196 -2.187 0.028755 *
## NEWHSAT -0.30284 0.02342 -12.930 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3536.3 on 2740 degrees of freedom
## Residual deviance: 3193.7 on 2735 degrees of freedom
## AIC: 3205.7
##
## Number of Fisher Scoring iterations: 4
anova(health.glm1 , test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: DOCTOR
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 2740 3536.3
## HANDDUM 1 54.303 2739 3482.0 1.718e-13 ***
## FEMALE 1 73.791 2738 3408.2 < 2.2e-16 ***
## HHKIDS 1 16.923 2737 3391.3 3.893e-05 ***
## SELF 1 5.342 2736 3386.0 0.02082 *
## NEWHSAT 1 192.234 2735 3193.7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
backward
step(health.glm ,direction = "backward")
## Start: AIC=3216.81
## DOCTOR ~ (AGE + HANDDUM + FEMALE + HHKIDS + EDUC + MARRIED +
## BLUEC + WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT +
## HHNINC)
##
## Df Deviance AIC
## - EDUC 1 3186.8 3214.8
## - ADDON 1 3186.9 3214.9
## - PUBLIC 1 3186.9 3214.9
## - BLUEC 1 3186.9 3214.9
## - BEAMT 1 3187.2 3215.2
## - WHITEC 1 3187.3 3215.3
## - HHNINC 1 3187.7 3215.7
## - AGE 1 3188.2 3216.2
## - MARRIED 1 3188.6 3216.6
## <none> 3186.8 3216.8
## - SELF 1 3190.4 3218.4
## - HHKIDS 1 3196.8 3224.8
## - HANDDUM 1 3198.0 3226.0
## - FEMALE 1 3224.4 3252.4
## - NEWHSAT 1 3365.3 3393.3
##
## Step: AIC=3214.81
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + BLUEC +
## WHITEC + SELF + BEAMT + PUBLIC + ADDON + NEWHSAT + HHNINC
##
## Df Deviance AIC
## - ADDON 1 3186.9 3212.9
## - PUBLIC 1 3186.9 3212.9
## - BLUEC 1 3186.9 3212.9
## - BEAMT 1 3187.2 3213.2
## - WHITEC 1 3187.3 3213.3
## - HHNINC 1 3187.7 3213.7
## - AGE 1 3188.2 3214.2
## - MARRIED 1 3188.6 3214.6
## <none> 3186.8 3214.8
## - SELF 1 3190.4 3216.4
## - HHKIDS 1 3196.8 3222.8
## - HANDDUM 1 3198.0 3224.0
## - FEMALE 1 3225.1 3251.1
## - NEWHSAT 1 3365.3 3391.3
##
## Step: AIC=3212.86
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + BLUEC +
## WHITEC + SELF + BEAMT + PUBLIC + NEWHSAT + HHNINC
##
## Df Deviance AIC
## - BLUEC 1 3187.0 3211.0
## - PUBLIC 1 3187.0 3211.0
## - BEAMT 1 3187.2 3211.2
## - WHITEC 1 3187.3 3211.3
## - HHNINC 1 3187.8 3211.8
## - AGE 1 3188.2 3212.2
## - MARRIED 1 3188.6 3212.6
## <none> 3186.9 3212.9
## - SELF 1 3190.5 3214.5
## - HHKIDS 1 3196.9 3220.9
## - HANDDUM 1 3198.1 3222.1
## - FEMALE 1 3225.2 3249.2
## - NEWHSAT 1 3365.5 3389.5
##
## Step: AIC=3210.97
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + WHITEC +
## SELF + BEAMT + PUBLIC + NEWHSAT + HHNINC
##
## Df Deviance AIC
## - PUBLIC 1 3187.1 3209.1
## - WHITEC 1 3187.3 3209.3
## - BEAMT 1 3187.5 3209.5
## - HHNINC 1 3187.9 3209.9
## - AGE 1 3188.2 3210.2
## - MARRIED 1 3188.8 3210.8
## <none> 3187.0 3211.0
## - SELF 1 3191.4 3213.4
## - HHKIDS 1 3197.0 3219.0
## - HANDDUM 1 3198.1 3220.1
## - FEMALE 1 3227.5 3249.5
## - NEWHSAT 1 3365.5 3387.5
##
## Step: AIC=3209.06
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + WHITEC +
## SELF + BEAMT + NEWHSAT + HHNINC
##
## Df Deviance AIC
## - WHITEC 1 3187.4 3207.4
## - BEAMT 1 3187.5 3207.5
## - HHNINC 1 3187.9 3207.9
## - AGE 1 3188.4 3208.4
## - MARRIED 1 3188.8 3208.8
## <none> 3187.1 3209.1
## - SELF 1 3191.4 3211.4
## - HHKIDS 1 3197.1 3217.1
## - HANDDUM 1 3198.2 3218.2
## - FEMALE 1 3227.6 3247.6
## - NEWHSAT 1 3365.6 3385.6
##
## Step: AIC=3207.38
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + SELF + BEAMT +
## NEWHSAT + HHNINC
##
## Df Deviance AIC
## - BEAMT 1 3188.0 3206.0
## - HHNINC 1 3188.0 3206.0
## - AGE 1 3188.6 3206.6
## - MARRIED 1 3189.1 3207.1
## <none> 3187.4 3207.4
## - SELF 1 3192.5 3210.5
## - HHKIDS 1 3197.4 3215.4
## - HANDDUM 1 3198.4 3216.4
## - FEMALE 1 3228.2 3246.2
## - NEWHSAT 1 3365.7 3383.7
##
## Step: AIC=3206.04
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + SELF + NEWHSAT +
## HHNINC
##
## Df Deviance AIC
## - HHNINC 1 3188.9 3204.9
## - AGE 1 3189.3 3205.3
## - MARRIED 1 3189.8 3205.8
## <none> 3188.0 3206.0
## - SELF 1 3192.8 3208.8
## - HHKIDS 1 3198.2 3214.2
## - HANDDUM 1 3199.1 3215.1
## - FEMALE 1 3231.0 3247.0
## - NEWHSAT 1 3366.8 3382.8
##
## Step: AIC=3204.9
## DOCTOR ~ AGE + HANDDUM + FEMALE + HHKIDS + MARRIED + SELF + NEWHSAT
##
## Df Deviance AIC
## - AGE 1 3190.1 3204.1
## - MARRIED 1 3190.3 3204.3
## <none> 3188.9 3204.9
## - SELF 1 3194.0 3208.0
## - HHKIDS 1 3198.8 3212.8
## - HANDDUM 1 3200.3 3214.3
## - FEMALE 1 3232.2 3246.2
## - NEWHSAT 1 3371.3 3385.3
##
## Step: AIC=3204.13
## DOCTOR ~ HANDDUM + FEMALE + HHKIDS + MARRIED + SELF + NEWHSAT
##
## Df Deviance AIC
## <none> 3190.1 3204.1
## - MARRIED 1 3193.7 3205.7
## - SELF 1 3195.0 3207.0
## - HANDDUM 1 3203.0 3215.0
## - HHKIDS 1 3205.8 3217.8
## - FEMALE 1 3235.1 3247.1
## - NEWHSAT 1 3379.4 3391.4
##
## Call: glm(formula = DOCTOR ~ HANDDUM + FEMALE + HHKIDS + MARRIED +
## SELF + NEWHSAT, family = binomial(link = "logit"), data = health.train)
##
## Coefficients:
## (Intercept) HANDDUM FEMALE HHKIDS MARRIED
## 2.4475 0.6137 0.5832 -0.3703 0.1904
## SELF NEWHSAT
## -0.4018 -0.3005
##
## Degrees of Freedom: 2740 Total (i.e. Null); 2734 Residual
## Null Deviance: 3536
## Residual Deviance: 3190 AIC: 3204
health.glm2 <- glm(DOCTOR ~ HANDDUM + FEMALE + HHKIDS + WHITEC + SELF +
ADDON + NEWHSAT, family = binomial(link = "logit"), data = health.train)
summary(health.glm2)
##
## Call:
## glm(formula = DOCTOR ~ HANDDUM + FEMALE + HHKIDS + WHITEC + SELF +
## ADDON + NEWHSAT, family = binomial(link = "logit"), data = health.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5252 -1.1206 0.5981 0.8915 1.6937
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.55796 0.18662 13.707 < 2e-16 ***
## HANDDUM 0.63916 0.17789 3.593 0.000327 ***
## FEMALE 0.58821 0.08766 6.710 1.94e-11 ***
## HHKIDS -0.30638 0.08726 -3.511 0.000446 ***
## WHITEC 0.03865 0.09239 0.418 0.675665
## SELF -0.38188 0.18509 -2.063 0.039090 *
## ADDON -0.07853 0.28606 -0.275 0.783688
## NEWHSAT -0.30318 0.02345 -12.931 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3536.3 on 2740 degrees of freedom
## Residual deviance: 3193.5 on 2733 degrees of freedom
## AIC: 3209.5
##
## Number of Fisher Scoring iterations: 4
anova(health.glm2,health.glm1 , test = "LR")
## Analysis of Deviance Table
##
## Model 1: DOCTOR ~ HANDDUM + FEMALE + HHKIDS + WHITEC + SELF + ADDON +
## NEWHSAT
## Model 2: DOCTOR ~ (HANDDUM + FEMALE + HHKIDS + SELF + NEWHSAT)
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 2733 3193.5
## 2 2735 3193.7 -2 -0.23406 0.8896
train預測
glm.pre <- predict(health.glm2 ,newdata = health.train , type = "response")
glm.pre <- ifelse(glm.pre > 0.5, 1, 0)
tab.tr <- table(real.y = health.train$DOCTOR ,pred.y = glm.pre)
sum(diag(tab.tr))/sum(tab.tr)
## [1] 0.6851514
test預測
glm.pre1 <- predict(health.glm2 ,newdata = health.test , type = "response")
glm.pre1 <- ifelse(glm.pre1 > 0.5, 1, 0)
tab.te <- table(real.y = health.test$DOCTOR ,pred.y = glm.pre1)
sum(diag(tab.te))/sum(tab.te)
## [1] 0.7106918
悲劇roc
pred <- prediction(glm.pre1 , health.test$DOCTOR)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, "auc")
plot(perf, col = rainbow(5), main = "ROC curve", xlab = "1 - Specificity(FPR)", ylab = "Sensitivity(TPR)")
abline(0, 1)
text(0.5, 0.5,paste0("AUC= ",as.character(auc@y.values[[1]])))

library(ggplot2)
ggplot(data = health1994 , aes(x= as.factor(HANDDUM),fill= DOCTOR)) + geom_bar( position = "dodge")

ggplot(data = health1994 , aes(x= as.factor(FEMALE),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(HHKIDS),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(MARRIED),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(BLUEC),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(WHITEC),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(SELF),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(BEAMT),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(PUBLIC),fill= DOCTOR)) + geom_bar( position = "dodge" )

ggplot(data = health1994 , aes(x= as.factor(ADDON),fill= DOCTOR)) + geom_bar( position = "dodge" )
