Exercise 1

1.1

library(car)

## 载入需要的程序包：carData

library(lmtest)

## 载入需要的程序包：zoo

## 
## 载入程序包：'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## 载入程序包：'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Fit a model 
ESS8UK<-readRDS("D:/LSE/ST411/seminar4/ESS8UK.rds")
ESS8UK$basicincome<-as.factor(ESS8UK$basicincome)
summary(cl4.m1 <- glm(basicincome~age+male+educ+leftright,data=ESS8UK,family=binomial))

## 
## Call:
## glm(formula = basicincome ~ age + male + educ + leftright, family = binomial, 
##     data = ESS8UK)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.37709    0.24007   5.736 9.68e-09 ***
## age                 -0.01359    0.00290  -4.688 2.76e-06 ***
## male                 0.28300    0.10144   2.790  0.00527 ** 
## educupper secondary -0.18591    0.12411  -1.498  0.13415    
## eductertiary        -0.18258    0.13352  -1.367  0.17147    
## leftright           -0.11238    0.02738  -4.104 4.06e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2255.4  on 1626  degrees of freedom
## Residual deviance: 2202.3  on 1621  degrees of freedom
## AIC: 2214.3
## 
## Number of Fisher Scoring iterations: 4

# Examples of Wald tests of hypotheses about the coefficients
# Example hypothesis A
linearHypothesis(cl4.m1,"age") # This is also included in standard output

## 
## Linear hypothesis test:
## age = 0
## 
## Model 1: restricted model
## Model 2: basicincome ~ age + male + educ + leftright
## 
##   Res.Df Df  Chisq Pr(>Chisq)    
## 1   1622                         
## 2   1621  1 21.979  2.756e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Example hypothesis B
linearHypothesis(cl4.m1,c("educupper secondary","eductertiary"))

## 
## Linear hypothesis test:
## educupper secondary = 0
## eductertiary = 0
## 
## Model 1: restricted model
## Model 2: basicincome ~ age + male + educ + leftright
## 
##   Res.Df Df  Chisq Pr(>Chisq)
## 1   1623                     
## 2   1621  2 2.6645     0.2639

# Example hypothesis C
linearHypothesis(cl4.m1,"educupper secondary=eductertiary")

## 
## Linear hypothesis test:
## educupper secondary - eductertiary = 0
## 
## Model 1: restricted model
## Model 2: basicincome ~ age + male + educ + leftright
## 
##   Res.Df Df Chisq Pr(>Chisq)
## 1   1622                    
## 2   1621  1 7e-04     0.9783

b.tmp <- coef(cl4.m1)
names(b.tmp)

## [1] "(Intercept)"         "age"                 "male"               
## [4] "educupper secondary" "eductertiary"        "leftright"

V.tmp <- summary(cl4.m1)$cov.scaled
R.tmp <- matrix(0,2,length(b.tmp))
R.tmp[1,4] <- R.tmp[2,5] <- 1
R.tmp

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    0    0    0    1    0    0
## [2,]    0    0    0    0    1    0

c.tmp <- R.tmp%*%b.tmp
W.tmp <- t(c.tmp)%*%solve(R.tmp%*%V.tmp%*%t(R.tmp))%*%c.tmp
df.tmp <- nrow(R.tmp)
p.tmp <- 1-pchisq(W.tmp,df.tmp)

c(W.tmp, df.tmp, p.tmp)

## [1] 2.6645479 2.0000000 0.2638765

# Compare this with 
linearHypothesis(cl4.m1,c("educupper secondary","eductertiary"))

## 
## Linear hypothesis test:
## educupper secondary = 0
## eductertiary = 0
## 
## Model 1: restricted model
## Model 2: basicincome ~ age + male + educ + leftright
## 
##   Res.Df Df  Chisq Pr(>Chisq)
## 1   1623                     
## 2   1621  2 2.6645     0.2639

## End of digression

# Confidence intervals for coefficients and their exponentials (odds ratios)
confint(cl4.m1)

## Waiting for profiling to be done...

##                           2.5 %       97.5 %
## (Intercept)          0.90949200  1.851083568
## age                 -0.01930399 -0.007932276
## male                 0.08443900  0.482171916
## educupper secondary -0.42969562  0.056985538
## eductertiary        -0.44481870  0.078752796
## leftright           -0.16642330 -0.059006658

exp(confint(cl4.m1))

## Waiting for profiling to be done...

##                         2.5 %    97.5 %
## (Intercept)         2.4830608 6.3667146
## age                 0.9808811 0.9920991
## male                1.0881065 1.6195882
## educupper secondary 0.6507071 1.0586405
## eductertiary        0.6409405 1.0819368
## leftright           0.8466878 0.9427005

# Examples of likelihood ratio tests, for the same hypotheses as above
# Hypothesis A
lrtest(cl4.m1,"age")

## Likelihood ratio test
## 
## Model 1: basicincome ~ age + male + educ + leftright
## Model 2: basicincome ~ male + educ + leftright
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   6 -1101.1                         
## 2   5 -1112.3 -1 22.312  2.318e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Calculate this explicitly, for comparison
summary(cl4.m2 <- glm(basicincome~male+educ+leftright,data=ESS8UK,family=binomial))

## 
## Call:
## glm(formula = basicincome ~ male + educ + leftright, family = binomial, 
##     data = ESS8UK)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.71157    0.19109   3.724 0.000196 ***
## male                 0.27944    0.10074   2.774 0.005541 ** 
## educupper secondary -0.07193    0.12054  -0.597 0.550694    
## eductertiary        -0.06097    0.12995  -0.469 0.638955    
## leftright           -0.13137    0.02703  -4.860 1.18e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2255.4  on 1626  degrees of freedom
## Residual deviance: 2224.6  on 1622  degrees of freedom
## AIC: 2234.6
## 
## Number of Fisher Scoring iterations: 4

logLik(cl4.m1)

## 'log Lik.' -1101.144 (df=6)

logLik(cl4.m2)

## 'log Lik.' -1112.3 (df=5)

g.tmp <- 2*(logLik(cl4.m1)-logLik(cl4.m2))
g.tmp

## 'log Lik.' 22.31171 (df=6)

1-pchisq(g.tmp,cl4.m2$df.residual-cl4.m1$df.residual)

## 'log Lik.' 2.317888e-06 (df=6)

# Hypothesis B
lrtest(cl4.m1,"educ")

## Likelihood ratio test
## 
## Model 1: basicincome ~ age + male + educ + leftright
## Model 2: basicincome ~ age + male + leftright
##   #Df  LogLik Df Chisq Pr(>Chisq)
## 1   6 -1101.1                    
## 2   4 -1102.5 -2 2.673     0.2628

# Hypothesis C
levels(ESS8UK$educ)

## [1] "lower secondary or less" "upper secondary"        
## [3] "tertiary"

ESS8UK$educ2 <- ESS8UK$educ
levels(ESS8UK$educ2) <- c("lower","higher","higher") # Combine two levels of the variable
summary(cl4.m4 <- glm(basicincome~age+male+educ2+leftright,data=ESS8UK,family=binomial))

## 
## Call:
## glm(formula = basicincome ~ age + male + educ2 + leftright, family = binomial, 
##     data = ESS8UK)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.377499   0.239608   5.749 8.98e-09 ***
## age         -0.013595   0.002899  -4.689 2.75e-06 ***
## male         0.282985   0.101439   2.790  0.00528 ** 
## educ2higher -0.184517   0.113054  -1.632  0.10266    
## leftright   -0.112432   0.027307  -4.117 3.83e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2255.4  on 1626  degrees of freedom
## Residual deviance: 2202.3  on 1622  degrees of freedom
## AIC: 2212.3
## 
## Number of Fisher Scoring iterations: 4

lrtest(cl4.m1,cl4.m4)

## Likelihood ratio test
## 
## Model 1: basicincome ~ age + male + educ + leftright
## Model 2: basicincome ~ age + male + educ2 + leftright
##   #Df  LogLik Df Chisq Pr(>Chisq)
## 1   6 -1101.1                    
## 2   5 -1101.1 -1 7e-04     0.9783

1.2

# with Probit link function
cl4.glm.probit <- glm(basicincome~age+male+educ+leftright,data=ESS8UK,family=binomial(link="probit"))

# with cloglog link function
cl4.glm.cloglog <- glm(basicincome~age+male+educ+leftright,data=ESS8UK,family=binomial(link="cloglog"))

# ratios of logit to probit
ratio<-coef(cl4.m1)/coef(cl4.glm.probit)
ratio

##         (Intercept)                 age                male educupper secondary 
##            1.602138            1.602004            1.590388            1.584538 
##        eductertiary           leftright 
##            1.593776            1.604209

1.3

#####################################
binary.classification.table <- function (model.object,cutoff=0.5) 
{
# ST411
# Function for calculating the classification table
# and the various probabilities for it for a fitted binary model,
# given a value of the classification cutoff.
#

# Classification table (frequencies)
    cl.table <- table(fitted(model.object)>cutoff,model.object$y)
## Fix the table if all classifications are 0 or 1
    if(nrow(cl.table)==1){
        if(row.names(cl.table)=="FALSE")
            cl.table <- rbind(cl.table,0)
        else cl.table <- rbind(0,cl.table)
    }
    t2x2 <- cl.table
    cl.table <- rbind(cl.table,margin.table(cl.table,2))
    cl.table <- cbind(cl.table,margin.table(cl.table,1))
    dimnames(cl.table) <- list(c("Classified 0","Classified 1","Total"),
        c("Observed 0","Observed 1","Total"))

# The probabilities
    probs <- c(diag(prop.table(t2x2,2)),
        diag(prop.table(t2x2,1)))
    probs <- c(probs,1-probs)
    probs <- c(as.vector(prop.table(table(model.object$y))),probs)
## This turns it into a column matrix, for convenience of display
    dim(probs) <- c(length(probs),1) 
    dimnames(probs) <- list(c("Observed 0s","Observed 1s",
        "Specificity","Sensitivity",
        "Negative predictive value","Positive predictive value",
        "False positive rate for true negatives",   
        "False negative rate for true positives",   
        "False positive rate for classified negatives", 
        "False negative rate for classified positives"),"probability")

    result <- list(classification.table=cl.table, rates=round(probs,4))
    result 
}
#####################################

summary(cl4.m5 <- glm(basicincome~age+male+leftright,data=ESS8UK,family=binomial))

## 
## Call:
## glm(formula = basicincome ~ age + male + leftright, family = binomial, 
##     data = ESS8UK)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.189646   0.209251   5.685 1.31e-08 ***
## age         -0.012551   0.002822  -4.448 8.68e-06 ***
## male         0.282903   0.101351   2.791  0.00525 ** 
## leftright   -0.111553   0.027288  -4.088 4.35e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2255.4  on 1626  degrees of freedom
## Residual deviance: 2205.0  on 1623  degrees of freedom
## AIC: 2213
## 
## Number of Fisher Scoring iterations: 4

# Classification table and error rates, for different cut-offs
binary.classification.table(cl4.m5,.5)

## $classification.table
##              Observed 0 Observed 1 Total
## Classified 0        435        328   763
## Classified 1        373        491   864
## Total               808        819  1627
## 
## $rates
##                                              probability
## Observed 0s                                       0.4966
## Observed 1s                                       0.5034
## Specificity                                       0.5384
## Sensitivity                                       0.5995
## Negative predictive value                         0.5701
## Positive predictive value                         0.5683
## False positive rate for true negatives            0.4616
## False negative rate for true positives            0.4005
## False positive rate for classified negatives      0.4299
## False negative rate for classified positives      0.4317

binary.classification.table(cl4.m5,.3)

## $classification.table
##              Observed 0 Observed 1 Total
## Classified 0         10          6    16
## Classified 1        798        813  1611
## Total               808        819  1627
## 
## $rates
##                                              probability
## Observed 0s                                       0.4966
## Observed 1s                                       0.5034
## Specificity                                       0.0124
## Sensitivity                                       0.9927
## Negative predictive value                         0.6250
## Positive predictive value                         0.5047
## False positive rate for true negatives            0.9876
## False negative rate for true positives            0.0073
## False positive rate for classified negatives      0.3750
## False negative rate for classified positives      0.4953

binary.classification.table(cl4.m5,.8)

## $classification.table
##              Observed 0 Observed 1 Total
## Classified 0        808        819  1627
## Classified 1          0          0     0
## Total               808        819  1627
## 
## $rates
##                                              probability
## Observed 0s                                       0.4966
## Observed 1s                                       0.5034
## Specificity                                       1.0000
## Sensitivity                                       0.0000
## Negative predictive value                         0.4966
## Positive predictive value                            NaN
## False positive rate for true negatives            0.0000
## False negative rate for true positives            1.0000
## False positive rate for classified negatives      0.5034
## False negative rate for classified positives         NaN

# ROC curve and Area Under the Curve
roc(cl4.m5$y,fitted(cl4.m5),plot=T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## 
## Call:
## roc.default(response = cl4.m5$y, predictor = fitted(cl4.m5),     plot = T)
## 
## Data: fitted(cl4.m5) in 808 controls (cl4.m5$y 0) < 819 cases (cl4.m5$y 1).
## Area under the curve: 0.5979

Exercise 2

2.1

setwd("D:/LSE/ST411/seminar5")
BngDHS<-readRDS("BngDHS.rds")
BngDHS$educ<-as.factor(BngDHS$educ)
BngDHS$urban<-as.factor(BngDHS$urban)
model1<-glm(usemod~urban+educ+age,data=BngDHS,family=binomial)
# hypothesis A
lrtest(model1,"urban")

## Likelihood ratio test
## 
## Model 1: usemod ~ urban + educ + age
## Model 2: usemod ~ educ + age
##   #Df  LogLik Df  Chisq Pr(>Chisq)   
## 1   5 -3412.8                        
## 2   4 -3417.7 -1 9.8086   0.001737 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# hypothesis B
lrtest(model1,"educ")

## Likelihood ratio test
## 
## Model 1: usemod ~ urban + educ + age
## Model 2: usemod ~ urban + age
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   5 -3412.8                         
## 2   3 -3433.7 -2 41.902  7.962e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# hypothesis C
lrtest(model1,"age")

## Likelihood ratio test
## 
## Model 1: usemod ~ urban + educ + age
## Model 2: usemod ~ urban + educ
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   5 -3412.8                         
## 2   4 -3426.1 -1 26.784  2.276e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

All of the three variables are significant in the model, at the 5% level.

model2<-glm(usemod~urban+educ+age+urban*educ,data=BngDHS,family=binomial)
summary(model2)

## 
## Call:
## glm(formula = usemod ~ urban + educ + age + urban * educ, family = binomial, 
##     data = BngDHS)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.22733    0.10892 -11.268  < 2e-16 ***
## urbanurban                -0.02773    0.12691  -0.218  0.82705    
## educprimary                0.17643    0.07176   2.459  0.01395 *  
## educsecondary+             0.39106    0.09998   3.911 9.18e-05 ***
## age                        0.01674    0.00331   5.058 4.23e-07 ***
## urbanurban:educprimary     0.42257    0.20713   2.040  0.04134 *  
## urbanurban:educsecondary+  0.57022    0.19430   2.935  0.00334 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6909.9  on 5295  degrees of freedom
## Residual deviance: 6815.9  on 5289  degrees of freedom
## AIC: 6829.9
## 
## Number of Fisher Scoring iterations: 4

lrtest(model1,model2)

## Likelihood ratio test
## 
## Model 1: usemod ~ urban + educ + age
## Model 2: usemod ~ urban + educ + age + urban * educ
##   #Df  LogLik Df  Chisq Pr(>Chisq)   
## 1   5 -3412.8                        
## 2   7 -3407.9  2 9.6448   0.008048 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

It is significant at the 5% level.

#For living in a rural area + primary educ:
d.tmp <- BngDHS
d.tmp$age <- 25
d.tmp$urban <- "rural"
d.tmp$educ <- "primary"
mean(predict(model1, d.tmp, type = "response"))

## [1] 0.3506452

#For living in a rural area + secondary educ:
d.tmp <- BngDHS
d.tmp$age <- 25
d.tmp$urban <- "rural"
d.tmp$educ <- "secondary+"
mean(predict(model1, d.tmp, type = "response"))

## [1] 0.4226516

#For living in a rural area + none educ:
d.tmp <- BngDHS
d.tmp$age <- 25
d.tmp$urban <- "rural"
d.tmp$educ <- "none"
mean(predict(model1, d.tmp, type = "response"))

## [1] 0.3011179

#For living in a urban area + primary educ:
d.tmp <- BngDHS
d.tmp$age <- 25
d.tmp$urban <- "urban"
d.tmp$educ <- "primary"
mean(predict(model1, d.tmp, type = "response"))

## [1] 0.4112324

#For living in a urban area + secondary educ:
d.tmp <- BngDHS
d.tmp$age <- 25
d.tmp$urban <- "urban"
d.tmp$educ <- "secondary+"
mean(predict(model1, d.tmp, type = "response"))

## [1] 0.4863617

#For living in a urban area + none educ:
d.tmp <- BngDHS
d.tmp$age <- 25
d.tmp$urban <- "urban"
d.tmp$educ <- "none"
mean(predict(model1, d.tmp, type = "response"))

## [1] 0.3578636

2.2

1. Consider the likelihood ratio test of the association between education and use of modern 
contraception in Exercise 2. 
a. State the null hypothesis of this test. 
b. Show, with numbers, how the value of the test statistic was calculated. 
c. What do you conclude from the test here, and why?

lrtest(model1,"educ")

## Likelihood ratio test
## 
## Model 1: usemod ~ urban + educ + age
## Model 2: usemod ~ urban + age
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   5 -3412.8                         
## 2   3 -3433.7 -2 41.902  7.962e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

null hypothesis: Levels of education is not significant in the model, i.e. \(\hat{\beta}_{educ} = 0\).
test statistic \(G^2 = 2*(-3412.8-(-3433.7)) = 41.902\).
Levels of education is significant in the model at the 5% significance, for the p value < 0.05.

2. From the coefficients of the model estimated in Exercise 3, calculate the following estimated odds ratios of using modern contraception, controlling for age: 
a. the odds ratio comparing urban to rural women, separately at each level of education. 
b. the odds ratio comparing each of the two other education levels to no education, 
separately for rural and urban women.

summary(model2)

## 
## Call:
## glm(formula = usemod ~ urban + educ + age + urban * educ, family = binomial, 
##     data = BngDHS)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.22733    0.10892 -11.268  < 2e-16 ***
## urbanurban                -0.02773    0.12691  -0.218  0.82705    
## educprimary                0.17643    0.07176   2.459  0.01395 *  
## educsecondary+             0.39106    0.09998   3.911 9.18e-05 ***
## age                        0.01674    0.00331   5.058 4.23e-07 ***
## urbanurban:educprimary     0.42257    0.20713   2.040  0.04134 *  
## urbanurban:educsecondary+  0.57022    0.19430   2.935  0.00334 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6909.9  on 5295  degrees of freedom
## Residual deviance: 6815.9  on 5289  degrees of freedom
## AIC: 6829.9
## 
## Number of Fisher Scoring iterations: 4

for educ=none, the odds ratio = \(e^{-0.02773}= 0.9726509\); for educ=primary, the odds ratio = \(e^{-0.02773+0.17643+0.42257}= 1.770514\); for educ=secondary+, the odds ratio = \(e^{-0.02773+0.39106+0.57022}= 2.543523\).
for rural women, the odds ratio of educ=primary to educ=none equals \(e^{0.17643}= 1.192951\), the odds ratio of educ=secondary+ to educ=none equals \(e^{0.39106}= 1.478547\), the odds ratio of educ=secondary+ to educ=primary equals \(e^{0.39106-0.17643}= 1.239403\); for urban women, the odds ratio of educ=primary to educ=none equals \(e^{0.17643+0.42257}= 1.820298\), the odds ratio of educ=secondary+ to educ=none equals \(e^{0.39106+0.57022}= 2.615042\), the odds ratio of educ=secondary+ to educ=primary equals \(e^{0.39106+0.57022-(0.17643+0.42257)}= 1.436601\).

3. Drawing on these odds ratios, and the fitted probabilities in Exercise 4, briefly describe, in your own words, the nature of the interaction between education and location in the model of Exercise 3.

Education and location are correlated when controlling for other variables, suggesting that these two variables will have an additional effect on the probability of use of modern contraception when considered as a combination rather than just included as the main effect. According to the calculation, women with higher education level and living in urban area, are more likely to use modern contraception.

Calculating classification tables, error rates, ROC curve and Area Under the Curve when the model is used for classification. Comment on the accuracy of the classification.

binary.classification.table <- function (model.object,cutoff=0.5) 
{
# Function for calculating the classification table
# and the various probabilities for it for a fitted binary model,
# given a value of the classification cutoff.

# Classification table (frequencies)
    cl.table <- table(fitted(model.object)>cutoff,model.object$y)
## Fix the table if all classifications are 0 or 1
    if(nrow(cl.table)==1){
        if(row.names(cl.table)=="FALSE")
            cl.table <- rbind(cl.table,0)
        else cl.table <- rbind(0,cl.table)
    }
    t2x2 <- cl.table
    cl.table <- rbind(cl.table,margin.table(cl.table,2))
    cl.table <- cbind(cl.table,margin.table(cl.table,1))
    dimnames(cl.table) <- list(c("Classified 0","Classified 1","Total"),
        c("Observed 0","Observed 1","Total"))

# The probabilities
    probs <- c(diag(prop.table(t2x2,2)),
        diag(prop.table(t2x2,1)))
    probs <- c(probs,1-probs)
    probs <- c(as.vector(prop.table(table(model.object$y))),probs)
## This turns it into a column matrix, for convenience of display
    dim(probs) <- c(length(probs),1) 
    dimnames(probs) <- list(c("Observed 0s","Observed 1s",
        "Specificity","Sensitivity",
        "Negative predictive value","Positive predictive value",
        "False positive rate for true negatives",   
        "False negative rate for true positives",   
        "False positive rate for classified negatives", 
        "False negative rate for classified positives"),"probability")

    result <- list(classification.table=cl.table, rates=round(probs,4))
    result 
}

# Classification table and error rates, for different cut-offs
binary.classification.table(model2,.5)

## $classification.table
##              Observed 0 Observed 1 Total
## Classified 0       3254       1733  4987
## Classified 1        145        164   309
## Total              3399       1897  5296
## 
## $rates
##                                              probability
## Observed 0s                                       0.6418
## Observed 1s                                       0.3582
## Specificity                                       0.9573
## Sensitivity                                       0.0865
## Negative predictive value                         0.6525
## Positive predictive value                         0.5307
## False positive rate for true negatives            0.0427
## False negative rate for true positives            0.9135
## False positive rate for classified negatives      0.3475
## False negative rate for classified positives      0.4693

binary.classification.table(model2,.3)

## $classification.table
##              Observed 0 Observed 1 Total
## Classified 0        567        145   712
## Classified 1       2832       1752  4584
## Total              3399       1897  5296
## 
## $rates
##                                              probability
## Observed 0s                                       0.6418
## Observed 1s                                       0.3582
## Specificity                                       0.1668
## Sensitivity                                       0.9236
## Negative predictive value                         0.7963
## Positive predictive value                         0.3822
## False positive rate for true negatives            0.8332
## False negative rate for true positives            0.0764
## False positive rate for classified negatives      0.2037
## False negative rate for classified positives      0.6178

binary.classification.table(model2,.8)

## $classification.table
##              Observed 0 Observed 1 Total
## Classified 0       3399       1897  5296
## Classified 1          0          0     0
## Total              3399       1897  5296
## 
## $rates
##                                              probability
## Observed 0s                                       0.6418
## Observed 1s                                       0.3582
## Specificity                                       1.0000
## Sensitivity                                       0.0000
## Negative predictive value                         0.6418
## Positive predictive value                            NaN
## False positive rate for true negatives            0.0000
## False negative rate for true positives            1.0000
## False positive rate for classified negatives      0.3582
## False negative rate for classified positives         NaN

# ROC curve and Area Under the Curve
roc(model2$y,fitted(model2),plot= T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## 
## Call:
## roc.default(response = model2$y, predictor = fitted(model2),     plot = T)
## 
## Data: fitted(model2) in 3399 controls (model2$y 0) < 1897 cases (model2$y 1).
## Area under the curve: 0.5777

The ROC is close to the line at the slope of 1, and the AUC value is near 0.5, suggesting that this model including the interaction term is not a very good illustration of classification.

Seminar 5

Huimin Tang

2024-11-01

Exercise 1

1.1

1.2

1.3

Exercise 2

2.1

2.2