Exercise 11 - Probit Analysis - Qingzhou Shi

Data information

A researcher is interested in how variables, such as reading score, science score, and SES level(1=low, 2=medium, 3=high) affect TestResult (0=fail, 1=pass). The response variable (TestResult), pass/fail is a binary variable. Please use ‘probit_exercise.sav’.

library(haven)
probit_exercise <- read_sav("probit_exercise.sav")
View(probit_exercise)
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/
## Resources/modules/R_de.so'' had status 1
prob_data <- probit_exercise
summary(prob_data)
##        id              ses             read          science     
##  Min.   :  1.00   Min.   :1.000   Min.   :28.00   Min.   :26.00  
##  1st Qu.: 50.75   1st Qu.:2.000   1st Qu.:44.00   1st Qu.:44.00  
##  Median :100.50   Median :2.000   Median :50.00   Median :53.00  
##  Mean   :100.50   Mean   :2.055   Mean   :52.23   Mean   :51.85  
##  3rd Qu.:150.25   3rd Qu.:3.000   3rd Qu.:60.00   3rd Qu.:58.00  
##  Max.   :200.00   Max.   :3.000   Max.   :76.00   Max.   :74.00  
##    TestResult   
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.265  
##  3rd Qu.:1.000  
##  Max.   :1.000
str(prob_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    200 obs. of  5 variables:
##  $ id        : num  1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "format.spss")= chr "F9.0"
##  $ ses       : 'haven_labelled' num  1 2 1 1 1 1 2 1 2 2 ...
##   ..- attr(*, "format.spss")= chr "F9.0"
##   ..- attr(*, "labels")= Named num  1 2 3
##   .. ..- attr(*, "names")= chr  "low" "middle" "high"
##  $ read      : num  34 39 63 44 47 47 57 39 48 47 ...
##   ..- attr(*, "label")= chr "read"
##   ..- attr(*, "format.spss")= chr "F9.0"
##  $ science   : num  39 42 63 39 45 40 47 44 44 53 ...
##   ..- attr(*, "label")= chr "science"
##   ..- attr(*, "format.spss")= chr "F9.0"
##  $ TestResult: 'haven_labelled' num  0 0 1 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F8.0"
##   ..- attr(*, "display_width")= int 10
##   ..- attr(*, "labels")= Named num  0 1
##   .. ..- attr(*, "names")= chr  "fail" "pass"
prob_data$ses <- factor(prob_data$ses,levels=c(1,2,3),labels=c("low", "middle", "high"))
prob_data$TestResult_re <- prob_data$TestResult
library(car)
## Loading required package: carData
prob_data$TestResult_re<-recode(prob_data$TestResult_re, "0=1; 1=0")
prob_data$TestResult_re <- factor(prob_data$TestResult_re,levels=c(0,1),labels=c("pass","fail"))
xtabs(~ses + TestResult_re, data = prob_data)
##         TestResult_re
## ses      pass fail
##   low      11   36
##   middle   16   79
##   high     26   32

Do a probit analysis

library(stats)
# Build the Probit logistic Regression model
prob_model <- glm(TestResult_re ~ ses + read + science, family = binomial(link = "probit"), 
    data = prob_data)
summary(prob_model)
## 
## Call:
## glm(formula = TestResult_re ~ ses + read + science, family = binomial(link = "probit"), 
##     data = prob_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3811  -0.6141   0.3236   0.5978   2.1084  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.58434    0.82835   6.742 1.57e-11 ***
## sesmiddle    0.60798    0.29759   2.043   0.0411 *  
## seshigh      0.03948    0.31160   0.127   0.8992    
## read        -0.05916    0.01437  -4.118 3.82e-05 ***
## science     -0.03692    0.01535  -2.405   0.0162 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 231.29  on 199  degrees of freedom
## Residual deviance: 164.84  on 195  degrees of freedom
## AIC: 174.84
## 
## Number of Fisher Scoring iterations: 6

1) Test a full model using a chi-square test and R2.

OIM <- glm(TestResult_re ~ 1,family = binomial(link = "probit"), data = prob_data)
summary(OIM)
## 
## Call:
## glm(formula = TestResult_re ~ 1, family = binomial(link = "probit"), 
##     data = prob_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6297  -1.6297   0.7847   0.7847   0.7847  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.62801    0.09528   6.591 4.36e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 231.29  on 199  degrees of freedom
## Residual deviance: 231.29  on 199  degrees of freedom
## AIC: 233.29
## 
## Number of Fisher Scoring iterations: 4
anova(OIM,prob_model,test="F")
## Warning: using F test with a 'binomial' family is inappropriate
## Analysis of Deviance Table
## 
## Model 1: TestResult_re ~ 1
## Model 2: TestResult_re ~ ses + read + science
##   Resid. Df Resid. Dev Df Deviance      F    Pr(>F)    
## 1       199     231.29                                 
## 2       195     164.84  4   66.445 16.611 1.276e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The chi-square test yeilds a result of 66.445 (df=4), p < .001. This means that we can reject the null hypothesis that the model without predictors is as good as the model with the predictors, incidating the two models are significantly different.

head(prob_model$fitted.values,20)
##         1         2         3         4         5         6         7 
## 0.9835309 0.9902084 0.3195324 0.9383689 0.8733142 0.9077154 0.8609771 
##         8         9        10        11        12        13        14 
## 0.9507746 0.9580006 0.9271361 0.9969357 0.9948164 0.9531633 0.9018985 
##        15        16        17        18        19        20 
## 0.9907763 0.9298281 0.9630395 0.9462594 0.9893657 0.4292829
head(predict(prob_model),20)
##          1          2          3          4          5          6 
##  2.1328351  2.3342385 -0.4690067  1.5412240  1.1421983  1.3268169 
##          7          8          9         10         11         12 
##  1.0847198  1.6524109  1.7279410  1.4547886  2.7408153  2.5633319 
##         13         14         15         16         17         18 
##  1.6763309  1.2924452  2.3565138  1.4745118  1.7871021  1.6096188 
##         19         20 
##  2.3031832 -0.1782001
chisq.test(prob_data$TestResult_re,predict(prob_model))
## Warning in chisq.test(prob_data$TestResult_re, predict(prob_model)): Chi-
## squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  prob_data$TestResult_re and predict(prob_model)
## X-squared = 179.46, df = 168, p-value = 0.2585

The Pearson’s Chi-squared test yeilded a value of 179.46 (df=168), p > .05. This means we can retain the null hypothesis that there is no significant difference between the observed and expected cell counts, indicating a good model fit.

library("DescTools")
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
## 
##     Recode
PseudoR2(prob_model, which = c("CoxSnell","Nagelkerke","McFadden"))
##   CoxSnell Nagelkerke   McFadden 
##  0.2826732  0.4124225  0.2872804

There is a relationship of 28.27% between independent variables and dependent variable based on CoxSnell’s R2, 41.24% on Nagelkerke’s R2, and 28.73% on McFadden’s R2.

2) Interpret the parameter estimates and make your conclusion.

summary(prob_model)
## 
## Call:
## glm(formula = TestResult_re ~ ses + read + science, family = binomial(link = "probit"), 
##     data = prob_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3811  -0.6141   0.3236   0.5978   2.1084  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.58434    0.82835   6.742 1.57e-11 ***
## sesmiddle    0.60798    0.29759   2.043   0.0411 *  
## seshigh      0.03948    0.31160   0.127   0.8992    
## read        -0.05916    0.01437  -4.118 3.82e-05 ***
## science     -0.03692    0.01535  -2.405   0.0162 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 231.29  on 199  degrees of freedom
## Residual deviance: 164.84  on 195  degrees of freedom
## AIC: 174.84
## 
## Number of Fisher Scoring iterations: 6
(Prob_estimates <- coef(summary(prob_model)))
##                Estimate Std. Error    z value     Pr(>|z|)
## (Intercept)  5.58433845 0.82835063  6.7415153 1.567432e-11
## sesmiddle    0.60798011 0.29758897  2.0430196 4.105050e-02
## seshigh      0.03947572 0.31159631  0.1266887 8.991868e-01
## read        -0.05916111 0.01436673 -4.1179257 3.822980e-05
## science     -0.03692373 0.01535313 -2.4049648 1.617402e-02

The parameter estimates indicate that variables of sesmiddle (the midddle SES level, z value = 2.043, p < .05), read (reading score, z value = -4.118, p < .001) and science (science score, z value = -2.405, p < .05) are statistically significant. According to these results, a student is more likely to pass the test when his/her reading score and sciencde score are higher. Also, a student is less likely to pass the test when he/she is from a family with middle SES level than who is from a family with low SES level.
For each one unit increase in reading score, the z-score increases by 0.059. For each one unit increase in science score, the z-score increases by 0.037. Having a family with a middle SES level, versus having a family with a low SES level, the z-score decreases by 0.608. Having a family with a high SES level, versus having a family with a low SES level, the z-score decreases by 0.039.

3) Please provide a predictive equation

\(Probit[P(Y=0=pass)]=α_0-β_1sesmiddle-β_2seshigh-β_3read-β_4science\)
\(=5.584-.608sesmiddle-.039seshigh+.059read+.037science\)