R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
academic_performance <- read_excel("data_academic_performance.xlsx")

#First check that package required is installed, if not install it
# Specify your packages
needed_packages <- c("pastecs", "ggplot2", "semTools", "FSA")                                    
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]    
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)                              
library(pastecs) #For creating descriptive statistic summaries
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(ggplot2) #For creating histograms with more detail than plot
library(semTools) #For skewness and kurtosis
## Warning: package 'semTools' was built under R version 4.0.3
## Loading required package: lavaan
## This is lavaan 0.6-7
## lavaan is BETA software! Please report any bugs.
## 
## ###############################################################################
## This is semTools 0.5-3
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(sjstats)#chi-square effect size
## Warning: package 'sjstats' was built under R version 4.0.3
library(gmodels) #For creating histograms with more detail than plot
## Warning: package 'gmodels' was built under R version 4.0.3
## 
## Attaching package: 'gmodels'
## The following object is masked from 'package:sjstats':
## 
##     ci

##missing data handling

df_2<- academic_performance %>% dplyr::na_if(0)
df_2<- df_2 %>% dplyr::na_if('Not sure')
df_2<- df_2 %>% dplyr::na_if('Not apply')
myData<- na.omit(df_2)
myData$PEOPLE_HOUSE=as.factor(ifelse(myData$PEOPLE_HOUSE=="Once","One",myData$PEOPLE_HOUSE))

introducing Dummy variable:

myData$SCHOOL_NAT=ifelse(myData$SCHOOL_NAT== "PRIVATE", 0, ifelse(myData$SCHOOL_NAT == "PUBLIC", 1, NA))
myData$GENDER=ifelse(myData$GENDER== "M", 0, ifelse(myData$GENDER == "F", 1, NA))

Logistic Regression Model1

#SCHOOL_TYPE is the type of school(a categorical variable with three values)
#SCHOOL_NAT, a two-level categorical variable
#write is a continuous variable
with(myData, table(SCHOOL_NAT, SCHOOL_TYPE))
##           SCHOOL_TYPE
## SCHOOL_NAT ACADEMIC TECHNICAL TECHNICAL/ACADEMIC
##          0     4743       250                368
##          1     1804       645               2693
#Because school type has three levels we need to indicate which level is our reference
#We will be comparing the other types of schools to academic

myData$SCHOOL_NAT= as.factor(myData$SCHOOL_NAT)
myData$SCHOOL_TYPE= as.factor(myData$SCHOOL_TYPE)
myData$GENDER= as.factor(myData$GENDER)
 
reg_model1<-glm(SCHOOL_NAT ~ MAT_S11+CR_S11+SCHOOL_TYPE, data = myData, na.action = na.exclude, family = binomial(link=logit))
summary(reg_model1)
## 
## Call:
## glm(formula = SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE, family = binomial(link = logit), 
##     data = myData, na.action = na.exclude)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4733  -0.8002  -0.5030   0.6351   2.1231  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    1.760200   0.161895  10.872  < 2e-16 ***
## MAT_S11                       -0.025613   0.002608  -9.822  < 2e-16 ***
## CR_S11                        -0.017425   0.003051  -5.712 1.12e-08 ***
## SCHOOL_TYPETECHNICAL           1.816594   0.080853  22.468  < 2e-16 ***
## SCHOOL_TYPETECHNICAL/ACADEMIC  2.911381   0.062875  46.304  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14556  on 10502  degrees of freedom
## Residual deviance: 10702  on 10498  degrees of freedom
## AIC: 10712
## 
## Number of Fisher Scoring iterations: 4
#nce residuals, are a measure of model fit. 
#Chi-square plus significance
lmtest::lrtest(reg_model1)
## Likelihood ratio test
## 
## Model 1: SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE
## Model 2: SCHOOL_NAT ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   5 -5351.0                         
## 2   1 -7277.8 -4 3853.7  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pseudo Rsquared 
DescTools::PseudoR2(reg_model1, which="CoxSnell")
## Registered S3 method overwritten by 'DescTools':
##   method         from 
##   reorder.factor gdata
##  CoxSnell 
## 0.3071295
DescTools::PseudoR2(reg_model1, which="Nagelkerke")
## Nagelkerke 
##  0.4095653
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statsitically significant we are ok
generalhoslem::logitgof(myData$SCHOOL_NAT, fitted(reg_model1))
## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  myData$SCHOOL_NAT, fitted(reg_model1)
## X-squared = 48.13, df = 8, p-value = 9.331e-08
#Collinearity
vifmodel<-car::vif(reg_model1)#You can ignore the warning messages, GVIF^(1/(2*Df)) is the value of interest
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
vifmodel
##                 GVIF Df GVIF^(1/(2*Df))
## MAT_S11     1.539520  1        1.240774
## CR_S11      1.540354  1        1.241110
## SCHOOL_TYPE 1.003816  2        1.000953
#Tolerance
1/vifmodel
##                  GVIF  Df GVIF^(1/(2*Df))
## MAT_S11     0.6495531 1.0       0.8059486
## CR_S11      0.6492012 1.0       0.8057303
## SCHOOL_TYPE 0.9961987 0.5       0.9990483
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=myData$SCHOOL_NAT ~ myData$MAT_S11+myData$CR_S11+myData$SCHOOL_TYPE, plot="ROC")

#Confusion matrix
regclass::confusion_matrix(reg_model1)
##          Predicted 0 Predicted 1 Total
## Actual 0        4731         630  5361
## Actual 1        1795        3347  5142
## Total           6526        3977 10503
#Summary of the model with co-efficients
stargazer(reg_model1, type="text")
## 
## =========================================================
##                                   Dependent variable:    
##                               ---------------------------
##                                       SCHOOL_NAT         
## ---------------------------------------------------------
## MAT_S11                                -0.026***         
##                                         (0.003)          
##                                                          
## CR_S11                                 -0.017***         
##                                         (0.003)          
##                                                          
## SCHOOL_TYPETECHNICAL                   1.817***          
##                                         (0.081)          
##                                                          
## SCHOOL_TYPETECHNICAL/ACADEMIC          2.911***          
##                                         (0.063)          
##                                                          
## Constant                               1.760***          
##                                         (0.162)          
##                                                          
## ---------------------------------------------------------
## Observations                            10,503           
## Log Likelihood                        -5,351.002         
## Akaike Inf. Crit.                     10,712.000         
## =========================================================
## Note:                         *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the co-efficients
exp(coef(reg_model1))
##                   (Intercept)                       MAT_S11 
##                     5.8136000                     0.9747124 
##                        CR_S11          SCHOOL_TYPETECHNICAL 
##                     0.9827262                     6.1508754 
## SCHOOL_TYPETECHNICAL/ACADEMIC 
##                    18.3821707
## odds ratios 
cbind(Estimate=round(coef(reg_model1),4),
      OR=round(exp(coef(reg_model1)),4))
##                               Estimate      OR
## (Intercept)                     1.7602  5.8136
## MAT_S11                        -0.0256  0.9747
## CR_S11                         -0.0174  0.9827
## SCHOOL_TYPETECHNICAL            1.8166  6.1509
## SCHOOL_TYPETECHNICAL/ACADEMIC   2.9114 18.3822

Regression Model2

reg_model2<-glm(SCHOOL_NAT ~ MAT_S11+ CR_S11+ SCHOOL_TYPE+ GENDER, data = myData, na.action = na.exclude, family = binomial(link=logit))
summary(reg_model2)
## 
## Call:
## glm(formula = SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE + GENDER, 
##     family = binomial(link = logit), data = myData, na.action = na.exclude)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4764  -0.8007  -0.5021   0.6368   2.1130  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    1.856762   0.164550  11.284  < 2e-16 ***
## MAT_S11                       -0.027469   0.002670 -10.288  < 2e-16 ***
## CR_S11                        -0.015960   0.003082  -5.179 2.24e-07 ***
## SCHOOL_TYPETECHNICAL           1.820545   0.080926  22.497  < 2e-16 ***
## SCHOOL_TYPETECHNICAL/ACADEMIC  2.919440   0.062996  46.343  < 2e-16 ***
## GENDER1                       -0.167036   0.049833  -3.352 0.000803 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14556  on 10502  degrees of freedom
## Residual deviance: 10691  on 10497  degrees of freedom
## AIC: 10703
## 
## Number of Fisher Scoring iterations: 4
lmtest::lrtest(reg_model2)
## Likelihood ratio test
## 
## Model 1: SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE + GENDER
## Model 2: SCHOOL_NAT ~ 1
##   #Df  LogLik Df Chisq Pr(>Chisq)    
## 1   6 -5345.4                        
## 2   1 -7277.8 -5  3865  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pseudo Rsquared 
DescTools::PseudoR2(reg_model2, which="CoxSnell")
## CoxSnell 
## 0.307873
DescTools::PseudoR2(reg_model2, which="Nagelkerke")
## Nagelkerke 
##  0.4105569
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statsitically significant we are ok
generalhoslem::logitgof(myData$SCHOOL_NAT, fitted(reg_model2))
## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  myData$SCHOOL_NAT, fitted(reg_model2)
## X-squared = 31.063, df = 8, p-value = 0.0001369
#Collinearity
vifmodel<-car::vif(reg_model2)#You can ignore the warning messages, GVIF^(1/(2*Df)) is the value of interest
vifmodel
##                 GVIF Df GVIF^(1/(2*Df))
## MAT_S11     1.611889  1        1.269602
## CR_S11      1.572105  1        1.253836
## SCHOOL_TYPE 1.007235  2        1.001804
## GENDER      1.049845  1        1.024619
#tolarence#
1/vifmodel
##                  GVIF  Df GVIF^(1/(2*Df))
## MAT_S11     0.6203901 1.0       0.7876484
## CR_S11      0.6360900 1.0       0.7975525
## SCHOOL_TYPE 0.9928172 0.5       0.9981994
## GENDER      0.9525218 1.0       0.9759722
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=myData$SCHOOL_NAT ~ myData$MAT_S11+myData$CR_S11+ myData$SCHOOL_TYPE+myData$GENDER, plot="ROC")

#Confusion matrix
regclass::confusion_matrix(reg_model2)
##          Predicted 0 Predicted 1 Total
## Actual 0        4731         630  5361
## Actual 1        1797        3345  5142
## Total           6528        3975 10503
#Summary of the model with co-efficients
stargazer(reg_model2, type="text")
## 
## =========================================================
##                                   Dependent variable:    
##                               ---------------------------
##                                       SCHOOL_NAT         
## ---------------------------------------------------------
## MAT_S11                                -0.027***         
##                                         (0.003)          
##                                                          
## CR_S11                                 -0.016***         
##                                         (0.003)          
##                                                          
## SCHOOL_TYPETECHNICAL                   1.821***          
##                                         (0.081)          
##                                                          
## SCHOOL_TYPETECHNICAL/ACADEMIC          2.919***          
##                                         (0.063)          
##                                                          
## GENDER1                                -0.167***         
##                                         (0.050)          
##                                                          
## Constant                               1.857***          
##                                         (0.165)          
##                                                          
## ---------------------------------------------------------
## Observations                            10,503           
## Log Likelihood                        -5,345.364         
## Akaike Inf. Crit.                     10,702.730         
## =========================================================
## Note:                         *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the co-efficients
exp(coef(reg_model2))
##                   (Intercept)                       MAT_S11 
##                     6.4029706                     0.9729049 
##                        CR_S11          SCHOOL_TYPETECHNICAL 
##                     0.9841671                     6.1752231 
## SCHOOL_TYPETECHNICAL/ACADEMIC                       GENDER1 
##                    18.5309037                     0.8461693
## odds ratios 
cbind(Estimate=round(coef(reg_model2),4),
      OR=round(exp(coef(reg_model2)),4))
##                               Estimate      OR
## (Intercept)                     1.8568  6.4030
## MAT_S11                        -0.0275  0.9729
## CR_S11                         -0.0160  0.9842
## SCHOOL_TYPETECHNICAL            1.8205  6.1752
## SCHOOL_TYPETECHNICAL/ACADEMIC   2.9194 18.5309
## GENDER1                        -0.1670  0.8462

Regression model 3

reg_model3<-glm(SCHOOL_NAT ~ MAT_S11+ CR_S11+ SCHOOL_TYPE+PEOPLE_HOUSE, data = myData, na.action = na.exclude, family = binomial(link=logit))
summary(reg_model3)
## 
## Call:
## glm(formula = SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE + PEOPLE_HOUSE, 
##     family = binomial(link = logit), data = myData, na.action = na.exclude)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7103  -0.7837  -0.4731   0.6389   2.2212  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    2.607124   0.260460  10.010  < 2e-16 ***
## MAT_S11                       -0.025256   0.002633  -9.592  < 2e-16 ***
## CR_S11                        -0.016487   0.003084  -5.346 9.00e-08 ***
## SCHOOL_TYPETECHNICAL           1.836275   0.081659  22.487  < 2e-16 ***
## SCHOOL_TYPETECHNICAL/ACADEMIC  2.936076   0.063445  46.278  < 2e-16 ***
## PEOPLE_HOUSEFive              -0.755469   0.213704  -3.535 0.000408 ***
## PEOPLE_HOUSEFour              -1.131284   0.211905  -5.339 9.36e-08 ***
## PEOPLE_HOUSENueve             -0.073059   0.378306  -0.193 0.846864    
## PEOPLE_HOUSEOne               -0.984798   0.527813  -1.866 0.062068 .  
## PEOPLE_HOUSESeven             -0.413780   0.248292  -1.667 0.095614 .  
## PEOPLE_HOUSESix               -0.442361   0.222537  -1.988 0.046833 *  
## PEOPLE_HOUSETen               -0.162184   0.415024  -0.391 0.695957    
## PEOPLE_HOUSEThree             -1.190191   0.215650  -5.519 3.41e-08 ***
## PEOPLE_HOUSETwelve or more     0.409049   0.494822   0.827 0.408430    
## PEOPLE_HOUSETwo               -1.067239   0.238130  -4.482 7.40e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14556  on 10502  degrees of freedom
## Residual deviance: 10546  on 10488  degrees of freedom
## AIC: 10576
## 
## Number of Fisher Scoring iterations: 4
lmtest::lrtest(reg_model3)
## Likelihood ratio test
## 
## Model 1: SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE + PEOPLE_HOUSE
## Model 2: SCHOOL_NAT ~ 1
##   #Df  LogLik  Df  Chisq Pr(>Chisq)    
## 1  15 -5272.8                          
## 2   1 -7277.8 -14 4010.1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pseudo Rsquared 
DescTools::PseudoR2(reg_model3, which="CoxSnell")
##  CoxSnell 
## 0.3173715
DescTools::PseudoR2(reg_model3, which="Nagelkerke")
## Nagelkerke 
##  0.4232234
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statsitically significant we are ok
generalhoslem::logitgof(myData$SCHOOL_NAT, fitted(reg_model3))
## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  myData$SCHOOL_NAT, fitted(reg_model3)
## X-squared = 35.121, df = 8, p-value = 2.542e-05
#Collinearity
vifmodel<-car::vif(reg_model3)#You can ignore the warning messages, GVIF^(1/(2*Df)) is the value of interest
vifmodel
##                  GVIF Df GVIF^(1/(2*Df))
## MAT_S11      1.539638  1        1.240821
## CR_S11       1.541052  1        1.241391
## SCHOOL_TYPE  1.014845  2        1.003691
## PEOPLE_HOUSE 1.013573 10        1.000674
1/vifmodel 
##                   GVIF  Df GVIF^(1/(2*Df))
## MAT_S11      0.6495035 1.0       0.8059178
## CR_S11       0.6489075 1.0       0.8055479
## SCHOOL_TYPE  0.9853718 0.5       0.9963227
## PEOPLE_HOUSE 0.9866086 0.1       0.9993261
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=myData$SCHOOL_NAT ~ myData$MAT_S11+myData$CR_S11+ myData$SCHOOL_TYPE+myData$PEOPLE_HOUSE, plot="ROC")

#Confusion matrix
regclass::confusion_matrix(reg_model3)
##          Predicted 0 Predicted 1 Total
## Actual 0        4660         701  5361
## Actual 1        1695        3447  5142
## Total           6355        4148 10503
#Summary of the model with co-efficients
stargazer(reg_model3, type="text")
## 
## =========================================================
##                                   Dependent variable:    
##                               ---------------------------
##                                       SCHOOL_NAT         
## ---------------------------------------------------------
## MAT_S11                                -0.025***         
##                                         (0.003)          
##                                                          
## CR_S11                                 -0.016***         
##                                         (0.003)          
##                                                          
## SCHOOL_TYPETECHNICAL                   1.836***          
##                                         (0.082)          
##                                                          
## SCHOOL_TYPETECHNICAL/ACADEMIC          2.936***          
##                                         (0.063)          
##                                                          
## PEOPLE_HOUSEFive                       -0.755***         
##                                         (0.214)          
##                                                          
## PEOPLE_HOUSEFour                       -1.131***         
##                                         (0.212)          
##                                                          
## PEOPLE_HOUSENueve                       -0.073           
##                                         (0.378)          
##                                                          
## PEOPLE_HOUSEOne                         -0.985*          
##                                         (0.528)          
##                                                          
## PEOPLE_HOUSESeven                       -0.414*          
##                                         (0.248)          
##                                                          
## PEOPLE_HOUSESix                        -0.442**          
##                                         (0.223)          
##                                                          
## PEOPLE_HOUSETen                         -0.162           
##                                         (0.415)          
##                                                          
## PEOPLE_HOUSEThree                      -1.190***         
##                                         (0.216)          
##                                                          
## PEOPLE_HOUSETwelve or more               0.409           
##                                         (0.495)          
##                                                          
## PEOPLE_HOUSETwo                        -1.067***         
##                                         (0.238)          
##                                                          
## Constant                               2.607***          
##                                         (0.260)          
##                                                          
## ---------------------------------------------------------
## Observations                            10,503           
## Log Likelihood                        -5,272.795         
## Akaike Inf. Crit.                     10,575.590         
## =========================================================
## Note:                         *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the co-efficients
exp(coef(reg_model3))
##                   (Intercept)                       MAT_S11 
##                    13.5599983                     0.9750603 
##                        CR_S11          SCHOOL_TYPETECHNICAL 
##                     0.9836484                     6.2731266 
## SCHOOL_TYPETECHNICAL/ACADEMIC              PEOPLE_HOUSEFive 
##                    18.8417632                     0.4697902 
##              PEOPLE_HOUSEFour             PEOPLE_HOUSENueve 
##                     0.3226186                     0.9295460 
##               PEOPLE_HOUSEOne             PEOPLE_HOUSESeven 
##                     0.3735145                     0.6611467 
##               PEOPLE_HOUSESix               PEOPLE_HOUSETen 
##                     0.6425174                     0.8502843 
##             PEOPLE_HOUSEThree    PEOPLE_HOUSETwelve or more 
##                     0.3041632                     1.5053862 
##               PEOPLE_HOUSETwo 
##                     0.3439567
## odds ratios 
cbind(Estimate=round(coef(reg_model3),4),
      OR=round(exp(coef(reg_model3)),4))
##                               Estimate      OR
## (Intercept)                     2.6071 13.5600
## MAT_S11                        -0.0253  0.9751
## CR_S11                         -0.0165  0.9836
## SCHOOL_TYPETECHNICAL            1.8363  6.2731
## SCHOOL_TYPETECHNICAL/ACADEMIC   2.9361 18.8418
## PEOPLE_HOUSEFive               -0.7555  0.4698
## PEOPLE_HOUSEFour               -1.1313  0.3226
## PEOPLE_HOUSENueve              -0.0731  0.9295
## PEOPLE_HOUSEOne                -0.9848  0.3735
## PEOPLE_HOUSESeven              -0.4138  0.6611
## PEOPLE_HOUSESix                -0.4424  0.6425
## PEOPLE_HOUSETen                -0.1622  0.8503
## PEOPLE_HOUSEThree              -1.1902  0.3042
## PEOPLE_HOUSETwelve or more      0.4090  1.5054
## PEOPLE_HOUSETwo                -1.0672  0.3440