This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
academic_performance <- read_excel("data_academic_performance.xlsx")
#First check that package required is installed, if not install it
# Specify your packages
needed_packages <- c("pastecs", "ggplot2", "semTools", "FSA")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(pastecs) #For creating descriptive statistic summaries
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(ggplot2) #For creating histograms with more detail than plot
library(semTools) #For skewness and kurtosis
## Warning: package 'semTools' was built under R version 4.0.3
## Loading required package: lavaan
## This is lavaan 0.6-7
## lavaan is BETA software! Please report any bugs.
##
## ###############################################################################
## This is semTools 0.5-3
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(sjstats)#chi-square effect size
## Warning: package 'sjstats' was built under R version 4.0.3
library(gmodels) #For creating histograms with more detail than plot
## Warning: package 'gmodels' was built under R version 4.0.3
##
## Attaching package: 'gmodels'
## The following object is masked from 'package:sjstats':
##
## ci
##missing data handling
df_2<- academic_performance %>% dplyr::na_if(0)
df_2<- df_2 %>% dplyr::na_if('Not sure')
df_2<- df_2 %>% dplyr::na_if('Not apply')
myData<- na.omit(df_2)
myData$PEOPLE_HOUSE=as.factor(ifelse(myData$PEOPLE_HOUSE=="Once","One",myData$PEOPLE_HOUSE))
myData$SCHOOL_NAT=ifelse(myData$SCHOOL_NAT== "PRIVATE", 0, ifelse(myData$SCHOOL_NAT == "PUBLIC", 1, NA))
myData$GENDER=ifelse(myData$GENDER== "M", 0, ifelse(myData$GENDER == "F", 1, NA))
#SCHOOL_TYPE is the type of school(a categorical variable with three values)
#SCHOOL_NAT, a two-level categorical variable
#write is a continuous variable
with(myData, table(SCHOOL_NAT, SCHOOL_TYPE))
## SCHOOL_TYPE
## SCHOOL_NAT ACADEMIC TECHNICAL TECHNICAL/ACADEMIC
## 0 4743 250 368
## 1 1804 645 2693
#Because school type has three levels we need to indicate which level is our reference
#We will be comparing the other types of schools to academic
myData$SCHOOL_NAT= as.factor(myData$SCHOOL_NAT)
myData$SCHOOL_TYPE= as.factor(myData$SCHOOL_TYPE)
myData$GENDER= as.factor(myData$GENDER)
reg_model1<-glm(SCHOOL_NAT ~ MAT_S11+CR_S11+SCHOOL_TYPE, data = myData, na.action = na.exclude, family = binomial(link=logit))
summary(reg_model1)
##
## Call:
## glm(formula = SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE, family = binomial(link = logit),
## data = myData, na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4733 -0.8002 -0.5030 0.6351 2.1231
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.760200 0.161895 10.872 < 2e-16 ***
## MAT_S11 -0.025613 0.002608 -9.822 < 2e-16 ***
## CR_S11 -0.017425 0.003051 -5.712 1.12e-08 ***
## SCHOOL_TYPETECHNICAL 1.816594 0.080853 22.468 < 2e-16 ***
## SCHOOL_TYPETECHNICAL/ACADEMIC 2.911381 0.062875 46.304 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14556 on 10502 degrees of freedom
## Residual deviance: 10702 on 10498 degrees of freedom
## AIC: 10712
##
## Number of Fisher Scoring iterations: 4
#nce residuals, are a measure of model fit.
#Chi-square plus significance
lmtest::lrtest(reg_model1)
## Likelihood ratio test
##
## Model 1: SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE
## Model 2: SCHOOL_NAT ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 5 -5351.0
## 2 1 -7277.8 -4 3853.7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pseudo Rsquared
DescTools::PseudoR2(reg_model1, which="CoxSnell")
## Registered S3 method overwritten by 'DescTools':
## method from
## reorder.factor gdata
## CoxSnell
## 0.3071295
DescTools::PseudoR2(reg_model1, which="Nagelkerke")
## Nagelkerke
## 0.4095653
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statsitically significant we are ok
generalhoslem::logitgof(myData$SCHOOL_NAT, fitted(reg_model1))
##
## Hosmer and Lemeshow test (binary model)
##
## data: myData$SCHOOL_NAT, fitted(reg_model1)
## X-squared = 48.13, df = 8, p-value = 9.331e-08
#Collinearity
vifmodel<-car::vif(reg_model1)#You can ignore the warning messages, GVIF^(1/(2*Df)) is the value of interest
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
vifmodel
## GVIF Df GVIF^(1/(2*Df))
## MAT_S11 1.539520 1 1.240774
## CR_S11 1.540354 1 1.241110
## SCHOOL_TYPE 1.003816 2 1.000953
#Tolerance
1/vifmodel
## GVIF Df GVIF^(1/(2*Df))
## MAT_S11 0.6495531 1.0 0.8059486
## CR_S11 0.6492012 1.0 0.8057303
## SCHOOL_TYPE 0.9961987 0.5 0.9990483
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=myData$SCHOOL_NAT ~ myData$MAT_S11+myData$CR_S11+myData$SCHOOL_TYPE, plot="ROC")
#Confusion matrix
regclass::confusion_matrix(reg_model1)
## Predicted 0 Predicted 1 Total
## Actual 0 4731 630 5361
## Actual 1 1795 3347 5142
## Total 6526 3977 10503
#Summary of the model with co-efficients
stargazer(reg_model1, type="text")
##
## =========================================================
## Dependent variable:
## ---------------------------
## SCHOOL_NAT
## ---------------------------------------------------------
## MAT_S11 -0.026***
## (0.003)
##
## CR_S11 -0.017***
## (0.003)
##
## SCHOOL_TYPETECHNICAL 1.817***
## (0.081)
##
## SCHOOL_TYPETECHNICAL/ACADEMIC 2.911***
## (0.063)
##
## Constant 1.760***
## (0.162)
##
## ---------------------------------------------------------
## Observations 10,503
## Log Likelihood -5,351.002
## Akaike Inf. Crit. 10,712.000
## =========================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the co-efficients
exp(coef(reg_model1))
## (Intercept) MAT_S11
## 5.8136000 0.9747124
## CR_S11 SCHOOL_TYPETECHNICAL
## 0.9827262 6.1508754
## SCHOOL_TYPETECHNICAL/ACADEMIC
## 18.3821707
## odds ratios
cbind(Estimate=round(coef(reg_model1),4),
OR=round(exp(coef(reg_model1)),4))
## Estimate OR
## (Intercept) 1.7602 5.8136
## MAT_S11 -0.0256 0.9747
## CR_S11 -0.0174 0.9827
## SCHOOL_TYPETECHNICAL 1.8166 6.1509
## SCHOOL_TYPETECHNICAL/ACADEMIC 2.9114 18.3822
reg_model2<-glm(SCHOOL_NAT ~ MAT_S11+ CR_S11+ SCHOOL_TYPE+ GENDER, data = myData, na.action = na.exclude, family = binomial(link=logit))
summary(reg_model2)
##
## Call:
## glm(formula = SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE + GENDER,
## family = binomial(link = logit), data = myData, na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4764 -0.8007 -0.5021 0.6368 2.1130
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.856762 0.164550 11.284 < 2e-16 ***
## MAT_S11 -0.027469 0.002670 -10.288 < 2e-16 ***
## CR_S11 -0.015960 0.003082 -5.179 2.24e-07 ***
## SCHOOL_TYPETECHNICAL 1.820545 0.080926 22.497 < 2e-16 ***
## SCHOOL_TYPETECHNICAL/ACADEMIC 2.919440 0.062996 46.343 < 2e-16 ***
## GENDER1 -0.167036 0.049833 -3.352 0.000803 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14556 on 10502 degrees of freedom
## Residual deviance: 10691 on 10497 degrees of freedom
## AIC: 10703
##
## Number of Fisher Scoring iterations: 4
lmtest::lrtest(reg_model2)
## Likelihood ratio test
##
## Model 1: SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE + GENDER
## Model 2: SCHOOL_NAT ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 6 -5345.4
## 2 1 -7277.8 -5 3865 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pseudo Rsquared
DescTools::PseudoR2(reg_model2, which="CoxSnell")
## CoxSnell
## 0.307873
DescTools::PseudoR2(reg_model2, which="Nagelkerke")
## Nagelkerke
## 0.4105569
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statsitically significant we are ok
generalhoslem::logitgof(myData$SCHOOL_NAT, fitted(reg_model2))
##
## Hosmer and Lemeshow test (binary model)
##
## data: myData$SCHOOL_NAT, fitted(reg_model2)
## X-squared = 31.063, df = 8, p-value = 0.0001369
#Collinearity
vifmodel<-car::vif(reg_model2)#You can ignore the warning messages, GVIF^(1/(2*Df)) is the value of interest
vifmodel
## GVIF Df GVIF^(1/(2*Df))
## MAT_S11 1.611889 1 1.269602
## CR_S11 1.572105 1 1.253836
## SCHOOL_TYPE 1.007235 2 1.001804
## GENDER 1.049845 1 1.024619
#tolarence#
1/vifmodel
## GVIF Df GVIF^(1/(2*Df))
## MAT_S11 0.6203901 1.0 0.7876484
## CR_S11 0.6360900 1.0 0.7975525
## SCHOOL_TYPE 0.9928172 0.5 0.9981994
## GENDER 0.9525218 1.0 0.9759722
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=myData$SCHOOL_NAT ~ myData$MAT_S11+myData$CR_S11+ myData$SCHOOL_TYPE+myData$GENDER, plot="ROC")
#Confusion matrix
regclass::confusion_matrix(reg_model2)
## Predicted 0 Predicted 1 Total
## Actual 0 4731 630 5361
## Actual 1 1797 3345 5142
## Total 6528 3975 10503
#Summary of the model with co-efficients
stargazer(reg_model2, type="text")
##
## =========================================================
## Dependent variable:
## ---------------------------
## SCHOOL_NAT
## ---------------------------------------------------------
## MAT_S11 -0.027***
## (0.003)
##
## CR_S11 -0.016***
## (0.003)
##
## SCHOOL_TYPETECHNICAL 1.821***
## (0.081)
##
## SCHOOL_TYPETECHNICAL/ACADEMIC 2.919***
## (0.063)
##
## GENDER1 -0.167***
## (0.050)
##
## Constant 1.857***
## (0.165)
##
## ---------------------------------------------------------
## Observations 10,503
## Log Likelihood -5,345.364
## Akaike Inf. Crit. 10,702.730
## =========================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the co-efficients
exp(coef(reg_model2))
## (Intercept) MAT_S11
## 6.4029706 0.9729049
## CR_S11 SCHOOL_TYPETECHNICAL
## 0.9841671 6.1752231
## SCHOOL_TYPETECHNICAL/ACADEMIC GENDER1
## 18.5309037 0.8461693
## odds ratios
cbind(Estimate=round(coef(reg_model2),4),
OR=round(exp(coef(reg_model2)),4))
## Estimate OR
## (Intercept) 1.8568 6.4030
## MAT_S11 -0.0275 0.9729
## CR_S11 -0.0160 0.9842
## SCHOOL_TYPETECHNICAL 1.8205 6.1752
## SCHOOL_TYPETECHNICAL/ACADEMIC 2.9194 18.5309
## GENDER1 -0.1670 0.8462
reg_model3<-glm(SCHOOL_NAT ~ MAT_S11+ CR_S11+ SCHOOL_TYPE+PEOPLE_HOUSE, data = myData, na.action = na.exclude, family = binomial(link=logit))
summary(reg_model3)
##
## Call:
## glm(formula = SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE + PEOPLE_HOUSE,
## family = binomial(link = logit), data = myData, na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7103 -0.7837 -0.4731 0.6389 2.2212
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.607124 0.260460 10.010 < 2e-16 ***
## MAT_S11 -0.025256 0.002633 -9.592 < 2e-16 ***
## CR_S11 -0.016487 0.003084 -5.346 9.00e-08 ***
## SCHOOL_TYPETECHNICAL 1.836275 0.081659 22.487 < 2e-16 ***
## SCHOOL_TYPETECHNICAL/ACADEMIC 2.936076 0.063445 46.278 < 2e-16 ***
## PEOPLE_HOUSEFive -0.755469 0.213704 -3.535 0.000408 ***
## PEOPLE_HOUSEFour -1.131284 0.211905 -5.339 9.36e-08 ***
## PEOPLE_HOUSENueve -0.073059 0.378306 -0.193 0.846864
## PEOPLE_HOUSEOne -0.984798 0.527813 -1.866 0.062068 .
## PEOPLE_HOUSESeven -0.413780 0.248292 -1.667 0.095614 .
## PEOPLE_HOUSESix -0.442361 0.222537 -1.988 0.046833 *
## PEOPLE_HOUSETen -0.162184 0.415024 -0.391 0.695957
## PEOPLE_HOUSEThree -1.190191 0.215650 -5.519 3.41e-08 ***
## PEOPLE_HOUSETwelve or more 0.409049 0.494822 0.827 0.408430
## PEOPLE_HOUSETwo -1.067239 0.238130 -4.482 7.40e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14556 on 10502 degrees of freedom
## Residual deviance: 10546 on 10488 degrees of freedom
## AIC: 10576
##
## Number of Fisher Scoring iterations: 4
lmtest::lrtest(reg_model3)
## Likelihood ratio test
##
## Model 1: SCHOOL_NAT ~ MAT_S11 + CR_S11 + SCHOOL_TYPE + PEOPLE_HOUSE
## Model 2: SCHOOL_NAT ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 15 -5272.8
## 2 1 -7277.8 -14 4010.1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Pseudo Rsquared
DescTools::PseudoR2(reg_model3, which="CoxSnell")
## CoxSnell
## 0.3173715
DescTools::PseudoR2(reg_model3, which="Nagelkerke")
## Nagelkerke
## 0.4232234
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statsitically significant we are ok
generalhoslem::logitgof(myData$SCHOOL_NAT, fitted(reg_model3))
##
## Hosmer and Lemeshow test (binary model)
##
## data: myData$SCHOOL_NAT, fitted(reg_model3)
## X-squared = 35.121, df = 8, p-value = 2.542e-05
#Collinearity
vifmodel<-car::vif(reg_model3)#You can ignore the warning messages, GVIF^(1/(2*Df)) is the value of interest
vifmodel
## GVIF Df GVIF^(1/(2*Df))
## MAT_S11 1.539638 1 1.240821
## CR_S11 1.541052 1 1.241391
## SCHOOL_TYPE 1.014845 2 1.003691
## PEOPLE_HOUSE 1.013573 10 1.000674
1/vifmodel
## GVIF Df GVIF^(1/(2*Df))
## MAT_S11 0.6495035 1.0 0.8059178
## CR_S11 0.6489075 1.0 0.8055479
## SCHOOL_TYPE 0.9853718 0.5 0.9963227
## PEOPLE_HOUSE 0.9866086 0.1 0.9993261
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=myData$SCHOOL_NAT ~ myData$MAT_S11+myData$CR_S11+ myData$SCHOOL_TYPE+myData$PEOPLE_HOUSE, plot="ROC")
#Confusion matrix
regclass::confusion_matrix(reg_model3)
## Predicted 0 Predicted 1 Total
## Actual 0 4660 701 5361
## Actual 1 1695 3447 5142
## Total 6355 4148 10503
#Summary of the model with co-efficients
stargazer(reg_model3, type="text")
##
## =========================================================
## Dependent variable:
## ---------------------------
## SCHOOL_NAT
## ---------------------------------------------------------
## MAT_S11 -0.025***
## (0.003)
##
## CR_S11 -0.016***
## (0.003)
##
## SCHOOL_TYPETECHNICAL 1.836***
## (0.082)
##
## SCHOOL_TYPETECHNICAL/ACADEMIC 2.936***
## (0.063)
##
## PEOPLE_HOUSEFive -0.755***
## (0.214)
##
## PEOPLE_HOUSEFour -1.131***
## (0.212)
##
## PEOPLE_HOUSENueve -0.073
## (0.378)
##
## PEOPLE_HOUSEOne -0.985*
## (0.528)
##
## PEOPLE_HOUSESeven -0.414*
## (0.248)
##
## PEOPLE_HOUSESix -0.442**
## (0.223)
##
## PEOPLE_HOUSETen -0.162
## (0.415)
##
## PEOPLE_HOUSEThree -1.190***
## (0.216)
##
## PEOPLE_HOUSETwelve or more 0.409
## (0.495)
##
## PEOPLE_HOUSETwo -1.067***
## (0.238)
##
## Constant 2.607***
## (0.260)
##
## ---------------------------------------------------------
## Observations 10,503
## Log Likelihood -5,272.795
## Akaike Inf. Crit. 10,575.590
## =========================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the co-efficients
exp(coef(reg_model3))
## (Intercept) MAT_S11
## 13.5599983 0.9750603
## CR_S11 SCHOOL_TYPETECHNICAL
## 0.9836484 6.2731266
## SCHOOL_TYPETECHNICAL/ACADEMIC PEOPLE_HOUSEFive
## 18.8417632 0.4697902
## PEOPLE_HOUSEFour PEOPLE_HOUSENueve
## 0.3226186 0.9295460
## PEOPLE_HOUSEOne PEOPLE_HOUSESeven
## 0.3735145 0.6611467
## PEOPLE_HOUSESix PEOPLE_HOUSETen
## 0.6425174 0.8502843
## PEOPLE_HOUSEThree PEOPLE_HOUSETwelve or more
## 0.3041632 1.5053862
## PEOPLE_HOUSETwo
## 0.3439567
## odds ratios
cbind(Estimate=round(coef(reg_model3),4),
OR=round(exp(coef(reg_model3)),4))
## Estimate OR
## (Intercept) 2.6071 13.5600
## MAT_S11 -0.0253 0.9751
## CR_S11 -0.0165 0.9836
## SCHOOL_TYPETECHNICAL 1.8363 6.2731
## SCHOOL_TYPETECHNICAL/ACADEMIC 2.9361 18.8418
## PEOPLE_HOUSEFive -0.7555 0.4698
## PEOPLE_HOUSEFour -1.1313 0.3226
## PEOPLE_HOUSENueve -0.0731 0.9295
## PEOPLE_HOUSEOne -0.9848 0.3735
## PEOPLE_HOUSESeven -0.4138 0.6611
## PEOPLE_HOUSESix -0.4424 0.6425
## PEOPLE_HOUSETen -0.1622 0.8503
## PEOPLE_HOUSEThree -1.1902 0.3042
## PEOPLE_HOUSETwelve or more 0.4090 1.5054
## PEOPLE_HOUSETwo -1.0672 0.3440