needed_packages <- c("foreign", "Epi", "arm", "DescTools", "stargazer", "lmtest", "car", "generalhoslem", "regclass")
# Extract not installed packages
needed_packages <- c("foreign", "Epi", "arm", "DescTools", "stargazer", "lmtest", "car", "generalhoslem", "regclass")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed, repos = "http://cran.us.r-project.org")
library(Epi)#ROC Curve
## Warning: package 'Epi' was built under R version 4.0.3
library(DescTools)#Pseudo Rsquare statistics
## Warning: package 'DescTools' was built under R version 4.0.3
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(foreign)#read SPSS file.
library(arm)#for invlogit calculating predicted probabilities
## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lme4
##
## Attaching package: 'lme4'
## The following object is masked from 'package:Epi':
##
## factorize
##
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is C:/Users/dorot/OneDrive/Documents/Assignment
library(lmtest)#Simple calculation of Chi-square for model
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)#Needed to test for colinearity of predictors
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
##
## Attaching package: 'car'
## The following object is masked from 'package:arm':
##
## logit
## The following object is masked from 'package:DescTools':
##
## Recode
library(generalhoslem)#Needed to test assumption of linearity
## Warning: package 'generalhoslem' was built under R version 4.0.3
## Loading required package: reshape
## Warning: package 'reshape' was built under R version 4.0.3
##
## Attaching package: 'reshape'
## The following object is masked from 'package:Matrix':
##
## expand
library("regclass")#For confusion matrix
## Warning: package 'regclass' was built under R version 4.0.3
## Loading required package: bestglm
## Warning: package 'bestglm' was built under R version 4.0.3
## Loading required package: leaps
## Loading required package: VGAM
## Warning: package 'VGAM' was built under R version 4.0.3
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:lmtest':
##
## lrtest
## The following object is masked from 'package:arm':
##
## logit
## Loading required package: rpart
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.0.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.
##
## Attaching package: 'regclass'
## The following object is masked from 'package:DescTools':
##
## VIF
### loading the dataset#####
studentData<- read.csv('Student-performance.csv')
studentData$sex=as.factor(studentData$sex)
studentData$freetime.m<-as.factor(studentData$freetime.m)
studentData$activities.m<- as.factor(studentData$activities.m)
#Check your proportions of the outcome variable for bias - are these representative?
table(studentData$activities.m)
##
## no yes
## 181 201
Build first model with sex as predictor
#Make sure categorical data is used as factors
logmodel1 <- glm(activities.m ~ sex, data = studentData, na.action = na.exclude, family = binomial(link=logit))
#Full summary of the model
summary(logmodel1)
##
## Call:
## glm(formula = activities.m ~ sex, family = binomial(link = logit),
## data = studentData, na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.320 -1.135 1.041 1.041 1.221
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1011 0.1423 -0.710 0.4775
## sexM 0.4301 0.2064 2.084 0.0371 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 528.52 on 381 degrees of freedom
## Residual deviance: 524.15 on 380 degrees of freedom
## AIC: 528.15
##
## Number of Fisher Scoring iterations: 4
#Chi-square plus significance
lmtest::lrtest(logmodel1)
## Likelihood ratio test
##
## Model 1: activities.m ~ sex
## Model 2: activities.m ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 2 -262.07
## 2 1 -264.26 -1 4.3709 0.03656 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Chi-square and Pseudo R2 calculation - THESE ARE INCLUDED FOR INFORMATION ONLY
#lmtest:lrtest achieves the same thing
modelChi <- logmodel1$null.deviance - logmodel1$deviance
modelChi
## [1] 4.370875
pseudo.R2 <- modelChi / logmodel1$null.deviance
pseudo.R2
## [1] 0.008270078
chidf <- logmodel1$df.null - logmodel1$df.residual
chidf
## [1] 1
chisq.prob <- 1 - pchisq(modelChi, chidf)
chisq.prob
## [1] 0.03655821
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=studentData$activities.m ~ studentData$sex, plot="ROC")

#Pseudo Rsquared
DescTools::PseudoR2(logmodel1, which="CoxSnell")
## CoxSnell
## 0.01137687
DescTools::PseudoR2(logmodel1, which="Nagelkerke")
## Nagelkerke
## 0.01518306
#Summary of the model with coefficients
stargazer(logmodel1, type="text")
##
## =============================================
## Dependent variable:
## ---------------------------
## activities.m
## ---------------------------------------------
## sexM 0.430**
## (0.206)
##
## Constant -0.101
## (0.142)
##
## ---------------------------------------------
## Observations 382
## Log Likelihood -262.073
## Akaike Inf. Crit. 528.146
## =============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the coefficients
exp(coefficients(logmodel1))
## (Intercept) sexM
## 0.9038462 1.5374413
## odds ratios and 95% CI
cbind(Estimate=round(coef(logmodel1),4),
OR=round(exp(coef(logmodel1)),4))
## Estimate OR
## (Intercept) -0.1011 0.9038
## sexM 0.4301 1.5374
#Probability of having extra curricular activities when male
arm::invlogit(coef(logmodel1)[1]+ coef(logmodel1)[2]*0)#YES this is the same as just having the 1st co-efficient
## (Intercept)
## 0.4747475
#Probability of having extra curricular activities when female
arm::invlogit(coef(logmodel1)[1]+ coef(logmodel1)[2]*1)
## (Intercept)
## 0.5815217
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statistically significant we are ok
#Won't give a p-value here because only one predictor
generalhoslem::logitgof(studentData$activities.m, fitted(logmodel1))
## Warning in generalhoslem::logitgof(studentData$activities.m, fitted(logmodel1)):
## Not possible to compute 10 rows. There might be too few observations.
## Warning in pchisq(chisq, PARAMETER): NaNs produced
##
## Hosmer and Lemeshow test (binary model)
##
## data: studentData$activities.m, fitted(logmodel1)
## X-squared = 7.6336e-29, df = -1, p-value = NA
#We would check for collinearity here but as we only have one predictor it doesn't make sense
include free time to get interaction effect
logmodel2 <- glm(activities.m ~ sex+freetime.m, data = studentData, na.action = na.exclude, family = binomial(link=logit))
#Full summary of the model
summary(logmodel2)
##
## Call:
## glm(formula = activities.m ~ sex + freetime.m, family = binomial(link = logit),
## data = studentData, na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5369 -1.1585 0.8563 1.0934 1.4106
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5332 0.4873 -1.094 0.2739
## sexM 0.3557 0.2130 1.670 0.0948 .
## freetime.m2 0.3784 0.5494 0.689 0.4911
## freetime.m3 0.4885 0.5122 0.954 0.3402
## freetime.m4 0.3960 0.5256 0.753 0.4512
## freetime.m5 0.9918 0.6097 1.627 0.1038
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 528.52 on 381 degrees of freedom
## Residual deviance: 520.71 on 376 degrees of freedom
## AIC: 532.71
##
## Number of Fisher Scoring iterations: 4
#Chi-square plus significance
lmtest::lrtest(logmodel2)
## Likelihood ratio test
##
## Model 1: activities.m ~ sex + freetime.m
## Model 2: activities.m ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 6 -260.36
## 2 1 -264.26 -5 7.8065 0.1672
#Pseudo Rsquared
DescTools::PseudoR2(logmodel2, which="CoxSnell")
## CoxSnell
## 0.02022843
DescTools::PseudoR2(logmodel2, which="Nagelkerke")
## Nagelkerke
## 0.02699595
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=studentData$activities.m ~ studentData$sex+ studentData$freetime.m, plot="ROC")

#Summary of the model with coefficients
stargazer(logmodel2, type="text")
##
## =============================================
## Dependent variable:
## ---------------------------
## activities.m
## ---------------------------------------------
## sexM 0.356*
## (0.213)
##
## freetime.m2 0.378
## (0.549)
##
## freetime.m3 0.489
## (0.512)
##
## freetime.m4 0.396
## (0.526)
##
## freetime.m5 0.992
## (0.610)
##
## Constant -0.533
## (0.487)
##
## ---------------------------------------------
## Observations 382
## Log Likelihood -260.355
## Akaike Inf. Crit. 532.710
## =============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the coefficients
exp(coefficients(logmodel2))
## (Intercept) sexM freetime.m2 freetime.m3 freetime.m4 freetime.m5
## 0.5867525 1.4272372 1.4598769 1.6299060 1.4858869 2.6961438
## odds ratios
cbind(Estimate=round(coef(logmodel2),4),
OR=round(exp(coef(logmodel2)),4))
## Estimate OR
## (Intercept) -0.5332 0.5868
## sexM 0.3557 1.4272
## freetime.m2 0.3784 1.4599
## freetime.m3 0.4885 1.6299
## freetime.m4 0.3960 1.4859
## freetime.m5 0.9918 2.6961
#s1pared 1 at least one parent with a degree, 2 at least one parent with an A Level, 3 neither parent with an A Level
#1. Probability of having extra curricular activity when male and has freetime m2
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0)
## (Intercept)
## 0.369782
#2. Probability of having extra curricular activity when female and has freetime m2
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1)
## (Intercept)
## 0.455763
#3.Probability of having extra curricular activity when male when when male and has freetime m3
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0 +coef(logmodel2)[3]*1+coef(logmodel2)[3]*0)
## (Intercept)
## 0.4613771
#4. Probability of answering yes when female when when male and has freetime m3
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1 +coef(logmodel2)[3]*1+coef(logmodel2)[3]*0)
## (Intercept)
## 0.5500668
#5.Probability of having extra curricular activity male when when male and has freetime m4
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)
## (Intercept)
## 0.4613771
#6.Probability of having extra curricular activity when female when when male and has freetime m4
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)
## (Intercept)
## 0.5500668
#7.Probability of having extra curricular activity male when when male and has freetime m5
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)+ coef(logmodel2)[4]*0
## (Intercept)
## 0.4613771
#8.Probability of having extra curricular activity male when when male and has freetime m5
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)+ coef(logmodel2)[4]*1
## (Intercept)
## 1.038589
#Confusion matrix
regclass::confusion_matrix(logmodel2)
## Predicted no Predicted yes Total
## Actual no 105 76 181
## Actual yes 89 112 201
## Total 194 188 382
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statistically significant we are ok
generalhoslem::logitgof(studentData$activities.m, fitted(logmodel2))
## Warning in generalhoslem::logitgof(studentData$activities.m, fitted(logmodel2)):
## Not possible to compute 10 rows. There might be too few observations.
##
## Hosmer and Lemeshow test (binary model)
##
## data: studentData$activities.m, fitted(logmodel2)
## X-squared = 0.38673, df = 4, p-value = 0.9835
#Collinearity
vifmodel<-car::vif(logmodel2)#You can ignore the warning messages, GVIF^(1/(2*Df)) is the value of interest
vifmodel
## GVIF Df GVIF^(1/(2*Df))
## sex 1.055417 1 1.027335
## freetime.m 1.055417 4 1.006765
#Tolerance
1/vifmodel
## GVIF Df GVIF^(1/(2*Df))
## sex 0.9474928 1.00 0.9733924
## freetime.m 0.9474928 0.25 0.9932807
comparing the models
stargazer(logmodel1, logmodel2, type="text")
##
## ==============================================
## Dependent variable:
## ----------------------------
## activities.m
## (1) (2)
## ----------------------------------------------
## sexM 0.430** 0.356*
## (0.206) (0.213)
##
## freetime.m2 0.378
## (0.549)
##
## freetime.m3 0.489
## (0.512)
##
## freetime.m4 0.396
## (0.526)
##
## freetime.m5 0.992
## (0.610)
##
## Constant -0.101 -0.533
## (0.142) (0.487)
##
## ----------------------------------------------
## Observations 382 382
## Log Likelihood -262.073 -260.355
## Akaike Inf. Crit. 528.146 532.710
## ==============================================
## Note: *p<0.1; **p<0.05; ***p<0.01