needed_packages <- c("foreign",  "Epi", "arm", "DescTools", "stargazer", "lmtest",  "car", "generalhoslem", "regclass")                      
# Extract not installed packages
needed_packages <- c("foreign",  "Epi", "arm", "DescTools", "stargazer", "lmtest",  "car", "generalhoslem", "regclass")                      
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]    
# Install not installed packages
if(length(not_installed)) install.packages(not_installed, repos = "http://cran.us.r-project.org") 

library(Epi)#ROC Curve
## Warning: package 'Epi' was built under R version 4.0.3
library(DescTools)#Pseudo Rsquare statistics
## Warning: package 'DescTools' was built under R version 4.0.3
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(foreign)#read SPSS file.
library(arm)#for invlogit calculating predicted probabilities
## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lme4
## 
## Attaching package: 'lme4'
## The following object is masked from 'package:Epi':
## 
##     factorize
## 
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is C:/Users/dorot/OneDrive/Documents/Assignment
library(lmtest)#Simple calculation of Chi-square for model
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)#Needed to test for colinearity of predictors
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
## 
## Attaching package: 'car'
## The following object is masked from 'package:arm':
## 
##     logit
## The following object is masked from 'package:DescTools':
## 
##     Recode
library(generalhoslem)#Needed to test assumption of linearity
## Warning: package 'generalhoslem' was built under R version 4.0.3
## Loading required package: reshape
## Warning: package 'reshape' was built under R version 4.0.3
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:Matrix':
## 
##     expand
library("regclass")#For confusion matrix
## Warning: package 'regclass' was built under R version 4.0.3
## Loading required package: bestglm
## Warning: package 'bestglm' was built under R version 4.0.3
## Loading required package: leaps
## Loading required package: VGAM
## Warning: package 'VGAM' was built under R version 4.0.3
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:lmtest':
## 
##     lrtest
## The following object is masked from 'package:arm':
## 
##     logit
## Loading required package: rpart
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.0.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.
## 
## Attaching package: 'regclass'
## The following object is masked from 'package:DescTools':
## 
##     VIF
### loading the dataset#####
studentData<- read.csv('Student-performance.csv')
studentData$sex=as.factor(studentData$sex)
studentData$freetime.m<-as.factor(studentData$freetime.m)
studentData$activities.m<- as.factor(studentData$activities.m)
#Check your proportions of the outcome variable for bias - are these representative?
table(studentData$activities.m)
## 
##  no yes 
## 181 201

Build first model with sex as predictor

#Make sure categorical data is used as factors

logmodel1 <- glm(activities.m ~ sex, data = studentData, na.action = na.exclude, family = binomial(link=logit))
#Full summary of the model
summary(logmodel1)
## 
## Call:
## glm(formula = activities.m ~ sex, family = binomial(link = logit), 
##     data = studentData, na.action = na.exclude)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.320  -1.135   1.041   1.041   1.221  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -0.1011     0.1423  -0.710   0.4775  
## sexM          0.4301     0.2064   2.084   0.0371 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 528.52  on 381  degrees of freedom
## Residual deviance: 524.15  on 380  degrees of freedom
## AIC: 528.15
## 
## Number of Fisher Scoring iterations: 4
#Chi-square plus significance
lmtest::lrtest(logmodel1)
## Likelihood ratio test
## 
## Model 1: activities.m ~ sex
## Model 2: activities.m ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)  
## 1   2 -262.07                       
## 2   1 -264.26 -1 4.3709    0.03656 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Chi-square and Pseudo R2 calculation - THESE ARE INCLUDED FOR INFORMATION ONLY
#lmtest:lrtest achieves the same thing
modelChi <- logmodel1$null.deviance - logmodel1$deviance
modelChi
## [1] 4.370875
pseudo.R2 <- modelChi / logmodel1$null.deviance
pseudo.R2
## [1] 0.008270078
chidf <- logmodel1$df.null - logmodel1$df.residual
chidf
## [1] 1
chisq.prob <- 1 - pchisq(modelChi, chidf)
chisq.prob
## [1] 0.03655821
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=studentData$activities.m ~ studentData$sex, plot="ROC")

#Pseudo Rsquared 
DescTools::PseudoR2(logmodel1, which="CoxSnell")
##   CoxSnell 
## 0.01137687
DescTools::PseudoR2(logmodel1, which="Nagelkerke")
## Nagelkerke 
## 0.01518306
#Summary of the model with coefficients
stargazer(logmodel1, type="text")
## 
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                          activities.m        
## ---------------------------------------------
## sexM                        0.430**          
##                             (0.206)          
##                                              
## Constant                    -0.101           
##                             (0.142)          
##                                              
## ---------------------------------------------
## Observations                  382            
## Log Likelihood             -262.073          
## Akaike Inf. Crit.           528.146          
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the coefficients
exp(coefficients(logmodel1))
## (Intercept)        sexM 
##   0.9038462   1.5374413
## odds ratios and 95% CI 
cbind(Estimate=round(coef(logmodel1),4),
      OR=round(exp(coef(logmodel1)),4))
##             Estimate     OR
## (Intercept)  -0.1011 0.9038
## sexM          0.4301 1.5374
#Probability of having extra curricular activities when male 
arm::invlogit(coef(logmodel1)[1]+ coef(logmodel1)[2]*0)#YES this is the same as just having the 1st co-efficient
## (Intercept) 
##   0.4747475
#Probability of having extra curricular activities when female 
arm::invlogit(coef(logmodel1)[1]+ coef(logmodel1)[2]*1)
## (Intercept) 
##   0.5815217
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statistically significant we are ok
#Won't give a p-value here because only one predictor
generalhoslem::logitgof(studentData$activities.m, fitted(logmodel1))
## Warning in generalhoslem::logitgof(studentData$activities.m, fitted(logmodel1)):
## Not possible to compute 10 rows. There might be too few observations.
## Warning in pchisq(chisq, PARAMETER): NaNs produced
## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  studentData$activities.m, fitted(logmodel1)
## X-squared = 7.6336e-29, df = -1, p-value = NA
#We would check for collinearity here but as we only have one predictor it doesn't make sense 

include free time to get interaction effect

logmodel2 <- glm(activities.m ~ sex+freetime.m, data = studentData, na.action = na.exclude, family = binomial(link=logit))
#Full summary of the model
summary(logmodel2)
## 
## Call:
## glm(formula = activities.m ~ sex + freetime.m, family = binomial(link = logit), 
##     data = studentData, na.action = na.exclude)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5369  -1.1585   0.8563   1.0934   1.4106  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -0.5332     0.4873  -1.094   0.2739  
## sexM          0.3557     0.2130   1.670   0.0948 .
## freetime.m2   0.3784     0.5494   0.689   0.4911  
## freetime.m3   0.4885     0.5122   0.954   0.3402  
## freetime.m4   0.3960     0.5256   0.753   0.4512  
## freetime.m5   0.9918     0.6097   1.627   0.1038  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 528.52  on 381  degrees of freedom
## Residual deviance: 520.71  on 376  degrees of freedom
## AIC: 532.71
## 
## Number of Fisher Scoring iterations: 4
#Chi-square plus significance
lmtest::lrtest(logmodel2)
## Likelihood ratio test
## 
## Model 1: activities.m ~ sex + freetime.m
## Model 2: activities.m ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   6 -260.36                     
## 2   1 -264.26 -5 7.8065     0.1672
#Pseudo Rsquared 
DescTools::PseudoR2(logmodel2, which="CoxSnell")
##   CoxSnell 
## 0.02022843
DescTools::PseudoR2(logmodel2, which="Nagelkerke")
## Nagelkerke 
## 0.02699595
#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=studentData$activities.m ~ studentData$sex+ studentData$freetime.m, plot="ROC")

#Summary of the model with coefficients
stargazer(logmodel2, type="text")
## 
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                          activities.m        
## ---------------------------------------------
## sexM                        0.356*           
##                             (0.213)          
##                                              
## freetime.m2                  0.378           
##                             (0.549)          
##                                              
## freetime.m3                  0.489           
##                             (0.512)          
##                                              
## freetime.m4                  0.396           
##                             (0.526)          
##                                              
## freetime.m5                  0.992           
##                             (0.610)          
##                                              
## Constant                    -0.533           
##                             (0.487)          
##                                              
## ---------------------------------------------
## Observations                  382            
## Log Likelihood             -260.355          
## Akaike Inf. Crit.           532.710          
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01
#Exponentiate the coefficients
exp(coefficients(logmodel2))
## (Intercept)        sexM freetime.m2 freetime.m3 freetime.m4 freetime.m5 
##   0.5867525   1.4272372   1.4598769   1.6299060   1.4858869   2.6961438
## odds ratios 
cbind(Estimate=round(coef(logmodel2),4),
      OR=round(exp(coef(logmodel2)),4))
##             Estimate     OR
## (Intercept)  -0.5332 0.5868
## sexM          0.3557 1.4272
## freetime.m2   0.3784 1.4599
## freetime.m3   0.4885 1.6299
## freetime.m4   0.3960 1.4859
## freetime.m5   0.9918 2.6961
#s1pared 1 at least one parent with a degree, 2 at least one parent with an A Level, 3 neither parent with an A Level

#1. Probability of having extra curricular activity when male and has freetime m2 
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0)
## (Intercept) 
##    0.369782
#2. Probability of having extra curricular activity when female and has freetime m2
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1)
## (Intercept) 
##    0.455763
#3.Probability of having extra curricular activity when male when when male and has freetime m3
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0 +coef(logmodel2)[3]*1+coef(logmodel2)[3]*0)
## (Intercept) 
##   0.4613771
#4. Probability of answering yes when female when when male and has freetime m3
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1 +coef(logmodel2)[3]*1+coef(logmodel2)[3]*0)
## (Intercept) 
##   0.5500668
#5.Probability of having extra curricular activity male when when male and has freetime m4
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)
## (Intercept) 
##   0.4613771
#6.Probability of having extra curricular activity when female when when male and has freetime m4
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)
## (Intercept) 
##   0.5500668
#7.Probability of having extra curricular activity male when when male and has freetime m5 
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)+ coef(logmodel2)[4]*0
## (Intercept) 
##   0.4613771
#8.Probability of having extra curricular activity male when when male and has freetime m5
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)+ coef(logmodel2)[4]*1
## (Intercept) 
##    1.038589
#Confusion matrix
regclass::confusion_matrix(logmodel2)
##            Predicted no Predicted yes Total
## Actual no           105            76   181
## Actual yes           89           112   201
## Total               194           188   382
#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statistically significant we are ok
generalhoslem::logitgof(studentData$activities.m, fitted(logmodel2))
## Warning in generalhoslem::logitgof(studentData$activities.m, fitted(logmodel2)):
## Not possible to compute 10 rows. There might be too few observations.
## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  studentData$activities.m, fitted(logmodel2)
## X-squared = 0.38673, df = 4, p-value = 0.9835
#Collinearity
vifmodel<-car::vif(logmodel2)#You can ignore the warning messages, GVIF^(1/(2*Df)) is the value of interest
vifmodel
##                GVIF Df GVIF^(1/(2*Df))
## sex        1.055417  1        1.027335
## freetime.m 1.055417  4        1.006765
#Tolerance
1/vifmodel
##                 GVIF   Df GVIF^(1/(2*Df))
## sex        0.9474928 1.00       0.9733924
## freetime.m 0.9474928 0.25       0.9932807

comparing the models

stargazer(logmodel1, logmodel2, type="text")
## 
## ==============================================
##                       Dependent variable:     
##                   ----------------------------
##                           activities.m        
##                        (1)            (2)     
## ----------------------------------------------
## sexM                 0.430**        0.356*    
##                      (0.206)        (0.213)   
##                                               
## freetime.m2                          0.378    
##                                     (0.549)   
##                                               
## freetime.m3                          0.489    
##                                     (0.512)   
##                                               
## freetime.m4                          0.396    
##                                     (0.526)   
##                                               
## freetime.m5                          0.992    
##                                     (0.610)   
##                                               
## Constant              -0.101        -0.533    
##                      (0.142)        (0.487)   
##                                               
## ----------------------------------------------
## Observations           382            382     
## Log Likelihood       -262.073      -260.355   
## Akaike Inf. Crit.    528.146        532.710   
## ==============================================
## Note:              *p<0.1; **p<0.05; ***p<0.01