needed_packages <- c("foreign",  "Epi", "arm", "DescTools", "stargazer", "lmtest",  "car", "generalhoslem", "regclass")                      
# Extract not installed packages
needed_packages <- c("foreign",  "Epi", "arm", "DescTools", "stargazer", "lmtest",  "car", "generalhoslem", "regclass")                      
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]    
# Install not installed packages
if(length(not_installed)) install.packages(not_installed, repos = "http://cran.us.r-project.org") 

library(Epi)#ROC Curve

## Warning: package 'Epi' was built under R version 4.0.3

library(DescTools)#Pseudo Rsquare statistics

## Warning: package 'DescTools' was built under R version 4.0.3

library(stargazer)

## Warning: package 'stargazer' was built under R version 4.0.3

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(foreign)#read SPSS file.
library(arm)#for invlogit calculating predicted probabilities

## Loading required package: MASS

## Loading required package: Matrix

## Loading required package: lme4

## 
## Attaching package: 'lme4'

## The following object is masked from 'package:Epi':
## 
##     factorize

## 
## arm (Version 1.11-2, built: 2020-7-27)

## Working directory is C:/Users/dorot/OneDrive/Documents/Assignment

library(lmtest)#Simple calculation of Chi-square for model

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(car)#Needed to test for colinearity of predictors

## Loading required package: carData

## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4

## 
## Attaching package: 'car'

## The following object is masked from 'package:arm':
## 
##     logit

## The following object is masked from 'package:DescTools':
## 
##     Recode

library(generalhoslem)#Needed to test assumption of linearity

## Warning: package 'generalhoslem' was built under R version 4.0.3

## Loading required package: reshape

## Warning: package 'reshape' was built under R version 4.0.3

## 
## Attaching package: 'reshape'

## The following object is masked from 'package:Matrix':
## 
##     expand

library("regclass")#For confusion matrix

## Warning: package 'regclass' was built under R version 4.0.3

## Loading required package: bestglm

## Warning: package 'bestglm' was built under R version 4.0.3

## Loading required package: leaps

## Loading required package: VGAM

## Warning: package 'VGAM' was built under R version 4.0.3

## Loading required package: stats4

## Loading required package: splines

## 
## Attaching package: 'VGAM'

## The following object is masked from 'package:car':
## 
##     logit

## The following object is masked from 'package:lmtest':
## 
##     lrtest

## The following object is masked from 'package:arm':
## 
##     logit

## Loading required package: rpart

## Loading required package: randomForest

## Warning: package 'randomForest' was built under R version 4.0.3

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.

## 
## Attaching package: 'regclass'

## The following object is masked from 'package:DescTools':
## 
##     VIF

### loading the dataset#####
studentData<- read.csv('Student-performance.csv')
studentData$sex=as.factor(studentData$sex)
studentData$freetime.m<-as.factor(studentData$freetime.m)
studentData$activities.m<- as.factor(studentData$activities.m)
#Check your proportions of the outcome variable for bias - are these representative?
table(studentData$activities.m)

## 
##  no yes 
## 181 201

Build first model with sex as predictor

#Make sure categorical data is used as factors

logmodel1 <- glm(activities.m ~ sex, data = studentData, na.action = na.exclude, family = binomial(link=logit))
#Full summary of the model
summary(logmodel1)

## 
## Call:
## glm(formula = activities.m ~ sex, family = binomial(link = logit), 
##     data = studentData, na.action = na.exclude)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.320  -1.135   1.041   1.041   1.221  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -0.1011     0.1423  -0.710   0.4775  
## sexM          0.4301     0.2064   2.084   0.0371 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 528.52  on 381  degrees of freedom
## Residual deviance: 524.15  on 380  degrees of freedom
## AIC: 528.15
## 
## Number of Fisher Scoring iterations: 4

#Chi-square plus significance
lmtest::lrtest(logmodel1)

## Likelihood ratio test
## 
## Model 1: activities.m ~ sex
## Model 2: activities.m ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)  
## 1   2 -262.07                       
## 2   1 -264.26 -1 4.3709    0.03656 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Chi-square and Pseudo R2 calculation - THESE ARE INCLUDED FOR INFORMATION ONLY
#lmtest:lrtest achieves the same thing
modelChi <- logmodel1$null.deviance - logmodel1$deviance
modelChi

## [1] 4.370875

pseudo.R2 <- modelChi / logmodel1$null.deviance
pseudo.R2

## [1] 0.008270078

chidf <- logmodel1$df.null - logmodel1$df.residual
chidf

## [1] 1

chisq.prob <- 1 - pchisq(modelChi, chidf)
chisq.prob

## [1] 0.03655821

#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=studentData$activities.m ~ studentData$sex, plot="ROC")

#Pseudo Rsquared 
DescTools::PseudoR2(logmodel1, which="CoxSnell")

##   CoxSnell 
## 0.01137687

DescTools::PseudoR2(logmodel1, which="Nagelkerke")

## Nagelkerke 
## 0.01518306

#Summary of the model with coefficients
stargazer(logmodel1, type="text")

## 
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                          activities.m        
## ---------------------------------------------
## sexM                        0.430**          
##                             (0.206)          
##                                              
## Constant                    -0.101           
##                             (0.142)          
##                                              
## ---------------------------------------------
## Observations                  382            
## Log Likelihood             -262.073          
## Akaike Inf. Crit.           528.146          
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01

#Exponentiate the coefficients
exp(coefficients(logmodel1))

## (Intercept)        sexM 
##   0.9038462   1.5374413

## odds ratios and 95% CI 
cbind(Estimate=round(coef(logmodel1),4),
      OR=round(exp(coef(logmodel1)),4))

##             Estimate     OR
## (Intercept)  -0.1011 0.9038
## sexM          0.4301 1.5374

#Probability of having extra curricular activities when male 
arm::invlogit(coef(logmodel1)[1]+ coef(logmodel1)[2]*0)#YES this is the same as just having the 1st co-efficient

## (Intercept) 
##   0.4747475

#Probability of having extra curricular activities when female 
arm::invlogit(coef(logmodel1)[1]+ coef(logmodel1)[2]*1)

## (Intercept) 
##   0.5815217

#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statistically significant we are ok
#Won't give a p-value here because only one predictor
generalhoslem::logitgof(studentData$activities.m, fitted(logmodel1))

## Warning in generalhoslem::logitgof(studentData$activities.m, fitted(logmodel1)):
## Not possible to compute 10 rows. There might be too few observations.

## Warning in pchisq(chisq, PARAMETER): NaNs produced

## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  studentData$activities.m, fitted(logmodel1)
## X-squared = 7.6336e-29, df = -1, p-value = NA

#We would check for collinearity here but as we only have one predictor it doesn't make sense

include free time to get interaction effect

logmodel2 <- glm(activities.m ~ sex+freetime.m, data = studentData, na.action = na.exclude, family = binomial(link=logit))
#Full summary of the model
summary(logmodel2)

## 
## Call:
## glm(formula = activities.m ~ sex + freetime.m, family = binomial(link = logit), 
##     data = studentData, na.action = na.exclude)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5369  -1.1585   0.8563   1.0934   1.4106  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -0.5332     0.4873  -1.094   0.2739  
## sexM          0.3557     0.2130   1.670   0.0948 .
## freetime.m2   0.3784     0.5494   0.689   0.4911  
## freetime.m3   0.4885     0.5122   0.954   0.3402  
## freetime.m4   0.3960     0.5256   0.753   0.4512  
## freetime.m5   0.9918     0.6097   1.627   0.1038  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 528.52  on 381  degrees of freedom
## Residual deviance: 520.71  on 376  degrees of freedom
## AIC: 532.71
## 
## Number of Fisher Scoring iterations: 4

#Chi-square plus significance
lmtest::lrtest(logmodel2)

## Likelihood ratio test
## 
## Model 1: activities.m ~ sex + freetime.m
## Model 2: activities.m ~ 1
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1   6 -260.36                     
## 2   1 -264.26 -5 7.8065     0.1672

#Pseudo Rsquared 
DescTools::PseudoR2(logmodel2, which="CoxSnell")

##   CoxSnell 
## 0.02022843

DescTools::PseudoR2(logmodel2, which="Nagelkerke")

## Nagelkerke 
## 0.02699595

#Output the sensitivity, specificity, and ROC plot
Epi::ROC(form=studentData$activities.m ~ studentData$sex+ studentData$freetime.m, plot="ROC")

#Summary of the model with coefficients
stargazer(logmodel2, type="text")

## 
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                          activities.m        
## ---------------------------------------------
## sexM                        0.356*           
##                             (0.213)          
##                                              
## freetime.m2                  0.378           
##                             (0.549)          
##                                              
## freetime.m3                  0.489           
##                             (0.512)          
##                                              
## freetime.m4                  0.396           
##                             (0.526)          
##                                              
## freetime.m5                  0.992           
##                             (0.610)          
##                                              
## Constant                    -0.533           
##                             (0.487)          
##                                              
## ---------------------------------------------
## Observations                  382            
## Log Likelihood             -260.355          
## Akaike Inf. Crit.           532.710          
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01

#Exponentiate the coefficients
exp(coefficients(logmodel2))

## (Intercept)        sexM freetime.m2 freetime.m3 freetime.m4 freetime.m5 
##   0.5867525   1.4272372   1.4598769   1.6299060   1.4858869   2.6961438

## odds ratios 
cbind(Estimate=round(coef(logmodel2),4),
      OR=round(exp(coef(logmodel2)),4))

##             Estimate     OR
## (Intercept)  -0.5332 0.5868
## sexM          0.3557 1.4272
## freetime.m2   0.3784 1.4599
## freetime.m3   0.4885 1.6299
## freetime.m4   0.3960 1.4859
## freetime.m5   0.9918 2.6961

#s1pared 1 at least one parent with a degree, 2 at least one parent with an A Level, 3 neither parent with an A Level

#1. Probability of having extra curricular activity when male and has freetime m2 
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0)

## (Intercept) 
##    0.369782

#2. Probability of having extra curricular activity when female and has freetime m2
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1)

## (Intercept) 
##    0.455763

#3.Probability of having extra curricular activity when male when when male and has freetime m3
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0 +coef(logmodel2)[3]*1+coef(logmodel2)[3]*0)

## (Intercept) 
##   0.4613771

#4. Probability of answering yes when female when when male and has freetime m3
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1 +coef(logmodel2)[3]*1+coef(logmodel2)[3]*0)

## (Intercept) 
##   0.5500668

#5.Probability of having extra curricular activity male when when male and has freetime m4
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)

## (Intercept) 
##   0.4613771

#6.Probability of having extra curricular activity when female when when male and has freetime m4
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)

## (Intercept) 
##   0.5500668

#7.Probability of having extra curricular activity male when when male and has freetime m5 
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*0 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)+ coef(logmodel2)[4]*0

## (Intercept) 
##   0.4613771

#8.Probability of having extra curricular activity male when when male and has freetime m5
arm::invlogit(coef(logmodel2)[1]+ coef(logmodel2)[2]*1 +coef(logmodel2)[3]*0+coef(logmodel2)[3]*1)+ coef(logmodel2)[4]*1

## (Intercept) 
##    1.038589

#Confusion matrix
regclass::confusion_matrix(logmodel2)

##            Predicted no Predicted yes Total
## Actual no           105            76   181
## Actual yes           89           112   201
## Total               194           188   382

#Check the assumption of linearity of independent variables and log odds using a Hosmer-Lemeshow test, if this is not statistically significant we are ok
generalhoslem::logitgof(studentData$activities.m, fitted(logmodel2))

## Warning in generalhoslem::logitgof(studentData$activities.m, fitted(logmodel2)):
## Not possible to compute 10 rows. There might be too few observations.

## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  studentData$activities.m, fitted(logmodel2)
## X-squared = 0.38673, df = 4, p-value = 0.9835

#Collinearity
vifmodel<-car::vif(logmodel2)#You can ignore the warning messages, GVIF^(1/(2*Df)) is the value of interest
vifmodel

##                GVIF Df GVIF^(1/(2*Df))
## sex        1.055417  1        1.027335
## freetime.m 1.055417  4        1.006765

#Tolerance
1/vifmodel

##                 GVIF   Df GVIF^(1/(2*Df))
## sex        0.9474928 1.00       0.9733924
## freetime.m 0.9474928 0.25       0.9932807

comparing the models

stargazer(logmodel1, logmodel2, type="text")

## 
## ==============================================
##                       Dependent variable:     
##                   ----------------------------
##                           activities.m        
##                        (1)            (2)     
## ----------------------------------------------
## sexM                 0.430**        0.356*    
##                      (0.206)        (0.213)   
##                                               
## freetime.m2                          0.378    
##                                     (0.549)   
##                                               
## freetime.m3                          0.489    
##                                     (0.512)   
##                                               
## freetime.m4                          0.396    
##                                     (0.526)   
##                                               
## freetime.m5                          0.992    
##                                     (0.610)   
##                                               
## Constant              -0.101        -0.533    
##                      (0.142)        (0.487)   
##                                               
## ----------------------------------------------
## Observations           382            382     
## Log Likelihood       -262.073      -260.355   
## Akaike Inf. Crit.    528.146        532.710   
## ==============================================
## Note:              *p<0.1; **p<0.05; ***p<0.01

Portfolio_logisticRegression

Dorothy

10/12/2020

Build first model with sex as predictor

include free time to get interaction effect

comparing the models