## Install and load libraries needed
needed_packages <- c("foreign", "stats", "lm.beta", "stargazer", "ggplot2", "car")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(car)
## Loading required package: carData
library(stats)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
library(foreign) #To work with SPSS data
library(lm.beta) #Will allow us to isolate the beta co-efficients
## Warning: package 'lm.beta' was built under R version 4.0.3
library(stargazer)#For formatting outputs/tables
## Warning: package 'stargazer' was built under R version 4.0.3
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
## Load the data####
studentData<- read.csv('Student-performance.csv')
studentData$mG3 <- ifelse((studentData$mG3==0),as.numeric((studentData$mG1+studentData$mG2)/2),studentData$mG3)
########Build baseline linear regression model1- Outcome variable: final grades in math, Predictors: weekly study time and extra paid classes
model1=lm(studentData$mG3~studentData$studytime.m+studentData$paid.m)
stargazer::stargazer(model1, type="text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## mG3
## -----------------------------------------------
## studytime.m 0.565**
## (0.219)
##
## paid.myes 0.263
## (0.371)
##
## Constant 9.739***
## (0.486)
##
## -----------------------------------------------
## Observations 382
## R2 0.021
## Adjusted R2 0.015
## Residual Std. Error 3.569 (df = 379)
## F Statistic 3.978** (df = 2; 379)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
## Assess how model meets key assumptions of linear regression
#Influential Outliers - Cook's distance
cooksd<-sort(cooks.distance(model1))
# plot Cook's distance
plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance")
abline(h = 4*mean(cooksd, na.rm=T), col="red") # add cutoff line
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red") # add labels
#find rows related to influential observations
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))]) # influential row numbers
stem(influential)
##
## The decimal point is 2 digit(s) to the right of the |
##
## 0 | 256
## 1 | 023567
## 2 | 12335799
## 3 | 446
head(studentData[influential, ]) # influential observations.
## ï..school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## 271 GP M 16 U LE3 T 4 3 teacher other course
## 156 GP F 18 U GT3 T 2 2 at_home at_home other
## 131 GP F 17 U GT3 T 4 4 services teacher home
## 230 GP M 16 U GT3 T 2 1 other other course
## 341 GP M 20 U GT3 A 3 2 services other course
## 285 GP M 17 U GT3 T 2 1 other other home
## nursery internet guardian.m traveltime.m studytime.m failures.m schoolsup.m
## 271 no yes mother 1 1 0 no
## 156 yes yes mother 1 3 0 no
## 131 yes yes mother 2 1 1 no
## 230 yes yes mother 3 1 0 no
## 341 yes no other 1 1 0 no
## 285 yes yes mother 1 1 3 no
## famsup.m paid.m activities.m higher.m romantic.m famrel.m freetime.m
## 271 no no yes yes no 5 4
## 156 yes yes no yes no 4 3
## 131 yes no no yes no 4 2
## 230 no no no yes no 4 3
## 341 no no yes yes no 5 5
## 285 yes no no yes no 5 4
## goout.m Dalc.m Walc.m health.m absences.m mG1 mG2 mG3 guardian.p
## 271 5 1 1 3 0 6 0 3.0 mother
## 156 3 1 2 2 5 18 18 19.0 mother
## 131 4 2 3 2 24 18 18 18.0 mother
## 230 3 1 1 4 6 18 18 18.0 mother
## 341 3 1 1 5 0 17 18 18.0 other
## 285 5 1 2 5 0 5 0 2.5 mother
## traveltime.p studytime.p failures.p schoolsup.p famsup.p paid.p
## 271 1 1 0 no no no
## 156 1 3 0 no yes no
## 131 2 1 1 no yes no
## 230 3 1 0 no no no
## 341 1 1 2 no no no
## 285 2 1 3 yes yes no
## activities.p higher.p romantic.p famrel.p freetime.p goout.p Dalc.p Walc.p
## 271 yes yes no 5 4 5 1 1
## 156 no yes no 4 3 3 1 2
## 131 no yes no 4 2 4 2 3
## 230 no yes no 4 3 3 1 1
## 341 yes yes no 5 5 3 1 1
## 285 yes no no 4 5 1 1 1
## health.p absences.p pG1 pG2 pG3
## 271 3 7 14 14 15
## 156 2 0 18 18 18
## 131 2 30 14 15 16
## 230 4 7 15 16 16
## 341 5 0 14 15 15
## 285 3 0 9 9 10
head(studentData[influential, ]$mG3) # influential observations - look at the values of mG3
## [1] 3.0 19.0 18.0 18.0 18.0 2.5
head(studentData[influential, ]$studytime.m) # influential observations - look at the values of studytime
## [1] 1 3 1 1 1 1
head(studentData[influential, ]$paid.m) # influential observations -look at the values of paid classes
## [1] "no" "yes" "no" "no" "no" "no"
car::outlierTest(model1) # Bonferonni p-value for most extreme obs - Are there any cases where the outcome variable has an unusual variable for its predictor values?
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 210 2.462646 0.014237 NA
car::leveragePlots(model1) # leverage plots
#Assess homocedasticity
plot(model1,1)
plot(model1, 3)
#The first plot is the chart of residuals vs fitted values, in the second plot the standardised residuals are on the Y axis. If there is absolutely no heteroscedastity, you should see a completely random, equal distribution of points throughout the range of X axis and a flat red line. We reall want to see that there is no pattern in the residuals and that they are equally spread around the y = 0 line - the dashed line.
#n our case, as you can notice the red line is slightly distorted in the middle on plot 1 but is not a big problem. Looking at the second plot we can see that while it is a problem it is not huge. Only a concern if there are definite patterns.
#Create histogram and density plot of the residuals
plot(density(resid(model1)))
#Create a QQ plotqqPlot(model, main="QQ Plot") #qq plot for studentized resid
car::qqPlot(model1, main="QQ Plot") #qq plot for studentized resid
## [1] 210 215
#Calculate Collinearity
vifmodel<-car::vif(model1)
vifmodel
## studentData$studytime.m studentData$paid.m
## 1.026761 1.026761
#Calculate tolerance
1/vifmodel
## studentData$studytime.m studentData$paid.m
## 0.9739362 0.9739362
##Adding gender to investigate a differential effect##
#dummycode gender
studentData$gender=ifelse(studentData$sex == "M", 0, ifelse(studentData$sex == "F", 1, NA))
model2= lm(studentData$mG3~studentData$studytime.m+studentData$paid.m+studentData$gender)
summary(model2)
##
## Call:
## lm(formula = studentData$mG3 ~ studentData$studytime.m + studentData$paid.m +
## studentData$gender)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.2143 -2.4993 -0.3864 2.6148 8.6148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.9293 0.4824 20.583 < 2e-16 ***
## studentData$studytime.m 0.7850 0.2247 3.493 0.000533 ***
## studentData$paid.myes 0.3742 0.3670 1.020 0.308514
## studentData$gender -1.3291 0.3780 -3.516 0.000492 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.517 on 378 degrees of freedom
## Multiple R-squared: 0.05158, Adjusted R-squared: 0.04405
## F-statistic: 6.852 on 3 and 378 DF, p-value: 0.0001663
plot(model2)#Will output a number of plots including those needed for checking homocedasticity (the first and third)
#Influential Outliers - Cook's distance
cooksd<-sort(cooks.distance(model2))
# plot Cook's distance
plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance")
abline(h = 4*mean(cooksd, na.rm=T), col="red") # add cutoff line
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red") # add labels
#find rows related to influential observations
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))]) # influential row numbers
stem(influential)
##
## The decimal point is 2 digit(s) to the right of the |
##
## 0 | 256
## 1 | 023567
## 2 | 01235799
## 3 | 346
head(studentData[influential, ]) # influential observations.
## ï..school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## 156 GP F 18 U GT3 T 2 2 at_home at_home other
## 271 GP M 16 U LE3 T 4 3 teacher other course
## 154 GP F 18 U GT3 T 2 1 other other course
## 225 GP M 16 R GT3 T 4 4 teacher teacher course
## 285 GP M 17 U GT3 T 2 1 other other home
## 286 GP M 17 U GT3 T 2 1 other other home
## nursery internet guardian.m traveltime.m studytime.m failures.m schoolsup.m
## 156 yes yes mother 1 3 0 no
## 271 no yes mother 1 1 0 no
## 154 no yes other 2 3 0 no
## 225 yes yes mother 1 1 0 no
## 285 yes yes mother 1 1 3 no
## 286 yes yes mother 1 1 3 no
## famsup.m paid.m activities.m higher.m romantic.m famrel.m freetime.m
## 156 yes yes no yes no 4 3
## 271 no no yes yes no 5 4
## 154 yes yes no yes yes 4 4
## 225 no yes yes yes no 3 5
## 285 yes no no yes no 5 4
## 286 yes no no yes no 5 4
## goout.m Dalc.m Walc.m health.m absences.m mG1 mG2 mG3 guardian.p
## 156 3 1 2 2 5 18 18 19.0 mother
## 271 5 1 1 3 0 6 0 3.0 mother
## 154 4 1 1 3 0 7 0 3.5 other
## 225 5 2 5 4 8 18 18 18.0 mother
## 285 5 1 2 5 0 5 0 2.5 mother
## 286 5 1 2 5 0 5 0 2.5 mother
## traveltime.p studytime.p failures.p schoolsup.p famsup.p paid.p
## 156 1 3 0 no yes no
## 271 1 1 0 no no no
## 154 2 3 0 no yes no
## 225 1 1 0 no no no
## 285 2 1 3 yes yes no
## 286 1 1 0 no yes no
## activities.p higher.p romantic.p famrel.p freetime.p goout.p Dalc.p Walc.p
## 156 no yes no 4 3 3 1 2
## 271 yes yes no 5 4 5 1 1
## 154 no yes yes 4 4 4 1 1
## 225 yes yes no 3 5 5 2 5
## 285 yes no no 4 5 1 1 1
## 286 no yes no 5 4 5 1 2
## health.p absences.p pG1 pG2 pG3 gender
## 156 2 0 18 18 18 1
## 271 3 7 14 14 15 0
## 154 3 10 12 10 11 1
## 225 4 8 14 14 15 0
## 285 3 0 9 9 10 0
## 286 5 22 9 7 6 0
car::outlierTest(model2) # Bonferonni p-value for most extreme obs - Are there any cases where the outcome variable has an unusual variable for its predictor values?
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 131 2.483349 0.013449 NA
car::leveragePlots(model2) # leverage plots
#Create histogram and a density plot of the residuals
plot(density(resid(model2)))
#Create a QQ plot qPlot(model, main="QQ Plot") #qq plot for standardized residuals
car::qqPlot(model2, main="QQ Plot Model 2") #qq plot for standardized residuals
## [1] 131 210
#Collinearity
vifmodel<-car::vif(model2)
vifmodel
## studentData$studytime.m studentData$paid.m studentData$gender
## 1.112951 1.034470 1.102221
#Tolerance
1/vifmodel
## studentData$studytime.m studentData$paid.m studentData$gender
## 0.8985122 0.9666787 0.9072591
stargazer(model1, model2, type="text")
##
## ================================================================
## Dependent variable:
## --------------------------------------------
## mG3
## (1) (2)
## ----------------------------------------------------------------
## studytime.m 0.565** 0.785***
## (0.219) (0.225)
##
## paid.myes 0.263 0.374
## (0.371) (0.367)
##
## gender -1.329***
## (0.378)
##
## Constant 9.739*** 9.929***
## (0.486) (0.482)
##
## ----------------------------------------------------------------
## Observations 382 382
## R2 0.021 0.052
## Adjusted R2 0.015 0.044
## Residual Std. Error 3.569 (df = 379) 3.517 (df = 378)
## F Statistic 3.978** (df = 2; 379) 6.852*** (df = 3; 378)
## ================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01