## Install and load libraries needed

needed_packages <- c("foreign", "stats", "lm.beta", "stargazer", "ggplot2", "car")                      
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]    
# Install not installed packages
if(length(not_installed)) install.packages(not_installed) 

library(car)
## Loading required package: carData
library(stats)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
library(foreign) #To work with SPSS data
library(lm.beta) #Will allow us to isolate the beta co-efficients
## Warning: package 'lm.beta' was built under R version 4.0.3
library(stargazer)#For formatting outputs/tables
## Warning: package 'stargazer' was built under R version 4.0.3
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
## Load the data####
studentData<- read.csv('Student-performance.csv')
studentData$mG3 <- ifelse((studentData$mG3==0),as.numeric((studentData$mG1+studentData$mG2)/2),studentData$mG3)

Model 1

########Build baseline linear regression model1- Outcome variable: final grades in math, Predictors: weekly study time and extra paid classes
model1=lm(studentData$mG3~studentData$studytime.m+studentData$paid.m)
stargazer::stargazer(model1, type="text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                 mG3            
## -----------------------------------------------
## studytime.m                   0.565**          
##                               (0.219)          
##                                                
## paid.myes                      0.263           
##                               (0.371)          
##                                                
## Constant                     9.739***          
##                               (0.486)          
##                                                
## -----------------------------------------------
## Observations                    382            
## R2                             0.021           
## Adjusted R2                    0.015           
## Residual Std. Error      3.569 (df = 379)      
## F Statistic            3.978** (df = 2; 379)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
## Assess how model meets key assumptions of linear regression

#Influential Outliers - Cook's distance
cooksd<-sort(cooks.distance(model1))
# plot Cook's distance
plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance")  
abline(h = 4*mean(cooksd, na.rm=T), col="red")  # add cutoff line
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red")  # add labels

#find rows related to influential observations
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])  # influential row numbers
stem(influential)
## 
##   The decimal point is 2 digit(s) to the right of the |
## 
##   0 | 256
##   1 | 023567
##   2 | 12335799
##   3 | 446
head(studentData[influential, ])  # influential observations.
##     ï..school sex age address famsize Pstatus Medu Fedu     Mjob    Fjob reason
## 271        GP   M  16       U     LE3       T    4    3  teacher   other course
## 156        GP   F  18       U     GT3       T    2    2  at_home at_home  other
## 131        GP   F  17       U     GT3       T    4    4 services teacher   home
## 230        GP   M  16       U     GT3       T    2    1    other   other course
## 341        GP   M  20       U     GT3       A    3    2 services   other course
## 285        GP   M  17       U     GT3       T    2    1    other   other   home
##     nursery internet guardian.m traveltime.m studytime.m failures.m schoolsup.m
## 271      no      yes     mother            1           1          0          no
## 156     yes      yes     mother            1           3          0          no
## 131     yes      yes     mother            2           1          1          no
## 230     yes      yes     mother            3           1          0          no
## 341     yes       no      other            1           1          0          no
## 285     yes      yes     mother            1           1          3          no
##     famsup.m paid.m activities.m higher.m romantic.m famrel.m freetime.m
## 271       no     no          yes      yes         no        5          4
## 156      yes    yes           no      yes         no        4          3
## 131      yes     no           no      yes         no        4          2
## 230       no     no           no      yes         no        4          3
## 341       no     no          yes      yes         no        5          5
## 285      yes     no           no      yes         no        5          4
##     goout.m Dalc.m Walc.m health.m absences.m mG1 mG2  mG3 guardian.p
## 271       5      1      1        3          0   6   0  3.0     mother
## 156       3      1      2        2          5  18  18 19.0     mother
## 131       4      2      3        2         24  18  18 18.0     mother
## 230       3      1      1        4          6  18  18 18.0     mother
## 341       3      1      1        5          0  17  18 18.0      other
## 285       5      1      2        5          0   5   0  2.5     mother
##     traveltime.p studytime.p failures.p schoolsup.p famsup.p paid.p
## 271            1           1          0          no       no     no
## 156            1           3          0          no      yes     no
## 131            2           1          1          no      yes     no
## 230            3           1          0          no       no     no
## 341            1           1          2          no       no     no
## 285            2           1          3         yes      yes     no
##     activities.p higher.p romantic.p famrel.p freetime.p goout.p Dalc.p Walc.p
## 271          yes      yes         no        5          4       5      1      1
## 156           no      yes         no        4          3       3      1      2
## 131           no      yes         no        4          2       4      2      3
## 230           no      yes         no        4          3       3      1      1
## 341          yes      yes         no        5          5       3      1      1
## 285          yes       no         no        4          5       1      1      1
##     health.p absences.p pG1 pG2 pG3
## 271        3          7  14  14  15
## 156        2          0  18  18  18
## 131        2         30  14  15  16
## 230        4          7  15  16  16
## 341        5          0  14  15  15
## 285        3          0   9   9  10
head(studentData[influential, ]$mG3)  # influential observations - look at the values of mG3
## [1]  3.0 19.0 18.0 18.0 18.0  2.5
head(studentData[influential, ]$studytime.m)  # influential observations - look at the values of studytime
## [1] 1 3 1 1 1 1
head(studentData[influential, ]$paid.m)  # influential observations -look at the values of paid classes
## [1] "no"  "yes" "no"  "no"  "no"  "no"
car::outlierTest(model1) # Bonferonni p-value for most extreme obs - Are there any cases where the outcome variable has an unusual variable for its predictor values?
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 210 2.462646           0.014237           NA
car::leveragePlots(model1) # leverage plots

#Assess homocedasticity 
plot(model1,1)

plot(model1, 3)

#The first plot is the chart of residuals vs fitted values, in the second plot the standardised residuals are on the Y axis. If there is absolutely no heteroscedastity, you should see a completely random, equal distribution of points throughout the range of X axis and a flat red line. We reall want to see that there is no pattern in the residuals and that they are equally spread around the y = 0 line - the dashed line.
#n our case, as you can notice the red line is slightly distorted in the middle on plot 1 but is not a big problem. Looking at the second plot we can see that while it is a problem it is not huge. Only a concern if there are definite patterns.

#Create histogram and  density plot of the residuals
plot(density(resid(model1))) 

#Create a QQ plotqqPlot(model, main="QQ Plot") #qq plot for studentized resid 
car::qqPlot(model1, main="QQ Plot") #qq plot for studentized resid

## [1] 210 215
#Calculate Collinearity
vifmodel<-car::vif(model1)
vifmodel
## studentData$studytime.m      studentData$paid.m 
##                1.026761                1.026761
#Calculate tolerance
1/vifmodel
## studentData$studytime.m      studentData$paid.m 
##               0.9739362               0.9739362

Model2

##Adding gender to investigate a differential effect##

#dummycode gender
studentData$gender=ifelse(studentData$sex == "M", 0, ifelse(studentData$sex == "F", 1, NA))

model2= lm(studentData$mG3~studentData$studytime.m+studentData$paid.m+studentData$gender)
summary(model2)
## 
## Call:
## lm(formula = studentData$mG3 ~ studentData$studytime.m + studentData$paid.m + 
##     studentData$gender)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2143 -2.4993 -0.3864  2.6148  8.6148 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               9.9293     0.4824  20.583  < 2e-16 ***
## studentData$studytime.m   0.7850     0.2247   3.493 0.000533 ***
## studentData$paid.myes     0.3742     0.3670   1.020 0.308514    
## studentData$gender       -1.3291     0.3780  -3.516 0.000492 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.517 on 378 degrees of freedom
## Multiple R-squared:  0.05158,    Adjusted R-squared:  0.04405 
## F-statistic: 6.852 on 3 and 378 DF,  p-value: 0.0001663
plot(model2)#Will output a number of plots including those needed for checking homocedasticity (the first and third)

#Influential Outliers - Cook's distance
cooksd<-sort(cooks.distance(model2))
# plot Cook's distance
plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance")  
abline(h = 4*mean(cooksd, na.rm=T), col="red")  # add cutoff line
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red")  # add labels

#find rows related to influential observations
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])  # influential row numbers
stem(influential)
## 
##   The decimal point is 2 digit(s) to the right of the |
## 
##   0 | 256
##   1 | 023567
##   2 | 01235799
##   3 | 346
head(studentData[influential, ])  # influential observations.
##     ï..school sex age address famsize Pstatus Medu Fedu    Mjob    Fjob reason
## 156        GP   F  18       U     GT3       T    2    2 at_home at_home  other
## 271        GP   M  16       U     LE3       T    4    3 teacher   other course
## 154        GP   F  18       U     GT3       T    2    1   other   other course
## 225        GP   M  16       R     GT3       T    4    4 teacher teacher course
## 285        GP   M  17       U     GT3       T    2    1   other   other   home
## 286        GP   M  17       U     GT3       T    2    1   other   other   home
##     nursery internet guardian.m traveltime.m studytime.m failures.m schoolsup.m
## 156     yes      yes     mother            1           3          0          no
## 271      no      yes     mother            1           1          0          no
## 154      no      yes      other            2           3          0          no
## 225     yes      yes     mother            1           1          0          no
## 285     yes      yes     mother            1           1          3          no
## 286     yes      yes     mother            1           1          3          no
##     famsup.m paid.m activities.m higher.m romantic.m famrel.m freetime.m
## 156      yes    yes           no      yes         no        4          3
## 271       no     no          yes      yes         no        5          4
## 154      yes    yes           no      yes        yes        4          4
## 225       no    yes          yes      yes         no        3          5
## 285      yes     no           no      yes         no        5          4
## 286      yes     no           no      yes         no        5          4
##     goout.m Dalc.m Walc.m health.m absences.m mG1 mG2  mG3 guardian.p
## 156       3      1      2        2          5  18  18 19.0     mother
## 271       5      1      1        3          0   6   0  3.0     mother
## 154       4      1      1        3          0   7   0  3.5      other
## 225       5      2      5        4          8  18  18 18.0     mother
## 285       5      1      2        5          0   5   0  2.5     mother
## 286       5      1      2        5          0   5   0  2.5     mother
##     traveltime.p studytime.p failures.p schoolsup.p famsup.p paid.p
## 156            1           3          0          no      yes     no
## 271            1           1          0          no       no     no
## 154            2           3          0          no      yes     no
## 225            1           1          0          no       no     no
## 285            2           1          3         yes      yes     no
## 286            1           1          0          no      yes     no
##     activities.p higher.p romantic.p famrel.p freetime.p goout.p Dalc.p Walc.p
## 156           no      yes         no        4          3       3      1      2
## 271          yes      yes         no        5          4       5      1      1
## 154           no      yes        yes        4          4       4      1      1
## 225          yes      yes         no        3          5       5      2      5
## 285          yes       no         no        4          5       1      1      1
## 286           no      yes         no        5          4       5      1      2
##     health.p absences.p pG1 pG2 pG3 gender
## 156        2          0  18  18  18      1
## 271        3          7  14  14  15      0
## 154        3         10  12  10  11      1
## 225        4          8  14  14  15      0
## 285        3          0   9   9  10      0
## 286        5         22   9   7   6      0
car::outlierTest(model2) # Bonferonni p-value for most extreme obs - Are there any cases where the outcome variable has an unusual variable for its predictor values?
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 131 2.483349           0.013449           NA
car::leveragePlots(model2) # leverage plots

#Create histogram and a density plot of the residuals
plot(density(resid(model2))) 

#Create a QQ plot qPlot(model, main="QQ Plot") #qq plot for standardized residuals 
car::qqPlot(model2, main="QQ Plot Model 2") #qq plot for standardized residuals 

## [1] 131 210
#Collinearity
vifmodel<-car::vif(model2)
vifmodel
## studentData$studytime.m      studentData$paid.m      studentData$gender 
##                1.112951                1.034470                1.102221
#Tolerance
1/vifmodel
## studentData$studytime.m      studentData$paid.m      studentData$gender 
##               0.8985122               0.9666787               0.9072591

comparing the models

stargazer(model1, model2, type="text")
## 
## ================================================================
##                                 Dependent variable:             
##                     --------------------------------------------
##                                         mG3                     
##                              (1)                   (2)          
## ----------------------------------------------------------------
## studytime.m                0.565**               0.785***       
##                            (0.219)               (0.225)        
##                                                                 
## paid.myes                   0.263                 0.374         
##                            (0.371)               (0.367)        
##                                                                 
## gender                                          -1.329***       
##                                                  (0.378)        
##                                                                 
## Constant                  9.739***               9.929***       
##                            (0.486)               (0.482)        
##                                                                 
## ----------------------------------------------------------------
## Observations                 382                   382          
## R2                          0.021                 0.052         
## Adjusted R2                 0.015                 0.044         
## Residual Std. Error   3.569 (df = 379)       3.517 (df = 378)   
## F Statistic         3.978** (df = 2; 379) 6.852*** (df = 3; 378)
## ================================================================
## Note:                                *p<0.1; **p<0.05; ***p<0.01