#install.packages("plm")
#install.packages("stargazer")
#install.packages("mfx")
#install.packages("pROC")

Import libraries:

library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(plm)
## Warning: package 'plm' was built under R version 4.2.3
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.2.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(foreign)
library(mfx)
## Warning: package 'mfx' was built under R version 4.2.3
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.2.3
## Loading required package: MASS
## Loading required package: betareg
## Warning: package 'betareg' was built under R version 4.2.3
library(pROC)
## Warning: package 'pROC' was built under R version 4.2.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
MSc_fail <- read_excel("C:\\Users\\admin\\Desktop\\ExcelForPractice\\MSc_fail.xls")
head(MSc_fail, 6)
## # A tibble: 6 x 13
##     Age Female  Fail WorkExperi~1 English Count~2 PGDeg~3 Agrade Below~4 Year2~5
##   <dbl>  <dbl> <dbl>        <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
## 1    22      1     0            0       0      10       0      0       0       0
## 2    24      0     1            0       1       8       0      0       0       0
## 3    28      0     0            0       1      10       0      0       0       0
## 4    23      0     0            0       1       8       0      0       0       0
## 5    23      0     0            1       1       8       0      0       0       0
## 6    22      1     0            1       1       8       0      0       0       0
## # ... with 3 more variables: Year2005 <dbl>, Year2006 <dbl>, Year2007 <dbl>,
## #   and abbreviated variable names 1: WorkExperience, 2: `Country Code`,
## #   3: PGDegree, 4: BelowBGrade, 5: Year2004
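
Before modelling, it is worth confirming the sample size and the absence of missing values (a minimal sketch; the degrees-of-freedom commentary below assumes 500 complete rows):

dim(MSc_fail)             # number of rows and columns
colSums(is.na(MSc_fail))  # missing values per column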

Run the logit model:

h <- glm(MSc_fail$Fail ~ MSc_fail$Age + MSc_fail$English +
           MSc_fail$Female + MSc_fail$WorkExperience +
           MSc_fail$Agrade + MSc_fail$BelowBGrade +
           MSc_fail$PGDegree + MSc_fail$Year2004 +
           MSc_fail$Year2005 + MSc_fail$Year2006 + MSc_fail$Year2007,
         family = binomial(link = "logit"))
summary(h)
## 
## Call:
## glm(formula = MSc_fail$Fail ~ MSc_fail$Age + MSc_fail$English + 
##     MSc_fail$Female + MSc_fail$WorkExperience + MSc_fail$Agrade + 
##     MSc_fail$BelowBGrade + MSc_fail$PGDegree + MSc_fail$Year2004 + 
##     MSc_fail$Year2005 + MSc_fail$Year2006 + MSc_fail$Year2007, 
##     family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2275  -0.5870  -0.4228  -0.2980   2.5579  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)   
## (Intercept)             -2.25637    1.07300  -2.103  0.03548 * 
## MSc_fail$Age             0.01101    0.03813   0.289  0.77275   
## MSc_fail$English        -0.16512    0.28295  -0.584  0.55952   
## MSc_fail$Female         -0.33389    0.34923  -0.956  0.33902   
## MSc_fail$WorkExperience -0.56877    0.28847  -1.972  0.04865 * 
## MSc_fail$Agrade         -1.08503    0.49110  -2.209  0.02715 * 
## MSc_fail$BelowBGrade     0.56235    0.37351   1.506  0.13217   
## MSc_fail$PGDegree        0.21208    0.41990   0.505  0.61350   
## MSc_fail$Year2004        0.65321    0.50092   1.304  0.19223   
## MSc_fail$Year2005       -0.18382    0.58794  -0.313  0.75454   
## MSc_fail$Year2006        1.24658    0.47365   2.632  0.00849 **
## MSc_fail$Year2007        0.85042    0.49705   1.711  0.08710 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 393.92  on 499  degrees of freedom
## Residual deviance: 359.43  on 488  degrees of freedom
## AIC: 383.43
## 
## Number of Fisher Scoring iterations: 5

-> The null deviance has 499 degrees of freedom, so the dataset contains 500 observations for each series, with no missing values. The significant independent variables are WorkExperience, Agrade, Year2006 and, at the 10% level, Year2007. Some insights:
- Students with work experience or an A grade have a lower probability of failing.
- Students in the 2006 and 2007 cohorts have a higher probability of failing.
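
Because logit coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are often easier to interpret. A minimal sketch using the fitted object h from above:

exp(coef(h))             # odds ratios
exp(confint.default(h))  # Wald 95% confidence intervals on the odds-ratio scale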

logLik(h)
## 'log Lik.' -179.7167 (df=12)

Plot the in-sample fitted values:

par(cex.axis = 1.5)
plot(h$fitted.values, type = "l", xlab = "", ylab = "")

-> An informal check of model adequacy is to produce the in-sample forecasts (the fitted probabilities of failure) and inspect them; the plot traces the fitted probability for each observation.
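
A slightly more informative version of the same plot overlays the actual outcomes on the fitted probabilities, so hits and misses can be seen together (a sketch reusing h and MSc_fail):

plot(h$fitted.values, type = "l", xlab = "Observation", ylab = "Fitted probability of failure")
points(MSc_fail$Fail, col = "red", pch = 20)  # actual outcomes: 0 = pass, 1 = fail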

Marginal effects:

logitmfx(Fail ~ Age + English + Female + WorkExperience + Agrade + BelowBGrade +
           PGDegree + Year2004 + Year2005 + Year2006 + Year2007, data = MSc_fail)
## Call:
## logitmfx(formula = Fail ~ Age + English + Female + WorkExperience + 
##     Agrade + BelowBGrade + PGDegree + Year2004 + Year2005 + Year2006 + 
##     Year2007, data = MSc_fail)
## 
## Marginal Effects:
##                     dF/dx  Std. Err.       z    P>|z|   
## Age             0.0010592  0.0036673  0.2888 0.772713   
## English        -0.0159936  0.0276090 -0.5793 0.562394   
## Female         -0.0305298  0.0302175 -1.0103 0.312335   
## WorkExperience -0.0568921  0.0297591 -1.9118 0.055908 . 
## Agrade         -0.0826957  0.0279924 -2.9542 0.003135 **
## BelowBGrade     0.0642508  0.0500076  1.2848 0.198855   
## PGDegree        0.0216560  0.0454387  0.4766 0.633648   
## Year2004        0.0734098  0.0642854  1.1419 0.253481   
## Year2005       -0.0169395  0.0518026 -0.3270 0.743667   
## Year2006        0.1606837  0.0757138  2.1223 0.033817 * 
## Year2007        0.1001326  0.0691120  1.4488 0.147381   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## dF/dx is for discrete change for the following variables:
## 
##  [1] "English"        "Female"         "WorkExperience" "Agrade"        
##  [5] "BelowBGrade"    "PGDegree"       "Year2004"       "Year2005"      
##  [9] "Year2006"       "Year2007"

-> The logitmfx() function from the mfx package reports the marginal effects (dF/dx) with their standard errors and z statistics; for the dummy regressors, dF/dx measures the effect of a discrete change from 0 to 1. Alternatively, after installing and loading the margins package, applying margins() to the fitted glm object generates a similar table of marginal effects and corresponding statistics.
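
As a sketch of that alternative route (assuming the margins package is installed), the model can be refit with a data argument so the formula uses bare column names, and margins() applied to the fitted glm object:

# install.packages("margins")
library(margins)
h2 <- glm(Fail ~ Age + English + Female + WorkExperience + Agrade + BelowBGrade +
            PGDegree + Year2004 + Year2005 + Year2006 + Year2007,
          family = binomial(link = "logit"), data = MSc_fail)
summary(margins(h2))  # average marginal effects with standard errors and z statistics

Note that margins() reports average marginal effects, whereas logitmfx() by default evaluates effects at the sample means and uses discrete changes for the dummies, so the two tables will not match exactly.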

Predicting using the estimated model:

p <- predict(h, newdata = MSc_fail, type = "response")
threshold <- 0.5
predicted_values <- ifelse(p > threshold, 1, 0)
actual_values <- MSc_fail$Fail
conf_matrix <- table(predicted_values, actual_values)
conf_matrix
##                 actual_values
## predicted_values   0   1
##                0 432  67
##                1   1   0

-> At the 0.5 threshold the model predicts failure for only one student, and even that prediction is wrong: all 67 students who actually failed are classified as passing. The classes are heavily imbalanced, so a 0.5 cutoff is uninformative here.
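
With only 67 failures among 500 students, one common response is to lower the cutoff, for example to the in-sample failure rate, and recompute the confusion matrix (a sketch reusing p and actual_values from above):

threshold2 <- mean(MSc_fail$Fail)        # in-sample failure rate, 67/500 = 0.134
predicted_values2 <- ifelse(p > threshold2, 1, 0)
table(predicted_values2, actual_values)  # confusion matrix at the lower cutoff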

Pseudo R2 or McFadden R2:

k <- glm(Fail ~ 1, family = binomial(link = "logit"), data = MSc_fail)
logLik(h)
## 'log Lik.' -179.7167 (df=12)
logLik(k)
## 'log Lik.' -196.9602 (df=1)
1 - logLik(h)/logLik(k)
## 'log Lik.' 0.08754833 (df=12)
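
The ratio above inherits the 'log Lik.' label from the logLik objects; coercing it with as.numeric() returns McFadden's pseudo R-squared, 1 - logL(full)/logL(null), as a plain number:

as.numeric(1 - logLik(h)/logLik(k))  # about 0.088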

ROC curve:

test_roc = roc(MSc_fail$Fail ~ p, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

as.numeric(test_roc$auc)
## [1] 0.7186412

ROC curve: The Receiver Operating Characteristic (ROC) curve is a graphical representation of the performance of a binary classifier. It illustrates the trade-off between the true positive rate (sensitivity) and the false positive rate (1 - specificity) across classification thresholds. Here the area under the curve (AUC) is about 0.72, which indicates moderate discriminatory power: the model separates failing from passing students noticeably better than a random classifier (AUC = 0.5), even though the 0.5 cutoff used above classifies poorly.
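
The curve can also suggest a data-driven classification threshold. As a sketch, pROC's coords() returns the cutoff that maximises sensitivity + specificity (Youden's index), along with the sensitivity and specificity achieved there:

coords(test_roc, x = "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))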