#install.packages("plm")
#install.packages("stargazer")
#install.packages("mfx")
#install.packages("pROC")
Import libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(plm)
## Warning: package 'plm' was built under R version 4.2.3
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.2.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(foreign)
library(mfx)
## Warning: package 'mfx' was built under R version 4.2.3
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.2.3
## Loading required package: MASS
## Loading required package: betareg
## Warning: package 'betareg' was built under R version 4.2.3
library(pROC)
## Warning: package 'pROC' was built under R version 4.2.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
MSc_fail <- read_excel("C:\\Users\\admin\\Desktop\\ExcelForPractice\\MSc_fail.xls")
head(MSc_fail, 6)
## # A tibble: 6 x 13
## Age Female Fail WorkExperi~1 English Count~2 PGDeg~3 Agrade Below~4 Year2~5
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 22 1 0 0 0 10 0 0 0 0
## 2 24 0 1 0 1 8 0 0 0 0
## 3 28 0 0 0 1 10 0 0 0 0
## 4 23 0 0 0 1 8 0 0 0 0
## 5 23 0 0 1 1 8 0 0 0 0
## 6 22 1 0 1 1 8 0 0 0 0
## # ... with 3 more variables: Year2005 <dbl>, Year2006 <dbl>, Year2007 <dbl>,
## # and abbreviated variable names 1: WorkExperience, 2: `Country Code`,
## # 3: PGDegree, 4: BelowBGrade, 5: Year2004
Run Logit model:
h = glm(MSc_fail$Fail ~ MSc_fail$Age + MSc_fail$English +
MSc_fail$Female + MSc_fail$`WorkExperience` +
MSc_fail$Agrade + MSc_fail$BelowBGrade +
MSc_fail$'PGDegree' + MSc_fail$Year2004 +
MSc_fail$Year2005 + MSc_fail$Year2006 + MSc_fail$Year2007,
family = binomial(link = 'logit'))
summary(h)
##
## Call:
## glm(formula = MSc_fail$Fail ~ MSc_fail$Age + MSc_fail$English +
## MSc_fail$Female + MSc_fail$WorkExperience + MSc_fail$Agrade +
## MSc_fail$BelowBGrade + MSc_fail$PGDegree + MSc_fail$Year2004 +
## MSc_fail$Year2005 + MSc_fail$Year2006 + MSc_fail$Year2007,
## family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2275 -0.5870 -0.4228 -0.2980 2.5579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.25637 1.07300 -2.103 0.03548 *
## MSc_fail$Age 0.01101 0.03813 0.289 0.77275
## MSc_fail$English -0.16512 0.28295 -0.584 0.55952
## MSc_fail$Female -0.33389 0.34923 -0.956 0.33902
## MSc_fail$WorkExperience -0.56877 0.28847 -1.972 0.04865 *
## MSc_fail$Agrade -1.08503 0.49110 -2.209 0.02715 *
## MSc_fail$BelowBGrade 0.56235 0.37351 1.506 0.13217
## MSc_fail$PGDegree 0.21208 0.41990 0.505 0.61350
## MSc_fail$Year2004 0.65321 0.50092 1.304 0.19223
## MSc_fail$Year2005 -0.18382 0.58794 -0.313 0.75454
## MSc_fail$Year2006 1.24658 0.47365 2.632 0.00849 **
## MSc_fail$Year2007 0.85042 0.49705 1.711 0.08710 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.92 on 499 degrees of freedom
## Residual deviance: 359.43 on 488 degrees of freedom
## AIC: 383.43
##
## Number of Fisher Scoring iterations: 5
-> observations in the dataset for each series with no missing observations. The significant independent variables are Work Experience, Agrade, Year2006 and Year2007. Some insights: - Students having work experience and A grades have lower chance of failure. - Students in 2006 and 2007 have higher chance of failure.
logLik(h)
## 'log Lik.' -179.7167 (df=12)
Plot:
par(cex.axis =1.5)
plot(h$fitted.values , type ="l", xlab ="",ylab ="")
-> A test on model adequacy is to produce a set of in-sample
forecasts
Marginal Effect:
logitmfx(Fail ~ Age+ English+ Female+ `WorkExperience` + Agrade+ BelowBGrade+ `PGDegree`+ Year2004+ Year2005+ Year2006+ Year2007, data = MSc_fail)
## Call:
## logitmfx(formula = Fail ~ Age + English + Female + WorkExperience +
## Agrade + BelowBGrade + PGDegree + Year2004 + Year2005 + Year2006 +
## Year2007, data = MSc_fail)
##
## Marginal Effects:
## dF/dx Std. Err. z P>|z|
## Age 0.0010592 0.0036673 0.2888 0.772713
## English -0.0159936 0.0276090 -0.5793 0.562394
## Female -0.0305298 0.0302175 -1.0103 0.312335
## WorkExperience -0.0568921 0.0297591 -1.9118 0.055908 .
## Agrade -0.0826957 0.0279924 -2.9542 0.003135 **
## BelowBGrade 0.0642508 0.0500076 1.2848 0.198855
## PGDegree 0.0216560 0.0454387 0.4766 0.633648
## Year2004 0.0734098 0.0642854 1.1419 0.253481
## Year2005 -0.0169395 0.0518026 -0.3270 0.743667
## Year2006 0.1606837 0.0757138 2.1223 0.033817 *
## Year2007 0.1001326 0.0691120 1.4488 0.147381
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## dF/dx is for discrete change for the following variables:
##
## [1] "English" "Female" "WorkExperience" "Agrade"
## [5] "BelowBGrade" "PGDegree" "Year2004" "Year2005"
## [9] "Year2006" "Year2007"
-> Installing and including it, we simply apply the function margins on the two model objects and R should generate the table of marginal effects and corresponding statistics
Predicting using estimate model:
p<-predict(h, newdata=MSc_fail, type="response")
threshold=0.5
predicted_values<-ifelse(p>threshold,1,0)
actual_values<-MSc_fail$Fail
conf_matrix<-table(predicted_values,actual_values)
conf_matrix
## actual_values
## predicted_values 0 1
## 0 432 67
## 1 1 0
-> All predictions are false for failed students, which is not balanced at all.
Pseudo R2 or McFadden R2:
k<-glm(Fail~1,family=binomial(link="logit"), data = MSc_fail)
logLik(h)
## 'log Lik.' -179.7167 (df=12)
logLik(k)
## 'log Lik.' -196.9602 (df=1)
1-logLik(h)/logLik(k)
## 'log Lik.' 0.08754833 (df=12)
ROC curve:
test_roc = roc(MSc_fail$Fail ~ p, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
as.numeric(test_roc$auc)
## [1] 0.7186412
ROC curve: The Receiver Operating Characteristic (ROC) curve is a graphical representation of the performance of a binary classifier. It illustrates the trade-off between the true positive rate (sensitivity) and the false positive rate (1 - specificity) at various classification thresholds