rm(list = ls()) # clear the workspace
setwd("C:/00 Pablo/Programacion/R/Apunte 101") # set the working directory
Loading packages and the dataset.
library("readxl") # for importing data
library("stargazer") # for formating regression tables
library("writexl") # for exporting tables
Note: for this exercise we will use the TeachingRatings dataset from Stock and Watson’s Introduction to Econometrics. See the following link for more information: https://www.princeton.edu/~mwatson/Stock-Watson_3u/Students/EE_Datasets/TeachingRatings_Description.pdf
data_class1 <- read_excel("data/TeachingRatings_DataTasks.xls", sheet = "Data")
summary(data_class1)
## minority age female onecredit
## Min. :0.0000 Min. :29.00 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:42.00 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :48.00 Median :0.0000 Median :0.00000
## Mean :0.1401 Mean :48.35 Mean :0.4224 Mean :0.05819
## 3rd Qu.:0.0000 3rd Qu.:57.00 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :73.00 Max. :1.0000 Max. :1.00000
## beauty course_eval intro nnenglish
## Min. :-1.450494 Min. : 2.100 Min. :0.0000 Min. :0.00000
## 1st Qu.:-0.656269 1st Qu.: 3.600 1st Qu.:0.0000 1st Qu.:0.00000
## Median :-0.068014 Median : 4.000 Median :0.0000 Median :0.00000
## Mean : 0.001269 Mean : 4.011 Mean :0.3384 Mean :0.06034
## 3rd Qu.: 0.550288 3rd Qu.: 4.400 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. : 1.970023 Max. :10.000 Max. :1.0000 Max. :1.00000
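As a complementary check before plotting, we can confirm the sample size and look for missing values (a minimal sketch using base R):
dim(data_class1) # number of rows and columns
colSums(is.na(data_class1)) # count of missing values per variable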
We will study the effect that different variables have on course evaluations.
We start with a visual inspection of the data, to identify outliers or anything else we should be aware of before running a regression.
plot(data_class1$age, data_class1$course_eval,
main = "Scatter Plot of Teacher's Age and Course Evaluation",
xlab = "Age",
ylab = "Course Evaluation",
col = "blue", # dots color
pch = 19) # type of dots
Since there seems to be an outlier in the Course Evaluation score, before moving to the regression we will analyze that variable on its own to better understand its dispersion.
Histogram of Course Evaluation.
hist(data_class1$course_eval,
main = "Histogram of Course Evaluation")
Box plot of Course Evaluation.
boxplot(data_class1$course_eval,
main = "Box Plot of Course Evaluation")
Considering that the course evaluation variable should only take values from 1 to 5, the observation with a course evaluation equal to 10 should be examined carefully.
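Before deciding what to do with it, we can inspect the suspicious observation directly (a quick check based on the 5-point ceiling mentioned above):
data_class1[data_class1$course_eval > 5, ] # rows above the valid range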
Regression 1. Model without outlier.
Initially we exclude observations with course evaluations higher than 5.
data_class1_clean <- data_class1[data_class1$course_eval <= 5, ]
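As a sanity check, we can confirm how many observations were dropped:
nrow(data_class1) - nrow(data_class1_clean) # number of excluded observations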
plot(data_class1_clean$beauty, data_class1_clean$course_eval,
main = "Scatter Plot of Beauty and Course Evaluation (without outlier)",
xlab = "Beauty",
ylab = "Course Evaluation",
col = "blue", # dots color
pch = 19) # type of dots
Now we run a linear regression on the data without the identified outlier, including a few covariates.
reg_1 <- lm(course_eval ~ beauty + female + minority + nnenglish + intro + onecredit,
data = data_class1_clean)
First output of the regression: printing the object shows only the call and the coefficients.
reg_1
##
## Call:
## lm(formula = course_eval ~ beauty + female + minority + nnenglish +
## intro + onecredit, data = data_class1_clean)
##
## Coefficients:
## (Intercept) beauty female minority nnenglish intro
## 4.06829 0.16561 -0.17348 -0.16662 -0.24416 0.01133
## onecredit
## 0.63453
Summary of the output.
summary(reg_1)
##
## Call:
## lm(formula = course_eval ~ beauty + female + minority + nnenglish +
## intro + onecredit, data = data_class1_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.84611 -0.33300 0.04912 0.37672 1.05872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.06829 0.03754 108.364 < 2e-16 ***
## beauty 0.16561 0.03073 5.389 1.14e-07 ***
## female -0.17348 0.04928 -3.520 0.000474 ***
## minority -0.16662 0.07628 -2.184 0.029448 *
## nnenglish -0.24416 0.10696 -2.283 0.022903 *
## intro 0.01133 0.05448 0.208 0.835413
## onecredit 0.63453 0.11134 5.699 2.17e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5135 on 456 degrees of freedom
## Multiple R-squared: 0.1546, Adjusted R-squared: 0.1435
## F-statistic: 13.9 on 6 and 456 DF, p-value: 1.529e-14
Summary of only the coefficients from the output.
reg_1_df <- summary(reg_1)$coefficients
reg_1_df <- data.frame(Covariates = rownames(reg_1_df), reg_1_df, row.names = NULL)
reg_1_df
## Covariates Estimate Std..Error t.value Pr...t..
## 1 (Intercept) 4.06828885 0.03754295 108.3635839 0.000000e+00
## 2 beauty 0.16560995 0.03072958 5.3892689 1.136656e-07
## 3 female -0.17347744 0.04927908 -3.5203060 4.743781e-04
## 4 minority -0.16661542 0.07627840 -2.1843068 2.944834e-02
## 5 nnenglish -0.24416127 0.10695785 -2.2827803 2.290280e-02
## 6 intro 0.01132502 0.05447785 0.2078831 8.354131e-01
## 7 onecredit 0.63452703 0.11133910 5.6990496 2.165218e-08
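If we also want confidence intervals for the coefficients, base R provides confint() (a minimal sketch; 95% is the default level):
confint(reg_1, level = 0.95) # confidence intervals for all coefficients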
Predicted values from the regression.
data_class1_clean$course_eval_predicted <- predict(reg_1,
                                                   newdata = data_class1_clean)
plot(data_class1_clean$course_eval, data_class1_clean$course_eval_predicted,
main = "Scatter Plot of real and predicted values for Course Evaluation",
xlab = "Real Course Evaluation",
ylab = "Predicted Course Evaluation",
col = "blue", # dots color
pch = 19)
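To make the comparison easier to read, we can overlay a 45-degree reference line right after the plot call; points on the line are predicted perfectly:
abline(a = 0, b = 1, col = "red", lwd = 2) # reference line: predicted = real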
After the linear models, we will use logit and probit models to predict a binary variable. We will analyze which variables best help to predict the probability that a course is taught by a non-native English speaker.
First, we check that the nnenglish variable is actually binary.
table(data_class1_clean$nnenglish)
##
## 0 1
## 435 28
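The variable is indeed binary, but quite unbalanced: only 28 of 463 courses (about 6%) are taught by non-native English speakers. We can see the shares directly:
prop.table(table(data_class1_clean$nnenglish)) # share of each class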
Regression 2. Logit model without outlier.
Now, we run the logit model with a few explanatory variables.
logit_model <- glm(nnenglish ~ course_eval + beauty + female + minority + intro + onecredit,
data = data_class1_clean, family = binomial(link = "logit"))
summary(logit_model)
##
## Call:
## glm(formula = nnenglish ~ course_eval + beauty + female + minority +
## intro + onecredit, family = binomial(link = "logit"), data = data_class1_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5923 1.5994 0.370 0.71114
## course_eval -0.9074 0.4107 -2.209 0.02715 *
## beauty 0.2661 0.3151 0.845 0.39833
## female -0.3109 0.4699 -0.662 0.50814
## minority 2.5753 0.4532 5.682 1.33e-08 ***
## intro -2.7802 0.9251 -3.005 0.00265 **
## onecredit 1.1685 1.3031 0.897 0.36987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 211.38 on 462 degrees of freedom
## Residual deviance: 158.63 on 456 degrees of freedom
## AIC: 172.63
##
## Number of Fisher Scoring iterations: 7
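Logit coefficients are expressed in log-odds, so they are hard to interpret directly. Exponentiating them yields odds ratios (a quick sketch; a value above 1 means the variable raises the odds of the course being taught by a non-native English speaker):
exp(coef(logit_model)) # odds ratios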
Regression 3. Probit model without outlier.
Now we do the same but with a probit model.
probit_model <- glm(nnenglish ~ course_eval + beauty + female + minority + intro + onecredit,
data = data_class1_clean, family = binomial(link = "probit"))
summary(probit_model)
##
## Call:
## glm(formula = nnenglish ~ course_eval + beauty + female + minority +
## intro + onecredit, family = binomial(link = "probit"), data = data_class1_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2116 0.7958 0.266 0.7904
## course_eval -0.4728 0.2016 -2.346 0.0190 *
## beauty 0.0812 0.1529 0.531 0.5952
## female -0.1034 0.2283 -0.453 0.6507
## minority 1.3148 0.2484 5.294 1.2e-07 ***
## intro -1.2601 0.4059 -3.105 0.0019 **
## onecredit 0.4048 0.6296 0.643 0.5202
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 211.38 on 462 degrees of freedom
## Residual deviance: 160.26 on 456 degrees of freedom
## AIC: 174.26
##
## Number of Fisher Scoring iterations: 8
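Note that logit and probit coefficients are not directly comparable, because the two link functions have different scales; a common rule of thumb is that logit coefficients are roughly 1.6 times their probit counterparts. A quick side-by-side check:
round(cbind(logit = coef(logit_model),
            probit = coef(probit_model),
            ratio = coef(logit_model) / coef(probit_model)), 3)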
Average Marginal Effects - AME.
We can compute the average marginal effects with the margins package.
# install.packages("margins")
library(margins)
## Warning: package 'margins' was built under R version 4.4.3
AME for logit model.
mfx_logit_df <- margins(logit_model)
mfx_logit_df <- summary(mfx_logit_df)
mfx_logit_df
## factor AME SE z p lower upper
## beauty 0.0122 0.0145 0.8428 0.3993 -0.0162 0.0406
## course_eval -0.0416 0.0191 -2.1787 0.0294 -0.0790 -0.0042
## female -0.0143 0.0215 -0.6616 0.5082 -0.0565 0.0280
## intro -0.1274 0.0433 -2.9427 0.0033 -0.2123 -0.0426
## minority 0.1180 0.0206 5.7406 0.0000 0.0777 0.1583
## onecredit 0.0536 0.0601 0.8914 0.3727 -0.0642 0.1713
AME for probit model.
mfx_probit_df <- margins(probit_model)
mfx_probit_df <- summary(mfx_probit_df)
mfx_probit_df
## factor AME SE z p lower upper
## beauty 0.0076 0.0143 0.5306 0.5957 -0.0204 0.0356
## course_eval -0.0442 0.0192 -2.3055 0.0211 -0.0817 -0.0066
## female -0.0097 0.0213 -0.4528 0.6507 -0.0515 0.0321
## intro -0.1177 0.0388 -3.0358 0.0024 -0.1937 -0.0417
## minority 0.1228 0.0239 5.1457 0.0000 0.0760 0.1696
## onecredit 0.0378 0.0590 0.6412 0.5214 -0.0778 0.1534
Marginal Effects at the Mean - MEM.
While the AME averages each variable's marginal effect over all observations, the MEM evaluates it at the sample means of the covariates, which is why the two sets of estimates differ.
MEM for logit model.
mfx_logit_at_mean_df <- margins(logit_model, at = list(
course_eval = mean(data_class1_clean$course_eval, na.rm = TRUE),
beauty = mean(data_class1_clean$beauty, na.rm = TRUE),
female = mean(data_class1_clean$female, na.rm = TRUE),
minority = mean(data_class1_clean$minority, na.rm = TRUE),
intro = mean(data_class1_clean$intro, na.rm = TRUE),
onecredit = mean(data_class1_clean$onecredit, na.rm = TRUE)
))
mfx_logit_at_mean_df <- summary(mfx_logit_at_mean_df)[, c("factor", "AME", "SE", "z", "p", "lower", "upper")]
mfx_logit_at_mean_df
## factor AME SE z p lower upper
## beauty 0.0064 0.0076 0.8380 0.4020 -0.0085 0.0212
## course_eval -0.0217 0.0107 -2.0189 0.0435 -0.0427 -0.0006
## female -0.0074 0.0115 -0.6451 0.5189 -0.0300 0.0151
## intro -0.0664 0.0184 -3.6051 0.0003 -0.1025 -0.0303
## minority 0.0615 0.0189 3.2569 0.0011 0.0245 0.0985
## onecredit 0.0279 0.0294 0.9482 0.3430 -0.0298 0.0856
MEM for probit model.
mfx_probit_at_mean_df <- margins(probit_model, at = list(
course_eval = mean(data_class1_clean$course_eval, na.rm = TRUE),
beauty = mean(data_class1_clean$beauty, na.rm = TRUE),
female = mean(data_class1_clean$female, na.rm = TRUE),
minority = mean(data_class1_clean$minority, na.rm = TRUE),
intro = mean(data_class1_clean$intro, na.rm = TRUE),
onecredit = mean(data_class1_clean$onecredit, na.rm = TRUE)
))
mfx_probit_at_mean_df <- summary(mfx_probit_at_mean_df)[, c("factor", "AME", "SE", "z", "p", "lower", "upper")]
mfx_probit_at_mean_df
## factor AME SE z p lower upper
## beauty 0.0049 0.0092 0.5305 0.5957 -0.0132 0.0230
## course_eval -0.0285 0.0131 -2.1759 0.0296 -0.0542 -0.0028
## female -0.0062 0.0140 -0.4449 0.6564 -0.0337 0.0212
## intro -0.0759 0.0204 -3.7243 0.0002 -0.1159 -0.0360
## minority 0.0792 0.0223 3.5454 0.0004 0.0354 0.1230
## onecredit 0.0244 0.0367 0.6650 0.5060 -0.0475 0.0963
Predicted probabilities from the logit and probit models.
data_class1_clean$logit_pred <- predict(logit_model, type = "response")
data_class1_clean$probit_pred <- predict(probit_model, type = "response")
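The two models should produce very similar probabilities; we can verify this with a correlation and a quick scatter plot:
cor(data_class1_clean$logit_pred, data_class1_clean$probit_pred)
plot(data_class1_clean$logit_pred, data_class1_clean$probit_pred,
     main = "Scatter Plot of Logit and Probit Predicted Probabilities",
     xlab = "Logit Prediction",
     ylab = "Probit Prediction",
     col = "blue", # point color
     pch = 19) # point shape (filled circle)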
Predicted binary values from the logit and probit models, classifying as 1 any predicted probability above a 0.5 threshold.
data_class1_clean$logit_class <- ifelse(data_class1_clean$logit_pred > 0.5, 1, 0)
data_class1_clean$probit_class <- ifelse(data_class1_clean$probit_pred > 0.5, 1, 0)
Confusion Matrices.
Conf. Matrix Logit Model
confmat_logit <- table(data_class1_clean$logit_class, data_class1_clean$nnenglish)
dimnames(confmat_logit) <- list(
"Predicted values" = c("0", "1"),
"Actual values" = c("0", "1"))
confmat_logit
## Actual values
## Predicted values 0 1
## 0 434 24
## 1 1 4
Conf. Matrix Probit Model
confmat_probit <- table(data_class1_clean$probit_class, data_class1_clean$nnenglish)
dimnames(confmat_probit) <- list(
"Predicted values" = c("0", "1"),
"Actual values" = c("0", "1"))
confmat_probit
## Actual values
## Predicted values 0 1
## 0 435 26
## 1 0 2
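From these matrices we can compute a simple accuracy measure (a minimal sketch). Keep in mind that with such an unbalanced outcome accuracy can be misleading: always predicting 0 would already be right about 94% of the time.
sum(diag(confmat_logit)) / sum(confmat_logit) # logit accuracy
sum(diag(confmat_probit)) / sum(confmat_probit) # probit accuracy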
Displaying regression tables.
Linear regression.
stargazer(reg_1, type = "text", out = "Reg_result.txt") # type can also be "latex" or "html"; the out argument can be omitted to skip exporting the table
##
## ===============================================
## Dependent variable:
## ---------------------------
## course_eval
## -----------------------------------------------
## beauty 0.166***
## (0.031)
##
## female -0.173***
## (0.049)
##
## minority -0.167**
## (0.076)
##
## nnenglish -0.244**
## (0.107)
##
## intro 0.011
## (0.054)
##
## onecredit 0.635***
## (0.111)
##
## Constant 4.068***
## (0.038)
##
## -----------------------------------------------
## Observations 463
## R2 0.155
## Adjusted R2 0.144
## Residual Std. Error 0.514 (df = 456)
## F Statistic 13.904*** (df = 6; 456)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Probit and Logit regressions - AME.
stargazer(mfx_probit_df, mfx_logit_df, type = "text", summary = FALSE, out = "margins_ame.txt") # one file for the AME tables of both models
##
## =======================================================
## factor AME SE z p lower upper
## -------------------------------------------------------
## 1 beauty 0.008 0.014 0.531 0.596 -0.020 0.036
## 2 course_eval -0.044 0.019 -2.305 0.021 -0.082 -0.007
## 3 female -0.010 0.021 -0.453 0.651 -0.051 0.032
## 4 intro -0.118 0.039 -3.036 0.002 -0.194 -0.042
## 5 minority 0.123 0.024 5.146 0.00000 0.076 0.170
## 6 onecredit 0.038 0.059 0.641 0.521 -0.078 0.153
## -------------------------------------------------------
##
## =====================================================
## factor AME SE z p lower upper
## -----------------------------------------------------
## 1 beauty 0.012 0.014 0.843 0.399 -0.016 0.041
## 2 course_eval -0.042 0.019 -2.179 0.029 -0.079 -0.004
## 3 female -0.014 0.022 -0.662 0.508 -0.056 0.028
## 4 intro -0.127 0.043 -2.943 0.003 -0.212 -0.043
## 5 minority 0.118 0.021 5.741 0 0.078 0.158
## 6 onecredit 0.054 0.060 0.891 0.373 -0.064 0.171
## -----------------------------------------------------
Probit and Logit regressions - MEM.
stargazer(mfx_probit_at_mean_df, mfx_logit_at_mean_df, type = "text", summary = FALSE, out = "margins_mem.txt") # a distinct file name, so the AME table is not overwritten
##
## ======================================================
## factor AME SE z p lower upper
## ------------------------------------------------------
## 1 beauty 0.005 0.009 0.531 0.596 -0.013 0.023
## 2 course_eval -0.028 0.013 -2.176 0.030 -0.054 -0.003
## 3 female -0.006 0.014 -0.445 0.656 -0.034 0.021
## 4 intro -0.076 0.020 -3.724 0.0002 -0.116 -0.036
## 5 minority 0.079 0.022 3.545 0.0004 0.035 0.123
## 6 onecredit 0.024 0.037 0.665 0.506 -0.047 0.096
## ------------------------------------------------------
##
## ======================================================
## factor AME SE z p lower upper
## ------------------------------------------------------
## 1 beauty 0.006 0.008 0.838 0.402 -0.009 0.021
## 2 course_eval -0.022 0.011 -2.019 0.044 -0.043 -0.001
## 3 female -0.007 0.012 -0.645 0.519 -0.030 0.015
## 4 intro -0.066 0.018 -3.605 0.0003 -0.102 -0.030
## 5 minority 0.061 0.019 3.257 0.001 0.024 0.099
## 6 onecredit 0.028 0.029 0.948 0.343 -0.030 0.086
## ------------------------------------------------------
Exporting regression tables.
list_tables <- list("Linear Regression" = reg_1_df,
"Probit AME" = mfx_probit_df,
"Logit AME" = mfx_logit_df,
"Probit MEM" = mfx_probit_at_mean_df,
"Logit MEM" = mfx_logit_at_mean_df)
write_xlsx(list_tables, "Regression Tables.xlsx")
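A quick check that the workbook was actually written to the working directory:
file.exists("Regression Tables.xlsx")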