Table of contents:

  1. Visual inspection
  2. Linear models
  3. Logit and probit models
  4. Exporting results
rm(list = ls()) # clear the workspace
setwd("C:/00 Pablo/Programacion/R/Apunte 101") # set the working directory (adjust the path to your own machine)

Loading the packages and the dataset.

library("readxl") # for importing data
library("stargazer") # for formating regression tables
library("writexl") # for exporting tables

Note: for this exercise we will use the TeachingRatings Data from Stock and Watson’s Introduction to Econometrics. See the following link for more information: https://www.princeton.edu/~mwatson/Stock-Watson_3u/Students/EE_Datasets/TeachingRatings_Description.pdf

data_class1 <- read_excel("data/TeachingRatings_DataTasks.xls", sheet = "Data")
summary(data_class1)
##     minority           age            female         onecredit      
##  Min.   :0.0000   Min.   :29.00   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:42.00   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :48.00   Median :0.0000   Median :0.00000  
##  Mean   :0.1401   Mean   :48.35   Mean   :0.4224   Mean   :0.05819  
##  3rd Qu.:0.0000   3rd Qu.:57.00   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :73.00   Max.   :1.0000   Max.   :1.00000  
##      beauty           course_eval         intro          nnenglish      
##  Min.   :-1.450494   Min.   : 2.100   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:-0.656269   1st Qu.: 3.600   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :-0.068014   Median : 4.000   Median :0.0000   Median :0.00000  
##  Mean   : 0.001269   Mean   : 4.011   Mean   :0.3384   Mean   :0.06034  
##  3rd Qu.: 0.550288   3rd Qu.: 4.400   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   : 1.970023   Max.   :10.000   Max.   :1.0000   Max.   :1.00000
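
Before plotting, a quick check of missing values and variable types can be useful. This is a minimal sketch, not part of the original exercise; the summary above already suggests the data are complete.

colSums(is.na(data_class1)) # number of missing values per column
str(data_class1) # variable types and dimensions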

1. Visual inspection

We will study the effect that different instructor and course characteristics have on course evaluation scores.

We start with a visual inspection of the data, to identify outliers or any other feature we should be aware of before running a regression.

plot(data_class1$age, data_class1$course_eval,
     main = "Scatter Plot of Teacher's Age and Course Evaluation",
     xlab = "Age",
     ylab = "Course Evaluation",
     col = "blue", # dots color
     pch = 19) # type of dots

Since there seems to be an outlier in the course evaluation scores, before moving to the regression we will analyze that variable on its own to better understand its dispersion.

Histogram of Course Evaluation.

hist(data_class1$course_eval,
             main = "Histogram of Course Evaluation")

Box plot of Course Evaluation.

boxplot(data_class1$course_eval,
             main = "Box Plot of Course Evaluation")

Considering that the course evaluation variable should only take values from 1 to 5, the observation with a course evaluation equal to 10 deserves closer inspection.
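
As a quick sketch (not in the original exercise), we can list the suspicious observation(s) before deciding how to handle them.

data_class1[data_class1$course_eval > 5, ] # rows with a course evaluation above the 1-5 scale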

2. Linear model

Regression 1. Model without outlier.

Initially we exclude those observations that have course evaluations higher than 5.

data_class1_clean <- data_class1[data_class1$course_eval<=5,]
plot(data_class1_clean$beauty, data_class1_clean$course_eval,
     main = "Scatter Plot of Beauty and Course Evaluation (without outlier)",
     xlab = "Beauty",
     ylab = "Course Evaluation",
     col = "blue", # dots color
     pch = 19) # type of dots

Now we run a linear regression with a few covariates on the cleaned data, i.e. without the identified outlier.

reg_1 <- lm(course_eval~beauty + female + minority + nnenglish + intro + onecredit,
            data = data_class1_clean)

First output of the regression.

reg_1
## 
## Call:
## lm(formula = course_eval ~ beauty + female + minority + nnenglish + 
##     intro + onecredit, data = data_class1_clean)
## 
## Coefficients:
## (Intercept)       beauty       female     minority    nnenglish        intro  
##     4.06829      0.16561     -0.17348     -0.16662     -0.24416      0.01133  
##   onecredit  
##     0.63453

Summary of the output.

summary(reg_1)
## 
## Call:
## lm(formula = course_eval ~ beauty + female + minority + nnenglish + 
##     intro + onecredit, data = data_class1_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.84611 -0.33300  0.04912  0.37672  1.05872 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.06829    0.03754 108.364  < 2e-16 ***
## beauty       0.16561    0.03073   5.389 1.14e-07 ***
## female      -0.17348    0.04928  -3.520 0.000474 ***
## minority    -0.16662    0.07628  -2.184 0.029448 *  
## nnenglish   -0.24416    0.10696  -2.283 0.022903 *  
## intro        0.01133    0.05448   0.208 0.835413    
## onecredit    0.63453    0.11134   5.699 2.17e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5135 on 456 degrees of freedom
## Multiple R-squared:  0.1546, Adjusted R-squared:  0.1435 
## F-statistic:  13.9 on 6 and 456 DF,  p-value: 1.529e-14
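
As a supplementary sketch (not part of the original output), confidence intervals for the estimated coefficients can be obtained directly from the fitted model.

confint(reg_1, level = 0.95) # 95% confidence intervals for the coefficients of reg_1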

Coefficient table of the output, stored as a data frame.

reg_1_df <- summary(reg_1)$coefficients
reg_1_df <- data.frame(Covariates = rownames(reg_1_df), reg_1_df, row.names = NULL)
reg_1_df
##    Covariates    Estimate Std..Error     t.value     Pr...t..
## 1 (Intercept)  4.06828885 0.03754295 108.3635839 0.000000e+00
## 2      beauty  0.16560995 0.03072958   5.3892689 1.136656e-07
## 3      female -0.17347744 0.04927908  -3.5203060 4.743781e-04
## 4    minority -0.16661542 0.07627840  -2.1843068 2.944834e-02
## 5   nnenglish -0.24416127 0.10695785  -2.2827803 2.290280e-02
## 6       intro  0.01132502 0.05447785   0.2078831 8.354131e-01
## 7   onecredit  0.63452703 0.11133910   5.6990496 2.165218e-08

Predicted values from the regression.

data_class1_clean$course_eval_predicted <- predict(reg_1, newdata = data_class1_clean)

plot(data_class1_clean$course_eval, data_class1_clean$course_eval_predicted,
     main = "Scatter Plot of real and predicted values for Course Evaluation",
     xlab = "Real Course Evaluation",
     ylab = "Predicted Course Evaluation",
     col = "blue", # dots color
     pch = 19)
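
As a supplementary sketch (not in the original notes), we can add a 45-degree reference line to the previous plot and compute the in-sample root mean squared error, assuming this code is run right after the plot call so the graphics device is still open.

abline(a = 0, b = 1, col = "red", lwd = 2) # 45-degree line: perfect predictions would lie on it

# In-sample root mean squared error of the linear model
sqrt(mean((data_class1_clean$course_eval - data_class1_clean$course_eval_predicted)^2))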

3. Models for predicting binary variables

After the linear model, we will use logit and probit models to predict a binary variable. We will analyze which variables best predict the probability that a course is taught by a non-native English speaker.

First, we check that the nnenglish variable is actually binary.

table(data_class1_clean$nnenglish)
## 
##   0   1 
## 435  28
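
The classes are quite unbalanced: only about 6% of the courses are taught by non-native English speakers. A quick sketch (not in the original) to see the shares:

prop.table(table(data_class1_clean$nnenglish)) # share of courses taught by native vs non-native speakers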

Regression 2. Logit model without outlier.

Now, we run the logit model with a few explanatory variables.

logit_model <- glm(nnenglish ~ course_eval + beauty + female + minority + intro + onecredit,
                   data = data_class1_clean, family = binomial(link = "logit"))

summary(logit_model)
## 
## Call:
## glm(formula = nnenglish ~ course_eval + beauty + female + minority + 
##     intro + onecredit, family = binomial(link = "logit"), data = data_class1_clean)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.5923     1.5994   0.370  0.71114    
## course_eval  -0.9074     0.4107  -2.209  0.02715 *  
## beauty        0.2661     0.3151   0.845  0.39833    
## female       -0.3109     0.4699  -0.662  0.50814    
## minority      2.5753     0.4532   5.682 1.33e-08 ***
## intro        -2.7802     0.9251  -3.005  0.00265 ** 
## onecredit     1.1685     1.3031   0.897  0.36987    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 211.38  on 462  degrees of freedom
## Residual deviance: 158.63  on 456  degrees of freedom
## AIC: 172.63
## 
## Number of Fisher Scoring iterations: 7
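
Since the logit coefficients are on the log-odds scale, a common complement (not part of the original exercise) is to exponentiate them and read them as odds ratios; the sketch below uses Wald-type confidence intervals.

exp(coef(logit_model)) # odds ratios implied by the logit coefficients
exp(confint.default(logit_model)) # Wald-type 95% confidence intervals on the odds-ratio scale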

Regression 3. Probit model without outlier.

Now we do the same but with a probit model.

probit_model <- glm(nnenglish ~ course_eval + beauty + female + minority + intro + onecredit,
                    data = data_class1_clean, family = binomial(link = "probit"))

summary(probit_model)
## 
## Call:
## glm(formula = nnenglish ~ course_eval + beauty + female + minority + 
##     intro + onecredit, family = binomial(link = "probit"), data = data_class1_clean)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.2116     0.7958   0.266   0.7904    
## course_eval  -0.4728     0.2016  -2.346   0.0190 *  
## beauty        0.0812     0.1529   0.531   0.5952    
## female       -0.1034     0.2283  -0.453   0.6507    
## minority      1.3148     0.2484   5.294  1.2e-07 ***
## intro        -1.2601     0.4059  -3.105   0.0019 ** 
## onecredit     0.4048     0.6296   0.643   0.5202    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 211.38  on 462  degrees of freedom
## Residual deviance: 160.26  on 456  degrees of freedom
## AIC: 174.26
## 
## Number of Fisher Scoring iterations: 8
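
As a quick sanity check (not in the original notes), the logit and probit coefficients are not directly comparable in size; a common rule of thumb is that logit coefficients are roughly 1.6 to 1.8 times the probit ones, which we can verify here.

coef(logit_model) / coef(probit_model) # ratio of logit to probit coefficients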

Average Marginal Effects - AME.

We can compute the average marginal effects (AME) with the margins package: the marginal effect is computed for each observation and then averaged across the sample.

# install.packages("margins")
library(margins)
## Warning: package 'margins' was built under R version 4.4.3

AME for logit model.

mfx_logit_df <- margins(logit_model)

mfx_logit_df <- summary(mfx_logit_df)

mfx_logit_df
##       factor     AME     SE       z      p   lower   upper
##       beauty  0.0122 0.0145  0.8428 0.3993 -0.0162  0.0406
##  course_eval -0.0416 0.0191 -2.1787 0.0294 -0.0790 -0.0042
##       female -0.0143 0.0215 -0.6616 0.5082 -0.0565  0.0280
##        intro -0.1274 0.0433 -2.9427 0.0033 -0.2123 -0.0426
##     minority  0.1180 0.0206  5.7406 0.0000  0.0777  0.1583
##    onecredit  0.0536 0.0601  0.8914 0.3727 -0.0642  0.1713

AME for probit model.

mfx_probit_df <- margins(probit_model)

mfx_probit_df <- summary(mfx_probit_df)

mfx_probit_df
##       factor     AME     SE       z      p   lower   upper
##       beauty  0.0076 0.0143  0.5306 0.5957 -0.0204  0.0356
##  course_eval -0.0442 0.0192 -2.3055 0.0211 -0.0817 -0.0066
##       female -0.0097 0.0213 -0.4528 0.6507 -0.0515  0.0321
##        intro -0.1177 0.0388 -3.0358 0.0024 -0.1937 -0.0417
##     minority  0.1228 0.0239  5.1457 0.0000  0.0760  0.1696
##    onecredit  0.0378 0.0590  0.6412 0.5214 -0.0778  0.1534

Marginal Effects at the Mean - MEM.

Unlike the AME, the MEM evaluates the marginal effects at a single point: the sample means of the covariates.

MEM for logit model.

mfx_logit_at_mean_df <- margins(logit_model, at = list(
  course_eval = mean(data_class1_clean$course_eval, na.rm = TRUE),
  beauty = mean(data_class1_clean$beauty, na.rm = TRUE),
  female = mean(data_class1_clean$female, na.rm = TRUE),
  minority = mean(data_class1_clean$minority, na.rm = TRUE),
  intro = mean(data_class1_clean$intro, na.rm = TRUE),
  onecredit = mean(data_class1_clean$onecredit, na.rm = TRUE)
))

mfx_logit_at_mean_df <- summary(mfx_logit_at_mean_df)[, c("factor", "AME", "SE", "z", "p", "lower", "upper")]

mfx_logit_at_mean_df
##       factor     AME     SE       z      p   lower   upper
##       beauty  0.0064 0.0076  0.8380 0.4020 -0.0085  0.0212
##  course_eval -0.0217 0.0107 -2.0189 0.0435 -0.0427 -0.0006
##       female -0.0074 0.0115 -0.6451 0.5189 -0.0300  0.0151
##        intro -0.0664 0.0184 -3.6051 0.0003 -0.1025 -0.0303
##     minority  0.0615 0.0189  3.2569 0.0011  0.0245  0.0985
##    onecredit  0.0279 0.0294  0.9482 0.3430 -0.0298  0.0856

MEM for probit model.

mfx_probit_at_mean_df <- margins(probit_model, at = list(
  course_eval = mean(data_class1_clean$course_eval, na.rm = TRUE),
  beauty = mean(data_class1_clean$beauty, na.rm = TRUE),
  female = mean(data_class1_clean$female, na.rm = TRUE),
  minority = mean(data_class1_clean$minority, na.rm = TRUE),
  intro = mean(data_class1_clean$intro, na.rm = TRUE),
  onecredit = mean(data_class1_clean$onecredit, na.rm = TRUE)
))

mfx_probit_at_mean_df <- summary(mfx_probit_at_mean_df)[, c("factor", "AME", "SE", "z", "p", "lower", "upper")]

mfx_probit_at_mean_df
##       factor     AME     SE       z      p   lower   upper
##       beauty  0.0049 0.0092  0.5305 0.5957 -0.0132  0.0230
##  course_eval -0.0285 0.0131 -2.1759 0.0296 -0.0542 -0.0028
##       female -0.0062 0.0140 -0.4449 0.6564 -0.0337  0.0212
##        intro -0.0759 0.0204 -3.7243 0.0002 -0.1159 -0.0360
##     minority  0.0792 0.0223  3.5454 0.0004  0.0354  0.1230
##    onecredit  0.0244 0.0367  0.6650 0.5060 -0.0475  0.0963
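
As a supplementary sketch (not in the original exercise), the MEM logic for the probit model can be cross-checked by hand: evaluate the normal density at the linear predictor computed at the covariate means and scale the slope coefficients by it. This is the continuous-variable approximation, so for the dummy regressors it is only approximate.

covars <- c("course_eval", "beauty", "female", "minority", "intro", "onecredit")
x_means <- colMeans(data_class1_clean[, covars], na.rm = TRUE) # covariate means
eta_hat <- coef(probit_model)[1] + sum(coef(probit_model)[-1] * x_means) # linear predictor at the means
dnorm(eta_hat) * coef(probit_model)[-1] # approximate marginal effects at the mean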

Predicted probabilities from the logit and probit models.

data_class1_clean$logit_pred <- predict(logit_model, type = "response")
data_class1_clean$probit_pred <- predict(probit_model, type = "response")

Predicted binary classes from the logit and probit models, using a 0.5 probability threshold.

data_class1_clean$logit_class <- ifelse(data_class1_clean$logit_pred > 0.5, 1, 0)
data_class1_clean$probit_class <- ifelse(data_class1_clean$probit_pred > 0.5, 1, 0)
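
As a quick check (not in the original notes), we can see how often the two models produce the same classification.

table(Logit = data_class1_clean$logit_class, Probit = data_class1_clean$probit_class) # agreement between the two classifications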

Confusion Matrices.

Confusion matrix for the logit model.

confmat_logit <- table(data_class1_clean$logit_class, data_class1_clean$nnenglish)

dimnames(confmat_logit) <- list(
  "Predicted values" = c("0", "1"),
  "Actual values" = c("0", "1"))

confmat_logit
##                 Actual values
## Predicted values   0   1
##                0 434  24
##                1   1   4

Confusion matrix for the probit model.

confmat_probit <- table(data_class1_clean$probit_class, data_class1_clean$nnenglish)

dimnames(confmat_probit) <- list(
  "Predicted values" = c("0", "1"),
  "Actual values" = c("0", "1"))

confmat_probit
##                 Actual values
## Predicted values   0   1
##                0 435  26
##                1   0   2
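
From the two confusion matrices we can compute, as a supplementary sketch (not in the original notes), the overall accuracy and the sensitivity (share of non-native speakers correctly detected) of each model.

accuracy_logit <- sum(diag(confmat_logit)) / sum(confmat_logit)
accuracy_probit <- sum(diag(confmat_probit)) / sum(confmat_probit)
sensitivity_logit <- confmat_logit["1", "1"] / sum(confmat_logit[, "1"])
sensitivity_probit <- confmat_probit["1", "1"] / sum(confmat_probit[, "1"])
c(accuracy_logit = accuracy_logit, accuracy_probit = accuracy_probit,
  sensitivity_logit = sensitivity_logit, sensitivity_probit = sensitivity_probit)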

4. Exporting results

Displaying regression tables.

Linear regression.

stargazer(reg_1, type="text", out = "Reg_result.txt") # type can also be "latex" or "html"; the out argument can be omitted if you do not want to export the table
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                             course_eval        
## -----------------------------------------------
## beauty                       0.166***          
##                               (0.031)          
##                                                
## female                       -0.173***         
##                               (0.049)          
##                                                
## minority                     -0.167**          
##                               (0.076)          
##                                                
## nnenglish                    -0.244**          
##                               (0.107)          
##                                                
## intro                          0.011           
##                               (0.054)          
##                                                
## onecredit                    0.635***          
##                               (0.111)          
##                                                
## Constant                     4.068***          
##                               (0.038)          
##                                                
## -----------------------------------------------
## Observations                    463            
## R2                             0.155           
## Adjusted R2                    0.144           
## Residual Std. Error      0.514 (df = 456)      
## F Statistic           13.904*** (df = 6; 456)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Probit and Logit regressions - AME.

stargazer(mfx_probit_df, mfx_logit_df, type="text", summary=FALSE, out="margins_probit.txt")
## 
## =======================================================
##     factor     AME    SE     z       p    lower  upper 
## -------------------------------------------------------
## 1   beauty    0.008  0.014 0.531   0.596  -0.020 0.036 
## 2 course_eval -0.044 0.019 -2.305  0.021  -0.082 -0.007
## 3   female    -0.010 0.021 -0.453  0.651  -0.051 0.032 
## 4    intro    -0.118 0.039 -3.036  0.002  -0.194 -0.042
## 5  minority   0.123  0.024 5.146  0.00000 0.076  0.170 
## 6  onecredit  0.038  0.059 0.641   0.521  -0.078 0.153 
## -------------------------------------------------------
## 
## =====================================================
##     factor     AME    SE     z      p   lower  upper 
## -----------------------------------------------------
## 1   beauty    0.012  0.014 0.843  0.399 -0.016 0.041 
## 2 course_eval -0.042 0.019 -2.179 0.029 -0.079 -0.004
## 3   female    -0.014 0.022 -0.662 0.508 -0.056 0.028 
## 4    intro    -0.127 0.043 -2.943 0.003 -0.212 -0.043
## 5  minority   0.118  0.021 5.741    0   0.078  0.158 
## 6  onecredit  0.054  0.060 0.891  0.373 -0.064 0.171 
## -----------------------------------------------------

Probit and Logit regressions - MEM.

stargazer(mfx_probit_at_mean_df, mfx_logit_at_mean_df, type="text", summary=FALSE, out="margins_probit_at_mean.txt") # use a different file name so the AME table exported above is not overwritten
## 
## ======================================================
##     factor     AME    SE     z      p    lower  upper 
## ------------------------------------------------------
## 1   beauty    0.005  0.009 0.531  0.596  -0.013 0.023 
## 2 course_eval -0.028 0.013 -2.176 0.030  -0.054 -0.003
## 3   female    -0.006 0.014 -0.445 0.656  -0.034 0.021 
## 4    intro    -0.076 0.020 -3.724 0.0002 -0.116 -0.036
## 5  minority   0.079  0.022 3.545  0.0004 0.035  0.123 
## 6  onecredit  0.024  0.037 0.665  0.506  -0.047 0.096 
## ------------------------------------------------------
## 
## ======================================================
##     factor     AME    SE     z      p    lower  upper 
## ------------------------------------------------------
## 1   beauty    0.006  0.008 0.838  0.402  -0.009 0.021 
## 2 course_eval -0.022 0.011 -2.019 0.044  -0.043 -0.001
## 3   female    -0.007 0.012 -0.645 0.519  -0.030 0.015 
## 4    intro    -0.066 0.018 -3.605 0.0003 -0.102 -0.030
## 5  minority   0.061  0.019 3.257  0.001  0.024  0.099 
## 6  onecredit  0.028  0.029 0.948  0.343  -0.030 0.086 
## ------------------------------------------------------

Exporting regression tables.

list_tables <- list("Linear Regression" = reg_1_df,
                    "Probit AME" = mfx_probit_df,
                    "Logit AME" = mfx_logit_df,
                    "Probit MEM" = mfx_probit_at_mean_df,
                    "Logit MEM" = mfx_logit_at_mean_df)

write_xlsx(list_tables, "Regression Tables.xlsx")
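
Optionally (not part of the original exercise), the confusion matrices can be added to the same workbook; writexl only accepts data frames, so they must be converted first. The helper confmat_to_df below is a hypothetical name introduced for this sketch.

confmat_to_df <- function(m) data.frame(Predicted = rownames(m), as.data.frame.matrix(m), check.names = FALSE) # keep the predicted labels as a column

list_tables$`Confusion Matrix Logit` <- confmat_to_df(confmat_logit)
list_tables$`Confusion Matrix Probit` <- confmat_to_df(confmat_probit)

write_xlsx(list_tables, "Regression Tables.xlsx")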