Packages

install.packages("tidyverse", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//Rtmp2dV7l4/downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("psych", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//Rtmp2dV7l4/downloaded_packages
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
install.packages("knitr",repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//Rtmp2dV7l4/downloaded_packages
library("knitr")
install.packages("kableExtra",repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//Rtmp2dV7l4/downloaded_packages
library("kableExtra")
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
install.packages("robustbase",repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//Rtmp2dV7l4/downloaded_packages
library(robustbase)
install.packages("car",repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//Rtmp2dV7l4/downloaded_packages
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
install.packages("huxtable", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//Rtmp2dV7l4/downloaded_packages
library("huxtable")
## 
## Attaching package: 'huxtable'
## 
## The following object is masked from 'package:kableExtra':
## 
##     add_footnote
## 
## The following object is masked from 'package:dplyr':
## 
##     add_rownames
## 
## The following object is masked from 'package:ggplot2':
## 
##     theme_grey
install.packages("modelsummary", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//Rtmp2dV7l4/downloaded_packages
library("modelsummary")
## 
## Attaching package: 'modelsummary'
## 
## The following object is masked from 'package:psych':
## 
##     SD
install.packages("stargazer",repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//Rtmp2dV7l4/downloaded_packages
library("stargazer")
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

DATA

df <- read_csv("cps_2008.csv")
## Rows: 4733 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): wage, educ, age, exper, female, black, white, married, union, nort...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
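As the message suggests, the column types can be declared explicitly or the message silenced on re-runs. A minimal sketch, not part of the original run:

# Sketch: suppress the column-specification message on subsequent reads.
df <- read_csv("cps_2008.csv", show_col_types = FALSE)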

Create a table of descriptive statistics

des1 <- describe(df)
des1
          vars    n   mean     sd median trimmed   mad  min  max range    skew kurtosis      se
wage         1 4733 10.2     6.21   8.53  9.27    4.83 1.05 78.7  77.7  2.11       9.46  0.0903
educ         2 4733 13.3     2.36  13    13.2     1.48 1    18    17   -0.0879     1.42  0.0342
age          3 4733 38.3    11.3   38    38.1    13.3  18   64    46    0.17      -0.804 0.164
exper        4 4733 19      11.4   19    18.6    13.3  0    52    52    0.269     -0.727 0.166
female       5 4733  0.485   0.5    0     0.481   0    0     1     1    0.0596    -2     0.00727
black        6 4733  0.0987  0.298  0     0       0    0     1     1    2.69       5.24  0.00434
white        7 4733  0.901   0.298  1     1       0    0     1     1   -2.69       5.24  0.00434
married      8 4733  0.604   0.489  1     0.631   0    0     1     1   -0.427     -1.82  0.00711
union        9 4733  0.163   0.37   0     0.0792  0    0     1     1    1.82       1.32  0.00537
northeast   10 4733  0.222   0.416  0     0.153   0    0     1     1    1.34      -0.216 0.00604
midwest     11 4733  0.245   0.43   0     0.181   0    0     1     1    1.19      -0.59  0.00625
south       12 4733  0.31    0.463  0     0.263   0    0     1     1    0.819     -1.33  0.00673
west        13 4733  0.223   0.416  0     0.153   0    0     1     1    1.33      -0.224 0.00605
fulltime    14 4733  0.881   0.323  1     0.977   0    0     1     1   -2.36       3.57  0.0047
metro       15 4733  0.794   0.405  1     0.867   0    0     1     1   -1.45       0.103 0.00588

To do: ask the professor which variables belong in the table (an illustrative subsetting sketch follows the table below).

tableF <- knitr::kable(des1,
                       "html",
                       caption = "CPS 2008 DATA",
                       digits = 2) %>%
  footnote(general = "N = 4733") %>%
  kable_styling(font_size = 10)
tableF
CPS 2008 DATA
vars n mean sd median trimmed mad min max range skew kurtosis se
wage 1 4733 10.19 6.21 8.53 9.27 4.83 1.05 78.71 77.66 2.11 9.46 0.09
educ 2 4733 13.30 2.36 13.00 13.25 1.48 1.00 18.00 17.00 -0.09 1.42 0.03
age 3 4733 38.33 11.30 38.00 38.07 13.34 18.00 64.00 46.00 0.17 -0.80 0.16
exper 4 4733 19.04 11.40 19.00 18.61 13.34 0.00 52.00 52.00 0.27 -0.73 0.17
female 5 4733 0.49 0.50 0.00 0.48 0.00 0.00 1.00 1.00 0.06 -2.00 0.01
black 6 4733 0.10 0.30 0.00 0.00 0.00 0.00 1.00 1.00 2.69 5.24 0.00
white 7 4733 0.90 0.30 1.00 1.00 0.00 0.00 1.00 1.00 -2.69 5.24 0.00
married 8 4733 0.60 0.49 1.00 0.63 0.00 0.00 1.00 1.00 -0.43 -1.82 0.01
union 9 4733 0.16 0.37 0.00 0.08 0.00 0.00 1.00 1.00 1.82 1.32 0.01
northeast 10 4733 0.22 0.42 0.00 0.15 0.00 0.00 1.00 1.00 1.34 -0.22 0.01
midwest 11 4733 0.24 0.43 0.00 0.18 0.00 0.00 1.00 1.00 1.19 -0.59 0.01
south 12 4733 0.31 0.46 0.00 0.26 0.00 0.00 1.00 1.00 0.82 -1.33 0.01
west 13 4733 0.22 0.42 0.00 0.15 0.00 0.00 1.00 1.00 1.33 -0.22 0.01
fulltime 14 4733 0.88 0.32 1.00 0.98 0.00 0.00 1.00 1.00 -2.36 3.57 0.00
metro 15 4733 0.79 0.40 1.00 0.87 0.00 0.00 1.00 1.00 -1.45 0.10 0.01
Note:
N = 4733
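If only a few variables end up in the final report, one option is to subset before describing. The variable names below are an illustrative pick, not the professor's answer:

# Sketch: restrict the descriptive table to a handful of key variables.
key_vars <- c("wage", "educ", "exper", "female", "union")
knitr::kable(describe(df[, key_vars]),
             "html",
             caption = "CPS 2008 DATA (selected variables)",
             digits = 2) %>%
  footnote(general = "N = 4733") %>%
  kable_styling(font_size = 10)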

Regress wage on educ with a robust MM regression (lmrob)

set.seed(77)
lmro <- lmrob(df$wage ~ df$educ)
summary(lmro)
## 
## Call:
## lmrob(formula = df$wage ~ df$educ)
##  \--> method = "MM"
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1965  -2.8792  -0.2992   3.2908  65.1735 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.95530    0.45769  -6.457 1.18e-10 ***
## df$educ      0.91621    0.03672  24.949  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Robust residual standard error: 4.04 
## Multiple R-squared:  0.1963, Adjusted R-squared:  0.1962 
## Convergence in 16 IRWLS iterations
## 
## Robustness weights: 
##  62 observations c(4629,4630,4637,4643,4648,4656,4660,4664,4671,4674,4675,4677,4678,4682,4683,4684,4685,4687,4689,4690,4691,4693,4694,4695,4696,4697,4698,4699,4700,4701,4702,4703,4704,4705,4706,4707,4708,4709,4710,4711,4712,4713,4714,4715,4716,4717,4718,4719,4720,4721,4722,4723,4724,4725,4726,4727,4728,4729,4730,4731,4732,4733)
##   are outliers with |weight| = 0 ( < 2.1e-05); 
##  318 weights are ~= 1. The remaining 4353 ones are summarized as
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000354 0.8702000 0.9440000 0.8806000 0.9831000 0.9990000 
## Algorithmic parameters: 
##        tuning.chi                bb        tuning.psi        refine.tol 
##         1.548e+00         5.000e-01         4.685e+00         1.000e-07 
##           rel.tol         scale.tol         solve.tol       eps.outlier 
##         1.000e-07         1.000e-10         1.000e-07         2.113e-05 
##             eps.x warn.limit.reject warn.limit.meanrw 
##         3.274e-11         5.000e-01         5.000e-01 
##      nResample         max.it       best.r.s       k.fast.s          k.max 
##            500             50              2              1            200 
##    maxit.scale      trace.lev            mts     compute.rd fast.s.large.n 
##            200              0           1000              0           2000 
##                   psi           subsampling                   cov 
##            "bisquare"         "nonsingular"         ".vcov.avar1" 
## compute.outlier.stats 
##                  "SM" 
## seed : int(0)
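For comparison, the same model can be fit by ordinary least squares; the gap between the OLS and MM-robust slopes shows how much the high-wage outliers matter. This is a sketch, not part of the original analysis (ols is an illustrative name):

# Sketch: plain OLS fit of wage on educ, for comparison with the robust fit.
ols <- lm(wage ~ educ, data = df)
summary(ols)

# Put the two sets of coefficients side by side.
cbind(OLS = coef(ols), MM = coef(lmro))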

Create a scatter plot with the fitted regression line

plot(x = df$educ,
     y = df$wage,
     main = "Wage on Education",
     xlab = "Years of Education",
     ylab = "Earnings per hour",
     pch = 19,
     cex = 0.5,
     sub = "N = 4733",
     frame = TRUE,
     col = "pink")

abline(lmro, col = "blue")
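Since ggplot2 is already attached, an equivalent figure can be drawn with it; note that geom_smooth(method = "lm") overlays an OLS line rather than the lmrob fit. A sketch:

# Sketch: ggplot2 version of the scatter plot with an OLS line overlaid.
ggplot(df, aes(x = educ, y = wage)) +
  geom_point(colour = "pink", size = 0.5) +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  labs(title = "Wage on Education", subtitle = "N = 4733",
       x = "Years of Education", y = "Earnings per hour")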

Create a table for the regression

stargazer(lmro,
          title = "Wage vs Education",
          dep.var.caption = "DV: Earnings per Hour",
          covariate.labels = c("Years of education"),
          notes.label = "Significance level:",
          keep.stat = c("n","rsq"),
          digits = 2,
          type = "html",
          out = "Economretics3.html")
## 
## <table style="text-align:center"><caption><strong>Wage vs Education</strong></caption>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td>DV: Earnings per Hour</td></tr>
## <tr><td></td><td colspan="1" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td>wage</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Years of education</td><td>0.92<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.04)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Constant</td><td>-2.96<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.46)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left">Observations</td><td>4,733</td></tr>
## <tr><td style="text-align:left">R<sup>2</sup></td><td>0.20</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Significance level:</td><td style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr>
## </table>
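If extra rows are wanted in a stargazer table, add.lines expects a list of character vectors, one vector per row. A sketch of that usage (the row label is illustrative):

# Sketch: add.lines takes a list of character vectors, one per extra table row.
stargazer(lmro,
          title = "Wage vs Education",
          dep.var.caption = "DV: Earnings per Hour",
          covariate.labels = c("Years of education"),
          add.lines = list(c("Estimator", "MM-robust")),
          keep.stat = c("n", "rsq"),
          digits = 2,
          type = "html")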

Trying the car package

scatterplot(df$wage ~ df$educ,
            xlab = "Years of Education",
            ylab = "Earnings per Hour",
            main = "Wage vs Years of Education",
            grid = FALSE)

# Creating the four scatterplots

# Plot 1: Wage vs Educ

plot1 <- plot(x = df$educ,
     y = df$wage,
     main = "Wage vs Education",
     xlab = "Years of Education",
     ylab = "Earnings per hour",
     pch = 19,
     cex = 0.5,
     frame = TRUE,
     sub = "N = 4733",
     col = "pink")

# Plot 2: Wage vs ln(Educ)

plot2 <- plot(x = log(df$educ),
     y = df$wage,
     main = "Wage vs ln(Education)",
     xlab = "ln(Years of Education)",
     ylab = "Earnings per hour",
     pch = 19,
     cex = 0.5,
     frame = TRUE,
     sub = " N = 4733",
     col = "pink")

# Plot 3: ln(Wage) vs Educ

plot3 <- plot(x = df$educ,
     y = log(df$wage),
     main = "Ln(Wage) vs Education",
     xlab = "Years of Education",
     ylab = "Ln(Earnings per hour)",
     pch = 19,
     cex = 0.5,
     frame = TRUE,
     sub = "N = 4733",
     col = "pink")

# Plot 4: ln(Wage) vs ln(Educ)

graph1 <- plot(x = log(df$educ),
     y = log(df$wage),
     main = "ln(Wage) vs ln(Education)",
     xlab = "ln(Years of Education)",
     ylab = "Ln(Earnings per hour)",
     pch = 19,
     cex = 0.5,
     frame = TRUE,
     sub = "N = 4733",
     col = "pink")

# Creating a single plot for the 4 graphs

We could add a fitted line to each of the four plots with abline(); note that lmro was fit on the untransformed wage ~ educ scale, so each panel needs a line fit on its own scale (see the sketch after the combined figure below).

par(mfrow=c(2,2))
plot(x = df$educ,
     y = df$wage,
     main = "Wage vs Education",
     xlab = "Years of Education",
     ylab = "Earnings per hour",
     sub = "N = 4733",
     col = "pink")
plot(x = log(df$educ),
     y = df$wage,
     main = "Wage vs ln(Education)",
     xlab = "ln(Years of Education)",
     ylab = "Earnings per hour",
     sub = " N = 4733",
     col = "pink")
plot(x = df$educ,
     y = log(df$wage),
     main = "Ln(Wage) vs Education",
     xlab = "Years of Education",
     ylab = "Ln(Earnings per hour)",
     sub = "N = 4733",
     col = "pink")
plot(x = log(df$educ),
     y = log(df$wage),
     main = "ln(Wage) vs ln(Education)",
     xlab = "ln(Years of Education)",
     ylab = "Ln(Earnings per hour)",
     sub = "N = 4733",
     col = "pink")

# Regress 5 models

Model 1: ln(wage) on ln(educ)

model1 <- lmrob(log(df$wage)~log(df$educ))

Model 2: ln(wage) on educ

model2 <- lmrob(log(df$wage)~df$educ)

Model 3: ln(wage) on educ and exper

model3 <- lmrob(log(df$wage)~ df$educ + df$exper)

Model 4: ln(wage) on educ, exper, and female

model4 <- lmrob(log(df$wage)~ df$educ + df$exper + df$female)

Model 5: ln(wage) on educ, exper, female, and female_educ

female_educ is an interaction variable (female × educ) that we need to add first

df$female_educ <- df$female * df$educ
head(df)
  wage educ age exper female black white married union northeast midwest south west fulltime metro female_educ
  1.05   12  37    19      0     0     1       0     1         0       0     0    1        1     1           0
  1.05   13  42    23      0     0     1       0     0         1       0     0    0        1     1           0
  1.23    8  54    40      0     0     1       0     0         0       0     0    1        1     1           0
  1.28   10  59    43      1     0     1       1     1         1       0     0    0        0     1          10
  1.34   18  28     4      1     0     1       0     0         0       0     0    1        1     0          18
  1.47   12  40    22      1     0     1       1     1         0       1     0    0        0     1          12
model5 <- lmrob(log(df$wage)~ df$educ + df$exper + df$female + df$female_educ)
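For reference, the same interaction can be written directly in the formula instead of building the column by hand; model5_alt is just an illustrative name:

# Sketch: equivalent model using R's interaction syntax in the formula.
model5_alt <- lmrob(log(wage) ~ educ + exper + female + female:educ, data = df)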

Create a side-by-side table using the 5 models

SBST <- huxreg(model1, model2, model3, model4, model5,
              statistics = character(0),
              number_format = 0)
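SBST is built but never displayed or saved; printing it shows the table, and huxtable's quick_html() can write it to a file. A sketch (the file name is illustrative):

# Sketch: show the huxtable and export it to HTML.
SBST
quick_html(SBST, file = "side_by_side_models.html")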

Using another package (modelsummary)

models <- list("Model 1" = model1,
               "Model 2" = model2,
               "Model 3" = model3,
               "Model 4" = model4,
               "Model 5" = model5)
modelsummary(models, 
             fmt = 2,
             stars = TRUE,
             output = "table.html")
plot(model1)
## recomputing robust Mahalanobis distances
## saving the robust distances 'MD' as part of 'model1'

plot(model2)
## recomputing robust Mahalanobis distances
## saving the robust distances 'MD' as part of 'model2'

Extending model 5 to incorporate region dummies

model51 <- lmrob(log(df$wage)~ df$educ + df$exper + df$female + df$female_educ + df$northeast + df$midwest + df$south)
stargazer(model51,
          title = "Ln(Wage) on Education",
          dep.var.caption = "DV: Earnings per hour (wage)",
          notes.label = "N = 4733 | Significance level:",
          keep.stat = c("n","rsq"),
          digits = 2,
          type = "html",
          out = "Economretics4.html")
## 
## <table style="text-align:center"><caption><strong>Ln(Wage) on Education</strong></caption>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td>DV: Earnings per hour (wage)</td></tr>
## <tr><td></td><td colspan="1" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td>wage)</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">educ</td><td>0.11<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.004)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">exper</td><td>0.01<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.001)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">female</td><td>-0.57<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.08)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">female_educ</td><td>0.02<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.01)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">northeast</td><td>0.04<sup>**</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.02)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">midwest</td><td>0.001</td></tr>
## <tr><td style="text-align:left"></td><td>(0.02)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">south</td><td>-0.06<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.02)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Constant</td><td>0.62<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.05)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left">Observations</td><td>4,733</td></tr>
## <tr><td style="text-align:left">R<sup>2</sup></td><td>0.35</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">N = 4733 | Significance level:</td><td style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr>
## </table>

F-test: do the regions jointly matter for earnings? (H0: the region coefficients are jointly zero)

Full and reduced models

full <- lm(log(df$wage)~ df$educ + df$exper + df$female + df$female_educ + df$northeast + df$midwest + df$south)
reduced <- lm(log(df$wage)~ df$educ + df$exper + df$female + df$female_educ)
anova(reduced, full)
  Res.Df RSS Df Sum of Sq    F   Pr(>F)
1   4728 977                           
2   4725 971  3      6.38 10.4 8.59e-07

With F = 10.4 and p = 8.59e-07, we reject the null that the region dummies are jointly zero, so region does matter for earnings.
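The same joint restriction can also be checked with car's linearHypothesis() on the full OLS fit; the coefficient names follow the df$ form used in the formula. A sketch:

# Sketch: equivalent joint F-test of the three region dummies via the car package.
linearHypothesis(full, c("df$northeast = 0", "df$midwest = 0", "df$south = 0"))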