##Loading packages:

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(broom)
library(skimr)
library(haven)

Input nlsw88.dta

nlsw <- read_dta("nlsw88.dta")

view(nlsw)
glimpse(nlsw)
## Rows: 2,246
## Columns: 17
## $ idcode        <dbl> 1, 2, 3, 4, 6, 7, 9, 12, 13, 14, 15, 16, 18, 19, 20, 22,…
## $ age           <dbl> 37, 37, 42, 43, 42, 39, 37, 40, 40, 40, 39, 40, 40, 40, …
## $ race          <dbl+lbl> 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ married       <dbl+lbl> 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1…
## $ never_married <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ grade         <dbl> 12, 12, 12, 17, 12, 12, 12, 18, 14, 15, 16, 15, 15, 15, …
## $ collgrad      <dbl+lbl> 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1…
## $ south         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ smsa          <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1…
## $ c_city        <dbl> 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
## $ industry      <dbl+lbl>  5,  4,  4, 11,  4, 11,  5, 11, 11, 11, 11, 11,  6, …
## $ occupation    <dbl+lbl>  6,  5,  3, 13,  6,  3,  2,  2,  3,  1,  1,  1,  5, …
## $ union         <dbl+lbl>  1,  1, NA,  1,  0,  0,  1,  0,  0,  0,  1,  0,  0, …
## $ wage          <dbl> 11.739125, 6.400963, 5.016723, 9.033813, 8.083731, 4.629…
## $ hours         <dbl> 48, 40, 40, 42, 48, 30, 40, 45, 8, 50, 16, 40, 40, 40, 4…
## $ ttl_exp       <dbl> 10.333334, 13.621795, 17.730770, 13.211537, 17.820513, 7…
## $ tenure        <dbl> 5.3333335, 5.2500000, 1.2500000, 1.7500000, 17.7500000, …

#Rename variables

nlsw.clean <- nlsw %>%
  rename(.,
         college_grad = collgrad,
         total_work_exp = ttl_exp,
         age_years = age) %>%
  mutate(.,
         college_grad_fac = as_factor(college_grad))


glimpse(nlsw.clean)
## Rows: 2,246
## Columns: 18
## $ idcode           <dbl> 1, 2, 3, 4, 6, 7, 9, 12, 13, 14, 15, 16, 18, 19, 20, …
## $ age_years        <dbl> 37, 37, 42, 43, 42, 39, 37, 40, 40, 40, 39, 40, 40, 4…
## $ race             <dbl+lbl> 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ married          <dbl+lbl> 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1…
## $ never_married    <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ grade            <dbl> 12, 12, 12, 17, 12, 12, 12, 18, 14, 15, 16, 15, 15, 1…
## $ college_grad     <dbl+lbl> 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1…
## $ south            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ smsa             <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0…
## $ c_city           <dbl> 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ industry         <dbl+lbl>  5,  4,  4, 11,  4, 11,  5, 11, 11, 11, 11, 11,  …
## $ occupation       <dbl+lbl>  6,  5,  3, 13,  6,  3,  2,  2,  3,  1,  1,  1,  …
## $ union            <dbl+lbl>  1,  1, NA,  1,  0,  0,  1,  0,  0,  0,  1,  0,  …
## $ wage             <dbl> 11.739125, 6.400963, 5.016723, 9.033813, 8.083731, 4.…
## $ hours            <dbl> 48, 40, 40, 42, 48, 30, 40, 45, 8, 50, 16, 40, 40, 40…
## $ total_work_exp   <dbl> 10.333334, 13.621795, 17.730770, 13.211537, 17.820513…
## $ tenure           <dbl> 5.3333335, 5.2500000, 1.2500000, 1.7500000, 17.750000…
## $ college_grad_fac <fct> not college grad, not college grad, not college grad,…

#confirm renaming

skim(nlsw.clean)
Data summary
Name nlsw.clean
Number of rows 2246
Number of columns 18
_______________________
Column type frequency:
factor 1
numeric 17
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
college_grad_fac 0 1 FALSE 2 not: 1714, col: 532

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
idcode 0 1.00 2612.65 1480.86 1.00 1366.25 2614.00 3902.25 5159.00 ▇▇▇▇▇
age_years 0 1.00 39.15 3.06 34.00 36.00 39.00 42.00 46.00 ▇▆▇▃▃
race 0 1.00 1.28 0.48 1.00 1.00 1.00 2.00 3.00 ▇▁▃▁▁
married 0 1.00 0.64 0.48 0.00 0.00 1.00 1.00 1.00 ▅▁▁▁▇
never_married 0 1.00 0.10 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
grade 2 1.00 13.10 2.52 0.00 12.00 12.00 15.00 18.00 ▁▁▁▇▃
college_grad 0 1.00 0.24 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
south 0 1.00 0.42 0.49 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
smsa 0 1.00 0.70 0.46 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
c_city 0 1.00 0.29 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
industry 14 0.99 8.19 3.01 1.00 6.00 8.00 11.00 12.00 ▁▃▅▂▇
occupation 9 1.00 4.64 3.41 1.00 2.00 3.00 6.00 13.00 ▇▁▃▁▁
union 368 0.84 0.25 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
wage 0 1.00 7.77 5.76 1.00 4.26 6.27 9.59 40.75 ▇▂▁▁▁
hours 4 1.00 37.22 10.51 1.00 35.00 40.00 40.00 80.00 ▁▂▇▁▁
total_work_exp 0 1.00 12.53 4.61 0.12 9.21 13.13 15.98 28.88 ▂▅▇▂▁
tenure 15 0.99 5.98 5.51 0.00 1.58 3.83 9.33 25.92 ▇▃▂▁▁

#Simple Regression: total_work_exp –> wage

wagereg1 <- lm(wage ~ total_work_exp, data = nlsw.clean)

options(scipen=999)
summary(wagereg1)
## 
## Call:
## lm(formula = wage ~ total_work_exp, data = nlsw.clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.502 -2.911 -1.295  1.308 35.370 
## 
## Coefficients:
##                Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)     3.61249    0.33935   10.64 <0.0000000000000002 ***
## total_work_exp  0.33143    0.02541   13.04 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.55 on 2244 degrees of freedom
## Multiple R-squared:  0.07048,    Adjusted R-squared:  0.07006 
## F-statistic: 170.1 on 1 and 2244 DF,  p-value: < 0.00000000000000022

#Multiple Regression: total_work_exp + age_years –> wage

wagereg2 <- lm(wage ~ total_work_exp + age_years, data = nlsw.clean)

summary(wagereg2)
## 
## Call:
## lm(formula = wage ~ total_work_exp + age_years, data = nlsw.clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.107 -2.926 -1.267  1.337 35.050 
## 
## Coefficients:
##                Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     8.64932    1.50566   5.745         0.0000000105 ***
## total_work_exp  0.34233    0.02555  13.401 < 0.0000000000000002 ***
## age_years      -0.13213    0.03849  -3.433             0.000607 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.537 on 2243 degrees of freedom
## Multiple R-squared:  0.07534,    Adjusted R-squared:  0.07451 
## F-statistic: 91.37 on 2 and 2243 DF,  p-value: < 0.00000000000000022

#Multiple Regression: total_work_exp + age_years + college_grad_fac –> wage

wagereg3 <- lm(wage ~ total_work_exp + age_years + college_grad_fac, data = nlsw.clean)

summary(wagereg3)
## 
## Call:
## lm(formula = wage ~ total_work_exp + age_years + college_grad_fac, 
##     data = nlsw.clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.917 -2.689 -1.021  1.086 35.550 
## 
## Coefficients:
##                              Estimate Std. Error t value             Pr(>|t|)
## (Intercept)                   7.92690    1.46037   5.428         0.0000000631
## total_work_exp                0.30869    0.02491  12.391 < 0.0000000000000002
## age_years                    -0.12252    0.03731  -3.284              0.00104
## college_grad_faccollege grad  3.24134    0.26799  12.095 < 0.0000000000000002
##                                 
## (Intercept)                  ***
## total_work_exp               ***
## age_years                    ** 
## college_grad_faccollege grad ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.366 on 2242 degrees of freedom
## Multiple R-squared:  0.132,  Adjusted R-squared:  0.1308 
## F-statistic: 113.6 on 3 and 2242 DF,  p-value: < 0.00000000000000022

#Mult. Reg.: total_work_exp + age_years + college_grad_fac + age_years*total_work_exp –> wage

wagereg4 <- lm(wage ~ total_work_exp + age_years + college_grad_fac + total_work_exp:age_years, data = nlsw.clean)

summary(wagereg4)
## 
## Call:
## lm(formula = wage ~ total_work_exp + age_years + college_grad_fac + 
##     total_work_exp:age_years, data = nlsw.clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.535 -2.646 -1.027  1.064 35.940 
## 
## Coefficients:
##                               Estimate Std. Error t value            Pr(>|t|)
## (Intercept)                   1.701472   4.151041   0.410              0.6819
## total_work_exp                0.816992   0.318256   2.567              0.0103
## age_years                     0.034692   0.104979   0.330              0.7411
## college_grad_faccollege grad  3.229193   0.268005  12.049 <0.0000000000000002
## total_work_exp:age_years     -0.012788   0.007982  -1.602              0.1093
##                                 
## (Intercept)                     
## total_work_exp               *  
## age_years                       
## college_grad_faccollege grad ***
## total_work_exp:age_years        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.364 on 2241 degrees of freedom
## Multiple R-squared:  0.133,  Adjusted R-squared:  0.1314 
## F-statistic: 85.92 on 4 and 2241 DF,  p-value: < 0.00000000000000022

#Summary Table

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
summary.table <- stargazer(wagereg1,
                           wagereg2,
                           wagereg3,
                           wagereg4,
                           type = "text",
                           font.size = "normalsize",
                           digits = 2,
                           keep.stat = c("n", "rsq", "adj.rsq"),
                           out = "Module2_table.html",
                           star.cutoffs = c(.05, .01, .001))
## 
## =============================================================
##                                    Dependent variable:       
##                              --------------------------------
##                                            wage              
##                                (1)     (2)      (3)     (4)  
## -------------------------------------------------------------
## total_work_exp               0.33*** 0.34***  0.31***  0.82* 
##                              (0.03)   (0.03)  (0.02)  (0.32) 
##                                                              
## age_years                            -0.13*** -0.12**  0.03  
##                                       (0.04)  (0.04)  (0.10) 
##                                                              
## college_grad_faccollege grad                  3.24*** 3.23***
##                                               (0.27)  (0.27) 
##                                                              
## total_work_exp:age_years                               -0.01 
##                                                       (0.01) 
##                                                              
## Constant                     3.61*** 8.65***  7.93***  1.70  
##                              (0.34)   (1.51)  (1.46)  (4.15) 
##                                                              
## -------------------------------------------------------------
## Observations                  2,246   2,246    2,246   2,246 
## R2                            0.07     0.08    0.13    0.13  
## Adjusted R2                   0.07     0.07    0.13    0.13  
## =============================================================
## Note:                           *p<0.05; **p<0.01; ***p<0.001