Loading data

library(haven)
## Warning: package 'haven' was built under R version 4.1.3
data_nsw <- as.data.frame(read_dta("C:/Stuffs/IIT/3rd Semester/Advanced Econ Lab/Angrist and Wahba data/nsw_dw.dta"))
data_cps <- as.data.frame(read_dta("C:/Stuffs/IIT/3rd Semester/Advanced Econ Lab/Angrist and Wahba data/cps_controls.dta"))

data_psid <- as.data.frame(read_dta("C:/Stuffs/IIT/3rd Semester/Advanced Econ Lab/Angrist and Wahba data/psid_controls.dta"))

To generate U74 and U75 data as used in the paper

# NSW DATASET

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data_nsw <- data_nsw %>%
  mutate(u74 = case_when(
    re74==0~1,re74!=0~0
    ))                                    
data_nsw <- data_nsw %>%
  mutate(u75 = case_when(
    re75==0~1,re75!=0~0
    ))

data_nsw=subset(data_nsw,select = -c(data_id))  ### Dropping the data_id
# CPS DATASET 

data_cps <- data_cps %>%
  mutate(u74 = case_when(
    re74==0~1,re74!=0~0
    ))                                    
data_cps <- data_cps %>%
  mutate(u75 = case_when(
    re75==0~1,re75!=0~0
    ))

data_cps=subset(data_cps,select = -c(data_id))  ### Dropping the data_id
# PSID DATASET
data_psid <- data_psid %>%
  mutate(u74 = case_when(
    re74==0~1,re74!=0~0
    ))                                    
data_psid <- data_psid %>%
  mutate(u75 = case_when(
    re75==0~1,re75!=0~0
    ))

data_psid=subset(data_psid,select = -c(data_id))  ### Dropping the data_id
### Binding NSW CPS and NSW PSID
data_nsw_cps <-subset(data_nsw,treat!=0)
data_nsw_psid <- subset(data_nsw,treat!=0) 
data_nsw_cps <- rbind(data_cps,data_nsw_cps)     #Integrating NSW & CPS
data_nsw_psid <- rbind(data_psid,data_nsw_psid)     #Integrating NSW & PSID

Creating Table 1 NSW Data set

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v stringr 1.4.0
## v tidyr   1.2.0     v forcats 0.5.1
## v readr   2.1.2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'purrr' was built under R version 4.1.3
## Warning: package 'stringr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
table1 <- data_nsw %>%
  group_by(treat) %>%
  summarize(across(everything(), list( mean=mean,sd = sd), na.rm = TRUE))
table1=subset(table1,select = -c(treat))

library(tibble)
add_column(table1, Treatment_status=0:1, .after = 0)
## # A tibble: 2 x 23
##   Treatment_status age_mean age_sd education_mean education_sd black_mean
##              <int>    <dbl>  <dbl>          <dbl>        <dbl>      <dbl>
## 1                0     25.1   7.06           10.1         1.61      0.827
## 2                1     25.8   7.16           10.3         2.01      0.843
## # ... with 17 more variables: black_sd <dbl>, hispanic_mean <dbl>,
## #   hispanic_sd <dbl>, married_mean <dbl>, married_sd <dbl>,
## #   nodegree_mean <dbl>, nodegree_sd <dbl>, re74_mean <dbl>, re74_sd <dbl>,
## #   re75_mean <dbl>, re75_sd <dbl>, re78_mean <dbl>, re78_sd <dbl>,
## #   u74_mean <dbl>, u74_sd <dbl>, u75_mean <dbl>, u75_sd <dbl>
final_table1 <- t(table1)
Covariate_table <- as.data.frame(final_table1)



# V1 :: Controls
# V2 :: Treated
Covariate_table
##                          V1           V2
## age_mean         25.0538462 2.581622e+01
## age_sd            7.0577448 7.155019e+00
## education_mean   10.0884615 1.034595e+01
## education_sd      1.6143246 2.010650e+00
## black_mean        0.8269231 8.432432e-01
## black_sd          0.3790434 3.645579e-01
## hispanic_mean     0.1076923 5.945946e-02
## hispanic_sd       0.3105893 2.371244e-01
## married_mean      0.1538462 1.891892e-01
## married_sd        0.3614971 3.927217e-01
## nodegree_mean     0.8346154 7.081081e-01
## nodegree_sd       0.3722439 4.558666e-01
## re74_mean      2107.0266512 2.095574e+03
## re74_sd        5687.9056394 4.886620e+03
## re75_mean      1266.9090145 1.532055e+03
## re75_sd        3102.9820880 3.219251e+03
## re78_mean      4554.8011202 6.349144e+03
## re78_sd        5483.8360014 7.867402e+03
## u74_mean          0.7500000 7.081081e-01
## u74_sd            0.4338478 4.558666e-01
## u75_mean          0.6846154 6.000000e-01
## u75_sd            0.4655651 4.912274e-01

PROPENSITY SCORE CALCULATION AND MATCHING

TABLE-2

# Calculating Propensity scores for NSW and CPS Samples
# In the paper the propensity scores are measured using a logit model where the 
# covariates are Age, Age^2,Age^3 School, School^2, Married, No degree, Married, No degree, Black, Hispanic, RE74, RE75, U74, U75, School*RE74

prop_score_nsw <- glm(treat ~ age+I(age^2)+I(age^3)+education+I(education^2)+married+nodegree+black+hispanic+re74+re75+u74+u75+education*re74,family = binomial(), data = data_nsw)

prsrnsw_df <- data.frame(pr_score = predict(prop_score_nsw, type = "response"),
                     treat_status = prop_score_nsw$model$treat)   #prsrnsw::propensity score for nsw dataset




prop_score_nsw_cps <- glm(treat ~ age+I(age^2)+I(age^3)+education+I(education^2)+married+nodegree+black+hispanic+re74+re75+u74+u75+education*re74,family = binomial(), data = data_nsw_cps)

prsrnswcps_df <- data.frame(pr_score = predict(prop_score_nsw_cps, type = "response"),
                     treat_status = prop_score_nsw_cps$model$treat)   #prsrnsw::propensity score for nsw_cps dataset

x1=mean(prsrnsw_df$pr_score)
x2=mean(prsrnswcps_df$pr_score)
sprintf("The mean propensity score of the NSW Dataset: %f" ,x1 )  
## [1] "The mean propensity score of the NSW Dataset: 0.415730"
sprintf("The mean propensity score of the NSW_CPS Dataset: %f" ,x2)  
## [1] "The mean propensity score of the NSW_CPS Dataset: 0.011436"

TABLE-3

# Calculating Propensity scores for NSW and PSID Samples
# In the paper the propensity scores are measured using a logit model where the 
# covariates are Age, Age^2, School, School^2, Married, No degree, Black, Hisp, RE74, RE74^2, RE75, RE75^2, U74, U75, U74 * Hisp.


prp_score_nsw <- glm(treat ~ age+I(age^2)+education+I(education^2)+married+nodegree+black+hispanic+re74+re75+I(re74^2)+I(re75^2)+u74+u75+u74*hispanic,family = binomial(), data = data_nsw)
prsnsw_df <- data.frame(pr_score = predict(prp_score_nsw, type = "response"),
                     treat_status = prp_score_nsw$model$treat)    #prsnsw::propensity score for nsw dataset

prp_score_nsw_psid <- glm(treat ~ age+I(age^2)+education+I(education^2)+married+nodegree+black+hispanic+re74+re75+I(re74^2)+I(re75^2)+u74+u75+u74*hispanic,family = binomial(), data = data_nsw_psid)
prsnswpsid_df <- data.frame(pr_score = predict(prp_score_nsw_psid, type = "response"),
                     treat_status = prp_score_nsw_psid$model$treat)  #prsnswpsid::propensity score for nsw_psid dataset


x3=mean(prsnsw_df$pr_score)
x4=mean(prsnswpsid_df$pr_score)
sprintf("The mean propensity score of the NSW Dataset: %f" ,x3 )  
## [1] "The mean propensity score of the NSW Dataset: 0.415730"
sprintf("The mean propensity score of the NSW_PSID Dataset: %f" ,x4)
## [1] "The mean propensity score of the NSW_PSID Dataset: 0.069159"

Matching Algorithm

library(MatchIt)
## Warning: package 'MatchIt' was built under R version 4.1.3
reg1 <- matchit(treat ~ age +education+ black+hispanic + married + nodegree + re74 + re75+u74+u75, data = data_nsw, method = "nearest", distance = "glm",estimand = "ATT")


reg2 <- matchit(treat ~ age +education+ black+hispanic + married + nodegree + re74 + re75+u74+u75, data = data_nsw_cps, method = "nearest", distance = "glm",estimand = "ATT")


reg3 <- matchit(treat ~ age +education+ black+hispanic + married + nodegree + re74 + re75+u74+u75, data = data_nsw_psid, method = "nearest", distance = "glm",estimand = "ATT")

# Create a data frame of Matched Data
dataframeNSW<- match.data(reg1,group="all",distance = "prop.score",data = data_nsw)  
dataframeNSWCPS<- match.data(reg2,group="all",distance = "prop.score",data = data_nsw_cps)  
dataframeNSWPSID<- match.data(reg3,group="all",distance = "prop.score",data = data_nsw_psid)  

Regression Treatment Effects

treat_reg_1 <- lm(re78~treat+age +education+ black+hispanic + married + nodegree + re74 + re75+u74+u75,data = dataframeNSW)
treat_reg_2 <- lm(re78~treat+age +education+ black+hispanic + married + nodegree + re74 + re75+u74+u75,data = dataframeNSWCPS)
treat_reg_3 <- lm(re78~treat+age +education+ black+hispanic + married + nodegree + re74 + re75+u74+u75,data = dataframeNSWPSID)
library(sjmisc)
## Warning: package 'sjmisc' was built under R version 4.1.3
## 
## Attaching package: 'sjmisc'
## The following object is masked from 'package:purrr':
## 
##     is_empty
## The following object is masked from 'package:tidyr':
## 
##     replace_na
## The following object is masked from 'package:tibble':
## 
##     add_case
library(sjlabelled)
## Warning: package 'sjlabelled' was built under R version 4.1.3
## 
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:forcats':
## 
##     as_factor
## The following object is masked from 'package:ggplot2':
## 
##     as_label
## The following object is masked from 'package:dplyr':
## 
##     as_label
## The following objects are masked from 'package:haven':
## 
##     as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
library(sjPlot)
## Warning: package 'sjPlot' was built under R version 4.1.3
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
tab_model(treat_reg_1,treat_reg_2,treat_reg_3,terms = c("treat"),show.ci = FALSE,show.se = TRUE,auto.label = TRUE,dv.labels = c("NSW","NSW_CPS","NSW_PSID"))
  NSW NSW_CPS NSW_PSID
Predictors Estimates std. Error p Estimates std. Error p Estimates std. Error p
treat 1617.03 700.62 0.022 2090.76 686.15 0.002 169.92 1152.44 0.883
Observations 370 370 370
R2 / R2 adjusted 0.065 / 0.036 0.153 / 0.127 0.130 / 0.103