rm(list = ls())   # Clear environment 
gc()              # Clear unused memory

##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 531291 28.4    1183341 63.2         NA   669265 35.8
## Vcells 975543  7.5    8388608 64.0      16384  1840436 14.1

cat("\f")         # Clear the console

graphics.off()    # Clear the charts


library(foreign)

## Warning: package 'foreign' was built under R version 4.2.3

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.2.3

## Warning: package 'tidyr' was built under R version 4.2.3

## Warning: package 'readr' was built under R version 4.2.3

## Warning: package 'dplyr' was built under R version 4.2.3

## Warning: package 'stringr' was built under R version 4.2.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)

EXAMPLE 0:

1 EXAMPLE 1: Estimating Diff-in-Diff

SOURCE: https://www.princeton.edu/~otorres/DID101R.pdf

1.0.1 Getting sample data.

data()

mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
str(mydata)

## 'data.frame':    70 obs. of  9 variables:
##  $ country: Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year   : int  1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
##  $ y      : num  1.34e+09 -1.90e+09 -1.12e+07 2.65e+09 3.01e+09 ...
##  $ y_bin  : num  1 0 0 1 1 1 1 1 1 1 ...
##  $ x1     : num  0.278 0.321 0.363 0.246 0.425 ...
##  $ x2     : num  -1.108 -0.949 -0.789 -0.886 -0.73 ...
##  $ x3     : num  0.2826 0.4925 0.7025 -0.0944 0.9461 ...
##  $ opinion: Factor w/ 4 levels "Str agree","Agree",..: 1 3 3 3 3 1 3 1 3 4 ...
##  $ op     : num  1 0 0 0 0 1 0 1 0 0 ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr " 3 Jan 2020 11:28"
##  - attr(*, "formats")= chr [1:9] "%14.0g" "%8.0g" "%10.0g" "%9.0g" ...
##  - attr(*, "types")= int [1:9] 253 252 255 254 254 254 254 254 254
##  - attr(*, "val.labels")= chr [1:9] "country" "" "" "" ...
##  - attr(*, "var.labels")= chr [1:9] "Country" "Year" "Outcome Y" "Binary outcome Y" ...
##  - attr(*, "expansion.fields")=List of 35
##   ..$ : chr [1:3] "_dta" "_dta" "st"
##   ..$ : chr [1:3] "_dta" "st_ver" "2"
##   ..$ : chr [1:3] "_dta" "st_id" "new_period"
##   ..$ : chr [1:3] "_dta" "st_bt" "elapsed"
##   ..$ : chr [1:3] "_dta" "st_bd" "liberalization"
##   ..$ : chr [1:3] "_dta" "st_o" "_origin"
##   ..$ : chr [1:3] "_dta" "st_orig" "time entrydate"
##   ..$ : chr [1:3] "_dta" "st_orexp" "entrydate"
##   ..$ : chr [1:3] "_dta" "st_s" "365.25"
##   ..$ : chr [1:3] "_dta" "st_bs" "365.25"
##   ..$ : chr [1:3] "_dta" "st_d" "_d"
##   ..$ : chr [1:3] "_dta" "st_t0" "_t0"
##   ..$ : chr [1:3] "_dta" "st_t" "_t"
##   ..$ : chr [1:3] "_dta" "st_ornl" "1"
##   ..$ : chr [1:3] "_dta" "st_orvn" "begin"
##   ..$ : chr [1:3] "_dta" "_lang_list" "default"
##   ..$ : chr [1:3] "_dta" "_lang_c" "default"
##   ..$ : chr [1:3] "_dta" "ReS_i" "country"
##   ..$ : chr [1:3] "_dta" "ReS_ver" "v.2"
##   ..$ : chr [1:3] "_dta" "ReS_j" "year"
##   ..$ : chr [1:3] "_dta" "ReS_str" "0"
##   ..$ : chr [1:3] "_dta" "ReS_Xij" "schooling"
##   ..$ : chr [1:3] "year" "destring" "Characters removed were:"
##   ..$ : chr [1:3] "_dta" "st_exvn" "regtrans"
##   ..$ : chr [1:3] "_dta" "st_exnl" "-66 -77 98"
##   ..$ : chr [1:3] "_dta" "st_exexp" "."
##   ..$ : chr [1:3] "_dta" "st_enexp" "entrydate"
##   ..$ : chr [1:3] "_dta" "__xi__Vars__To__Drop__" "_Icountry_2 _Icountry_3 _Icountry_4 _Icountry_5 _Icountry_6 _Icountry_7"
##   ..$ : chr [1:3] "_dta" "__xi__Vars__Prefix__" "_I _I _I _I _I _I"
##   ..$ : chr [1:3] "_dta" "_TStvar" "year"
##   ..$ : chr [1:3] "_dta" "_TSpanel" "country"
##   ..$ : chr [1:3] "_dta" "_TSdelta" "+1.0000000000000X+000"
##   ..$ : chr [1:3] "_dta" "_TSitrvl" "1"
##   ..$ : chr [1:3] "_dta" "tis" "year"
##   ..$ : chr [1:3] "_dta" "iis" "country"
##  - attr(*, "version")= int 12
##  - attr(*, "label.table")=List of 2
##   ..$ agree  : Named int [1:4] 1 2 3 4
##   .. ..- attr(*, "names")= chr [1:4] "Str agree" "Agree" "Disag" "Str disag"
##   ..$ country: Named int [1:7] 1 2 3 4 5 6 7
##   .. ..- attr(*, "names")= chr [1:7] "A" "B" "C" "D" ...

stargazer(mydata, 
          type = "text")

## 
## =============================================================================
## Statistic N        Mean            St. Dev.           Min            Max     
## -----------------------------------------------------------------------------
## year      70     1,994.500           2.893           1,990          1,999    
## y         70 1,845,072,181.000 3,015,166,973.000 -7,863,482,880 8,941,232,128
## y_bin     70       0.800             0.403             0              1      
## x1        70       0.648             0.468           -0.568         1.446    
## x2        70       0.134             1.371           -1.622         2.530    
## x3        70       0.762             1.446           -1.165         7.169    
## op        70       0.500             0.504             0              1      
## -----------------------------------------------------------------------------

table(mydata$year)

## 
## 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 
##    7    7    7    7    7    7    7    7    7    7

Create a dummy variable to indicate the time when the treatment started. Lets assume that treatment started in 1994. In this case, years before 1994 will have a value of 0 and 1994 onwards a value of 1.

mydata$time = ifelse(test = mydata$year >= 1994, 
                     yes = 1, 
                     no = 0)

1.1 Create a dummy variable to identify the group exposed to the treatment.

In this example lets assumed that countries with code 5,6, and 7 were treated (=1).
Countries 1-4 were not treated (=0). If you already have this skip this step.

table(mydata$country)

## 
##  A  B  C  D  E  F  G 
## 10 10 10 10 10 10 10

mydata$treated = ifelse(test = mydata$country == "E" | mydata$country == "F" | mydata$country == "G", 
                        yes = 1,
                        no = 0
                        )

1.2 Interaction Term

Create an interaction between time and treated. We will call this interaction ‘did’.

mydata$did = mydata$time * mydata$treated

1.3 Estimating the DID estimator

didreg = lm( data = mydata, 
             formula = y ~ treated + time + did
             )

summary(didreg)

## 
## Call:
## lm(formula = y ~ treated + time + did, data = mydata)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -9.768e+09 -1.623e+09  1.167e+08  1.393e+09  6.807e+09 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  3.581e+08  7.382e+08   0.485   0.6292  
## treated      1.776e+09  1.128e+09   1.575   0.1200  
## time         2.289e+09  9.530e+08   2.402   0.0191 *
## did         -2.520e+09  1.456e+09  -1.731   0.0882 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.953e+09 on 66 degrees of freedom
## Multiple R-squared:  0.08273,    Adjusted R-squared:  0.04104 
## F-statistic: 1.984 on 3 and 66 DF,  p-value: 0.1249

The coefficient for did is the differences-in-differences estimator. The effect is significant at 10% with the treatment having a negative effect.

1.4 Estimating the DID estimator

Using the multiplication method, noneed to generate the interaction)

didreg1 = lm(data = mydata, 
             formula = y ~ treated*time )
summary(didreg1)

## 
## Call:
## lm(formula = y ~ treated * time, data = mydata)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -9.768e+09 -1.623e+09  1.167e+08  1.393e+09  6.807e+09 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   3.581e+08  7.382e+08   0.485   0.6292  
## treated       1.776e+09  1.128e+09   1.575   0.1200  
## time          2.289e+09  9.530e+08   2.402   0.0191 *
## treated:time -2.520e+09  1.456e+09  -1.731   0.0882 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.953e+09 on 66 degrees of freedom
## Multiple R-squared:  0.08273,    Adjusted R-squared:  0.04104 
## F-statistic: 1.984 on 3 and 66 DF,  p-value: 0.1249

stargazer(didreg, didreg1, 
          type = "text")

## 
## =====================================================================
##                                         Dependent variable:          
##                               ---------------------------------------
##                                                  y                   
##                                       (1)                 (2)        
## ---------------------------------------------------------------------
## treated                        1,775,969,673.000   1,775,969,673.000 
##                               (1,127,561,848.000) (1,127,561,848.000)
##                                                                      
## time                          2,289,454,651.000** 2,289,454,651.000**
##                                (952,963,694.000)   (952,963,694.000) 
##                                                                      
## did                           -2,519,511,630.000*                    
##                               (1,455,676,087.000)                    
##                                                                      
## treated:time                                      -2,519,511,630.000*
##                                                   (1,455,676,087.000)
##                                                                      
## Constant                        358,143,950.000     358,143,950.000  
##                                (738,162,503.000)   (738,162,503.000) 
##                                                                      
## ---------------------------------------------------------------------
## Observations                          70                  70         
## R2                                   0.083               0.083       
## Adjusted R2                          0.041               0.041       
## Residual Std. Error (df = 66)  2,952,650,012.000   2,952,650,012.000 
## F Statistic (df = 3; 66)             1.984               1.984       
## =====================================================================
## Note:                                     *p<0.1; **p<0.05; ***p<0.01

2 EXAMPLE 2

data <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Econometrics/Fall_2023/share/W6/us_fred_coastal_us_states_avg_hpi_before_after_2005.csv")

stargazer(data, 
          type = "text")

## 
## ================================================
## Statistic         N  Mean  St. Dev.  Min    Max 
## ------------------------------------------------
## HPI_CHG           48 0.022  0.017   -0.006 0.061
## Time_Period       48 0.500  0.505     0      1  
## Disaster_Affected 48 0.208  0.410     0      1  
## NUM_DISASTERS     48 3.208  2.143     1     10  
## NUM_IND_ASSIST    48 8.583  14.946    0     55  
## ------------------------------------------------

table(data$Time_Period)

## 
##  0  1 
## 24 24

table(data$Disaster_Affected)

## 
##  0  1 
## 38 10

did_model_results <- 
          lm(data = data, 
          formula = HPI_CHG ~ Time_Period * Disaster_Affected)

summary(did_model_results)

## 
## Call:
## lm(formula = HPI_CHG ~ Time_Period * Disaster_Affected, data = data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.023081 -0.007610 -0.000171  0.004656  0.035981 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    0.037090   0.002819  13.157  < 2e-16 ***
## Time_Period                   -0.027847   0.003987  -6.985  1.2e-08 ***
## Disaster_Affected             -0.013944   0.006176  -2.258   0.0290 *  
## Time_Period:Disaster_Affected  0.019739   0.008734   2.260   0.0288 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01229 on 44 degrees of freedom
## Multiple R-squared:  0.5356, Adjusted R-squared:  0.504 
## F-statistic: 16.92 on 3 and 44 DF,  p-value: 1.882e-07

table(data$Time_Period, 
      data$Disaster_Affected
      )

##    
##      0  1
##   0 19  5
##   1 19  5

# Create a two-way table with labels
raw_table <- table(Time_Period = ifelse(test = data$Time_Period == 0, yes = "pre", no = "post"),
                   Treatment_Status = ifelse(data$Disaster_Affected == 0, yes = "control", no = "treatment")
                   )

raw_table

##            Treatment_Status
## Time_Period control treatment
##        post      19         5
##        pre       19         5

?tapply # Apply a function to each cell of a ragged array, that is to each (non-empty) group of values given by a unique combination of the levels of certain factors.

# Calculate means for each group
mean_table <- tapply(X = data$HPI_CHG, 
                     INDEX = list(data$Time_Period, 
                                  data$Disaster_Affected),
                     FUN =  mean)
mean_table

##             0          1
## 0 0.037090020 0.02314612
## 1 0.009242792 0.01503835

# Calculate means for each group
mean_table <- tapply(data$HPI_CHG, list(Time_Period = ifelse(data$Time_Period == 0, "pre", "post"),
                                       Treatment_Status = ifelse(data$Disaster_Affected == 0, "control", "treatment")),
                    mean)

# Display the table
print(mean_table)

##            Treatment_Status
## Time_Period     control  treatment
##        post 0.009242792 0.01503835
##        pre  0.037090020 0.02314612

# Calculate DiD effect
DiD_effect <- (mean_table[1, 2] - mean_table[2, 2])  - (mean_table[1, 1] - mean_table[2, 1])
print(DiD_effect)

## [1] 0.01973946

Source:

https://towardsdatascience.com/a-guide-to-using-the-difference-in-differences-regression-model-87cd2fb3224a

3 Identifying Assumptions

Difference-in-differences (DiD) analysis relies on certain assumptions to identify the causal effect of a treatment. The key threats to identification or implicit assumptions in DiD include:

3.1 Parallel Trends Assumption:

Assumption: The average treatment and control groups would have followed parallel trends in the absence of the treatment.
Implication: Any difference in trends post-treatment can be attributed to the treatment effect.
Threat: If there are pre-existing trends or shocks that affect the treatment and control groups differently, the parallel trends assumption may be violated.

Common Time Trends:
Assumption: Unobservable factors affecting outcomes in both groups evolve similarly over time.
Implication: Controls for unobserved time-varying factors.
Threat: If there are unobserved time-varying factors that systematically differ between treatment and control groups and are related to the outcomes, the estimates may be biased.

3.2 No Spillover Effects:

Assumption: The treatment does not have direct or indirect effects on the control group.
Implication: The treatment effect is isolated to the treated units.
Threat: If there are spillover effects, where the treatment affects the control group, it could confound the DiD estimates.

3.3 No Other Contemporaneous Events:

Assumption: There are no other contemporaneous events that systematically affect the treatment and control groups differently.
Implication: Any observed changes in outcomes can be attributed to the treatment.
Threat: If other events occur simultaneously and affect the groups differently, they may confound the estimates.

3.4 Sufficient Time Periods:

Assumption: There are enough pre-treatment and post-treatment time periods to observe trends.
Implication: Provides information to identify and control for trends.
Threat: With insufficient time periods, it may be challenging to establish pre-treatment trends or control for changes in trends.

3.5 Homogeneous Treatment Effects:

Assumption: The treatment effect is homogeneous across all units.
Implication: Assumes that the treatment effect does not vary systematically across different units.
Threat: If treatment effects vary, aggregating them may obscure heterogeneity.

It’s important to note that, like in any statistical method, the validity of DiD results depends on the robustness of these assumptions.

Violations of these assumptions can lead to biased estimates. Researchers often perform sensitivity analyses and robustness checks to assess the impact of potential violations on the results. Additionally, employing matching methods or incorporating additional control variables can help address some of these threats.

Diff_in_Diff

Arvind Sharma

2024-04-01