rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 531291 28.4 1183341 63.2 NA 669265 35.8
## Vcells 975543 7.5 8388608 64.0 16384 1840436 14.1
cat("\f") # Clear the console
graphics.off() # Clear the charts
library(foreign)
## Warning: package 'foreign' was built under R version 4.2.3
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
EXAMPLE 0:
data()
mydata = read.dta("http://dss.princeton.edu/training/Panel101.dta")
str(mydata)
## 'data.frame': 70 obs. of 9 variables:
## $ country: Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 ...
## $ y : num 1.34e+09 -1.90e+09 -1.12e+07 2.65e+09 3.01e+09 ...
## $ y_bin : num 1 0 0 1 1 1 1 1 1 1 ...
## $ x1 : num 0.278 0.321 0.363 0.246 0.425 ...
## $ x2 : num -1.108 -0.949 -0.789 -0.886 -0.73 ...
## $ x3 : num 0.2826 0.4925 0.7025 -0.0944 0.9461 ...
## $ opinion: Factor w/ 4 levels "Str agree","Agree",..: 1 3 3 3 3 1 3 1 3 4 ...
## $ op : num 1 0 0 0 0 1 0 1 0 0 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr " 3 Jan 2020 11:28"
## - attr(*, "formats")= chr [1:9] "%14.0g" "%8.0g" "%10.0g" "%9.0g" ...
## - attr(*, "types")= int [1:9] 253 252 255 254 254 254 254 254 254
## - attr(*, "val.labels")= chr [1:9] "country" "" "" "" ...
## - attr(*, "var.labels")= chr [1:9] "Country" "Year" "Outcome Y" "Binary outcome Y" ...
## - attr(*, "expansion.fields")=List of 35
## ..$ : chr [1:3] "_dta" "_dta" "st"
## ..$ : chr [1:3] "_dta" "st_ver" "2"
## ..$ : chr [1:3] "_dta" "st_id" "new_period"
## ..$ : chr [1:3] "_dta" "st_bt" "elapsed"
## ..$ : chr [1:3] "_dta" "st_bd" "liberalization"
## ..$ : chr [1:3] "_dta" "st_o" "_origin"
## ..$ : chr [1:3] "_dta" "st_orig" "time entrydate"
## ..$ : chr [1:3] "_dta" "st_orexp" "entrydate"
## ..$ : chr [1:3] "_dta" "st_s" "365.25"
## ..$ : chr [1:3] "_dta" "st_bs" "365.25"
## ..$ : chr [1:3] "_dta" "st_d" "_d"
## ..$ : chr [1:3] "_dta" "st_t0" "_t0"
## ..$ : chr [1:3] "_dta" "st_t" "_t"
## ..$ : chr [1:3] "_dta" "st_ornl" "1"
## ..$ : chr [1:3] "_dta" "st_orvn" "begin"
## ..$ : chr [1:3] "_dta" "_lang_list" "default"
## ..$ : chr [1:3] "_dta" "_lang_c" "default"
## ..$ : chr [1:3] "_dta" "ReS_i" "country"
## ..$ : chr [1:3] "_dta" "ReS_ver" "v.2"
## ..$ : chr [1:3] "_dta" "ReS_j" "year"
## ..$ : chr [1:3] "_dta" "ReS_str" "0"
## ..$ : chr [1:3] "_dta" "ReS_Xij" "schooling"
## ..$ : chr [1:3] "year" "destring" "Characters removed were:"
## ..$ : chr [1:3] "_dta" "st_exvn" "regtrans"
## ..$ : chr [1:3] "_dta" "st_exnl" "-66 -77 98"
## ..$ : chr [1:3] "_dta" "st_exexp" "."
## ..$ : chr [1:3] "_dta" "st_enexp" "entrydate"
## ..$ : chr [1:3] "_dta" "__xi__Vars__To__Drop__" "_Icountry_2 _Icountry_3 _Icountry_4 _Icountry_5 _Icountry_6 _Icountry_7"
## ..$ : chr [1:3] "_dta" "__xi__Vars__Prefix__" "_I _I _I _I _I _I"
## ..$ : chr [1:3] "_dta" "_TStvar" "year"
## ..$ : chr [1:3] "_dta" "_TSpanel" "country"
## ..$ : chr [1:3] "_dta" "_TSdelta" "+1.0000000000000X+000"
## ..$ : chr [1:3] "_dta" "_TSitrvl" "1"
## ..$ : chr [1:3] "_dta" "tis" "year"
## ..$ : chr [1:3] "_dta" "iis" "country"
## - attr(*, "version")= int 12
## - attr(*, "label.table")=List of 2
## ..$ agree : Named int [1:4] 1 2 3 4
## .. ..- attr(*, "names")= chr [1:4] "Str agree" "Agree" "Disag" "Str disag"
## ..$ country: Named int [1:7] 1 2 3 4 5 6 7
## .. ..- attr(*, "names")= chr [1:7] "A" "B" "C" "D" ...
stargazer(mydata,
type = "text")
##
## =============================================================================
## Statistic N Mean St. Dev. Min Max
## -----------------------------------------------------------------------------
## year 70 1,994.500 2.893 1,990 1,999
## y 70 1,845,072,181.000 3,015,166,973.000 -7,863,482,880 8,941,232,128
## y_bin 70 0.800 0.403 0 1
## x1 70 0.648 0.468 -0.568 1.446
## x2 70 0.134 1.371 -1.622 2.530
## x3 70 0.762 1.446 -1.165 7.169
## op 70 0.500 0.504 0 1
## -----------------------------------------------------------------------------
table(mydata$year)
##
## 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999
## 7 7 7 7 7 7 7 7 7 7
Create a dummy variable to indicate the time when the treatment started. Lets assume that treatment started in 1994. In this case, years before 1994 will have a value of 0 and 1994 onwards a value of 1.
mydata$time = ifelse(test = mydata$year >= 1994,
yes = 1,
no = 0)
In this example lets assumed that countries with code 5,6, and 7
were treated (=1).
Countries 1-4 were not treated (=0). If you already
have this skip this step.
table(mydata$country)
##
## A B C D E F G
## 10 10 10 10 10 10 10
mydata$treated = ifelse(test = mydata$country == "E" | mydata$country == "F" | mydata$country == "G",
yes = 1,
no = 0
)
Create an interaction between time and treated. We will call this interaction ‘did’.
mydata$did = mydata$time * mydata$treated
didreg = lm( data = mydata,
formula = y ~ treated + time + did
)
summary(didreg)
##
## Call:
## lm(formula = y ~ treated + time + did, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.768e+09 -1.623e+09 1.167e+08 1.393e+09 6.807e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.581e+08 7.382e+08 0.485 0.6292
## treated 1.776e+09 1.128e+09 1.575 0.1200
## time 2.289e+09 9.530e+08 2.402 0.0191 *
## did -2.520e+09 1.456e+09 -1.731 0.0882 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.953e+09 on 66 degrees of freedom
## Multiple R-squared: 0.08273, Adjusted R-squared: 0.04104
## F-statistic: 1.984 on 3 and 66 DF, p-value: 0.1249
The coefficient for did is the
differences-in-differences estimator. The effect is significant at 10%
with the treatment having a negative effect.
Using the multiplication method, noneed to generate the interaction)
didreg1 = lm(data = mydata,
formula = y ~ treated*time )
summary(didreg1)
##
## Call:
## lm(formula = y ~ treated * time, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.768e+09 -1.623e+09 1.167e+08 1.393e+09 6.807e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.581e+08 7.382e+08 0.485 0.6292
## treated 1.776e+09 1.128e+09 1.575 0.1200
## time 2.289e+09 9.530e+08 2.402 0.0191 *
## treated:time -2.520e+09 1.456e+09 -1.731 0.0882 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.953e+09 on 66 degrees of freedom
## Multiple R-squared: 0.08273, Adjusted R-squared: 0.04104
## F-statistic: 1.984 on 3 and 66 DF, p-value: 0.1249
stargazer(didreg, didreg1,
type = "text")
##
## =====================================================================
## Dependent variable:
## ---------------------------------------
## y
## (1) (2)
## ---------------------------------------------------------------------
## treated 1,775,969,673.000 1,775,969,673.000
## (1,127,561,848.000) (1,127,561,848.000)
##
## time 2,289,454,651.000** 2,289,454,651.000**
## (952,963,694.000) (952,963,694.000)
##
## did -2,519,511,630.000*
## (1,455,676,087.000)
##
## treated:time -2,519,511,630.000*
## (1,455,676,087.000)
##
## Constant 358,143,950.000 358,143,950.000
## (738,162,503.000) (738,162,503.000)
##
## ---------------------------------------------------------------------
## Observations 70 70
## R2 0.083 0.083
## Adjusted R2 0.041 0.041
## Residual Std. Error (df = 66) 2,952,650,012.000 2,952,650,012.000
## F Statistic (df = 3; 66) 1.984 1.984
## =====================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
data <- read.csv("~/Library/CloudStorage/Dropbox/WCAS/Econometrics/Fall_2023/share/W6/us_fred_coastal_us_states_avg_hpi_before_after_2005.csv")
stargazer(data,
type = "text")
##
## ================================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------------
## HPI_CHG 48 0.022 0.017 -0.006 0.061
## Time_Period 48 0.500 0.505 0 1
## Disaster_Affected 48 0.208 0.410 0 1
## NUM_DISASTERS 48 3.208 2.143 1 10
## NUM_IND_ASSIST 48 8.583 14.946 0 55
## ------------------------------------------------
table(data$Time_Period)
##
## 0 1
## 24 24
table(data$Disaster_Affected)
##
## 0 1
## 38 10
did_model_results <-
lm(data = data,
formula = HPI_CHG ~ Time_Period * Disaster_Affected)
summary(did_model_results)
##
## Call:
## lm(formula = HPI_CHG ~ Time_Period * Disaster_Affected, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.023081 -0.007610 -0.000171 0.004656 0.035981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.037090 0.002819 13.157 < 2e-16 ***
## Time_Period -0.027847 0.003987 -6.985 1.2e-08 ***
## Disaster_Affected -0.013944 0.006176 -2.258 0.0290 *
## Time_Period:Disaster_Affected 0.019739 0.008734 2.260 0.0288 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01229 on 44 degrees of freedom
## Multiple R-squared: 0.5356, Adjusted R-squared: 0.504
## F-statistic: 16.92 on 3 and 44 DF, p-value: 1.882e-07
table(data$Time_Period,
data$Disaster_Affected
)
##
## 0 1
## 0 19 5
## 1 19 5
# Create a two-way table with labels
raw_table <- table(Time_Period = ifelse(test = data$Time_Period == 0, yes = "pre", no = "post"),
Treatment_Status = ifelse(data$Disaster_Affected == 0, yes = "control", no = "treatment")
)
raw_table
## Treatment_Status
## Time_Period control treatment
## post 19 5
## pre 19 5
?tapply # Apply a function to each cell of a ragged array, that is to each (non-empty) group of values given by a unique combination of the levels of certain factors.
# Calculate means for each group
mean_table <- tapply(X = data$HPI_CHG,
INDEX = list(data$Time_Period,
data$Disaster_Affected),
FUN = mean)
mean_table
## 0 1
## 0 0.037090020 0.02314612
## 1 0.009242792 0.01503835
# Calculate means for each group
mean_table <- tapply(data$HPI_CHG, list(Time_Period = ifelse(data$Time_Period == 0, "pre", "post"),
Treatment_Status = ifelse(data$Disaster_Affected == 0, "control", "treatment")),
mean)
# Display the table
print(mean_table)
## Treatment_Status
## Time_Period control treatment
## post 0.009242792 0.01503835
## pre 0.037090020 0.02314612
# Calculate DiD effect
DiD_effect <- (mean_table[1, 2] - mean_table[2, 2]) - (mean_table[1, 1] - mean_table[2, 1])
print(DiD_effect)
## [1] 0.01973946
Source:
Difference-in-differences (DiD) analysis relies on certain assumptions to identify the causal effect of a treatment. The key threats to identification or implicit assumptions in DiD include:
Assumption: The average treatment and control groups would have followed parallel trends in the absence of the treatment.
Implication: Any difference in trends post-treatment can be attributed to the treatment effect.
Threat: If there are pre-existing trends or shocks that affect the treatment and control groups differently, the parallel trends assumption may be violated.
Assumption: Unobservable factors affecting outcomes in both groups evolve similarly over time.
Implication: Controls for unobserved time-varying factors.
Threat: If there are unobserved time-varying factors that systematically differ between treatment and control groups and are related to the outcomes, the estimates may be biased.
Assumption: The treatment does not have direct or indirect effects on the control group.
Implication: The treatment effect is isolated to the treated units.
Threat: If there are spillover effects, where the treatment affects the control group, it could confound the DiD estimates.
Assumption: There are no other contemporaneous events that systematically affect the treatment and control groups differently.
Implication: Any observed changes in outcomes can be attributed to the treatment.
Threat: If other events occur simultaneously and affect the groups differently, they may confound the estimates.
Assumption: There are enough pre-treatment and post-treatment time periods to observe trends.
Implication: Provides information to identify and control for trends.
Threat: With insufficient time periods, it may be challenging to establish pre-treatment trends or control for changes in trends.
Assumption: The treatment effect is homogeneous across all units.
Implication: Assumes that the treatment effect does not vary systematically across different units.
Threat: If treatment effects vary, aggregating them may obscure heterogeneity.
It’s important to note that, like in any statistical method, the validity of DiD results depends on the robustness of these assumptions.
Violations of these assumptions can lead to biased estimates. Researchers often perform sensitivity analyses and robustness checks to assess the impact of potential violations on the results. Additionally, employing matching methods or incorporating additional control variables can help address some of these threats.