This tutorial shows how to estimate synthetic control models (SCM) using the Synth and the gsynth packages for R. The first package was written by Abadie, Diamond, and Hainmueller (2011) and gsynth was written by Xiu (2017).

First, we need to install the packages from the CRAN repositories:

install.packages("Synth", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/47/m41mb_3j3fn_7wld1j1r2q100000gp/T//RtmpAik4Bg/downloaded_packages
install.packages("gsynth", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/47/m41mb_3j3fn_7wld1j1r2q100000gp/T//RtmpAik4Bg/downloaded_packages

Then, we load both packages:

library("Synth")
## ##
## ## Synth Package: Implements Synthetic Control Methods.
## ## See http://www.mit.edu/~jhainm/software.htm for additional information.
library("gsynth")

The packages are now ready to use. Let’s simulate some data to see how they work. I will simulate data for 10 states and 30 years. State A receives the treatment T = 20 after year 15.

set.seed(1)
year <- rep(1:30, 10) 
state <- rep(LETTERS[1:10], each = 30)
X1 <- round(rnorm(300, mean = 2, sd = 1), 2)
X2 <- round(rbinom(300, 1, 0.5) + rnorm(300), 2)
Y <- round(1 + 2*X1 + rnorm(300), 2)
df <- as.data.frame(cbind(Y, X1, X2, state, year))
df$Y <- as.numeric(as.character(df$Y))
df$X1 <- as.numeric(as.character(df$X1))
df$X2 <- as.numeric(as.character(df$X2))
df$year <- as.numeric(as.character(df$year))
df$state.num <- as.numeric(df$state)
df$state <- as.character(df$state)
df$T <- ifelse(df$state == "A" & df$year >= 15, 1, 0)
df$Y <- ifelse(df$state == "A" & df$year >= 15, df$Y + 20, df$Y)

Let’s see how our dataset looks like:

str(df)
## 'data.frame':    300 obs. of  7 variables:
##  $ Y        : num  2.29 4.51 2.07 8.87 4.37 1.32 8 7.49 6.98 3.72 ...
##  $ X1       : num  1.37 2.18 1.16 3.6 2.33 1.18 2.49 2.74 2.58 1.69 ...
##  $ X2       : num  1.96 0.4 -0.75 -0.56 -0.45 1.06 0.51 -2.1 0 0.54 ...
##  $ state    : chr  "A" "A" "A" "A" ...
##  $ year     : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ state.num: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ T        : num  0 0 0 0 0 0 0 0 0 0 ...
head(df)
##      Y   X1    X2 state year state.num T
## 1 2.29 1.37  1.96     A    1         1 0
## 2 4.51 2.18  0.40     A    2         1 0
## 3 2.07 1.16 -0.75     A    3         1 0
## 4 8.87 3.60 -0.56     A    4         1 0
## 5 4.37 2.33 -0.45     A    5         1 0
## 6 1.32 1.18  1.06     A    6         1 0

Now we can estimate the models. I will first use Synth. Notice the weights for the two independent variables (v.weights) and for the control cases (w.weights).

dataprep.out <-
        dataprep(df,
                 predictors = c("X1", "X2"),
                 dependent     = "Y",
                 unit.variable = "state.num",
                 time.variable = "year",
                 unit.names.variable = "state",
                 treatment.identifier  = 1,
                 controls.identifier   = c(2:10),
                 time.predictors.prior = c(1:14),
                 time.optimize.ssr     = c(1:14),
                 time.plot             = c(1:30)
                 )

# Run synth
synth.out <- synth(dataprep.out)
## 
## X1, X0, Z1, Z0 all come directly from dataprep object.
## 
## 
## **************** 
##  searching for synthetic control unit  
##  
## 
## **************** 
## **************** 
## **************** 
## 
## MSPE (LOSS V): 9.831789 
## 
## solution.v:
##  0.3888387 0.6111613 
## 
## solution.w:
##  0.1115941 0.1832781 0.1027237 0.312091 0.06096758 0.03509706 0.05893735 0.05746256 0.07784853
# Get result tables
print(synth.tables   <- synth.tab(
        dataprep.res = dataprep.out,
        synth.res    = synth.out)
      )
## $tab.pred
##    Treated Synthetic Sample Mean
## X1   2.028     2.028       2.017
## X2   0.513     0.513       0.394
## 
## $tab.v
##    v.weights
## X1 0.389    
## X2 0.611    
## 
## $tab.w
##    w.weights unit.names unit.numbers
## 2      0.112          B            2
## 3      0.183          C            3
## 4      0.103          D            4
## 5      0.312          E            5
## 6      0.061          F            6
## 7      0.035          G            7
## 8      0.059          H            8
## 9      0.057          I            9
## 10     0.078          J           10
## 
## $tab.loss
##            Loss W   Loss V
## [1,] 9.761708e-12 9.831789

Plot:

path.plot(synth.res    = synth.out,
          dataprep.res = dataprep.out,
          Ylab         = c("Y"),
          Xlab         = c("Year"),
          Legend       = c("State A","Synthetic State A"),
          Legend.position = c("topleft")
)

abline(v   = 15,
       lty = 2)

Gaps plot:

gaps.plot(synth.res    = synth.out,
          dataprep.res = dataprep.out,
          Ylab         = c("Gap"),
          Xlab         = c("Year"),
          Ylim         = c(-30, 30),
          Main         = ""
)

abline(v   = 15,
       lty = 2)

Lastly, I will run a similar procedure using gsynth, which is a more recent package. It provides several options to estimate iterative fixed effects and can handle multiple treated units at a time. I will use two-way fixed effects and bootstrapped standard errors.

gsynth.out <- gsynth(Y ~ T + X1 + X2, data = df,
                     index = c("state","year"), force = "two-way", 
                     CV = TRUE, r = c(0, 5), se = TRUE, 
                     inference = "parametric", nboots = 1000,
                     parallel = TRUE)
## Parallel computing ...
## Cross-validating ... 
##  r = 0; sigma2 = 0.97435; IC = 0.01548; MSPE = 1.65502
##  r = 1; sigma2 = 0.81720; IC = 0.62752; MSPE = 1.33373
##  r = 2; sigma2 = 0.67509; IC = 1.18295; MSPE = 1.27333*
##  r = 3; sigma2 = 0.57336; IC = 1.72459; MSPE = 1.79368
##  r = 4; sigma2 = 0.48099; IC = 2.21245; MSPE = 2.02632
##  r = 5; sigma2 = 0.39640; IC = 2.64108; MSPE = 2.79403
## 
##  r* = 2
## 
## 
Simulating errors ...
Bootstrapping ...
## 
Call:
## gsynth.formula(formula = Y ~ T + X1 + X2, data = df, index = c("state", 
##     "year"), force = "two-way", r = c(0, 5), CV = TRUE, se = TRUE, 
##     nboots = 1000, inference = "parametric", parallel = TRUE)
## 
## Average Treatment Effect on the Treated:
##      ATT.avg   S.E. CI.lower CI.upper p.value
## [1,]   20.45 0.7073    19.04    21.78       0
## 
##    ~ by Period (including Pre-treatment Periods):
##         ATT   S.E. CI.lower CI.upper p.value n.Treated
## 1  -0.43308 1.1219  -2.8099   1.3605   0.508         0
## 2  -0.78303 0.7924  -2.5372   0.6043   0.272         0
## 3  -0.19200 0.9644  -2.6560   1.1220   0.644         0
## 4  -0.98271 1.0716  -2.0018   2.2940   0.722         0
## 5  -0.94517 0.8561  -2.7858   0.4560   0.224         0
## 6  -0.23968 0.9562  -2.7478   0.9594   0.546         0
## 7   1.74515 1.1032  -0.1506   4.0772   0.078         0
## 8   1.74006 1.2028  -0.7870   3.8099   0.196         0
## 9   0.63656 0.9510  -0.7860   2.8959   0.360         0
## 10 -0.03283 0.8512  -1.5840   1.5385   0.906         0
## 11 -0.25336 0.7864  -1.6081   1.4220   0.968         0
## 12  0.10275 0.7721  -1.3894   1.6682   0.734         0
## 13 -0.23383 0.5940  -1.6283   0.6983   0.550         0
## 14 -0.12884 0.8398  -2.0887   1.4384   0.998         0
## 15 21.19526 1.7364  17.1828  24.0191   0.000         1
## 16 22.84202 1.2329  20.1896  24.8276   0.000         1
## 17 20.36897 1.2983  18.4319  23.5180   0.000         1
## 18 22.33265 1.9344  18.1115  25.3980   0.000         1
## 19 19.68360 2.5404  15.0589  24.8618   0.000         1
## 20 18.08852 0.8818  16.6546  20.1623   0.000         1
## 21 19.75864 1.4264  17.0234  22.3764   0.000         1
## 22 18.92588 2.1003  14.7868  23.1134   0.000         1
## 23 21.74316 0.9950  19.6417  23.3728   0.000         1
## 24 21.64685 0.8908  20.2210  23.7518   0.000         1
## 25 19.39844 1.2054  16.9431  21.8050   0.000         1
## 26 19.31778 0.7979  17.7527  20.7814   0.000         1
## 27 22.28355 1.1695  19.9120  24.2646   0.000         1
## 28 20.38835 0.8634  18.6112  21.9347   0.000         1
## 29 18.92461 1.4941  16.3352  22.0607   0.000         1
## 30 20.37698 1.5886  17.5509  23.6583   0.000         1
## 
## Coefficients for the Covariates:
##       beta    S.E. CI.lower CI.upper p.value
## X1 1.93050 0.05916  1.83078   2.0672   0.000
## X2 0.03792 0.04585 -0.05587   0.1364   0.352

Plots:

plot(gsynth.out)

plot(gsynth.out, type = "counterfactual")

plot(gsynth.out, type = "counterfactual", raw = "all") # shows estimations for the control cases