library(tidyverse)
library(haven)
library(dplyr)
library(gridExtra)
library(ggplot2)
library(stargazer)
library(lmtest)
library(sandwich)
library(Synth)
library(grid)
# Creating the dataset
set.seed(123456)
df <- data.frame(female = as.numeric(1:400 > 300),
                 error = rnorm(400, mean = 0, sd = 0.55),
                 tau = 0.1 + 0.1 * (1:400 > 300))
# Potential outcomes
alpha <- 1.2
df$y0 <- alpha + df$error
df$y1 <- df$y0 + df$tau
# Random treatment assignment: after a random sort, treat the first 50 observations within each gender group
df$W_0 <- runif(400)
df <- df[order(df$female, df$W_0), ]
df$treat0 <- ave(df$female, df$female, FUN = function(x) seq_along(x) <= 50)
# Section B Task 1: Propensity score
logit_model <- glm(treat0 ~ female, data = df, family = binomial(link = "logit"))
# Predicting probabilities
df$e_0 <- predict(logit_model, type = "response")
e_female <- mean(df$e_0[df$female == 1])
cat("average e(female_i) = ", e_female, "\n")
## average e(female_i) = 0.5
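# Quick check (sketch): by construction 50 of the 100 female observations and
# 50 of the 300 male observations are treated, so the fitted propensity score
# should be 50/100 = 0.5 for females and 50/300 = 0.167 for males
with(df, tapply(treat0, female, mean))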
# ATE
df$ate <- df$y1 - df$y0
ATE <- mean(df$ate)
cat("ATE = ", ATE, "\n")
## ATE = 0.125
# ATT
ATT <- mean(df$ate[df$treat0 == 1], na.rm = TRUE)
cat("ATT = ", ATT, "\n")
## ATT = 0.15
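# These values follow from the design (sketch of the arithmetic): tau = 0.1 for
# the 300 male observations and 0.2 for the 100 female ones, and the treated
# group contains 50 of each
(300 * 0.1 + 100 * 0.2) / 400   # = 0.125, the ATE
(50 * 0.1 + 50 * 0.2) / 100     # = 0.15, the ATT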
# Section B Task 2: Simulation
# Repeat the random assignment 5000 times and store the five treatment-effect
# estimates from each draw
SIM <- replicate(5000, {
  # Re-draw the treatment assignment (first 50 within each gender group)
  df$W <- runif(nrow(df))
  df <- df[order(df$female, df$W), ]
  df$treat <- ave(df$female, df$female, FUN = function(x) seq_along(x) <= 50)
  df$y_obs <- df$y0 + df$treat * (df$y1 - df$y0)
  # Propensity scores from the logit above; the IPW weight is 1/ehat for
  # treated units and 1/(1 - ehat) for controls
  df$ehat <- predict(logit_model, newdata = df, type = "response")
  df$weight <- with(df, 1 / (ehat^treat * (1 - ehat)^(1 - treat)))
  # Models
  model1 <- lm(y_obs ~ treat, data = df)                          # simple difference
  model2 <- lm(y_obs ~ treat + female, data = df)                 # controlling for female
  model3 <- lm(y_obs ~ treat, data = df, subset = (female == 0))  # males only
  model4 <- lm(y_obs ~ treat, data = df, subset = (female == 1))  # females only
  model5 <- lm(y_obs ~ treat, data = df, weights = weight)        # IPW
  c(model1$coefficients[2], model2$coefficients[2], model3$coefficients[2],
    model4$coefficients[2], model5$coefficients[2])
}, simplify = "array")
# Summary Statistics
mat1 <- t(SIM)
colnames(mat1) <- c("beta1", "beta2", "beta3", "beta4", "beta5")
summary(mat1)
## beta1 beta2 beta3 beta4
## Min. :-0.05133 Min. :-0.09591 Min. :-0.1968 Min. :-0.2560
## 1st Qu.: 0.11549 1st Qu.: 0.09177 1st Qu.: 0.0411 1st Qu.: 0.1189
## Median : 0.15777 Median : 0.13933 Median : 0.1021 Median : 0.2023
## Mean : 0.15725 Mean : 0.13875 Mean : 0.1006 Mean : 0.2024
## 3rd Qu.: 0.19894 3rd Qu.: 0.18565 3rd Qu.: 0.1581 3rd Qu.: 0.2854
## Max. : 0.38711 Max. : 0.39735 Max. : 0.3746 Max. : 0.6228
## beta5
## Min. :-0.1151
## 1st Qu.: 0.0779
## Median : 0.1267
## Mean : 0.1260
## 3rd Qu.: 0.1730
## Max. : 0.3740
mean_values <- colMeans(mat1)
print(mean_values)
## beta1 beta2 beta3 beta4 beta5
## 0.1572496 0.1387484 0.1005691 0.2023806 0.1260219
std_devs <- apply(mat1, 2, sd)
print(std_devs)
## beta1 beta2 beta3 beta4 beta5
## 0.06147586 0.06916034 0.08510520 0.12208385 0.06994299
Table <- rbind(mean_values, std_devs)
print(Table)
## beta1 beta2 beta3 beta4 beta5
## mean_values 0.15724964 0.13874838 0.1005691 0.2023806 0.12602194
## std_devs 0.06147586 0.06916034 0.0851052 0.1220839 0.06994299
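# Relating the simulation means back to Task 1 (a sketch; the interpretation is
# mine): beta1 (simple difference) centres near the ATT of 0.15, beta5 (IPW)
# near the ATE of 0.125, and beta3/beta4 near the within-group effects of 0.1
# and 0.2, while beta2 lies between the two group effects
round(rbind(simulation_mean = mean_values,
            reference = c(ATT, NA, 0.10, 0.20, ATE)), 3)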
# Section C
df3 <- read_dta("~/Documents/lfs_2010_2019_ages2539_per50.dta")
df3 <- df3 %>%
  filter(province %in% c(5, 6, 9, 10)) %>%
  mutate(
    province = factor(province, levels = c(5, 6, 9, 10),
                      labels = c("Quebec", "Ontario", "Alberta", "BC")),
    year = as.numeric(year),
    sex = as.factor(sex),
    marstat = as.factor(marstat),
    cohab = as.factor(cohab)
  )
# Section C Task 1.1: Formal Marriage Rates Among Cohabiting Individuals
# For Males
df_male <- df3 %>%
  filter(sex == "1", marstat != "0") %>%
  mutate(fm = as.numeric(marstat == "1" & cohab == "1")) %>%
  group_by(year, province) %>%
  summarise(fm = mean(fm, na.rm = TRUE), .groups = "drop")
# Plot for Males
plot_male <- ggplot(df_male, aes(x = year, y = fm, group = province, color = province)) +
  geom_line() +
  labs(title = "Male", y = "Formal marriage rate, ages 25-39") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 10))
# For Females
df_female <- df3 %>%
  filter(sex == "2", marstat != "0") %>%
  mutate(fm = as.numeric(marstat == "1" & cohab == "1")) %>%
  group_by(year, province) %>%
  summarise(fm = mean(fm, na.rm = TRUE), .groups = "drop")
# Plot for Females
plot_female <- ggplot(df_female, aes(x = year, y = fm, group = province, color = province)) +
  geom_line() +
  labs(title = "Female", y = "Formal marriage rate, ages 25-39") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 10))
# Combining Plots
grid.arrange(plot_male, plot_female, ncol = 2,
             top = textGrob("Formal marriage rates among cohabiting individuals, ages 25-39",
                            gp = gpar(fontsize = 13)))
# Section C Task 1.2: Cohabitation rates
# Recode cohab as a 0/1 indicator
df3$cohab <- as.numeric(df3$cohab == 1)
# For Males
df_male_cohab <- df3 %>%
  filter(sex == "1") %>%
  group_by(year, province) %>%
  summarise(cohab_ratio = mean(cohab), .groups = "drop")
# Plot for Male Cohabitation Rates
plot_male_cohab <- ggplot(df_male_cohab, aes(x = year, y = cohab_ratio, group = province, color = province)) +
  geom_line() +
  labs(title = "Male", y = "Share of cohabiting individuals aged 25-39") +
  geom_vline(xintercept = 2013, linetype = "dotted", color = "red") +
  scale_y_continuous(limits = c(0, 1)) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 10))
# For Females
df_female_cohab <- df3 %>%
  filter(sex == "2") %>%
  group_by(year, province) %>%
  summarise(cohab_ratio = mean(cohab), .groups = "drop")
# Plot for Female Cohabitation Rates
plot_female_cohab <- ggplot(df_female_cohab, aes(x = year, y = cohab_ratio, group = province, color = province)) +
  geom_line() +
  labs(title = "Female", y = "Share of cohabiting individuals aged 25-39") +
  geom_vline(xintercept = 2013, linetype = "dotted", color = "red") +
  scale_y_continuous(limits = c(0, 1)) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 10))
# Combining Plots for Cohabitation Rates
grid.arrange(plot_male_cohab, plot_female_cohab, ncol = 2,
             top = textGrob("Cohabitation rates, ages 25-39", gp = gpar(fontsize = 13)))
Discussion: Cohabitation rates remain relatively steady across provinces, including BC. This suggests that the reform did not deter couples from cohabiting: either the two-year cohabitation rule for legal protections did not influence the decision to cohabit, or individuals were not fully aware of its legal implications.
In terms of trends, both formal marriage rates and cohabitation rates are relatively stable across provinces, with some year-to-year fluctuation. There is no clear divergence between BC and the other provinces before or after 2013, suggesting that the parallel trends assumption may hold in this context; a formal statistical test would be required for a definitive conclusion (one such check is sketched after this discussion). As for gender differences, males and females show similar patterns in both formal marriage and cohabitation rates, suggesting the reform affected men's and women's marital decisions in a similar way.
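The sketch below reuses the province-level series df_female from Task 1.1: it regresses the female formal marriage rate on province-specific linear trends over the pre-reform years, so the provinceBC:year interaction tests whether BC's pre-2013 trend differs from the Quebec baseline. This specification is my own illustration and not part of the required output.
# Pre-trend check (sketch): province-specific linear trends, pre-2013 only
pretrend <- lm(fm ~ province * year, data = filter(df_female, year < 2013))
summary(pretrend)  # inspect the provinceBC:year interaction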
# Section C Task 2
df_did <- df3 %>%
  filter(year >= 2011, year <= 2014, sex == "2", cohab == 1) %>%
  mutate(marstat = as.numeric(marstat == 1),  # 1 = formally married
         D = as.numeric(province == "BC"),    # treated province (BC)
         T = as.numeric(year >= 2013),        # post-reform period
         DT = D * T)                          # DiD interaction
# Spec 1 - Alberta as control
model1 <- lm(marstat ~ D + T + DT, data = filter(df_did, province %in% c("BC","Alberta")))
# Spec 1 - Ontario as control
model2 <- lm(marstat ~ D + T + DT, data = filter(df_did, province %in% c("BC", "Ontario")))
# Spec 2 - Alberta as control
model3 <- lm(marstat ~ D + T + DT + edugrp:sp_edugrp, data = filter(df_did, province %in% c("BC","Alberta")))
# Spec 2 - Ontario as control
model4 <- lm(marstat ~ D + T + DT + edugrp:sp_edugrp, data = filter(df_did, province %in% c("BC", "Ontario")))
# Present results: Difference-in-Differences estimates
stargazer(model1, model2, model3, model4, type = "text")
##
## ===========================================================================================================================
## Dependent variable:
## -------------------------------------------------------------------------------------------------------
## marstat
## (1) (2) (3) (4)
## ---------------------------------------------------------------------------------------------------------------------------
## D 0.006 -0.003 -0.006 0.002
## (0.006) (0.005) (0.006) (0.005)
##
## T -0.008 -0.022*** -0.018*** -0.022***
## (0.005) (0.004) (0.006) (0.004)
##
## DT -0.007 0.006 -0.007 -0.003
## (0.008) (0.007) (0.008) (0.007)
##
## edugrp:sp_edugrp 0.022*** 0.023***
## (0.001) (0.001)
##
## Constant 0.775*** 0.784*** 0.689*** 0.677***
## (0.004) (0.003) (0.005) (0.004)
##
## ---------------------------------------------------------------------------------------------------------------------------
## Observations 44,725 73,452 41,880 69,307
## R2 0.0002 0.001 0.020 0.023
## Adjusted R2 0.0002 0.001 0.020 0.023
## Residual Std. Error 0.420 (df = 44721) 0.419 (df = 73448) 0.416 (df = 41875) 0.414 (df = 69302)
## F Statistic 3.260** (df = 3; 44721) 14.140*** (df = 3; 73448) 217.679*** (df = 4; 41875) 413.969*** (df = 4; 69302)
## ===========================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##
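The lmtest and sandwich packages loaded at the top are not otherwise used; one natural extension (a sketch, not part of the required output) is to report heteroskedasticity-robust standard errors for these linear probability models, for example for the DT coefficient in specification (1):
# Robust (HC1) standard error for the DiD term in specification (1) (sketch)
coeftest(model1, vcov = vcovHC(model1, type = "HC1"))["DT", ]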
Discussion: In Task 2, a constant is included to provide a baseline for the dummy variables. The DiD coefficients on DT, which measure the gap between BC's actual formal marriage rate and the counterfactual rate implied by the Alberta and Ontario control groups, are -0.007, 0.006, -0.007, and -0.003 across the four specifications: all close to zero and statistically insignificant. Including the additional covariates has only a minimal impact on the estimated ATT (a means-based check of the DT coefficient is sketched below).
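For intuition (a minimal sketch), the DT coefficient in the fully saturated specification (1) equals the difference-in-differences of the four group means:
# 2x2 DiD from group means; should reproduce the DT coefficient in model1
means <- df_did %>%
  filter(province %in% c("BC", "Alberta")) %>%
  group_by(D, T) %>%
  summarise(m = mean(marstat), .groups = "drop")
with(means, (m[D == 1 & T == 1] - m[D == 1 & T == 0]) -
            (m[D == 0 & T == 1] - m[D == 0 & T == 0]))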
df3$yearf <- as.factor(df3$year)
df3$yearf <- relevel(df3$yearf, ref = "2010")
df_did2 <- df3 %>%
  filter(sex == 2, cohab == 1, year >= 2006, year <= 2019) %>%
  mutate(
    marstat = as.numeric(marstat != 1),  # 1 = not formally married (coded opposite to Task 2)
    D = as.numeric(province == "BC"),
    T = as.numeric(year >= 2013),
    DT = D * T
  )
# Subset of data used for the event-study regressions
data_subset <- df_did2 %>% filter(province %in% c("BC", "Alberta"))
# Regressions with year-dummy interactions
reg1 <- lm(marstat ~ D + year + yearf * D,
           data = data_subset %>% filter(province == "BC"))
reg2 <- lm(marstat ~ D + year + yearf * D,
           data = data_subset %>% filter(province == "Alberta"))
reg3 <- lm(marstat ~ D + year + yearf:D + factor(edugrp) * factor(sp_edugrp),
           data = data_subset %>% filter(province == "BC"))
reg4 <- lm(marstat ~ D + year + yearf:D + factor(edugrp) * factor(sp_edugrp),
           data = data_subset %>% filter(province == "Alberta"))
summarytask3 <- stargazer(reg1, reg2, reg3, reg4, type = "text",
                          title = "Section C Task 3")
##
## Section C Task 3
## =============================================================================================================================================
## Dependent variable:
## ----------------------------------------------------------------------------------------------------------
## marstat
## (1) (2) (3) (4)
## ---------------------------------------------------------------------------------------------------------------------------------------------
## D
##
##
## year 0.007*** 0.002** 0.013*** 0.004***
## (0.001) (0.001) (0.001) (0.001)
##
## yearf2006 -0.005 -0.023**
## (0.010) (0.010)
##
## yearf2007 0.003 -0.008
## (0.010) (0.009)
##
## yearf2008 0.008 -0.003
## (0.009) (0.009)
##
## yearf2009 0.012 -0.003
## (0.008) (0.008)
##
## yearf2011 0.012 0.002
## (0.008) (0.007)
##
## yearf2012 -0.020*** -0.004
## (0.007) (0.007)
##
## yearf2013 -0.001 0.0003
## (0.007) (0.007)
##
## yearf2014 -0.005 0.006
## (0.007) (0.007)
##
## yearf2015 0.009 -0.018***
## (0.007) (0.007)
##
## yearf2016 0.021*** -0.008
## (0.007) (0.007)
##
## yearf2017 0.025*** -0.016**
## (0.007) (0.007)
##
## yearf2018 -0.005 -0.014**
## (0.008) (0.007)
##
## yearf2019
##
##
## factor(edugrp)2 0.002 -0.010*
## (0.007) (0.006)
##
## factor(edugrp)3 -0.032*** -0.053***
## (0.010) (0.010)
##
## factor(sp_edugrp)2 -0.021*** -0.014**
## (0.007) (0.006)
##
## factor(sp_edugrp)3 -0.129*** -0.128***
## (0.013) (0.013)
##
## D:yearf2006 0.013
## (0.012)
##
## D:yearf2007 0.016
## (0.011)
##
## D:yearf2008 0.016*
## (0.010)
##
## D:yearf2009 0.023***
## (0.009)
##
## D:yearf2011 0.011
## (0.008)
##
## D:yearf2012 -0.024***
## (0.007)
##
## D:yearf2013 0.0001
## (0.007)
##
## D:yearf2014 -0.016**
## (0.007)
##
## D:yearf2015 -0.010
## (0.008)
##
## D:yearf2016
##
##
## D:yearf2017
##
##
## D:yearf2018
##
##
## D:yearf2019
##
##
## factor(edugrp)2:factor(sp_edugrp)2 -0.010 -0.040***
## (0.010) (0.009)
##
## factor(edugrp)3:factor(sp_edugrp)2 -0.011 -0.026**
## (0.013) (0.012)
##
## factor(edugrp)2:factor(sp_edugrp)3 0.042*** -0.012
## (0.015) (0.015)
##
## factor(edugrp)3:factor(sp_edugrp)3 0.016 0.008
## (0.016) (0.016)
##
## Constant -14.384*** -3.505** -25.310*** -8.253***
## (1.789) (1.706) (2.742) (1.028)
##
## ---------------------------------------------------------------------------------------------------------------------------------------------
## Observations 76,703 84,143 55,892 61,909
## R2 0.006 0.001 0.021 0.022
## Adjusted R2 0.006 0.001 0.021 0.022
## Residual Std. Error 0.422 (df = 76689) 0.415 (df = 84129) 0.411 (df = 55873) 0.407 (df = 61899)
## F Statistic 36.232*** (df = 13; 76689) 4.609*** (df = 13; 84129) 66.368*** (df = 18; 55873) 155.357*** (df = 9; 61899)
## =============================================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Discussion: Before the intervention, most of the 95% confidence intervals include zero, suggesting no significant deviations from the 2010 reference level and hence no anticipatory behavior, consistent with the parallel trends assumption. However, once the additional covariates are included, the Alberta-control specification deviates from zero in 2009 and the Ontario-control specification in 2008. This implies that in the first specification the pre-period coefficients partly reflect the influence of family education levels, and that some anticipatory behavior appears once education is controlled for. As a check of the parallel trends assumption, under which we presume no anticipatory behavior, the second specification fails because not all pre-treatment coefficients are indistinguishable from zero. The results can therefore be read in two ways: either the parallel trends assumption holds but anticipatory behavior emerges around 2008 or 2009 depending on the specification, or there is no anticipatory behavior but the parallel trends assumption is violated. The insignificant ATTs immediately after the treatment, followed by significant ATTs several years later, point to a delayed negative effect of the policy in both specifications (one way to construct such an event-study plot is sketched below).
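The sketch below fits a single pooled regression on the BC and Alberta observations with the year dummies interacted with D (this pooled specification is my own simplification and differs from the separate reg1-reg4 fits above) and plots the D x year coefficients with 95% confidence intervals:
# Event-study plot (sketch): pooled BC + Alberta regression with D x year dummies
es <- lm(marstat ~ D + yearf + D:yearf, data = data_subset)
ci <- confint(es)                          # 95% confidence intervals
idx <- grep("^D:yearf", names(coef(es)))   # keep the D x year interactions
plot_df <- data.frame(year = as.numeric(sub("D:yearf", "", names(coef(es))[idx])),
                      est = coef(es)[idx],
                      lower = ci[idx, 1],
                      upper = ci[idx, 2])
ggplot(plot_df, aes(x = year, y = est)) +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_vline(xintercept = 2013, linetype = "dotted", color = "red") +
  labs(y = "D x year coefficient (reference year 2010)", x = "Year") +
  theme_minimal()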
Discussion: The Synthetic Control method estimates the treatment effect by constructing the unobserved counterfactual: the post-2013 formal marriage rate BC would have had absent the family-law reform is approximated by a weighted combination of control provinces chosen to match BC's pre-reform covariates and marriage rate. Its findings are consistent with those of the dynamic difference-in-differences (DiD) model, whereas the static DiD model yields an insignificant result, which may indicate that the static specification is less effective in this context.
Most of the weight in the Synthetic Control estimate is assigned to Alberta and Ontario, the provinces presumed to follow trends parallel to BC, indicating that they most closely resemble BC in the covariates considered. Some weight is also allocated to provinces that are less likely to share parallel trends with BC, suggesting that the weight distribution may not fully reflect real-world considerations.
The advantage of the Synthetic Control method is that it does not require the parallel trends assumption and is adaptable to settings with few treated units. Its limitations are that suitable weights may not exist and that the estimated weights may not align with practical intuition: the intuitively most comparable province need not receive the largest weight. The DiD method, in contrast, allows the researcher to choose a control group that is more likely to exhibit parallel trends, but it always relies on that assumption holding. A sketch of how the synthetic control could be set up follows.
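For completeness, the sketch below shows how the Synth package loaded at the top is typically set up for this kind of exercise. All objects here are hypothetical placeholders: `panel` is assumed to be a province-by-year data.frame with the formal marriage rate `fm_rate`, a numeric province code `prov_id` (with BC coded as 10, following the coding used above), and a character province name `prov_name`; the predictor choice and year ranges are illustrative only.
# Synthetic control setup (sketch with hypothetical panel data)
dp <- dataprep(foo = as.data.frame(panel),
               predictors = "fm_rate",        # pre-period outcome as the only predictor
               predictors.op = "mean",
               dependent = "fm_rate",
               unit.variable = "prov_id",
               unit.names.variable = "prov_name",
               time.variable = "year",
               treatment.identifier = 10,     # BC
               controls.identifier = setdiff(unique(panel$prov_id), 10),
               time.predictors.prior = 2010:2012,
               time.optimize.ssr = 2010:2012,
               time.plot = 2010:2019)
sc <- synth(data.prep.obj = dp)
round(sc$solution.w, 3)                       # estimated province weights
path.plot(synth.res = sc, dataprep.res = dp,
          Ylab = "Formal marriage rate", Xlab = "Year")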
This concludes my assignment. Thank you for taking the time to grade it.