## PROBLEM 1
# replace with your actual file name 

library(dplyr)

# Load the state-year panel and derive the analysis variables in one pipeline.
df <- read.csv("correlatesofstatepolicyprojectv2_1.csv") %>%
  # Sort within state by year so that row offsets line up with time.
  arrange(state, year) %>%
  group_by(state) %>%
  mutate(
    # 0/1 indicators from ranney4_control (1, 0 and 0.5 respectively);
    # rows where ranney4_control is NA stay NA in all three.
    dem_control = as.integer(ranney4_control == 1),
    rep_control = as.integer(ranney4_control == 0),
    divided_control = as.integer(ranney4_control == 0.5),
    # Poverty rate four rows ahead within the same state.
    # NOTE(review): this equals "four years later" only if each state has one
    # row per consecutive year with no gaps -- confirm against the data.
    lead4_povrate = dplyr::lead(povrate, n = 4)
  ) %>%
  ungroup() %>%
  # Turn the numeric region code into a labelled factor; level 1 ("South")
  # is the reference category in later regressions.
  mutate(
    region = factor(
      region,
      levels = 1:4,
      labels = c("South", "West", "Midwest", "Northeast")
    )
  )
## A) Call your new variable lead4_povrate.

# Frequency of each control indicator, with missing values shown as <NA>.
table(df[["dem_control"]], useNA = "ifany")
## 
##    0    1 <NA> 
## 2581 1314 2123
table(df[["rep_control"]], useNA = "ifany")
## 
##    0    1 <NA> 
## 2981  914 2123
table(df[["divided_control"]], useNA = "ifany")
## 
##    0    1 <NA> 
## 2228 1667 2123
# Preview the first 12 rows of the key columns to sanity-check the new variables.
df %>%
  select(state, year, ranney4_control, povrate, lead4_povrate,
         dem_control, rep_control, divided_control) %>%
  head(12)
## # A tibble: 12 × 8
##    state    year ranney4_control povrate lead4_povrate dem_control rep_control
##    <chr>   <int>           <dbl>   <dbl>         <dbl>       <int>       <int>
##  1 Alabama  1900              NA      NA            NA          NA          NA
##  2 Alabama  1901              NA      NA            NA          NA          NA
##  3 Alabama  1902              NA      NA            NA          NA          NA
##  4 Alabama  1903              NA      NA            NA          NA          NA
##  5 Alabama  1904              NA      NA            NA          NA          NA
##  6 Alabama  1905              NA      NA            NA          NA          NA
##  7 Alabama  1906              NA      NA            NA          NA          NA
##  8 Alabama  1907              NA      NA            NA          NA          NA
##  9 Alabama  1908              NA      NA            NA          NA          NA
## 10 Alabama  1909              NA      NA            NA          NA          NA
## 11 Alabama  1910              NA      NA            NA          NA          NA
## 12 Alabama  1911              NA      NA            NA          NA          NA
## # ℹ 1 more variable: divided_control <int>
## B) Provide a regression table

# Make sure region enters the regression as a factor (one dummy per level
# beyond the reference category).
df$region <- factor(df$region)

# OLS: poverty rate four rows ahead on Democratic control, region and statemin.
# lm() silently drops rows with an NA in any model variable.
model <- lm(
  formula = lead4_povrate ~ dem_control + region + statemin,
  data = df
)
summary(model)
## 
## Call:
## lm(formula = lead4_povrate ~ dem_control + region + statemin, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5915 -2.1223 -0.1807  1.9032 12.4783 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     15.36195    0.32674  47.016   <2e-16 ***
## dem_control      0.48439    0.20250   2.392   0.0169 *  
## regionWest      -2.63433    0.22932 -11.487   <2e-16 ***
## regionMidwest   -3.37544    0.24459 -13.800   <2e-16 ***
## regionNortheast -4.85447    0.25350 -19.150   <2e-16 ***
## statemin        -0.05681    0.06242  -0.910   0.3629    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.264 on 1464 degrees of freedom
##   (4548 observations deleted due to missingness)
## Multiple R-squared:  0.2472, Adjusted R-squared:  0.2447 
## F-statistic: 96.16 on 5 and 1464 DF,  p-value: < 2.2e-16
## C) dem_control meaning
# The coefficient tells us the predicted difference in the poverty rate four years later between states with Democratic control and those without, holding region and statemin constant. Here the estimate is 0.484 (p = 0.017), so Democratic control is associated with a poverty rate about 0.48 points higher four years later. In general, a positive coefficient suggests that Democratic control is associated with a higher poverty rate in the future, while a negative coefficient would suggest the opposite. The statistical significance of the coefficient can be assessed using the p-value, which indicates how likely an association at least this strong would be if the true coefficient were zero.
## D) 
# No, the regression from part B may not be able to isolate the causal effect of Democratic control on the state poverty rate due to potential confounding variables that are not included in the model. For example, there may be other factors such as economic conditions, demographic changes, or policy decisions that could influence both the likelihood of Democratic control and the poverty rate. Additionally, there may be reverse causality where higher poverty rates could lead to changes in political control. Without controlling for these confounding factors, it is difficult to establish a clear causal relationship between Democratic control and the poverty rate.
## E) 
# The assumption of homoscedasticity means that the variance of the error terms in the regression model is constant across all levels of the independent variables. In substantive terms, this means that the variability of the poverty rate should be the same regardless of whether a state has Democratic control or not, and regardless of other factors included in the model. If this assumption is violated, it can lead to inefficient estimates and incorrect standard errors, which may affect the validity of hypothesis tests and confidence intervals.
## F) 
library(lmtest)
library(sandwich)

# Re-test the OLS coefficients using heteroskedasticity-robust (HC1) standard
# errors from sandwich::vcovHC(). The covariance argument of coeftest() is
# named `vcov.` (with a trailing dot); spell it out rather than relying on
# partial argument matching of `vcov`.
coeftest(model, vcov. = vcovHC(model, type = "HC1"))
## 
## t test of coefficients:
## 
##                  Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)     15.361946   0.324385  47.3572  < 2e-16 ***
## dem_control      0.484391   0.219649   2.2053  0.02759 *  
## regionWest      -2.634327   0.256775 -10.2593  < 2e-16 ***
## regionMidwest   -3.375435   0.224800 -15.0153  < 2e-16 ***
## regionNortheast -4.854475   0.243499 -19.9364  < 2e-16 ***
## statemin        -0.056810   0.057466  -0.9886  0.32303    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# - dem_control coefficient is 0.484
# - The HC1 robust standard error is now 0.220
# - The p-value is 0.028, which is less than the conventional threshold of 0.05, indicating that the coefficient for dem_control is statistically significant at the 5% level.
## PROBLEM 2
# A) 
# feols() and etable() come from fixest, which is not attached by any
# library() call visible above; load it here so this section runs on its own.
library(fixest)

# Two-way fixed effects regression of the poverty rate on the Democratic
# control indicator, absorbing state and year fixed effects, with standard
# errors clustered by state.
# NOTE(review): the outcome here is the contemporaneous povrate, whereas the
# Problem 1 model used lead4_povrate -- confirm this is intended.
model2 <- feols(
  povrate ~ dem_control | state + year,
  data = df,
  cluster = ~state
)
## NOTE: 4,352 observations removed because of NA values (LHS: 4,284, RHS: 2,123).
# Print the estimate as a compact regression table.
etable(model2)
##                          model2
## Dependent Var.:         povrate
##                                
## dem_control     0.0115 (0.1825)
## Fixed-Effects:  ---------------
## state                       Yes
## year                        Yes
## _______________ _______________
## S.E.: Clustered       by: state
## Observations              1,666
## R2                      0.80666
## Within R2               6.22e-6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# - I estimated a two-way fixed effects (difference-in-differences) model of the state poverty rate on Democratic control, including state and year fixed effects and clustering standard errors by state. The estimated coefficient for dem_control is 0.0115 with a clustered standard error of 0.1825, so the estimate is not statistically distinguishable from zero.
## B) 
## C) After comparing states to themselves over time and controlling for year-specific shocks, the estimated effect of Democratic control on the poverty rate is much smaller (0.0115 versus 0.484) and not statistically significant. This suggests that the initial positive association observed in the OLS regression may have been driven by unobserved state characteristics or time trends that are not accounted for in that model.
## D) No. While the model improves on the earlier regression by controlling for state fixed effects and year fixed effects, it does not automatically isolate the causal effect of Democratic control on poverty.
## E) It doesn't make sense to include region in the two-way fixed effects model because region is time-invariant and is already absorbed by the state fixed effects, so it is perfectly collinear with them. statemin can vary over time, so it is not ruled out by multicollinearity. However, it may be a post-treatment variable if Democratic control affects the minimum wage, which in turn affects poverty. In this case, controlling for statemin would block part of the treatment effect and bias the estimated total effect of Democratic control on poverty.
## F) 
# ggplot() and its helpers come from ggplot2, which is not attached by any
# library() call visible above; load it here so this section runs on its own.
library(ggplot2)

# Average poverty rate per year for the two unified-control groups.
# Rows with any other ranney4_control value (including NA) are dropped.
plot_df <- df %>%
  filter(ranney4_control %in% c(0, 1)) %>%
  mutate(control = ifelse(ranney4_control == 1,
                          "Unified Democratic",
                          "Unified Republican")) %>%
  group_by(year, control) %>%
  # mean(..., na.rm = TRUE) returns NaN when every povrate in a year/group
  # cell is missing; presumably those cells are the rows geom_line() warns
  # about removing.
  summarise(avg_povrate = mean(povrate, na.rm = TRUE), .groups = "drop")

# One line per control group, coloured by group.
ggplot(plot_df, aes(x = year, y = avg_povrate, color = control)) +
  geom_line(linewidth = 1) +
  labs(
    x = "Year",
    y = "Average Poverty Rate",
    color = "Party Control",
    title = "Average Poverty Rate by Year",
    subtitle = "Unified Democratic vs Unified Republican States"
  ) +
  theme_minimal()
## Warning: Removed 92 rows containing missing values or values outside the scale range
## (`geom_line()`).

# The graph shows that unified Democratic states generally have a higher average poverty rate compared to unified Republican states over the years. However, the trends for both groups appear to be relatively stable, with some fluctuations.