library(ggplot2)
library(binsreg)
library(usmap)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fixest)
library(dplyr)
library(modelsummary)
library(kableExtra)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
df = read.csv("climatemort.csv")
  1. DESCRIPTIVE RELATIONSHIPS

The two key variables are temperature and mortality. Do A-C for each of ave_temp and drate_6599:

1a. Plot the average for each calendar month. (This could be a connected scatterplot with 12 dots.)

#1A - Temperature x Month

ave_monthly <- df %>%
  group_by(month) %>%
  summarize(avg_temp = mean(ave_temp),
            avg_mort = mean(drate_6599))

temp_month <- ggplot(ave_monthly, aes(x = month, y = avg_temp)) + geom_line() + geom_point() +
  ggtitle("Average temperature by month") + 
  labs(y="Average temperature (°F)", x="Month")

temp_month

#mortality x month
mort_month <- ggplot(ave_monthly, aes(x = month, y = avg_mort)) + geom_line() + geom_point() +
  ggtitle("Average 65+ mortality rate by month") + 
  labs(y="Average 65+ mortality rate (per 100,000)", x="Month")

mort_month

1b. Plot the average by year. (This could be a connected scatterplot with 44 dots.)

ave_yearly <- df %>%
  group_by(year) %>%
  summarize(avg_temp = mean(ave_temp),
            avg_mort = mean(drate_6599))

temp_year <- ggplot(ave_yearly, aes(x = year, y = avg_temp)) + geom_line() + geom_point() + 
  ggtitle("Average temperature by year") + 
  labs(y="Average temperature (°F)", x="Year")

temp_year

mort_year <- ggplot(ave_yearly, aes(x = year, y = avg_mort)) + geom_line() + geom_point() + 
  ggtitle("Average 65+ mortality rate by year") + 
  labs(y="Average 65+ mortality rate (per 100,000)", x="Year")

mort_year

1c. Map the average by state. (You may want to use the usmap package.)

avg_by_state <- df %>%
  group_by(stfips) %>%
  summarize(avg_temp = mean(ave_temp),
            avg_mort = mean(drate_6599))

avg_by_state$stfips = str_pad(avg_by_state$stfips, 2, pad = '0') # stfips into 2-digit fips

avg_by_state <- rename(avg_by_state, fips = stfips)

state_x_mort_map <- plot_usmap(data = avg_by_state, values = "avg_mort", color = "black") +
  scale_fill_continuous(name = "Mortality rate per 100,000", label = scales::comma) +
  theme(legend.position = "right") +
  labs(title = "Average 65+ mortality rate by state")

state_x_mort_map

state_x_temp_map <- plot_usmap(data = avg_by_state, values = "avg_temp", color = "black") +
  scale_fill_continuous(name = "Temperature (°F)", label = scales::comma) +
  theme(legend.position = "right") +
  labs(title = "Average temperature by state")

state_x_temp_map

  1. CROSS-SECTIONAL CORRELATIONS

2a. Make a binned scatterplot with drate_6599 on the y-axis and ave_temp on the x-axis.

orig_bins <- binsreg(drate_6599, ave_temp, data = df)
## Warning in binsreg(drate_6599, ave_temp, data = df): To speed up computation,
## bin/degree selection uses a subsample of roughly max(5,000, 0.01n) observations
## if the sample size n>5,000. To use the full sample, set randcut=1.

orig_bins
## Call: binsreg
## 
## Binscatter Plot
## Bin/Degree selection method (binsmethod)  =  IMSE direct plug-in (select # of bins)
## Placement (binspos)                       =  Quantile-spaced
## Derivative (deriv)                        =  0
## 
## Group (by)                         =  Full Sample
## Sample size (n)                    =  26460
## # of distinct values (Ndist)       =  26437
## # of clusters (Nclust)             =  NA
## dots, degree (p)                   =  0
## dots, smoothness (s)               =  0
## # of bins (nbins)                  =  51

2b. Repeat the binned scatterplot but now add controls (using the “w = …” option in binsreg) for state and year. This shows the conditional expectation function (CEF) conditional on state and year fixed effects.

FE_bins <- binsreg(data = df, drate_6599, ave_temp,
                     w = ~factor(df$stfips) + factor(df$year))
## Warning in binsreg(data = df, drate_6599, ave_temp, w = ~factor(df$stfips) + :
## To speed up computation, bin/degree selection uses a subsample of roughly
## max(5,000, 0.01n) observations if the sample size n>5,000. To use the full
## sample, set randcut=1.

FE_bins
## Call: binsreg
## 
## Binscatter Plot
## Bin/Degree selection method (binsmethod)  =  IMSE direct plug-in (select # of bins)
## Placement (binspos)                       =  Quantile-spaced
## Derivative (deriv)                        =  0
## 
## Group (by)                         =  Full Sample
## Sample size (n)                    =  26460
## # of distinct values (Ndist)       =  26437
## # of clusters (Nclust)             =  NA
## dots, degree (p)                   =  0
## dots, smoothness (s)               =  0
## # of bins (nbins)                  =  30

2c. Are either of these two relationships plausibly causal? In other words, does the CEF tell us the effect of temperature on mortality? Explain why or why not. The figures from Part 1 above may help illustrate your explanation.

The first relationship is not plausibly causal, as it does not take into account the state and year fixed effects. Without controlling for these fixed effects, we can’t rule out the possibility that unobserved factors are driving the observed relationship, which makes it difficult to infer a causal relationship.

We can see that this is, in fact, the case when we analyze the output of the CEF, as the entire curve (while maintaining its rough shape) shifts upward on the y-axis by around 100 units. Given that it accounts for state and year fixed effects, as well as the fact that it demonstrates a consistent, inverse relationship between temperature and mortality, this second relationship could be plausibly causal. It is important to note, however, that there are other potential confounding factors such as the use of air conditioning or other adaptive strategies that provide additional, important context to this relationship and are not accounted for within this analysis.

  1. CAUSAL ANALYSIS

3a. Estimate equation (1) and report the results.

regress1 <- feols(data = df,
                  fml = ln_drate_6599 ~ days_below40 + days_8090 + days_above90 +
                          devp25 + devp75 | year^month + stfips^month,
                    cluster = ~ stfips)
summary(regress1)
## OLS estimation, Dep. Var.: ln_drate_6599
## Observations: 26,460 
## Fixed-effects: year^month: 540,  stfips^month: 588
## Standard-errors: Clustered (stfips) 
##               Estimate Std. Error  t value   Pr(>|t|)    
## days_below40  0.003419   0.000239 14.30880  < 2.2e-16 ***
## days_8090     0.002991   0.000428  6.99432 7.5021e-09 ***
## days_above90  0.005795   0.001219  4.75506 1.8467e-05 ***
## devp25        0.002245   0.001165  1.92618 6.0015e-02 .  
## devp75       -0.001927   0.000951 -2.02679 4.8259e-02 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.051742     Adj. R2: 0.848416
##                  Within R2: 0.023041

3b. Interpret your estimate of j and its standard error, for j corresponding to the number of days above 90 degrees. (This should be a sentence like, “I estimate that the effect of a rainy day compared to a sunny day is to reduce exercise by 4 percent, with a standard error of 1 percent.”)

For j = 90+ degrees, I estimate that the effect of a day with avg. temperature above 90F compared to one with avg. temperature below 90F is to increase mortality by ~0.58%, with a standard error of ~0.1%.

3c. Explain the intuition for why you should cluster standard errors.

We assume that our data is independent, but when there are groups of data (perhaps based on common locations or time periods), there can be fixed effects that come into play. Clustering the standard errors is one way to address the potential bias caused by these fixed effects. When we cluster the standard errors, we group the observations into clusters based on some common factor (e.g., countries or companies) and estimate the standard errors within each cluster. This accounts for the potential correlation of the error terms within each cluster, which might arise due to unobserved factors affecting the treatment variable and the outcome variable in similar ways. Clustering the standard errors can help us obtain more accurate estimates of the treatment effect and avoid overestimating the statistical significance of our results.

3d. Re-estimate equation (1) except replacing the temperature bins with ave_temp. Explain why a regression with this average temperature variable is not the ideal way to predict how higher temperatures will affect mortality.

A regression with the average temperature variable is not the ideal way to predict how higher temperatures will affect mortality, because we would lose the ability to analyze the non-linear effects of temperature upon mortality rates (likely a u-shape, if we consider intuitively that extreme heat and cold are most likely to cause death than something in the middle). The bins help us to understand the differential impacts of these different temperature ranges upon mortality, rather than lumping them all into one aggregate.

regress2 <- feols(data = df,
                  fml = ln_drate_6599 ~ ave_temp + 
                    devp25 + devp75 | month^year + stfips^month,
                  cluster = ~ stfips)
summary(regress2)
## OLS estimation, Dep. Var.: ln_drate_6599
## Observations: 26,460 
## Fixed-effects: month^year: 540,  stfips^month: 588
## Standard-errors: Clustered (stfips) 
##           Estimate Std. Error   t value   Pr(>|t|)    
## ave_temp -0.002805   0.000225 -12.44652  < 2.2e-16 ***
## devp25    0.004699   0.001197   3.92661 2.7475e-04 ***
## devp75   -0.004252   0.000887  -4.79104 1.6361e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.051925     Adj. R2: 0.84735 
##                  Within R2: 0.016097

3e. Create a regression results table with estimates of equation (1) in four sub-samples:

i <- feols (data = df %>% filter(uscr>=7 & uscr<=9 & year>=1960 & year<=1980),
                     fml = ln_drate_6599 ~ days_below40 + days_8090 + days_above90 +
                       devp25 + devp75 | year^month + stfips^month,
                     cluster = ~ stfips)

#3eii (southern 1981 to 2004)

ii <- feols (data = df %>% filter(uscr>=7 & uscr<=9 & year>=1981 & year<=2004),
                     fml = ln_drate_6599 ~ days_below40 + days_8090 + days_above90 +
                       devp25 + devp75 | year^month + stfips^month,
                     cluster = ~ stfips)

#3eiii (non southern 1960 to 1980)

iii <- feols (data = df %>% filter(uscr>=1 & uscr<=6 & year>=1960 & year<=1980),
                     fml = ln_drate_6599 ~ days_below40 + days_8090 + days_above90 +
                       devp25 + devp75 | year^month + stfips^month,
                     cluster = ~ stfips)

#3eiv (non southern 1981 to 2004)

iv <- feols (data = df %>% filter(uscr>=1 & uscr<=6 & year>=1981 & year<=2004),
                      fml = ln_drate_6599 ~ days_below40 + days_8090 + days_above90 +
                        devp25 + devp75 | year^month + stfips^month,
                      cluster = ~ stfips)

coef_labels = c(
  'days_below40' = 'Number of days <40°F', 
  'days_8090' = 'Number of days 80-90°F',
  'days_above90' = 'Number of days >90°F',
  'devp25' = 'Low precipitation',
  'devp75' = 'High precipitation'
)

table = modelsummary(
  dvnames(list(i, ii, iii, iv)),
  coef_map = coef_labels,
  statistic = "conf.int",
  stars = TRUE,
  conf_level = 0.95,
  gof_omit = 'R2 Adj.|R2 Within|R2 Within Adj.|AIC|BIC|RMSE|Std.Errors',
  title = '(1) = Southern 1960-1980, (2) = Southern 1981-2004, 
  (3) = non-Southern 1960-1980, (4) = non-Southern 1981-2004',
  fmt = fmt_sprintf("%.4f"),
)
table %>% kable_styling(latex_options = "HOLD_position")
  1. = Southern 1960-1980, (2) = Southern 1981-2004,
  2. = non-Southern 1960-1980, (4) = non-Southern 1981-2004
 ln_drate_6599  ln_drate_6599  ln_drate_6599  ln_drate_6599
Number of days <40°F 0.0045*** 0.0029*** 0.0037*** 0.0023***
[0.0029, 0.0061] [0.0015, 0.0044] [0.0028, 0.0045] [0.0016, 0.0030]
Number of days 80-90°F 0.0025** 0.0002 0.0074*** 0.0007
[0.0010, 0.0041] [−0.0007, 0.0012] [0.0053, 0.0094] [−0.0001, 0.0015]
Number of days >90°F 0.0083*** 0.0009 0.0142*** 0.0042***
[0.0041, 0.0124] [−0.0010, 0.0028] [0.0067, 0.0217] [0.0032, 0.0052]
Low precipitation 0.0034+ 0.0048* 0.0004 0.0025*
[−0.0004, 0.0072] [0.0006, 0.0091] [−0.0027, 0.0036] [0.0002, 0.0047]
High precipitation −0.0013 −0.0002 −0.0001 −0.0036*
[−0.0057, 0.0030] [−0.0044, 0.0040] [−0.0028, 0.0026] [−0.0066, −0.0007]
Num.Obs. 4032 4608 8316 9504
R2 0.886 0.923 0.819 0.820
FE: year^month X X X X
FE: stfips^month X X X X
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001

3f. Compare the coefficients across the four columns of this table. How do these results illustrate adaptation? Looking into the future, do you think the effects of extreme heat on mortality will be higher or lower than the effects we estimate from historical data?

These results illustrate adaptation to temperatures. For each region, the estimator decreases significantly between the two time periods. Though it is notable that in some cases the estimate for the second time period is not statistically significant, we should consider that they still indicate a significant decrease in the coefficient estimate.

Looking into the future, I think the effects of extreme heat on mortality will be lower than the effects we estimate from historical data because of continued adaptation with AC adoption and continued expansion to access to adaptive technologies.

However, it’s important to note that there are potential downstream effects of extreme heat that could lead to higher rates of mortality. For instance, if there are more days of extreme heat (>90F), to the point that it decreases crop yield or causes drought, that could lead to an observed increase of the effect of extreme heat on mortality.

  1. PLACEBO TESTS

4a. Estimate equation (1) after defining Ysym as the infectious disease death rate instead of the log 65+ death rate.

regress4a <- feols(data = df,
                  fml = drate_ifd ~ days_below40 + days_8090 + days_above90 +
                    devp25 + devp75 | year^month + stfips^month,
                  cluster = ~ stfips)
summary(regress4a)
## OLS estimation, Dep. Var.: drate_ifd
## Observations: 26,460 
## Fixed-effects: year^month: 540,  stfips^month: 588
## Standard-errors: Clustered (stfips) 
##               Estimate Std. Error   t value Pr(>|t|)    
## days_below40  0.000255   0.002143  0.119030  0.90575    
## days_8090     0.010671   0.004385  2.433436  0.01873 *  
## days_above90 -0.009767   0.020526 -0.475837  0.63635    
## devp25       -0.001455   0.006812 -0.213617  0.83175    
## devp75        0.006894   0.005915  1.165374  0.24963    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.569254     Adj. R2: 0.622133
##                  Within R2: 7.302e-4
#Running a regression on the ln(drate_ifd), then creating a table to compare it to the initial estimate on drate_6599.

df$ln_ifd <- log(df$drate_ifd)

regress4b <- feols(data = df,
                   fml = ln_ifd ~ days_below40 + days_8090 + days_above90 +
                     devp25 + devp75 | year^month + stfips^month,
                   cluster = ~ stfips)
## NOTE: 163 observations removed because of infinite values (LHS: 163).
summary(regress4b)
## OLS estimation, Dep. Var.: ln_ifd
## Observations: 26,297 
## Fixed-effects: year^month: 540,  stfips^month: 588
## Standard-errors: Clustered (stfips) 
##               Estimate Std. Error   t value Pr(>|t|)    
## days_below40  0.000066   0.001770  0.037067 0.970585    
## days_8090     0.004958   0.002368  2.093589 0.041604 *  
## days_above90 -0.005338   0.018393 -0.290198 0.772914    
## devp25        0.005282   0.005207  1.014541 0.315412    
## devp75        0.009218   0.005494  1.677823 0.099882 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.334349     Adj. R2: 0.709182
##                  Within R2: 5.703e-4
table2 = modelsummary(
  dvnames(list(regress1, regress4b, regress4a)),
  coef_map = coef_labels,
  statistic = c("conf.int", 
                "p={p.value}"),
  stars = TRUE,
  conf_level = 0.95,
  gof_omit = 'R2 Adj.|R2 Within|R2 Within Adj.|AIC|BIC|RMSE|Std.Errors',
  title = 'Placebo outcome test: 65+ all-cause vs infectious disease mortality',
  fmt = fmt_sprintf("%.4f"),
)

table2 %>% kable_styling(latex_options = "HOLD_position")
Placebo outcome test: 65+ all-cause vs infectious disease mortality
 ln_drate_6599 ln_ifd drate_ifd
Number of days <40°F 0.0034*** 0.0001 0.0003
[0.0029, 0.0039] [−0.0035, 0.0036] [−0.0041, 0.0046]
p=0.0000 p=0.9706 p=0.9057
Number of days 80-90°F 0.0030*** 0.0050* 0.0107*
[0.0021, 0.0039] [0.0002, 0.0097] [0.0019, 0.0195]
p=0.0000 p=0.0416 p=0.0187
Number of days >90°F 0.0058*** −0.0053 −0.0098
[0.0033, 0.0082] [−0.0423, 0.0316] [−0.0510, 0.0315]
p=0.0000 p=0.7729 p=0.6363
Low precipitation 0.0022+ 0.0053 −0.0015
[−0.0001, 0.0046] [−0.0052, 0.0158] [−0.0152, 0.0122]
p=0.0600 p=0.3154 p=0.8318
High precipitation −0.0019* 0.0092+ 0.0069
[−0.0038, −0.0000] [−0.0018, 0.0203] [−0.0050, 0.0188]
p=0.0483 p=0.0999 p=0.2496
Num.Obs. 26460 26297 26460
R2 0.855 0.722 0.638
FE: year^month X X X
FE: stfips^month X X X
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001

4b. Interpret the results. Do these results make you more or less confident in the analysis from Part 3?

These results make me more confident in the analysis from Part 3.

The point of a placebo outcome test is to check whether there could be an unobserved confounding effect that causes bias in regression estimates. This would be the case if coefficients for the placebo outcome is yields statistically significant estimates from each of the estimates.

According to the table above, in this case the results make me more confident in the analysis from Part 3. Save for the estimator for number of days between 80-90F, all of the placebo regression’s estimates have p-values above 0.05, deeming them statistically insignificant.

The days_8090 estimator is statistically significant, with a p-value of 0.01 (below the 0.05 threshold), but this effect can also be explained by the fact that more disease vectors like bacteria will thrive in this temperature band. This doesn’t necessarily make me less confident about the analysis in Part 3, but it does provide important context as to a potential confounding aspect – however slight - to the relationship that we observe in the original regression.