Problem Set III - Economics 970

library(haven)
library(sandwich)
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(ggplot2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(marginaleffects)
library(dplyr)
library(fixest)
library(modelsummary)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(tinytable)


# Read the IRS SOI county-level extract (Stata .dta) with haven.
irs_county <- read_dta(
  "/Users/cartercrowley/Documents/Economics 970/ps3/rawdata/irssoi_county.dta"
)

Problem 3:

# Convert `incomebin` to a factor so later steps can group and filter on its
# level text (presumably built from the Stata value labels -- confirm).
irs_county$incomebin <- as_factor(irs_county$incomebin)

Problem 4:

# Table 1: mean and standard deviation of each income-composition variable,
# with one pair of columns per level of `incomebin`.
#
# Weighting: datasummary_balance() documents weighting through a column of
# `data` named `weights`, not through a `weights =` argument, so the number of
# returns is supplied as that column. This is what makes the statistics
# return-weighted, as the table note states.
datasummary_balance(
  capital_share + salary_share + business_share + partnership_share +
    agi_per_return + contrib_per_return + cg_per_return + wages_per_return +
    capital_per_return + business_per_return + partnership_per_return +
    anycg + anywages + anybusiness + anypartnership + anycharitable ~ incomebin,
  data = mutate(irs_county, weights = returns),
  dinm = FALSE, # no difference-in-means columns; report means and SDs only
  fmt = 1, # one decimal place
  title = "Table 1: Composition of Income by Size of Adjusted Gross Income",
  notes = "NOTE--Table reports return-weighted means and standard deviations by income bin."
)

Table 1: Composition of Income by Size of Adjusted Gross Income
	All (N=21714)		$1 under $25,000 (N=21714)		$25,000 under $50,000 (N=21714)		$50,000 under $75,000 (N=21696)		$75,000 under $100,000 (N=21632)		$100,000 under $200,000 (N=21517)		$200,000 or more (N=18299)
	Mean	Std. Dev.	Mean	Std. Dev.	Mean	Std. Dev.	Mean	Std. Dev.	Mean	Std. Dev.	Mean	Std. Dev.	Mean	Std. Dev.
NOTE--Table reports return-weighted means and standard deviations by income bin.
capital_share	5.8	3.8	4.6	5.3	3.2	3.4	3.8	2.8	4.5	5.2	7.9	6.8	17.8	8.9
salary_share	69.4	7.9	74.3	6.3	78.4	6.2	75.6	6.8	74.0	8.3	66.8	12.0	43.5	12.7
business_share	3.1	1.2	7.2	3.1	3.1	1.8	2.6	1.7	2.8	2.5	4.0	3.3	5.2	3.3
partnership_share	3.7	2.4	0.8	3.4	1.0	2.4	1.5	2.6	2.2	4.7	5.1	5.5	20.4	8.6
agi_per_return	49.8	14.8	12.3	0.7	36.2	5.7	62.1	2.7	87.6	5.2	140.1	19.0	431.6	191.6
contrib_per_return	0.8	0.6	0.1	0.1	0.4	0.3	1.0	0.5	1.7	2.4	3.4	1.8	12.9	8.9
cg_per_return	1.6	3.4	0.1	0.4	0.4	3.8	0.9	2.2	1.8	4.0	5.5	8.3	47.8	58.8
wages_per_return	33.9	8.7	9.2	1.1	28.3	2.1	46.8	4.1	64.3	7.1	91.3	13.6	179.1	55.0
capital_per_return	3.4	5.6	0.6	0.6	1.2	5.7	2.3	1.9	3.9	4.6	11.2	11.1	84.4	83.1
business_per_return	1.6	0.8	0.9	0.4	1.1	0.7	1.6	1.1	2.4	2.2	5.4	4.5	22.1	13.8
partnership_per_return	2.3	2.2	0.1	0.4	0.4	0.9	0.9	1.6	1.9	4.1	7.1	7.9	88.8	47.6
anycg	13.6	5.8	8.0	4.6	12.6	8.7	19.5	10.3	24.8	12.2	35.2	10.2	63.5	9.1
anywages	82.5	4.2	77.1	5.2	85.9	4.3	87.1	5.2	88.2	6.2	88.0	6.5	84.3	8.7
anybusiness	15.8	3.5	15.1	5.1	15.7	6.7	19.7	7.3	21.9	8.9	23.8	6.6	30.2	8.2
anypartnership	5.1	2.8	1.9	2.1	4.3	4.3	7.8	7.3	9.3	10.7	16.8	9.4	47.7	12.8
anycharitable	17.9	8.5	4.3	2.9	14.8	6.8	28.8	12.0	41.5	15.9	61.4	15.9	83.7	11.5

Problem 5:

Partnership income per return is much higher for high earners than for low earners: in Table 1 it rises from 0.1 in the lowest income bin to 88.8 in the highest, a nearly 900-fold increase. This makes intuitive sense, as partnerships are investment and business structures typically accessible only to individuals who have at least a modest amount of capital on hand or access to it via secondary sources, meaning lower-income households are almost entirely excluded from this income source.

Problem 6:

# Read the county health file (Stata .dta) with haven. It is wide: one
# `healthy_daysYYYY` column per year, as the reshape below relies on.
health <- read_dta(
  "/Users/cartercrowley/Documents/Economics 970/ps3/rawdata/health_county_2026.dta"
)

# Reshape wide -> long so each row is one county-year. The column names are
# generated from the year range, so each of 2009-2015 is listed exactly once.
healthy_day_cols <- paste0("healthy_days", 2009:2015)
health <- pivot_longer(
  data = health,
  cols = all_of(healthy_day_cols),
  names_to = "year",
  names_prefix = "healthy_days", # strip the prefix so `year` holds "2009", ...
  values_to = "healthy_days"
)

# `year` comes out of the pivot as character; make both sides numeric so the
# join keys are comparable.
irs_county$year <- as.numeric(irs_county$year)
health$year <- as.numeric(health$year)

# Keep every IRS row and attach the matching county-year health measure.
# `relationship = "many-to-one"` makes the join fail loudly if a county-year
# appears more than once in `health`, instead of silently duplicating IRS rows.
irs_health <- left_join(
  irs_county,
  health,
  by = c("CountyFIPS", "year"),
  relationship = "many-to-one"
)

# Rows of irs_county whose (CountyFIPS, year) key also appears in the CDC data
both_irs_CDC <- irs_county %>%
  inner_join(health, by = c("CountyFIPS", "year"))
nrow(both_irs_CDC)

## [1] 148188

# Rows of irs_county with no matching (CountyFIPS, year) key in the CDC data
just_irs <- irs_county %>%
  anti_join(health, by = c("CountyFIPS", "year"))
nrow(just_irs)

## [1] 98

# Rows of the CDC data with no matching (CountyFIPS, year) key in irs_county
just_CDC <- health %>%
  anti_join(irs_county, by = c("CountyFIPS", "year"))
nrow(just_CDC)

## [1] 266

Problem 7:

# Keep only the rows for the "All" income bin, dropping the per-bin rows.
irs_health <- filter(irs_health, incomebin == "All")

# Model 1: pooled regression of healthy days on wage and capital income per
# return, with no fixed effects. Weights and clusters use fixest's one-sided
# formula interface, so both are looked up inside `data` and stay aligned with
# the rows fixest keeps after dropping NAs.
mod1 <- feols(
  healthy_days ~ wages_per_return + capital_per_return,
  data = irs_health,
  weights = ~returns, # weight each row by its number of tax returns
  cluster = ~CountyFIPS # standard errors clustered by county
)

## NOTE: 1,458 observations removed because of NA values (LHS: 1,458).

# Model 2: same regressors plus year fixed effects. Weights and clusters use
# fixest's one-sided formula interface, so both are looked up inside `data` and
# stay aligned with the rows fixest keeps after dropping NAs.
mod2 <- feols(
  healthy_days ~ wages_per_return + capital_per_return | year,
  data = irs_health,
  weights = ~returns, # weight each row by its number of tax returns
  cluster = ~CountyFIPS # standard errors clustered by county
)

## NOTE: 1,458 observations removed because of NA values (LHS: 1,458).

# Model 3: same regressors plus year and county fixed effects. Weights and
# clusters use fixest's one-sided formula interface, so both are looked up
# inside `data` and stay aligned with the rows fixest keeps after dropping NAs.
mod3 <- feols(
  healthy_days ~ wages_per_return + capital_per_return | year + CountyFIPS,
  data = irs_health,
  weights = ~returns, # weight each row by its number of tax returns
  cluster = ~CountyFIPS # standard errors clustered by county
)

## NOTES: 1,458 observations removed because of NA values (LHS: 1,458).
##        0/145 fixed-effect singletons were removed (145 observations).

# Table 2: the three healthy-days regressions side by side.
# The list is unnamed, so modelsummary labels the columns (1), (2), (3).
table2_models <- list(mod1, mod2, mod3)

# Display labels for the two income regressors.
table2_labels <- c(
  "wages_per_return" = "Wages ($1000s)",
  "capital_per_return" = "Capital Income ($1000s)"
)

# Table footnote; the line breaks inside the string are part of the text.
table2_note <- "NOTE--Table reports regression coefficients with standard errors clustered by county in parentheses. The dependent variable
in each regression is the average number of healthy days per month. Columns 2 and 3 include year fixed effects. Column 3
includes year and county fixed effects. All regressions are weighted by the number of tax returns in each county. Data cover
2009-15. Capital income includes interest income, capital gains, and dividends. Wage and capital income are measured in
$1000s and taken from IRS form 1040."

modelsummary(
  table2_models,
  coef_rename = table2_labels,
  coef_omit = "Intercept", # hide the constant
  # Regex of goodness-of-fit rows to drop from the bottom of the table.
  gof_omit = "R2|R2 Adj.|R2 Within|R2 Within Adj.|AIC|BIC|RMSE|Std",
  title = "Table 2: The Effect of Wealth on Health",
  notes = table2_note
)

Table 2: The Effect of Wealth on Health
	(1)	(2)	(3)
NOTE--Table reports regression coefficients with standard errors clustered by county in parentheses. The dependent variable in each regression is the average number of healthy days per month. Columns 2 and 3 include year fixed effects. Column 3 includes year and county fixed effects. All regressions are weighted by the number of tax returns in each county. Data cover 2009-15. Capital income includes interest income, capital gains, and dividends. Wage and capital income are measured in $1000s and taken from IRS form 1040.
Wages ($1000s)	0.029	0.030	0.005
	(0.002)	(0.002)	(0.003)
Capital Income ($1000s)	-0.011	-0.011	-0.007
	(0.008)	(0.009)	(0.003)
Num.Obs.	20256	20256	20111
FE: year		X	X
FE: CountyFIPS			X

Problem 8:

Adding county fixed effects controls for idiosyncratic, time-invariant features of each county that might affect residents’ health outcomes and would otherwise obscure the causal effect of wages and capital income that we are trying to estimate. For example, persistently wealthier counties may have better hospitals; if this were not accounted for, it would overstate the true causal effect of income on health. In this data specifically, the addition of county fixed effects decreases the wage coefficient from 0.030 to 0.005 and moves the capital income coefficient from -0.011 to -0.007 (both relative to Model 2). Both coefficients therefore shrink toward zero once county fixed effects are included. For wages, this is consistent with our prediction that persistent differences in wealth across counties would bias the estimate upwards, and that accounting for them helps address omitted variable bias.