PROBLEM 2. Data description and analysis based on the DAG

(i) Construct the datasets

# Load the housing data
housingdata <- readRDS("testdata20250121.RDS")

# Extract the day, month, and year
housingdata$sale_date <- as.integer(housingdata$sale_date)
housingdata$day <- 100 * ((housingdata$sale_date / 100) - round(housingdata$sale_date / 100))
housingdata$month <- 100 * (((housingdata$sale_date - housingdata$day) / 10000) - round((housingdata$sale_date - housingdata$day) / 10000))
housingdata$year <- round(housingdata$sale_date / 10000)

# Generate categorical variables by year built
housingdata$year_built_very_old <- ifelse(housingdata$year_built < 1960, 1, 0)
housingdata$year_built_quite_old <- ifelse(housingdata$year_built >= 1960 & housingdata$year_built < 1980, 1, 0)
housingdata$year_built_quite_new <- ifelse(housingdata$year_built >= 1980 & housingdata$year_built < 2000, 1, 0)
housingdata$year_built_very_new <- ifelse(housingdata$year_built >= 2000, 1, 0)
housingdata$year_built_grp <- 1 * housingdata$year_built_very_old + 2 * housingdata$year_built_quite_old + 3 * housingdata$year_built_quite_new + 4 * housingdata$year_built_very_new
housingdata$log_sale_amount <- log(housingdata$sale_amount)

# Load the HPI data
hpidata <- read_excel("hpi_po_us_and_census.xls")

# Clean the data to corresponding range
hpidata <- hpidata[hpidata$division != "USA", ]
hpidata <- hpidata[hpidata$year >= 2008, ]

# Re-adjust HPI according to the new base
hpi_base <- hpidata %>%
  group_by(division) %>%
  slice(1)
hpi_base$base <- hpi_base$index_po_seasonally_adjusted
hpi_base <- subset(hpi_base, select = c(division, base))
hpidata <- merge(hpidata, hpi_base, by = "division")
hpidata$hpi <- round(hpidata$index_po_seasonally_adjusted / hpidata$base * 100, digits = 2)
hpidata <- subset(hpidata, select = c(division, year, qtr, hpi))

# Convert the data to division level
df_abbr_to_division <- read_csv("df_abbr_to_division.csv", col_names = TRUE)

## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): abbr, division
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

housingdata <- merge(housingdata, df_abbr_to_division, by = "abbr")

housingdata$qtr1 <- ifelse(housingdata$month > 0 & housingdata$month <= 3, 1, 0)
housingdata$qtr2 <- ifelse(housingdata$month > 3 & housingdata$month <= 6, 1, 0)
housingdata$qtr3 <- ifelse(housingdata$month > 6 & housingdata$month <= 9, 1, 0)
housingdata$qtr4 <- ifelse(housingdata$month > 9 & housingdata$month <= 12, 1, 0)
housingdata$qtr <- 1 * housingdata$qtr1 + 2 * housingdata$qtr2 + 3 * housingdata$qtr3 + 4 * housingdata$qtr4

housingdata <- merge(housingdata, hpidata, by = c("division", "year", "qtr"))

housingdata$dlog_sale_amount <- log(housingdata$sale_amount * 100 / housingdata$hpi)

# Procedure to get the income and population data:
# 1. Go to https://apps.bea.gov/regional/downloadzip.htm.
# 2. Select "Personal income (state and local)", download "CAINC1".
# 3. Unzip the folder and import the first Excel spreadsheet.

# Import the income and population data
incomedata <- read_csv("CAINC1__ALL_AREAS_1969_2023.csv", col_names = TRUE)

## Rows: 9600 Columns: 57
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (56): GeoFIPS, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 197...
## dbl  (1): LineCode
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

populationdata <- incomedata[incomedata$LineCode == 2, ]
populationdata <- populationdata[-2]
populationdata$GeoFIPS <- gsub('"', '', populationdata$GeoFIPS)
populationdata$GeoFIPS <- as.integer(populationdata$GeoFIPS)
populationdata <- populationdata %>%
  pivot_longer(
    cols = -GeoFIPS,
    names_to = "year",
    values_to = "pop"
  )
populationdata$pop <- as.integer(populationdata$pop)

## Warning: NAs introduced by coercion

populationdata$fips_county_code <- populationdata$GeoFIPS

income <- incomedata[incomedata$LineCode == 3, ]
incomedata <- incomedata[-2]
incomedata$GeoFIPS <- gsub('"', '', incomedata$GeoFIPS)
incomedata$GeoFIPS <- as.integer(incomedata$GeoFIPS)
incomedata <- incomedata %>%
  pivot_longer(
    cols = -GeoFIPS,
    names_to = "year",
    values_to = "inc_per_cap"
  )
incomedata$inc_per_cap <- as.integer(incomedata$inc_per_cap)

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion to integer range

incomedata$fips_county_code <- incomedata$GeoFIPS

# Merge the data
housingincomedata <- merge(housingdata, incomedata, c("fips_county_code", "year"))
housingincomedata <- merge(housingincomedata, populationdata, c("fips_county_code", "year"))

(ii) Map the treatement variable: income per capita

mappingdata <- housingincomedata[housingincomedata$year == 2008, ] 

mappingdata <- mappingdata %>%
  group_by(abbr) %>%
  summarise(mean_inc_per_cap = mean(inc_per_cap))
mappingdata$state <- mappingdata$abbr
plot_usmap(data = mappingdata, values = "mean_inc_per_cap", color = "red") +
    labs(title = "Mean of income per capita, 2008") +
  scale_fill_continuous(name = "Mean of income per capita, 2008") +
  theme(legend.position = "bottom")

mappingdata <- housingincomedata[housingincomedata$year == 2019, ] 

mappingdata <- mappingdata %>%
  group_by(abbr) %>%
  summarise(mean_inc_per_cap = mean(inc_per_cap))
mappingdata$state <- mappingdata$abbr
plot_usmap(data = mappingdata, values = "mean_inc_per_cap", color = "red") +
    labs(title = "Mean of income per capita, 2019") +
  scale_fill_continuous(name = "Mean of income per capita, 2019") +
  theme(legend.position = "bottom")

# The income per capita generally increases over time all over the United States. CA has the highest income per capita, following FL, GA, IL, and, then other states.

(iii) Map the outcome variable: deflated log of sale amount

mappingdata <- housingincomedata[housingincomedata$year == 2008, ] 

mappingdata <- mappingdata %>%
  group_by(abbr) %>%
  summarise(mean_dlog_sale_amount = mean(dlog_sale_amount))
mappingdata$state <- mappingdata$abbr
plot_usmap(data = mappingdata, values = "mean_dlog_sale_amount", color = "red") +
    labs(title = "Mean of log of deflated sale amounts, 2008") +
  scale_fill_continuous(name = "Mean of log of deflated sale amounts, 2008") +
  theme(legend.position = "bottom")

mappingdata <- housingincomedata[housingincomedata$year == 2008, ] 

mappingdata <- mappingdata %>%
  group_by(abbr) %>%
  summarise(mean_dlog_sale_amount = mean(dlog_sale_amount))
mappingdata$state <- mappingdata$abbr
plot_usmap(data = mappingdata, values = "mean_dlog_sale_amount", color = "red") +
    labs(title = "Mean of log of deflated sale amounts, 2019") +
  scale_fill_continuous(name = "Mean of log of deflated sale amounts, 2019") +
  theme(legend.position = "bottom")

# But NY has the highest value in housing price. Connected to the income data, the residents in NY may suffer much more to afford their housing options compared to the other states.

(iv) Put the descriptive statistics into well-formated tables

modelvars <- c("dlog_sale_amount", "inc_per_cap", "pop")

sumstatdata <- subset(housingincomedata, select = modelvars)
stargazer(sumstatdata, type = "text", title = "Summary Statistics of housing characteristics of very old houses", digits = 2)

## 
## Summary Statistics of housing characteristics of very old houses
## =======================================================================
## Statistic            N         Mean        St. Dev.    Min      Max    
## -----------------------------------------------------------------------
## dlog_sale_amount 7,003,476     12.57         0.83      4.39    21.64   
## inc_per_cap      7,003,476 29,810,895.00 96,728,249.00 435  629,245,755
## pop              7,003,476 1,727,078.00  2,898,026.00  435  10,123,521 
## -----------------------------------------------------------------------

(v) Run the regressions and put the results into well-formatted tables

m1 <- feols(dlog_sale_amount ~ inc_per_cap, data = housingincomedata)
summary(m1)

## OLS estimation, Dep. Var.: dlog_sale_amount
## Observations: 7,003,476
## Standard-errors: IID 
##                 Estimate   Std. Error   t value  Pr(>|t|)    
## (Intercept) 1.251075e+01 3.201894e-04 39072.970 < 2.2e-16 ***
## inc_per_cap 1.820000e-09 3.160000e-12   575.814 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.809767   Adj. R2: 0.045202

m2 <- feols(dlog_sale_amount ~ inc_per_cap | fips_county_code + year, data = housingincomedata)
summary(m2)

## OLS estimation, Dep. Var.: dlog_sale_amount
## Observations: 7,003,476
## Fixed-effects: fips_county_code: 496,  year: 12
## Standard-errors: Clustered (fips_county_code) 
##             Estimate Std. Error t value Pr(>|t|)    
## inc_per_cap 1.74e-11   8.96e-12 1.94714 0.052083 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.629221     Adj. R2: 0.423461
##                  Within R2: 5.185e-6

m3 <- feols(dlog_sale_amount ~ inc_per_cap + pop, data = housingincomedata)
summary(m3)

## OLS estimation, Dep. Var.: dlog_sale_amount
## Observations: 7,003,476
## Standard-errors: IID 
##                 Estimate   Std. Error    t value  Pr(>|t|)    
## (Intercept) 1.238148e+01 3.383142e-04 36597.5917 < 2.2e-16 ***
## inc_per_cap 2.100000e-10 3.530000e-12    59.4013 < 2.2e-16 ***
## pop         1.026710e-07 1.177400e-10   871.9921 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.769092   Adj. R2: 0.138713

m4 <- feols(dlog_sale_amount ~ inc_per_cap + pop | fips_county_code + year, data = housingincomedata)
summary(m4)

## OLS estimation, Dep. Var.: dlog_sale_amount
## Observations: 7,003,476
## Fixed-effects: fips_county_code: 496,  year: 12
## Standard-errors: Clustered (fips_county_code) 
##                 Estimate   Std. Error  t value   Pr(>|t|)    
## inc_per_cap 3.010000e-12 4.530000e-12 0.665499 5.0604e-01    
## pop         5.123006e-07 1.208807e-07 4.238067 2.6899e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.628901     Adj. R2: 0.424047
##                  Within R2: 0.001022

tab <- etable(m1, m2, m3, m4, se.below = T)
tab

##                                m1               m2               m3
## Dependent Var.:  dlog_sale_amount dlog_sale_amount dlog_sale_amount
##                                                                    
## Constant              12.51***                          12.38***   
##                       (0.0003)                          (0.0003)   
## inc_per_cap            1.82e-9***        1.74e-11        2.1e-10***
##                       (3.16e-12)        (8.96e-12)      (3.53e-12) 
## pop                                                      1.03e-7***
##                                                         (1.18e-10) 
## Fixed-Effects:   ---------------- ---------------- ----------------
## fips_county_code               No              Yes               No
## year                           No              Yes               No
## ________________ ________________ ________________ ________________
## S.E. type                     IID by: fips_count..              IID
## Observations            7,003,476        7,003,476        7,003,476
## R2                        0.04520          0.42350          0.13871
## Within R2                      --          5.19e-6               --
## 
##                                m4
## Dependent Var.:  dlog_sale_amount
##                                  
## Constant                         
##                                  
## inc_per_cap            3.01e-12  
##                       (4.53e-12) 
## pop                    5.12e-7***
##                       (1.21e-7)  
## Fixed-Effects:   ----------------
## fips_county_code              Yes
## year                          Yes
## ________________ ________________
## S.E. type        by: fips_count..
## Observations            7,003,476
## R2                        0.42409
## Within R2                 0.00102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# The average income per capita in the county does increase the housing price, either controlling for the population or not. But with two-way fixed effects, such an increase disappears.

EEFE 530, Spring 2025

Problem Set 2

Timothy (Yuan) Zhuang

20 February 2025

PROBLEM 1. Research question and DAG

(i) Research question

(ii) DAG

(iii) Causal paths

(iv) Confounder(s) and/or collider(s)

(v) Write down the equation to estimate

PROBLEM 2. Data description and analysis based on the DAG

(i) Construct the datasets

(ii) Map the treatement variable: income per capita

(iii) Map the outcome variable: deflated log of sale amount

(iv) Put the descriptive statistics into well-formated tables

(v) Run the regressions and put the results into well-formatted tables