# Does the average income per capita of the county impact the housing price?
dagify(
Inc ~ Pop,
P ~ Inc + Pop
) %>%
ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges_arc() +
geom_dag_text() +
theme_dag()
\[ (1) Pop \rightarrow P \] \[ (2) Pop \rightarrow Inc \rightarrow P \]
# The POPULATION (POP) is a CONFOUNDER, as it impact both the INCOME PER CAPITA (INC) and the HOUSING PRICE (P). If a factor impact both the explanatory variable and outcome variable, it is a CONFOUNDER.
\[ log(P_{it}) = IncomeperCapita_{t}\beta_{1} + Population_{t}\beta_{2} + \gamma_{s} + \delta_{t} + \epsilon_{it} \]
# Load the housing data
housingdata <- readRDS("testdata20250121.RDS")
# Extract the day, month, and year
housingdata$sale_date <- as.integer(housingdata$sale_date)
housingdata$day <- 100 * ((housingdata$sale_date / 100) - round(housingdata$sale_date / 100))
housingdata$month <- 100 * (((housingdata$sale_date - housingdata$day) / 10000) - round((housingdata$sale_date - housingdata$day) / 10000))
housingdata$year <- round(housingdata$sale_date / 10000)
# Generate categorical variables by year built
housingdata$year_built_very_old <- ifelse(housingdata$year_built < 1960, 1, 0)
housingdata$year_built_quite_old <- ifelse(housingdata$year_built >= 1960 & housingdata$year_built < 1980, 1, 0)
housingdata$year_built_quite_new <- ifelse(housingdata$year_built >= 1980 & housingdata$year_built < 2000, 1, 0)
housingdata$year_built_very_new <- ifelse(housingdata$year_built >= 2000, 1, 0)
housingdata$year_built_grp <- 1 * housingdata$year_built_very_old + 2 * housingdata$year_built_quite_old + 3 * housingdata$year_built_quite_new + 4 * housingdata$year_built_very_new
housingdata$log_sale_amount <- log(housingdata$sale_amount)
# Load the HPI data
hpidata <- read_excel("hpi_po_us_and_census.xls")
# Clean the data to corresponding range
hpidata <- hpidata[hpidata$division != "USA", ]
hpidata <- hpidata[hpidata$year >= 2008, ]
# Re-adjust HPI according to the new base
hpi_base <- hpidata %>%
group_by(division) %>%
slice(1)
hpi_base$base <- hpi_base$index_po_seasonally_adjusted
hpi_base <- subset(hpi_base, select = c(division, base))
hpidata <- merge(hpidata, hpi_base, by = "division")
hpidata$hpi <- round(hpidata$index_po_seasonally_adjusted / hpidata$base * 100, digits = 2)
hpidata <- subset(hpidata, select = c(division, year, qtr, hpi))
# Convert the data to division level
df_abbr_to_division <- read_csv("df_abbr_to_division.csv", col_names = TRUE)
## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): abbr, division
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
housingdata <- merge(housingdata, df_abbr_to_division, by = "abbr")
housingdata$qtr1 <- ifelse(housingdata$month > 0 & housingdata$month <= 3, 1, 0)
housingdata$qtr2 <- ifelse(housingdata$month > 3 & housingdata$month <= 6, 1, 0)
housingdata$qtr3 <- ifelse(housingdata$month > 6 & housingdata$month <= 9, 1, 0)
housingdata$qtr4 <- ifelse(housingdata$month > 9 & housingdata$month <= 12, 1, 0)
housingdata$qtr <- 1 * housingdata$qtr1 + 2 * housingdata$qtr2 + 3 * housingdata$qtr3 + 4 * housingdata$qtr4
housingdata <- merge(housingdata, hpidata, by = c("division", "year", "qtr"))
housingdata$dlog_sale_amount <- log(housingdata$sale_amount * 100 / housingdata$hpi)
# Procedure to get the income and population data:
# 1. Go to https://apps.bea.gov/regional/downloadzip.htm.
# 2. Select "Personal income (state and local)", download "CAINC1".
# 3. Unzip the folder and import the first Excel spreadsheet.
# Import the income and population data
incomedata <- read_csv("CAINC1__ALL_AREAS_1969_2023.csv", col_names = TRUE)
## Rows: 9600 Columns: 57
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (56): GeoFIPS, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 197...
## dbl (1): LineCode
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
populationdata <- incomedata[incomedata$LineCode == 2, ]
populationdata <- populationdata[-2]
populationdata$GeoFIPS <- gsub('"', '', populationdata$GeoFIPS)
populationdata$GeoFIPS <- as.integer(populationdata$GeoFIPS)
populationdata <- populationdata %>%
pivot_longer(
cols = -GeoFIPS,
names_to = "year",
values_to = "pop"
)
populationdata$pop <- as.integer(populationdata$pop)
## Warning: NAs introduced by coercion
populationdata$fips_county_code <- populationdata$GeoFIPS
income <- incomedata[incomedata$LineCode == 3, ]
incomedata <- incomedata[-2]
incomedata$GeoFIPS <- gsub('"', '', incomedata$GeoFIPS)
incomedata$GeoFIPS <- as.integer(incomedata$GeoFIPS)
incomedata <- incomedata %>%
pivot_longer(
cols = -GeoFIPS,
names_to = "year",
values_to = "inc_per_cap"
)
incomedata$inc_per_cap <- as.integer(incomedata$inc_per_cap)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion to integer range
incomedata$fips_county_code <- incomedata$GeoFIPS
# Merge the data
housingincomedata <- merge(housingdata, incomedata, c("fips_county_code", "year"))
housingincomedata <- merge(housingincomedata, populationdata, c("fips_county_code", "year"))
mappingdata <- housingincomedata[housingincomedata$year == 2008, ]
mappingdata <- mappingdata %>%
group_by(abbr) %>%
summarise(mean_inc_per_cap = mean(inc_per_cap))
mappingdata$state <- mappingdata$abbr
plot_usmap(data = mappingdata, values = "mean_inc_per_cap", color = "red") +
labs(title = "Mean of income per capita, 2008") +
scale_fill_continuous(name = "Mean of income per capita, 2008") +
theme(legend.position = "bottom")
mappingdata <- housingincomedata[housingincomedata$year == 2019, ]
mappingdata <- mappingdata %>%
group_by(abbr) %>%
summarise(mean_inc_per_cap = mean(inc_per_cap))
mappingdata$state <- mappingdata$abbr
plot_usmap(data = mappingdata, values = "mean_inc_per_cap", color = "red") +
labs(title = "Mean of income per capita, 2019") +
scale_fill_continuous(name = "Mean of income per capita, 2019") +
theme(legend.position = "bottom")
# The income per capita generally increases over time all over the United States. CA has the highest income per capita, following FL, GA, IL, and, then other states.
mappingdata <- housingincomedata[housingincomedata$year == 2008, ]
mappingdata <- mappingdata %>%
group_by(abbr) %>%
summarise(mean_dlog_sale_amount = mean(dlog_sale_amount))
mappingdata$state <- mappingdata$abbr
plot_usmap(data = mappingdata, values = "mean_dlog_sale_amount", color = "red") +
labs(title = "Mean of log of deflated sale amounts, 2008") +
scale_fill_continuous(name = "Mean of log of deflated sale amounts, 2008") +
theme(legend.position = "bottom")
mappingdata <- housingincomedata[housingincomedata$year == 2008, ]
mappingdata <- mappingdata %>%
group_by(abbr) %>%
summarise(mean_dlog_sale_amount = mean(dlog_sale_amount))
mappingdata$state <- mappingdata$abbr
plot_usmap(data = mappingdata, values = "mean_dlog_sale_amount", color = "red") +
labs(title = "Mean of log of deflated sale amounts, 2019") +
scale_fill_continuous(name = "Mean of log of deflated sale amounts, 2019") +
theme(legend.position = "bottom")
# But NY has the highest value in housing price. Connected to the income data, the residents in NY may suffer much more to afford their housing options compared to the other states.
modelvars <- c("dlog_sale_amount", "inc_per_cap", "pop")
sumstatdata <- subset(housingincomedata, select = modelvars)
stargazer(sumstatdata, type = "text", title = "Summary Statistics of housing characteristics of very old houses", digits = 2)
##
## Summary Statistics of housing characteristics of very old houses
## =======================================================================
## Statistic N Mean St. Dev. Min Max
## -----------------------------------------------------------------------
## dlog_sale_amount 7,003,476 12.57 0.83 4.39 21.64
## inc_per_cap 7,003,476 29,810,895.00 96,728,249.00 435 629,245,755
## pop 7,003,476 1,727,078.00 2,898,026.00 435 10,123,521
## -----------------------------------------------------------------------
m1 <- feols(dlog_sale_amount ~ inc_per_cap, data = housingincomedata)
summary(m1)
## OLS estimation, Dep. Var.: dlog_sale_amount
## Observations: 7,003,476
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.251075e+01 3.201894e-04 39072.970 < 2.2e-16 ***
## inc_per_cap 1.820000e-09 3.160000e-12 575.814 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.809767 Adj. R2: 0.045202
m2 <- feols(dlog_sale_amount ~ inc_per_cap | fips_county_code + year, data = housingincomedata)
summary(m2)
## OLS estimation, Dep. Var.: dlog_sale_amount
## Observations: 7,003,476
## Fixed-effects: fips_county_code: 496, year: 12
## Standard-errors: Clustered (fips_county_code)
## Estimate Std. Error t value Pr(>|t|)
## inc_per_cap 1.74e-11 8.96e-12 1.94714 0.052083 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.629221 Adj. R2: 0.423461
## Within R2: 5.185e-6
m3 <- feols(dlog_sale_amount ~ inc_per_cap + pop, data = housingincomedata)
summary(m3)
## OLS estimation, Dep. Var.: dlog_sale_amount
## Observations: 7,003,476
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.238148e+01 3.383142e-04 36597.5917 < 2.2e-16 ***
## inc_per_cap 2.100000e-10 3.530000e-12 59.4013 < 2.2e-16 ***
## pop 1.026710e-07 1.177400e-10 871.9921 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.769092 Adj. R2: 0.138713
m4 <- feols(dlog_sale_amount ~ inc_per_cap + pop | fips_county_code + year, data = housingincomedata)
summary(m4)
## OLS estimation, Dep. Var.: dlog_sale_amount
## Observations: 7,003,476
## Fixed-effects: fips_county_code: 496, year: 12
## Standard-errors: Clustered (fips_county_code)
## Estimate Std. Error t value Pr(>|t|)
## inc_per_cap 3.010000e-12 4.530000e-12 0.665499 5.0604e-01
## pop 5.123006e-07 1.208807e-07 4.238067 2.6899e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.628901 Adj. R2: 0.424047
## Within R2: 0.001022
tab <- etable(m1, m2, m3, m4, se.below = T)
tab
## m1 m2 m3
## Dependent Var.: dlog_sale_amount dlog_sale_amount dlog_sale_amount
##
## Constant 12.51*** 12.38***
## (0.0003) (0.0003)
## inc_per_cap 1.82e-9*** 1.74e-11 2.1e-10***
## (3.16e-12) (8.96e-12) (3.53e-12)
## pop 1.03e-7***
## (1.18e-10)
## Fixed-Effects: ---------------- ---------------- ----------------
## fips_county_code No Yes No
## year No Yes No
## ________________ ________________ ________________ ________________
## S.E. type IID by: fips_count.. IID
## Observations 7,003,476 7,003,476 7,003,476
## R2 0.04520 0.42350 0.13871
## Within R2 -- 5.19e-6 --
##
## m4
## Dependent Var.: dlog_sale_amount
##
## Constant
##
## inc_per_cap 3.01e-12
## (4.53e-12)
## pop 5.12e-7***
## (1.21e-7)
## Fixed-Effects: ----------------
## fips_county_code Yes
## year Yes
## ________________ ________________
## S.E. type by: fips_count..
## Observations 7,003,476
## R2 0.42409
## Within R2 0.00102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The average income per capita in the county does increase the housing price, either controlling for the population or not. But with two-way fixed effects, such an increase disappears.