# Load packages
install.packages("pacman")
library(pacman)
p_load(tidyverse, lubridate, usmap, gridExtra, stringr, readxl, plot3D,
cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor)
# Load housing data
setwd("/Users/gracegao/Desktop/EEFE530/PS1")
housingdata <- readRDS("testdata20250121.RDS")
Problem 1
## i
housingdata <- housingdata %>%
mutate(sale_date = ymd(sale_date),
year = year(sale_date),
month = month(sale_date),
day = day(sale_date))
head(housingdata %>% select(sale_date, year, month, day), 10)
## ii
summary(housingdata$year)
summary(housingdata$year_built)
ggplot(housingdata, aes(x = year)) + geom_histogram(binwidth = 1)
ggplot(housingdata, aes(x = year_built)) + geom_histogram(binwidth = 10)
quantiles <- quantile(housingdata$year_built, probs = c(0, 0.25, 0.5, 0.75, 1))
housingdata <- housingdata %>%
mutate(year_built_grp = cut(year_built,
breaks = quantiles,
labels = c("Very Old", "Quite Old", "Quite New", "Very New"),
include.lowest = TRUE))
table(housingdata$year_built_grp)
## iii
housingdata <- housingdata %>%
mutate(log_sale_amount = log(sale_amount))
scatter3D(x = housingdata$log_sale_amount,
y = as.numeric(housingdata$year_built_grp),
z = housingdata$land_square_footage,
xlab = "Log Sale Amount",
ylab = "Year Built Group",
zlab = "Land Square Footage",
pch = 19,
colvar = as.numeric(housingdata$year_built_grp),
colkey = list(at = 1:4, labels = levels(housingdata$year_built_grp)),
main = "3D Plot of Log Sale Amount, Year Built Group, and Land Square Footage")
## iv
## I'll drop the unreasonable outliers in the dataset. Not sure if having 100 bedrooms is possible, but I suppose there might be some very large houses (something like a hotel) that might reasonably have 100 bedrooms so I'll keep those for now.
summary(housingdata$log_sale_amount)
summary(housingdata$land_square_footage)
summary(housingdata$bedrooms_all_buildings)
summary(housingdata$number_of_bathrooms)
summary(housingdata$number_of_fireplaces)
summary(housingdata$stories_number)
library(dplyr)
percentile_99_land_square_footage <- quantile(housingdata$land_square_footage, 0.99, na.rm = TRUE)
percentile_99_fireplace <- quantile(housingdata$number_of_fireplaces, 0.99, na.rm = TRUE)
cleanhousingdata <- housingdata %>%
filter(land_square_footage >= 120 &
land_square_footage < percentile_99_land_square_footage &
bedrooms_all_buildings < 100 &
number_of_bathrooms >= 1 &
number_of_bathrooms <= 100 &
number_of_fireplaces < 99 &
stories_number < 175)
Problem 2
knitr::opts_chunk$set(echo = TRUE)
setwd("/Users/gracegao/Desktop/EEFE530/PS1")
hpidata <- read_excel("hpi_po_us_and_census.xls")
summary(hpidata)
table(hpidata$division)
## i
cleanhpidata <- hpidata %>%
filter(division != "USA")
index_100 <- cleanhpidata %>%
filter(index_po_seasonally_adjusted == 100)
print(index_100)
start_year <- min(housingdata$year, na.rm = TRUE)
cleanhpidata <- cleanhpidata %>%
group_by(division) %>%
mutate(base_index = index_po_seasonally_adjusted[year == start_year & qtr == 1],
hpi = (index_po_seasonally_adjusted / base_index) * 100) %>%
ungroup()
head(cleanhpidata)
## ii
cleanhousingdata <- cleanhousingdata %>%
mutate(qtr = ceiling(month / 3))
cleanhousingdata <- cleanhousingdata %>%
mutate(division = case_when(
abbr %in% c("CT", "ME", "MA", "NH", "RI", "VT") ~ "DV_NE",
abbr %in% c("NJ", "NY", "PA") ~ "DV_MA",
abbr %in% c("IL", "IN", "MI", "OH", "WI") ~ "DV_ENC",
abbr %in% c("IA", "KS", "MN", "MO", "NE", "ND", "SD") ~ "DV_WNC",
abbr %in% c("AL", "AR", "FL", "GA", "KY", "LA", "MS", "NC", "SC", "TN", "VA", "WV") ~ "DV_SA",
abbr %in% c("DE", "DC", "MD", "WV", "VA") ~ "DV_ESC",
abbr %in% c("AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY") ~ "DV_MT",
abbr %in% c("AK", "CA", "HI", "OR", "WA") ~ "DV_PAC",
abbr %in% c("OK", "TX") ~ "DV_WSC"
))
summary(cleanhousingdata)
cleanhousinghpidata <- merge(cleanhousingdata, cleanhpidata, by = c("division", "year", "qtr"))
cleanhousinghpidata <- cleanhousinghpidata %>%
mutate(dlog_sale_amount = log(sale_amount * 100 / hpi))
Problem 3
## i
setwd("/Users/gracegao/Desktop/EEFE530/PS1")
crimedata <- read_excel("table1.xls", range = "A5:V24", col_names = c("year", "population", "violent_crime", "violent_crime_rate", "murder", "murder_rate", "rape_revised", "rape_revised_rate", "rape", "rape_rate", "robbery", "robbery_rate", "assult", "assult_rate", "property_crime", "property_crime_rate", "burglary", "burglary_rate", "larceny_theft", "larceny_theft_rate", "vehicle_theft", "vehicle_theft_rate"))
## ii & iii
crimedata$year <- ifelse(crimedata$year == 20015, 2001, crimedata$year)
crimedata$year <- ifelse(crimedata$year == 20186, 2018, crimedata$year)
start_year <- min(cleanhousinghpidata$year)
end_year <- max(cleanhousinghpidata$year)
cleancrimedata <- crimedata %>%
filter(year >= start_year & year <= end_year)
cleancrimedata <- crimedata %>% select(-rape_revised, -rape_revised_rate)
## iv
cleanhousinghpicrimedata <- merge(cleancrimedata, cleanhousinghpidata, by = c("year"))
Problem 4
## i
vars <- c("sale_amount", "dlog_sale_amount", "year_built", "violent_crime_rate", "murder_rate", "rape_rate", "robbery_rate", "assult_rate", "property_crime_rate", "burglary_rate", "larceny_theft_rate", "vehicle_theft_rate", "bedrooms_all_buildings", "number_of_bathrooms", "number_of_fireplaces", "stories_number", "land_square_footage", "AC_presence")
summary1 <- cleanhousinghpicrimedata %>%
select(all_of(vars)) %>%
summary()
stargazer(
cleanhousinghpicrimedata %>%
select(all_of(vars)),
type = "text",
digit = 2,
out = "summary1.txt"
)
## ii
summary_year_built_grp <- cleanhousinghpicrimedata %>%
group_by(year_built_grp) %>%
summarise(across(all_of(vars), list(mean = mean, sd = sd, min = min, max = max, median = median, IQR = IQR), .names = "{col}_{fn}"))
print(summary_year_built_grp)
## iii
summary_division <- cleanhousinghpicrimedata %>%
group_by(division) %>%
summarise(across(all_of(vars), list(mean = mean, sd = sd, min = min, max = max, median = median, IQR = IQR), .names = "{col}_{fn}"))
print(summary_division)
## iv
cleanhousinghpicrimedata <- cleanhousinghpicrimedata %>%
mutate(region = case_when(
division %in% c("DV_NE", "DV_MA") ~ "Northeast",
division %in% c("DV_ENC", "DV_WNC") ~ "Midwest",
division %in% c("DV_SA", "DV_ESC", "DV_WSC") ~ "South",
division %in% c("DV_MT", "DV_PAC") ~ "West"
))
summary_region <- cleanhousinghpicrimedata %>%
group_by(region) %>%
summarise(across(all_of(vars), list(mean = mean, sd = sd, min = min, max = max, median = median, IQR = IQR), .names = "{col}_{fn}"))
print(summary_region)
Problem 5
## i
reg <- dlog_sale_amount ~ violent_crime_rate + murder_rate + rape_rate + robbery_rate + assult_rate + property_crime_rate + burglary_rate + larceny_theft_rate + vehicle_theft_rate + year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence
hedonic <- lm(reg, data = cleanhousinghpicrimedata)
summary(hedonic)
## ii
library(sandwich)
library(lmtest)
hedonic_robust <- lm(reg, data = cleanhousinghpicrimedata)
coeftest(hedonic_robust, vcov = vcovHC(hedonic_robust, type = "HC1"))
library(plm)
pdata <- pdata.frame(cleanhousinghpicrimedata, index = c("division", "year"))
reg_fe <- dlog_sale_amount ~ violent_crime_rate + murder_rate + rape_rate + robbery_rate + assult_rate + property_crime_rate + burglary_rate + larceny_theft_rate + vehicle_theft_rate + year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence
hedonic_fe <- plm(reg_fe, data = pdata, model = "within")
summary(hedonic_fe)
library(stargazer)
stargazer(
hedonic, hedonic_robust, hedonic_fe,
type = "text",
title = "Hedonic Regression Models",
column.labels = c("OLS", "Robust OLS", "Fixed Effects"),
digits = 2,
out = "hedonic_models.txt"
)
### My preferred model is the fixed effect model
## iii
regions <- unique(cleanhousinghpicrimedata$region)
models_fe <- list()
for (region in regions) {
pdata_region <- pdata[pdata$region == region, ]
model_fe <- plm(reg, data = pdata_region, model = "within")
models_fe[[region]] <- model_fe
effect <- coef(model_fe)["violent_crime_rate"]
effects_df <- rbind(effects_df, data.frame(region = region, effect = effect))
}
stargazer(
models_fe[[1]], models_fe[[2]], models_fe[[3]], models_fe[[4]],
type = "text",
title = "Fixed Effects Regression Models by Region",
column.labels = regions,
digits = 2,
out = "fixed_effects_models_by_region.txt"
)
## iv
print(effects_df)
### Sorry i counldn't make the map work.