# Load packages
install.packages("pacman")
library(pacman)
p_load(tidyverse, lubridate, usmap, gridExtra, stringr, readxl, plot3D,  
       cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,  
       foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont, 
       kableExtra, snakecase, janitor)

# Load housing data
setwd("/Users/gracegao/Desktop/EEFE530/PS1")
housingdata <- readRDS("testdata20250121.RDS")

Problem 1

## i
housingdata <- housingdata %>%
  mutate(sale_date = ymd(sale_date),
         year = year(sale_date),
         month = month(sale_date),
         day = day(sale_date))

head(housingdata %>% select(sale_date, year, month, day), 10)

## ii
summary(housingdata$year)
summary(housingdata$year_built)

ggplot(housingdata, aes(x = year)) + geom_histogram(binwidth = 1)
ggplot(housingdata, aes(x = year_built)) + geom_histogram(binwidth = 10)

quantiles <- quantile(housingdata$year_built, probs = c(0, 0.25, 0.5, 0.75, 1))

housingdata <- housingdata %>%
  mutate(year_built_grp = cut(year_built, 
                              breaks = quantiles, 
                              labels = c("Very Old", "Quite Old", "Quite New", "Very New"), 
                              include.lowest = TRUE))
table(housingdata$year_built_grp)

## iii
housingdata <- housingdata %>%
  mutate(log_sale_amount = log(sale_amount))

scatter3D(x = housingdata$log_sale_amount, 
          y = as.numeric(housingdata$year_built_grp), 
          z = housingdata$land_square_footage,
          xlab = "Log Sale Amount", 
          ylab = "Year Built Group", 
          zlab = "Land Square Footage",
          pch = 19, 
          colvar = as.numeric(housingdata$year_built_grp),
          colkey = list(at = 1:4, labels = levels(housingdata$year_built_grp)),
          main = "3D Plot of Log Sale Amount, Year Built Group, and Land Square Footage")

## iv
## I'll drop the unreasonable outliers in the dataset. Not sure if having 100 bedrooms is possible, but I suppose there might be some very large houses (something like a hotel) that might reasonably have 100 bedrooms so I'll keep those for now.
summary(housingdata$log_sale_amount)
summary(housingdata$land_square_footage)
summary(housingdata$bedrooms_all_buildings)
summary(housingdata$number_of_bathrooms)
summary(housingdata$number_of_fireplaces)
summary(housingdata$stories_number)

library(dplyr)
percentile_99_land_square_footage <- quantile(housingdata$land_square_footage, 0.99, na.rm = TRUE)
percentile_99_fireplace <- quantile(housingdata$number_of_fireplaces, 0.99, na.rm = TRUE)
cleanhousingdata <- housingdata %>%
  filter(land_square_footage >= 120 &
         land_square_footage < percentile_99_land_square_footage &
         bedrooms_all_buildings < 100 &
         number_of_bathrooms >= 1 &
         number_of_bathrooms <= 100 &
         number_of_fireplaces < 99 &
         stories_number < 175)

Problem 2

knitr::opts_chunk$set(echo = TRUE)
setwd("/Users/gracegao/Desktop/EEFE530/PS1")
hpidata <- read_excel("hpi_po_us_and_census.xls")
summary(hpidata)
table(hpidata$division)

## i
cleanhpidata <- hpidata %>%
  filter(division != "USA")

index_100 <- cleanhpidata %>%
  filter(index_po_seasonally_adjusted == 100)
print(index_100)

start_year <- min(housingdata$year, na.rm = TRUE)

cleanhpidata <- cleanhpidata %>%
  group_by(division) %>%
  mutate(base_index = index_po_seasonally_adjusted[year == start_year & qtr == 1],
         hpi = (index_po_seasonally_adjusted / base_index) * 100) %>%
  ungroup()

head(cleanhpidata)

## ii
cleanhousingdata <- cleanhousingdata %>%
  mutate(qtr = ceiling(month / 3))

cleanhousingdata <- cleanhousingdata %>%
  mutate(division = case_when(
    abbr %in% c("CT", "ME", "MA", "NH", "RI", "VT") ~ "DV_NE",
    abbr %in% c("NJ", "NY", "PA") ~ "DV_MA",
    abbr %in% c("IL", "IN", "MI", "OH", "WI") ~ "DV_ENC",
    abbr %in% c("IA", "KS", "MN", "MO", "NE", "ND", "SD") ~ "DV_WNC",
    abbr %in% c("AL", "AR", "FL", "GA", "KY", "LA", "MS", "NC", "SC", "TN", "VA", "WV") ~ "DV_SA",
    abbr %in% c("DE", "DC", "MD", "WV", "VA") ~ "DV_ESC",
    abbr %in% c("AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY") ~ "DV_MT",
    abbr %in% c("AK", "CA", "HI", "OR", "WA") ~ "DV_PAC",
    abbr %in% c("OK", "TX") ~ "DV_WSC"
  ))
summary(cleanhousingdata)

cleanhousinghpidata <- merge(cleanhousingdata, cleanhpidata, by = c("division", "year", "qtr"))

cleanhousinghpidata <- cleanhousinghpidata %>%
  mutate(dlog_sale_amount = log(sale_amount * 100 / hpi))

Problem 3

## i
setwd("/Users/gracegao/Desktop/EEFE530/PS1")
crimedata <- read_excel("table1.xls", range = "A5:V24", col_names = c("year", "population", "violent_crime", "violent_crime_rate", "murder", "murder_rate", "rape_revised", "rape_revised_rate", "rape", "rape_rate", "robbery", "robbery_rate", "assult", "assult_rate", "property_crime", "property_crime_rate", "burglary", "burglary_rate", "larceny_theft", "larceny_theft_rate", "vehicle_theft", "vehicle_theft_rate"))

## ii & iii
crimedata$year <- ifelse(crimedata$year == 20015, 2001, crimedata$year)
crimedata$year <- ifelse(crimedata$year == 20186, 2018, crimedata$year)

start_year <- min(cleanhousinghpidata$year)
end_year <- max(cleanhousinghpidata$year)
cleancrimedata <- crimedata %>%
  filter(year >= start_year & year <= end_year)
cleancrimedata <- crimedata %>% select(-rape_revised, -rape_revised_rate)

## iv
cleanhousinghpicrimedata <- merge(cleancrimedata, cleanhousinghpidata, by = c("year"))

Problem 4

## i
vars <- c("sale_amount", "dlog_sale_amount", "year_built", "violent_crime_rate", "murder_rate", "rape_rate", "robbery_rate", "assult_rate", "property_crime_rate", "burglary_rate", "larceny_theft_rate", "vehicle_theft_rate", "bedrooms_all_buildings", "number_of_bathrooms", "number_of_fireplaces", "stories_number", "land_square_footage", "AC_presence")

summary1 <- cleanhousinghpicrimedata %>%
  select(all_of(vars)) %>%
  summary()

stargazer(
  cleanhousinghpicrimedata %>%
  select(all_of(vars)),
  type = "text",
  digit = 2,
  out = "summary1.txt"
)

## ii
summary_year_built_grp <- cleanhousinghpicrimedata %>%
  group_by(year_built_grp) %>%
  summarise(across(all_of(vars), list(mean = mean, sd = sd, min = min, max = max, median = median, IQR = IQR), .names = "{col}_{fn}"))

print(summary_year_built_grp)
 
## iii
summary_division <- cleanhousinghpicrimedata %>%
  group_by(division) %>%
  summarise(across(all_of(vars), list(mean = mean, sd = sd, min = min, max = max, median = median, IQR = IQR), .names = "{col}_{fn}"))

print(summary_division)
 
## iv
cleanhousinghpicrimedata <- cleanhousinghpicrimedata %>%
  mutate(region = case_when(
    division %in% c("DV_NE", "DV_MA") ~ "Northeast",
    division %in% c("DV_ENC", "DV_WNC") ~ "Midwest",
    division %in% c("DV_SA", "DV_ESC", "DV_WSC") ~ "South",
    division %in% c("DV_MT", "DV_PAC") ~ "West"
  ))

summary_region <- cleanhousinghpicrimedata %>%
  group_by(region) %>%
  summarise(across(all_of(vars), list(mean = mean, sd = sd, min = min, max = max, median = median, IQR = IQR), .names = "{col}_{fn}"))

print(summary_region)

Problem 5

## i 
reg <- dlog_sale_amount ~ violent_crime_rate + murder_rate + rape_rate + robbery_rate + assult_rate + property_crime_rate + burglary_rate + larceny_theft_rate + vehicle_theft_rate + year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence

hedonic <- lm(reg, data = cleanhousinghpicrimedata)

summary(hedonic)

## ii
library(sandwich)
library(lmtest)

hedonic_robust <- lm(reg, data = cleanhousinghpicrimedata)
coeftest(hedonic_robust, vcov = vcovHC(hedonic_robust, type = "HC1"))

library(plm)
pdata <- pdata.frame(cleanhousinghpicrimedata, index = c("division", "year"))
reg_fe <- dlog_sale_amount ~ violent_crime_rate + murder_rate + rape_rate + robbery_rate + assult_rate + property_crime_rate + burglary_rate + larceny_theft_rate + vehicle_theft_rate + year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence

hedonic_fe <- plm(reg_fe, data = pdata, model = "within")

summary(hedonic_fe)

library(stargazer)
stargazer(
  hedonic, hedonic_robust, hedonic_fe,
  type = "text",
  title = "Hedonic Regression Models",
  column.labels = c("OLS", "Robust OLS", "Fixed Effects"),
  digits = 2,
  out = "hedonic_models.txt"
)

### My preferred model is the fixed effect model

## iii
regions <- unique(cleanhousinghpicrimedata$region)
models_fe <- list()

for (region in regions) {
  pdata_region <- pdata[pdata$region == region, ]
  model_fe <- plm(reg, data = pdata_region, model = "within")
  models_fe[[region]] <- model_fe
  effect <- coef(model_fe)["violent_crime_rate"]
  effects_df <- rbind(effects_df, data.frame(region = region, effect = effect))
}

stargazer(
  models_fe[[1]], models_fe[[2]], models_fe[[3]], models_fe[[4]],
  type = "text",
  title = "Fixed Effects Regression Models by Region",
  column.labels = regions,
  digits = 2,
  out = "fixed_effects_models_by_region.txt"
)

## iv
print(effects_df)
### Sorry i counldn't make the map work.