setwd("C:/Users/Lenovo/OneDrive - The Pennsylvania State University/Penn State/Coursework/Sem IV/EEFE 530/Problem Sets/Problem Set 1")
#Libraries
# Load packages
library(pacman)
p_load(tidyverse, lubridate, usmap, gridExtra, stringr, readxl, plot3D,
cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor)
library(dplyr)
#Load Data
housingdata <- readRDS("testdata20250121.RDS")
#PROBLEM 1
##Part i - Date Extraction
#Checking variable type
str(housingdata$sale_date) # character
## chr [1:2350548] "20150129" "20090904" "20130228" "20080825" "20140225" ...
head(housingdata$sale_date) # View first few values
## [1] "20150129" "20090904" "20130228" "20080825" "20140225" "20180111"
#create a copy
housingdata_wip <- housingdata
#Change the string to YYY-MM-DD format
housingdata_wip <- housingdata_wip %>%
mutate(sale_date = ymd(sale_date))
#Separate columns
housingdata_wip <- housingdata_wip %>%
mutate(
year = year(sale_date),
month = month(sale_date),
day = day(sale_date)
)
#Displaying first 10 rows with extracted dates
housingdata_wip %>%
select(sale_date, year, month, day) %>%
head(10)
## sale_date year month day
## <Date> <int> <int> <int>
## 1: 2015-01-29 2015 1 29
## 2: 2009-09-04 2009 9 4
## 3: 2013-02-28 2013 2 28
## 4: 2008-08-25 2008 8 25
## 5: 2014-02-25 2014 2 25
## 6: 2018-01-11 2018 1 11
## 7: 2015-04-08 2015 4 8
## 8: 2015-06-23 2015 6 23
## 9: 2009-05-28 2009 5 28
## 10: 2012-01-31 2012 1 31
#Summarizing Year
summary(housingdata_wip$year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2008 2012 2015 2015 2017 2019
hist(housingdata_wip$year)
#Summarizing Year Built
summary(housingdata_wip$year_built)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1700 1965 1986 1981 2001 2019
hist(housingdata_wip$year_built)
#Generating Categorical Variable for Year Built
housingdata_wip <- housingdata_wip %>%
mutate(year_built_grp = case_when(
year_built < 1965 ~ 1, # Very old houses = based on first quantile
year_built >= 1965 & year_built < 1986 ~ 2, # Quite old houses = based on median
year_built >= 1986 & year_built < 2001 ~ 3, # Quite new houses = based on third quantile
year_built >= 2001 ~ 4 # Very new houses
))
#Defining categorical labels for `year_built_grp`
housingdata_wip <- housingdata_wip %>%
mutate(
log_sale_amount = log(sale_amount),
log_land_square_footage = log(land_square_footage),
year_built_grp_label = case_when(
year_built_grp == 1 ~ "1: Very Old (<1965)",
year_built_grp == 2 ~ "2: Quite Old (1965-1986)",
year_built_grp == 3 ~ "3: Quite New (1986-2001)",
year_built_grp == 4 ~ "4: Very New (≥2001)"
)
)
#Displaying first few rows
head(housingdata_wip %>% select(year_built, year_built_grp), 10)
## year_built year_built_grp
## <int> <num>
## 1: 2009 4
## 2: 1986 3
## 3: 2006 4
## 4: 2000 3
## 5: 1940 1
## 6: 2006 4
## 7: 1999 3
## 8: 1998 3
## 9: 2007 4
## 10: 1994 3
#Creating logged variables for continuous values
housingdata_wip <- housingdata_wip %>%
mutate(
log_sale_amount = log(sale_amount),
log_land_square_footage = log(land_square_footage)
)
#Check NAs
sum(is.na(housingdata_wip$log_land_square_footage)) #zero
## [1] 0
sum(is.na(housingdata_wip$log_sale_amount)) #zero
## [1] 0
#3D scatter Plot
library(plot3D)
#Defining colours for categories
colors <- c("darkblue", "skyblue", "yellow", "brown")
color_mapping <- colors[as.numeric(housingdata_wip$year_built_grp)]
#Plot
plot_3d <- scatter3D(
x = housingdata_wip$log_sale_amount,
y = housingdata_wip$year_built_grp,
z = housingdata_wip$log_land_square_footage,
colvar = NULL,
col = color_mapping,
pch = 19,
cex = 1.2,
theta = 40, phi = 20,
main = "3D Plot of Log Sale Amount, Year Built Group, and Log Land Square Footage",
xlab = "Log Sale Amount",
ylab = "Year Built Group",
zlab = "Log Land Square Footage",
ticktype = "detailed"
)
plot_3d
## [,1] [,2] [,3] [,4]
## [1,] 8.947906e-02 -0.02567950 0.07055385 -0.07055385
## [2,] 4.285251e-01 0.17466842 -0.47989754 0.47989754
## [3,] -3.798445e-18 0.09068963 0.03300832 -0.03300832
## [4,] -2.249423e+00 -1.03825888 -2.80326238 3.80326238
#Legend
par(new = TRUE)
legend("topright",
legend = c("1: Very Old (<1965)", "2: Quite Old (1965-1986)",
"3: Quite New (1986-2001)", "4: Very New (≥2001)"),
col = colors, pch = 19, cex = 1.2, bty = "n")
#Summarising numerical variables
summary_stats <- housingdata_wip %>% summary(across(where(is.numeric)))
print(summary_stats)
## sale_amount sale_date x y
## Min. :1.000e+02 Min. :2008-01-01 Min. :-124.57 Min. :26.32
## 1st Qu.:1.800e+05 1st Qu.:2012-06-13 1st Qu.:-118.32 1st Qu.:33.55
## Median :2.870e+05 Median :2015-07-28 Median :-104.77 Median :34.37
## Mean :4.417e+05 Mean :2015-01-19 Mean :-102.81 Mean :35.88
## 3rd Qu.:4.740e+05 3rd Qu.:2017-12-13 3rd Qu.: -84.53 3rd Qu.:38.56
## Max. :2.730e+09 Max. :2019-12-31 Max. : -73.91 Max. :48.97
## fips_county_code year_built bedrooms_all_buildings number_of_bathrooms
## Min. : 1101 Min. :1700 Min. : 1.000 Min. : 0.010
## 1st Qu.: 6081 1st Qu.:1965 1st Qu.: 3.000 1st Qu.: 2.000
## Median :12091 Median :1986 Median : 3.000 Median : 2.000
## Mean :20436 Mean :1981 Mean : 3.431 Mean : 2.472
## 3rd Qu.:40087 3rd Qu.:2001 3rd Qu.: 4.000 3rd Qu.: 3.000
## Max. :53071 Max. :2019 Max. :999.000 Max. :450.000
## number_of_fireplaces stories_number land_square_footage abbr
## Min. : 1.00 Min. : 0.500 Min. : 1 Length:2350548
## 1st Qu.: 1.00 1st Qu.: 1.000 1st Qu.: 6874 Class :character
## Median : 1.00 Median : 1.000 Median : 9583 Mode :character
## Mean : 1.19 Mean : 1.371 Mean : 39671
## 3rd Qu.: 1.00 3rd Qu.: 2.000 3rd Qu.: 17031
## Max. :999.00 Max. :175.000 Max. :999999999
## AC_presence year month day
## Min. :0.0000 Min. :2008 Min. : 1.000 Min. : 1.00
## 1st Qu.:0.0000 1st Qu.:2012 1st Qu.: 4.000 1st Qu.: 9.00
## Median :1.0000 Median :2015 Median : 7.000 Median :17.00
## Mean :0.6091 Mean :2015 Mean : 6.662 Mean :16.76
## 3rd Qu.:1.0000 3rd Qu.:2017 3rd Qu.: 9.000 3rd Qu.:25.00
## Max. :1.0000 Max. :2019 Max. :12.000 Max. :31.00
## year_built_grp log_sale_amount log_land_square_footage year_built_grp_label
## Min. :1.000 Min. : 4.605 Min. : 0.000 Length:2350548
## 1st Qu.:2.000 1st Qu.:12.101 1st Qu.: 8.836 Class :character
## Median :3.000 Median :12.567 Median : 9.168 Mode :character
## Mean :2.525 Mean :12.595 Mean : 9.386
## 3rd Qu.:4.000 3rd Qu.:13.069 3rd Qu.: 9.743
## Max. :4.000 Max. :21.727 Max. :20.723
The minimum bathroom is 0.010 and stories 0.50 which seems invalid, therefore, I will remove those.
#Filtering values
housingdata_wip_clean <- housingdata_wip %>%
filter(number_of_bathrooms >= 1, stories_number >= 1)
Some of the outliers that are odd are: number of bedrooms and fireplaces being 999, number of bathrooms being 450, number of stories being 175. I will check the values of these variables above their 90th, 95th, and 99.99th percentile and if they are far from mean and median, I will remove them on account of being errors
###90th Percentile
colnames(housingdata_wip)
## [1] "sale_amount" "sale_date"
## [3] "x" "y"
## [5] "fips_county_code" "year_built"
## [7] "bedrooms_all_buildings" "number_of_bathrooms"
## [9] "number_of_fireplaces" "stories_number"
## [11] "land_square_footage" "abbr"
## [13] "AC_presence" "year"
## [15] "month" "day"
## [17] "year_built_grp" "log_sale_amount"
## [19] "log_land_square_footage" "year_built_grp_label"
#90th Percentile
##List of variables for which we want the 90th percentile
vars_to_check <- c("bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number")
##Compute the 90th percentile for each variable
percentile_90 <- housingdata_wip %>%
summarise(across(all_of(vars_to_check),
~ quantile(.x, 0.90, na.rm = TRUE)))
##Print the results
print(percentile_90)
## bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## 1 4 3.5 2
## stories_number
## 1 2
###95th Percentile
#95th Percentile
##Computing the 95th percentile for each variable
percentile_95 <- housingdata_wip %>%
summarise(across(all_of(vars_to_check),
~ quantile(.x, 0.95, na.rm = TRUE)))
# Print the results
print(percentile_95)
## bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## 1 5 4 2
## stories_number
## 1 2
###99.99th Percentile
#95th Percentile
##Computing the 99.99th percentile for each variable
percentile_9999 <- housingdata_wip %>%
summarise(across(all_of(vars_to_check),
~ quantile(.x, 0.9999, na.rm = TRUE)))
# Print the results
print(percentile_9999)
## bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## 1 14 11 99
## stories_number
## 1 7
Since all the values seem reasonable up until 99.9th Percentile, I will remove entries above them.
#Computing the 99.99th percentile for each variable and storing it in new columns
housingdata_wip_clean <- housingdata_wip_clean %>%
mutate(
bedrooms_99 = quantile(bedrooms_all_buildings, 0.9999, na.rm = TRUE),
bathrooms_99 = quantile(number_of_bathrooms, 0.9999, na.rm = TRUE),
fireplaces_99 = quantile(number_of_fireplaces, 0.9999, na.rm = TRUE),
stories_99 = quantile(stories_number, 0.9999, na.rm = TRUE)
)
#Remove rows where values exceed 99.99th percentile
housingdata_wip_clean <- housingdata_wip_clean %>%
filter(
bedrooms_all_buildings < bedrooms_99,
number_of_bathrooms < bathrooms_99,
number_of_fireplaces < fireplaces_99,
stories_number < stories_99
)
#Summary of clean data
summary(housingdata_wip_clean)
## sale_amount sale_date x y
## Min. :1.000e+02 Min. :2008-01-01 Min. :-124.57 Min. :26.32
## 1st Qu.:1.800e+05 1st Qu.:2012-06-13 1st Qu.:-118.32 1st Qu.:33.54
## Median :2.875e+05 Median :2015-07-28 Median :-104.77 Median :34.37
## Mean :4.411e+05 Mean :2015-01-18 Mean :-102.81 Mean :35.88
## 3rd Qu.:4.742e+05 3rd Qu.:2017-12-13 3rd Qu.: -84.53 3rd Qu.:38.55
## Max. :2.730e+09 Max. :2019-12-31 Max. : -73.91 Max. :48.97
## fips_county_code year_built bedrooms_all_buildings number_of_bathrooms
## Min. : 1101 Min. :1700 Min. : 1.000 Min. : 1.000
## 1st Qu.: 6081 1st Qu.:1965 1st Qu.: 3.000 1st Qu.: 2.000
## Median :12091 Median :1986 Median : 3.000 Median : 2.000
## Mean :20437 Mean :1981 Mean : 3.427 Mean : 2.471
## 3rd Qu.:40087 3rd Qu.:2001 3rd Qu.: 4.000 3rd Qu.: 3.000
## Max. :53071 Max. :2019 Max. :13.000 Max. :10.750
## number_of_fireplaces stories_number land_square_footage abbr
## Min. : 1.000 Min. :1.000 Min. : 1 Length:2348446
## 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.: 6874 Class :character
## Median : 1.000 Median :1.000 Median : 9583 Mode :character
## Mean : 1.119 Mean :1.371 Mean : 39600
## 3rd Qu.: 1.000 3rd Qu.:2.000 3rd Qu.: 17000
## Max. :93.000 Max. :6.500 Max. :999999999
## AC_presence year month day
## Min. :0.0000 Min. :2008 Min. : 1.000 Min. : 1.00
## 1st Qu.:0.0000 1st Qu.:2012 1st Qu.: 4.000 1st Qu.: 9.00
## Median :1.0000 Median :2015 Median : 7.000 Median :17.00
## Mean :0.6092 Mean :2015 Mean : 6.662 Mean :16.76
## 3rd Qu.:1.0000 3rd Qu.:2017 3rd Qu.: 9.000 3rd Qu.:25.00
## Max. :1.0000 Max. :2019 Max. :12.000 Max. :31.00
## year_built_grp log_sale_amount log_land_square_footage year_built_grp_label
## Min. :1.000 Min. : 4.605 Min. : 0.000 Length:2348446
## 1st Qu.:2.000 1st Qu.:12.101 1st Qu.: 8.836 Class :character
## Median :3.000 Median :12.569 Median : 9.168 Mode :character
## Mean :2.526 Mean :12.595 Mean : 9.385
## 3rd Qu.:4.000 3rd Qu.:13.069 3rd Qu.: 9.741
## Max. :4.000 Max. :21.727 Max. :20.723
## bedrooms_99 bathrooms_99 fireplaces_99 stories_99
## Min. :14 Min. :11 Min. :99 Min. :7
## 1st Qu.:14 1st Qu.:11 1st Qu.:99 1st Qu.:7
## Median :14 Median :11 Median :99 Median :7
## Mean :14 Mean :11 Mean :99 Mean :7
## 3rd Qu.:14 3rd Qu.:11 3rd Qu.:99 3rd Qu.:7
## Max. :14 Max. :11 Max. :99 Max. :7
#Note: cleanhousingdata is called housingdata_wip_clean here
#PROBLEM 2
###Secondary Data
#Calling data
secondary_data <- read.csv("hpi_po_us_and_census.csv")
###Create quarter in Primary Dataset
#Generating quarters based on month
housingdata_wip_clean <- housingdata_wip_clean %>%
mutate(qtr = case_when(
month %in% c(1, 2, 3) ~ 1,
month %in% c(4, 5, 6) ~ 2,
month %in% c(7, 8, 9) ~ 3,
month %in% c(10, 11, 12) ~ 4
))
###Create division abbreviations
#State-to-division mapping using the correct abbreviations
state_division_map <- tibble(
abbr = c("CT", "ME", "MA", "NH", "RI", "VT", "NY", "NJ", "PA", # Mid Atlantic
"IL", "IN", "MI", "OH", "WI", # East North Central
"IA", "KS", "MN", "MO", "NE", "ND", "SD", # West North Central
"DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV", # South Atlantic
"AL", "KY", "MS", "TN", # East South Central
"AR", "LA", "OK", "TX", # West South Central
"AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY", # Mountain
"AK", "CA", "HI", "OR", "WA"), # Pacific
division = c(rep("DV_MA", 9), rep("DV_ENC", 5), rep("DV_WNC", 7),
rep("DV_SA", 9), rep("DV_ESC", 4), rep("DV_WSC", 4),
rep("DV_MT", 8), rep("DV_PAC", 5))
)
#Merging division info into `housingdata_wip_clean`
housingdata_wip_clean <- housingdata_wip_clean %>%
left_join(state_division_map, by = "abbr")
###Rescaling
# Step 1: Create `cleanhpidata` and Remove "USA" division
cleanhpidata <- secondary_data %>%
filter(division != "USA")
# Step 2: Identify the First Available Year in House Sales Data
start_year <- min(housingdata_wip$year, na.rm = TRUE)
# Step 3: Correctly Extract Base HPI for Each Division in `qtr = 1` of `start_year`
base_hpi_values <- cleanhpidata %>%
group_by(division) %>%
filter(year == start_year, qtr == 1) %>%
summarise(base_hpi = first(index_po_seasonally_adjusted), .groups = "drop")
# Step 4: Merge Base HPI Values with `cleanhpidata`
cleanhpidata <- cleanhpidata %>%
left_join(base_hpi_values, by = "division") %>%
mutate(hpi = (index_po_seasonally_adjusted / base_hpi) * 100)
# Step 5: Ensure `hpi = 100` for `qtr = 1` of `start_year`
cleanhpidata <- cleanhpidata %>%
mutate(hpi = if_else(year == start_year & qtr == 1, 100, hpi))
# Step 6: Restrict `cleanhpidata` to the Range of Years in `housingdata_wip`
year_range <- range(housingdata_wip$year, na.rm = TRUE)
cleanhpidata <- cleanhpidata %>%
filter(year >= year_range[1], year <= year_range[2])
# Step 7: Check Results
summary(cleanhpidata$hpi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 71.87 91.09 97.26 100.50 109.20 146.23
head(cleanhpidata %>% select(division, year, qtr, index_po_seasonally_adjusted, base_hpi, hpi))
## division year qtr index_po_seasonally_adjusted base_hpi hpi
## 1 DV_ENC 2008 1 179.93 179.93 100.00000
## 2 DV_ENC 2008 2 175.95 179.93 97.78803
## 3 DV_ENC 2008 3 172.76 179.93 96.01512
## 4 DV_ENC 2008 4 168.17 179.93 93.46412
## 5 DV_ENC 2009 1 169.15 179.93 94.00878
## 6 DV_ENC 2009 2 166.74 179.93 92.66937
#Merge the cleaned housing data and cleaned index data
housingdata_cleanhpidata <- housingdata_wip_clean %>%
left_join(cleanhpidata, by = c("division", "year", "qtr"))
#Summary
summary(housingdata_cleanhpidata$hpi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 71.87 89.60 102.45 104.75 120.16 146.23
# Computing the deflated house prices
housingdata_cleanhpidata <- housingdata_cleanhpidata %>%
mutate(dlog_sale_amount = log((sale_amount * 100) / hpi))
#Summary
summary(housingdata_cleanhpidata$dlog_sale_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.39 12.07 12.53 12.57 13.03 21.64
# Load necessary libraries
library(dplyr)
library(ggplot2)
library(usmap)
# Step 1: Creating data for mapping
state_year_avg <- housingdata_cleanhpidata %>%
group_by(abbr, year) %>%
summarise(mean_dlog_sale = mean(dlog_sale_amount, na.rm = TRUE)) %>%
ungroup() %>%
filter(!is.na(year))
## `summarise()` has grouped output by 'abbr'. You can override using the
## `.groups` argument.
# Step 2: Adjusting abbreviations for state names
state_year_avg <- state_year_avg %>%
mutate(state = state.name[match(abbr, state.abb)])
# Step 3: Plotting the Data
plot_usmap(data = state_year_avg, values = "mean_dlog_sale", include = state_year_avg$state) +
facet_wrap(~ year) +
scale_fill_continuous(
name = "Mean dlog_sale_amount",
low = "blue",
high = "red",
na.value = "white"
) +
labs(
title = "Mean dlog_sale_amount by State and Year",
subtitle = "Each facet represents a year",
caption = "Data Source: housingdata_cleanhpidata"
) +
theme_minimal() +
theme(legend.position = "right")
library(stringr)
#Loading crime data
crimedata <- read_excel("table-1.xls", skip = 3, col_names = TRUE)
## New names:
## • `` -> `...23`
## • `` -> `...24`
#Checking column names
colnames(crimedata)
## [1] "Year"
## [2] "Population1"
## [3] "Violent\ncrime2"
## [4] "Violent \ncrime \nrate"
## [5] "Murder and\nnonnegligent \nmanslaughter"
## [6] "Murder and \nnonnegligent \nmanslaughter \nrate"
## [7] "Rape\n(revised \ndefinition)3"
## [8] "Rape\n(revised \ndefinition) \nrate3"
## [9] "Rape\n(legacy \ndefinition)4"
## [10] "Rape\n(legacy \ndefinition) \nrate4"
## [11] "Robbery"
## [12] "Robbery \nrate"
## [13] "Aggravated \nassault"
## [14] "Aggravated \nassault rate"
## [15] "Property \ncrime"
## [16] "Property \ncrime \nrate"
## [17] "Burglary"
## [18] "Burglary \nrate"
## [19] "Larceny-\ntheft"
## [20] "Larceny-\ntheft rate"
## [21] "Motor \nvehicle \ntheft"
## [22] "Motor \nvehicle \ntheft \nrate"
## [23] "...23"
## [24] "...24"
#drop last two vars
crimedata <- crimedata %>% select(-c(...23, ...24))
###Rename variables
crimedata <- crimedata %>%
rename(
year = "Year",
population = "Population1",
violent_crime = "Violent\ncrime2",
violent_crime_rate = "Violent \ncrime \nrate",
murder = "Murder and\nnonnegligent \nmanslaughter",
murder_rate = "Murder and \nnonnegligent \nmanslaughter \nrate",
rape_revised = "Rape\n(revised \ndefinition)3",
rape_revised_rate = "Rape\n(revised \ndefinition) \nrate3",
rape_legacy = "Rape\n(legacy \ndefinition)4",
rape_legacy_rate = "Rape\n(legacy \ndefinition) \nrate4",
robbery = "Robbery",
robbery_rate = "Robbery \nrate",
aggravated_assault = "Aggravated \nassault",
aggravated_assault_rate = "Aggravated \nassault rate",
property_crime = "Property \ncrime",
property_crime_rate = "Property \ncrime \nrate",
burglary = "Burglary",
burglary_rate = "Burglary \nrate",
larceny_theft = "Larceny-\ntheft",
larceny_theft_rate = "Larceny-\ntheft rate",
motor_vehicle_theft = "Motor \nvehicle \ntheft",
motor_vehicle_theft_rate = "Motor \nvehicle \ntheft \nrate"
)
# Check the renamed dataset
colnames(crimedata)
## [1] "year" "population"
## [3] "violent_crime" "violent_crime_rate"
## [5] "murder" "murder_rate"
## [7] "rape_revised" "rape_revised_rate"
## [9] "rape_legacy" "rape_legacy_rate"
## [11] "robbery" "robbery_rate"
## [13] "aggravated_assault" "aggravated_assault_rate"
## [15] "property_crime" "property_crime_rate"
## [17] "burglary" "burglary_rate"
## [19] "larceny_theft" "larceny_theft_rate"
## [21] "motor_vehicle_theft" "motor_vehicle_theft_rate"
###Fix footnotes
crimedata$year[which(crimedata$year == 20015)] <- 2001
crimedata$year[which(crimedata$year == 20186)] <- 2018
###Drop rows with notes
#Dropping last 7 rows with footnotes
crimedata <- crimedata %>%
slice(1:(n() - 7))
#Cleaned dataset
tail(crimedata)
## # A tibble: 6 × 22
## year population violent_crime violent_crime_rate murder murder_rate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2014 318907401 1153022 362. 14164 4.4
## 2 2015 320896618 1199310 374. 15883 4.9
## 3 2016 323405935 1250162 387. 17413 5.4
## 4 2017 325147121 1247917 384. 17294 5.3
## 5 2018 326687501 1209997 370. 16374 5
## 6 2019 328239523 1203808 367. 16425 5
## # ℹ 16 more variables: rape_revised <dbl>, rape_revised_rate <dbl>,
## # rape_legacy <dbl>, rape_legacy_rate <dbl>, robbery <dbl>,
## # robbery_rate <dbl>, aggravated_assault <dbl>,
## # aggravated_assault_rate <dbl>, property_crime <dbl>,
## # property_crime_rate <dbl>, burglary <dbl>, burglary_rate <dbl>,
## # larceny_theft <dbl>, larceny_theft_rate <dbl>, motor_vehicle_theft <dbl>,
## # motor_vehicle_theft_rate <dbl>
#Limiting the data to years given in housingdata_cleanhpidata
range(housingdata_cleanhpidata$year)
## [1] 2008 2019
crimedata <- crimedata %>% filter(year %in% 2008:2019)
###Find Missing Variables
#Identifying columns with missing values
missing_vars <- crimedata %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "missing_count") %>%
filter(missing_count > 0)
#Printing missing variables and their missing value counts
print(missing_vars)
## # A tibble: 2 × 2
## variable missing_count
## <chr> <int>
## 1 rape_revised 5
## 2 rape_revised_rate 5
Removed “rape_revised” and “rape_revised_rate” as they had missing values (NAs).
###Clean Crime Data
cleancrimedata <- crimedata %>% select(-c(rape_revised, rape_revised_rate))
#Check NAs
sum(is.na(cleancrimedata)) #Zero Missing Values
## [1] 0
crime_vars <- colnames(crimedata)
housing_vars <- colnames(housingdata_cleanhpidata)
#common variables
common_vars <- intersect(crime_vars, housing_vars)
common_vars
## [1] "year"
housingdata_cleanhpidata$year <- as.numeric(housingdata_cleanhpidata$year)
cleancrimedata$year <- as.numeric(cleancrimedata$year)
#Merge
cleanhousinghpicrimedata <- left_join(housingdata_cleanhpidata, cleancrimedata, by = "year")
###Plot Crime Rates and Year
# Define the variables to plot
crime_vars <- c("violent_crime_rate", "murder_rate", "rape_legacy_rate", "robbery_rate",
"aggravated_assault_rate", "property_crime_rate", "burglary_rate",
"larceny_theft_rate", "motor_vehicle_theft_rate")
# Convert dataset to long format
crime_long <- cleanhousinghpicrimedata %>%
select(year, all_of(crime_vars)) %>%
pivot_longer(cols = -year, names_to = "crime_type", values_to = "value")
# Plot
ggplot(crime_long, aes(x = year, y = value, color = crime_type)) +
geom_line(size = 1) +
scale_x_continuous(breaks = seq(2008, 2019, by = 1)) +
labs(title = "Crime Trends Over the Years",
x = "Year",
y = "Crime Count / Rate",
color = "Crime Type") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
The plot suggests that most crime rates are well below 1000 per 100,000,
however, property crimes and larceny thefts are two highest crime rates.
They seem to have decreased substantiatlly over the years but are still
higher than other crime rates. Burglary rate is the third highest crime,
but still declining.
###Plot deflated sale amount by year for different housing groups This plot takes a very long time to produce (havent successfully ran a single time even though waited for an hour or so), that’s why I am skipping plotting this. However, code is given below:
# Scatter plot with smoothed trend lines
#ggplot(cleanhousinghpicrimedata, aes(x = year, y = dlog_sale_amount, color = as.factor(year_built_grp))) + geom_point(alpha = 0.6, size = 2) + geom_smooth(method = "loess", se = FALSE, size = 1) + scale_x_continuous(breaks = seq(2008, 2019, by = 1)) + labs(title = "Deflated Sale Amount Over the Years by Year Built Group", x = "Year", y = "Log Deflated Sale Amount", color = "Year Built Group") + theme_minimal()
#PROBLEM 4
#Selecting relevant variables from housing data
vars_to_summarize <- c("sale_amount", "dlog_sale_amount", "year_built",
"bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number",
"log_sale_amount", "log_land_square_footage",
"violent_crime_rate", "murder_rate", "rape_legacy_rate",
"robbery_rate", "aggravated_assault_rate", "property_crime_rate",
"burglary_rate", "larceny_theft_rate", "motor_vehicle_theft_rate")
#Summary Table
stargazer(cleanhousinghpicrimedata %>% select(all_of(vars_to_summarize)),
type = "text", title = "Summary Statistics", digits = 2)
##
## Summary Statistics
## ====================================================================================
## Statistic N Mean St. Dev. Min Max
## ------------------------------------------------------------------------------------
## sale_amount 2,348,446 441,056.80 2,116,736.00 100.00 2,729,784,889.00
## dlog_sale_amount 2,348,446 12.57 0.83 4.39 21.64
## year_built 2,348,446 1,981.46 24.51 1,700 2,019
## bedrooms_all_buildings 2,348,446 3.43 0.88 1 13
## number_of_bathrooms 2,348,446 2.47 0.90 1.00 10.75
## number_of_fireplaces 2,348,446 1.12 0.41 1 93
## stories_number 2,348,446 1.37 0.49 1.00 6.50
## log_sale_amount 2,348,446 12.59 0.83 4.61 21.73
## log_land_square_footage 2,348,446 9.39 1.02 0.00 20.72
## violent_crime_rate 2,348,446 384.74 23.94 361.60 458.60
## murder_rate 2,348,446 4.95 0.31 4.40 5.40
## rape_legacy_rate 2,348,446 28.89 1.68 25.90 31.00
## robbery_rate 2,348,446 104.21 16.48 81.60 145.90
## aggravated_assault_rate 2,348,446 246.65 11.38 229.20 277.50
## property_crime_rate 2,348,446 2,561.03 324.64 2,109.90 3,214.60
## burglary_rate 2,348,446 524.23 134.87 340.50 733.00
## larceny_theft_rate 2,348,446 1,802.36 178.57 1,549.50 2,166.10
## motor_vehicle_theft_rate 2,348,446 234.45 21.63 215.40 315.40
## ------------------------------------------------------------------------------------
#Housing characters variables
housing_vars <- c("sale_amount", "dlog_sale_amount", "year_built",
"bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number",
"log_sale_amount", "log_land_square_footage")
#Summary by year group
summary_by_year_built_grp_df <- cleanhousinghpicrimedata %>%
group_by(year_built_grp) %>%
summarise(across(all_of(vars_to_summarize),
list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
.names = "{.col}.{.fn}")) %>% # Use "." instead of "_"
pivot_longer(cols = -year_built_grp,
names_to = c("Variable", "Statistic"),
names_pattern = "(.*)\\.(.*)") %>% # Extract proper names
mutate(Value = round(as.numeric(value), 2)) %>%
select(-value) %>% # Drop old value column
pivot_wider(names_from = year_built_grp, values_from = Value) # Wide format
#Summary Stats table using kable
kable(summary_by_year_built_grp_df, format = "html", align = "c", caption = "Summary Statistics by Year Built Group") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| Variable | Statistic | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| sale_amount | Mean | 564319.73 | 405752.40 | 3.779475e+05 | 420230.51 |
| sale_amount | Median | 360000.00 | 255257.18 | 2.740000e+05 | 291000.00 |
| sale_amount | SD | 1313575.06 | 1320834.23 | 3.728872e+06 | 1070924.04 |
| sale_amount | Min | 100.00 | 100.00 | 1.000000e+02 | 100.00 |
| sale_amount | Max | 600000000.00 | 132019663.00 | 2.729785e+09 | 288164568.00 |
| dlog_sale_amount | Mean | 12.73 | 12.48 | 1.250000e+01 | 12.56 |
| dlog_sale_amount | Median | 12.82 | 12.41 | 1.248000e+01 | 12.52 |
| dlog_sale_amount | SD | 1.02 | 0.83 | 6.900000e-01 | 0.71 |
| dlog_sale_amount | Min | 4.39 | 4.39 | 4.390000e+00 | 4.39 |
| dlog_sale_amount | Max | 20.00 | 18.60 | 2.164000e+01 | 19.21 |
| year_built | Mean | 1946.48 | 1976.41 | 1.993190e+03 | 2007.65 |
| year_built | Median | 1952.00 | 1977.00 | 1.993000e+03 | 2006.00 |
| year_built | SD | 16.58 | 5.77 | 4.490000e+00 | 5.12 |
| year_built | Min | 1700.00 | 1965.00 | 1.986000e+03 | 2001.00 |
| year_built | Max | 1964.00 | 1985.00 | 2.000000e+03 | 2019.00 |
| bedrooms_all_buildings | Mean | 3.18 | 3.31 | 3.480000e+00 | 3.72 |
| bedrooms_all_buildings | Median | 3.00 | 3.00 | 3.000000e+00 | 4.00 |
| bedrooms_all_buildings | SD | 0.90 | 0.82 | 8.100000e-01 | 0.89 |
| bedrooms_all_buildings | Min | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| bedrooms_all_buildings | Max | 13.00 | 13.00 | 1.300000e+01 | 13.00 |
| number_of_bathrooms | Mean | 2.03 | 2.29 | 2.660000e+00 | 2.88 |
| number_of_bathrooms | Median | 2.00 | 2.00 | 2.500000e+00 | 2.50 |
| number_of_bathrooms | SD | 0.87 | 0.70 | 8.300000e-01 | 0.94 |
| number_of_bathrooms | Min | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| number_of_bathrooms | Max | 10.50 | 10.00 | 1.075000e+01 | 10.50 |
| number_of_fireplaces | Mean | 1.10 | 1.11 | 1.130000e+00 | 1.14 |
| number_of_fireplaces | Median | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| number_of_fireplaces | SD | 0.37 | 0.41 | 4.200000e-01 | 0.44 |
| number_of_fireplaces | Min | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| number_of_fireplaces | Max | 16.00 | 93.00 | 1.900000e+01 | 30.00 |
| stories_number | Mean | 1.16 | 1.30 | 1.490000e+00 | 1.53 |
| stories_number | Median | 1.00 | 1.00 | 1.500000e+00 | 1.50 |
| stories_number | SD | 0.36 | 0.47 | 5.000000e-01 | 0.50 |
| stories_number | Min | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| stories_number | Max | 6.00 | 6.50 | 6.250000e+00 | 6.50 |
| log_sale_amount | Mean | 12.74 | 12.50 | 1.253000e+01 | 12.61 |
| log_sale_amount | Median | 12.79 | 12.45 | 1.252000e+01 | 12.58 |
| log_sale_amount | SD | 1.03 | 0.83 | 7.000000e-01 | 0.72 |
| log_sale_amount | Min | 4.61 | 4.61 | 4.610000e+00 | 4.61 |
| log_sale_amount | Max | 20.21 | 18.70 | 2.173000e+01 | 19.48 |
| log_land_square_footage | Mean | 9.22 | 9.48 | 9.460000e+00 | 9.38 |
| log_land_square_footage | Median | 9.00 | 9.25 | 9.250000e+00 | 9.16 |
| log_land_square_footage | SD | 0.82 | 1.11 | 1.050000e+00 | 1.05 |
| log_land_square_footage | Min | 0.00 | 0.00 | 0.000000e+00 | 0.00 |
| log_land_square_footage | Max | 20.72 | 20.72 | 2.072000e+01 | 20.72 |
| violent_crime_rate | Mean | 385.60 | 384.21 | 3.838900e+02 | 385.23 |
| violent_crime_rate | Median | 383.80 | 373.70 | 3.737000e+02 | 383.80 |
| violent_crime_rate | SD | 24.33 | 23.33 | 2.321000e+01 | 24.77 |
| violent_crime_rate | Min | 361.60 | 361.60 | 3.616000e+02 | 361.60 |
| violent_crime_rate | Max | 458.60 | 458.60 | 4.586000e+02 | 458.60 |
| murder_rate | Mean | 4.94 | 4.95 | 4.950000e+00 | 4.96 |
| murder_rate | Median | 5.00 | 5.00 | 5.000000e+00 | 5.00 |
| murder_rate | SD | 0.31 | 0.31 | 3.100000e-01 | 0.31 |
| murder_rate | Min | 4.40 | 4.40 | 4.400000e+00 | 4.40 |
| murder_rate | Max | 5.40 | 5.40 | 5.400000e+00 | 5.40 |
| rape_legacy_rate | Mean | 28.82 | 28.89 | 2.890000e+01 | 28.94 |
| rape_legacy_rate | Median | 29.10 | 29.80 | 2.980000e+01 | 29.80 |
| rape_legacy_rate | SD | 1.68 | 1.69 | 1.690000e+00 | 1.67 |
| rape_legacy_rate | Min | 25.90 | 25.90 | 2.590000e+01 | 25.90 |
| rape_legacy_rate | Max | 31.00 | 31.00 | 3.100000e+01 | 31.00 |
| robbery_rate | Mean | 105.04 | 103.89 | 1.036400e+02 | 104.28 |
| robbery_rate | Median | 102.20 | 102.20 | 1.022000e+02 | 102.20 |
| robbery_rate | SD | 16.59 | 16.21 | 1.614000e+01 | 16.90 |
| robbery_rate | Min | 81.60 | 81.60 | 8.160000e+01 | 81.60 |
| robbery_rate | Max | 145.90 | 145.90 | 1.459000e+02 | 145.90 |
| aggravated_assault_rate | Mean | 246.76 | 246.44 | 2.463500e+02 | 247.01 |
| aggravated_assault_rate | Median | 248.30 | 248.30 | 2.482000e+02 | 248.30 |
| aggravated_assault_rate | SD | 11.52 | 11.16 | 1.116000e+01 | 11.62 |
| aggravated_assault_rate | Min | 229.20 | 229.20 | 2.292000e+02 | 229.20 |
| aggravated_assault_rate | Max | 277.50 | 277.50 | 2.775000e+02 | 277.50 |
| property_crime_rate | Mean | 2579.99 | 2555.66 | 2.549940e+03 | 2559.02 |
| property_crime_rate | Median | 2500.50 | 2500.50 | 2.500500e+03 | 2500.50 |
| property_crime_rate | SD | 327.07 | 321.51 | 3.199300e+02 | 328.92 |
| property_crime_rate | Min | 2109.90 | 2109.90 | 2.109900e+03 | 2109.90 |
| property_crime_rate | Max | 3214.60 | 3214.60 | 3.214600e+03 | 3214.60 |
| burglary_rate | Mean | 532.61 | 522.30 | 5.197200e+02 | 522.57 |
| burglary_rate | Median | 494.70 | 494.70 | 4.947000e+02 | 494.70 |
| burglary_rate | SD | 136.22 | 134.12 | 1.332600e+02 | 135.48 |
| burglary_rate | Min | 340.50 | 340.50 | 3.405000e+02 | 340.50 |
| burglary_rate | Max | 733.00 | 733.00 | 7.330000e+02 | 733.00 |
| larceny_theft_rate | Mean | 1812.57 | 1799.41 | 1.796380e+03 | 1801.33 |
| larceny_theft_rate | Median | 1783.60 | 1783.60 | 1.783600e+03 | 1783.60 |
| larceny_theft_rate | SD | 179.59 | 176.83 | 1.760900e+02 | 181.15 |
| larceny_theft_rate | Min | 1549.50 | 1549.50 | 1.549500e+03 | 1549.50 |
| larceny_theft_rate | Max | 2166.10 | 2166.10 | 2.166100e+03 | 2166.10 |
| motor_vehicle_theft_rate | Mean | 234.82 | 233.96 | 2.338400e+02 | 235.12 |
| motor_vehicle_theft_rate | Median | 230.20 | 230.20 | 2.302000e+02 | 230.20 |
| motor_vehicle_theft_rate | SD | 21.82 | 20.91 | 2.095000e+01 | 22.69 |
| motor_vehicle_theft_rate | Min | 215.40 | 215.40 | 2.154000e+02 | 215.40 |
| motor_vehicle_theft_rate | Max | 315.40 | 315.40 | 3.154000e+02 | 315.40 |
#Summary stats by census division
summary_by_division_df <- cleanhousinghpicrimedata %>%
group_by(division) %>%
summarise(across(all_of(vars_to_summarize),
list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
.names = "{.col}.{.fn}")) %>% # Use "." instead of "_"
pivot_longer(cols = -division,
names_to = c("Variable", "Statistic"),
names_pattern = "(.*)\\.(.*)") %>% # Extract proper names
mutate(Value = round(as.numeric(value), 2)) %>%
select(-value) %>% # Drop old value column
pivot_wider(names_from = division, values_from = Value)
#Summary stats table using kable
kable(summary_by_division_df, format = "html", align = "c", caption = "Summary Statistics by Census Division") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| Variable | Statistic | DV_ENC | DV_ESC | DV_MA | DV_MT | DV_PAC | DV_SA | DV_WNC | DV_WSC |
|---|---|---|---|---|---|---|---|---|---|
| sale_amount | Mean | 270053.38 | 259789.82 | 209559.07 | 483343.76 | 597029.97 | 332534.00 | 3.113912e+05 | 254362.59 |
| sale_amount | Median | 205000.00 | 212900.00 | 160000.00 | 360000.00 | 400000.00 | 220000.00 | 2.250000e+05 | 215000.00 |
| sale_amount | SD | 697566.45 | 910328.83 | 372586.52 | 1800193.27 | 1102861.86 | 1201825.21 | 8.370690e+06 | 490246.41 |
| sale_amount | Min | 500.00 | 500.00 | 1000.00 | 500.00 | 280.00 | 100.00 | 4.850000e+02 | 101.00 |
| sale_amount | Max | 31000000.00 | 80240000.00 | 50000000.00 | 82000000.00 | 288164568.00 | 600000000.00 | 2.729785e+09 | 166250000.00 |
| dlog_sale_amount | Mean | 12.22 | 12.11 | 11.96 | 12.76 | 12.93 | 12.30 | 1.221000e+01 | 12.10 |
| dlog_sale_amount | Median | 12.20 | 12.16 | 12.00 | 12.75 | 12.90 | 12.28 | 1.225000e+01 | 12.12 |
| dlog_sale_amount | SD | 0.62 | 0.69 | 0.80 | 0.58 | 0.77 | 0.84 | 7.500000e-01 | 0.61 |
| dlog_sale_amount | Min | 6.21 | 6.00 | 6.81 | 6.10 | 5.37 | 4.39 | 5.950000e+00 | 4.52 |
| dlog_sale_amount | Max | 17.20 | 18.10 | 17.66 | 18.00 | 19.21 | 20.00 | 2.164000e+01 | 18.93 |
| year_built | Mean | 1979.27 | 1993.10 | 1971.79 | 1991.76 | 1971.44 | 1986.28 | 1.979080e+03 | 1993.02 |
| year_built | Median | 1984.00 | 2001.00 | 1978.00 | 1997.00 | 1973.00 | 1990.00 | 1.987000e+03 | 1999.00 |
| year_built | SD | 26.86 | 24.84 | 30.45 | 20.23 | 24.34 | 21.92 | 2.971000e+01 | 18.14 |
| year_built | Min | 1800.00 | 1805.00 | 1700.00 | 1800.00 | 1801.00 | 1700.00 | 1.825000e+03 | 1859.00 |
| year_built | Max | 2019.00 | 2019.00 | 2019.00 | 2019.00 | 2019.00 | 2019.00 | 2.019000e+03 | 2019.00 |
| bedrooms_all_buildings | Mean | 3.29 | 3.43 | 3.27 | 3.50 | 3.35 | 3.48 | 3.290000e+00 | 3.55 |
| bedrooms_all_buildings | Median | 3.00 | 3.00 | 3.00 | 3.00 | 3.00 | 3.00 | 3.000000e+00 | 3.00 |
| bedrooms_all_buildings | SD | 0.73 | 0.72 | 0.89 | 1.00 | 0.90 | 0.84 | 9.400000e-01 | 0.76 |
| bedrooms_all_buildings | Min | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| bedrooms_all_buildings | Max | 8.00 | 13.00 | 11.00 | 13.00 | 13.00 | 13.00 | 1.300000e+01 | 13.00 |
| number_of_bathrooms | Mean | 2.34 | 2.53 | 1.97 | 2.80 | 2.28 | 2.61 | 2.600000e+00 | 2.48 |
| number_of_bathrooms | Median | 2.50 | 2.10 | 2.00 | 3.00 | 2.00 | 2.50 | 2.500000e+00 | 2.00 |
| number_of_bathrooms | SD | 0.82 | 0.87 | 0.79 | 0.96 | 0.82 | 0.96 | 1.050000e+00 | 0.79 |
| number_of_bathrooms | Min | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| number_of_bathrooms | Max | 7.00 | 10.00 | 9.00 | 10.75 | 10.50 | 10.50 | 1.000000e+01 | 10.50 |
| number_of_fireplaces | Mean | 1.08 | 1.07 | 1.11 | 1.21 | 1.10 | 1.09 | 1.190000e+00 | 1.13 |
| number_of_fireplaces | Median | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| number_of_fireplaces | SD | 0.34 | 0.31 | 0.42 | 0.52 | 0.36 | 0.43 | 4.600000e-01 | 0.37 |
| number_of_fireplaces | Min | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| number_of_fireplaces | Max | 11.00 | 11.00 | 20.00 | 13.00 | 30.00 | 93.00 | 1.200000e+01 | 11.00 |
| stories_number | Mean | 1.54 | 1.39 | 1.57 | 1.55 | 1.31 | 1.38 | 1.370000e+00 | 1.37 |
| stories_number | Median | 1.50 | 1.00 | 1.50 | 2.00 | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| stories_number | SD | 0.51 | 0.46 | 0.51 | 0.50 | 0.47 | 0.49 | 4.700000e-01 | 0.46 |
| stories_number | Min | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.000000e+00 | 1.00 |
| stories_number | Max | 5.00 | 4.00 | 5.00 | 5.00 | 6.00 | 6.00 | 6.500000e+00 | 4.00 |
| log_sale_amount | Mean | 12.25 | 12.20 | 11.94 | 12.79 | 12.93 | 12.29 | 1.228000e+01 | 12.26 |
| log_sale_amount | Median | 12.23 | 12.27 | 11.98 | 12.79 | 12.90 | 12.30 | 1.232000e+01 | 12.28 |
| log_sale_amount | SD | 0.62 | 0.70 | 0.81 | 0.60 | 0.80 | 0.86 | 7.500000e-01 | 0.62 |
| log_sale_amount | Min | 6.21 | 6.21 | 6.91 | 6.21 | 5.63 | 4.61 | 6.180000e+00 | 4.62 |
| log_sale_amount | Max | 17.25 | 18.20 | 17.73 | 18.22 | 19.48 | 20.21 | 2.173000e+01 | 18.93 |
| log_land_square_footage | Mean | 9.52 | 9.88 | 9.95 | 9.34 | 9.15 | 9.65 | 9.790000e+00 | 9.39 |
| log_land_square_footage | Median | 9.23 | 9.63 | 9.93 | 9.06 | 8.95 | 9.57 | 9.430000e+00 | 9.11 |
| log_land_square_footage | SD | 1.14 | 1.11 | 1.12 | 1.18 | 0.91 | 1.04 | 1.150000e+00 | 0.90 |
| log_land_square_footage | Min | 3.78 | 6.08 | 3.78 | 0.00 | 0.00 | 0.69 | 1.390000e+00 | 0.00 |
| log_land_square_footage | Max | 14.35 | 16.65 | 15.40 | 18.47 | 19.54 | 18.69 | 2.038000e+01 | 20.72 |
| violent_crime_rate | Mean | 386.94 | 380.08 | 384.30 | 382.56 | 386.40 | 382.72 | 3.844700e+02 | 385.71 |
| violent_crime_rate | Median | 383.80 | 370.40 | 383.80 | 373.70 | 383.80 | 373.70 | 3.737000e+02 | 383.80 |
| violent_crime_rate | SD | 26.60 | 21.67 | 23.66 | 22.13 | 24.71 | 22.42 | 2.384000e+01 | 25.34 |
| violent_crime_rate | Min | 361.60 | 361.60 | 361.60 | 361.60 | 361.60 | 361.60 | 3.616000e+02 | 361.60 |
| violent_crime_rate | Max | 458.60 | 458.60 | 458.60 | 458.60 | 458.60 | 458.60 | 4.586000e+02 | 458.60 |
| murder_rate | Mean | 4.98 | 4.98 | 4.96 | 4.96 | 4.94 | 4.96 | 4.950000e+00 | 4.95 |
| murder_rate | Median | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.00 | 5.000000e+00 | 5.00 |
| murder_rate | SD | 0.31 | 0.27 | 0.31 | 0.31 | 0.31 | 0.31 | 3.100000e-01 | 0.31 |
| murder_rate | Min | 4.40 | 4.40 | 4.40 | 4.40 | 4.40 | 4.40 | 4.400000e+00 | 4.40 |
| murder_rate | Max | 5.40 | 5.40 | 5.40 | 5.40 | 5.40 | 5.40 | 5.400000e+00 | 5.40 |
| rape_legacy_rate | Mean | 29.05 | 29.34 | 28.95 | 28.98 | 28.79 | 29.00 | 2.890000e+01 | 28.84 |
| rape_legacy_rate | Median | 29.80 | 29.90 | 29.80 | 29.90 | 29.10 | 29.90 | 2.980000e+01 | 29.80 |
| rape_legacy_rate | SD | 1.62 | 1.54 | 1.67 | 1.69 | 1.67 | 1.68 | 1.680000e+00 | 1.69 |
| rape_legacy_rate | Min | 25.90 | 25.90 | 25.90 | 25.90 | 25.90 | 25.90 | 2.590000e+01 | 25.90 |
| rape_legacy_rate | Max | 31.00 | 31.00 | 31.00 | 31.00 | 31.00 | 31.00 | 3.100000e+01 | 31.00 |
| robbery_rate | Mean | 104.81 | 98.46 | 103.69 | 102.48 | 105.68 | 102.42 | 1.040400e+02 | 105.02 |
| robbery_rate | Median | 102.20 | 98.60 | 102.20 | 102.20 | 102.90 | 102.20 | 1.022000e+02 | 102.20 |
| robbery_rate | SD | 17.90 | 16.42 | 16.33 | 15.68 | 16.66 | 15.93 | 1.637000e+01 | 17.01 |
| robbery_rate | Min | 81.60 | 81.60 | 81.60 | 81.60 | 81.60 | 81.60 | 8.160000e+01 | 81.60 |
| robbery_rate | Max | 145.90 | 145.90 | 145.90 | 145.90 | 145.90 | 145.90 | 1.459000e+02 | 145.90 |
| aggravated_assault_rate | Mean | 248.07 | 247.26 | 246.65 | 246.09 | 246.94 | 246.29 | 2.465400e+02 | 246.86 |
| aggravated_assault_rate | Median | 248.30 | 248.30 | 248.30 | 248.30 | 248.30 | 248.30 | 2.483000e+02 | 248.20 |
| aggravated_assault_rate | SD | 12.03 | 9.91 | 11.25 | 10.80 | 11.65 | 10.83 | 1.138000e+01 | 11.98 |
| aggravated_assault_rate | Min | 229.20 | 229.20 | 229.20 | 229.20 | 229.20 | 229.20 | 2.292000e+02 | 229.20 |
| aggravated_assault_rate | Max | 277.50 | 277.50 | 277.50 | 277.50 | 277.50 | 277.50 | 2.775000e+02 | 277.50 |
| property_crime_rate | Mean | 2563.76 | 2439.67 | 2547.38 | 2524.52 | 2592.99 | 2522.99 | 2.556320e+03 | 2577.13 |
| property_crime_rate | Median | 2500.50 | 2362.90 | 2500.50 | 2451.60 | 2500.50 | 2451.60 | 2.500500e+03 | 2500.50 |
| property_crime_rate | SD | 341.42 | 318.74 | 321.11 | 312.01 | 327.86 | 315.76 | 3.218900e+02 | 330.11 |
| property_crime_rate | Min | 2109.90 | 2109.90 | 2109.90 | 2109.90 | 2109.90 | 2109.90 | 2.109900e+03 | 2109.90 |
| property_crime_rate | Max | 3214.60 | 3214.60 | 3214.60 | 3214.60 | 3214.60 | 3214.60 | 3.214600e+03 | 3214.60 |
| burglary_rate | Mean | 522.74 | 472.91 | 517.97 | 508.79 | 538.05 | 508.12 | 5.219900e+02 | 530.47 |
| burglary_rate | Median | 494.70 | 429.70 | 494.70 | 468.90 | 494.70 | 468.90 | 4.947000e+02 | 494.70 |
| burglary_rate | SD | 138.14 | 130.39 | 133.07 | 129.97 | 136.55 | 131.29 | 1.335100e+02 | 135.87 |
| burglary_rate | Min | 340.50 | 340.50 | 340.50 | 340.50 | 340.50 | 340.50 | 3.405000e+02 | 340.50 |
| burglary_rate | Max | 733.00 | 733.00 | 733.00 | 733.00 | 733.00 | 733.00 | 7.330000e+02 | 733.00 |
| larceny_theft_rate | Mean | 1803.92 | 1735.06 | 1795.10 | 1782.63 | 1819.63 | 1781.66 | 1.800000e+03 | 1811.30 |
| larceny_theft_rate | Median | 1783.60 | 1695.50 | 1783.60 | 1745.40 | 1783.60 | 1745.40 | 1.783600e+03 | 1783.60 |
| larceny_theft_rate | SD | 188.33 | 177.14 | 176.81 | 172.13 | 179.82 | 174.24 | 1.771600e+02 | 181.67 |
| larceny_theft_rate | Min | 1549.50 | 1549.50 | 1549.50 | 1549.50 | 1549.50 | 1549.50 | 1.549500e+03 | 1549.50 |
| larceny_theft_rate | Max | 2166.10 | 2166.10 | 2166.10 | 2166.10 | 2166.10 | 2166.10 | 2.166100e+03 | 2166.10 |
| motor_vehicle_theft_rate | Mean | 237.10 | 231.70 | 234.32 | 233.10 | 235.32 | 233.22 | 2.343400e+02 | 235.36 |
| motor_vehicle_theft_rate | Median | 230.20 | 230.20 | 230.20 | 230.20 | 230.20 | 230.20 | 2.302000e+02 | 230.20 |
| motor_vehicle_theft_rate | SD | 25.30 | 19.60 | 21.52 | 20.12 | 22.15 | 20.31 | 2.164000e+01 | 23.28 |
| motor_vehicle_theft_rate | Min | 215.40 | 215.40 | 215.40 | 215.40 | 215.40 | 215.40 | 2.154000e+02 | 215.40 |
| motor_vehicle_theft_rate | Max | 315.40 | 315.40 | 315.40 | 315.40 | 315.40 | 315.40 | 3.154000e+02 | 315.40 |
###Creating region variable
#Converting division to region
division_to_region <- c(
"DV_NE" = "Northeast", "DV_MA" = "Northeast", # New England & Middle Atlantic
"DV_ENC" = "Midwest", "DV_WNC" = "Midwest", # East & West North Central
"DV_SA" = "South", "DV_ESC" = "South", "DV_WSC" = "South", # South Atlantic & Central
"DV_MT" = "West", "DV_PAC" = "West" # Mountain & Pacific
)
#Regions Based on Division
cleanhousinghpicrimedata <- cleanhousinghpicrimedata %>%
mutate(region = recode(division, !!!division_to_region))
# Compute summary statistics by census region
summary_by_region_df <- cleanhousinghpicrimedata %>%
group_by(region) %>%
summarise(across(all_of(vars_to_summarize),
list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
.names = "{.col}.{.fn}")) %>%
pivot_longer(cols = -region,
names_to = c("Variable", "Statistic"),
names_pattern = "(.*)\\.(.*)") %>%
mutate(Value = round(as.numeric(value), 2)) %>%
select(-value) %>% # Drop old value column
pivot_wider(names_from = region, values_from = Value)
# Display table using kable
kable(summary_by_region_df, format = "html", align = "c", caption = "Summary Statistics by Census Region") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| Variable | Statistic | Midwest | Northeast | South | West |
|---|---|---|---|---|---|
| sale_amount | Mean | 3.088721e+05 | 209559.07 | 303133.49 | 572709.28 |
| sale_amount | Median | 2.240000e+05 | 160000.00 | 218000.00 | 389000.00 |
| sale_amount | SD | 8.113455e+06 | 372586.52 | 1000965.34 | 1285126.43 |
| sale_amount | Min | 4.850000e+02 | 1000.00 | 100.00 | 280.00 |
| sale_amount | Max | 2.729785e+09 | 50000000.00 | 600000000.00 | 288164568.00 |
| dlog_sale_amount | Mean | 1.221000e+01 | 11.96 | 12.23 | 12.89 |
| dlog_sale_amount | Median | 1.225000e+01 | 12.00 | 12.22 | 12.85 |
| dlog_sale_amount | SD | 7.400000e-01 | 0.80 | 0.77 | 0.74 |
| dlog_sale_amount | Min | 5.950000e+00 | 6.81 | 4.39 | 5.37 |
| dlog_sale_amount | Max | 2.164000e+01 | 17.66 | 20.00 | 19.21 |
| year_built | Mean | 1.979090e+03 | 1971.79 | 1988.82 | 1975.79 |
| year_built | Median | 1.987000e+03 | 1978.00 | 1993.00 | 1978.00 |
| year_built | SD | 2.955000e+01 | 30.45 | 20.95 | 24.95 |
| year_built | Min | 1.800000e+03 | 1700.00 | 1700.00 | 1800.00 |
| year_built | Max | 2.019000e+03 | 2019.00 | 2019.00 | 2019.00 |
| bedrooms_all_buildings | Mean | 3.290000e+00 | 3.27 | 3.50 | 3.38 |
| bedrooms_all_buildings | Median | 3.000000e+00 | 3.00 | 3.00 | 3.00 |
| bedrooms_all_buildings | SD | 9.300000e-01 | 0.89 | 0.81 | 0.92 |
| bedrooms_all_buildings | Min | 1.000000e+00 | 1.00 | 1.00 | 1.00 |
| bedrooms_all_buildings | Max | 1.300000e+01 | 11.00 | 13.00 | 13.00 |
| number_of_bathrooms | Mean | 2.580000e+00 | 1.97 | 2.56 | 2.39 |
| number_of_bathrooms | Median | 2.500000e+00 | 2.00 | 2.50 | 2.00 |
| number_of_bathrooms | SD | 1.030000e+00 | 0.79 | 0.90 | 0.88 |
| number_of_bathrooms | Min | 1.000000e+00 | 1.00 | 1.00 | 1.00 |
| number_of_bathrooms | Max | 1.000000e+01 | 9.00 | 10.50 | 10.75 |
| number_of_fireplaces | Mean | 1.190000e+00 | 1.11 | 1.11 | 1.12 |
| number_of_fireplaces | Median | 1.000000e+00 | 1.00 | 1.00 | 1.00 |
| number_of_fireplaces | SD | 4.600000e-01 | 0.42 | 0.41 | 0.40 |
| number_of_fireplaces | Min | 1.000000e+00 | 1.00 | 1.00 | 1.00 |
| number_of_fireplaces | Max | 1.200000e+01 | 20.00 | 93.00 | 30.00 |
| stories_number | Mean | 1.380000e+00 | 1.57 | 1.37 | 1.36 |
| stories_number | Median | 1.000000e+00 | 1.50 | 1.00 | 1.00 |
| stories_number | SD | 4.800000e-01 | 0.51 | 0.48 | 0.49 |
| stories_number | Min | 1.000000e+00 | 1.00 | 1.00 | 1.00 |
| stories_number | Max | 6.500000e+00 | 5.00 | 6.00 | 6.00 |
| log_sale_amount | Mean | 1.228000e+01 | 11.94 | 12.28 | 12.90 |
| log_sale_amount | Median | 1.232000e+01 | 11.98 | 12.29 | 12.87 |
| log_sale_amount | SD | 7.500000e-01 | 0.81 | 0.78 | 0.76 |
| log_sale_amount | Min | 6.180000e+00 | 6.91 | 4.61 | 5.63 |
| log_sale_amount | Max | 2.173000e+01 | 17.73 | 20.21 | 19.48 |
| log_land_square_footage | Mean | 9.770000e+00 | 9.95 | 9.56 | 9.19 |
| log_land_square_footage | Median | 9.410000e+00 | 9.93 | 9.37 | 8.97 |
| log_land_square_footage | SD | 1.150000e+00 | 1.12 | 1.00 | 0.98 |
| log_land_square_footage | Min | 1.390000e+00 | 3.78 | 0.00 | 0.00 |
| log_land_square_footage | Max | 2.038000e+01 | 15.40 | 20.72 | 19.54 |
| violent_crime_rate | Mean | 3.846200e+02 | 384.30 | 383.75 | 385.58 |
| violent_crime_rate | Median | 3.737000e+02 | 383.80 | 373.70 | 383.80 |
| violent_crime_rate | SD | 2.402000e+01 | 23.66 | 23.55 | 24.23 |
| violent_crime_rate | Min | 3.616000e+02 | 361.60 | 361.60 | 361.60 |
| violent_crime_rate | Max | 4.586000e+02 | 458.60 | 458.60 | 458.60 |
| murder_rate | Mean | 4.960000e+00 | 4.96 | 4.96 | 4.95 |
| murder_rate | Median | 5.000000e+00 | 5.00 | 5.00 | 5.00 |
| murder_rate | SD | 3.100000e-01 | 0.31 | 0.31 | 0.31 |
| murder_rate | Min | 4.400000e+00 | 4.40 | 4.40 | 4.40 |
| murder_rate | Max | 5.400000e+00 | 5.40 | 5.40 | 5.40 |
| rape_legacy_rate | Mean | 2.891000e+01 | 28.95 | 28.95 | 28.83 |
| rape_legacy_rate | Median | 2.980000e+01 | 29.80 | 29.80 | 29.10 |
| rape_legacy_rate | SD | 1.680000e+00 | 1.67 | 1.69 | 1.68 |
| rape_legacy_rate | Min | 2.590000e+01 | 25.90 | 25.90 | 25.90 |
| rape_legacy_rate | Max | 3.100000e+01 | 31.00 | 31.00 | 31.00 |
| robbery_rate | Mean | 1.040900e+02 | 103.69 | 103.29 | 105.00 |
| robbery_rate | Median | 1.022000e+02 | 102.20 | 102.20 | 102.20 |
| robbery_rate | SD | 1.646000e+01 | 16.33 | 16.40 | 16.51 |
| robbery_rate | Min | 8.160000e+01 | 81.60 | 81.60 | 81.60 |
| robbery_rate | Max | 1.459000e+02 | 145.90 | 145.90 | 145.90 |
| aggravated_assault_rate | Mean | 2.466300e+02 | 246.65 | 246.51 | 246.76 |
| aggravated_assault_rate | Median | 2.483000e+02 | 248.30 | 248.30 | 248.30 |
| aggravated_assault_rate | SD | 1.143000e+01 | 11.25 | 11.24 | 11.48 |
| aggravated_assault_rate | Min | 2.292000e+02 | 229.20 | 229.20 | 229.20 |
| aggravated_assault_rate | Max | 2.775000e+02 | 277.50 | 277.50 | 277.50 |
| property_crime_rate | Mean | 2.556770e+03 | 2547.38 | 2540.98 | 2578.34 |
| property_crime_rate | Median | 2.500500e+03 | 2500.50 | 2500.50 | 2500.50 |
| property_crime_rate | SD | 3.231200e+02 | 321.11 | 322.37 | 325.74 |
| property_crime_rate | Min | 2.109900e+03 | 2109.90 | 2109.90 | 2109.90 |
| property_crime_rate | Max | 3.214600e+03 | 3214.60 | 3214.60 | 3214.60 |
| burglary_rate | Mean | 5.220400e+02 | 517.97 | 515.53 | 531.79 |
| burglary_rate | Median | 4.947000e+02 | 494.70 | 494.70 | 494.70 |
| burglary_rate | SD | 1.338000e+02 | 133.07 | 133.49 | 135.70 |
| burglary_rate | Min | 3.405000e+02 | 340.50 | 340.50 | 340.50 |
| burglary_rate | Max | 7.330000e+02 | 733.00 | 733.00 | 733.00 |
| larceny_theft_rate | Mean | 1.800240e+03 | 1795.10 | 1791.49 | 1811.72 |
| larceny_theft_rate | Median | 1.783600e+03 | 1783.60 | 1783.60 | 1783.60 |
| larceny_theft_rate | SD | 1.778600e+02 | 176.81 | 177.72 | 178.85 |
| larceny_theft_rate | Min | 1.549500e+03 | 1549.50 | 1549.50 | 1549.50 |
| larceny_theft_rate | Max | 2.166100e+03 | 2166.10 | 2166.10 | 2166.10 |
| motor_vehicle_theft_rate | Mean | 2.345100e+02 | 234.32 | 233.96 | 234.84 |
| motor_vehicle_theft_rate | Median | 2.302000e+02 | 230.20 | 230.20 | 230.20 |
| motor_vehicle_theft_rate | SD | 2.189000e+01 | 21.52 | 21.44 | 21.75 |
| motor_vehicle_theft_rate | Min | 2.154000e+02 | 215.40 | 215.40 | 215.40 |
| motor_vehicle_theft_rate | Max | 3.154000e+02 | 315.40 | 315.40 | 315.40 |
library(lfe)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(fixest)
##
## Attaching package: 'fixest'
## The following object is masked from 'package:lfe':
##
## fepois
## The following object is masked from 'package:scales':
##
## pvalue
#Dependent Variable
cleanhousinghpicrimedata$log_sale_hpi <- log(cleanhousinghpicrimedata$sale_amount / cleanhousinghpicrimedata$hpi)
# Run the regression
reg_model1 <- feols( log_sale_hpi ~ violent_crime_rate + murder_rate + rape_legacy_rate + robbery_rate +
aggravated_assault_rate + property_crime_rate + burglary_rate + larceny_theft_rate +
motor_vehicle_theft_rate +
bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number +
log_land_square_footage + AC_presence
| year + division, data = cleanhousinghpicrimedata)
## The variables 'violent_crime_rate', 'murder_rate', 'rape_legacy_rate', 'robbery_rate', 'aggravated_assault_rate', 'burglary_rate' and 2 others have been removed because of collinearity (see $collin.var).
summary(reg_model1)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12, division: 8
## Standard-errors: Clustered (year)
## Estimate Std. Error t value Pr(>|t|)
## property_crime_rate -0.035109 90113.721445 -0.00000039 1.0000e+00
## bedrooms_all_buildings 0.033349 0.002606 12.79508467 5.9999e-08 ***
## number_of_bathrooms 0.390041 0.005534 70.47807530 5.8287e-16 ***
## number_of_fireplaces 0.104705 0.005774 18.13351701 1.5247e-09 ***
## stories_number 0.025739 0.003864 6.66059565 3.5599e-05 ***
## log_land_square_footage 0.009049 0.002383 3.79735806 2.9568e-03 **
## AC_presence -0.150589 0.015525 -9.70005503 1.0014e-06 ***
## ... 8 variables were removed because of collinearity (violent_crime_rate, murder_rate and 6 others [full set in $collin.var])
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.644389 Adj. R2: 0.391734
## Within R2: 0.259987
The model automatically removes violent_crime_rate, murder_rate, rape_legacy_rate, robbery_rate, aggravated_assault_rate, burglary_rate, larceny_theft_rate due to collinearity between variables and only keeps property_crime_rate.
#Model 1: Adding year_built_group
reg_model2 <- feols( log_sale_hpi ~ property_crime_rate +
year_built_grp + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number +
log_land_square_footage + AC_presence
| year + division, data = cleanhousinghpicrimedata)
summary(reg_model2)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12, division: 8
## Standard-errors: Clustered (year)
## Estimate Std. Error t value Pr(>|t|)
## property_crime_rate -0.032002 87219.715689 -0.000000367 1.0000e+00
## year_built_grp -0.069411 0.003459 -20.069152836 5.1500e-10 ***
## bedrooms_all_buildings 0.033820 0.002604 12.985422222 5.1481e-08 ***
## number_of_bathrooms 0.412299 0.005985 68.892500438 7.4826e-16 ***
## number_of_fireplaces 0.091589 0.005564 16.461993841 4.2657e-09 ***
## stories_number 0.051416 0.004274 12.029278393 1.1345e-07 ***
## log_land_square_footage 0.007059 0.002350 3.003710145 1.2000e-02 *
## AC_presence -0.132790 0.013812 -9.614021443 1.0940e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.640959 Adj. R2: 0.398192
## Within R2: 0.267844
#Model 2: Adding year_built to the model
reg_model3 <- feols( log_sale_hpi ~ property_crime_rate +
year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number +
log_land_square_footage + AC_presence
| year + division, data = cleanhousinghpicrimedata)
summary(reg_model3)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12, division: 8
## Standard-errors: Clustered (year)
## Estimate Std. Error t value Pr(>|t|)
## property_crime_rate -0.029196 85328.669408 -0.000000342 1.0000e+00
## year_built -0.003539 0.000174 -20.320988727 4.5049e-10 ***
## bedrooms_all_buildings 0.032436 0.002563 12.657242362 6.7119e-08 ***
## number_of_bathrooms 0.417360 0.006166 67.685403662 9.0848e-16 ***
## number_of_fireplaces 0.086945 0.005398 16.106409654 5.3762e-09 ***
## stories_number 0.044878 0.004192 10.706758146 3.7212e-07 ***
## log_land_square_footage 0.008783 0.002408 3.647265701 3.8390e-03 **
## AC_presence -0.129416 0.013560 -9.543658617 1.1766e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.640031 Adj. R2: 0.399934
## Within R2: 0.269963
#Model 3: using fips_county_code as fixed effects
reg_model4 <- feols(log_sale_hpi ~ property_crime_rate +
year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number +
log_land_square_footage + AC_presence
| year + fips_county_code, data = cleanhousinghpicrimedata)
summary(reg_model4)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12, fips_county_code: 501
## Standard-errors: Clustered (year)
## Estimate Std. Error t value Pr(>|t|)
## property_crime_rate 0.005961 31813.967915 0.000000187 1.0000e+00
## year_built 0.001051 0.000083 12.648130888 6.7621e-08 ***
## bedrooms_all_buildings 0.044571 0.002188 20.366365222 4.3982e-10 ***
## number_of_bathrooms 0.303712 0.005577 54.454969683 9.8735e-15 ***
## number_of_fireplaces 0.114228 0.005384 21.214922398 2.8362e-10 ***
## stories_number 0.041854 0.006819 6.137660350 7.3345e-05 ***
## log_land_square_footage 0.070799 0.001031 68.668072951 7.7554e-16 ***
## AC_presence -0.021494 0.003507 -6.129301249 7.4219e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.533756 Adj. R2: 0.582579
## Within R2: 0.276663
# Load necessary package
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
# Create regression table
model_list <- list(
"Model 1 (Year Built Group)" = reg_model2,
"Model 2 (Year Built Added)" = reg_model3,
"Model 3 (County FE)" = reg_model4
)
# Generate summary table
modelsummary(model_list,
title = "Regression Results: Impact of Crime Rates on Housing Prices",
coef_omit = "Intercept", # Hide intercept
stars = TRUE,
gof_omit = "IC|RMSE", # Remove unnecessary stats
fmt = 4, # Round numbers
escape = FALSE # Ensures special characters (e.g., ) don't appear in the title
)
| Model 1 (Year Built Group) | Model 2 (Year Built Added) | Model 3 (County FE) | |
|---|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |||
| property_crime_rate | -0.0320 | -0.0292 | 0.0060 |
| (87219.7157) | (85328.6694) | (31813.9679) | |
| year_built_grp | -0.0694*** | ||
| (0.0035) | |||
| bedrooms_all_buildings | 0.0338*** | 0.0324*** | 0.0446*** |
| (0.0026) | (0.0026) | (0.0022) | |
| number_of_bathrooms | 0.4123*** | 0.4174*** | 0.3037*** |
| (0.0060) | (0.0062) | (0.0056) | |
| number_of_fireplaces | 0.0916*** | 0.0869*** | 0.1142*** |
| (0.0056) | (0.0054) | (0.0054) | |
| stories_number | 0.0514*** | 0.0449*** | 0.0419*** |
| (0.0043) | (0.0042) | (0.0068) | |
| log_land_square_footage | 0.0071* | 0.0088** | 0.0708*** |
| (0.0024) | (0.0024) | (0.0010) | |
| AC_presence | -0.1328*** | -0.1294*** | -0.0215*** |
| (0.0138) | (0.0136) | (0.0035) | |
| year_built | -0.0035*** | 0.0011*** | |
| (0.0002) | (0.0001) | ||
| Num.Obs. | 2348446 | 2348446 | 2348446 |
| R2 | 0.398 | 0.400 | 0.583 |
| R2 Adj. | 0.398 | 0.400 | 0.583 |
| R2 Within | 0.268 | 0.270 | 0.277 |
| R2 Within Adj. | 0.268 | 0.270 | 0.277 |
| Std.Errors | by: year | by: year | by: year |
| FE: year | X | X | X |
| FE: division | X | X | |
| FE: fips_county_code | X | ||
I prefer the last model, as the year_built has positive significant effect which is consistent with the housing market where newer houses has better sale values.
#Model 3 by Region
reg_model4_reg <- feols(log_sale_hpi ~ property_crime_rate +
year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number +
log_land_square_footage + AC_presence +
region
| year, data = cleanhousinghpicrimedata)
summary(reg_model4_reg)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12
## Standard-errors: Clustered (year)
## Estimate Std. Error t value Pr(>|t|)
## property_crime_rate -0.000016 0.000011 -1.43860 1.7810e-01
## year_built -0.004729 0.000152 -31.08332 4.5319e-12 ***
## bedrooms_all_buildings 0.039166 0.003082 12.70701 6.4448e-08 ***
## number_of_bathrooms 0.410606 0.005558 73.87403 3.4769e-16 ***
## number_of_fireplaces 0.069861 0.005188 13.46618 3.5275e-08 ***
## stories_number 0.031290 0.003783 8.27116 4.7512e-06 ***
## log_land_square_footage 0.006742 0.001930 3.49268 5.0347e-03 **
## AC_presence -0.102340 0.012734 -8.03683 6.2529e-06 ***
## regionNortheast -0.057900 0.027833 -2.08029 6.1668e-02 .
## regionSouth 0.115901 0.014255 8.13051 5.5988e-06 ***
## regionWest 0.761214 0.014750 51.60695 1.7790e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.648676 Adj. R2: 0.383615
## Within R2: 0.382314
Here, the key crime variable is property crime rate as others had collinearity
#New variable to store estimated effects
cleanhousinghpicrimedata$crime_effect_region <- NA
#Loop through each region and estimate the effect
for (r in unique(cleanhousinghpicrimedata$region)) {
#Run regression for each region
model <- feols(log_sale_hpi ~ property_crime_rate + bedrooms_all_buildings +
number_of_bathrooms + number_of_fireplaces + stories_number +
log_land_square_footage + AC_presence,
data = cleanhousinghpicrimedata %>% filter(region == r))
#Extract the coefficient for property_crime_rate
effect <- coef(model)["property_crime_rate"]
#Assigning effect to all rows
cleanhousinghpicrimedata$crime_effect_region[cleanhousinghpicrimedata$region == r] <- effect
}
# Round the estimated crime effect values to 4 decimal places
cleanhousinghpicrimedata$crime_effect_region <- round(cleanhousinghpicrimedata$crime_effect_region, 4)
#Scaling the crime rate variable
cleanhousinghpicrimedata$scaled_property_crime_rate <- cleanhousinghpicrimedata$property_crime_rate / 1000
summary(cleanhousinghpicrimedata$scaled_property_crime_rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.110 2.363 2.501 2.561 2.868 3.215
# Define U.S. Census regions
census_regions <- data.frame(
state = c("Maine", "New Hampshire", "Vermont", "Massachusetts", "Rhode Island", "Connecticut",
"New York", "New Jersey", "Pennsylvania",
"Wisconsin", "Michigan", "Illinois", "Indiana", "Ohio",
"North Dakota", "South Dakota", "Nebraska", "Kansas", "Minnesota", "Iowa", "Missouri",
"Delaware", "Maryland", "District of Columbia", "Virginia", "West Virginia",
"North Carolina", "South Carolina", "Georgia", "Florida", "Kentucky", "Tennessee",
"Alabama", "Mississippi", "Arkansas", "Louisiana",
"Oklahoma", "Texas",
"Montana", "Idaho", "Wyoming", "Colorado", "New Mexico",
"Arizona", "Utah", "Nevada",
"Washington", "Oregon", "California", "Alaska", "Hawaii"),
region = c(rep("Northeast", 9),
rep("Midwest", 12),
rep("South", 17),
rep("West", 13))
)
# Convert state names to lowercase for merging with `usmap` data
census_regions$state <- tolower(census_regions$state)
# Aggregate mean crime effect by region
region_effects <- cleanhousinghpicrimedata %>%
group_by(region) %>%
summarise(mean_crime_effect = mean(crime_effect_region, na.rm = TRUE))
# Merge state-region mapping with crime effect data
plot_data <- left_join(census_regions, region_effects, by = "region")
# Plot the map, now with a **gradient scale**
plot_usmap(data = plot_data, values = "mean_crime_effect", regions = "state") +
scale_fill_gradient2(
low = "blue", mid = "white", high = "red",
midpoint = mean(plot_data$mean_crime_effect, na.rm = TRUE),
name = "Mean Crime Effect",
na.value = "grey50"
) +
theme(legend.position = "right") +
labs(title = "Variation in Estimated Crime Rate Effect by U.S. Region (Gradient)")