PROBLEM 1. Checking the raw data in the primary dataset
# Load the data
housingdata <- readRDS("testdata20250121.RDS")
# Extract the day, month, and year
housingdata$sale_date <- as.integer(housingdata$sale_date)
housingdata$day <- 100 * ((housingdata$sale_date / 100) - round(housingdata$sale_date / 100))
housingdata$month <- 100 * (((housingdata$sale_date - housingdata$day) / 10000) - round((housingdata$sale_date - housingdata$day) / 10000))
housingdata$year <- round(housingdata$sale_date / 10000)
# Summarize
housingdata$sale_date[1:10]
## [1] 20150129 20090904 20130228 20080825 20140225 20180111 20150408 20150623
## [9] 20090528 20120131
housingdata$year[1:10]
## [1] 2015 2009 2013 2008 2014 2018 2015 2015 2009 2012
housingdata$month[1:10]
## [1] 1 9 2 8 2 1 4 6 5 1
housingdata$day[1:10]
## [1] 29 4 28 25 25 11 8 23 28 31
# Generate categorical variables by year built
housingdata$year_built_very_old <- ifelse(housingdata$year_built < 1960, 1, 0)
housingdata$year_built_quite_old <- ifelse(housingdata$year_built >= 1960 & housingdata$year_built < 1980, 1, 0)
housingdata$year_built_quite_new <- ifelse(housingdata$year_built >= 1980 & housingdata$year_built < 2000, 1, 0)
housingdata$year_built_very_new <- ifelse(housingdata$year_built >= 2000, 1, 0)
housingdata$year_built_grp <- 1 * housingdata$year_built_very_old + 2 * housingdata$year_built_quite_old + 3 * housingdata$year_built_quite_new + 4 * housingdata$year_built_very_new
housingdata$log_sale_amount <- log(housingdata$sale_amount)
# 3D plot (not working)
#scatter3D(housingdata$log_sale_amount, housingdata$year_built_group, housingdata$number_of_fireplaces)
#Error in plot.new() : figure margins too large
# Save the data
cleanhousingdata <- housingdata
PROBLEM 2. Adding secondary data to the primary dataset
# Load the HPI data
hpidata <- read_excel("hpi_po_us_and_census.xls")
summary(hpidata)
## division year qtr
## Length:1350 Min. :1991 Min. :1.000
## Class :character 1st Qu.:1999 1st Qu.:1.000
## Mode :character Median :2007 Median :2.000
## Mean :2007 Mean :2.489
## 3rd Qu.:2016 3rd Qu.:3.000
## Max. :2024 Max. :4.000
## index_po_not_seasonally_adjusted index_po_seasonally_adjusted
## Min. : 94.17 Min. : 94.35
## 1st Qu.:136.18 1st Qu.:135.38
## Median :190.72 Median :189.87
## Mean :202.96 Mean :202.11
## 3rd Qu.:233.36 3rd Qu.:233.31
## Max. :599.69 Max. :591.16
# Clean the data to corresponding range
cleanhpidata <- hpidata[hpidata$division != "USA", ]
cleanhpidata <- cleanhpidata[cleanhpidata$year >= 2008, ]
# Re-adjust HPI according to the new base
hpi_base <- cleanhpidata %>%
group_by(division) %>%
slice(1)
hpi_base$base <- hpi_base$index_po_seasonally_adjusted
hpi_base <- subset(hpi_base, select = c(division, base))
cleanhpidata <- merge(cleanhpidata, hpi_base, by = "division")
cleanhpidata$hpi <- round(cleanhpidata$index_po_seasonally_adjusted / cleanhpidata$base * 100, digits = 2)
cleanhpidata <- subset(cleanhpidata, select = c(division, year, qtr, hpi))
# Convert the data to division level
df_abbr_to_division <- read_csv("df_abbr_to_division.csv", col_names = TRUE)
## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): abbr, division
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cleanhousingdata <- merge(cleanhousingdata, df_abbr_to_division, by = "abbr")
cleanhousingdata$qtr1 <- ifelse(cleanhousingdata$month > 0 & cleanhousingdata$month <= 3, 1, 0)
cleanhousingdata$qtr2 <- ifelse(cleanhousingdata$month > 3 & cleanhousingdata$month <= 6, 1, 0)
cleanhousingdata$qtr3 <- ifelse(cleanhousingdata$month > 6 & cleanhousingdata$month <= 9, 1, 0)
cleanhousingdata$qtr4 <- ifelse(cleanhousingdata$month > 9 & cleanhousingdata$month <= 12, 1, 0)
cleanhousingdata$qtr <- 1 * cleanhousingdata$qtr1 + 2 * cleanhousingdata$qtr2 + 3 * cleanhousingdata$qtr3 + 4 * cleanhousingdata$qtr4
cleanhousingdata <- merge(cleanhousingdata, cleanhpidata, by = c("division", "year", "qtr"))
cleanhousingdata$dlog_sale_amount <- log(cleanhousingdata$sale_amount * 100 / cleanhousingdata$hpi)
mappingdata <- cleanhousingdata %>%
group_by(abbr, year) %>%
summarise(mean_dlog_sale_amount = mean(dlog_sale_amount))
## `summarise()` has grouped output by 'abbr'. You can override using the
## `.groups` argument.
mappingdata$state <- mappingdata$abbr
# Map of housing prices for the year of 2008
mappingdata2008 <- mappingdata[mappingdata$year == 2008, ]
plot_usmap(data = mappingdata2008, values = "mean_dlog_sale_amount", color = "red") +
labs(title = "2008") +
scale_fill_continuous(name = "Mean of log of deflated sale amounts over time") +
theme(legend.position = "bottom")

# Map of housing prices for the year of 2019
mappingdata2019 <- mappingdata[mappingdata$year == 2019, ]
plot_usmap(data = mappingdata2019, values = "mean_dlog_sale_amount", color = "red") +
labs(title = "2019") +
scale_fill_continuous(name = "Mean of log of deflated sale amounts over time") +
theme(legend.position = "bottom")

# Map of housing prices by county
mappingdata <- cleanhousingdata %>%
group_by(fips_county_code) %>%
summarise(mean_dlog_sale_amount = mean(dlog_sale_amount))
mappingdata$fips <- mappingdata$fips_county_code
plot_usmap(data = mappingdata, values = "mean_dlog_sale_amount", color = "red") +
labs(title = "Mean of log of deflated sale amounts") +
scale_fill_continuous(name = "Mean of log of deflated sale amounts") +
theme(legend.position = "bottom")

# Map of year built by county
mappingdata <- cleanhousingdata %>%
group_by(fips_county_code) %>%
summarise(mean_year_built = mean(year_built))
mappingdata$fips <- mappingdata$fips_county_code
plot_usmap(data = mappingdata, values = "mean_year_built", color = "red") +
labs(title = "Mean of year built") +
scale_fill_continuous(name = "Mean of year built") +
theme(legend.position = "bottom")

# DISCUSSION: The later the house was built, the higher the sale price is.
PROBLEM 3. Adding further secondary data
# Load the crime rate data
cleancrimedata <- read_excel("table-1.xls")
## New names:
## • `` -> `...11`
## • `` -> `...12`
cleancrimedata <- cleancrimedata[cleancrimedata$year >= 2008 & cleancrimedata$year <= 2019, ]
cleancrimedata$year <- as.numeric(cleancrimedata$year)
cleanhousingcrimedata <- merge(cleanhousingdata, cleancrimedata, by = "year")
# Plot crime rate against year
ggplot(cleancrimedata, aes(x = as.factor(year), y = violent)) +
geom_bar(stat = "identity") +
labs(title = "Crime rate over time",
x = "Year",
y = "Total violent crime rate") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

plottingdata <- cleanhousingdata %>%
group_by(year, year_built_grp) %>%
summarise(mean_dlog_sale_amount = mean(dlog_sale_amount))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
plottingdata_grp1 <- plottingdata[plottingdata$year_built_grp == 1, ]
plottingdata_grp2 <- plottingdata[plottingdata$year_built_grp == 2, ]
plottingdata_grp3 <- plottingdata[plottingdata$year_built_grp == 3, ]
plottingdata_grp4 <- plottingdata[plottingdata$year_built_grp == 4, ]
ggplot(plottingdata_grp1, aes(x = as.factor(year), y = mean_dlog_sale_amount)) +
geom_bar(stat = "identity") +
labs(title = "Mean of log of deflated sale amounts over time: very old housing",
x = "Year",
y = "Mean of log of deflated sale amounts") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(plottingdata_grp2, aes(x = as.factor(year), y = mean_dlog_sale_amount)) +
geom_bar(stat = "identity") +
labs(title = "Mean of log of deflated sale amounts over time: quite old housing",
x = "Year",
y = "Mean of log of deflated sale amounts") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(plottingdata_grp3, aes(x = as.factor(year), y = mean_dlog_sale_amount)) +
geom_bar(stat = "identity") +
labs(title = "Mean of log of deflated sale amounts over time: quite new housing",
x = "Year",
y = "Mean of log of deflated sale amounts") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(plottingdata_grp4, aes(x = as.factor(year), y = mean_dlog_sale_amount)) +
geom_bar(stat = "identity") +
labs(title = "Mean of log of deflated sale amounts over time: very new housing",
x = "Year",
y = "Mean of log of deflated sale amounts") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# DISCUSSION: The total violent crime rates generally decreased over time, but the general sale amounts did not change much over time in each group. The sales amount of the quite new houses are somewhat higher than the other houses.
PROBLEM 4. Summarizing the data
empiricsdata <- subset(cleanhousingcrimedata, select = c(sale_amount, dlog_sale_amount,
violent, murder, rape, robbery, assault, property, burglary, larceny,
bedrooms_all_buildings, number_of_bathrooms, number_of_fireplaces, stories_number, land_square_footage, AC_presence,
division, year_built_grp))
summary(empiricsdata)
## sale_amount dlog_sale_amount violent murder
## Min. :1.000e+02 Min. : 4.39 Min. :361.6 Min. :4.400
## 1st Qu.:1.800e+05 1st Qu.:12.07 1st Qu.:369.1 1st Qu.:4.700
## Median :2.870e+05 Median :12.53 Median :383.8 Median :5.000
## Mean :4.417e+05 Mean :12.57 Mean :384.7 Mean :4.951
## 3rd Qu.:4.740e+05 3rd Qu.:13.03 3rd Qu.:387.1 3rd Qu.:5.300
## Max. :2.730e+09 Max. :21.64 Max. :458.6 Max. :5.400
## rape robbery assault property burglary
## Min. :25.90 Min. : 81.6 Min. :229.2 Min. :2110 Min. :340.5
## 1st Qu.:27.10 1st Qu.: 98.6 1st Qu.:238.1 1st Qu.:2363 1st Qu.:429.7
## Median :29.80 Median :102.2 Median :248.3 Median :2500 Median :494.7
## Mean :28.89 Mean :104.2 Mean :246.6 Mean :2561 Mean :524.2
## 3rd Qu.:30.00 3rd Qu.:113.1 3rd Qu.:250.2 3rd Qu.:2868 3rd Qu.:672.2
## Max. :31.00 Max. :145.9 Max. :277.5 Max. :3215 Max. :733.0
## larceny bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## Min. :1550 Min. : 1.000 Min. : 0.010 Min. : 1.00
## 1st Qu.:1696 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1.00
## Median :1784 Median : 3.000 Median : 2.000 Median : 1.00
## Mean :1802 Mean : 3.431 Mean : 2.472 Mean : 1.19
## 3rd Qu.:1965 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 1.00
## Max. :2166 Max. :999.000 Max. :450.000 Max. :999.00
## stories_number land_square_footage AC_presence division
## Min. : 0.500 Min. : 1 Min. :0.0000 Length:2350548
## 1st Qu.: 1.000 1st Qu.: 6874 1st Qu.:0.0000 Class :character
## Median : 1.000 Median : 9583 Median :1.0000 Mode :character
## Mean : 1.371 Mean : 39671 Mean :0.6091
## 3rd Qu.: 2.000 3rd Qu.: 17031 3rd Qu.:1.0000
## Max. :175.000 Max. :999999999 Max. :1.0000
## year_built_grp
## Min. :1.000
## 1st Qu.:2.000
## Median :3.000
## Mean :2.679
## 3rd Qu.:4.000
## Max. :4.000
tapply(empiricsdata, empiricsdata$division, summary)
## $DV_ENC
## sale_amount dlog_sale_amount violent murder
## Min. : 500 Min. : 6.211 Min. :361.6 Min. :4.400
## 1st Qu.: 151000 1st Qu.:11.885 1st Qu.:369.1 1st Qu.:4.800
## Median : 205000 Median :12.203 Median :383.8 Median :5.000
## Mean : 270018 Mean :12.225 Mean :386.9 Mean :4.984
## 3rd Qu.: 281750 3rd Qu.:12.539 3rd Qu.:387.8 3rd Qu.:5.300
## Max. :31000000 Max. :17.201 Max. :458.6 Max. :5.400
## rape robbery assault property burglary
## Min. :25.90 Min. : 81.6 Min. :229.2 Min. :2110 Min. :340.5
## 1st Qu.:27.70 1st Qu.: 86.1 1st Qu.:241.5 1st Qu.:2210 1st Qu.:378.0
## Median :29.80 Median :102.2 Median :248.3 Median :2500 Median :494.7
## Mean :29.05 Mean :104.8 Mean :248.1 Mean :2564 Mean :522.7
## 3rd Qu.:30.00 3rd Qu.:113.1 3rd Qu.:250.2 3rd Qu.:2868 3rd Qu.:672.2
## Max. :31.00 Max. :145.9 Max. :277.5 Max. :3215 Max. :733.0
## larceny bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## Min. :1550 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.:1602 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median :1784 Median : 3.000 Median : 2.500 Median : 1.000
## Mean :1804 Mean : 3.294 Mean : 2.338 Mean : 1.084
## 3rd Qu.:1965 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 1.000
## Max. :2166 Max. :30.000 Max. :22.500 Max. :11.000
## stories_number land_square_footage AC_presence division
## Min. :1.000 Min. : 44 Min. :0.0000 Length:6924
## 1st Qu.:1.000 1st Qu.: 7777 1st Qu.:1.0000 Class :character
## Median :1.500 Median : 10200 Median :1.0000 Mode :character
## Mean :1.535 Mean : 33879 Mean :0.7595
## 3rd Qu.:2.000 3rd Qu.: 18901 3rd Qu.:1.0000
## Max. :5.000 Max. :1711908 Max. :1.0000
## year_built_grp
## Min. :1.000
## 1st Qu.:2.000
## Median :3.000
## Mean :2.625
## 3rd Qu.:4.000
## Max. :4.000
##
## $DV_ESC
## sale_amount dlog_sale_amount violent murder
## Min. : 500 Min. : 6.001 Min. :361.6 Min. :4.400
## 1st Qu.: 148465 1st Qu.:11.824 1st Qu.:366.7 1st Qu.:4.900
## Median : 212900 Median :12.155 Median :370.4 Median :5.000
## Mean : 259805 Mean :12.110 Mean :380.1 Mean :4.983
## 3rd Qu.: 289200 3rd Qu.:12.482 3rd Qu.:386.6 3rd Qu.:5.000
## Max. :80240000 Max. :18.096 Max. :458.6 Max. :5.400
## rape robbery assault property
## Min. :25.90 Min. : 81.60 Min. :229.2 Min. :2110
## 1st Qu.:28.40 1st Qu.: 86.10 1st Qu.:242.8 1st Qu.:2210
## Median :29.90 Median : 98.60 Median :248.3 Median :2363
## Mean :29.34 Mean : 98.46 Mean :247.3 Mean :2440
## 3rd Qu.:30.70 3rd Qu.:102.90 3rd Qu.:250.2 3rd Qu.:2574
## Max. :31.00 Max. :145.90 Max. :277.5 Max. :3215
## burglary larceny bedrooms_all_buildings number_of_bathrooms
## Min. :340.5 Min. :1550 Min. : 1.000 Min. : 0.500
## 1st Qu.:378.0 1st Qu.:1602 1st Qu.: 3.000 1st Qu.: 2.000
## Median :429.7 Median :1696 Median : 3.000 Median : 2.100
## Mean :472.9 Mean :1735 Mean : 3.434 Mean : 2.528
## 3rd Qu.:537.2 3rd Qu.:1822 3rd Qu.: 4.000 3rd Qu.: 3.000
## Max. :733.0 Max. :2166 Max. :13.000 Max. :11.500
## number_of_fireplaces stories_number land_square_footage AC_presence
## Min. : 1.000 Min. :1.000 Min. : 436 Min. :0.0000
## 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.: 9888 1st Qu.:0.0000
## Median : 1.000 Median :1.000 Median : 15216 Median :1.0000
## Mean : 1.069 Mean :1.393 Mean : 85106 Mean :0.5625
## 3rd Qu.: 1.000 3rd Qu.:2.000 3rd Qu.: 26180 3rd Qu.:1.0000
## Max. :11.000 Max. :4.000 Max. :16984697 Max. :1.0000
## division year_built_grp
## Length:17793 Min. :1.000
## Class :character 1st Qu.:3.000
## Mode :character Median :4.000
## Mean :3.214
## 3rd Qu.:4.000
## Max. :4.000
##
## $DV_MA
## sale_amount dlog_sale_amount violent murder
## Min. : 1000 Min. : 6.812 Min. :361.6 Min. :4.400
## 1st Qu.: 100000 1st Qu.:11.528 1st Qu.:369.1 1st Qu.:4.700
## Median : 160000 Median :12.004 Median :383.8 Median :5.000
## Mean : 209782 Mean :11.958 Mean :384.3 Mean :4.962
## 3rd Qu.: 253000 3rd Qu.:12.454 3rd Qu.:387.1 3rd Qu.:5.300
## Max. :50000000 Max. :17.663 Max. :458.6 Max. :5.400
## rape robbery assault property burglary
## Min. :25.90 Min. : 81.6 Min. :229.2 Min. :2110 Min. :340.5
## 1st Qu.:27.10 1st Qu.: 98.6 1st Qu.:238.1 1st Qu.:2363 1st Qu.:429.7
## Median :29.80 Median :102.2 Median :248.3 Median :2500 Median :494.7
## Mean :28.95 Mean :103.7 Mean :246.7 Mean :2547 Mean :518.0
## 3rd Qu.:30.00 3rd Qu.:113.1 3rd Qu.:250.2 3rd Qu.:2868 3rd Qu.:672.2
## Max. :31.00 Max. :145.9 Max. :277.5 Max. :3215 Max. :733.0
## larceny bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## Min. :1550 Min. : 1.000 Min. : 0.500 Min. : 1.000
## 1st Qu.:1696 1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median :1784 Median : 3.000 Median : 2.000 Median : 1.000
## Mean :1795 Mean : 3.268 Mean : 1.973 Mean : 1.108
## 3rd Qu.:1965 3rd Qu.: 4.000 3rd Qu.: 2.500 3rd Qu.: 1.000
## Max. :2166 Max. :30.000 Max. :11.000 Max. :20.000
## stories_number land_square_footage AC_presence division
## Min. :0.910 Min. : 44 Min. :0.0000 Length:25401
## 1st Qu.:1.000 1st Qu.: 11761 1st Qu.:0.0000 Class :character
## Median :1.500 Median : 20473 Median :0.0000 Mode :character
## Mean :1.566 Mean : 42832 Mean :0.1528
## 3rd Qu.:2.000 3rd Qu.: 43560 3rd Qu.:0.0000
## Max. :5.000 Max. :4856940 Max. :1.0000
## year_built_grp
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :2.374
## 3rd Qu.:3.000
## Max. :4.000
##
## $DV_MT
## sale_amount dlog_sale_amount violent murder
## Min. : 500 Min. : 6.098 Min. :361.6 Min. :4.40
## 1st Qu.: 255100 1st Qu.:12.440 1st Qu.:369.1 1st Qu.:4.70
## Median : 360000 Median :12.745 Median :373.7 Median :5.00
## Mean : 483744 Mean :12.761 Mean :382.6 Mean :4.96
## 3rd Qu.: 492600 3rd Qu.:13.045 3rd Qu.:386.6 3rd Qu.:5.30
## Max. :82000000 Max. :18.003 Max. :458.6 Max. :5.40
## rape robbery assault property burglary
## Min. :25.90 Min. : 81.6 Min. :229.2 Min. :2110 Min. :340.5
## 1st Qu.:27.10 1st Qu.: 86.1 1st Qu.:238.1 1st Qu.:2210 1st Qu.:378.0
## Median :29.90 Median :102.2 Median :248.3 Median :2452 Median :468.9
## Mean :28.98 Mean :102.5 Mean :246.1 Mean :2525 Mean :508.8
## 3rd Qu.:30.70 3rd Qu.:109.0 3rd Qu.:250.2 3rd Qu.:2734 3rd Qu.:610.5
## Max. :31.00 Max. :145.9 Max. :277.5 Max. :3215 Max. :733.0
## larceny bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## Min. :1550 Min. : 1.000 Min. : 0.300 Min. : 1.000
## 1st Qu.:1602 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median :1745 Median : 3.000 Median : 3.000 Median : 1.000
## Mean :1783 Mean : 3.502 Mean : 2.799 Mean : 1.218
## 3rd Qu.:1902 3rd Qu.: 4.000 3rd Qu.: 3.250 3rd Qu.: 1.000
## Max. :2166 Max. :372.000 Max. :77.500 Max. :999.000
## stories_number land_square_footage AC_presence division
## Min. : 1.000 Min. : 1 Min. :0.0000 Length:258690
## 1st Qu.: 1.000 1st Qu.: 6534 1st Qu.:0.0000 Class :character
## Median : 2.000 Median : 8625 Median :0.0000 Mode :character
## Mean : 1.554 Mean : 44919 Mean :0.3339
## 3rd Qu.: 2.000 3rd Qu.: 14269 3rd Qu.:1.0000
## Max. :10.000 Max. :104633298 Max. :1.0000
## year_built_grp
## Min. :1.000
## 1st Qu.:3.000
## Median :3.000
## Mean :3.112
## 3rd Qu.:4.000
## Max. :4.000
##
## $DV_PAC
## sale_amount dlog_sale_amount violent murder
## Min. : 280 Min. : 5.369 Min. :361.6 Min. :4.400
## 1st Qu.: 244000 1st Qu.:12.386 1st Qu.:369.1 1st Qu.:4.700
## Median : 400000 Median :12.900 Median :383.8 Median :5.000
## Mean : 598281 Mean :12.928 Mean :386.4 Mean :4.942
## 3rd Qu.: 670000 3rd Qu.:13.401 3rd Qu.:387.8 3rd Qu.:5.300
## Max. :288164568 Max. :19.209 Max. :458.6 Max. :5.400
## rape robbery assault property burglary
## Min. :25.90 Min. : 81.6 Min. :229.2 Min. :2110 Min. :340.5
## 1st Qu.:27.10 1st Qu.: 98.6 1st Qu.:241.5 1st Qu.:2363 1st Qu.:429.7
## Median :29.10 Median :102.9 Median :248.3 Median :2500 Median :494.7
## Mean :28.79 Mean :105.7 Mean :246.9 Mean :2593 Mean :538.0
## 3rd Qu.:30.00 3rd Qu.:113.9 3rd Qu.:250.2 3rd Qu.:2905 3rd Qu.:701.0
## Max. :31.00 Max. :145.9 Max. :277.5 Max. :3215 Max. :733.0
## larceny bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## Min. :1550 Min. : 1.000 Min. : 0.010 Min. : 1.000
## 1st Qu.:1696 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median :1784 Median : 3.000 Median : 2.000 Median : 1.000
## Mean :1820 Mean : 3.351 Mean : 2.284 Mean : 1.111
## 3rd Qu.:1974 3rd Qu.: 4.000 3rd Qu.: 2.500 3rd Qu.: 1.000
## Max. :2166 Max. :999.000 Max. :175.000 Max. :198.000
## stories_number land_square_footage AC_presence division
## Min. : 1.000 Min. : 1 Min. :0.000 Length:950325
## 1st Qu.: 1.000 1st Qu.: 6098 1st Qu.:0.000 Class :character
## Median : 1.000 Median : 7700 Median :1.000 Mode :character
## Mean : 1.313 Mean : 31139 Mean :0.504
## 3rd Qu.: 2.000 3rd Qu.: 11480 3rd Qu.:1.000
## Max. :175.000 Max. :304920000 Max. :1.000
## year_built_grp
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :2.211
## 3rd Qu.:3.000
## Max. :4.000
##
## $DV_SA
## sale_amount dlog_sale_amount violent murder
## Min. : 100 Min. : 4.39 Min. :361.6 Min. :4.400
## 1st Qu.: 144000 1st Qu.:11.89 1st Qu.:369.1 1st Qu.:4.700
## Median : 220000 Median :12.28 Median :373.7 Median :5.000
## Mean : 332789 Mean :12.30 Mean :382.7 Mean :4.962
## 3rd Qu.: 340000 3rd Qu.:12.73 3rd Qu.:386.6 3rd Qu.:5.300
## Max. :600000000 Max. :20.00 Max. :458.6 Max. :5.400
## rape robbery assault property burglary
## Min. :25.9 Min. : 81.6 Min. :229.2 Min. :2110 Min. :340.5
## 1st Qu.:27.1 1st Qu.: 86.1 1st Qu.:238.1 1st Qu.:2210 1st Qu.:378.0
## Median :29.9 Median :102.2 Median :248.3 Median :2452 Median :468.9
## Mean :29.0 Mean :102.4 Mean :246.3 Mean :2523 Mean :508.1
## 3rd Qu.:30.7 3rd Qu.:109.0 3rd Qu.:250.2 3rd Qu.:2734 3rd Qu.:610.5
## Max. :31.0 Max. :145.9 Max. :277.5 Max. :3215 Max. :733.0
## larceny bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## Min. :1550 Min. : 1.000 Min. : 0.100 Min. : 1.000
## 1st Qu.:1602 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median :1745 Median : 3.000 Median : 2.500 Median : 1.000
## Mean :1782 Mean : 3.483 Mean : 2.615 Mean : 1.339
## 3rd Qu.:1902 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 1.000
## Max. :2166 Max. :164.000 Max. :450.000 Max. :998.000
## stories_number land_square_footage AC_presence division
## Min. : 0.500 Min. : 2 Min. :0.0000 Length:624384
## 1st Qu.: 1.000 1st Qu.: 9360 1st Qu.:1.0000 Class :character
## Median : 1.000 Median : 14375 Median :1.0000 Mode :character
## Mean : 1.379 Mean : 32888 Mean :0.7522
## 3rd Qu.: 2.000 3rd Qu.: 24394 3rd Qu.:1.0000
## Max. :21.000 Max. :130592880 Max. :1.0000
## year_built_grp
## Min. :1.000
## 1st Qu.:2.000
## Median :3.000
## Mean :2.907
## 3rd Qu.:4.000
## Max. :4.000
##
## $DV_WNC
## sale_amount dlog_sale_amount violent murder
## Min. :4.850e+02 Min. : 5.946 Min. :361.6 Min. :4.400
## 1st Qu.:1.500e+05 1st Qu.:11.857 1st Qu.:369.1 1st Qu.:4.700
## Median :2.250e+05 Median :12.248 Median :373.7 Median :5.000
## Mean :3.114e+05 Mean :12.213 Mean :384.5 Mean :4.954
## 3rd Qu.:3.300e+05 3rd Qu.:12.630 3rd Qu.:387.1 3rd Qu.:5.300
## Max. :2.730e+09 Max. :21.640 Max. :458.6 Max. :5.400
## rape robbery assault property burglary
## Min. :25.9 Min. : 81.6 Min. :229.2 Min. :2110 Min. :340.5
## 1st Qu.:27.1 1st Qu.: 98.6 1st Qu.:238.1 1st Qu.:2363 1st Qu.:429.7
## Median :29.8 Median :102.2 Median :248.3 Median :2500 Median :494.7
## Mean :28.9 Mean :104.0 Mean :246.5 Mean :2556 Mean :522.0
## 3rd Qu.:30.0 3rd Qu.:113.1 3rd Qu.:250.2 3rd Qu.:2868 3rd Qu.:672.2
## Max. :31.0 Max. :145.9 Max. :277.5 Max. :3215 Max. :733.0
## larceny bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## Min. :1550 Min. : 1.000 Min. : 0.250 Min. : 1.000
## 1st Qu.:1696 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median :1784 Median : 3.000 Median : 2.500 Median : 1.000
## Mean :1800 Mean : 3.301 Mean : 2.595 Mean : 1.193
## 3rd Qu.:1965 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 1.000
## Max. :2166 Max. :440.000 Max. :32.000 Max. :12.000
## stories_number land_square_footage AC_presence division
## Min. : 1.000 Min. : 4 Min. :0.0000 Length:106834
## 1st Qu.: 1.000 1st Qu.: 8836 1st Qu.:0.0000 Class :character
## Median : 1.000 Median : 12413 Median :0.0000 Mode :character
## Mean : 1.367 Mean : 86432 Mean :0.2803
## 3rd Qu.: 2.000 3rd Qu.: 24829 3rd Qu.:1.0000
## Max. :12.000 Max. :710899200 Max. :1.0000
## year_built_grp
## Min. :1.000
## 1st Qu.:2.000
## Median :3.000
## Mean :2.656
## 3rd Qu.:4.000
## Max. :4.000
##
## $DV_WSC
## sale_amount dlog_sale_amount violent murder
## Min. : 101 Min. : 4.518 Min. :361.6 Min. :4.400
## 1st Qu.: 149000 1st Qu.:11.767 1st Qu.:369.1 1st Qu.:4.700
## Median : 215000 Median :12.119 Median :383.8 Median :5.000
## Mean : 254375 Mean :12.102 Mean :385.7 Mean :4.945
## 3rd Qu.: 314366 3rd Qu.:12.484 3rd Qu.:387.1 3rd Qu.:5.300
## Max. :166250000 Max. :18.929 Max. :458.6 Max. :5.400
## rape robbery assault property burglary
## Min. :25.90 Min. : 81.6 Min. :229.2 Min. :2110 Min. :340.5
## 1st Qu.:27.10 1st Qu.: 98.6 1st Qu.:238.1 1st Qu.:2363 1st Qu.:429.7
## Median :29.80 Median :102.2 Median :248.2 Median :2500 Median :494.7
## Mean :28.84 Mean :105.0 Mean :246.9 Mean :2577 Mean :530.5
## 3rd Qu.:30.00 3rd Qu.:113.1 3rd Qu.:250.2 3rd Qu.:2868 3rd Qu.:672.2
## Max. :31.00 Max. :145.9 Max. :277.5 Max. :3215 Max. :733.0
## larceny bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## Min. :1550 Min. : 1.000 Min. : 0.500 Min. : 1.000
## 1st Qu.:1696 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median :1784 Median : 3.000 Median : 2.000 Median : 1.000
## Mean :1811 Mean : 3.555 Mean : 2.486 Mean : 1.129
## 3rd Qu.:1965 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.: 1.000
## Max. :2166 Max. :999.000 Max. :205.000 Max. :13.000
## stories_number land_square_footage AC_presence division
## Min. :1.000 Min. : 1 Min. :0.0000 Length:360197
## 1st Qu.:1.000 1st Qu.: 7220 1st Qu.:1.0000 Class :character
## Median :1.000 Median : 9091 Median :1.0000 Mode :character
## Mean :1.365 Mean : 53944 Mean :0.9651
## 3rd Qu.:2.000 3rd Qu.: 13445 3rd Qu.:1.0000
## Max. :4.000 Max. :999999999 Max. :1.0000
## year_built_grp
## Min. :1.000
## 1st Qu.:3.000
## Median :3.000
## Mean :3.213
## 3rd Qu.:4.000
## Max. :4.000
PROBLEM 5. Running regressions
# Generate the dependent variable
cleanhousingcrimedata$dep_var <- log(cleanhousingcrimedata$sale_amount / cleanhousingcrimedata$hpi)
# Estimate three different models
model1 <- lm(dep_var ~ violent +
bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence,
data = cleanhousingcrimedata)
summary(model1) # Preferred model
##
## Call:
## lm(formula = dep_var ~ violent + bedrooms_all_buildings + number_of_bathrooms +
## number_of_fireplaces + stories_number + land_square_footage +
## AC_presence, data = cleanhousingcrimedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -133.968 -0.419 -0.045 0.403 9.374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.176e+00 8.101e-03 885.851 <2e-16 ***
## violent 1.702e-04 2.047e-05 8.317 <2e-16 ***
## bedrooms_all_buildings 2.305e-02 3.374e-04 68.328 <2e-16 ***
## number_of_bathrooms 3.020e-01 5.627e-04 536.706 <2e-16 ***
## number_of_fireplaces 6.984e-05 6.994e-05 0.999 0.318
## stories_number 6.241e-02 1.048e-03 59.573 <2e-16 ***
## land_square_footage -6.413e-12 2.242e-10 -0.029 0.977
## AC_presence -3.165e-01 1.009e-03 -313.676 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7511 on 2350540 degrees of freedom
## Multiple R-squared: 0.1746, Adjusted R-squared: 0.1746
## F-statistic: 7.103e+04 on 7 and 2350540 DF, p-value: < 2.2e-16
model2 <- lm(dep_var ~ murder + rape + robbery + assault + property + burglary + larceny + motor,
data = cleanhousingcrimedata)
summary(model2)
##
## Call:
## lm(formula = dep_var ~ murder + rape + robbery + assault + property +
## burglary + larceny + motor, data = cleanhousingcrimedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1060 -0.4950 -0.0360 0.4662 9.0704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.1526350 0.2069650 34.560 < 2e-16 ***
## murder 0.0826872 0.0069346 11.924 < 2e-16 ***
## rape 0.0152382 0.0013705 11.118 < 2e-16 ***
## robbery -0.0070928 0.0010279 -6.900 5.19e-12 ***
## assault -0.0038842 0.0001839 -21.124 < 2e-16 ***
## property -0.0140170 0.0295511 -0.474 0.635
## burglary 0.0140722 0.0294747 0.477 0.633
## larceny 0.0148963 0.0296853 0.502 0.616
## motor 0.0141916 0.0295732 0.480 0.631
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8259 on 2350539 degrees of freedom
## Multiple R-squared: 0.002052, Adjusted R-squared: 0.002048
## F-statistic: 604.1 on 8 and 2350539 DF, p-value: < 2.2e-16
model3 <- lm(dep_var ~ murder + rape + robbery + assault + property + burglary + larceny + motor +
bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence,
data = cleanhousingcrimedata)
summary(model3)
##
## Call:
## lm(formula = dep_var ~ murder + rape + robbery + assault + property +
## burglary + larceny + motor + bedrooms_all_buildings + number_of_bathrooms +
## number_of_fireplaces + stories_number + land_square_footage +
## AC_presence, data = cleanhousingcrimedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -133.750 -0.417 -0.043 0.403 9.374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.799e+00 1.880e-01 36.155 < 2e-16 ***
## murder 4.930e-02 6.301e-03 7.825 5.08e-15 ***
## rape 1.599e-02 1.245e-03 12.844 < 2e-16 ***
## robbery -4.077e-03 9.339e-04 -4.366 1.27e-05 ***
## assault -3.180e-03 1.671e-04 -19.036 < 2e-16 ***
## property 4.811e-02 2.685e-02 1.792 0.0732 .
## burglary -4.785e-02 2.678e-02 -1.787 0.0740 .
## larceny -4.764e-02 2.697e-02 -1.766 0.0773 .
## motor -4.821e-02 2.687e-02 -1.794 0.0728 .
## bedrooms_all_buildings 2.299e-02 3.371e-04 68.202 < 2e-16 ***
## number_of_bathrooms 3.015e-01 5.622e-04 536.340 < 2e-16 ***
## number_of_fireplaces 7.321e-05 6.987e-05 1.048 0.2947
## stories_number 6.301e-02 1.047e-03 60.208 < 2e-16 ***
## land_square_footage 4.710e-12 2.240e-10 0.021 0.9832
## AC_presence -3.161e-01 1.008e-03 -313.513 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7504 on 2350533 degrees of freedom
## Multiple R-squared: 0.1762, Adjusted R-squared: 0.1762
## F-statistic: 3.591e+04 on 14 and 2350533 DF, p-value: < 2.2e-16
# Estimate the preferred model with regional data
df_NE <- cleanhousingcrimedata[cleanhousingcrimedata$division == "DV_NE" | cleanhousingcrimedata$division == "DV_MA", ]
model3_NE <- lm(dep_var ~ violent +
bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence,
data = df_NE)
summary(model3_NE)
##
## Call:
## lm(formula = dep_var ~ violent + bedrooms_all_buildings + number_of_bathrooms +
## number_of_fireplaces + stories_number + land_square_footage +
## AC_presence, data = df_NE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3308 -0.3071 0.1174 0.4276 5.2193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.588e+00 7.120e-02 78.482 < 2e-16 ***
## violent 8.561e-04 1.783e-04 4.801 1.59e-06 ***
## bedrooms_all_buildings 1.251e-01 5.676e-03 22.035 < 2e-16 ***
## number_of_bathrooms 3.049e-01 6.256e-03 48.735 < 2e-16 ***
## number_of_fireplaces 1.366e-01 1.039e-02 13.155 < 2e-16 ***
## stories_number 1.076e-01 9.600e-03 11.212 < 2e-16 ***
## land_square_footage 5.634e-07 3.532e-08 15.952 < 2e-16 ***
## AC_presence 5.302e-01 1.196e-02 44.327 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6722 on 25393 degrees of freedom
## Multiple R-squared: 0.3032, Adjusted R-squared: 0.303
## F-statistic: 1579 on 7 and 25393 DF, p-value: < 2.2e-16
df_MW <- cleanhousingcrimedata[cleanhousingcrimedata$division == "DV_ENC" | cleanhousingcrimedata$division == "DV_WNC", ]
model3_MW <- lm(dep_var ~ violent +
bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence,
data = df_MW)
summary(model3_MW)
##
## Call:
## lm(formula = dep_var ~ violent + bedrooms_all_buildings + number_of_bathrooms +
## number_of_fireplaces + stories_number + land_square_footage +
## AC_presence, data = df_MW)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5318 -0.2277 0.0399 0.3126 9.6270
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.329e+00 3.173e-02 199.451 < 2e-16 ***
## violent 3.173e-04 7.985e-05 3.974 7.09e-05 ***
## bedrooms_all_buildings 6.369e-03 1.195e-03 5.329 9.89e-08 ***
## number_of_bathrooms 2.817e-01 2.085e-03 135.152 < 2e-16 ***
## number_of_fireplaces 2.754e-01 4.471e-03 61.600 < 2e-16 ***
## stories_number 4.909e-02 4.218e-03 11.639 < 2e-16 ***
## land_square_footage -2.290e-10 5.264e-10 -0.435 0.664
## AC_presence 4.842e-02 4.239e-03 11.423 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6467 on 113750 degrees of freedom
## Multiple R-squared: 0.2422, Adjusted R-squared: 0.2421
## F-statistic: 5193 on 7 and 113750 DF, p-value: < 2.2e-16
df_S <- cleanhousingcrimedata[cleanhousingcrimedata$division == "DV_SA" | cleanhousingcrimedata$division == "DV_ESC" | cleanhousingcrimedata$division == "DV_WSC", ]
model3_S <- lm(dep_var ~ violent +
bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence,
data = df_S)
summary(model3_S)
##
## Call:
## lm(formula = dep_var ~ violent + bedrooms_all_buildings + number_of_bathrooms +
## number_of_fireplaces + stories_number + land_square_footage +
## AC_presence, data = df_S)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127.778 -0.288 0.019 0.322 8.014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.653e+00 1.129e-02 589.461 <2e-16 ***
## violent -4.903e-04 2.858e-05 -17.156 <2e-16 ***
## bedrooms_all_buildings 2.821e-02 4.159e-04 67.830 <2e-16 ***
## number_of_bathrooms 2.882e-01 7.206e-04 399.995 <2e-16 ***
## number_of_fireplaces 1.904e-05 6.338e-05 0.300 0.764
## stories_number 2.106e-01 1.524e-03 138.199 <2e-16 ***
## land_square_footage -5.184e-11 2.315e-10 -0.224 0.823
## AC_presence 3.289e-02 1.774e-03 18.539 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6736 on 1002366 degrees of freedom
## Multiple R-squared: 0.2376, Adjusted R-squared: 0.2376
## F-statistic: 4.462e+04 on 7 and 1002366 DF, p-value: < 2.2e-16
df_W <- cleanhousingcrimedata[cleanhousingcrimedata$division == "DV_MT" | cleanhousingcrimedata$division == "DV_PAC", ]
model3_W <- lm(dep_var ~ violent +
bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + land_square_footage + AC_presence,
data = df_W)
summary(model3_W)
##
## Call:
## lm(formula = dep_var ~ violent + bedrooms_all_buildings + number_of_bathrooms +
## number_of_fireplaces + stories_number + land_square_footage +
## AC_presence, data = df_W)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.771 -0.441 -0.055 0.399 9.232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.664e+00 9.905e-03 773.78 < 2e-16 ***
## violent -2.639e-04 2.490e-05 -10.60 < 2e-16 ***
## bedrooms_all_buildings 9.686e-03 4.698e-04 20.62 < 2e-16 ***
## number_of_bathrooms 3.569e-01 7.672e-04 465.20 < 2e-16 ***
## number_of_fireplaces 1.178e-02 4.318e-04 27.29 < 2e-16 ***
## stories_number -5.513e-02 1.240e-03 -44.47 < 2e-16 ***
## land_square_footage 3.893e-09 5.926e-10 6.57 5.04e-11 ***
## AC_presence -2.159e-01 1.216e-03 -177.60 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6628 on 1209007 degrees of freedom
## Multiple R-squared: 0.1981, Adjusted R-squared: 0.1981
## F-statistic: 4.267e+04 on 7 and 1209007 DF, p-value: < 2.2e-16
df_effect_by_division <- read_csv("df_effect_by_division.csv", col_names = TRUE) # Result collected and manually put into an excel spreadsheet
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): region, division
## dbl (1): effect
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_murder_effect <- merge(df_abbr_to_division, df_effect_by_division, by = c("division"))
df_murder_effect$state <- df_murder_effect$abbr
# Map
plot_usmap(data = df_murder_effect, values = "effect", color = "red") +
labs(title = "Variation of the effect of 1 unit increase in the murder rate on log of housing price") +
scale_fill_continuous(name = "Effect of murder (log of deflated sale amount)") +
theme(legend.position = "bottom")
