# Load housing data
housingdata <- readRDS("C:/Users/mohan/Dropbox/Mohan_files/530/PS1/testdata20250121.rds")
housingdata <- housingdata %>%
mutate(
sale_date = as.Date(sale_date, format = "%Y%m%d"),
year = year(sale_date),
month = month(sale_date),
day = day(sale_date)
)
head(housingdata %>% select(sale_date, year, month, day), 10)
## sale_date year month day
## <Date> <int> <int> <int>
## 1: 2015-01-29 2015 1 29
## 2: 2009-09-04 2009 9 4
## 3: 2013-02-28 2013 2 28
## 4: 2008-08-25 2008 8 25
## 5: 2014-02-25 2014 2 25
## 6: 2018-01-11 2018 1 11
## 7: 2015-04-08 2015 4 8
## 8: 2015-06-23 2015 6 23
## 9: 2009-05-28 2009 5 28
## 10: 2012-01-31 2012 1 31
summary(housingdata$year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2008 2012 2015 2015 2017 2019
summary(housingdata$year_built)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1700 1965 1986 1981 2001 2019
ggplot(housingdata, aes(x = year)) +
geom_histogram(binwidth = 1, fill = "lightblue", color = "black") +
labs(title = "Distribution of Year (Sale)", x = "Year", y = "Count")
ggplot(housingdata, aes(x = year_built)) +
geom_histogram(binwidth = 5, fill = "lightgreen", color = "black") +
labs(title = "Distribution of Year Built", x = "Year Built", y = "Count")
housingdata <- housingdata %>%
mutate(
year_built_grp = case_when(
year_built <= 1940 ~ 1, # Very old houses
year_built > 1940 & year_built <= 1970 ~ 2, # Quite old houses
year_built > 1970 & year_built <= 2000 ~ 3, # Quite new houses
year_built > 2000 ~ 4 # Very new houses
)
)
table(housingdata$year_built_grp)
##
## 1 2 3 4
## 157925 522850 1049282 620491
ggplot(housingdata, aes(x = factor(year_built_grp))) +
geom_bar(fill = "plum", color = "black") +
labs(title = "Distribution of Year Built Groups", x = "Year Built Group", y = "Count") +
scale_x_discrete(labels = c("1" = "Very Old", "2" = "Quite Old", "3" = "Quite New", "4" = "Very New"))
# iii
housingdata <- housingdata %>%
mutate(
log_sale_amount = log(sale_amount)
)
scatter3D(
x = housingdata$year_built_grp,
y = housingdata$log_sale_amount,
z = housingdata$year,
colvar = housingdata$year_built_grp,
colkey = list(length = 0.5, width = 0.5),
pch = 19,
cex = 0.7,
xlab = "Year Built Group",
ylab = "Log(Sale Amount)",
zlab = "Year",
main = "3D Plot: Log(Sale Amount), Year Built Group, and Year"
)
#iv There are outliers. I set upper and lower thresholds to filter the
outliers.
summary(housingdata$sale_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000e+02 1.800e+05 2.870e+05 4.417e+05 4.740e+05 2.730e+09
ggplot(housingdata, aes(x = sale_amount)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Boxplot of Sale Amount", x = "Sale Amount")
Q1 <- quantile(housingdata$sale_amount, 0.25, na.rm = TRUE)
Q3 <- quantile(housingdata$sale_amount, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
cleanhousingdata <- housingdata %>%
filter(
sale_amount >= lower_bound & sale_amount <= upper_bound
)
cat("Original dataset rows:", nrow(housingdata), "\n")
## Original dataset rows: 2350548
cat("Cleaned dataset rows:", nrow(cleanhousingdata), "\n")
## Cleaned dataset rows: 2177473
#i
hpidata <- read_excel("C:/Users/mohan/Dropbox/Mohan_files/530/PS1/hpi_po_us_and_census.xls")
cleanhpidata <- hpidata %>%
filter(division != "USA")
summary(cleanhpidata)
## division year qtr
## Length:1215 Min. :1991 Min. :1.000
## Class :character 1st Qu.:1999 1st Qu.:1.000
## Mode :character Median :2007 Median :2.000
## Mean :2007 Mean :2.489
## 3rd Qu.:2016 3rd Qu.:3.000
## Max. :2024 Max. :4.000
## index_po_not_seasonally_adjusted index_po_seasonally_adjusted
## Min. : 94.17 Min. : 94.35
## 1st Qu.:136.69 1st Qu.:135.78
## Median :190.75 Median :190.13
## Mean :203.28 Mean :202.43
## 3rd Qu.:233.86 3rd Qu.:234.03
## Max. :599.69 Max. :591.16
table(cleanhpidata$division)
##
## DV_ENC DV_ESC DV_MA DV_MT DV_NE DV_PAC DV_SA DV_WNC DV_WSC
## 135 135 135 135 135 135 135 135 135
cleanhpidata <- cleanhpidata %>%
group_by(division) %>%
mutate(
base_index = index_po_seasonally_adjusted[qtr == 1 & year == 2008],
hpi = index_po_seasonally_adjusted / base_index * 100
) %>%
ungroup()
range_years <- range(housingdata$year, na.rm = TRUE)
cleanhpidata <- cleanhpidata %>%
filter(year >= range_years[1] & year <= range_years[2])
summary(cleanhpidata)
## division year qtr
## Length:432 Min. :2008 Min. :1.00
## Class :character 1st Qu.:2011 1st Qu.:1.75
## Mode :character Median :2014 Median :2.50
## Mean :2014 Mean :2.50
## 3rd Qu.:2016 3rd Qu.:3.25
## Max. :2019 Max. :4.00
## index_po_not_seasonally_adjusted index_po_seasonally_adjusted base_index
## Min. :152.3 Min. :154.0 Min. :179.9
## 1st Qu.:191.2 1st Qu.:190.3 1st Qu.:195.5
## Median :204.1 Median :203.6 Median :213.9
## Mean :215.7 Mean :214.8 Mean :214.3
## 3rd Qu.:233.2 3rd Qu.:232.9 3rd Qu.:222.3
## Max. :375.8 Max. :376.6 Max. :271.3
## hpi
## Min. : 71.87
## 1st Qu.: 91.09
## Median : 97.26
## Mean :100.50
## 3rd Qu.:109.20
## Max. :146.23
cleanhousingdata <- cleanhousingdata %>%
mutate(
qtr = case_when(
month %in% 1:3 ~ 1,
month %in% 4:6 ~ 2,
month %in% 7:9 ~ 3,
month %in% 10:12 ~ 4
)
)
state_division_map <- c(
"AL" = "DV_ESC",
"CA" = "DV_PAC",
"CO" = "DV_M",
"FL" = "DV_SA",
"GA" = "DV_SA",
"IA" = "DV_WNC",
"IL" = "DV_ENC",
"KY" = "DV_ESC",
"MI" = "DV_ENC",
"MN" = "DV_WNC",
"MO" = "DV_WNC",
"NC" = "DV_SA",
"NE" = "DV_WNC",
"NJ" = "DV_MA",
"NM" = "DV_M",
"NV" = "DV_M",
"NY" = "DV_MA",
"OH" = "DV_ENC",
"OK" = "DV_WSC",
"OR" = "DV_PAC",
"PA" = "DV_MA",
"SC" = "DV_SA",
"TN" = "DV_ESC",
"TX" = "DV_WSC",
"VA" = "DV_SA",
"WA" = "DV_PAC"
)
cleanhousingdata <- cleanhousingdata %>%
mutate(division = state_division_map[abbr])
# missing_divisions <- cleanhousingdata %>%
# filter(is.na(division))
#
# print(missing_divisions)
cleanhousinghpidata <- cleanhousingdata %>%
left_join(
cleanhpidata %>% select(division, year, qtr, hpi),
by = c("division", "year", "qtr")
)
cleanhousinghpidata <- cleanhousinghpidata %>%
mutate(
dlog_sale_amount = log(sale_amount * 100 / hpi)
)
states_fips <- usmap::fips_info()
cleanhousinghpidata <- cleanhousinghpidata %>%
left_join(states_fips %>% select(abbr, fips), by = "abbr")
state_year_summary <- cleanhousinghpidata %>%
group_by(fips, year) %>%
summarise(mean_dlog_sale = mean(dlog_sale_amount, na.rm = TRUE), .groups = "drop") %>%
ungroup()
start_year <- min(state_year_summary$year, na.rm = TRUE)
end_year <- max(state_year_summary$year, na.rm = TRUE)
state_map_start <- state_year_summary %>% filter(year == start_year)
state_map_end <- state_year_summary %>% filter(year == end_year)
plot_state_map <- function(data, year_label) {
plot_usmap(data = data, values = "mean_dlog_sale", regions = "states") +
scale_fill_continuous(low = "lightblue", high = "darkblue", name = "log(price/hpi)") +
labs(title = paste("House Price Index (HPI) -", year_label)) +
theme(legend.position = "right")
}
p1 <- plot_state_map(state_map_start, start_year)
p2 <- plot_state_map(state_map_end, end_year)
# State-Level Maps for Start and End Year
print(p1)
print(p2)
plot_list <- list()
for (y in unique(state_year_summary$year)) {
data <- state_year_summary %>% filter(year == y)
p <- plot_state_map(data, y)
plot_list[[as.character(y)]] <- p
}
#Annual State-Level Maps
library(gridExtra)
do.call(grid.arrange, c(plot_list, ncol = 3))
############################
state_built_summary <- cleanhousinghpidata %>%
group_by(fips, year_built_grp) %>%
summarise(mean_dlog_sale = mean(dlog_sale_amount, na.rm = TRUE), .groups = "drop") %>%
ungroup()
plot_state_map_built <- function(data, built_grp_label) {
plot_usmap(data = data, values = "mean_dlog_sale", regions = "states") +
scale_fill_continuous(low = "lightgreen", high = "darkgreen", name = "log(price/hpi)") +
labs(title = paste("House Price Index by Year Built Group -", built_grp_label)) +
theme(legend.position = "right")
}
plot_list_built <- list()
for (g in unique(state_built_summary$year_built_grp)) {
data <- state_built_summary %>% filter(year_built_grp == g)
p <- plot_state_map_built(data, g)
plot_list_built[[as.character(g)]] <- p
}
# Maps by year_built_grp
do.call(grid.arrange, c(plot_list_built, ncol = 2))
#######################################################################
county_summary <- cleanhousinghpidata %>%
group_by(fips, fips_county_code) %>%
summarise(
mean_year_built_grp = mean(year_built_grp, na.rm = TRUE),
mean_dlog_sale = mean(dlog_sale_amount, na.rm = TRUE), .groups = "drop"
) %>%
ungroup()
county_summary2 <- county_summary %>%
select(-fips) %>% # Remove the existing state-level fips column
rename(fips = fips_county_code) # Rename county FIPS correctly
plot_county_map <- function(data, value_col, title_label) {
plot_usmap(data = data, values = value_col, regions = "counties") +
scale_fill_continuous(low = "yellow", high = "red", name = title_label) +
labs(title = title_label) +
theme(legend.position = "right")
}
p3 <- plot_county_map(county_summary2, "mean_year_built_grp", "Average Year Built Group")
p4 <- plot_county_map(county_summary2, "mean_dlog_sale", "Average House Price")
# County-Level Maps
print(p3)
print(p4)
## PROBLEM 3. Adding further secondary data
crimedata <- read_excel("C:/Users/mohan/Dropbox/Mohan_files/530/PS1/table-1.xls",
range = "A4:V24", col_names = TRUE)
colnames(crimedata) <- c("year", "population", "violent_crime", "violent_crime_rate",
"murder", "murder_rate", "rape_revised", "rape_revised_rate",
"rape_legacy", "rape_legacy_rate", "robbery", "robbery_rate",
"aggravated_assault", "aggravated_assault_rate",
"property_crime", "property_crime_rate", "burglary", "burglary_rate",
"larceny_theft", "larceny_theft_rate", "motor_vehicle_theft",
"motor_vehicle_theft_rate")
crimedata <- crimedata %>%
mutate(year = as.numeric(substr(as.character(year), 1, 4)))
crimedata <- crimedata %>%
mutate(year = as.numeric(year)) %>%
filter(!is.na(year))
summary(crimedata)
## year population violent_crime violent_crime_rate
## Min. :2000 Min. :281421906 Min. :1153022 Min. :361.6
## 1st Qu.:2005 1st Qu.:295794506 1st Qu.:1208999 1st Qu.:381.3
## Median :2010 Median :308168384 Median :1288572 Median :418.2
## Mean :2010 Mean :307116277 Mean :1305421 Mean :427.3
## 3rd Qu.:2014 3rd Qu.:319404705 3rd Qu.:1401588 3rd Qu.:472.8
## Max. :2019 Max. :328239523 Max. :1439480 Max. :506.5
##
## murder murder_rate rape_revised rape_revised_rate
## Min. :14164 Min. :4.400 Min. :113695 Min. :35.90
## 1st Qu.:15263 1st Qu.:4.875 1st Qu.:122081 1st Qu.:38.15
## Median :16188 Median :5.350 Median :132414 Median :40.90
## Mean :15984 Mean :5.205 Mean :129931 Mean :40.20
## 3rd Qu.:16581 3rd Qu.:5.600 3rd Qu.:137741 3rd Qu.:42.15
## Max. :17413 Max. :5.800 Max. :143765 Max. :44.00
## NA's :13 NA's :13
## rape_legacy rape_legacy_rate robbery robbery_rate
## Min. : 82109 Min. :25.90 Min. :267988 Min. : 81.6
## 1st Qu.: 88329 1st Qu.:28.23 1st Qu.:331625 1st Qu.:102.7
## Median : 91711 Median :30.30 Median :385280 Median :126.2
## Mean : 91781 Mean :29.94 Mean :375602 Mean :123.2
## 3rd Qu.: 95126 3rd Qu.:31.80 3rd Qu.:418280 3rd Qu.:145.2
## Max. :101363 Max. :33.10 Max. :449246 Max. :150.0
##
## aggravated_assault aggravated_assault_rate property_crime
## Min. :726777 Min. :229.2 Min. : 6925677
## 1st Qu.:777397 1st Qu.:246.8 1st Qu.: 8162786
## Median :816848 Median :258.8 Median : 9224842
## Mean :822054 Mean :268.9 Mean : 9141687
## 3rd Qu.:863255 3rd Qu.:291.1 3rd Qu.:10176712
## Max. :911706 Max. :324.0 Max. :10455277
##
## property_crime_rate burglary burglary_rate larceny_theft
## Min. :2110 Min. :1117696 Min. :340.5 Min. :5086096
## 1st Qu.:2556 1st Qu.:1681756 1st Qu.:526.6 1st Qu.:5787662
## Median :2994 Median :2130489 Median :709.5 Median :6271348
## Mean :2999 Mean :1927672 Mean :633.0 Mean :6278173
## 3rd Qu.:3452 3rd Qu.:2172629 3rd Qu.:731.0 3rd Qu.:6821858
## Max. :3658 Max. :2228887 Max. :747.0 Max. :7092267
##
## larceny_theft_rate motor_vehicle_theft motor_vehicle_theft_rate
## Min. :1550 Min. : 686803 Min. :215.4
## 1st Qu.:1812 1st Qu.: 722861 1st Qu.:230.2
## Median :2035 Median : 784298 Median :249.2
## Mean :2058 Mean : 935842 Mean :308.5
## 3rd Qu.:2306 3rd Qu.:1205782 3rd Qu.:413.4
## Max. :2486 Max. :1261226 Max. :433.7
##
year_range <- range(cleanhousinghpidata$year, na.rm = TRUE)
cleancrimedata <- crimedata %>%
filter(year >= year_range[1] & year <= year_range[2]) %>%
select(year, violent_crime_rate, property_crime_rate, murder_rate)
head(cleancrimedata)
## # A tibble: 6 × 4
## year violent_crime_rate property_crime_rate murder_rate
## <dbl> <dbl> <dbl> <dbl>
## 1 2008 459. 3215. 5.4
## 2 2009 432. 3041. 5
## 3 2010 404. 2946. 4.8
## 4 2011 387. 2905. 4.7
## 5 2012 388. 2868 4.7
## 6 2013 369. 2734. 4.5
cleanhousinghpicrimedata <- cleanhousinghpidata %>%
left_join(cleancrimedata, by = "year")
head(cleanhousinghpicrimedata)
## sale_amount sale_date x y fips_county_code year_built
## <num> <Date> <num> <num> <int> <int>
## 1: 104000 2015-01-29 -86.19826 32.41438 1101 2009
## 2: 180501 2009-09-04 -86.20467 32.32170 1101 1986
## 3: 140000 2013-02-28 -86.13278 32.34974 1101 2006
## 4: 245916 2008-08-25 -86.17190 32.40677 1101 2000
## 5: 60000 2014-02-25 -86.27811 32.39384 1101 1940
## 6: 108150 2018-01-11 -86.12927 32.34984 1101 2006
## bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## <int> <num> <int>
## 1: 3 2.0 1
## 2: 3 2.0 1
## 3: 3 2.0 1
## 4: 3 2.0 1
## 5: 2 1.5 1
## 6: 3 2.0 1
## stories_number land_square_footage abbr AC_presence year month day
## <num> <int> <char> <num> <num> <int> <int>
## 1: 1 12855 AL 0 2015 1 29
## 2: 1 16583 AL 0 2009 9 4
## 3: 1 6216 AL 0 2013 2 28
## 4: 1 8372 AL 0 2008 8 25
## 5: 1 21928 AL 0 2014 2 25
## 6: 1 4713 AL 0 2018 1 11
## year_built_grp log_sale_amount qtr division hpi dlog_sale_amount
## <num> <num> <num> <char> <num> <num>
## 1: 4 11.55215 1 DV_ESC 99.83123 11.55384
## 2: 3 12.10349 3 DV_ESC 95.13630 12.15335
## 3: 4 11.84940 1 DV_ESC 93.68383 11.91464
## 4: 3 12.41275 3 DV_ESC 97.95939 12.43336
## 5: 1 11.00210 1 DV_ESC 96.14381 11.04142
## 6: 4 11.59127 1 DV_ESC 115.72137 11.44526
## fips violent_crime_rate property_crime_rate murder_rate
## <char> <num> <num> <num>
## 1: 01 373.7 2500.5 4.9
## 2: 01 431.9 3041.3 5.0
## 3: 01 369.1 2733.6 4.5
## 4: 01 458.6 3214.6 5.4
## 5: 01 361.6 2574.1 4.4
## 6: 01 370.4 2209.8 5.0
#Plot Crime Rates Against Year
ggplot(cleanhousinghpicrimedata, aes(x = year)) +
geom_line(aes(y = violent_crime_rate, color = "Violent Crime Rate"), linewidth = 1) +
geom_line(aes(y = property_crime_rate, color = "Property Crime Rate"), linewidth = 1) +
geom_line(aes(y = murder_rate, color = "Murder Rate"), linewidth = 1) +
labs(title = "Crime Rates Over Time",
x = "Year",
y = "Crime Rate (per 100,000 inhabitants)") +
theme_minimal() +
scale_color_manual(name = "Crime Type",
values = c("Violent Crime Rate" = "red",
"Property Crime Rate" = "blue",
"Murder Rate" = "black"))
#Plot dlog_sale_amount Against Year by year_built_grp
cleanhousinghpicrimedata <- cleanhousinghpicrimedata %>%
filter(!is.na(hpi) & hpi > 0)
ggplot(cleanhousinghpicrimedata, aes(x = year, y = dlog_sale_amount, color = as.factor(year_built_grp))) +
geom_line(stat = "summary", fun = mean, size = 1) +
labs(title = "Deflated Log Sale Amount Over Time by Year Built Group",
x = "Year",
y = "Mean dlog_sale_amount",
color = "Year Built Group") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## PROBLEM 4. Summarizing the data
summary_full <- cleanhousinghpicrimedata %>%
select(sale_amount, dlog_sale_amount, violent_crime_rate, property_crime_rate, murder_rate,
year_built, year_built_grp)
stargazer(summary_full, type = "text", title = "Summary Statistics - Full Sample")
##
## Summary Statistics - Full Sample
## ===========================================================================
## Statistic N Mean St. Dev. Min Max
## ---------------------------------------------------------------------------
## sale_amount 1,929,564 307,710.000 191,439.300 100.000 915,000.000
## dlog_sale_amount 1,929,564 12.402 0.712 4.390 14.044
## violent_crime_rate 1,929,564 385.347 24.393 361.600 458.600
## property_crime_rate 1,929,564 2,572.400 326.658 2,109.900 3,214.600
## murder_rate 1,929,564 4.948 0.309 4.400 5.400
## year_built 1,929,564 1,981.345 24.137 1,700 2,019
## year_built_grp 1,929,564 2.906 0.853 1 4
## ---------------------------------------------------------------------------
summary_by_year_built_grp <- cleanhousinghpicrimedata %>%
group_by(year_built_grp) %>%
summarise(
sale_amount_mean = mean(sale_amount, na.rm = TRUE),
dlog_sale_amount_mean = mean(dlog_sale_amount, na.rm = TRUE),
violent_crime_rate_mean = mean(violent_crime_rate, na.rm = TRUE),
property_crime_rate_mean = mean(property_crime_rate, na.rm = TRUE),
murder_rate_mean = mean(murder_rate, na.rm = TRUE),
year_built_mean = mean(year_built, na.rm = TRUE)
)
kable(summary_by_year_built_grp, caption = "Summary Statistics by Year Built Group") %>%
kable_styling(full_width = FALSE)
| year_built_grp | sale_amount_mean | dlog_sale_amount_mean | violent_crime_rate_mean | property_crime_rate_mean | murder_rate_mean | year_built_mean |
|---|---|---|---|---|---|---|
| 1 | 328950.0 | 12.35417 | 385.8611 | 2587.192 | 4.938901 | 1923.953 |
| 2 | 353120.5 | 12.51055 | 386.4877 | 2598.179 | 4.937084 | 1957.862 |
| 3 | 289253.1 | 12.35696 | 384.2947 | 2557.417 | 4.948682 | 1986.273 |
| 4 | 296101.8 | 12.39938 | 386.1105 | 2573.155 | 4.957604 | 2007.321 |
summary_by_division <- cleanhousinghpicrimedata %>%
group_by(division) %>%
summarise(
sale_amount_mean = mean(sale_amount, na.rm = TRUE),
dlog_sale_amount_mean = mean(dlog_sale_amount, na.rm = TRUE),
violent_crime_rate_mean = mean(violent_crime_rate, na.rm = TRUE),
property_crime_rate_mean = mean(property_crime_rate, na.rm = TRUE),
murder_rate_mean = mean(murder_rate, na.rm = TRUE),
year_built_mean = mean(year_built, na.rm = TRUE)
)
kable(summary_by_division, caption = "Summary Statistics by Census Division") %>%
kable_styling(full_width = FALSE)
| division | sale_amount_mean | dlog_sale_amount_mean | violent_crime_rate_mean | property_crime_rate_mean | murder_rate_mean | year_built_mean |
|---|---|---|---|---|---|---|
| DV_ENC | 235055.3 | 12.19496 | 386.9429 | 2564.518 | 4.982437 | 1979.142 |
| DV_ESC | 231787.3 | 12.09080 | 380.0758 | 2440.104 | 4.982176 | 1993.127 |
| DV_MA | 194339.4 | 11.93277 | 384.2614 | 2547.189 | 4.961842 | 1972.058 |
| DV_PAC | 390170.3 | 12.72023 | 387.3039 | 2610.851 | 4.937189 | 1972.986 |
| DV_SA | 252154.0 | 12.22713 | 382.8063 | 2525.114 | 4.960761 | 1986.190 |
| DV_WNC | 252019.6 | 12.17645 | 384.4482 | 2556.924 | 4.952392 | 1979.100 |
| DV_WSC | 242800.0 | 12.08773 | 385.7376 | 2577.424 | 4.945299 | 1993.003 |
#iv
summary_by_region <- cleanhousinghpicrimedata %>%
group_by(division) %>%
summarise(
sale_amount_mean = mean(sale_amount, na.rm = TRUE),
dlog_sale_amount_mean = mean(dlog_sale_amount, na.rm = TRUE),
violent_crime_rate_mean = mean(violent_crime_rate, na.rm = TRUE),
property_crime_rate_mean = mean(property_crime_rate, na.rm = TRUE),
murder_rate_mean = mean(murder_rate, na.rm = TRUE),
year_built_mean = mean(year_built, na.rm = TRUE)
)
kable(summary_by_region, caption = "Summary Statistics by census-designated Region") %>%
kable_styling(full_width = FALSE)
| division | sale_amount_mean | dlog_sale_amount_mean | violent_crime_rate_mean | property_crime_rate_mean | murder_rate_mean | year_built_mean |
|---|---|---|---|---|---|---|
| DV_ENC | 235055.3 | 12.19496 | 386.9429 | 2564.518 | 4.982437 | 1979.142 |
| DV_ESC | 231787.3 | 12.09080 | 380.0758 | 2440.104 | 4.982176 | 1993.127 |
| DV_MA | 194339.4 | 11.93277 | 384.2614 | 2547.189 | 4.961842 | 1972.058 |
| DV_PAC | 390170.3 | 12.72023 | 387.3039 | 2610.851 | 4.937189 | 1972.986 |
| DV_SA | 252154.0 | 12.22713 | 382.8063 | 2525.114 | 4.960761 | 1986.190 |
| DV_WNC | 252019.6 | 12.17645 | 384.4482 | 2556.924 | 4.952392 | 1979.100 |
| DV_WSC | 242800.0 | 12.08773 | 385.7376 | 2577.424 | 4.945299 | 1993.003 |
model_formula <- dlog_sale_amount ~ violent_crime_rate + property_crime_rate +
murder_rate + year_built + factor(year_built_grp) + factor(region)
library(texreg)
## Warning: 程序包'texreg'是用R版本4.4.1 来建造的
## Version: 1.39.4
## Date: 2024-07-23
## Author: Philip Leifeld (University of Manchester)
##
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
##
## 载入程序包:'texreg'
## The following object is masked from 'package:tidyr':
##
## extract
model1 <- lm(dlog_sale_amount ~ violent_crime_rate + property_crime_rate, data = cleanhousinghpicrimedata)
model2 <- lm(dlog_sale_amount ~ violent_crime_rate + property_crime_rate + murder_rate, data = cleanhousinghpicrimedata)
model3 <- lm(dlog_sale_amount ~ violent_crime_rate + property_crime_rate + murder_rate + year_built + factor(year_built_grp), data = cleanhousinghpicrimedata)
screenreg(list(model1, model2, model3), custom.model.names = c("Model 1", "Model 2", "Model 3"))
##
## =======================================================================
## Model 1 Model 2 Model 3
## -----------------------------------------------------------------------
## (Intercept) 12.33 *** 12.22 *** 7.86 ***
## (0.01) (0.01) (0.12)
## violent_crime_rate -0.00 *** -0.00 *** -0.00 ***
## (0.00) (0.00) (0.00)
## property_crime_rate 0.00 *** 0.00 *** 0.00 ***
## (0.00) (0.00) (0.00)
## murder_rate 0.08 *** 0.08 ***
## (0.00) (0.00)
## year_built 0.00 ***
## (0.00)
## factor(year_built_grp)2 0.08 ***
## (0.00)
## factor(year_built_grp)3 -0.13 ***
## (0.00)
## factor(year_built_grp)4 -0.14 ***
## (0.01)
## -----------------------------------------------------------------------
## R^2 0.01 0.01 0.02
## Adj. R^2 0.01 0.01 0.02
## Num. obs. 1929564 1929564 1929564
## =======================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
model_by_region <- cleanhousinghpicrimedata %>%
group_by(division) %>%
do(model = lm(dlog_sale_amount ~ violent_crime_rate + property_crime_rate + year_built, data = .))
region_results <- model_by_region %>%
summarise(division = unique(division),
violent_crime_coef = coef(model)[["violent_crime_rate"]],
property_crime_coef = coef(model)[["property_crime_rate"]])
print(region_results)
## # A tibble: 7 × 3
## division violent_crime_coef property_crime_coef
## <chr> <dbl> <dbl>
## 1 DV_ENC -0.000201 0.000274
## 2 DV_ESC -0.000414 0.000142
## 3 DV_MA 0.000558 0.0000113
## 4 DV_PAC -0.00228 0.000297
## 5 DV_SA -0.000595 0.0000375
## 6 DV_WNC -0.00154 0.000140
## 7 DV_WSC -0.00112 0.000125
cleanhousinghpicrimedata <- cleanhousinghpicrimedata %>%
left_join(region_results, by = "division")
ggplot(cleanhousinghpicrimedata, aes(x = division, y = violent_crime_coef, fill = division)) +
geom_bar(stat = "identity") +
labs(title = "Estimated Effect of Violent Crime Rate on House Prices by Division",
x = "Census Division",
y = "Effect Size") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))