Practice Data

# Load housing data
housingdata <- readRDS("C:/Users/mohan/Dropbox/Mohan_files/530/PS1/testdata20250121.rds")

PROBLEM 1. Checking the raw data in the primary dataset

i

housingdata <- housingdata %>%
  mutate(
    sale_date = as.Date(sale_date, format = "%Y%m%d"), 
    year = year(sale_date),
    month = month(sale_date), 
    day = day(sale_date) 
  )

head(housingdata %>% select(sale_date, year, month, day), 10)
##      sale_date  year month   day
##         <Date> <int> <int> <int>
##  1: 2015-01-29  2015     1    29
##  2: 2009-09-04  2009     9     4
##  3: 2013-02-28  2013     2    28
##  4: 2008-08-25  2008     8    25
##  5: 2014-02-25  2014     2    25
##  6: 2018-01-11  2018     1    11
##  7: 2015-04-08  2015     4     8
##  8: 2015-06-23  2015     6    23
##  9: 2009-05-28  2009     5    28
## 10: 2012-01-31  2012     1    31

ii

summary(housingdata$year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2008    2012    2015    2015    2017    2019
summary(housingdata$year_built)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1700    1965    1986    1981    2001    2019
ggplot(housingdata, aes(x = year)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "black") +
  labs(title = "Distribution of Year (Sale)", x = "Year", y = "Count")

ggplot(housingdata, aes(x = year_built)) +
  geom_histogram(binwidth = 5, fill = "lightgreen", color = "black") +
  labs(title = "Distribution of Year Built", x = "Year Built", y = "Count")

housingdata <- housingdata %>%
  mutate(
    year_built_grp = case_when(
      year_built <= 1940 ~ 1,  # Very old houses
      year_built > 1940 & year_built <= 1970 ~ 2,  # Quite old houses
      year_built > 1970 & year_built <= 2000 ~ 3,  # Quite new houses
      year_built > 2000 ~ 4  # Very new houses
    )
  )
table(housingdata$year_built_grp)
## 
##       1       2       3       4 
##  157925  522850 1049282  620491
ggplot(housingdata, aes(x = factor(year_built_grp))) +
  geom_bar(fill = "plum", color = "black") +
  labs(title = "Distribution of Year Built Groups", x = "Year Built Group", y = "Count") +
  scale_x_discrete(labels = c("1" = "Very Old", "2" = "Quite Old", "3" = "Quite New", "4" = "Very New"))

# iii

housingdata <- housingdata %>%
  mutate(
    log_sale_amount = log(sale_amount) 
  )
scatter3D(
  x = housingdata$year_built_grp,               
  y = housingdata$log_sale_amount,             
  z = housingdata$year,                        
  colvar = housingdata$year_built_grp,
  colkey = list(length = 0.5, width = 0.5),    
  pch = 19,                                    
  cex = 0.7,                                   
  xlab = "Year Built Group",                   
  ylab = "Log(Sale Amount)",                   
  zlab = "Year",                               
  main = "3D Plot: Log(Sale Amount), Year Built Group, and Year"
)

#iv There are outliers. I set upper and lower thresholds to filter the outliers.

summary(housingdata$sale_amount)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.000e+02 1.800e+05 2.870e+05 4.417e+05 4.740e+05 2.730e+09
ggplot(housingdata, aes(x = sale_amount)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot of Sale Amount", x = "Sale Amount")

Q1 <- quantile(housingdata$sale_amount, 0.25, na.rm = TRUE)
Q3 <- quantile(housingdata$sale_amount, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
cleanhousingdata <- housingdata %>%
  filter(
    sale_amount >= lower_bound & sale_amount <= upper_bound
  )
cat("Original dataset rows:", nrow(housingdata), "\n")
## Original dataset rows: 2350548
cat("Cleaned dataset rows:", nrow(cleanhousingdata), "\n")
## Cleaned dataset rows: 2177473

PROBLEM 2. Adding secondary data to the primary dataset

#i

hpidata <- read_excel("C:/Users/mohan/Dropbox/Mohan_files/530/PS1/hpi_po_us_and_census.xls")
cleanhpidata <- hpidata %>%
  filter(division != "USA")
summary(cleanhpidata)
##    division              year           qtr       
##  Length:1215        Min.   :1991   Min.   :1.000  
##  Class :character   1st Qu.:1999   1st Qu.:1.000  
##  Mode  :character   Median :2007   Median :2.000  
##                     Mean   :2007   Mean   :2.489  
##                     3rd Qu.:2016   3rd Qu.:3.000  
##                     Max.   :2024   Max.   :4.000  
##  index_po_not_seasonally_adjusted index_po_seasonally_adjusted
##  Min.   : 94.17                   Min.   : 94.35              
##  1st Qu.:136.69                   1st Qu.:135.78              
##  Median :190.75                   Median :190.13              
##  Mean   :203.28                   Mean   :202.43              
##  3rd Qu.:233.86                   3rd Qu.:234.03              
##  Max.   :599.69                   Max.   :591.16
table(cleanhpidata$division)
## 
## DV_ENC DV_ESC  DV_MA  DV_MT  DV_NE DV_PAC  DV_SA DV_WNC DV_WSC 
##    135    135    135    135    135    135    135    135    135
cleanhpidata <- cleanhpidata %>%
  group_by(division) %>%
  mutate(
    base_index = index_po_seasonally_adjusted[qtr == 1 & year == 2008],
    hpi = index_po_seasonally_adjusted / base_index * 100
  ) %>%
  ungroup()
range_years <- range(housingdata$year, na.rm = TRUE)
cleanhpidata <- cleanhpidata %>%
  filter(year >= range_years[1] & year <= range_years[2])
summary(cleanhpidata)
##    division              year           qtr      
##  Length:432         Min.   :2008   Min.   :1.00  
##  Class :character   1st Qu.:2011   1st Qu.:1.75  
##  Mode  :character   Median :2014   Median :2.50  
##                     Mean   :2014   Mean   :2.50  
##                     3rd Qu.:2016   3rd Qu.:3.25  
##                     Max.   :2019   Max.   :4.00  
##  index_po_not_seasonally_adjusted index_po_seasonally_adjusted   base_index   
##  Min.   :152.3                    Min.   :154.0                Min.   :179.9  
##  1st Qu.:191.2                    1st Qu.:190.3                1st Qu.:195.5  
##  Median :204.1                    Median :203.6                Median :213.9  
##  Mean   :215.7                    Mean   :214.8                Mean   :214.3  
##  3rd Qu.:233.2                    3rd Qu.:232.9                3rd Qu.:222.3  
##  Max.   :375.8                    Max.   :376.6                Max.   :271.3  
##       hpi        
##  Min.   : 71.87  
##  1st Qu.: 91.09  
##  Median : 97.26  
##  Mean   :100.50  
##  3rd Qu.:109.20  
##  Max.   :146.23

ii

cleanhousingdata <- cleanhousingdata %>%
  mutate(
    qtr = case_when(
      month %in% 1:3 ~ 1,
      month %in% 4:6 ~ 2,
      month %in% 7:9 ~ 3,
      month %in% 10:12 ~ 4
    )
  )

state_division_map <- c(
  "AL" = "DV_ESC",
  "CA" = "DV_PAC",
  "CO" = "DV_M",
  "FL" = "DV_SA",
  "GA" = "DV_SA",
  "IA" = "DV_WNC",
  "IL" = "DV_ENC",
  "KY" = "DV_ESC",
  "MI" = "DV_ENC",
  "MN" = "DV_WNC",
  "MO" = "DV_WNC",
  "NC" = "DV_SA",
  "NE" = "DV_WNC",
  "NJ" = "DV_MA",
  "NM" = "DV_M",
  "NV" = "DV_M",
  "NY" = "DV_MA",
  "OH" = "DV_ENC",
  "OK" = "DV_WSC",
  "OR" = "DV_PAC",
  "PA" = "DV_MA",
  "SC" = "DV_SA",
  "TN" = "DV_ESC",
  "TX" = "DV_WSC",
  "VA" = "DV_SA",
  "WA" = "DV_PAC"
)

cleanhousingdata <- cleanhousingdata %>%
  mutate(division = state_division_map[abbr])

# missing_divisions <- cleanhousingdata %>%
#   filter(is.na(division))
# 
# print(missing_divisions)

cleanhousinghpidata <- cleanhousingdata %>%
  left_join(
    cleanhpidata %>% select(division, year, qtr, hpi),
    by = c("division", "year", "qtr")
  )
cleanhousinghpidata <- cleanhousinghpidata %>%
  mutate(
    dlog_sale_amount = log(sale_amount * 100 / hpi)
  )

iii

states_fips <- usmap::fips_info()
cleanhousinghpidata <- cleanhousinghpidata %>%
  left_join(states_fips %>% select(abbr, fips), by = "abbr")
state_year_summary <- cleanhousinghpidata %>%
  group_by(fips, year) %>%
  summarise(mean_dlog_sale = mean(dlog_sale_amount, na.rm = TRUE), .groups = "drop") %>%
  ungroup()
start_year <- min(state_year_summary$year, na.rm = TRUE)
end_year <- max(state_year_summary$year, na.rm = TRUE)
state_map_start <- state_year_summary %>% filter(year == start_year)
state_map_end <- state_year_summary %>% filter(year == end_year)
plot_state_map <- function(data, year_label) {
  plot_usmap(data = data, values = "mean_dlog_sale", regions = "states") +
    scale_fill_continuous(low = "lightblue", high = "darkblue", name = "log(price/hpi)") +
    labs(title = paste("House Price Index (HPI) -", year_label)) +
    theme(legend.position = "right")
}
p1 <- plot_state_map(state_map_start, start_year)
p2 <- plot_state_map(state_map_end, end_year)
# State-Level Maps for Start and End Year
print(p1)

print(p2)

plot_list <- list()
for (y in unique(state_year_summary$year)) {
  data <- state_year_summary %>% filter(year == y)
  p <- plot_state_map(data, y)
  plot_list[[as.character(y)]] <- p
}
#Annual State-Level Maps
library(gridExtra)
do.call(grid.arrange, c(plot_list, ncol = 3))

############################
state_built_summary <- cleanhousinghpidata %>%
  group_by(fips, year_built_grp) %>%
  summarise(mean_dlog_sale = mean(dlog_sale_amount, na.rm = TRUE), .groups = "drop") %>%
  ungroup()

plot_state_map_built <- function(data, built_grp_label) {
  plot_usmap(data = data, values = "mean_dlog_sale", regions = "states") +
    scale_fill_continuous(low = "lightgreen", high = "darkgreen", name = "log(price/hpi)") +
    labs(title = paste("House Price Index by Year Built Group -", built_grp_label)) +
    theme(legend.position = "right")
}

plot_list_built <- list()

for (g in unique(state_built_summary$year_built_grp)) {
  data <- state_built_summary %>% filter(year_built_grp == g)
  p <- plot_state_map_built(data, g)
  plot_list_built[[as.character(g)]] <- p
}

# Maps by year_built_grp
do.call(grid.arrange, c(plot_list_built, ncol = 2))

#######################################################################

county_summary <- cleanhousinghpidata %>%
  group_by(fips, fips_county_code) %>%
  summarise(
    mean_year_built_grp = mean(year_built_grp, na.rm = TRUE),
    mean_dlog_sale = mean(dlog_sale_amount, na.rm = TRUE), .groups = "drop"
  ) %>%
  ungroup()

county_summary2 <- county_summary %>%
  select(-fips) %>%  # Remove the existing state-level fips column
  rename(fips = fips_county_code)  # Rename county FIPS correctly
plot_county_map <- function(data, value_col, title_label) {
  plot_usmap(data = data, values = value_col, regions = "counties") +
    scale_fill_continuous(low = "yellow", high = "red", name = title_label) +
    labs(title = title_label) +
    theme(legend.position = "right")
}

p3 <- plot_county_map(county_summary2, "mean_year_built_grp", "Average Year Built Group")
p4 <- plot_county_map(county_summary2, "mean_dlog_sale", "Average House Price")
# County-Level Maps
print(p3)

print(p4)

## PROBLEM 3. Adding further secondary data

crimedata <- read_excel("C:/Users/mohan/Dropbox/Mohan_files/530/PS1/table-1.xls", 
                        range = "A4:V24", col_names = TRUE)
colnames(crimedata) <- c("year", "population", "violent_crime", "violent_crime_rate", 
                         "murder", "murder_rate", "rape_revised", "rape_revised_rate", 
                         "rape_legacy", "rape_legacy_rate", "robbery", "robbery_rate",
                         "aggravated_assault", "aggravated_assault_rate", 
                         "property_crime", "property_crime_rate", "burglary", "burglary_rate", 
                         "larceny_theft", "larceny_theft_rate", "motor_vehicle_theft", 
                         "motor_vehicle_theft_rate")
crimedata <- crimedata %>%
  mutate(year = as.numeric(substr(as.character(year), 1, 4)))
crimedata <- crimedata %>%
  mutate(year = as.numeric(year)) %>%
  filter(!is.na(year)) 
summary(crimedata)
##       year        population        violent_crime     violent_crime_rate
##  Min.   :2000   Min.   :281421906   Min.   :1153022   Min.   :361.6     
##  1st Qu.:2005   1st Qu.:295794506   1st Qu.:1208999   1st Qu.:381.3     
##  Median :2010   Median :308168384   Median :1288572   Median :418.2     
##  Mean   :2010   Mean   :307116277   Mean   :1305421   Mean   :427.3     
##  3rd Qu.:2014   3rd Qu.:319404705   3rd Qu.:1401588   3rd Qu.:472.8     
##  Max.   :2019   Max.   :328239523   Max.   :1439480   Max.   :506.5     
##                                                                         
##      murder       murder_rate     rape_revised    rape_revised_rate
##  Min.   :14164   Min.   :4.400   Min.   :113695   Min.   :35.90    
##  1st Qu.:15263   1st Qu.:4.875   1st Qu.:122081   1st Qu.:38.15    
##  Median :16188   Median :5.350   Median :132414   Median :40.90    
##  Mean   :15984   Mean   :5.205   Mean   :129931   Mean   :40.20    
##  3rd Qu.:16581   3rd Qu.:5.600   3rd Qu.:137741   3rd Qu.:42.15    
##  Max.   :17413   Max.   :5.800   Max.   :143765   Max.   :44.00    
##                                  NA's   :13       NA's   :13       
##   rape_legacy     rape_legacy_rate    robbery        robbery_rate  
##  Min.   : 82109   Min.   :25.90    Min.   :267988   Min.   : 81.6  
##  1st Qu.: 88329   1st Qu.:28.23    1st Qu.:331625   1st Qu.:102.7  
##  Median : 91711   Median :30.30    Median :385280   Median :126.2  
##  Mean   : 91781   Mean   :29.94    Mean   :375602   Mean   :123.2  
##  3rd Qu.: 95126   3rd Qu.:31.80    3rd Qu.:418280   3rd Qu.:145.2  
##  Max.   :101363   Max.   :33.10    Max.   :449246   Max.   :150.0  
##                                                                    
##  aggravated_assault aggravated_assault_rate property_crime    
##  Min.   :726777     Min.   :229.2           Min.   : 6925677  
##  1st Qu.:777397     1st Qu.:246.8           1st Qu.: 8162786  
##  Median :816848     Median :258.8           Median : 9224842  
##  Mean   :822054     Mean   :268.9           Mean   : 9141687  
##  3rd Qu.:863255     3rd Qu.:291.1           3rd Qu.:10176712  
##  Max.   :911706     Max.   :324.0           Max.   :10455277  
##                                                               
##  property_crime_rate    burglary       burglary_rate   larceny_theft    
##  Min.   :2110        Min.   :1117696   Min.   :340.5   Min.   :5086096  
##  1st Qu.:2556        1st Qu.:1681756   1st Qu.:526.6   1st Qu.:5787662  
##  Median :2994        Median :2130489   Median :709.5   Median :6271348  
##  Mean   :2999        Mean   :1927672   Mean   :633.0   Mean   :6278173  
##  3rd Qu.:3452        3rd Qu.:2172629   3rd Qu.:731.0   3rd Qu.:6821858  
##  Max.   :3658        Max.   :2228887   Max.   :747.0   Max.   :7092267  
##                                                                         
##  larceny_theft_rate motor_vehicle_theft motor_vehicle_theft_rate
##  Min.   :1550       Min.   : 686803     Min.   :215.4           
##  1st Qu.:1812       1st Qu.: 722861     1st Qu.:230.2           
##  Median :2035       Median : 784298     Median :249.2           
##  Mean   :2058       Mean   : 935842     Mean   :308.5           
##  3rd Qu.:2306       3rd Qu.:1205782     3rd Qu.:413.4           
##  Max.   :2486       Max.   :1261226     Max.   :433.7           
## 
year_range <- range(cleanhousinghpidata$year, na.rm = TRUE)
cleancrimedata <- crimedata %>%
  filter(year >= year_range[1] & year <= year_range[2]) %>%
  select(year, violent_crime_rate, property_crime_rate, murder_rate) 
head(cleancrimedata)
## # A tibble: 6 × 4
##    year violent_crime_rate property_crime_rate murder_rate
##   <dbl>              <dbl>               <dbl>       <dbl>
## 1  2008               459.               3215.         5.4
## 2  2009               432.               3041.         5  
## 3  2010               404.               2946.         4.8
## 4  2011               387.               2905.         4.7
## 5  2012               388.               2868          4.7
## 6  2013               369.               2734.         4.5
cleanhousinghpicrimedata <- cleanhousinghpidata %>%
  left_join(cleancrimedata, by = "year")
head(cleanhousinghpicrimedata)
##    sale_amount  sale_date         x        y fips_county_code year_built
##          <num>     <Date>     <num>    <num>            <int>      <int>
## 1:      104000 2015-01-29 -86.19826 32.41438             1101       2009
## 2:      180501 2009-09-04 -86.20467 32.32170             1101       1986
## 3:      140000 2013-02-28 -86.13278 32.34974             1101       2006
## 4:      245916 2008-08-25 -86.17190 32.40677             1101       2000
## 5:       60000 2014-02-25 -86.27811 32.39384             1101       1940
## 6:      108150 2018-01-11 -86.12927 32.34984             1101       2006
##    bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
##                     <int>               <num>                <int>
## 1:                      3                 2.0                    1
## 2:                      3                 2.0                    1
## 3:                      3                 2.0                    1
## 4:                      3                 2.0                    1
## 5:                      2                 1.5                    1
## 6:                      3                 2.0                    1
##    stories_number land_square_footage   abbr AC_presence  year month   day
##             <num>               <int> <char>       <num> <num> <int> <int>
## 1:              1               12855     AL           0  2015     1    29
## 2:              1               16583     AL           0  2009     9     4
## 3:              1                6216     AL           0  2013     2    28
## 4:              1                8372     AL           0  2008     8    25
## 5:              1               21928     AL           0  2014     2    25
## 6:              1                4713     AL           0  2018     1    11
##    year_built_grp log_sale_amount   qtr division       hpi dlog_sale_amount
##             <num>           <num> <num>   <char>     <num>            <num>
## 1:              4        11.55215     1   DV_ESC  99.83123         11.55384
## 2:              3        12.10349     3   DV_ESC  95.13630         12.15335
## 3:              4        11.84940     1   DV_ESC  93.68383         11.91464
## 4:              3        12.41275     3   DV_ESC  97.95939         12.43336
## 5:              1        11.00210     1   DV_ESC  96.14381         11.04142
## 6:              4        11.59127     1   DV_ESC 115.72137         11.44526
##      fips violent_crime_rate property_crime_rate murder_rate
##    <char>              <num>               <num>       <num>
## 1:     01              373.7              2500.5         4.9
## 2:     01              431.9              3041.3         5.0
## 3:     01              369.1              2733.6         4.5
## 4:     01              458.6              3214.6         5.4
## 5:     01              361.6              2574.1         4.4
## 6:     01              370.4              2209.8         5.0
#Plot Crime Rates Against Year
ggplot(cleanhousinghpicrimedata, aes(x = year)) +
  geom_line(aes(y = violent_crime_rate, color = "Violent Crime Rate"), linewidth  = 1) +
  geom_line(aes(y = property_crime_rate, color = "Property Crime Rate"), linewidth = 1) +
  geom_line(aes(y = murder_rate, color = "Murder Rate"), linewidth = 1) +
  labs(title = "Crime Rates Over Time",
       x = "Year",
       y = "Crime Rate (per 100,000 inhabitants)") +
  theme_minimal() +
  scale_color_manual(name = "Crime Type", 
                     values = c("Violent Crime Rate" = "red", 
                                "Property Crime Rate" = "blue",
                                "Murder Rate" = "black"))

#Plot dlog_sale_amount Against Year by year_built_grp

cleanhousinghpicrimedata <- cleanhousinghpicrimedata %>%
  filter(!is.na(hpi) & hpi > 0)
ggplot(cleanhousinghpicrimedata, aes(x = year, y = dlog_sale_amount, color = as.factor(year_built_grp))) +
  geom_line(stat = "summary", fun = mean, size = 1) +
  labs(title = "Deflated Log Sale Amount Over Time by Year Built Group",
       x = "Year",
       y = "Mean dlog_sale_amount",
       color = "Year Built Group") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## PROBLEM 4. Summarizing the data

i

summary_full <- cleanhousinghpicrimedata %>%
  select(sale_amount, dlog_sale_amount, violent_crime_rate, property_crime_rate, murder_rate, 
         year_built, year_built_grp)
stargazer(summary_full, type = "text", title = "Summary Statistics - Full Sample")
## 
## Summary Statistics - Full Sample
## ===========================================================================
## Statistic               N        Mean      St. Dev.      Min        Max    
## ---------------------------------------------------------------------------
## sale_amount         1,929,564 307,710.000 191,439.300  100.000  915,000.000
## dlog_sale_amount    1,929,564   12.402       0.712      4.390     14.044   
## violent_crime_rate  1,929,564   385.347     24.393     361.600    458.600  
## property_crime_rate 1,929,564  2,572.400    326.658   2,109.900  3,214.600 
## murder_rate         1,929,564    4.948       0.309      4.400      5.400   
## year_built          1,929,564  1,981.345    24.137      1,700      2,019   
## year_built_grp      1,929,564    2.906       0.853        1          4     
## ---------------------------------------------------------------------------

ii

summary_by_year_built_grp <- cleanhousinghpicrimedata %>%
  group_by(year_built_grp) %>%
  summarise(
    sale_amount_mean = mean(sale_amount, na.rm = TRUE),
    dlog_sale_amount_mean = mean(dlog_sale_amount, na.rm = TRUE),
    violent_crime_rate_mean = mean(violent_crime_rate, na.rm = TRUE),
    property_crime_rate_mean = mean(property_crime_rate, na.rm = TRUE),
    murder_rate_mean = mean(murder_rate, na.rm = TRUE),
    year_built_mean = mean(year_built, na.rm = TRUE)
  )
kable(summary_by_year_built_grp, caption = "Summary Statistics by Year Built Group") %>%
  kable_styling(full_width = FALSE)
Summary Statistics by Year Built Group
year_built_grp sale_amount_mean dlog_sale_amount_mean violent_crime_rate_mean property_crime_rate_mean murder_rate_mean year_built_mean
1 328950.0 12.35417 385.8611 2587.192 4.938901 1923.953
2 353120.5 12.51055 386.4877 2598.179 4.937084 1957.862
3 289253.1 12.35696 384.2947 2557.417 4.948682 1986.273
4 296101.8 12.39938 386.1105 2573.155 4.957604 2007.321

iii

summary_by_division <- cleanhousinghpicrimedata %>%
  group_by(division) %>%
  summarise(
    sale_amount_mean = mean(sale_amount, na.rm = TRUE),
    dlog_sale_amount_mean = mean(dlog_sale_amount, na.rm = TRUE),
    violent_crime_rate_mean = mean(violent_crime_rate, na.rm = TRUE),
    property_crime_rate_mean = mean(property_crime_rate, na.rm = TRUE),
    murder_rate_mean = mean(murder_rate, na.rm = TRUE),
    year_built_mean = mean(year_built, na.rm = TRUE)
  )
kable(summary_by_division, caption = "Summary Statistics by Census Division") %>%
  kable_styling(full_width = FALSE)
Summary Statistics by Census Division
division sale_amount_mean dlog_sale_amount_mean violent_crime_rate_mean property_crime_rate_mean murder_rate_mean year_built_mean
DV_ENC 235055.3 12.19496 386.9429 2564.518 4.982437 1979.142
DV_ESC 231787.3 12.09080 380.0758 2440.104 4.982176 1993.127
DV_MA 194339.4 11.93277 384.2614 2547.189 4.961842 1972.058
DV_PAC 390170.3 12.72023 387.3039 2610.851 4.937189 1972.986
DV_SA 252154.0 12.22713 382.8063 2525.114 4.960761 1986.190
DV_WNC 252019.6 12.17645 384.4482 2556.924 4.952392 1979.100
DV_WSC 242800.0 12.08773 385.7376 2577.424 4.945299 1993.003

#iv

summary_by_region <- cleanhousinghpicrimedata %>%
  group_by(division) %>%
  summarise(
    sale_amount_mean = mean(sale_amount, na.rm = TRUE),
    dlog_sale_amount_mean = mean(dlog_sale_amount, na.rm = TRUE),
    violent_crime_rate_mean = mean(violent_crime_rate, na.rm = TRUE),
    property_crime_rate_mean = mean(property_crime_rate, na.rm = TRUE),
    murder_rate_mean = mean(murder_rate, na.rm = TRUE),
    year_built_mean = mean(year_built, na.rm = TRUE)
  )
kable(summary_by_region, caption = "Summary Statistics by census-designated Region") %>%
  kable_styling(full_width = FALSE)
Summary Statistics by census-designated Region
division sale_amount_mean dlog_sale_amount_mean violent_crime_rate_mean property_crime_rate_mean murder_rate_mean year_built_mean
DV_ENC 235055.3 12.19496 386.9429 2564.518 4.982437 1979.142
DV_ESC 231787.3 12.09080 380.0758 2440.104 4.982176 1993.127
DV_MA 194339.4 11.93277 384.2614 2547.189 4.961842 1972.058
DV_PAC 390170.3 12.72023 387.3039 2610.851 4.937189 1972.986
DV_SA 252154.0 12.22713 382.8063 2525.114 4.960761 1986.190
DV_WNC 252019.6 12.17645 384.4482 2556.924 4.952392 1979.100
DV_WSC 242800.0 12.08773 385.7376 2577.424 4.945299 1993.003

PROBLEM 5. Running regressions

i

model_formula <- dlog_sale_amount ~ violent_crime_rate + property_crime_rate + 
                 murder_rate + year_built + factor(year_built_grp) + factor(region)

ii

library(texreg)
## Warning: 程序包'texreg'是用R版本4.4.1 来建造的
## Version:  1.39.4
## Date:     2024-07-23
## Author:   Philip Leifeld (University of Manchester)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
## 
## 载入程序包:'texreg'
## The following object is masked from 'package:tidyr':
## 
##     extract
model1 <- lm(dlog_sale_amount ~ violent_crime_rate + property_crime_rate, data = cleanhousinghpicrimedata)
model2 <- lm(dlog_sale_amount ~ violent_crime_rate + property_crime_rate + murder_rate, data = cleanhousinghpicrimedata)
model3 <- lm(dlog_sale_amount ~ violent_crime_rate + property_crime_rate + murder_rate + year_built + factor(year_built_grp), data = cleanhousinghpicrimedata)
screenreg(list(model1, model2, model3), custom.model.names = c("Model 1", "Model 2", "Model 3"))
## 
## =======================================================================
##                          Model 1         Model 2         Model 3       
## -----------------------------------------------------------------------
## (Intercept)                   12.33 ***       12.22 ***        7.86 ***
##                               (0.01)          (0.01)          (0.12)   
## violent_crime_rate            -0.00 ***       -0.00 ***       -0.00 ***
##                               (0.00)          (0.00)          (0.00)   
## property_crime_rate            0.00 ***        0.00 ***        0.00 ***
##                               (0.00)          (0.00)          (0.00)   
## murder_rate                                    0.08 ***        0.08 ***
##                                               (0.00)          (0.00)   
## year_built                                                     0.00 ***
##                                                               (0.00)   
## factor(year_built_grp)2                                        0.08 ***
##                                                               (0.00)   
## factor(year_built_grp)3                                       -0.13 ***
##                                                               (0.00)   
## factor(year_built_grp)4                                       -0.14 ***
##                                                               (0.01)   
## -----------------------------------------------------------------------
## R^2                            0.01            0.01            0.02    
## Adj. R^2                       0.01            0.01            0.02    
## Num. obs.                1929564         1929564         1929564       
## =======================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

iii

model_by_region <- cleanhousinghpicrimedata %>%
  group_by(division) %>%
  do(model = lm(dlog_sale_amount ~ violent_crime_rate + property_crime_rate + year_built, data = .))
region_results <- model_by_region %>%
  summarise(division = unique(division),
            violent_crime_coef = coef(model)[["violent_crime_rate"]],
            property_crime_coef = coef(model)[["property_crime_rate"]])
print(region_results)
## # A tibble: 7 × 3
##   division violent_crime_coef property_crime_coef
##   <chr>                 <dbl>               <dbl>
## 1 DV_ENC            -0.000201           0.000274 
## 2 DV_ESC            -0.000414           0.000142 
## 3 DV_MA              0.000558           0.0000113
## 4 DV_PAC            -0.00228            0.000297 
## 5 DV_SA             -0.000595           0.0000375
## 6 DV_WNC            -0.00154            0.000140 
## 7 DV_WSC            -0.00112            0.000125

iv

cleanhousinghpicrimedata <- cleanhousinghpicrimedata %>%
  left_join(region_results, by = "division")

ggplot(cleanhousinghpicrimedata, aes(x = division, y = violent_crime_coef, fill = division)) +
  geom_bar(stat = "identity") +
  labs(title = "Estimated Effect of Violent Crime Rate on House Prices by Division",
       x = "Census Division",
       y = "Effect Size") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))