Setting Working Directory

setwd("C:/Users/Lenovo/OneDrive - The Pennsylvania State University/Penn State/Coursework/Sem IV/EEFE 530/Problem Sets/Problem Set 1")

#Libraries

# Load packages
library(pacman)
p_load(tidyverse, lubridate, usmap, gridExtra, stringr, readxl, plot3D,  
       cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,  
       foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont, 
       kableExtra, snakecase, janitor)
library(dplyr)

#Load Data

housingdata <- readRDS("testdata20250121.RDS")

#PROBLEM 1

##Part i - Date Extraction

#Checking variable type
str(housingdata$sale_date)  # character
##  chr [1:2350548] "20150129" "20090904" "20130228" "20080825" "20140225" ...
head(housingdata$sale_date)  # View first few values
## [1] "20150129" "20090904" "20130228" "20080825" "20140225" "20180111"
#create a copy
housingdata_wip <- housingdata

#Change the string to YYY-MM-DD format
housingdata_wip <- housingdata_wip %>%
  mutate(sale_date = ymd(sale_date)) 

#Separate columns
housingdata_wip <- housingdata_wip %>%
  mutate(
    year = year(sale_date),
    month = month(sale_date),
    day = day(sale_date)
  )

#Displaying first 10 rows with extracted dates
housingdata_wip %>%
  select(sale_date, year, month, day) %>%
  head(10)
##      sale_date  year month   day
##         <Date> <int> <int> <int>
##  1: 2015-01-29  2015     1    29
##  2: 2009-09-04  2009     9     4
##  3: 2013-02-28  2013     2    28
##  4: 2008-08-25  2008     8    25
##  5: 2014-02-25  2014     2    25
##  6: 2018-01-11  2018     1    11
##  7: 2015-04-08  2015     4     8
##  8: 2015-06-23  2015     6    23
##  9: 2009-05-28  2009     5    28
## 10: 2012-01-31  2012     1    31

Part ii. Summarizing Variables

#Summarizing Year
summary(housingdata_wip$year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2008    2012    2015    2015    2017    2019
hist(housingdata_wip$year)

#Summarizing Year Built
summary(housingdata_wip$year_built)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1700    1965    1986    1981    2001    2019
hist(housingdata_wip$year_built)

#Generating Categorical Variable for Year Built
housingdata_wip <- housingdata_wip %>%
  mutate(year_built_grp = case_when(
    year_built < 1965 ~ 1,  # Very old houses = based on first quantile
    year_built >= 1965 & year_built < 1986 ~ 2,  # Quite old houses = based on median
    year_built >= 1986 & year_built < 2001 ~ 3,  # Quite new houses = based on third quantile
    year_built >= 2001 ~ 4  # Very new houses
  ))

#Defining categorical labels for `year_built_grp`
housingdata_wip <- housingdata_wip %>%
  mutate(
    log_sale_amount = log(sale_amount),
    log_land_square_footage = log(land_square_footage),
    year_built_grp_label = case_when(
      year_built_grp == 1 ~ "1: Very Old (<1965)",
      year_built_grp == 2 ~ "2: Quite Old (1965-1986)",
      year_built_grp == 3 ~ "3: Quite New (1986-2001)",
      year_built_grp == 4 ~ "4: Very New (≥2001)"
    )
  )

#Displaying first few rows
head(housingdata_wip %>% select(year_built, year_built_grp), 10)
##     year_built year_built_grp
##          <int>          <num>
##  1:       2009              4
##  2:       1986              3
##  3:       2006              4
##  4:       2000              3
##  5:       1940              1
##  6:       2006              4
##  7:       1999              3
##  8:       1998              3
##  9:       2007              4
## 10:       1994              3

Part iii - 3D Plot

#Creating logged variables for continuous values
housingdata_wip <- housingdata_wip %>%
  mutate(
    log_sale_amount = log(sale_amount),
    log_land_square_footage = log(land_square_footage)
  )

#Check NAs
sum(is.na(housingdata_wip$log_land_square_footage)) #zero
## [1] 0
sum(is.na(housingdata_wip$log_sale_amount)) #zero
## [1] 0
#3D scatter Plot

library(plot3D)

#Defining colours for categories
colors <- c("darkblue", "skyblue", "yellow", "brown")  
color_mapping <- colors[as.numeric(housingdata_wip$year_built_grp)]

#Plot
plot_3d <- scatter3D(
  x = housingdata_wip$log_sale_amount,  
  y = housingdata_wip$year_built_grp,   
  z = housingdata_wip$log_land_square_footage,  
  colvar = NULL,  
  col = color_mapping,
  pch = 19, 
  cex = 1.2,
  theta = 40, phi = 20,  
  main = "3D Plot of Log Sale Amount, Year Built Group, and Log Land Square Footage",
  xlab = "Log Sale Amount",
  ylab = "Year Built Group",
  zlab = "Log Land Square Footage",
  ticktype = "detailed"
)

plot_3d
##               [,1]        [,2]        [,3]        [,4]
## [1,]  8.947906e-02 -0.02567950  0.07055385 -0.07055385
## [2,]  4.285251e-01  0.17466842 -0.47989754  0.47989754
## [3,] -3.798445e-18  0.09068963  0.03300832 -0.03300832
## [4,] -2.249423e+00 -1.03825888 -2.80326238  3.80326238
#Legend
par(new = TRUE)
legend("topright", 
       legend = c("1: Very Old (<1965)", "2: Quite Old (1965-1986)", 
                  "3: Quite New (1986-2001)", "4: Very New (≥2001)"),
       col = colors, pch = 19, cex = 1.2, bty = "n")

Part iv. Cleaning the data

#Summarising numerical variables
summary_stats <- housingdata_wip %>% summary(across(where(is.numeric)))
print(summary_stats)
##   sale_amount          sale_date                x                 y        
##  Min.   :1.000e+02   Min.   :2008-01-01   Min.   :-124.57   Min.   :26.32  
##  1st Qu.:1.800e+05   1st Qu.:2012-06-13   1st Qu.:-118.32   1st Qu.:33.55  
##  Median :2.870e+05   Median :2015-07-28   Median :-104.77   Median :34.37  
##  Mean   :4.417e+05   Mean   :2015-01-19   Mean   :-102.81   Mean   :35.88  
##  3rd Qu.:4.740e+05   3rd Qu.:2017-12-13   3rd Qu.: -84.53   3rd Qu.:38.56  
##  Max.   :2.730e+09   Max.   :2019-12-31   Max.   : -73.91   Max.   :48.97  
##  fips_county_code   year_built   bedrooms_all_buildings number_of_bathrooms
##  Min.   : 1101    Min.   :1700   Min.   :  1.000        Min.   :  0.010    
##  1st Qu.: 6081    1st Qu.:1965   1st Qu.:  3.000        1st Qu.:  2.000    
##  Median :12091    Median :1986   Median :  3.000        Median :  2.000    
##  Mean   :20436    Mean   :1981   Mean   :  3.431        Mean   :  2.472    
##  3rd Qu.:40087    3rd Qu.:2001   3rd Qu.:  4.000        3rd Qu.:  3.000    
##  Max.   :53071    Max.   :2019   Max.   :999.000        Max.   :450.000    
##  number_of_fireplaces stories_number    land_square_footage     abbr          
##  Min.   :  1.00       Min.   :  0.500   Min.   :        1   Length:2350548    
##  1st Qu.:  1.00       1st Qu.:  1.000   1st Qu.:     6874   Class :character  
##  Median :  1.00       Median :  1.000   Median :     9583   Mode  :character  
##  Mean   :  1.19       Mean   :  1.371   Mean   :    39671                     
##  3rd Qu.:  1.00       3rd Qu.:  2.000   3rd Qu.:    17031                     
##  Max.   :999.00       Max.   :175.000   Max.   :999999999                     
##   AC_presence          year          month             day       
##  Min.   :0.0000   Min.   :2008   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.:0.0000   1st Qu.:2012   1st Qu.: 4.000   1st Qu.: 9.00  
##  Median :1.0000   Median :2015   Median : 7.000   Median :17.00  
##  Mean   :0.6091   Mean   :2015   Mean   : 6.662   Mean   :16.76  
##  3rd Qu.:1.0000   3rd Qu.:2017   3rd Qu.: 9.000   3rd Qu.:25.00  
##  Max.   :1.0000   Max.   :2019   Max.   :12.000   Max.   :31.00  
##  year_built_grp  log_sale_amount  log_land_square_footage year_built_grp_label
##  Min.   :1.000   Min.   : 4.605   Min.   : 0.000          Length:2350548      
##  1st Qu.:2.000   1st Qu.:12.101   1st Qu.: 8.836          Class :character    
##  Median :3.000   Median :12.567   Median : 9.168          Mode  :character    
##  Mean   :2.525   Mean   :12.595   Mean   : 9.386                              
##  3rd Qu.:4.000   3rd Qu.:13.069   3rd Qu.: 9.743                              
##  Max.   :4.000   Max.   :21.727   Max.   :20.723

The minimum bathroom is 0.010 and stories 0.50 which seems invalid, therefore, I will remove those.

#Filtering values
housingdata_wip_clean <- housingdata_wip %>%
  filter(number_of_bathrooms >= 1, stories_number >= 1)

Some of the outliers that are odd are: number of bedrooms and fireplaces being 999, number of bathrooms being 450, number of stories being 175. I will check the values of these variables above their 90th, 95th, and 99.99th percentile and if they are far from mean and median, I will remove them on account of being errors

###90th Percentile

colnames(housingdata_wip)
##  [1] "sale_amount"             "sale_date"              
##  [3] "x"                       "y"                      
##  [5] "fips_county_code"        "year_built"             
##  [7] "bedrooms_all_buildings"  "number_of_bathrooms"    
##  [9] "number_of_fireplaces"    "stories_number"         
## [11] "land_square_footage"     "abbr"                   
## [13] "AC_presence"             "year"                   
## [15] "month"                   "day"                    
## [17] "year_built_grp"          "log_sale_amount"        
## [19] "log_land_square_footage" "year_built_grp_label"
#90th Percentile
##List of variables for which we want the 90th percentile
vars_to_check <- c("bedrooms_all_buildings", "number_of_bathrooms", 
                   "number_of_fireplaces", "stories_number")

##Compute the 90th percentile for each variable
percentile_90 <- housingdata_wip %>%
  summarise(across(all_of(vars_to_check), 
                   ~ quantile(.x, 0.90, na.rm = TRUE)))

##Print the results
print(percentile_90)
##   bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## 1                      4                 3.5                    2
##   stories_number
## 1              2

###95th Percentile

#95th Percentile
##Computing the 95th percentile for each variable
percentile_95 <- housingdata_wip %>%
  summarise(across(all_of(vars_to_check), 
                   ~ quantile(.x, 0.95, na.rm = TRUE)))

# Print the results
print(percentile_95)
##   bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## 1                      5                   4                    2
##   stories_number
## 1              2

###99.99th Percentile

#95th Percentile
##Computing the 99.99th percentile for each variable
percentile_9999 <- housingdata_wip %>%
  summarise(across(all_of(vars_to_check), 
                   ~ quantile(.x, 0.9999, na.rm = TRUE)))

# Print the results
print(percentile_9999)
##   bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## 1                     14                  11                   99
##   stories_number
## 1              7

Since all the values seem reasonable up until 99.9th Percentile, I will remove entries above them.

#Computing the 99.99th percentile for each variable and storing it in new columns
housingdata_wip_clean <- housingdata_wip_clean %>%
  mutate(
    bedrooms_99 = quantile(bedrooms_all_buildings, 0.9999, na.rm = TRUE),
    bathrooms_99 = quantile(number_of_bathrooms, 0.9999, na.rm = TRUE),
    fireplaces_99 = quantile(number_of_fireplaces, 0.9999, na.rm = TRUE),
    stories_99 = quantile(stories_number, 0.9999, na.rm = TRUE)
  )

#Remove rows where values exceed 99.99th percentile
housingdata_wip_clean <- housingdata_wip_clean %>%
  filter(
    bedrooms_all_buildings < bedrooms_99,
    number_of_bathrooms < bathrooms_99,
    number_of_fireplaces < fireplaces_99,
    stories_number < stories_99
  )

#Summary of clean data
summary(housingdata_wip_clean)
##   sale_amount          sale_date                x                 y        
##  Min.   :1.000e+02   Min.   :2008-01-01   Min.   :-124.57   Min.   :26.32  
##  1st Qu.:1.800e+05   1st Qu.:2012-06-13   1st Qu.:-118.32   1st Qu.:33.54  
##  Median :2.875e+05   Median :2015-07-28   Median :-104.77   Median :34.37  
##  Mean   :4.411e+05   Mean   :2015-01-18   Mean   :-102.81   Mean   :35.88  
##  3rd Qu.:4.742e+05   3rd Qu.:2017-12-13   3rd Qu.: -84.53   3rd Qu.:38.55  
##  Max.   :2.730e+09   Max.   :2019-12-31   Max.   : -73.91   Max.   :48.97  
##  fips_county_code   year_built   bedrooms_all_buildings number_of_bathrooms
##  Min.   : 1101    Min.   :1700   Min.   : 1.000         Min.   : 1.000     
##  1st Qu.: 6081    1st Qu.:1965   1st Qu.: 3.000         1st Qu.: 2.000     
##  Median :12091    Median :1986   Median : 3.000         Median : 2.000     
##  Mean   :20437    Mean   :1981   Mean   : 3.427         Mean   : 2.471     
##  3rd Qu.:40087    3rd Qu.:2001   3rd Qu.: 4.000         3rd Qu.: 3.000     
##  Max.   :53071    Max.   :2019   Max.   :13.000         Max.   :10.750     
##  number_of_fireplaces stories_number  land_square_footage     abbr          
##  Min.   : 1.000       Min.   :1.000   Min.   :        1   Length:2348446    
##  1st Qu.: 1.000       1st Qu.:1.000   1st Qu.:     6874   Class :character  
##  Median : 1.000       Median :1.000   Median :     9583   Mode  :character  
##  Mean   : 1.119       Mean   :1.371   Mean   :    39600                     
##  3rd Qu.: 1.000       3rd Qu.:2.000   3rd Qu.:    17000                     
##  Max.   :93.000       Max.   :6.500   Max.   :999999999                     
##   AC_presence          year          month             day       
##  Min.   :0.0000   Min.   :2008   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.:0.0000   1st Qu.:2012   1st Qu.: 4.000   1st Qu.: 9.00  
##  Median :1.0000   Median :2015   Median : 7.000   Median :17.00  
##  Mean   :0.6092   Mean   :2015   Mean   : 6.662   Mean   :16.76  
##  3rd Qu.:1.0000   3rd Qu.:2017   3rd Qu.: 9.000   3rd Qu.:25.00  
##  Max.   :1.0000   Max.   :2019   Max.   :12.000   Max.   :31.00  
##  year_built_grp  log_sale_amount  log_land_square_footage year_built_grp_label
##  Min.   :1.000   Min.   : 4.605   Min.   : 0.000          Length:2348446      
##  1st Qu.:2.000   1st Qu.:12.101   1st Qu.: 8.836          Class :character    
##  Median :3.000   Median :12.569   Median : 9.168          Mode  :character    
##  Mean   :2.526   Mean   :12.595   Mean   : 9.385                              
##  3rd Qu.:4.000   3rd Qu.:13.069   3rd Qu.: 9.741                              
##  Max.   :4.000   Max.   :21.727   Max.   :20.723                              
##   bedrooms_99  bathrooms_99 fireplaces_99   stories_99
##  Min.   :14   Min.   :11    Min.   :99    Min.   :7   
##  1st Qu.:14   1st Qu.:11    1st Qu.:99    1st Qu.:7   
##  Median :14   Median :11    Median :99    Median :7   
##  Mean   :14   Mean   :11    Mean   :99    Mean   :7   
##  3rd Qu.:14   3rd Qu.:11    3rd Qu.:99    3rd Qu.:7   
##  Max.   :14   Max.   :11    Max.   :99    Max.   :7
#Note: cleanhousingdata is called housingdata_wip_clean here

#PROBLEM 2

Part i

###Secondary Data

#Calling data
secondary_data <- read.csv("hpi_po_us_and_census.csv")

###Create quarter in Primary Dataset

#Generating quarters based on month
housingdata_wip_clean <- housingdata_wip_clean %>%
  mutate(qtr = case_when(
    month %in% c(1, 2, 3) ~ 1,
    month %in% c(4, 5, 6) ~ 2,
    month %in% c(7, 8, 9) ~ 3,
    month %in% c(10, 11, 12) ~ 4
  ))

###Create division abbreviations

#State-to-division mapping using the correct abbreviations
state_division_map <- tibble(
  abbr = c("CT", "ME", "MA", "NH", "RI", "VT", "NY", "NJ", "PA",  # Mid Atlantic
           "IL", "IN", "MI", "OH", "WI",  # East North Central
           "IA", "KS", "MN", "MO", "NE", "ND", "SD",  # West North Central
           "DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV",  # South Atlantic
           "AL", "KY", "MS", "TN",  # East South Central
           "AR", "LA", "OK", "TX",  # West South Central
           "AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY",  # Mountain
           "AK", "CA", "HI", "OR", "WA"),  # Pacific
  division = c(rep("DV_MA", 9), rep("DV_ENC", 5), rep("DV_WNC", 7),
               rep("DV_SA", 9), rep("DV_ESC", 4), rep("DV_WSC", 4),
               rep("DV_MT", 8), rep("DV_PAC", 5))
)

#Merging division info into `housingdata_wip_clean`
housingdata_wip_clean <- housingdata_wip_clean %>%
  left_join(state_division_map, by = "abbr")

###Rescaling

# Step 1: Create `cleanhpidata` and Remove "USA" division
cleanhpidata <- secondary_data %>%
  filter(division != "USA")

# Step 2: Identify the First Available Year in House Sales Data
start_year <- min(housingdata_wip$year, na.rm = TRUE)

# Step 3: Correctly Extract Base HPI for Each Division in `qtr = 1` of `start_year`
base_hpi_values <- cleanhpidata %>%
  group_by(division) %>%
  filter(year == start_year, qtr == 1) %>%
  summarise(base_hpi = first(index_po_seasonally_adjusted), .groups = "drop")

# Step 4: Merge Base HPI Values with `cleanhpidata`
cleanhpidata <- cleanhpidata %>%
  left_join(base_hpi_values, by = "division") %>%
  mutate(hpi = (index_po_seasonally_adjusted / base_hpi) * 100)

# Step 5: Ensure `hpi = 100` for `qtr = 1` of `start_year`
cleanhpidata <- cleanhpidata %>%
  mutate(hpi = if_else(year == start_year & qtr == 1, 100, hpi))

# Step 6: Restrict `cleanhpidata` to the Range of Years in `housingdata_wip`
year_range <- range(housingdata_wip$year, na.rm = TRUE)
cleanhpidata <- cleanhpidata %>%
  filter(year >= year_range[1], year <= year_range[2])

# Step 7: Check Results
summary(cleanhpidata$hpi)  
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   71.87   91.09   97.26  100.50  109.20  146.23
head(cleanhpidata %>% select(division, year, qtr, index_po_seasonally_adjusted, base_hpi, hpi))
##   division year qtr index_po_seasonally_adjusted base_hpi       hpi
## 1   DV_ENC 2008   1                       179.93   179.93 100.00000
## 2   DV_ENC 2008   2                       175.95   179.93  97.78803
## 3   DV_ENC 2008   3                       172.76   179.93  96.01512
## 4   DV_ENC 2008   4                       168.17   179.93  93.46412
## 5   DV_ENC 2009   1                       169.15   179.93  94.00878
## 6   DV_ENC 2009   2                       166.74   179.93  92.66937

Part ii

#Merge the cleaned housing data and cleaned index data
housingdata_cleanhpidata <- housingdata_wip_clean %>%
  left_join(cleanhpidata, by = c("division", "year", "qtr"))

#Summary
summary(housingdata_cleanhpidata$hpi)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   71.87   89.60  102.45  104.75  120.16  146.23
# Computing the deflated house prices
housingdata_cleanhpidata <- housingdata_cleanhpidata %>%
  mutate(dlog_sale_amount = log((sale_amount * 100) / hpi))

#Summary
summary(housingdata_cleanhpidata$dlog_sale_amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.39   12.07   12.53   12.57   13.03   21.64

Part iii. US Maps

# Load necessary libraries
library(dplyr)
library(ggplot2)
library(usmap)

# Step 1: Creating data for mapping
state_year_avg <- housingdata_cleanhpidata %>%
  group_by(abbr, year) %>%
  summarise(mean_dlog_sale = mean(dlog_sale_amount, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(!is.na(year))
## `summarise()` has grouped output by 'abbr'. You can override using the
## `.groups` argument.
# Step 2: Adjusting abbreviations for state names
state_year_avg <- state_year_avg %>%
  mutate(state = state.name[match(abbr, state.abb)])

# Step 3: Plotting the Data
plot_usmap(data = state_year_avg, values = "mean_dlog_sale", include = state_year_avg$state) +
  facet_wrap(~ year) +  
  scale_fill_continuous(
    name = "Mean dlog_sale_amount",
    low = "blue",
    high = "red",
    na.value = "white"
  ) +
  labs(
    title = "Mean dlog_sale_amount by State and Year",
    subtitle = "Each facet represents a year",
    caption = "Data Source: housingdata_cleanhpidata"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

PROBLEM 3

Part i

library(stringr)

#Loading crime data
crimedata <- read_excel("table-1.xls", skip = 3, col_names = TRUE)
## New names:
## • `` -> `...23`
## • `` -> `...24`
#Checking column names
colnames(crimedata)
##  [1] "Year"                                           
##  [2] "Population1"                                    
##  [3] "Violent\ncrime2"                                
##  [4] "Violent \ncrime \nrate"                         
##  [5] "Murder and\nnonnegligent \nmanslaughter"        
##  [6] "Murder and \nnonnegligent \nmanslaughter \nrate"
##  [7] "Rape\n(revised \ndefinition)3"                  
##  [8] "Rape\n(revised \ndefinition) \nrate3"           
##  [9] "Rape\n(legacy \ndefinition)4"                   
## [10] "Rape\n(legacy \ndefinition) \nrate4"            
## [11] "Robbery"                                        
## [12] "Robbery \nrate"                                 
## [13] "Aggravated \nassault"                           
## [14] "Aggravated \nassault rate"                      
## [15] "Property \ncrime"                               
## [16] "Property \ncrime \nrate"                        
## [17] "Burglary"                                       
## [18] "Burglary \nrate"                                
## [19] "Larceny-\ntheft"                                
## [20] "Larceny-\ntheft rate"                           
## [21] "Motor \nvehicle \ntheft"                        
## [22] "Motor \nvehicle \ntheft \nrate"                 
## [23] "...23"                                          
## [24] "...24"
#drop last two vars
crimedata <- crimedata %>% select(-c(...23, ...24))

###Rename variables

crimedata <- crimedata %>%
  rename(
    year = "Year",
    population = "Population1",
    violent_crime = "Violent\ncrime2",
    violent_crime_rate = "Violent \ncrime \nrate",
    murder = "Murder and\nnonnegligent \nmanslaughter",
    murder_rate = "Murder and \nnonnegligent \nmanslaughter \nrate",
    rape_revised = "Rape\n(revised \ndefinition)3",
    rape_revised_rate = "Rape\n(revised \ndefinition) \nrate3",
    rape_legacy = "Rape\n(legacy \ndefinition)4",
    rape_legacy_rate = "Rape\n(legacy \ndefinition) \nrate4",
    robbery = "Robbery",
    robbery_rate = "Robbery \nrate",
    aggravated_assault = "Aggravated \nassault",
    aggravated_assault_rate = "Aggravated \nassault rate",
    property_crime = "Property \ncrime",
    property_crime_rate = "Property \ncrime \nrate",
    burglary = "Burglary",
    burglary_rate = "Burglary \nrate",
    larceny_theft = "Larceny-\ntheft",
    larceny_theft_rate = "Larceny-\ntheft rate",
    motor_vehicle_theft = "Motor \nvehicle \ntheft",
    motor_vehicle_theft_rate = "Motor \nvehicle \ntheft \nrate"
  )

# Check the renamed dataset
colnames(crimedata)
##  [1] "year"                     "population"              
##  [3] "violent_crime"            "violent_crime_rate"      
##  [5] "murder"                   "murder_rate"             
##  [7] "rape_revised"             "rape_revised_rate"       
##  [9] "rape_legacy"              "rape_legacy_rate"        
## [11] "robbery"                  "robbery_rate"            
## [13] "aggravated_assault"       "aggravated_assault_rate" 
## [15] "property_crime"           "property_crime_rate"     
## [17] "burglary"                 "burglary_rate"           
## [19] "larceny_theft"            "larceny_theft_rate"      
## [21] "motor_vehicle_theft"      "motor_vehicle_theft_rate"

Part ii.

###Fix footnotes

crimedata$year[which(crimedata$year == 20015)] <- 2001
crimedata$year[which(crimedata$year == 20186)] <- 2018

###Drop rows with notes

#Dropping last 7 rows with footnotes
crimedata <- crimedata %>%
  slice(1:(n() - 7))

#Cleaned dataset
tail(crimedata)
## # A tibble: 6 × 22
##   year  population violent_crime violent_crime_rate murder murder_rate
##   <chr>      <dbl>         <dbl>              <dbl>  <dbl>       <dbl>
## 1 2014   318907401       1153022               362.  14164         4.4
## 2 2015   320896618       1199310               374.  15883         4.9
## 3 2016   323405935       1250162               387.  17413         5.4
## 4 2017   325147121       1247917               384.  17294         5.3
## 5 2018   326687501       1209997               370.  16374         5  
## 6 2019   328239523       1203808               367.  16425         5  
## # ℹ 16 more variables: rape_revised <dbl>, rape_revised_rate <dbl>,
## #   rape_legacy <dbl>, rape_legacy_rate <dbl>, robbery <dbl>,
## #   robbery_rate <dbl>, aggravated_assault <dbl>,
## #   aggravated_assault_rate <dbl>, property_crime <dbl>,
## #   property_crime_rate <dbl>, burglary <dbl>, burglary_rate <dbl>,
## #   larceny_theft <dbl>, larceny_theft_rate <dbl>, motor_vehicle_theft <dbl>,
## #   motor_vehicle_theft_rate <dbl>
#Limiting the data to years given in housingdata_cleanhpidata
range(housingdata_cleanhpidata$year)
## [1] 2008 2019
crimedata <- crimedata %>% filter(year %in% 2008:2019)

Part iii.

###Find Missing Variables

#Identifying columns with missing values
missing_vars <- crimedata %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "missing_count") %>%
  filter(missing_count > 0)

#Printing missing variables and their missing value counts
print(missing_vars)
## # A tibble: 2 × 2
##   variable          missing_count
##   <chr>                     <int>
## 1 rape_revised                  5
## 2 rape_revised_rate             5

Removed “rape_revised” and “rape_revised_rate” as they had missing values (NAs).

###Clean Crime Data

cleancrimedata <- crimedata %>% select(-c(rape_revised, rape_revised_rate))

#Check NAs
sum(is.na(cleancrimedata)) #Zero Missing Values
## [1] 0

Part iv. Merge “cleancrimedata” and “housingdata_cleanhpidata”

Find common variables

crime_vars <- colnames(crimedata)
housing_vars <- colnames(housingdata_cleanhpidata)

#common variables
common_vars <- intersect(crime_vars, housing_vars)
common_vars
## [1] "year"

Merge Data

housingdata_cleanhpidata$year <- as.numeric(housingdata_cleanhpidata$year)
cleancrimedata$year <- as.numeric(cleancrimedata$year)

#Merge
cleanhousinghpicrimedata <- left_join(housingdata_cleanhpidata, cleancrimedata, by = "year")

###Plot Crime Rates and Year

# Define the variables to plot
crime_vars <- c("violent_crime_rate", "murder_rate", "rape_legacy_rate", "robbery_rate", 
                "aggravated_assault_rate", "property_crime_rate", "burglary_rate", 
                "larceny_theft_rate", "motor_vehicle_theft_rate")

# Convert dataset to long format
crime_long <- cleanhousinghpicrimedata %>%
  select(year, all_of(crime_vars)) %>%
  pivot_longer(cols = -year, names_to = "crime_type", values_to = "value")


# Plot 
ggplot(crime_long, aes(x = year, y = value, color = crime_type)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = seq(2008, 2019, by = 1)) +  
  labs(title = "Crime Trends Over the Years",
       x = "Year",
       y = "Crime Count / Rate",
       color = "Crime Type") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

The plot suggests that most crime rates are well below 1000 per 100,000, however, property crimes and larceny thefts are two highest crime rates. They seem to have decreased substantiatlly over the years but are still higher than other crime rates. Burglary rate is the third highest crime, but still declining.

###Plot deflated sale amount by year for different housing groups This plot takes a very long time to produce (havent successfully ran a single time even though waited for an hour or so), that’s why I am skipping plotting this. However, code is given below:

# Scatter plot with smoothed trend lines

#ggplot(cleanhousinghpicrimedata, aes(x = year, y = dlog_sale_amount, color = as.factor(year_built_grp))) + geom_point(alpha = 0.6, size = 2) +  geom_smooth(method = "loess", se = FALSE, size = 1) + scale_x_continuous(breaks = seq(2008, 2019, by = 1)) +   labs(title = "Deflated Sale Amount Over the Years by Year Built Group", x = "Year", y = "Log Deflated Sale Amount", color = "Year Built Group") + theme_minimal()

#PROBLEM 4

Part i. Summarize

#Selecting relevant variables from housing data
vars_to_summarize <- c("sale_amount", "dlog_sale_amount", "year_built",
                       "bedrooms_all_buildings", "number_of_bathrooms", 
                       "number_of_fireplaces", "stories_number", 
                       "log_sale_amount", "log_land_square_footage",
                       "violent_crime_rate", "murder_rate", "rape_legacy_rate",
                       "robbery_rate", "aggravated_assault_rate", "property_crime_rate",
                       "burglary_rate", "larceny_theft_rate", "motor_vehicle_theft_rate")

#Summary Table
stargazer(cleanhousinghpicrimedata %>% select(all_of(vars_to_summarize)), 
          type = "text", title = "Summary Statistics", digits = 2)
## 
## Summary Statistics
## ====================================================================================
## Statistic                    N        Mean      St. Dev.     Min          Max       
## ------------------------------------------------------------------------------------
## sale_amount              2,348,446 441,056.80 2,116,736.00  100.00  2,729,784,889.00
## dlog_sale_amount         2,348,446   12.57        0.83       4.39        21.64      
## year_built               2,348,446  1,981.46     24.51      1,700        2,019      
## bedrooms_all_buildings   2,348,446    3.43        0.88        1            13       
## number_of_bathrooms      2,348,446    2.47        0.90       1.00        10.75      
## number_of_fireplaces     2,348,446    1.12        0.41        1            93       
## stories_number           2,348,446    1.37        0.49       1.00         6.50      
## log_sale_amount          2,348,446   12.59        0.83       4.61        21.73      
## log_land_square_footage  2,348,446    9.39        1.02       0.00        20.72      
## violent_crime_rate       2,348,446   384.74      23.94      361.60       458.60     
## murder_rate              2,348,446    4.95        0.31       4.40         5.40      
## rape_legacy_rate         2,348,446   28.89        1.68      25.90        31.00      
## robbery_rate             2,348,446   104.21      16.48      81.60        145.90     
## aggravated_assault_rate  2,348,446   246.65      11.38      229.20       277.50     
## property_crime_rate      2,348,446  2,561.03     324.64    2,109.90     3,214.60    
## burglary_rate            2,348,446   524.23      134.87     340.50       733.00     
## larceny_theft_rate       2,348,446  1,802.36     178.57    1,549.50     2,166.10    
## motor_vehicle_theft_rate 2,348,446   234.45      21.63      215.40       315.40     
## ------------------------------------------------------------------------------------

Part ii.

#Housing characters variables
housing_vars <- c("sale_amount", "dlog_sale_amount", "year_built",
                       "bedrooms_all_buildings", "number_of_bathrooms", 
                       "number_of_fireplaces", "stories_number", 
                       "log_sale_amount", "log_land_square_footage")

#Summary by year group
summary_by_year_built_grp_df <- cleanhousinghpicrimedata %>%
  group_by(year_built_grp) %>%
  summarise(across(all_of(vars_to_summarize),
                   list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
                   .names = "{.col}.{.fn}")) %>%  # Use "." instead of "_"
  pivot_longer(cols = -year_built_grp, 
               names_to = c("Variable", "Statistic"), 
               names_pattern = "(.*)\\.(.*)") %>%  # Extract proper names
  mutate(Value = round(as.numeric(value), 2)) %>%
  select(-value) %>%  # Drop old value column
  pivot_wider(names_from = year_built_grp, values_from = Value)  # Wide format

#Summary Stats table using kable
kable(summary_by_year_built_grp_df, format = "html", align = "c", caption = "Summary Statistics by Year Built Group") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
Summary Statistics by Year Built Group
Variable Statistic 1 2 3 4
sale_amount Mean 564319.73 405752.40 3.779475e+05 420230.51
sale_amount Median 360000.00 255257.18 2.740000e+05 291000.00
sale_amount SD 1313575.06 1320834.23 3.728872e+06 1070924.04
sale_amount Min 100.00 100.00 1.000000e+02 100.00
sale_amount Max 600000000.00 132019663.00 2.729785e+09 288164568.00
dlog_sale_amount Mean 12.73 12.48 1.250000e+01 12.56
dlog_sale_amount Median 12.82 12.41 1.248000e+01 12.52
dlog_sale_amount SD 1.02 0.83 6.900000e-01 0.71
dlog_sale_amount Min 4.39 4.39 4.390000e+00 4.39
dlog_sale_amount Max 20.00 18.60 2.164000e+01 19.21
year_built Mean 1946.48 1976.41 1.993190e+03 2007.65
year_built Median 1952.00 1977.00 1.993000e+03 2006.00
year_built SD 16.58 5.77 4.490000e+00 5.12
year_built Min 1700.00 1965.00 1.986000e+03 2001.00
year_built Max 1964.00 1985.00 2.000000e+03 2019.00
bedrooms_all_buildings Mean 3.18 3.31 3.480000e+00 3.72
bedrooms_all_buildings Median 3.00 3.00 3.000000e+00 4.00
bedrooms_all_buildings SD 0.90 0.82 8.100000e-01 0.89
bedrooms_all_buildings Min 1.00 1.00 1.000000e+00 1.00
bedrooms_all_buildings Max 13.00 13.00 1.300000e+01 13.00
number_of_bathrooms Mean 2.03 2.29 2.660000e+00 2.88
number_of_bathrooms Median 2.00 2.00 2.500000e+00 2.50
number_of_bathrooms SD 0.87 0.70 8.300000e-01 0.94
number_of_bathrooms Min 1.00 1.00 1.000000e+00 1.00
number_of_bathrooms Max 10.50 10.00 1.075000e+01 10.50
number_of_fireplaces Mean 1.10 1.11 1.130000e+00 1.14
number_of_fireplaces Median 1.00 1.00 1.000000e+00 1.00
number_of_fireplaces SD 0.37 0.41 4.200000e-01 0.44
number_of_fireplaces Min 1.00 1.00 1.000000e+00 1.00
number_of_fireplaces Max 16.00 93.00 1.900000e+01 30.00
stories_number Mean 1.16 1.30 1.490000e+00 1.53
stories_number Median 1.00 1.00 1.500000e+00 1.50
stories_number SD 0.36 0.47 5.000000e-01 0.50
stories_number Min 1.00 1.00 1.000000e+00 1.00
stories_number Max 6.00 6.50 6.250000e+00 6.50
log_sale_amount Mean 12.74 12.50 1.253000e+01 12.61
log_sale_amount Median 12.79 12.45 1.252000e+01 12.58
log_sale_amount SD 1.03 0.83 7.000000e-01 0.72
log_sale_amount Min 4.61 4.61 4.610000e+00 4.61
log_sale_amount Max 20.21 18.70 2.173000e+01 19.48
log_land_square_footage Mean 9.22 9.48 9.460000e+00 9.38
log_land_square_footage Median 9.00 9.25 9.250000e+00 9.16
log_land_square_footage SD 0.82 1.11 1.050000e+00 1.05
log_land_square_footage Min 0.00 0.00 0.000000e+00 0.00
log_land_square_footage Max 20.72 20.72 2.072000e+01 20.72
violent_crime_rate Mean 385.60 384.21 3.838900e+02 385.23
violent_crime_rate Median 383.80 373.70 3.737000e+02 383.80
violent_crime_rate SD 24.33 23.33 2.321000e+01 24.77
violent_crime_rate Min 361.60 361.60 3.616000e+02 361.60
violent_crime_rate Max 458.60 458.60 4.586000e+02 458.60
murder_rate Mean 4.94 4.95 4.950000e+00 4.96
murder_rate Median 5.00 5.00 5.000000e+00 5.00
murder_rate SD 0.31 0.31 3.100000e-01 0.31
murder_rate Min 4.40 4.40 4.400000e+00 4.40
murder_rate Max 5.40 5.40 5.400000e+00 5.40
rape_legacy_rate Mean 28.82 28.89 2.890000e+01 28.94
rape_legacy_rate Median 29.10 29.80 2.980000e+01 29.80
rape_legacy_rate SD 1.68 1.69 1.690000e+00 1.67
rape_legacy_rate Min 25.90 25.90 2.590000e+01 25.90
rape_legacy_rate Max 31.00 31.00 3.100000e+01 31.00
robbery_rate Mean 105.04 103.89 1.036400e+02 104.28
robbery_rate Median 102.20 102.20 1.022000e+02 102.20
robbery_rate SD 16.59 16.21 1.614000e+01 16.90
robbery_rate Min 81.60 81.60 8.160000e+01 81.60
robbery_rate Max 145.90 145.90 1.459000e+02 145.90
aggravated_assault_rate Mean 246.76 246.44 2.463500e+02 247.01
aggravated_assault_rate Median 248.30 248.30 2.482000e+02 248.30
aggravated_assault_rate SD 11.52 11.16 1.116000e+01 11.62
aggravated_assault_rate Min 229.20 229.20 2.292000e+02 229.20
aggravated_assault_rate Max 277.50 277.50 2.775000e+02 277.50
property_crime_rate Mean 2579.99 2555.66 2.549940e+03 2559.02
property_crime_rate Median 2500.50 2500.50 2.500500e+03 2500.50
property_crime_rate SD 327.07 321.51 3.199300e+02 328.92
property_crime_rate Min 2109.90 2109.90 2.109900e+03 2109.90
property_crime_rate Max 3214.60 3214.60 3.214600e+03 3214.60
burglary_rate Mean 532.61 522.30 5.197200e+02 522.57
burglary_rate Median 494.70 494.70 4.947000e+02 494.70
burglary_rate SD 136.22 134.12 1.332600e+02 135.48
burglary_rate Min 340.50 340.50 3.405000e+02 340.50
burglary_rate Max 733.00 733.00 7.330000e+02 733.00
larceny_theft_rate Mean 1812.57 1799.41 1.796380e+03 1801.33
larceny_theft_rate Median 1783.60 1783.60 1.783600e+03 1783.60
larceny_theft_rate SD 179.59 176.83 1.760900e+02 181.15
larceny_theft_rate Min 1549.50 1549.50 1.549500e+03 1549.50
larceny_theft_rate Max 2166.10 2166.10 2.166100e+03 2166.10
motor_vehicle_theft_rate Mean 234.82 233.96 2.338400e+02 235.12
motor_vehicle_theft_rate Median 230.20 230.20 2.302000e+02 230.20
motor_vehicle_theft_rate SD 21.82 20.91 2.095000e+01 22.69
motor_vehicle_theft_rate Min 215.40 215.40 2.154000e+02 215.40
motor_vehicle_theft_rate Max 315.40 315.40 3.154000e+02 315.40

Part iii.

#Summary stats by census division
summary_by_division_df <- cleanhousinghpicrimedata %>%
  group_by(division) %>%
  summarise(across(all_of(vars_to_summarize),
                   list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
                   .names = "{.col}.{.fn}")) %>%  # Use "." instead of "_"
  pivot_longer(cols = -division, 
               names_to = c("Variable", "Statistic"), 
               names_pattern = "(.*)\\.(.*)") %>%  # Extract proper names
  mutate(Value = round(as.numeric(value), 2)) %>%
  select(-value) %>%  # Drop old value column
  pivot_wider(names_from = division, values_from = Value) 

#Summary stats table using kable
kable(summary_by_division_df, format = "html", align = "c", caption = "Summary Statistics by Census Division") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
Summary Statistics by Census Division
Variable Statistic DV_ENC DV_ESC DV_MA DV_MT DV_PAC DV_SA DV_WNC DV_WSC
sale_amount Mean 270053.38 259789.82 209559.07 483343.76 597029.97 332534.00 3.113912e+05 254362.59
sale_amount Median 205000.00 212900.00 160000.00 360000.00 400000.00 220000.00 2.250000e+05 215000.00
sale_amount SD 697566.45 910328.83 372586.52 1800193.27 1102861.86 1201825.21 8.370690e+06 490246.41
sale_amount Min 500.00 500.00 1000.00 500.00 280.00 100.00 4.850000e+02 101.00
sale_amount Max 31000000.00 80240000.00 50000000.00 82000000.00 288164568.00 600000000.00 2.729785e+09 166250000.00
dlog_sale_amount Mean 12.22 12.11 11.96 12.76 12.93 12.30 1.221000e+01 12.10
dlog_sale_amount Median 12.20 12.16 12.00 12.75 12.90 12.28 1.225000e+01 12.12
dlog_sale_amount SD 0.62 0.69 0.80 0.58 0.77 0.84 7.500000e-01 0.61
dlog_sale_amount Min 6.21 6.00 6.81 6.10 5.37 4.39 5.950000e+00 4.52
dlog_sale_amount Max 17.20 18.10 17.66 18.00 19.21 20.00 2.164000e+01 18.93
year_built Mean 1979.27 1993.10 1971.79 1991.76 1971.44 1986.28 1.979080e+03 1993.02
year_built Median 1984.00 2001.00 1978.00 1997.00 1973.00 1990.00 1.987000e+03 1999.00
year_built SD 26.86 24.84 30.45 20.23 24.34 21.92 2.971000e+01 18.14
year_built Min 1800.00 1805.00 1700.00 1800.00 1801.00 1700.00 1.825000e+03 1859.00
year_built Max 2019.00 2019.00 2019.00 2019.00 2019.00 2019.00 2.019000e+03 2019.00
bedrooms_all_buildings Mean 3.29 3.43 3.27 3.50 3.35 3.48 3.290000e+00 3.55
bedrooms_all_buildings Median 3.00 3.00 3.00 3.00 3.00 3.00 3.000000e+00 3.00
bedrooms_all_buildings SD 0.73 0.72 0.89 1.00 0.90 0.84 9.400000e-01 0.76
bedrooms_all_buildings Min 1.00 1.00 1.00 1.00 1.00 1.00 1.000000e+00 1.00
bedrooms_all_buildings Max 8.00 13.00 11.00 13.00 13.00 13.00 1.300000e+01 13.00
number_of_bathrooms Mean 2.34 2.53 1.97 2.80 2.28 2.61 2.600000e+00 2.48
number_of_bathrooms Median 2.50 2.10 2.00 3.00 2.00 2.50 2.500000e+00 2.00
number_of_bathrooms SD 0.82 0.87 0.79 0.96 0.82 0.96 1.050000e+00 0.79
number_of_bathrooms Min 1.00 1.00 1.00 1.00 1.00 1.00 1.000000e+00 1.00
number_of_bathrooms Max 7.00 10.00 9.00 10.75 10.50 10.50 1.000000e+01 10.50
number_of_fireplaces Mean 1.08 1.07 1.11 1.21 1.10 1.09 1.190000e+00 1.13
number_of_fireplaces Median 1.00 1.00 1.00 1.00 1.00 1.00 1.000000e+00 1.00
number_of_fireplaces SD 0.34 0.31 0.42 0.52 0.36 0.43 4.600000e-01 0.37
number_of_fireplaces Min 1.00 1.00 1.00 1.00 1.00 1.00 1.000000e+00 1.00
number_of_fireplaces Max 11.00 11.00 20.00 13.00 30.00 93.00 1.200000e+01 11.00
stories_number Mean 1.54 1.39 1.57 1.55 1.31 1.38 1.370000e+00 1.37
stories_number Median 1.50 1.00 1.50 2.00 1.00 1.00 1.000000e+00 1.00
stories_number SD 0.51 0.46 0.51 0.50 0.47 0.49 4.700000e-01 0.46
stories_number Min 1.00 1.00 1.00 1.00 1.00 1.00 1.000000e+00 1.00
stories_number Max 5.00 4.00 5.00 5.00 6.00 6.00 6.500000e+00 4.00
log_sale_amount Mean 12.25 12.20 11.94 12.79 12.93 12.29 1.228000e+01 12.26
log_sale_amount Median 12.23 12.27 11.98 12.79 12.90 12.30 1.232000e+01 12.28
log_sale_amount SD 0.62 0.70 0.81 0.60 0.80 0.86 7.500000e-01 0.62
log_sale_amount Min 6.21 6.21 6.91 6.21 5.63 4.61 6.180000e+00 4.62
log_sale_amount Max 17.25 18.20 17.73 18.22 19.48 20.21 2.173000e+01 18.93
log_land_square_footage Mean 9.52 9.88 9.95 9.34 9.15 9.65 9.790000e+00 9.39
log_land_square_footage Median 9.23 9.63 9.93 9.06 8.95 9.57 9.430000e+00 9.11
log_land_square_footage SD 1.14 1.11 1.12 1.18 0.91 1.04 1.150000e+00 0.90
log_land_square_footage Min 3.78 6.08 3.78 0.00 0.00 0.69 1.390000e+00 0.00
log_land_square_footage Max 14.35 16.65 15.40 18.47 19.54 18.69 2.038000e+01 20.72
violent_crime_rate Mean 386.94 380.08 384.30 382.56 386.40 382.72 3.844700e+02 385.71
violent_crime_rate Median 383.80 370.40 383.80 373.70 383.80 373.70 3.737000e+02 383.80
violent_crime_rate SD 26.60 21.67 23.66 22.13 24.71 22.42 2.384000e+01 25.34
violent_crime_rate Min 361.60 361.60 361.60 361.60 361.60 361.60 3.616000e+02 361.60
violent_crime_rate Max 458.60 458.60 458.60 458.60 458.60 458.60 4.586000e+02 458.60
murder_rate Mean 4.98 4.98 4.96 4.96 4.94 4.96 4.950000e+00 4.95
murder_rate Median 5.00 5.00 5.00 5.00 5.00 5.00 5.000000e+00 5.00
murder_rate SD 0.31 0.27 0.31 0.31 0.31 0.31 3.100000e-01 0.31
murder_rate Min 4.40 4.40 4.40 4.40 4.40 4.40 4.400000e+00 4.40
murder_rate Max 5.40 5.40 5.40 5.40 5.40 5.40 5.400000e+00 5.40
rape_legacy_rate Mean 29.05 29.34 28.95 28.98 28.79 29.00 2.890000e+01 28.84
rape_legacy_rate Median 29.80 29.90 29.80 29.90 29.10 29.90 2.980000e+01 29.80
rape_legacy_rate SD 1.62 1.54 1.67 1.69 1.67 1.68 1.680000e+00 1.69
rape_legacy_rate Min 25.90 25.90 25.90 25.90 25.90 25.90 2.590000e+01 25.90
rape_legacy_rate Max 31.00 31.00 31.00 31.00 31.00 31.00 3.100000e+01 31.00
robbery_rate Mean 104.81 98.46 103.69 102.48 105.68 102.42 1.040400e+02 105.02
robbery_rate Median 102.20 98.60 102.20 102.20 102.90 102.20 1.022000e+02 102.20
robbery_rate SD 17.90 16.42 16.33 15.68 16.66 15.93 1.637000e+01 17.01
robbery_rate Min 81.60 81.60 81.60 81.60 81.60 81.60 8.160000e+01 81.60
robbery_rate Max 145.90 145.90 145.90 145.90 145.90 145.90 1.459000e+02 145.90
aggravated_assault_rate Mean 248.07 247.26 246.65 246.09 246.94 246.29 2.465400e+02 246.86
aggravated_assault_rate Median 248.30 248.30 248.30 248.30 248.30 248.30 2.483000e+02 248.20
aggravated_assault_rate SD 12.03 9.91 11.25 10.80 11.65 10.83 1.138000e+01 11.98
aggravated_assault_rate Min 229.20 229.20 229.20 229.20 229.20 229.20 2.292000e+02 229.20
aggravated_assault_rate Max 277.50 277.50 277.50 277.50 277.50 277.50 2.775000e+02 277.50
property_crime_rate Mean 2563.76 2439.67 2547.38 2524.52 2592.99 2522.99 2.556320e+03 2577.13
property_crime_rate Median 2500.50 2362.90 2500.50 2451.60 2500.50 2451.60 2.500500e+03 2500.50
property_crime_rate SD 341.42 318.74 321.11 312.01 327.86 315.76 3.218900e+02 330.11
property_crime_rate Min 2109.90 2109.90 2109.90 2109.90 2109.90 2109.90 2.109900e+03 2109.90
property_crime_rate Max 3214.60 3214.60 3214.60 3214.60 3214.60 3214.60 3.214600e+03 3214.60
burglary_rate Mean 522.74 472.91 517.97 508.79 538.05 508.12 5.219900e+02 530.47
burglary_rate Median 494.70 429.70 494.70 468.90 494.70 468.90 4.947000e+02 494.70
burglary_rate SD 138.14 130.39 133.07 129.97 136.55 131.29 1.335100e+02 135.87
burglary_rate Min 340.50 340.50 340.50 340.50 340.50 340.50 3.405000e+02 340.50
burglary_rate Max 733.00 733.00 733.00 733.00 733.00 733.00 7.330000e+02 733.00
larceny_theft_rate Mean 1803.92 1735.06 1795.10 1782.63 1819.63 1781.66 1.800000e+03 1811.30
larceny_theft_rate Median 1783.60 1695.50 1783.60 1745.40 1783.60 1745.40 1.783600e+03 1783.60
larceny_theft_rate SD 188.33 177.14 176.81 172.13 179.82 174.24 1.771600e+02 181.67
larceny_theft_rate Min 1549.50 1549.50 1549.50 1549.50 1549.50 1549.50 1.549500e+03 1549.50
larceny_theft_rate Max 2166.10 2166.10 2166.10 2166.10 2166.10 2166.10 2.166100e+03 2166.10
motor_vehicle_theft_rate Mean 237.10 231.70 234.32 233.10 235.32 233.22 2.343400e+02 235.36
motor_vehicle_theft_rate Median 230.20 230.20 230.20 230.20 230.20 230.20 2.302000e+02 230.20
motor_vehicle_theft_rate SD 25.30 19.60 21.52 20.12 22.15 20.31 2.164000e+01 23.28
motor_vehicle_theft_rate Min 215.40 215.40 215.40 215.40 215.40 215.40 2.154000e+02 215.40
motor_vehicle_theft_rate Max 315.40 315.40 315.40 315.40 315.40 315.40 3.154000e+02 315.40

Part iv.

###Creating region variable

#Converting division to region
division_to_region <- c(
  "DV_NE" = "Northeast", "DV_MA" = "Northeast", # New England & Middle Atlantic
  
  "DV_ENC" = "Midwest", "DV_WNC" = "Midwest", # East & West North Central
  
  "DV_SA" = "South", "DV_ESC" = "South", "DV_WSC" = "South", # South Atlantic & Central
  
  "DV_MT" = "West", "DV_PAC" = "West" # Mountain & Pacific
)

#Regions Based on Division
cleanhousinghpicrimedata <- cleanhousinghpicrimedata %>%
  mutate(region = recode(division, !!!division_to_region))

Summarizing

# Compute summary statistics by census region
summary_by_region_df <- cleanhousinghpicrimedata %>%
  group_by(region) %>%
  summarise(across(all_of(vars_to_summarize),
                   list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
                   .names = "{.col}.{.fn}")) %>%  
  pivot_longer(cols = -region, 
               names_to = c("Variable", "Statistic"), 
               names_pattern = "(.*)\\.(.*)") %>%  
  mutate(Value = round(as.numeric(value), 2)) %>%
  select(-value) %>%  # Drop old value column
  pivot_wider(names_from = region, values_from = Value)  

# Display table using kable
kable(summary_by_region_df, format = "html", align = "c", caption = "Summary Statistics by Census Region") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
Summary Statistics by Census Region
Variable Statistic Midwest Northeast South West
sale_amount Mean 3.088721e+05 209559.07 303133.49 572709.28
sale_amount Median 2.240000e+05 160000.00 218000.00 389000.00
sale_amount SD 8.113455e+06 372586.52 1000965.34 1285126.43
sale_amount Min 4.850000e+02 1000.00 100.00 280.00
sale_amount Max 2.729785e+09 50000000.00 600000000.00 288164568.00
dlog_sale_amount Mean 1.221000e+01 11.96 12.23 12.89
dlog_sale_amount Median 1.225000e+01 12.00 12.22 12.85
dlog_sale_amount SD 7.400000e-01 0.80 0.77 0.74
dlog_sale_amount Min 5.950000e+00 6.81 4.39 5.37
dlog_sale_amount Max 2.164000e+01 17.66 20.00 19.21
year_built Mean 1.979090e+03 1971.79 1988.82 1975.79
year_built Median 1.987000e+03 1978.00 1993.00 1978.00
year_built SD 2.955000e+01 30.45 20.95 24.95
year_built Min 1.800000e+03 1700.00 1700.00 1800.00
year_built Max 2.019000e+03 2019.00 2019.00 2019.00
bedrooms_all_buildings Mean 3.290000e+00 3.27 3.50 3.38
bedrooms_all_buildings Median 3.000000e+00 3.00 3.00 3.00
bedrooms_all_buildings SD 9.300000e-01 0.89 0.81 0.92
bedrooms_all_buildings Min 1.000000e+00 1.00 1.00 1.00
bedrooms_all_buildings Max 1.300000e+01 11.00 13.00 13.00
number_of_bathrooms Mean 2.580000e+00 1.97 2.56 2.39
number_of_bathrooms Median 2.500000e+00 2.00 2.50 2.00
number_of_bathrooms SD 1.030000e+00 0.79 0.90 0.88
number_of_bathrooms Min 1.000000e+00 1.00 1.00 1.00
number_of_bathrooms Max 1.000000e+01 9.00 10.50 10.75
number_of_fireplaces Mean 1.190000e+00 1.11 1.11 1.12
number_of_fireplaces Median 1.000000e+00 1.00 1.00 1.00
number_of_fireplaces SD 4.600000e-01 0.42 0.41 0.40
number_of_fireplaces Min 1.000000e+00 1.00 1.00 1.00
number_of_fireplaces Max 1.200000e+01 20.00 93.00 30.00
stories_number Mean 1.380000e+00 1.57 1.37 1.36
stories_number Median 1.000000e+00 1.50 1.00 1.00
stories_number SD 4.800000e-01 0.51 0.48 0.49
stories_number Min 1.000000e+00 1.00 1.00 1.00
stories_number Max 6.500000e+00 5.00 6.00 6.00
log_sale_amount Mean 1.228000e+01 11.94 12.28 12.90
log_sale_amount Median 1.232000e+01 11.98 12.29 12.87
log_sale_amount SD 7.500000e-01 0.81 0.78 0.76
log_sale_amount Min 6.180000e+00 6.91 4.61 5.63
log_sale_amount Max 2.173000e+01 17.73 20.21 19.48
log_land_square_footage Mean 9.770000e+00 9.95 9.56 9.19
log_land_square_footage Median 9.410000e+00 9.93 9.37 8.97
log_land_square_footage SD 1.150000e+00 1.12 1.00 0.98
log_land_square_footage Min 1.390000e+00 3.78 0.00 0.00
log_land_square_footage Max 2.038000e+01 15.40 20.72 19.54
violent_crime_rate Mean 3.846200e+02 384.30 383.75 385.58
violent_crime_rate Median 3.737000e+02 383.80 373.70 383.80
violent_crime_rate SD 2.402000e+01 23.66 23.55 24.23
violent_crime_rate Min 3.616000e+02 361.60 361.60 361.60
violent_crime_rate Max 4.586000e+02 458.60 458.60 458.60
murder_rate Mean 4.960000e+00 4.96 4.96 4.95
murder_rate Median 5.000000e+00 5.00 5.00 5.00
murder_rate SD 3.100000e-01 0.31 0.31 0.31
murder_rate Min 4.400000e+00 4.40 4.40 4.40
murder_rate Max 5.400000e+00 5.40 5.40 5.40
rape_legacy_rate Mean 2.891000e+01 28.95 28.95 28.83
rape_legacy_rate Median 2.980000e+01 29.80 29.80 29.10
rape_legacy_rate SD 1.680000e+00 1.67 1.69 1.68
rape_legacy_rate Min 2.590000e+01 25.90 25.90 25.90
rape_legacy_rate Max 3.100000e+01 31.00 31.00 31.00
robbery_rate Mean 1.040900e+02 103.69 103.29 105.00
robbery_rate Median 1.022000e+02 102.20 102.20 102.20
robbery_rate SD 1.646000e+01 16.33 16.40 16.51
robbery_rate Min 8.160000e+01 81.60 81.60 81.60
robbery_rate Max 1.459000e+02 145.90 145.90 145.90
aggravated_assault_rate Mean 2.466300e+02 246.65 246.51 246.76
aggravated_assault_rate Median 2.483000e+02 248.30 248.30 248.30
aggravated_assault_rate SD 1.143000e+01 11.25 11.24 11.48
aggravated_assault_rate Min 2.292000e+02 229.20 229.20 229.20
aggravated_assault_rate Max 2.775000e+02 277.50 277.50 277.50
property_crime_rate Mean 2.556770e+03 2547.38 2540.98 2578.34
property_crime_rate Median 2.500500e+03 2500.50 2500.50 2500.50
property_crime_rate SD 3.231200e+02 321.11 322.37 325.74
property_crime_rate Min 2.109900e+03 2109.90 2109.90 2109.90
property_crime_rate Max 3.214600e+03 3214.60 3214.60 3214.60
burglary_rate Mean 5.220400e+02 517.97 515.53 531.79
burglary_rate Median 4.947000e+02 494.70 494.70 494.70
burglary_rate SD 1.338000e+02 133.07 133.49 135.70
burglary_rate Min 3.405000e+02 340.50 340.50 340.50
burglary_rate Max 7.330000e+02 733.00 733.00 733.00
larceny_theft_rate Mean 1.800240e+03 1795.10 1791.49 1811.72
larceny_theft_rate Median 1.783600e+03 1783.60 1783.60 1783.60
larceny_theft_rate SD 1.778600e+02 176.81 177.72 178.85
larceny_theft_rate Min 1.549500e+03 1549.50 1549.50 1549.50
larceny_theft_rate Max 2.166100e+03 2166.10 2166.10 2166.10
motor_vehicle_theft_rate Mean 2.345100e+02 234.32 233.96 234.84
motor_vehicle_theft_rate Median 2.302000e+02 230.20 230.20 230.20
motor_vehicle_theft_rate SD 2.189000e+01 21.52 21.44 21.75
motor_vehicle_theft_rate Min 2.154000e+02 215.40 215.40 215.40
motor_vehicle_theft_rate Max 3.154000e+02 315.40 315.40 315.40

PROBLEM 5

Part i.

library(lfe)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(fixest)
## 
## Attaching package: 'fixest'
## The following object is masked from 'package:lfe':
## 
##     fepois
## The following object is masked from 'package:scales':
## 
##     pvalue
#Dependent Variable
cleanhousinghpicrimedata$log_sale_hpi <- log(cleanhousinghpicrimedata$sale_amount / cleanhousinghpicrimedata$hpi)

# Run the regression
reg_model1 <- feols( log_sale_hpi ~ violent_crime_rate + murder_rate + rape_legacy_rate + robbery_rate + 
                    aggravated_assault_rate + property_crime_rate + burglary_rate + larceny_theft_rate + 
                    motor_vehicle_theft_rate +
                    bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence
  | year + division, data = cleanhousinghpicrimedata)
## The variables 'violent_crime_rate', 'murder_rate', 'rape_legacy_rate', 'robbery_rate', 'aggravated_assault_rate', 'burglary_rate' and 2 others have been removed because of collinearity (see $collin.var).
summary(reg_model1)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12,  division: 8
## Standard-errors: Clustered (year) 
##                          Estimate   Std. Error     t value   Pr(>|t|)    
## property_crime_rate     -0.035109 90113.721445 -0.00000039 1.0000e+00    
## bedrooms_all_buildings   0.033349     0.002606 12.79508467 5.9999e-08 ***
## number_of_bathrooms      0.390041     0.005534 70.47807530 5.8287e-16 ***
## number_of_fireplaces     0.104705     0.005774 18.13351701 1.5247e-09 ***
## stories_number           0.025739     0.003864  6.66059565 3.5599e-05 ***
## log_land_square_footage  0.009049     0.002383  3.79735806 2.9568e-03 ** 
## AC_presence             -0.150589     0.015525 -9.70005503 1.0014e-06 ***
## ... 8 variables were removed because of collinearity (violent_crime_rate, murder_rate and 6 others [full set in $collin.var])
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.644389     Adj. R2: 0.391734
##                  Within R2: 0.259987

The model automatically removes violent_crime_rate, murder_rate, rape_legacy_rate, robbery_rate, aggravated_assault_rate, burglary_rate, larceny_theft_rate due to collinearity between variables and only keeps property_crime_rate.

Part ii.

#Model 1: Adding year_built_group
reg_model2 <- feols( log_sale_hpi ~ property_crime_rate +
                    year_built_grp + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence
  | year + division, data = cleanhousinghpicrimedata)

summary(reg_model2)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12,  division: 8
## Standard-errors: Clustered (year) 
##                          Estimate   Std. Error       t value   Pr(>|t|)    
## property_crime_rate     -0.032002 87219.715689  -0.000000367 1.0000e+00    
## year_built_grp          -0.069411     0.003459 -20.069152836 5.1500e-10 ***
## bedrooms_all_buildings   0.033820     0.002604  12.985422222 5.1481e-08 ***
## number_of_bathrooms      0.412299     0.005985  68.892500438 7.4826e-16 ***
## number_of_fireplaces     0.091589     0.005564  16.461993841 4.2657e-09 ***
## stories_number           0.051416     0.004274  12.029278393 1.1345e-07 ***
## log_land_square_footage  0.007059     0.002350   3.003710145 1.2000e-02 *  
## AC_presence             -0.132790     0.013812  -9.614021443 1.0940e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.640959     Adj. R2: 0.398192
##                  Within R2: 0.267844
#Model 2: Adding year_built to the model
reg_model3 <- feols( log_sale_hpi ~ property_crime_rate +
                    year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence
  | year + division, data = cleanhousinghpicrimedata)

summary(reg_model3)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12,  division: 8
## Standard-errors: Clustered (year) 
##                          Estimate   Std. Error       t value   Pr(>|t|)    
## property_crime_rate     -0.029196 85328.669408  -0.000000342 1.0000e+00    
## year_built              -0.003539     0.000174 -20.320988727 4.5049e-10 ***
## bedrooms_all_buildings   0.032436     0.002563  12.657242362 6.7119e-08 ***
## number_of_bathrooms      0.417360     0.006166  67.685403662 9.0848e-16 ***
## number_of_fireplaces     0.086945     0.005398  16.106409654 5.3762e-09 ***
## stories_number           0.044878     0.004192  10.706758146 3.7212e-07 ***
## log_land_square_footage  0.008783     0.002408   3.647265701 3.8390e-03 ** 
## AC_presence             -0.129416     0.013560  -9.543658617 1.1766e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.640031     Adj. R2: 0.399934
##                  Within R2: 0.269963
#Model 3: using fips_county_code as fixed effects
reg_model4 <- feols(log_sale_hpi ~ property_crime_rate +
                    year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence
  | year + fips_county_code, data = cleanhousinghpicrimedata)

summary(reg_model4)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12,  fips_county_code: 501
## Standard-errors: Clustered (year) 
##                          Estimate   Std. Error      t value   Pr(>|t|)    
## property_crime_rate      0.005961 31813.967915  0.000000187 1.0000e+00    
## year_built               0.001051     0.000083 12.648130888 6.7621e-08 ***
## bedrooms_all_buildings   0.044571     0.002188 20.366365222 4.3982e-10 ***
## number_of_bathrooms      0.303712     0.005577 54.454969683 9.8735e-15 ***
## number_of_fireplaces     0.114228     0.005384 21.214922398 2.8362e-10 ***
## stories_number           0.041854     0.006819  6.137660350 7.3345e-05 ***
## log_land_square_footage  0.070799     0.001031 68.668072951 7.7554e-16 ***
## AC_presence             -0.021494     0.003507 -6.129301249 7.4219e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.533756     Adj. R2: 0.582579
##                  Within R2: 0.276663
# Load necessary package
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
# Create regression table
model_list <- list(
  "Model 1 (Year Built Group)" = reg_model2,
  "Model 2 (Year Built Added)" = reg_model3,
  "Model 3 (County FE)" = reg_model4
)

# Generate summary table
modelsummary(model_list, 
  title = "Regression Results: Impact of Crime Rates on Housing Prices",
  coef_omit = "Intercept",  # Hide intercept
  stars = TRUE, 
  gof_omit = "IC|RMSE",     # Remove unnecessary stats
  fmt = 4,                  # Round numbers
  escape = FALSE  # Ensures special characters (e.g., &nbsp;) don't appear in the title
)
Regression Results: Impact of Crime Rates on Housing Prices
Model 1 (Year Built Group) Model 2 (Year Built Added) Model 3 (County FE)
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
property_crime_rate -0.0320 -0.0292 0.0060
(87219.7157) (85328.6694) (31813.9679)
year_built_grp -0.0694***
(0.0035)
bedrooms_all_buildings 0.0338*** 0.0324*** 0.0446***
(0.0026) (0.0026) (0.0022)
number_of_bathrooms 0.4123*** 0.4174*** 0.3037***
(0.0060) (0.0062) (0.0056)
number_of_fireplaces 0.0916*** 0.0869*** 0.1142***
(0.0056) (0.0054) (0.0054)
stories_number 0.0514*** 0.0449*** 0.0419***
(0.0043) (0.0042) (0.0068)
log_land_square_footage 0.0071* 0.0088** 0.0708***
(0.0024) (0.0024) (0.0010)
AC_presence -0.1328*** -0.1294*** -0.0215***
(0.0138) (0.0136) (0.0035)
year_built -0.0035*** 0.0011***
(0.0002) (0.0001)
Num.Obs. 2348446 2348446 2348446
R2 0.398 0.400 0.583
R2 Adj. 0.398 0.400 0.583
R2 Within 0.268 0.270 0.277
R2 Within Adj. 0.268 0.270 0.277
Std.Errors by: year by: year by: year
FE: year X X X
FE: division X X
FE: fips_county_code X

Part iii.

I prefer the last model, as the year_built has positive significant effect which is consistent with the housing market where newer houses has better sale values.

#Model 3 by Region
reg_model4_reg <- feols(log_sale_hpi ~ property_crime_rate +
                    year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence +
                      region
  | year, data = cleanhousinghpicrimedata)

summary(reg_model4_reg)
## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12
## Standard-errors: Clustered (year) 
##                          Estimate Std. Error   t value   Pr(>|t|)    
## property_crime_rate     -0.000016   0.000011  -1.43860 1.7810e-01    
## year_built              -0.004729   0.000152 -31.08332 4.5319e-12 ***
## bedrooms_all_buildings   0.039166   0.003082  12.70701 6.4448e-08 ***
## number_of_bathrooms      0.410606   0.005558  73.87403 3.4769e-16 ***
## number_of_fireplaces     0.069861   0.005188  13.46618 3.5275e-08 ***
## stories_number           0.031290   0.003783   8.27116 4.7512e-06 ***
## log_land_square_footage  0.006742   0.001930   3.49268 5.0347e-03 ** 
## AC_presence             -0.102340   0.012734  -8.03683 6.2529e-06 ***
## regionNortheast         -0.057900   0.027833  -2.08029 6.1668e-02 .  
## regionSouth              0.115901   0.014255   8.13051 5.5988e-06 ***
## regionWest               0.761214   0.014750  51.60695 1.7790e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.648676     Adj. R2: 0.383615
##                  Within R2: 0.382314

Part iv.

Here, the key crime variable is property crime rate as others had collinearity

#New variable to store estimated effects
cleanhousinghpicrimedata$crime_effect_region <- NA

#Loop through each region and estimate the effect
for (r in unique(cleanhousinghpicrimedata$region)) {
  
  #Run regression for each region
  model <- feols(log_sale_hpi ~ property_crime_rate + bedrooms_all_buildings +
                   number_of_bathrooms + number_of_fireplaces + stories_number +
                   log_land_square_footage + AC_presence, 
                 data = cleanhousinghpicrimedata %>% filter(region == r))
  
  #Extract the coefficient for property_crime_rate
  effect <- coef(model)["property_crime_rate"]
  
  #Assigning effect to all rows 
  cleanhousinghpicrimedata$crime_effect_region[cleanhousinghpicrimedata$region == r] <- effect
}

# Round the estimated crime effect values to 4 decimal places
cleanhousinghpicrimedata$crime_effect_region <- round(cleanhousinghpicrimedata$crime_effect_region, 4)

#Scaling the crime rate variable
cleanhousinghpicrimedata$scaled_property_crime_rate <- cleanhousinghpicrimedata$property_crime_rate / 1000

summary(cleanhousinghpicrimedata$scaled_property_crime_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.110   2.363   2.501   2.561   2.868   3.215

Map the region wise effect

# Define U.S. Census regions
census_regions <- data.frame(
  state = c("Maine", "New Hampshire", "Vermont", "Massachusetts", "Rhode Island", "Connecticut",
            "New York", "New Jersey", "Pennsylvania",
            "Wisconsin", "Michigan", "Illinois", "Indiana", "Ohio",
            "North Dakota", "South Dakota", "Nebraska", "Kansas", "Minnesota", "Iowa", "Missouri",
            "Delaware", "Maryland", "District of Columbia", "Virginia", "West Virginia",
            "North Carolina", "South Carolina", "Georgia", "Florida", "Kentucky", "Tennessee",
            "Alabama", "Mississippi", "Arkansas", "Louisiana",
            "Oklahoma", "Texas",
            "Montana", "Idaho", "Wyoming", "Colorado", "New Mexico",
            "Arizona", "Utah", "Nevada",
            "Washington", "Oregon", "California", "Alaska", "Hawaii"),
  region = c(rep("Northeast", 9),
             rep("Midwest", 12),
             rep("South", 17),
             rep("West", 13))
)

# Convert state names to lowercase for merging with `usmap` data
census_regions$state <- tolower(census_regions$state)

# Aggregate mean crime effect by region
region_effects <- cleanhousinghpicrimedata %>%
  group_by(region) %>%
  summarise(mean_crime_effect = mean(crime_effect_region, na.rm = TRUE))

# Merge state-region mapping with crime effect data
plot_data <- left_join(census_regions, region_effects, by = "region")

# Plot the map, now with a **gradient scale**
plot_usmap(data = plot_data, values = "mean_crime_effect", regions = "state") +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = mean(plot_data$mean_crime_effect, na.rm = TRUE),
    name = "Mean Crime Effect",
    na.value = "grey50"
  ) +
  theme(legend.position = "right") +
  labs(title = "Variation in Estimated Crime Rate Effect by U.S. Region (Gradient)")