Setting Working Directory

setwd("C:/Users/Lenovo/OneDrive - The Pennsylvania State University/Penn State/Coursework/Sem IV/EEFE 530/Problem Sets/Problem Set 1")

#Libraries

# Load packages
library(pacman)
p_load(tidyverse, lubridate, usmap, gridExtra, stringr, readxl, plot3D,  
       cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,  
       foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont, 
       kableExtra, snakecase, janitor)
library(dplyr)

#Load Data

housingdata <- readRDS("testdata20250121.RDS")

#PROBLEM 1

##Part i - Date Extraction

#Checking variable type
str(housingdata$sale_date)  # character

##  chr [1:2350548] "20150129" "20090904" "20130228" "20080825" "20140225" ...

head(housingdata$sale_date)  # View first few values

## [1] "20150129" "20090904" "20130228" "20080825" "20140225" "20180111"

#create a copy
housingdata_wip <- housingdata

#Change the string to YYY-MM-DD format
housingdata_wip <- housingdata_wip %>%
  mutate(sale_date = ymd(sale_date)) 

#Separate columns
housingdata_wip <- housingdata_wip %>%
  mutate(
    year = year(sale_date),
    month = month(sale_date),
    day = day(sale_date)
  )

#Displaying first 10 rows with extracted dates
housingdata_wip %>%
  select(sale_date, year, month, day) %>%
  head(10)

##      sale_date  year month   day
##         <Date> <int> <int> <int>
##  1: 2015-01-29  2015     1    29
##  2: 2009-09-04  2009     9     4
##  3: 2013-02-28  2013     2    28
##  4: 2008-08-25  2008     8    25
##  5: 2014-02-25  2014     2    25
##  6: 2018-01-11  2018     1    11
##  7: 2015-04-08  2015     4     8
##  8: 2015-06-23  2015     6    23
##  9: 2009-05-28  2009     5    28
## 10: 2012-01-31  2012     1    31

Part ii. Summarizing Variables

#Summarizing Year
summary(housingdata_wip$year)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2008    2012    2015    2015    2017    2019

hist(housingdata_wip$year)

#Summarizing Year Built
summary(housingdata_wip$year_built)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1700    1965    1986    1981    2001    2019

hist(housingdata_wip$year_built)

#Generating Categorical Variable for Year Built
housingdata_wip <- housingdata_wip %>%
  mutate(year_built_grp = case_when(
    year_built < 1965 ~ 1,  # Very old houses = based on first quantile
    year_built >= 1965 & year_built < 1986 ~ 2,  # Quite old houses = based on median
    year_built >= 1986 & year_built < 2001 ~ 3,  # Quite new houses = based on third quantile
    year_built >= 2001 ~ 4  # Very new houses
  ))

#Defining categorical labels for `year_built_grp`
housingdata_wip <- housingdata_wip %>%
  mutate(
    log_sale_amount = log(sale_amount),
    log_land_square_footage = log(land_square_footage),
    year_built_grp_label = case_when(
      year_built_grp == 1 ~ "1: Very Old (<1965)",
      year_built_grp == 2 ~ "2: Quite Old (1965-1986)",
      year_built_grp == 3 ~ "3: Quite New (1986-2001)",
      year_built_grp == 4 ~ "4: Very New (≥2001)"
    )
  )

#Displaying first few rows
head(housingdata_wip %>% select(year_built, year_built_grp), 10)

##     year_built year_built_grp
##          <int>          <num>
##  1:       2009              4
##  2:       1986              3
##  3:       2006              4
##  4:       2000              3
##  5:       1940              1
##  6:       2006              4
##  7:       1999              3
##  8:       1998              3
##  9:       2007              4
## 10:       1994              3

Part iii - 3D Plot

#Creating logged variables for continuous values
housingdata_wip <- housingdata_wip %>%
  mutate(
    log_sale_amount = log(sale_amount),
    log_land_square_footage = log(land_square_footage)
  )

#Check NAs
sum(is.na(housingdata_wip$log_land_square_footage)) #zero

## [1] 0

sum(is.na(housingdata_wip$log_sale_amount)) #zero

## [1] 0

#3D scatter Plot

library(plot3D)

#Defining colours for categories
colors <- c("darkblue", "skyblue", "yellow", "brown")  
color_mapping <- colors[as.numeric(housingdata_wip$year_built_grp)]

#Plot
plot_3d <- scatter3D(
  x = housingdata_wip$log_sale_amount,  
  y = housingdata_wip$year_built_grp,   
  z = housingdata_wip$log_land_square_footage,  
  colvar = NULL,  
  col = color_mapping,
  pch = 19, 
  cex = 1.2,
  theta = 40, phi = 20,  
  main = "3D Plot of Log Sale Amount, Year Built Group, and Log Land Square Footage",
  xlab = "Log Sale Amount",
  ylab = "Year Built Group",
  zlab = "Log Land Square Footage",
  ticktype = "detailed"
)

plot_3d

##               [,1]        [,2]        [,3]        [,4]
## [1,]  8.947906e-02 -0.02567950  0.07055385 -0.07055385
## [2,]  4.285251e-01  0.17466842 -0.47989754  0.47989754
## [3,] -3.798445e-18  0.09068963  0.03300832 -0.03300832
## [4,] -2.249423e+00 -1.03825888 -2.80326238  3.80326238

#Legend
par(new = TRUE)
legend("topright", 
       legend = c("1: Very Old (<1965)", "2: Quite Old (1965-1986)", 
                  "3: Quite New (1986-2001)", "4: Very New (≥2001)"),
       col = colors, pch = 19, cex = 1.2, bty = "n")

Part iv. Cleaning the data

#Summarising numerical variables
summary_stats <- housingdata_wip %>% summary(across(where(is.numeric)))
print(summary_stats)

##   sale_amount          sale_date                x                 y        
##  Min.   :1.000e+02   Min.   :2008-01-01   Min.   :-124.57   Min.   :26.32  
##  1st Qu.:1.800e+05   1st Qu.:2012-06-13   1st Qu.:-118.32   1st Qu.:33.55  
##  Median :2.870e+05   Median :2015-07-28   Median :-104.77   Median :34.37  
##  Mean   :4.417e+05   Mean   :2015-01-19   Mean   :-102.81   Mean   :35.88  
##  3rd Qu.:4.740e+05   3rd Qu.:2017-12-13   3rd Qu.: -84.53   3rd Qu.:38.56  
##  Max.   :2.730e+09   Max.   :2019-12-31   Max.   : -73.91   Max.   :48.97  
##  fips_county_code   year_built   bedrooms_all_buildings number_of_bathrooms
##  Min.   : 1101    Min.   :1700   Min.   :  1.000        Min.   :  0.010    
##  1st Qu.: 6081    1st Qu.:1965   1st Qu.:  3.000        1st Qu.:  2.000    
##  Median :12091    Median :1986   Median :  3.000        Median :  2.000    
##  Mean   :20436    Mean   :1981   Mean   :  3.431        Mean   :  2.472    
##  3rd Qu.:40087    3rd Qu.:2001   3rd Qu.:  4.000        3rd Qu.:  3.000    
##  Max.   :53071    Max.   :2019   Max.   :999.000        Max.   :450.000    
##  number_of_fireplaces stories_number    land_square_footage     abbr          
##  Min.   :  1.00       Min.   :  0.500   Min.   :        1   Length:2350548    
##  1st Qu.:  1.00       1st Qu.:  1.000   1st Qu.:     6874   Class :character  
##  Median :  1.00       Median :  1.000   Median :     9583   Mode  :character  
##  Mean   :  1.19       Mean   :  1.371   Mean   :    39671                     
##  3rd Qu.:  1.00       3rd Qu.:  2.000   3rd Qu.:    17031                     
##  Max.   :999.00       Max.   :175.000   Max.   :999999999                     
##   AC_presence          year          month             day       
##  Min.   :0.0000   Min.   :2008   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.:0.0000   1st Qu.:2012   1st Qu.: 4.000   1st Qu.: 9.00  
##  Median :1.0000   Median :2015   Median : 7.000   Median :17.00  
##  Mean   :0.6091   Mean   :2015   Mean   : 6.662   Mean   :16.76  
##  3rd Qu.:1.0000   3rd Qu.:2017   3rd Qu.: 9.000   3rd Qu.:25.00  
##  Max.   :1.0000   Max.   :2019   Max.   :12.000   Max.   :31.00  
##  year_built_grp  log_sale_amount  log_land_square_footage year_built_grp_label
##  Min.   :1.000   Min.   : 4.605   Min.   : 0.000          Length:2350548      
##  1st Qu.:2.000   1st Qu.:12.101   1st Qu.: 8.836          Class :character    
##  Median :3.000   Median :12.567   Median : 9.168          Mode  :character    
##  Mean   :2.525   Mean   :12.595   Mean   : 9.386                              
##  3rd Qu.:4.000   3rd Qu.:13.069   3rd Qu.: 9.743                              
##  Max.   :4.000   Max.   :21.727   Max.   :20.723

The minimum bathroom is 0.010 and stories 0.50 which seems invalid, therefore, I will remove those.

#Filtering values
housingdata_wip_clean <- housingdata_wip %>%
  filter(number_of_bathrooms >= 1, stories_number >= 1)

Some of the outliers that are odd are: number of bedrooms and fireplaces being 999, number of bathrooms being 450, number of stories being 175. I will check the values of these variables above their 90th, 95th, and 99.99th percentile and if they are far from mean and median, I will remove them on account of being errors

###90th Percentile

colnames(housingdata_wip)

##  [1] "sale_amount"             "sale_date"              
##  [3] "x"                       "y"                      
##  [5] "fips_county_code"        "year_built"             
##  [7] "bedrooms_all_buildings"  "number_of_bathrooms"    
##  [9] "number_of_fireplaces"    "stories_number"         
## [11] "land_square_footage"     "abbr"                   
## [13] "AC_presence"             "year"                   
## [15] "month"                   "day"                    
## [17] "year_built_grp"          "log_sale_amount"        
## [19] "log_land_square_footage" "year_built_grp_label"

#90th Percentile
##List of variables for which we want the 90th percentile
vars_to_check <- c("bedrooms_all_buildings", "number_of_bathrooms", 
                   "number_of_fireplaces", "stories_number")

##Compute the 90th percentile for each variable
percentile_90 <- housingdata_wip %>%
  summarise(across(all_of(vars_to_check), 
                   ~ quantile(.x, 0.90, na.rm = TRUE)))

##Print the results
print(percentile_90)

##   bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## 1                      4                 3.5                    2
##   stories_number
## 1              2

###95th Percentile

#95th Percentile
##Computing the 95th percentile for each variable
percentile_95 <- housingdata_wip %>%
  summarise(across(all_of(vars_to_check), 
                   ~ quantile(.x, 0.95, na.rm = TRUE)))

# Print the results
print(percentile_95)

##   bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## 1                      5                   4                    2
##   stories_number
## 1              2

###99.99th Percentile

#95th Percentile
##Computing the 99.99th percentile for each variable
percentile_9999 <- housingdata_wip %>%
  summarise(across(all_of(vars_to_check), 
                   ~ quantile(.x, 0.9999, na.rm = TRUE)))

# Print the results
print(percentile_9999)

##   bedrooms_all_buildings number_of_bathrooms number_of_fireplaces
## 1                     14                  11                   99
##   stories_number
## 1              7

Since all the values seem reasonable up until 99.9th Percentile, I will remove entries above them.

#Computing the 99.99th percentile for each variable and storing it in new columns
housingdata_wip_clean <- housingdata_wip_clean %>%
  mutate(
    bedrooms_99 = quantile(bedrooms_all_buildings, 0.9999, na.rm = TRUE),
    bathrooms_99 = quantile(number_of_bathrooms, 0.9999, na.rm = TRUE),
    fireplaces_99 = quantile(number_of_fireplaces, 0.9999, na.rm = TRUE),
    stories_99 = quantile(stories_number, 0.9999, na.rm = TRUE)
  )

#Remove rows where values exceed 99.99th percentile
housingdata_wip_clean <- housingdata_wip_clean %>%
  filter(
    bedrooms_all_buildings < bedrooms_99,
    number_of_bathrooms < bathrooms_99,
    number_of_fireplaces < fireplaces_99,
    stories_number < stories_99
  )

#Summary of clean data
summary(housingdata_wip_clean)

##   sale_amount          sale_date                x                 y        
##  Min.   :1.000e+02   Min.   :2008-01-01   Min.   :-124.57   Min.   :26.32  
##  1st Qu.:1.800e+05   1st Qu.:2012-06-13   1st Qu.:-118.32   1st Qu.:33.54  
##  Median :2.875e+05   Median :2015-07-28   Median :-104.77   Median :34.37  
##  Mean   :4.411e+05   Mean   :2015-01-18   Mean   :-102.81   Mean   :35.88  
##  3rd Qu.:4.742e+05   3rd Qu.:2017-12-13   3rd Qu.: -84.53   3rd Qu.:38.55  
##  Max.   :2.730e+09   Max.   :2019-12-31   Max.   : -73.91   Max.   :48.97  
##  fips_county_code   year_built   bedrooms_all_buildings number_of_bathrooms
##  Min.   : 1101    Min.   :1700   Min.   : 1.000         Min.   : 1.000     
##  1st Qu.: 6081    1st Qu.:1965   1st Qu.: 3.000         1st Qu.: 2.000     
##  Median :12091    Median :1986   Median : 3.000         Median : 2.000     
##  Mean   :20437    Mean   :1981   Mean   : 3.427         Mean   : 2.471     
##  3rd Qu.:40087    3rd Qu.:2001   3rd Qu.: 4.000         3rd Qu.: 3.000     
##  Max.   :53071    Max.   :2019   Max.   :13.000         Max.   :10.750     
##  number_of_fireplaces stories_number  land_square_footage     abbr          
##  Min.   : 1.000       Min.   :1.000   Min.   :        1   Length:2348446    
##  1st Qu.: 1.000       1st Qu.:1.000   1st Qu.:     6874   Class :character  
##  Median : 1.000       Median :1.000   Median :     9583   Mode  :character  
##  Mean   : 1.119       Mean   :1.371   Mean   :    39600                     
##  3rd Qu.: 1.000       3rd Qu.:2.000   3rd Qu.:    17000                     
##  Max.   :93.000       Max.   :6.500   Max.   :999999999                     
##   AC_presence          year          month             day       
##  Min.   :0.0000   Min.   :2008   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.:0.0000   1st Qu.:2012   1st Qu.: 4.000   1st Qu.: 9.00  
##  Median :1.0000   Median :2015   Median : 7.000   Median :17.00  
##  Mean   :0.6092   Mean   :2015   Mean   : 6.662   Mean   :16.76  
##  3rd Qu.:1.0000   3rd Qu.:2017   3rd Qu.: 9.000   3rd Qu.:25.00  
##  Max.   :1.0000   Max.   :2019   Max.   :12.000   Max.   :31.00  
##  year_built_grp  log_sale_amount  log_land_square_footage year_built_grp_label
##  Min.   :1.000   Min.   : 4.605   Min.   : 0.000          Length:2348446      
##  1st Qu.:2.000   1st Qu.:12.101   1st Qu.: 8.836          Class :character    
##  Median :3.000   Median :12.569   Median : 9.168          Mode  :character    
##  Mean   :2.526   Mean   :12.595   Mean   : 9.385                              
##  3rd Qu.:4.000   3rd Qu.:13.069   3rd Qu.: 9.741                              
##  Max.   :4.000   Max.   :21.727   Max.   :20.723                              
##   bedrooms_99  bathrooms_99 fireplaces_99   stories_99
##  Min.   :14   Min.   :11    Min.   :99    Min.   :7   
##  1st Qu.:14   1st Qu.:11    1st Qu.:99    1st Qu.:7   
##  Median :14   Median :11    Median :99    Median :7   
##  Mean   :14   Mean   :11    Mean   :99    Mean   :7   
##  3rd Qu.:14   3rd Qu.:11    3rd Qu.:99    3rd Qu.:7   
##  Max.   :14   Max.   :11    Max.   :99    Max.   :7

#Note: cleanhousingdata is called housingdata_wip_clean here

#PROBLEM 2

Part i

###Secondary Data

#Calling data
secondary_data <- read.csv("hpi_po_us_and_census.csv")

###Create quarter in Primary Dataset

#Generating quarters based on month
housingdata_wip_clean <- housingdata_wip_clean %>%
  mutate(qtr = case_when(
    month %in% c(1, 2, 3) ~ 1,
    month %in% c(4, 5, 6) ~ 2,
    month %in% c(7, 8, 9) ~ 3,
    month %in% c(10, 11, 12) ~ 4
  ))

###Create division abbreviations

#State-to-division mapping using the correct abbreviations
state_division_map <- tibble(
  abbr = c("CT", "ME", "MA", "NH", "RI", "VT", "NY", "NJ", "PA",  # Mid Atlantic
           "IL", "IN", "MI", "OH", "WI",  # East North Central
           "IA", "KS", "MN", "MO", "NE", "ND", "SD",  # West North Central
           "DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV",  # South Atlantic
           "AL", "KY", "MS", "TN",  # East South Central
           "AR", "LA", "OK", "TX",  # West South Central
           "AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY",  # Mountain
           "AK", "CA", "HI", "OR", "WA"),  # Pacific
  division = c(rep("DV_MA", 9), rep("DV_ENC", 5), rep("DV_WNC", 7),
               rep("DV_SA", 9), rep("DV_ESC", 4), rep("DV_WSC", 4),
               rep("DV_MT", 8), rep("DV_PAC", 5))
)

#Merging division info into `housingdata_wip_clean`
housingdata_wip_clean <- housingdata_wip_clean %>%
  left_join(state_division_map, by = "abbr")

###Rescaling

# Step 1: Create `cleanhpidata` and Remove "USA" division
cleanhpidata <- secondary_data %>%
  filter(division != "USA")

# Step 2: Identify the First Available Year in House Sales Data
start_year <- min(housingdata_wip$year, na.rm = TRUE)

# Step 3: Correctly Extract Base HPI for Each Division in `qtr = 1` of `start_year`
base_hpi_values <- cleanhpidata %>%
  group_by(division) %>%
  filter(year == start_year, qtr == 1) %>%
  summarise(base_hpi = first(index_po_seasonally_adjusted), .groups = "drop")

# Step 4: Merge Base HPI Values with `cleanhpidata`
cleanhpidata <- cleanhpidata %>%
  left_join(base_hpi_values, by = "division") %>%
  mutate(hpi = (index_po_seasonally_adjusted / base_hpi) * 100)

# Step 5: Ensure `hpi = 100` for `qtr = 1` of `start_year`
cleanhpidata <- cleanhpidata %>%
  mutate(hpi = if_else(year == start_year & qtr == 1, 100, hpi))

# Step 6: Restrict `cleanhpidata` to the Range of Years in `housingdata_wip`
year_range <- range(housingdata_wip$year, na.rm = TRUE)
cleanhpidata <- cleanhpidata %>%
  filter(year >= year_range[1], year <= year_range[2])

# Step 7: Check Results
summary(cleanhpidata$hpi)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   71.87   91.09   97.26  100.50  109.20  146.23

head(cleanhpidata %>% select(division, year, qtr, index_po_seasonally_adjusted, base_hpi, hpi))

##   division year qtr index_po_seasonally_adjusted base_hpi       hpi
## 1   DV_ENC 2008   1                       179.93   179.93 100.00000
## 2   DV_ENC 2008   2                       175.95   179.93  97.78803
## 3   DV_ENC 2008   3                       172.76   179.93  96.01512
## 4   DV_ENC 2008   4                       168.17   179.93  93.46412
## 5   DV_ENC 2009   1                       169.15   179.93  94.00878
## 6   DV_ENC 2009   2                       166.74   179.93  92.66937

Part ii

#Merge the cleaned housing data and cleaned index data
housingdata_cleanhpidata <- housingdata_wip_clean %>%
  left_join(cleanhpidata, by = c("division", "year", "qtr"))

#Summary
summary(housingdata_cleanhpidata$hpi)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   71.87   89.60  102.45  104.75  120.16  146.23

# Computing the deflated house prices
housingdata_cleanhpidata <- housingdata_cleanhpidata %>%
  mutate(dlog_sale_amount = log((sale_amount * 100) / hpi))

#Summary
summary(housingdata_cleanhpidata$dlog_sale_amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.39   12.07   12.53   12.57   13.03   21.64

Part iii. US Maps

# Load necessary libraries
library(dplyr)
library(ggplot2)
library(usmap)

# Step 1: Creating data for mapping
state_year_avg <- housingdata_cleanhpidata %>%
  group_by(abbr, year) %>%
  summarise(mean_dlog_sale = mean(dlog_sale_amount, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(!is.na(year))

## `summarise()` has grouped output by 'abbr'. You can override using the
## `.groups` argument.

# Step 2: Adjusting abbreviations for state names
state_year_avg <- state_year_avg %>%
  mutate(state = state.name[match(abbr, state.abb)])

# Step 3: Plotting the Data
plot_usmap(data = state_year_avg, values = "mean_dlog_sale", include = state_year_avg$state) +
  facet_wrap(~ year) +  
  scale_fill_continuous(
    name = "Mean dlog_sale_amount",
    low = "blue",
    high = "red",
    na.value = "white"
  ) +
  labs(
    title = "Mean dlog_sale_amount by State and Year",
    subtitle = "Each facet represents a year",
    caption = "Data Source: housingdata_cleanhpidata"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

PROBLEM 3

Part i

library(stringr)

#Loading crime data
crimedata <- read_excel("table-1.xls", skip = 3, col_names = TRUE)

## New names:
## • `` -> `...23`
## • `` -> `...24`

#Checking column names
colnames(crimedata)

##  [1] "Year"                                           
##  [2] "Population1"                                    
##  [3] "Violent\ncrime2"                                
##  [4] "Violent \ncrime \nrate"                         
##  [5] "Murder and\nnonnegligent \nmanslaughter"        
##  [6] "Murder and \nnonnegligent \nmanslaughter \nrate"
##  [7] "Rape\n(revised \ndefinition)3"                  
##  [8] "Rape\n(revised \ndefinition) \nrate3"           
##  [9] "Rape\n(legacy \ndefinition)4"                   
## [10] "Rape\n(legacy \ndefinition) \nrate4"            
## [11] "Robbery"                                        
## [12] "Robbery \nrate"                                 
## [13] "Aggravated \nassault"                           
## [14] "Aggravated \nassault rate"                      
## [15] "Property \ncrime"                               
## [16] "Property \ncrime \nrate"                        
## [17] "Burglary"                                       
## [18] "Burglary \nrate"                                
## [19] "Larceny-\ntheft"                                
## [20] "Larceny-\ntheft rate"                           
## [21] "Motor \nvehicle \ntheft"                        
## [22] "Motor \nvehicle \ntheft \nrate"                 
## [23] "...23"                                          
## [24] "...24"

#drop last two vars
crimedata <- crimedata %>% select(-c(...23, ...24))

###Rename variables

crimedata <- crimedata %>%
  rename(
    year = "Year",
    population = "Population1",
    violent_crime = "Violent\ncrime2",
    violent_crime_rate = "Violent \ncrime \nrate",
    murder = "Murder and\nnonnegligent \nmanslaughter",
    murder_rate = "Murder and \nnonnegligent \nmanslaughter \nrate",
    rape_revised = "Rape\n(revised \ndefinition)3",
    rape_revised_rate = "Rape\n(revised \ndefinition) \nrate3",
    rape_legacy = "Rape\n(legacy \ndefinition)4",
    rape_legacy_rate = "Rape\n(legacy \ndefinition) \nrate4",
    robbery = "Robbery",
    robbery_rate = "Robbery \nrate",
    aggravated_assault = "Aggravated \nassault",
    aggravated_assault_rate = "Aggravated \nassault rate",
    property_crime = "Property \ncrime",
    property_crime_rate = "Property \ncrime \nrate",
    burglary = "Burglary",
    burglary_rate = "Burglary \nrate",
    larceny_theft = "Larceny-\ntheft",
    larceny_theft_rate = "Larceny-\ntheft rate",
    motor_vehicle_theft = "Motor \nvehicle \ntheft",
    motor_vehicle_theft_rate = "Motor \nvehicle \ntheft \nrate"
  )

# Check the renamed dataset
colnames(crimedata)

##  [1] "year"                     "population"              
##  [3] "violent_crime"            "violent_crime_rate"      
##  [5] "murder"                   "murder_rate"             
##  [7] "rape_revised"             "rape_revised_rate"       
##  [9] "rape_legacy"              "rape_legacy_rate"        
## [11] "robbery"                  "robbery_rate"            
## [13] "aggravated_assault"       "aggravated_assault_rate" 
## [15] "property_crime"           "property_crime_rate"     
## [17] "burglary"                 "burglary_rate"           
## [19] "larceny_theft"            "larceny_theft_rate"      
## [21] "motor_vehicle_theft"      "motor_vehicle_theft_rate"

Part ii.

###Fix footnotes

crimedata$year[which(crimedata$year == 20015)] <- 2001
crimedata$year[which(crimedata$year == 20186)] <- 2018

###Drop rows with notes

#Dropping last 7 rows with footnotes
crimedata <- crimedata %>%
  slice(1:(n() - 7))

#Cleaned dataset
tail(crimedata)

## # A tibble: 6 × 22
##   year  population violent_crime violent_crime_rate murder murder_rate
##   <chr>      <dbl>         <dbl>              <dbl>  <dbl>       <dbl>
## 1 2014   318907401       1153022               362.  14164         4.4
## 2 2015   320896618       1199310               374.  15883         4.9
## 3 2016   323405935       1250162               387.  17413         5.4
## 4 2017   325147121       1247917               384.  17294         5.3
## 5 2018   326687501       1209997               370.  16374         5  
## 6 2019   328239523       1203808               367.  16425         5  
## # ℹ 16 more variables: rape_revised <dbl>, rape_revised_rate <dbl>,
## #   rape_legacy <dbl>, rape_legacy_rate <dbl>, robbery <dbl>,
## #   robbery_rate <dbl>, aggravated_assault <dbl>,
## #   aggravated_assault_rate <dbl>, property_crime <dbl>,
## #   property_crime_rate <dbl>, burglary <dbl>, burglary_rate <dbl>,
## #   larceny_theft <dbl>, larceny_theft_rate <dbl>, motor_vehicle_theft <dbl>,
## #   motor_vehicle_theft_rate <dbl>

#Limiting the data to years given in housingdata_cleanhpidata
range(housingdata_cleanhpidata$year)

## [1] 2008 2019

crimedata <- crimedata %>% filter(year %in% 2008:2019)

Part iii.

###Find Missing Variables

#Identifying columns with missing values
missing_vars <- crimedata %>%
  summarise(across(everything(), ~ sum(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "missing_count") %>%
  filter(missing_count > 0)

#Printing missing variables and their missing value counts
print(missing_vars)

## # A tibble: 2 × 2
##   variable          missing_count
##   <chr>                     <int>
## 1 rape_revised                  5
## 2 rape_revised_rate             5

Removed “rape_revised” and “rape_revised_rate” as they had missing values (NAs).

###Clean Crime Data

cleancrimedata <- crimedata %>% select(-c(rape_revised, rape_revised_rate))

#Check NAs
sum(is.na(cleancrimedata)) #Zero Missing Values

## [1] 0

Part iv. Merge “cleancrimedata” and “housingdata_cleanhpidata”

Find common variables

crime_vars <- colnames(crimedata)
housing_vars <- colnames(housingdata_cleanhpidata)

#common variables
common_vars <- intersect(crime_vars, housing_vars)
common_vars

## [1] "year"

Merge Data

housingdata_cleanhpidata$year <- as.numeric(housingdata_cleanhpidata$year)
cleancrimedata$year <- as.numeric(cleancrimedata$year)

#Merge
cleanhousinghpicrimedata <- left_join(housingdata_cleanhpidata, cleancrimedata, by = "year")

###Plot Crime Rates and Year

# Define the variables to plot
crime_vars <- c("violent_crime_rate", "murder_rate", "rape_legacy_rate", "robbery_rate", 
                "aggravated_assault_rate", "property_crime_rate", "burglary_rate", 
                "larceny_theft_rate", "motor_vehicle_theft_rate")

# Convert dataset to long format
crime_long <- cleanhousinghpicrimedata %>%
  select(year, all_of(crime_vars)) %>%
  pivot_longer(cols = -year, names_to = "crime_type", values_to = "value")


# Plot 
ggplot(crime_long, aes(x = year, y = value, color = crime_type)) +
  geom_line(size = 1) +
  scale_x_continuous(breaks = seq(2008, 2019, by = 1)) +  
  labs(title = "Crime Trends Over the Years",
       x = "Year",
       y = "Crime Count / Rate",
       color = "Crime Type") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

The plot suggests that most crime rates are well below 1000 per 100,000, however, property crimes and larceny thefts are two highest crime rates. They seem to have decreased substantiatlly over the years but are still higher than other crime rates. Burglary rate is the third highest crime, but still declining.

###Plot deflated sale amount by year for different housing groups This plot takes a very long time to produce (havent successfully ran a single time even though waited for an hour or so), that’s why I am skipping plotting this. However, code is given below:

# Scatter plot with smoothed trend lines

#ggplot(cleanhousinghpicrimedata, aes(x = year, y = dlog_sale_amount, color = as.factor(year_built_grp))) + geom_point(alpha = 0.6, size = 2) +  geom_smooth(method = "loess", se = FALSE, size = 1) + scale_x_continuous(breaks = seq(2008, 2019, by = 1)) +   labs(title = "Deflated Sale Amount Over the Years by Year Built Group", x = "Year", y = "Log Deflated Sale Amount", color = "Year Built Group") + theme_minimal()

#PROBLEM 4

Part i. Summarize

#Selecting relevant variables from housing data
vars_to_summarize <- c("sale_amount", "dlog_sale_amount", "year_built",
                       "bedrooms_all_buildings", "number_of_bathrooms", 
                       "number_of_fireplaces", "stories_number", 
                       "log_sale_amount", "log_land_square_footage",
                       "violent_crime_rate", "murder_rate", "rape_legacy_rate",
                       "robbery_rate", "aggravated_assault_rate", "property_crime_rate",
                       "burglary_rate", "larceny_theft_rate", "motor_vehicle_theft_rate")

#Summary Table
stargazer(cleanhousinghpicrimedata %>% select(all_of(vars_to_summarize)), 
          type = "text", title = "Summary Statistics", digits = 2)

## 
## Summary Statistics
## ====================================================================================
## Statistic                    N        Mean      St. Dev.     Min          Max       
## ------------------------------------------------------------------------------------
## sale_amount              2,348,446 441,056.80 2,116,736.00  100.00  2,729,784,889.00
## dlog_sale_amount         2,348,446   12.57        0.83       4.39        21.64      
## year_built               2,348,446  1,981.46     24.51      1,700        2,019      
## bedrooms_all_buildings   2,348,446    3.43        0.88        1            13       
## number_of_bathrooms      2,348,446    2.47        0.90       1.00        10.75      
## number_of_fireplaces     2,348,446    1.12        0.41        1            93       
## stories_number           2,348,446    1.37        0.49       1.00         6.50      
## log_sale_amount          2,348,446   12.59        0.83       4.61        21.73      
## log_land_square_footage  2,348,446    9.39        1.02       0.00        20.72      
## violent_crime_rate       2,348,446   384.74      23.94      361.60       458.60     
## murder_rate              2,348,446    4.95        0.31       4.40         5.40      
## rape_legacy_rate         2,348,446   28.89        1.68      25.90        31.00      
## robbery_rate             2,348,446   104.21      16.48      81.60        145.90     
## aggravated_assault_rate  2,348,446   246.65      11.38      229.20       277.50     
## property_crime_rate      2,348,446  2,561.03     324.64    2,109.90     3,214.60    
## burglary_rate            2,348,446   524.23      134.87     340.50       733.00     
## larceny_theft_rate       2,348,446  1,802.36     178.57    1,549.50     2,166.10    
## motor_vehicle_theft_rate 2,348,446   234.45      21.63      215.40       315.40     
## ------------------------------------------------------------------------------------

Part ii.

#Housing characters variables
housing_vars <- c("sale_amount", "dlog_sale_amount", "year_built",
                       "bedrooms_all_buildings", "number_of_bathrooms", 
                       "number_of_fireplaces", "stories_number", 
                       "log_sale_amount", "log_land_square_footage")

#Summary by year group
summary_by_year_built_grp_df <- cleanhousinghpicrimedata %>%
  group_by(year_built_grp) %>%
  summarise(across(all_of(vars_to_summarize),
                   list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
                   .names = "{.col}.{.fn}")) %>%  # Use "." instead of "_"
  pivot_longer(cols = -year_built_grp, 
               names_to = c("Variable", "Statistic"), 
               names_pattern = "(.*)\\.(.*)") %>%  # Extract proper names
  mutate(Value = round(as.numeric(value), 2)) %>%
  select(-value) %>%  # Drop old value column
  pivot_wider(names_from = year_built_grp, values_from = Value)  # Wide format

#Summary Stats table using kable
kable(summary_by_year_built_grp_df, format = "html", align = "c", caption = "Summary Statistics by Year Built Group") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))

Summary Statistics by Year Built Group
Variable	Statistic	1	2	3	4
sale_amount	Mean	564319.73	405752.40	3.779475e+05	420230.51
sale_amount	Median	360000.00	255257.18	2.740000e+05	291000.00
sale_amount	SD	1313575.06	1320834.23	3.728872e+06	1070924.04
sale_amount	Min	100.00	100.00	1.000000e+02	100.00
sale_amount	Max	600000000.00	132019663.00	2.729785e+09	288164568.00
dlog_sale_amount	Mean	12.73	12.48	1.250000e+01	12.56
dlog_sale_amount	Median	12.82	12.41	1.248000e+01	12.52
dlog_sale_amount	SD	1.02	0.83	6.900000e-01	0.71
dlog_sale_amount	Min	4.39	4.39	4.390000e+00	4.39
dlog_sale_amount	Max	20.00	18.60	2.164000e+01	19.21
year_built	Mean	1946.48	1976.41	1.993190e+03	2007.65
year_built	Median	1952.00	1977.00	1.993000e+03	2006.00
year_built	SD	16.58	5.77	4.490000e+00	5.12
year_built	Min	1700.00	1965.00	1.986000e+03	2001.00
year_built	Max	1964.00	1985.00	2.000000e+03	2019.00
bedrooms_all_buildings	Mean	3.18	3.31	3.480000e+00	3.72
bedrooms_all_buildings	Median	3.00	3.00	3.000000e+00	4.00
bedrooms_all_buildings	SD	0.90	0.82	8.100000e-01	0.89
bedrooms_all_buildings	Min	1.00	1.00	1.000000e+00	1.00
bedrooms_all_buildings	Max	13.00	13.00	1.300000e+01	13.00
number_of_bathrooms	Mean	2.03	2.29	2.660000e+00	2.88
number_of_bathrooms	Median	2.00	2.00	2.500000e+00	2.50
number_of_bathrooms	SD	0.87	0.70	8.300000e-01	0.94
number_of_bathrooms	Min	1.00	1.00	1.000000e+00	1.00
number_of_bathrooms	Max	10.50	10.00	1.075000e+01	10.50
number_of_fireplaces	Mean	1.10	1.11	1.130000e+00	1.14
number_of_fireplaces	Median	1.00	1.00	1.000000e+00	1.00
number_of_fireplaces	SD	0.37	0.41	4.200000e-01	0.44
number_of_fireplaces	Min	1.00	1.00	1.000000e+00	1.00
number_of_fireplaces	Max	16.00	93.00	1.900000e+01	30.00
stories_number	Mean	1.16	1.30	1.490000e+00	1.53
stories_number	Median	1.00	1.00	1.500000e+00	1.50
stories_number	SD	0.36	0.47	5.000000e-01	0.50
stories_number	Min	1.00	1.00	1.000000e+00	1.00
stories_number	Max	6.00	6.50	6.250000e+00	6.50
log_sale_amount	Mean	12.74	12.50	1.253000e+01	12.61
log_sale_amount	Median	12.79	12.45	1.252000e+01	12.58
log_sale_amount	SD	1.03	0.83	7.000000e-01	0.72
log_sale_amount	Min	4.61	4.61	4.610000e+00	4.61
log_sale_amount	Max	20.21	18.70	2.173000e+01	19.48
log_land_square_footage	Mean	9.22	9.48	9.460000e+00	9.38
log_land_square_footage	Median	9.00	9.25	9.250000e+00	9.16
log_land_square_footage	SD	0.82	1.11	1.050000e+00	1.05
log_land_square_footage	Min	0.00	0.00	0.000000e+00	0.00
log_land_square_footage	Max	20.72	20.72	2.072000e+01	20.72
violent_crime_rate	Mean	385.60	384.21	3.838900e+02	385.23
violent_crime_rate	Median	383.80	373.70	3.737000e+02	383.80
violent_crime_rate	SD	24.33	23.33	2.321000e+01	24.77
violent_crime_rate	Min	361.60	361.60	3.616000e+02	361.60
violent_crime_rate	Max	458.60	458.60	4.586000e+02	458.60
murder_rate	Mean	4.94	4.95	4.950000e+00	4.96
murder_rate	Median	5.00	5.00	5.000000e+00	5.00
murder_rate	SD	0.31	0.31	3.100000e-01	0.31
murder_rate	Min	4.40	4.40	4.400000e+00	4.40
murder_rate	Max	5.40	5.40	5.400000e+00	5.40
rape_legacy_rate	Mean	28.82	28.89	2.890000e+01	28.94
rape_legacy_rate	Median	29.10	29.80	2.980000e+01	29.80
rape_legacy_rate	SD	1.68	1.69	1.690000e+00	1.67
rape_legacy_rate	Min	25.90	25.90	2.590000e+01	25.90
rape_legacy_rate	Max	31.00	31.00	3.100000e+01	31.00
robbery_rate	Mean	105.04	103.89	1.036400e+02	104.28
robbery_rate	Median	102.20	102.20	1.022000e+02	102.20
robbery_rate	SD	16.59	16.21	1.614000e+01	16.90
robbery_rate	Min	81.60	81.60	8.160000e+01	81.60
robbery_rate	Max	145.90	145.90	1.459000e+02	145.90
aggravated_assault_rate	Mean	246.76	246.44	2.463500e+02	247.01
aggravated_assault_rate	Median	248.30	248.30	2.482000e+02	248.30
aggravated_assault_rate	SD	11.52	11.16	1.116000e+01	11.62
aggravated_assault_rate	Min	229.20	229.20	2.292000e+02	229.20
aggravated_assault_rate	Max	277.50	277.50	2.775000e+02	277.50
property_crime_rate	Mean	2579.99	2555.66	2.549940e+03	2559.02
property_crime_rate	Median	2500.50	2500.50	2.500500e+03	2500.50
property_crime_rate	SD	327.07	321.51	3.199300e+02	328.92
property_crime_rate	Min	2109.90	2109.90	2.109900e+03	2109.90
property_crime_rate	Max	3214.60	3214.60	3.214600e+03	3214.60
burglary_rate	Mean	532.61	522.30	5.197200e+02	522.57
burglary_rate	Median	494.70	494.70	4.947000e+02	494.70
burglary_rate	SD	136.22	134.12	1.332600e+02	135.48
burglary_rate	Min	340.50	340.50	3.405000e+02	340.50
burglary_rate	Max	733.00	733.00	7.330000e+02	733.00
larceny_theft_rate	Mean	1812.57	1799.41	1.796380e+03	1801.33
larceny_theft_rate	Median	1783.60	1783.60	1.783600e+03	1783.60
larceny_theft_rate	SD	179.59	176.83	1.760900e+02	181.15
larceny_theft_rate	Min	1549.50	1549.50	1.549500e+03	1549.50
larceny_theft_rate	Max	2166.10	2166.10	2.166100e+03	2166.10
motor_vehicle_theft_rate	Mean	234.82	233.96	2.338400e+02	235.12
motor_vehicle_theft_rate	Median	230.20	230.20	2.302000e+02	230.20
motor_vehicle_theft_rate	SD	21.82	20.91	2.095000e+01	22.69
motor_vehicle_theft_rate	Min	215.40	215.40	2.154000e+02	215.40
motor_vehicle_theft_rate	Max	315.40	315.40	3.154000e+02	315.40

Part iii.

#Summary stats by census division
summary_by_division_df <- cleanhousinghpicrimedata %>%
  group_by(division) %>%
  summarise(across(all_of(vars_to_summarize),
                   list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
                   .names = "{.col}.{.fn}")) %>%  # Use "." instead of "_"
  pivot_longer(cols = -division, 
               names_to = c("Variable", "Statistic"), 
               names_pattern = "(.*)\\.(.*)") %>%  # Extract proper names
  mutate(Value = round(as.numeric(value), 2)) %>%
  select(-value) %>%  # Drop old value column
  pivot_wider(names_from = division, values_from = Value) 

#Summary stats table using kable
kable(summary_by_division_df, format = "html", align = "c", caption = "Summary Statistics by Census Division") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))

Summary Statistics by Census Division
Variable	Statistic	DV_ENC	DV_ESC	DV_MA	DV_MT	DV_PAC	DV_SA	DV_WNC	DV_WSC
sale_amount	Mean	270053.38	259789.82	209559.07	483343.76	597029.97	332534.00	3.113912e+05	254362.59
sale_amount	Median	205000.00	212900.00	160000.00	360000.00	400000.00	220000.00	2.250000e+05	215000.00
sale_amount	SD	697566.45	910328.83	372586.52	1800193.27	1102861.86	1201825.21	8.370690e+06	490246.41
sale_amount	Min	500.00	500.00	1000.00	500.00	280.00	100.00	4.850000e+02	101.00
sale_amount	Max	31000000.00	80240000.00	50000000.00	82000000.00	288164568.00	600000000.00	2.729785e+09	166250000.00
dlog_sale_amount	Mean	12.22	12.11	11.96	12.76	12.93	12.30	1.221000e+01	12.10
dlog_sale_amount	Median	12.20	12.16	12.00	12.75	12.90	12.28	1.225000e+01	12.12
dlog_sale_amount	SD	0.62	0.69	0.80	0.58	0.77	0.84	7.500000e-01	0.61
dlog_sale_amount	Min	6.21	6.00	6.81	6.10	5.37	4.39	5.950000e+00	4.52
dlog_sale_amount	Max	17.20	18.10	17.66	18.00	19.21	20.00	2.164000e+01	18.93
year_built	Mean	1979.27	1993.10	1971.79	1991.76	1971.44	1986.28	1.979080e+03	1993.02
year_built	Median	1984.00	2001.00	1978.00	1997.00	1973.00	1990.00	1.987000e+03	1999.00
year_built	SD	26.86	24.84	30.45	20.23	24.34	21.92	2.971000e+01	18.14
year_built	Min	1800.00	1805.00	1700.00	1800.00	1801.00	1700.00	1.825000e+03	1859.00
year_built	Max	2019.00	2019.00	2019.00	2019.00	2019.00	2019.00	2.019000e+03	2019.00
bedrooms_all_buildings	Mean	3.29	3.43	3.27	3.50	3.35	3.48	3.290000e+00	3.55
bedrooms_all_buildings	Median	3.00	3.00	3.00	3.00	3.00	3.00	3.000000e+00	3.00
bedrooms_all_buildings	SD	0.73	0.72	0.89	1.00	0.90	0.84	9.400000e-01	0.76
bedrooms_all_buildings	Min	1.00	1.00	1.00	1.00	1.00	1.00	1.000000e+00	1.00
bedrooms_all_buildings	Max	8.00	13.00	11.00	13.00	13.00	13.00	1.300000e+01	13.00
number_of_bathrooms	Mean	2.34	2.53	1.97	2.80	2.28	2.61	2.600000e+00	2.48
number_of_bathrooms	Median	2.50	2.10	2.00	3.00	2.00	2.50	2.500000e+00	2.00
number_of_bathrooms	SD	0.82	0.87	0.79	0.96	0.82	0.96	1.050000e+00	0.79
number_of_bathrooms	Min	1.00	1.00	1.00	1.00	1.00	1.00	1.000000e+00	1.00
number_of_bathrooms	Max	7.00	10.00	9.00	10.75	10.50	10.50	1.000000e+01	10.50
number_of_fireplaces	Mean	1.08	1.07	1.11	1.21	1.10	1.09	1.190000e+00	1.13
number_of_fireplaces	Median	1.00	1.00	1.00	1.00	1.00	1.00	1.000000e+00	1.00
number_of_fireplaces	SD	0.34	0.31	0.42	0.52	0.36	0.43	4.600000e-01	0.37
number_of_fireplaces	Min	1.00	1.00	1.00	1.00	1.00	1.00	1.000000e+00	1.00
number_of_fireplaces	Max	11.00	11.00	20.00	13.00	30.00	93.00	1.200000e+01	11.00
stories_number	Mean	1.54	1.39	1.57	1.55	1.31	1.38	1.370000e+00	1.37
stories_number	Median	1.50	1.00	1.50	2.00	1.00	1.00	1.000000e+00	1.00
stories_number	SD	0.51	0.46	0.51	0.50	0.47	0.49	4.700000e-01	0.46
stories_number	Min	1.00	1.00	1.00	1.00	1.00	1.00	1.000000e+00	1.00
stories_number	Max	5.00	4.00	5.00	5.00	6.00	6.00	6.500000e+00	4.00
log_sale_amount	Mean	12.25	12.20	11.94	12.79	12.93	12.29	1.228000e+01	12.26
log_sale_amount	Median	12.23	12.27	11.98	12.79	12.90	12.30	1.232000e+01	12.28
log_sale_amount	SD	0.62	0.70	0.81	0.60	0.80	0.86	7.500000e-01	0.62
log_sale_amount	Min	6.21	6.21	6.91	6.21	5.63	4.61	6.180000e+00	4.62
log_sale_amount	Max	17.25	18.20	17.73	18.22	19.48	20.21	2.173000e+01	18.93
log_land_square_footage	Mean	9.52	9.88	9.95	9.34	9.15	9.65	9.790000e+00	9.39
log_land_square_footage	Median	9.23	9.63	9.93	9.06	8.95	9.57	9.430000e+00	9.11
log_land_square_footage	SD	1.14	1.11	1.12	1.18	0.91	1.04	1.150000e+00	0.90
log_land_square_footage	Min	3.78	6.08	3.78	0.00	0.00	0.69	1.390000e+00	0.00
log_land_square_footage	Max	14.35	16.65	15.40	18.47	19.54	18.69	2.038000e+01	20.72
violent_crime_rate	Mean	386.94	380.08	384.30	382.56	386.40	382.72	3.844700e+02	385.71
violent_crime_rate	Median	383.80	370.40	383.80	373.70	383.80	373.70	3.737000e+02	383.80
violent_crime_rate	SD	26.60	21.67	23.66	22.13	24.71	22.42	2.384000e+01	25.34
violent_crime_rate	Min	361.60	361.60	361.60	361.60	361.60	361.60	3.616000e+02	361.60
violent_crime_rate	Max	458.60	458.60	458.60	458.60	458.60	458.60	4.586000e+02	458.60
murder_rate	Mean	4.98	4.98	4.96	4.96	4.94	4.96	4.950000e+00	4.95
murder_rate	Median	5.00	5.00	5.00	5.00	5.00	5.00	5.000000e+00	5.00
murder_rate	SD	0.31	0.27	0.31	0.31	0.31	0.31	3.100000e-01	0.31
murder_rate	Min	4.40	4.40	4.40	4.40	4.40	4.40	4.400000e+00	4.40
murder_rate	Max	5.40	5.40	5.40	5.40	5.40	5.40	5.400000e+00	5.40
rape_legacy_rate	Mean	29.05	29.34	28.95	28.98	28.79	29.00	2.890000e+01	28.84
rape_legacy_rate	Median	29.80	29.90	29.80	29.90	29.10	29.90	2.980000e+01	29.80
rape_legacy_rate	SD	1.62	1.54	1.67	1.69	1.67	1.68	1.680000e+00	1.69
rape_legacy_rate	Min	25.90	25.90	25.90	25.90	25.90	25.90	2.590000e+01	25.90
rape_legacy_rate	Max	31.00	31.00	31.00	31.00	31.00	31.00	3.100000e+01	31.00
robbery_rate	Mean	104.81	98.46	103.69	102.48	105.68	102.42	1.040400e+02	105.02
robbery_rate	Median	102.20	98.60	102.20	102.20	102.90	102.20	1.022000e+02	102.20
robbery_rate	SD	17.90	16.42	16.33	15.68	16.66	15.93	1.637000e+01	17.01
robbery_rate	Min	81.60	81.60	81.60	81.60	81.60	81.60	8.160000e+01	81.60
robbery_rate	Max	145.90	145.90	145.90	145.90	145.90	145.90	1.459000e+02	145.90
aggravated_assault_rate	Mean	248.07	247.26	246.65	246.09	246.94	246.29	2.465400e+02	246.86
aggravated_assault_rate	Median	248.30	248.30	248.30	248.30	248.30	248.30	2.483000e+02	248.20
aggravated_assault_rate	SD	12.03	9.91	11.25	10.80	11.65	10.83	1.138000e+01	11.98
aggravated_assault_rate	Min	229.20	229.20	229.20	229.20	229.20	229.20	2.292000e+02	229.20
aggravated_assault_rate	Max	277.50	277.50	277.50	277.50	277.50	277.50	2.775000e+02	277.50
property_crime_rate	Mean	2563.76	2439.67	2547.38	2524.52	2592.99	2522.99	2.556320e+03	2577.13
property_crime_rate	Median	2500.50	2362.90	2500.50	2451.60	2500.50	2451.60	2.500500e+03	2500.50
property_crime_rate	SD	341.42	318.74	321.11	312.01	327.86	315.76	3.218900e+02	330.11
property_crime_rate	Min	2109.90	2109.90	2109.90	2109.90	2109.90	2109.90	2.109900e+03	2109.90
property_crime_rate	Max	3214.60	3214.60	3214.60	3214.60	3214.60	3214.60	3.214600e+03	3214.60
burglary_rate	Mean	522.74	472.91	517.97	508.79	538.05	508.12	5.219900e+02	530.47
burglary_rate	Median	494.70	429.70	494.70	468.90	494.70	468.90	4.947000e+02	494.70
burglary_rate	SD	138.14	130.39	133.07	129.97	136.55	131.29	1.335100e+02	135.87
burglary_rate	Min	340.50	340.50	340.50	340.50	340.50	340.50	3.405000e+02	340.50
burglary_rate	Max	733.00	733.00	733.00	733.00	733.00	733.00	7.330000e+02	733.00
larceny_theft_rate	Mean	1803.92	1735.06	1795.10	1782.63	1819.63	1781.66	1.800000e+03	1811.30
larceny_theft_rate	Median	1783.60	1695.50	1783.60	1745.40	1783.60	1745.40	1.783600e+03	1783.60
larceny_theft_rate	SD	188.33	177.14	176.81	172.13	179.82	174.24	1.771600e+02	181.67
larceny_theft_rate	Min	1549.50	1549.50	1549.50	1549.50	1549.50	1549.50	1.549500e+03	1549.50
larceny_theft_rate	Max	2166.10	2166.10	2166.10	2166.10	2166.10	2166.10	2.166100e+03	2166.10
motor_vehicle_theft_rate	Mean	237.10	231.70	234.32	233.10	235.32	233.22	2.343400e+02	235.36
motor_vehicle_theft_rate	Median	230.20	230.20	230.20	230.20	230.20	230.20	2.302000e+02	230.20
motor_vehicle_theft_rate	SD	25.30	19.60	21.52	20.12	22.15	20.31	2.164000e+01	23.28
motor_vehicle_theft_rate	Min	215.40	215.40	215.40	215.40	215.40	215.40	2.154000e+02	215.40
motor_vehicle_theft_rate	Max	315.40	315.40	315.40	315.40	315.40	315.40	3.154000e+02	315.40

Part iv.

###Creating region variable

#Converting division to region
division_to_region <- c(
  "DV_NE" = "Northeast", "DV_MA" = "Northeast", # New England & Middle Atlantic
  
  "DV_ENC" = "Midwest", "DV_WNC" = "Midwest", # East & West North Central
  
  "DV_SA" = "South", "DV_ESC" = "South", "DV_WSC" = "South", # South Atlantic & Central
  
  "DV_MT" = "West", "DV_PAC" = "West" # Mountain & Pacific
)

#Regions Based on Division
cleanhousinghpicrimedata <- cleanhousinghpicrimedata %>%
  mutate(region = recode(division, !!!division_to_region))

Summarizing

# Compute summary statistics by census region
summary_by_region_df <- cleanhousinghpicrimedata %>%
  group_by(region) %>%
  summarise(across(all_of(vars_to_summarize),
                   list(Mean = mean, Median = median, SD = sd, Min = min, Max = max),
                   .names = "{.col}.{.fn}")) %>%  
  pivot_longer(cols = -region, 
               names_to = c("Variable", "Statistic"), 
               names_pattern = "(.*)\\.(.*)") %>%  
  mutate(Value = round(as.numeric(value), 2)) %>%
  select(-value) %>%  # Drop old value column
  pivot_wider(names_from = region, values_from = Value)  

# Display table using kable
kable(summary_by_region_df, format = "html", align = "c", caption = "Summary Statistics by Census Region") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))

Summary Statistics by Census Region
Variable	Statistic	Midwest	Northeast	South	West
sale_amount	Mean	3.088721e+05	209559.07	303133.49	572709.28
sale_amount	Median	2.240000e+05	160000.00	218000.00	389000.00
sale_amount	SD	8.113455e+06	372586.52	1000965.34	1285126.43
sale_amount	Min	4.850000e+02	1000.00	100.00	280.00
sale_amount	Max	2.729785e+09	50000000.00	600000000.00	288164568.00
dlog_sale_amount	Mean	1.221000e+01	11.96	12.23	12.89
dlog_sale_amount	Median	1.225000e+01	12.00	12.22	12.85
dlog_sale_amount	SD	7.400000e-01	0.80	0.77	0.74
dlog_sale_amount	Min	5.950000e+00	6.81	4.39	5.37
dlog_sale_amount	Max	2.164000e+01	17.66	20.00	19.21
year_built	Mean	1.979090e+03	1971.79	1988.82	1975.79
year_built	Median	1.987000e+03	1978.00	1993.00	1978.00
year_built	SD	2.955000e+01	30.45	20.95	24.95
year_built	Min	1.800000e+03	1700.00	1700.00	1800.00
year_built	Max	2.019000e+03	2019.00	2019.00	2019.00
bedrooms_all_buildings	Mean	3.290000e+00	3.27	3.50	3.38
bedrooms_all_buildings	Median	3.000000e+00	3.00	3.00	3.00
bedrooms_all_buildings	SD	9.300000e-01	0.89	0.81	0.92
bedrooms_all_buildings	Min	1.000000e+00	1.00	1.00	1.00
bedrooms_all_buildings	Max	1.300000e+01	11.00	13.00	13.00
number_of_bathrooms	Mean	2.580000e+00	1.97	2.56	2.39
number_of_bathrooms	Median	2.500000e+00	2.00	2.50	2.00
number_of_bathrooms	SD	1.030000e+00	0.79	0.90	0.88
number_of_bathrooms	Min	1.000000e+00	1.00	1.00	1.00
number_of_bathrooms	Max	1.000000e+01	9.00	10.50	10.75
number_of_fireplaces	Mean	1.190000e+00	1.11	1.11	1.12
number_of_fireplaces	Median	1.000000e+00	1.00	1.00	1.00
number_of_fireplaces	SD	4.600000e-01	0.42	0.41	0.40
number_of_fireplaces	Min	1.000000e+00	1.00	1.00	1.00
number_of_fireplaces	Max	1.200000e+01	20.00	93.00	30.00
stories_number	Mean	1.380000e+00	1.57	1.37	1.36
stories_number	Median	1.000000e+00	1.50	1.00	1.00
stories_number	SD	4.800000e-01	0.51	0.48	0.49
stories_number	Min	1.000000e+00	1.00	1.00	1.00
stories_number	Max	6.500000e+00	5.00	6.00	6.00
log_sale_amount	Mean	1.228000e+01	11.94	12.28	12.90
log_sale_amount	Median	1.232000e+01	11.98	12.29	12.87
log_sale_amount	SD	7.500000e-01	0.81	0.78	0.76
log_sale_amount	Min	6.180000e+00	6.91	4.61	5.63
log_sale_amount	Max	2.173000e+01	17.73	20.21	19.48
log_land_square_footage	Mean	9.770000e+00	9.95	9.56	9.19
log_land_square_footage	Median	9.410000e+00	9.93	9.37	8.97
log_land_square_footage	SD	1.150000e+00	1.12	1.00	0.98
log_land_square_footage	Min	1.390000e+00	3.78	0.00	0.00
log_land_square_footage	Max	2.038000e+01	15.40	20.72	19.54
violent_crime_rate	Mean	3.846200e+02	384.30	383.75	385.58
violent_crime_rate	Median	3.737000e+02	383.80	373.70	383.80
violent_crime_rate	SD	2.402000e+01	23.66	23.55	24.23
violent_crime_rate	Min	3.616000e+02	361.60	361.60	361.60
violent_crime_rate	Max	4.586000e+02	458.60	458.60	458.60
murder_rate	Mean	4.960000e+00	4.96	4.96	4.95
murder_rate	Median	5.000000e+00	5.00	5.00	5.00
murder_rate	SD	3.100000e-01	0.31	0.31	0.31
murder_rate	Min	4.400000e+00	4.40	4.40	4.40
murder_rate	Max	5.400000e+00	5.40	5.40	5.40
rape_legacy_rate	Mean	2.891000e+01	28.95	28.95	28.83
rape_legacy_rate	Median	2.980000e+01	29.80	29.80	29.10
rape_legacy_rate	SD	1.680000e+00	1.67	1.69	1.68
rape_legacy_rate	Min	2.590000e+01	25.90	25.90	25.90
rape_legacy_rate	Max	3.100000e+01	31.00	31.00	31.00
robbery_rate	Mean	1.040900e+02	103.69	103.29	105.00
robbery_rate	Median	1.022000e+02	102.20	102.20	102.20
robbery_rate	SD	1.646000e+01	16.33	16.40	16.51
robbery_rate	Min	8.160000e+01	81.60	81.60	81.60
robbery_rate	Max	1.459000e+02	145.90	145.90	145.90
aggravated_assault_rate	Mean	2.466300e+02	246.65	246.51	246.76
aggravated_assault_rate	Median	2.483000e+02	248.30	248.30	248.30
aggravated_assault_rate	SD	1.143000e+01	11.25	11.24	11.48
aggravated_assault_rate	Min	2.292000e+02	229.20	229.20	229.20
aggravated_assault_rate	Max	2.775000e+02	277.50	277.50	277.50
property_crime_rate	Mean	2.556770e+03	2547.38	2540.98	2578.34
property_crime_rate	Median	2.500500e+03	2500.50	2500.50	2500.50
property_crime_rate	SD	3.231200e+02	321.11	322.37	325.74
property_crime_rate	Min	2.109900e+03	2109.90	2109.90	2109.90
property_crime_rate	Max	3.214600e+03	3214.60	3214.60	3214.60
burglary_rate	Mean	5.220400e+02	517.97	515.53	531.79
burglary_rate	Median	4.947000e+02	494.70	494.70	494.70
burglary_rate	SD	1.338000e+02	133.07	133.49	135.70
burglary_rate	Min	3.405000e+02	340.50	340.50	340.50
burglary_rate	Max	7.330000e+02	733.00	733.00	733.00
larceny_theft_rate	Mean	1.800240e+03	1795.10	1791.49	1811.72
larceny_theft_rate	Median	1.783600e+03	1783.60	1783.60	1783.60
larceny_theft_rate	SD	1.778600e+02	176.81	177.72	178.85
larceny_theft_rate	Min	1.549500e+03	1549.50	1549.50	1549.50
larceny_theft_rate	Max	2.166100e+03	2166.10	2166.10	2166.10
motor_vehicle_theft_rate	Mean	2.345100e+02	234.32	233.96	234.84
motor_vehicle_theft_rate	Median	2.302000e+02	230.20	230.20	230.20
motor_vehicle_theft_rate	SD	2.189000e+01	21.52	21.44	21.75
motor_vehicle_theft_rate	Min	2.154000e+02	215.40	215.40	215.40
motor_vehicle_theft_rate	Max	3.154000e+02	315.40	315.40	315.40

PROBLEM 5

Part i.

library(lfe)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(fixest)

## 
## Attaching package: 'fixest'

## The following object is masked from 'package:lfe':
## 
##     fepois

## The following object is masked from 'package:scales':
## 
##     pvalue

#Dependent Variable
cleanhousinghpicrimedata$log_sale_hpi <- log(cleanhousinghpicrimedata$sale_amount / cleanhousinghpicrimedata$hpi)

# Run the regression
reg_model1 <- feols( log_sale_hpi ~ violent_crime_rate + murder_rate + rape_legacy_rate + robbery_rate + 
                    aggravated_assault_rate + property_crime_rate + burglary_rate + larceny_theft_rate + 
                    motor_vehicle_theft_rate +
                    bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence
  | year + division, data = cleanhousinghpicrimedata)

## The variables 'violent_crime_rate', 'murder_rate', 'rape_legacy_rate', 'robbery_rate', 'aggravated_assault_rate', 'burglary_rate' and 2 others have been removed because of collinearity (see $collin.var).

summary(reg_model1)

## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12,  division: 8
## Standard-errors: Clustered (year) 
##                          Estimate   Std. Error     t value   Pr(>|t|)    
## property_crime_rate     -0.035109 90113.721445 -0.00000039 1.0000e+00    
## bedrooms_all_buildings   0.033349     0.002606 12.79508467 5.9999e-08 ***
## number_of_bathrooms      0.390041     0.005534 70.47807530 5.8287e-16 ***
## number_of_fireplaces     0.104705     0.005774 18.13351701 1.5247e-09 ***
## stories_number           0.025739     0.003864  6.66059565 3.5599e-05 ***
## log_land_square_footage  0.009049     0.002383  3.79735806 2.9568e-03 ** 
## AC_presence             -0.150589     0.015525 -9.70005503 1.0014e-06 ***
## ... 8 variables were removed because of collinearity (violent_crime_rate, murder_rate and 6 others [full set in $collin.var])
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.644389     Adj. R2: 0.391734
##                  Within R2: 0.259987

The model automatically removes violent_crime_rate, murder_rate, rape_legacy_rate, robbery_rate, aggravated_assault_rate, burglary_rate, larceny_theft_rate due to collinearity between variables and only keeps property_crime_rate.

Part ii.

#Model 1: Adding year_built_group
reg_model2 <- feols( log_sale_hpi ~ property_crime_rate +
                    year_built_grp + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence
  | year + division, data = cleanhousinghpicrimedata)

summary(reg_model2)

## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12,  division: 8
## Standard-errors: Clustered (year) 
##                          Estimate   Std. Error       t value   Pr(>|t|)    
## property_crime_rate     -0.032002 87219.715689  -0.000000367 1.0000e+00    
## year_built_grp          -0.069411     0.003459 -20.069152836 5.1500e-10 ***
## bedrooms_all_buildings   0.033820     0.002604  12.985422222 5.1481e-08 ***
## number_of_bathrooms      0.412299     0.005985  68.892500438 7.4826e-16 ***
## number_of_fireplaces     0.091589     0.005564  16.461993841 4.2657e-09 ***
## stories_number           0.051416     0.004274  12.029278393 1.1345e-07 ***
## log_land_square_footage  0.007059     0.002350   3.003710145 1.2000e-02 *  
## AC_presence             -0.132790     0.013812  -9.614021443 1.0940e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.640959     Adj. R2: 0.398192
##                  Within R2: 0.267844

#Model 2: Adding year_built to the model
reg_model3 <- feols( log_sale_hpi ~ property_crime_rate +
                    year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence
  | year + division, data = cleanhousinghpicrimedata)

summary(reg_model3)

## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12,  division: 8
## Standard-errors: Clustered (year) 
##                          Estimate   Std. Error       t value   Pr(>|t|)    
## property_crime_rate     -0.029196 85328.669408  -0.000000342 1.0000e+00    
## year_built              -0.003539     0.000174 -20.320988727 4.5049e-10 ***
## bedrooms_all_buildings   0.032436     0.002563  12.657242362 6.7119e-08 ***
## number_of_bathrooms      0.417360     0.006166  67.685403662 9.0848e-16 ***
## number_of_fireplaces     0.086945     0.005398  16.106409654 5.3762e-09 ***
## stories_number           0.044878     0.004192  10.706758146 3.7212e-07 ***
## log_land_square_footage  0.008783     0.002408   3.647265701 3.8390e-03 ** 
## AC_presence             -0.129416     0.013560  -9.543658617 1.1766e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.640031     Adj. R2: 0.399934
##                  Within R2: 0.269963

#Model 3: using fips_county_code as fixed effects
reg_model4 <- feols(log_sale_hpi ~ property_crime_rate +
                    year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence
  | year + fips_county_code, data = cleanhousinghpicrimedata)

summary(reg_model4)

## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12,  fips_county_code: 501
## Standard-errors: Clustered (year) 
##                          Estimate   Std. Error      t value   Pr(>|t|)    
## property_crime_rate      0.005961 31813.967915  0.000000187 1.0000e+00    
## year_built               0.001051     0.000083 12.648130888 6.7621e-08 ***
## bedrooms_all_buildings   0.044571     0.002188 20.366365222 4.3982e-10 ***
## number_of_bathrooms      0.303712     0.005577 54.454969683 9.8735e-15 ***
## number_of_fireplaces     0.114228     0.005384 21.214922398 2.8362e-10 ***
## stories_number           0.041854     0.006819  6.137660350 7.3345e-05 ***
## log_land_square_footage  0.070799     0.001031 68.668072951 7.7554e-16 ***
## AC_presence             -0.021494     0.003507 -6.129301249 7.4219e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.533756     Adj. R2: 0.582579
##                  Within R2: 0.276663

# Load necessary package
library(modelsummary)

## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)

# Create regression table
model_list <- list(
  "Model 1 (Year Built Group)" = reg_model2,
  "Model 2 (Year Built Added)" = reg_model3,
  "Model 3 (County FE)" = reg_model4
)

# Generate summary table
modelsummary(model_list, 
  title = "Regression Results: Impact of Crime Rates on Housing Prices",
  coef_omit = "Intercept",  # Hide intercept
  stars = TRUE, 
  gof_omit = "IC|RMSE",     # Remove unnecessary stats
  fmt = 4,                  # Round numbers
  escape = FALSE  # Ensures special characters (e.g., &nbsp;) don't appear in the title
)

Regression Results: Impact of Crime Rates on Housing Prices
	Model 1 (Year Built Group)	Model 2 (Year Built Added)	Model 3 (County FE)
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
property_crime_rate	-0.0320	-0.0292	0.0060
	(87219.7157)	(85328.6694)	(31813.9679)
year_built_grp	-0.0694***
	(0.0035)
bedrooms_all_buildings	0.0338***	0.0324***	0.0446***
	(0.0026)	(0.0026)	(0.0022)
number_of_bathrooms	0.4123***	0.4174***	0.3037***
	(0.0060)	(0.0062)	(0.0056)
number_of_fireplaces	0.0916***	0.0869***	0.1142***
	(0.0056)	(0.0054)	(0.0054)
stories_number	0.0514***	0.0449***	0.0419***
	(0.0043)	(0.0042)	(0.0068)
log_land_square_footage	0.0071*	0.0088**	0.0708***
	(0.0024)	(0.0024)	(0.0010)
AC_presence	-0.1328***	-0.1294***	-0.0215***
	(0.0138)	(0.0136)	(0.0035)
year_built		-0.0035***	0.0011***
		(0.0002)	(0.0001)
Num.Obs.	2348446	2348446	2348446
R2	0.398	0.400	0.583
R2 Adj.	0.398	0.400	0.583
R2 Within	0.268	0.270	0.277
R2 Within Adj.	0.268	0.270	0.277
Std.Errors	by: year	by: year	by: year
FE: year	X	X	X
FE: division	X	X
FE: fips_county_code			X

Part iii.

I prefer the last model, as the year_built has positive significant effect which is consistent with the housing market where newer houses has better sale values.

#Model 3 by Region
reg_model4_reg <- feols(log_sale_hpi ~ property_crime_rate +
                    year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + 
                    log_land_square_footage + AC_presence +
                      region
  | year, data = cleanhousinghpicrimedata)

summary(reg_model4_reg)

## OLS estimation, Dep. Var.: log_sale_hpi
## Observations: 2,348,446
## Fixed-effects: year: 12
## Standard-errors: Clustered (year) 
##                          Estimate Std. Error   t value   Pr(>|t|)    
## property_crime_rate     -0.000016   0.000011  -1.43860 1.7810e-01    
## year_built              -0.004729   0.000152 -31.08332 4.5319e-12 ***
## bedrooms_all_buildings   0.039166   0.003082  12.70701 6.4448e-08 ***
## number_of_bathrooms      0.410606   0.005558  73.87403 3.4769e-16 ***
## number_of_fireplaces     0.069861   0.005188  13.46618 3.5275e-08 ***
## stories_number           0.031290   0.003783   8.27116 4.7512e-06 ***
## log_land_square_footage  0.006742   0.001930   3.49268 5.0347e-03 ** 
## AC_presence             -0.102340   0.012734  -8.03683 6.2529e-06 ***
## regionNortheast         -0.057900   0.027833  -2.08029 6.1668e-02 .  
## regionSouth              0.115901   0.014255   8.13051 5.5988e-06 ***
## regionWest               0.761214   0.014750  51.60695 1.7790e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.648676     Adj. R2: 0.383615
##                  Within R2: 0.382314

Part iv.

Here, the key crime variable is property crime rate as others had collinearity

#New variable to store estimated effects
cleanhousinghpicrimedata$crime_effect_region <- NA

#Loop through each region and estimate the effect
for (r in unique(cleanhousinghpicrimedata$region)) {
  
  #Run regression for each region
  model <- feols(log_sale_hpi ~ property_crime_rate + bedrooms_all_buildings +
                   number_of_bathrooms + number_of_fireplaces + stories_number +
                   log_land_square_footage + AC_presence, 
                 data = cleanhousinghpicrimedata %>% filter(region == r))
  
  #Extract the coefficient for property_crime_rate
  effect <- coef(model)["property_crime_rate"]
  
  #Assigning effect to all rows 
  cleanhousinghpicrimedata$crime_effect_region[cleanhousinghpicrimedata$region == r] <- effect
}

# Round the estimated crime effect values to 4 decimal places
cleanhousinghpicrimedata$crime_effect_region <- round(cleanhousinghpicrimedata$crime_effect_region, 4)

#Scaling the crime rate variable
cleanhousinghpicrimedata$scaled_property_crime_rate <- cleanhousinghpicrimedata$property_crime_rate / 1000

summary(cleanhousinghpicrimedata$scaled_property_crime_rate)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.110   2.363   2.501   2.561   2.868   3.215

Map the region wise effect

# Define U.S. Census regions
census_regions <- data.frame(
  state = c("Maine", "New Hampshire", "Vermont", "Massachusetts", "Rhode Island", "Connecticut",
            "New York", "New Jersey", "Pennsylvania",
            "Wisconsin", "Michigan", "Illinois", "Indiana", "Ohio",
            "North Dakota", "South Dakota", "Nebraska", "Kansas", "Minnesota", "Iowa", "Missouri",
            "Delaware", "Maryland", "District of Columbia", "Virginia", "West Virginia",
            "North Carolina", "South Carolina", "Georgia", "Florida", "Kentucky", "Tennessee",
            "Alabama", "Mississippi", "Arkansas", "Louisiana",
            "Oklahoma", "Texas",
            "Montana", "Idaho", "Wyoming", "Colorado", "New Mexico",
            "Arizona", "Utah", "Nevada",
            "Washington", "Oregon", "California", "Alaska", "Hawaii"),
  region = c(rep("Northeast", 9),
             rep("Midwest", 12),
             rep("South", 17),
             rep("West", 13))
)

# Convert state names to lowercase for merging with `usmap` data
census_regions$state <- tolower(census_regions$state)

# Aggregate mean crime effect by region
region_effects <- cleanhousinghpicrimedata %>%
  group_by(region) %>%
  summarise(mean_crime_effect = mean(crime_effect_region, na.rm = TRUE))

# Merge state-region mapping with crime effect data
plot_data <- left_join(census_regions, region_effects, by = "region")

# Plot the map, now with a **gradient scale**
plot_usmap(data = plot_data, values = "mean_crime_effect", regions = "state") +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = mean(plot_data$mean_crime_effect, na.rm = TRUE),
    name = "Mean Crime Effect",
    na.value = "grey50"
  ) +
  theme(legend.position = "right") +
  labs(title = "Variation in Estimated Crime Rate Effect by U.S. Region (Gradient)")

EEFE 530: PS1

2024-06-13

Setting Working Directory

Part ii. Summarizing Variables

Part iii - 3D Plot

Part iv. Cleaning the data

Part i

Part ii

Part iii. US Maps

PROBLEM 3

Part i

Part ii.

Part iii.

Part iv. Merge “cleancrimedata” and “housingdata_cleanhpidata”

Find common variables

Merge Data

Part i. Summarize

Part ii.

Part iii.

Part iv.

Summarizing

PROBLEM 5

Part i.

Part ii.

Part iii.

Part iv.

Map the region wise effect