# Load libraries
library(tidyverse)
library(plyr)
library(kableExtra)
library(plotly)
library(corrplot)## Warning: package 'corrplot' was built under R version 4.0.3
## Warning: package 'PerformanceAnalytics' was built under R version 4.0.3
## Warning: package 'xts' was built under R version 4.0.3
## Warning: package 'zoo' was built under R version 4.0.3
## Warning: package 'xgboost' was built under R version 4.0.3
One of the big decisions to make in life is purchasing a home. There are many factors that need to be taken into consideration while making such a decision. In this project, we tend to retrieve data from different data sources. The data will have metrics such as median income, unemployment rate, public schools, hospitals, hospital ratings, crime rate, … Our task will be to get data from those multiple data sources using different methods (read csv, web scrapping…) learned throughout the course of this class, to store them (on a database, cloud,…), to clean and transform them, and to analyze and visualize them to get some useful information for houses price. We will then go further on using different models to predict the house price based on different features that we would find are necessary and weight on houses price.
Research questions : “Which variable is a best predictor of housing prices?”; “What is the relationship between housing prices and each predictor?”
Process <- c('Data Collection',
'Data Transformation (Cleaning & Tidying data)',
'Data Analysis', 'Visualization',
'Modeling','Review & Conclusion', 'Presentation')
Team <- c('Jered & Zhouxin', 'Jered & Zhouxin', 'Jered ',
'Jered', 'Jered', 'Jered',
'Jered & Zhouxin')
df_team <- data.frame(Process, Team)
names(df_team) <- c('Process', 'Team Members')
df_team %>%
kbl(caption = "Work Process & Responsabilities") %>%
kable_material(c("striped", "hover")) %>%
row_spec(0, color = "indigo")| Process | Team Members |
|---|---|
| Data Collection | Jered & Zhouxin |
| Data Transformation (Cleaning & Tidying data) | Jered & Zhouxin |
| Data Analysis | Jered |
| Visualization | Jered |
| Modeling | Jered |
| Review & Conclusion | Jered |
| Presentation | Jered & Zhouxin |
# Load the data from Github and GCP storage
hospitals <- read.csv("https://raw.githubusercontent.com/szx868/FinalProject/master/Hospitals.csv")
hospital_ratings <- read.csv("https://raw.githubusercontent.com/szx868/FinalProject/master/Hospital_General_Information.csv")
county_time_series <- read.csv("https://storage.googleapis.com/triplej_project3/County_time_series.csv")
crosswalk = read.csv("https://raw.githubusercontent.com/szx868/FinalProject/master/CountyCrossWalk_Zillow.csv")
unemployment = read.csv("https://raw.githubusercontent.com/szx868/FinalProject/master/Unemployment_Rate_by_County_Percent.csv")
public_schools <- read.csv("https://storage.googleapis.com/triplej_project3/Public_Schools.csv")# Subset hospital_rating data set
hospital_ratings <-hospital_ratings %>% select('Hospital.Name', 'Hospital.overall.rating')# Merge hospitals and hospital ratings
hospitals_with_ratings <- merge(hospitals, hospital_ratings, by.x="NAME",by.y = "Hospital.Name")# Clean the missing values and subset the hospital data set
hospital <- filter(hospitals_with_ratings, COUNTYFIPS != 'NOT AVAILABLE')
hospital <- filter(hospitals_with_ratings, Hospital.overall.rating != 'Not Available')# Rename the columns
hospital <- rename(hospitals_with_ratings,c('FIPS' = 'COUNTYFIPS', 'AverageHospitalRating' = 'Hospital.overall.rating'))# Group unemployment per county
unemployment_per_county <- unemployment %>%
select('Region.Code', '2018')
unemployment_per_county <- rename(unemployment_per_county, c('FIPS' = 'Region.Code', 'UnemploymentRate' = '2018'))
# Rename unemployment
unemployments <- unemployment_per_county# Group public school per county
public_schools_per_county<-public_schools %>%
group_by(COUNTYFIPS) %>%
dplyr::summarise(count=n())## `summarise()` ungrouping output (override with `.groups` argument)
# Subset public schools per county and rename the columns
public_schools_per_county <- public_schools_per_county %>%
select('COUNTYFIPS', 'count')
public_schools_per_county <- rename(public_schools_per_county,c('FIPS' = 'COUNTYFIPS', 'NumberOfSchools' = 'count'))
# Rename public_schools_per_county
schools <- public_schools_per_county# Subset hospital data set & get hospital average rating
hospital_avg_rating <- hospital %>%
select('FIPS', 'AverageHospitalRating')# Convert to numeric
hospital_avg_rating$AverageHospitalRating <- as.numeric(hospital_avg_rating$AverageHospitalRating)## Warning: NAs introduced by coercion
# Group hospital avg rating by county
hospital_avg <- hospital_avg_rating %>%
group_by(FIPS) %>%
dplyr::summarize(AverageHospitalRating = mean(AverageHospitalRating, na.rm=TRUE))## `summarise()` ungrouping output (override with `.groups` argument)
# Group hospital data set by county
hospitals_per_county <-hospitals %>%
group_by(COUNTYFIPS) %>%
dplyr::summarise(count=n())## `summarise()` ungrouping output (override with `.groups` argument)
# Rename columns of hospital data set
hospitals_per_county <- rename(hospitals_per_county,c('FIPS' = 'COUNTYFIPS', 'NumberOfHospitals' = 'count'))# Merge hospitals per county and hospital_avg data set
hospitals_per_county <- merge(hospitals_per_county,hospital_avg , by.x="FIPS",by.y = "FIPS")
head(hospitals_per_county)## FIPS NumberOfHospitals AverageHospitalRating
## 1 10005 3 3.5
## 2 1001 1 4.0
## 3 1003 4 3.0
## 4 1005 1 3.0
## 5 1007 1 NaN
## 6 1011 1 3.0
# Explore housing price from county time series data set
house_prices <-
county_time_series %>%
group_by(RegionName) %>%
dplyr::summarize(ZHVI_AllHomes = mean(ZHVI_AllHomes, na.rm=TRUE))## `summarise()` ungrouping output (override with `.groups` argument)
# Associate house price with county code
data <- rename(house_prices,c('FIPS' = 'RegionName', 'AverageHousePrice' = 'ZHVI_AllHomes'))# Merge data with hospitals_per_county
data <- merge(data, hospitals_per_county , by.x="FIPS",by.y = "FIPS")## FIPS AverageHousePrice ï..CountyName StateName StateFIPS CountyFIPS
## 1 1001 114483.7 Autauga Alabama 1 1
## 2 1003 164861.7 Baldwin Alabama 1 3
## 3 1005 NaN Barbour Alabama 1 5
## 4 1007 NaN Bibb Alabama 1 7
## 5 1013 NaN Butler Alabama 1 13
## 6 1015 NaN Calhoun Alabama 1 15
## MetroName_Zillow CBSAName CountyRegionID_Zillow
## 1 Montgomery, AL Montgomery, AL 1524
## 2 Daphne, AL Daphne-Fairhope-Foley, AL 1525
## 3 NULL NULL 1531
## 4 Birmingham, AL Birmingham-Hoover, AL 100
## 5 NULL NULL 898
## 6 Anniston, AL Anniston-Oxford-Jacksonville, AL 1569
## MetroRegionID_Zillow CBSACode UnemploymentRate NumberOfSchools
## 1 394875 33860 3.6 15
## 2 394519 19300 3.6 47
## 3 NULL NULL 5.3 11
## 4 394388 13820 4.0 10
## 5 NULL NULL 4.8 8
## 6 394333 11500 4.7 40
## NumberOfHospitals AverageHospitalRating
## 1 1 4.000000
## 2 4 3.000000
## 3 1 3.000000
## 4 1 NaN
## 5 2 3.000000
## 6 4 2.666667
# Get raw data from Github
data_raw <- read.csv("https://raw.githubusercontent.com/szx868/FinalProject/master/data_raw_final.csv")
head(data_raw)## FIPS AverageHousePrice ï..CountyName StateName StateFIPS CountyFIPS
## 1 1001 114483.7 Autauga Alabama 1 1
## 2 1003 164861.7 Baldwin Alabama 1 3
## 3 1005 NA Barbour Alabama 1 5
## 4 1007 NA Bibb Alabama 1 7
## 5 1013 NA Butler Alabama 1 13
## 6 1015 NA Calhoun Alabama 1 15
## MetroName_Zillow CBSAName CountyRegionID_Zillow
## 1 Montgomery, AL Montgomery, AL 1524
## 2 Daphne, AL Daphne-Fairhope-Foley, AL 1525
## 3 NULL NULL 1531
## 4 Birmingham, AL Birmingham-Hoover, AL 100
## 5 NULL NULL 898
## 6 Anniston, AL Anniston-Oxford-Jacksonville, AL 1569
## MetroRegionID_Zillow CBSACode UnemploymentRate NumberOfSchools
## 1 394875 33860 3.6 15
## 2 394519 19300 3.6 47
## 3 NULL NULL 5.3 11
## 4 394388 13820 4.0 10
## 5 NULL NULL 4.8 8
## 6 394333 11500 4.7 40
## NumberOfHospitals AverageHospitalRating
## 1 1 4.000000
## 2 4 3.000000
## 3 1 3.000000
## 4 1 NA
## 5 2 3.000000
## 6 4 2.666667
# Drop rows with missing Average house price
data_clean <- data_raw %>%
drop_na(AverageHousePrice)
# Move the target in the end
data_final <- data_clean %>%
select(-AverageHousePrice, AverageHousePrice)
head(data_final)## FIPS ï..CountyName StateName StateFIPS CountyFIPS MetroName_Zillow
## 1 1001 Autauga Alabama 1 1 Montgomery, AL
## 2 1003 Baldwin Alabama 1 3 Daphne, AL
## 3 1033 Colbert Alabama 1 33 Florence, AL
## 4 1049 De Kalb Alabama 1 49 NULL
## 5 1051 Elmore Alabama 1 51 Montgomery, AL
## 6 1055 Etowah Alabama 1 55 Gadsden, AL
## CBSAName CountyRegionID_Zillow MetroRegionID_Zillow
## 1 Montgomery, AL 1524 394875
## 2 Daphne-Fairhope-Foley, AL 1525 394519
## 3 Florence-Muscle Shoals, AL 1636 394598
## 4 NULL 1656 NULL
## 5 Montgomery, AL 1003 394875
## 6 Gadsden, AL 1007 394620
## CBSACode UnemploymentRate NumberOfSchools NumberOfHospitals
## 1 33860 3.6 15 1
## 2 19300 3.6 47 4
## 3 22520 4.7 27 2
## 4 NULL 3.8 20 1
## 5 33860 3.4 20 2
## 6 23460 4.1 45 4
## AverageHospitalRating AverageHousePrice
## 1 4.00 114483.67
## 2 3.00 164861.69
## 3 2.50 92332.05
## 4 3.00 98315.49
## 5 3.40 125561.37
## 6 2.75 77994.25
We are going to calculate the average county home price per state.
data_1 <- data_final %>%
group_by(StateName) %>%
transmute(StateName, avg_house_price = mean(AverageHousePrice))
data_1 <- data_1 %>%
distinct(StateName, avg_house_price)
data_1## # A tibble: 50 x 2
## # Groups: StateName [50]
## StateName avg_house_price
## <chr> <dbl>
## 1 Alabama 111315.
## 2 Alaska 231293.
## 3 Arizona 154040.
## 4 Arkansas 96635.
## 5 California 307697.
## 6 Colorado 242818.
## 7 Connecticut 222477.
## 8 District of Columbia 351877.
## 9 Florida 144920.
## 10 Georgia 119380.
## # ... with 40 more rows
Now we are going to order the state from the least affordable to the most affordable
data_2 <- data_1 %>%
arrange(desc(avg_house_price))
data_2 %>%
kbl(caption = "Home price per state") %>%
kable_material(c("striped", "hover")) %>%
row_spec(0, color = "indigo")| StateName | avg_house_price |
|---|---|
| Hawaii | 367663.35 |
| District of Columbia | 351876.68 |
| Massachusetts | 310721.34 |
| California | 307697.28 |
| Colorado | 242818.42 |
| New Jersey | 240695.04 |
| Rhode Island | 232272.22 |
| Alaska | 231292.74 |
| Connecticut | 222477.39 |
| Maryland | 206907.04 |
| Nevada | 203432.64 |
| Vermont | 197167.04 |
| Washington | 188286.33 |
| Virginia | 184240.35 |
| Wyoming | 183743.98 |
| Utah | 182363.63 |
| Oregon | 177559.35 |
| New Hampshire | 174137.05 |
| New York | 171355.31 |
| Montana | 164692.98 |
| Idaho | 164227.44 |
| New Mexico | 163703.00 |
| South Dakota | 162505.95 |
| Maine | 159582.94 |
| Arizona | 154039.66 |
| Minnesota | 152239.31 |
| Wisconsin | 148175.44 |
| Florida | 144920.05 |
| North Dakota | 141946.10 |
| Pennsylvania | 122026.17 |
| Georgia | 119380.35 |
| North Carolina | 118233.53 |
| Missouri | 117281.31 |
| Michigan | 115644.08 |
| Kentucky | 115372.95 |
| South Carolina | 114039.54 |
| Louisiana | 111345.26 |
| Alabama | 111315.46 |
| Iowa | 110865.47 |
| Illinois | 110639.05 |
| Texas | 108672.88 |
| Ohio | 107263.82 |
| West Virginia | 104864.69 |
| Nebraska | 101488.61 |
| Mississippi | 99768.83 |
| Indiana | 99705.99 |
| Arkansas | 96635.30 |
| Tennessee | 95615.66 |
| Oklahoma | 84013.99 |
| Kansas | 81662.45 |
Visualize the least affordable states based on average county home price per state
# Top 10
top_n(ungroup(data_2), 10) %>%
ggplot(aes(reorder(StateName, avg_house_price), avg_house_price)) +
geom_col(aes(fill = avg_house_price)) +
coord_flip() +
labs(title = '10 most expensive state to buy a house', x = "State")## Selecting by avg_house_price
Visualize the most affordable states based on average county home price per state
# Top 10
top_n(ungroup(data_2), -10) %>%
ggplot(aes(reorder(StateName, avg_house_price), avg_house_price)) +
geom_bar(stat="identity", color="blue", fill="purple") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0.5)) +
labs(title = '10 least expensive state to buy a house', x = "State")## Selecting by avg_house_price
Visualize the average house price by state
# Add state abbreviation
data_3 <- data_1 %>%
mutate(code = state.abb[match(StateName, state.name)])
# Plot the map
w <- list(color = toRGB("white"), width = 2)
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
p <- plot_geo(data_3, locationmode = 'USA-states') %>%
add_trace(
z = ~avg_house_price, locations = ~code,
color = ~avg_house_price, colors = 'Purples'
) %>%
colorbar(title = "Avg house price") %>%
layout(
title = 'Avg house price by State',
geo = g
)## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
For this predictive analysis, we are going to use Linear Regression.
Since wee have many explanatory variables, this case will be a multiple linear regression model.
The explanatory variables or predictors are: unemployment rate, number of schools, number of hospitals, average hospital ratings. Our response variable is the average house price.
Our research question is “which variable is a best predictor of average house price?”
We are going then to test around the relationship between housing prices and each one of the predictors. The model selection will be based on adjusted R square. Thus, we are going to apply “backward-selection”.
The general idea behind backward-selection is to start with the full model and eliminate one variable at a time until the ideal model is reached: Start with the full model, refit all possible models omitting one variable at a time, and choose the model with the highest adjusted R squared, repeat until maximum possible adjusted R squared is reached.
Subset the data set with numeric variables to get it ready for modeling
# Subset the data set with only numerical variables
data_4 <- data_final %>%
group_by(StateName) %>%
transmute(StateName,
UnemploymentRate = round(mean(UnemploymentRate), 0),
NumberOfSchools = sum(NumberOfSchools),
NumberOfHospitals = sum(NumberOfHospitals),
AverageHospitalRating = round(mean(AverageHospitalRating), 0),
avg_house_price = mean(AverageHousePrice))
data_4 <- data_4 %>%
distinct(StateName, UnemploymentRate, NumberOfSchools, NumberOfHospitals, AverageHospitalRating, avg_house_price)
data_4 <- subset(data_4, select = -c(StateName))
head(data_4)## # A tibble: 6 x 5
## UnemploymentRate NumberOfSchools NumberOfHospita~ AverageHospital~
## <dbl> <int> <int> <dbl>
## 1 4 731 51 3
## 2 6 206 17 3
## 3 7 2414 139 NA
## 4 4 542 68 3
## 5 5 10032 549 NA
## 6 3 1592 94 NA
## # ... with 1 more variable: avg_house_price <dbl>
Correlation matrix
## UnemploymentRate NumberOfSchools NumberOfHospitals
## UnemploymentRate 1.00 0.25 0.21
## NumberOfSchools 0.25 1.00 0.94
## NumberOfHospitals 0.21 0.94 1.00
## AverageHospitalRating NA NA NA
## avg_house_price 0.10 0.11 -0.04
## AverageHospitalRating avg_house_price
## UnemploymentRate NA 0.10
## NumberOfSchools NA 0.11
## NumberOfHospitals NA -0.04
## AverageHospitalRating 1 NA
## avg_house_price NA 1.00
Performance analytics
Summary table of correlation between predictors and house price
Features <- c('NumberOfSchools', 'NumberOfHospitals', 'AverageHospitalRating', 'UnemploymentRate')
Correlation <- c(0.12, -0.02, -0.22, 0.07)
df <- data.frame(Features, Correlation)
df## Features Correlation
## 1 NumberOfSchools 0.12
## 2 NumberOfHospitals -0.02
## 3 AverageHospitalRating -0.22
## 4 UnemploymentRate 0.07
Multiple linear model:
We are going to evaluate the avg_house_price with each of the predictors
Since we are using backward-selection, let first start with full model:
res_mul <- lm(avg_house_price ~ NumberOfSchools + NumberOfHospitals + AverageHospitalRating + UnemploymentRate, data = data_4)
summary(res_mul)##
## Call:
## lm(formula = avg_house_price ~ NumberOfSchools + NumberOfHospitals +
## AverageHospitalRating + UnemploymentRate, data = data_4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68401 -51930 -10823 32381 132354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 924929.57 286814.07 3.225 0.00729 **
## NumberOfSchools 28.54 44.17 0.646 0.53039
## NumberOfHospitals -758.13 678.47 -1.117 0.28569
## AverageHospitalRating -208805.33 83330.34 -2.506 0.02762 *
## UnemploymentRate -25248.35 18743.47 -1.347 0.20285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69280 on 12 degrees of freedom
## (33 observations deleted due to missingness)
## Multiple R-squared: 0.4509, Adjusted R-squared: 0.2679
## F-statistic: 2.464 on 4 and 12 DF, p-value: 0.1016
Now, let analyze each individual predictor with house price:
avg_house_price~UnemploymentRate
##
## Call:
## lm(formula = avg_house_price ~ UnemploymentRate, data = data_4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -79558 -51575 -9872 25782 218059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 135637 40244 3.370 0.00149 **
## UnemploymentRate 6984 9772 0.715 0.47828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67700 on 48 degrees of freedom
## Multiple R-squared: 0.01053, Adjusted R-squared: -0.01009
## F-statistic: 0.5107 on 1 and 48 DF, p-value: 0.4783
avg_house_price~NumberOfSchools
##
## Call:
## lm(formula = avg_house_price ~ NumberOfSchools, data = data_4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80199 -49814 -13336 24735 209427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.570e+05 1.284e+04 12.232 2.33e-16 ***
## NumberOfSchools 4.469e+00 5.840e+00 0.765 0.448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67650 on 48 degrees of freedom
## Multiple R-squared: 0.01206, Adjusted R-squared: -0.008527
## F-statistic: 0.5857 on 1 and 48 DF, p-value: 0.4478
avg_house_price~NumberOfHospitals
##
## Call:
## lm(formula = avg_house_price ~ NumberOfHospitals, data = data_4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81339 -50172 -10323 23631 202169
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 166167.34 13826.62 12.018 4.42e-16 ***
## NumberOfHospitals -24.93 95.38 -0.261 0.795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68010 on 48 degrees of freedom
## Multiple R-squared: 0.001421, Adjusted R-squared: -0.01938
## F-statistic: 0.0683 on 1 and 48 DF, p-value: 0.7949
avg_house_price~AverageHospitalRating
##
## Call:
## lm(formula = avg_house_price ~ AverageHospitalRating, data = data_4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81361 -62623 -437 44481 189667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 699637 217107 3.223 0.00569 **
## AverageHospitalRating -173880 73581 -2.363 0.03205 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71380 on 15 degrees of freedom
## (33 observations deleted due to missingness)
## Multiple R-squared: 0.2713, Adjusted R-squared: 0.2227
## F-statistic: 5.584 on 1 and 15 DF, p-value: 0.03205
Summary table
Features <- c('NumberOfSchools', 'NumberOfHospitals', 'AverageHospitalRating', 'UnemploymentRate')
Correlation <- c(0.12, -0.02, -0.22, 0.07)
P_values <- c(0.00241, 0.00347, 0.09069, 0.68095)
Adj_r_square <- c(-0.006934, -0.02049, 0.02919, -0.01543)
df_final <- data.frame(Features, Correlation, P_values, Adj_r_square)
df_final## Features Correlation P_values Adj_r_square
## 1 NumberOfSchools 0.12 0.00241 -0.006934
## 2 NumberOfHospitals -0.02 0.00347 -0.020490
## 3 AverageHospitalRating -0.22 0.09069 0.029190
## 4 UnemploymentRate 0.07 0.68095 -0.015430
As to answer to our main question to test around relationship between housing prices and each predictors, We realize that the house price has a positive relationship with the number of schools and the unemployment rate. Those are also the two big factors (from the features we explored) that contribute the most to the price of home in US (Although the correlation is not strong). The two other factors have a negative relationship with house prices. Though the predictors we used, we should have taken into consideration the crime rate which we believe should be a great predictor of home price. This will be part of further work we will have to do to make this model more efficient. We need also to mention that the project presented some challenges such as we needed to find appropriate data set for various factors, understand different terms such FIPS which we never heard before, merge different data set to make one useful data set for analysis and prediction.
CUNY DATA606: https://fall2020.data606.net/chapters/chapter9/