For this final project, I will be working alone. I am going to analyze a dataset that holds data for the Real-Estate industry as reported by Zestimate. Zestimate was created to give consumers as much information as possible about homes and the housing market, marking the first time consumers had access to this type of home value information at no cost
Datasets are available at: https://www.kaggle.com/c/zillow-prize-1/data File descriptions
I seek to establish an understanding of the relationship between home prices and characteristcs of housing as described in Zestimate’s datasets. I will draw a relation to factors such as economic forces, home features, and geographical inequality; and their impact on prices of homes.
I will exhaustively apply the various concepts that I have learnt thus far, to come up with a good analysis report of this dataset.
library(tidyverse)
library(psych)
library(dplyr)
library(shiny)
library(leaflet)
library(ggmap)
library(data.table)
library(ggplot2)
library(scales)
library(magrittr)
library(bit64)
library(lubridate)
library(corrplot)
library(h2o)
library(lime)
library(lubridate)
library(magrittr)
library(data.table)
library(bit64)
library(tidyverse)
library(lubridate)
library(mice)
library(corrplot)
properties_2017 <- read.csv(file="C:/FinalProject/properties_2017.csv", row.names = NULL)
str(properties_2017)
## 'data.frame': 2985217 obs. of 58 variables:
## $ parcelid : int 10754147 10759547 10843547 10859147 10879947 10898347 10933547 10940747 10954547 10976347 ...
## $ airconditioningtypeid : int NA NA NA NA NA NA NA NA NA NA ...
## $ architecturalstyletypeid : int NA NA NA NA NA NA NA NA NA NA ...
## $ basementsqft : int NA NA NA NA NA NA NA NA NA NA ...
## $ bathroomcnt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bedroomcnt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ buildingclasstypeid : int NA NA 5 3 4 4 NA NA NA 3 ...
## $ buildingqualitytypeid : int NA NA NA 6 NA 4 NA NA NA 4 ...
## $ calculatedbathnbr : num NA NA NA NA NA NA NA NA NA NA ...
## $ decktypeid : int NA NA NA NA NA NA NA NA NA NA ...
## $ finishedfloor1squarefeet : int NA NA NA NA NA NA NA NA NA NA ...
## $ calculatedfinishedsquarefeet: num NA NA 73026 5068 1776 ...
## $ finishedsquarefeet12 : int NA NA NA NA NA NA NA NA NA NA ...
## $ finishedsquarefeet13 : int NA NA NA NA NA NA NA NA NA NA ...
## $ finishedsquarefeet15 : int NA NA 73026 5068 1776 2400 NA 3611 NA 3754 ...
## $ finishedsquarefeet50 : int NA NA NA NA NA NA NA NA NA NA ...
## $ finishedsquarefeet6 : int NA NA NA NA NA NA NA NA NA NA ...
## $ fips : int 6037 6037 6037 6037 6037 6037 6037 6037 6037 6037 ...
## $ fireplacecnt : int NA NA NA NA NA NA NA NA NA NA ...
## $ fullbathcnt : int NA NA NA NA NA NA NA NA NA NA ...
## $ garagecarcnt : int NA NA NA NA NA NA NA NA NA NA ...
## $ garagetotalsqft : int NA NA NA NA NA NA NA NA NA NA ...
## $ hashottuborspa : chr "" "" "" "" ...
## $ heatingorsystemtypeid : int NA NA NA NA NA NA NA NA NA NA ...
## $ latitude : int 34144442 34140430 33989359 34148863 34194168 34171873 34131929 34171345 34218210 34289776 ...
## $ longitude : int -118654084 -118625364 -118394633 -118437206 -118385816 -118380906 -118351474 -118314900 -118331311 -118432085 ...
## $ lotsizesquarefeet : num 85768 4083 63085 7521 8512 ...
## $ poolcnt : int NA NA NA NA NA NA NA NA NA NA ...
## $ poolsizesum : int NA NA NA NA NA NA NA NA NA NA ...
## $ pooltypeid10 : int NA NA NA NA NA NA NA NA NA NA ...
## $ pooltypeid2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ pooltypeid7 : int NA NA NA NA NA NA NA NA NA NA ...
## $ propertycountylandusecode : chr "010D" "0109" "1200" "1200" ...
## $ propertylandusetypeid : int 269 261 47 47 31 31 260 31 269 31 ...
## $ propertyzoningdesc : chr "" "LCA11*" "LAC2" "LAC2" ...
## $ rawcensustractandblock : num 60378002 60378001 60377030 60371412 60371232 ...
## $ regionidcity : int 37688 37688 51617 12447 12447 12447 12447 396054 396054 47547 ...
## $ regionidcounty : int 3101 3101 3101 3101 3101 3101 3101 3101 3101 3101 ...
## $ regionidneighborhood : int NA NA NA 27080 46795 46795 274049 NA NA NA ...
## $ regionidzip : int 96337 96337 96095 96424 96450 96446 96049 96434 96436 96366 ...
## $ roomcnt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ storytypeid : int NA NA NA NA NA NA NA NA NA NA ...
## $ threequarterbathnbr : int NA NA NA NA NA NA NA NA NA NA ...
## $ typeconstructiontypeid : int NA NA NA NA NA NA NA NA NA NA ...
## $ unitcnt : int NA NA 2 NA 1 NA NA NA NA NA ...
## $ yardbuildingsqft17 : int NA NA NA NA NA NA NA NA NA NA ...
## $ yardbuildingsqft26 : int NA NA NA NA NA NA NA NA NA NA ...
## $ yearbuilt : num NA NA 1959 1948 1947 ...
## $ numberofstories : int NA NA 1 1 1 1 NA 1 NA 1 ...
## $ fireplaceflag : chr "" "" "" "" ...
## $ structuretaxvaluedollarcnt : num NA NA 660680 580059 196751 ...
## $ taxvaluedollarcnt : num 9 27516 1434941 1174475 440101 ...
## $ assessmentyear : int 2016 2015 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ landtaxvaluedollarcnt : num 9 27516 774261 594416 243350 ...
## $ taxamount : num NA NA 20800 14558 5725 ...
## $ taxdelinquencyflag : chr "" "" "" "" ...
## $ taxdelinquencyyear : int NA NA NA NA NA NA NA NA NA NA ...
## $ censustractandblock : num NA NA NA NA NA NA NA NA NA NA ...
prop_miss <- function(x){
sum(is.na(x)) / length(x)*100
}
missing.bycol <- apply(properties_2017,2,prop_miss)
missing.byrow <- apply(properties_2017,1,prop_miss)
missdata.df <- as.data.frame(missing.bycol)
setDT(missdata.df, keep.rownames = TRUE)
names(missdata.df) <- c('Col_Names', 'pct_missing')
plt1<-ggplot(data = missdata.df , aes(x= reorder(Col_Names, pct_missing), y=pct_missing)) + geom_bar(stat = "identity",aes(fill = pct_missing), position = position_stack(reverse= TRUE)) + coord_flip()
plt1
missdata.df20 <- missdata.df %>% filter (pct_missing>=20)
plt2<-ggplot(data = missdata.df20 , aes(x= reorder(Col_Names, pct_missing), y=pct_missing)) + geom_bar(stat = "identity",aes(fill = pct_missing), position = position_stack(reverse= TRUE)) + coord_flip()
plt2
Using the 80% rule, missing values in more than 20% of samples may be removed from the dataset.
missing_prop <- sapply(properties_2017, function(x) sum(is.na(x))/length(x))
variables_to_remove <- names(missing_prop)[missing_prop > 1 - 0.8]
variables_to_remove
## [1] "airconditioningtypeid" "architecturalstyletypeid"
## [3] "basementsqft" "buildingclasstypeid"
## [5] "buildingqualitytypeid" "decktypeid"
## [7] "finishedfloor1squarefeet" "finishedsquarefeet13"
## [9] "finishedsquarefeet15" "finishedsquarefeet50"
## [11] "finishedsquarefeet6" "fireplacecnt"
## [13] "garagecarcnt" "garagetotalsqft"
## [15] "heatingorsystemtypeid" "poolcnt"
## [17] "poolsizesum" "pooltypeid10"
## [19] "pooltypeid2" "pooltypeid7"
## [21] "regionidneighborhood" "storytypeid"
## [23] "threequarterbathnbr" "typeconstructiontypeid"
## [25] "unitcnt" "yardbuildingsqft17"
## [27] "yardbuildingsqft26" "numberofstories"
## [29] "taxdelinquencyyear"
# The dataset afterwards
df_transformed <- properties_2017[, !colnames(properties_2017) %in% variables_to_remove]
dim(df_transformed)
## [1] 2985217 29
str(df_transformed)
## 'data.frame': 2985217 obs. of 29 variables:
## $ parcelid : int 10754147 10759547 10843547 10859147 10879947 10898347 10933547 10940747 10954547 10976347 ...
## $ bathroomcnt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bedroomcnt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ calculatedbathnbr : num NA NA NA NA NA NA NA NA NA NA ...
## $ calculatedfinishedsquarefeet: num NA NA 73026 5068 1776 ...
## $ finishedsquarefeet12 : int NA NA NA NA NA NA NA NA NA NA ...
## $ fips : int 6037 6037 6037 6037 6037 6037 6037 6037 6037 6037 ...
## $ fullbathcnt : int NA NA NA NA NA NA NA NA NA NA ...
## $ hashottuborspa : chr "" "" "" "" ...
## $ latitude : int 34144442 34140430 33989359 34148863 34194168 34171873 34131929 34171345 34218210 34289776 ...
## $ longitude : int -118654084 -118625364 -118394633 -118437206 -118385816 -118380906 -118351474 -118314900 -118331311 -118432085 ...
## $ lotsizesquarefeet : num 85768 4083 63085 7521 8512 ...
## $ propertycountylandusecode : chr "010D" "0109" "1200" "1200" ...
## $ propertylandusetypeid : int 269 261 47 47 31 31 260 31 269 31 ...
## $ propertyzoningdesc : chr "" "LCA11*" "LAC2" "LAC2" ...
## $ rawcensustractandblock : num 60378002 60378001 60377030 60371412 60371232 ...
## $ regionidcity : int 37688 37688 51617 12447 12447 12447 12447 396054 396054 47547 ...
## $ regionidcounty : int 3101 3101 3101 3101 3101 3101 3101 3101 3101 3101 ...
## $ regionidzip : int 96337 96337 96095 96424 96450 96446 96049 96434 96436 96366 ...
## $ roomcnt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ yearbuilt : num NA NA 1959 1948 1947 ...
## $ fireplaceflag : chr "" "" "" "" ...
## $ structuretaxvaluedollarcnt : num NA NA 660680 580059 196751 ...
## $ taxvaluedollarcnt : num 9 27516 1434941 1174475 440101 ...
## $ assessmentyear : int 2016 2015 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ landtaxvaluedollarcnt : num 9 27516 774261 594416 243350 ...
## $ taxamount : num NA NA 20800 14558 5725 ...
## $ taxdelinquencyflag : chr "" "" "" "" ...
## $ censustractandblock : num NA NA NA NA NA NA NA NA NA NA ...
The dataset now has 29 variables, which I will use for the rest of my analysis.
#Subset of columns from original data set
col_index<-c(2:9)
working_set<-df_transformed[,col_index]
head(working_set)
## bathroomcnt bedroomcnt calculatedbathnbr calculatedfinishedsquarefeet
## 1 0 0 NA NA
## 2 0 0 NA NA
## 3 0 0 NA 73026
## 4 0 0 NA 5068
## 5 0 0 NA 1776
## 6 0 0 NA 2400
## finishedsquarefeet12 fips fullbathcnt hashottuborspa
## 1 NA 6037 NA
## 2 NA 6037 NA
## 3 NA 6037 NA
## 4 NA 6037 NA
## 5 NA 6037 NA
## 6 NA 6037 NA
# proportion of each level under each variable using the table and summary functions
# Bedroom Count: nummber of bedrooms in a home
table(working_set$bedroomcnt)
##
## 0 1 2 3 4 5 6 7 8 9
## 118705 86941 606782 1172757 731475 182765 48915 12763 13542 4279
## 10 11 12 13 14 15 16 17 18 19
## 1702 425 959 86 69 24 50 11 9 1
## 20 21 23 24 25
## 8 1 1 1 1
summary(working_set$bedroomcnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.000 3.000 3.093 4.000 25.000 2945
boxplot(working_set$bedroomcnt)
# Bathroom count: Number of bathrooms in home
bathroomcnt<-table(working_set$bathroomcnt)
bathroomcnt
##
## 0 0.5 1 1.5 1.75 2 2.5 3 3.5 4
## 113470 16 499332 45735 4 1219811 208809 633089 31835 133922
## 4.5 5 5.5 6 6.5 7 7.5 8 8.5 9
## 19864 38514 6275 16416 1352 6221 385 4548 113 1341
## 9.5 10 10.5 11 11.5 12 12.5 13 14 14.5
## 50 496 14 200 3 269 3 53 39 1
## 15 16 17 18 19 19.5 20 31 32
## 21 25 8 12 3 1 8 1 1
summary(working_set$bathroomcnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 2.000 2.000 2.216 3.000 32.000 2957
# Calculated Bathroom number: Number of bathrooms in home including fractional bathroom
table(working_set$calculatedbathnbr)
##
## 1 1.5 2 2.5 3 3.5 4 4.5 5 5.5
## 499324 45427 1219799 208578 633088 31773 133922 19811 38514 6259
## 6 6.5 7 7.5 8 8.5 9 9.5 10 10.5
## 16416 1340 6221 382 4548 110 1341 50 496 14
## 11 11.5 12 12.5 13 14 14.5 15 16 17
## 200 3 269 3 53 39 1 21 25 8
## 18 19 19.5 20 31 32
## 12 3 1 8 1 1
summary(working_set$calculatedbathnbr)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.0 2.0 2.0 2.3 3.0 32.0 117156
barplot(bathroomcnt, main = "Bathroom Count")
# Full bath count: Number of full bathrooms (sink, shower + bathtub, and toilet) present in home
fullbath<-table(working_set$fullbathcnt)
fullbath
##
## 1 2 3 4 5 6 7 8 9 10
## 544794 1428927 664914 153684 44721 17499 6468 4575 1347 495
## 11 12 13 14 15 16 17 18 19 20
## 197 268 54 39 20 25 8 12 4 8
## 31 32
## 1 1
summary(working_set$fullbathcnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 2.00 2.00 2.25 3.00 32.00 117156
barplot(fullbath, main = " Full Bath Count")
boxplot(working_set$fullbathcnt)
# Hot Tub or Spa: whether or not a home has a hot tub or spa
summary(working_set$hashottuborspa)
## Length Class Mode
## 2985217 character character
#Finished Square Feet: Finished total living room area of home
summary(working_set$finishedsquarefeet12)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 1198 1542 1764 2075 427079 264431
# Regionidcity
properties_2017 %>%
count(regionidcity)
## regionidcity n
## 1 3491 1076
## 2 3980 6
## 3 4406 21331
## 4 5465 13499
## 5 5534 49123
## 6 6021 14769
## 7 6285 19
## 8 6395 7772
## 9 6822 781
## 10 8384 16361
## 11 9840 7659
## 12 10241 13506
## 13 10389 8429
## 14 10608 16589
## 15 10723 20243
## 16 10734 6521
## 17 10774 15676
## 18 10815 110
## 19 11626 14976
## 20 12292 23034
## 21 12447 670868
## 22 12520 7465
## 23 12773 31381
## 24 13091 7214
## 25 13150 42112
## 26 13232 717
## 27 13311 182
## 28 13693 20625
## 29 13716 3822
## 30 14111 9075
## 31 14542 25781
## 32 14634 20147
## 33 14906 425
## 34 15237 4309
## 35 15554 13313
## 36 16389 760
## 37 16677 2614
## 38 16764 59793
## 39 16961 3164
## 40 17150 11744
## 41 17597 4880
## 42 17686 9758
## 43 17882 15987
## 44 18098 3773
## 45 18874 13786
## 46 18875 1984
## 47 19177 12039
## 48 19202 24
## 49 20008 29232
## 50 21395 77
## 51 21412 22562
## 52 21778 2330
## 53 22827 11445
## 54 24174 24166
## 55 24245 19129
## 56 24384 17460
## 57 24435 5527
## 58 24797 17
## 59 24812 30356
## 60 24832 34448
## 61 25218 57013
## 62 25271 696
## 63 25458 8964
## 64 25459 21986
## 65 25468 1124
## 66 25535 23
## 67 25621 582
## 68 25953 11374
## 69 25974 10401
## 70 26483 13613
## 71 26531 12992
## 72 26964 18424
## 73 26965 6541
## 74 27103 3188
## 75 27110 37820
## 76 27183 6029
## 77 27491 17742
## 78 27512 12
## 79 29189 3773
## 80 29712 5910
## 81 30187 11269
## 82 30267 3552
## 83 30399 3539
## 84 30908 9798
## 85 31134 1225
## 86 32380 22465
## 87 32616 9075
## 88 32753 2754
## 89 32923 10396
## 90 32927 183
## 91 33252 33336
## 92 33311 4922
## 93 33312 448
## 94 33612 22267
## 95 33727 3135
## 96 33836 10226
## 97 33837 13538
## 98 33840 11260
## 99 34037 666
## 100 34278 38791
## 101 34543 32552
## 102 34636 8984
## 103 34780 18992
## 104 36078 12
## 105 36502 2861
## 106 37015 12188
## 107 37086 14527
## 108 37688 7510
## 109 37882 20
## 110 38032 20857
## 111 38980 623
## 112 39076 7986
## 113 39306 6987
## 114 39308 9167
## 115 40009 3973
## 116 40081 7673
## 117 40110 1790
## 118 40227 46279
## 119 40633 3
## 120 41484 86
## 121 41673 7685
## 122 42091 597
## 123 42150 12394
## 124 42967 5479
## 125 44116 11698
## 126 44833 13085
## 127 45398 12141
## 128 45457 36090
## 129 45602 15189
## 130 45888 19989
## 131 46080 7005
## 132 46098 6612
## 133 46178 5037
## 134 46298 93454
## 135 46314 5696
## 136 47019 33806
## 137 47198 5453
## 138 47547 4355
## 139 47568 47763
## 140 47695 3786
## 141 47762 13750
## 142 47913 7
## 143 48211 23
## 144 48424 7543
## 145 50677 16531
## 146 50749 17825
## 147 51239 24814
## 148 51617 11683
## 149 51861 15833
## 150 52650 58619
## 151 52835 6655
## 152 52842 9815
## 153 53027 4495
## 154 53162 274
## 155 53571 25035
## 156 53636 21560
## 157 53655 4998
## 158 54053 14556
## 159 54212 11924
## 160 54299 4428
## 161 54311 54302
## 162 54352 5295
## 163 54722 35618
## 164 54970 2130
## 165 55753 4391
## 166 56780 1978
## 167 113412 1782
## 168 113576 3513
## 169 114828 2402
## 170 114834 798
## 171 116042 2625
## 172 118217 3186
## 173 118225 11561
## 174 118694 4200
## 175 118875 1817
## 176 118878 6334
## 177 118880 402
## 178 118895 4420
## 179 118914 4848
## 180 118994 4122
## 181 272578 2292
## 182 396053 7985
## 183 396054 24589
## 184 396550 3675
## 185 396551 5637
## 186 396556 5412
## 187 NA 62128
# geographic interactive widget to display where regionid is in relation to a regional map
city_info <- colorFactor("Set2", properties_2017$regionidcity)
df_transformed %>%
group_by(regionidcity = as.factor(regionidcity)) %>%
summarise(avg_lng = mean(longitude/1e6, na.rm = T),
avg_lat = mean(latitude/1e6, na.rm =T)) %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(lng = ~avg_lng, lat = ~avg_lat, color = ~city_info(regionidcity),
stroke = FALSE, fillOpacity = 0.8, popup = ~regionidcity)
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
col_index<-c(10,11,16,23:29)
working_dataset2<-df_transformed[,col_index]
# Landtaxvaluedollarcnt: The assessed value of the land area of the parcel
summary(working_dataset2$landtaxvaluedollarcnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 79700 176619 268456 326100 94011079 59926
# Structuretaxvaluedollarcnt: The assessed value of the built structure on the parcel
summary(working_dataset2$structuretaxvaluedollarcnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 77666 127066 178143 204000 255321161 46464
#Taxvaluedollarcnt: The total tax assessed value of the parcel
summary(working_dataset2$taxvaluedollarcnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 188220 321161 443528 514072 319622473 34266
# Taxamount: The total property tax assessed for that assessment year
plt <- qplot(working_dataset2$taxamount, geom="histogram", binwidth = 1000, main = "Distribution of Tax Amount", ylab = "Frequency", xlab = "Tax Amount", fill=I("blue"), col=I("red"), alpha=I(.2), xlim=c(5000,30000)) +
theme(axis.text.y = element_text(angle=45))
suppressWarnings(print(plt))