Introduction

For this final project, I will be working alone. I am going to analyze a dataset that holds data for the Real-Estate industry as reported by Zestimate. Zestimate was created to give consumers as much information as possible about homes and the housing market, marking the first time consumers had access to this type of home value information at no cost

Data Source

Datasets are available at: https://www.kaggle.com/c/zillow-prize-1/data File descriptions

Project Goal

I seek to establish an understanding of the relationship between home prices and characteristcs of housing as described in Zestimate’s datasets. I will draw a relation to factors such as economic forces, home features, and geographical inequality; and their impact on prices of homes.

I will exhaustively apply the various concepts that I have learnt thus far, to come up with a good analysis report of this dataset.

Loading Libraries

library(tidyverse)
library(psych)
library(dplyr)
library(shiny)
library(leaflet)
library(ggmap)
library(data.table)
library(ggplot2)
library(scales)
library(magrittr)
library(bit64)
library(lubridate)
library(corrplot)
library(h2o)
library(lime)
library(lubridate)
library(magrittr)
library(data.table)
library(bit64)
library(tidyverse)
library(lubridate)
library(mice)
library(corrplot)

Reading the csv file

properties_2017 <-  read.csv(file="C:/FinalProject/properties_2017.csv", row.names = NULL)
str(properties_2017)
## 'data.frame':    2985217 obs. of  58 variables:
##  $ parcelid                    : int  10754147 10759547 10843547 10859147 10879947 10898347 10933547 10940747 10954547 10976347 ...
##  $ airconditioningtypeid       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ architecturalstyletypeid    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ basementsqft                : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ bathroomcnt                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bedroomcnt                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ buildingclasstypeid         : int  NA NA 5 3 4 4 NA NA NA 3 ...
##  $ buildingqualitytypeid       : int  NA NA NA 6 NA 4 NA NA NA 4 ...
##  $ calculatedbathnbr           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ decktypeid                  : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ finishedfloor1squarefeet    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ calculatedfinishedsquarefeet: num  NA NA 73026 5068 1776 ...
##  $ finishedsquarefeet12        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ finishedsquarefeet13        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ finishedsquarefeet15        : int  NA NA 73026 5068 1776 2400 NA 3611 NA 3754 ...
##  $ finishedsquarefeet50        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ finishedsquarefeet6         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ fips                        : int  6037 6037 6037 6037 6037 6037 6037 6037 6037 6037 ...
##  $ fireplacecnt                : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ fullbathcnt                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ garagecarcnt                : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ garagetotalsqft             : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ hashottuborspa              : chr  "" "" "" "" ...
##  $ heatingorsystemtypeid       : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ latitude                    : int  34144442 34140430 33989359 34148863 34194168 34171873 34131929 34171345 34218210 34289776 ...
##  $ longitude                   : int  -118654084 -118625364 -118394633 -118437206 -118385816 -118380906 -118351474 -118314900 -118331311 -118432085 ...
##  $ lotsizesquarefeet           : num  85768 4083 63085 7521 8512 ...
##  $ poolcnt                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ poolsizesum                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ pooltypeid10                : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ pooltypeid2                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ pooltypeid7                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ propertycountylandusecode   : chr  "010D" "0109" "1200" "1200" ...
##  $ propertylandusetypeid       : int  269 261 47 47 31 31 260 31 269 31 ...
##  $ propertyzoningdesc          : chr  "" "LCA11*" "LAC2" "LAC2" ...
##  $ rawcensustractandblock      : num  60378002 60378001 60377030 60371412 60371232 ...
##  $ regionidcity                : int  37688 37688 51617 12447 12447 12447 12447 396054 396054 47547 ...
##  $ regionidcounty              : int  3101 3101 3101 3101 3101 3101 3101 3101 3101 3101 ...
##  $ regionidneighborhood        : int  NA NA NA 27080 46795 46795 274049 NA NA NA ...
##  $ regionidzip                 : int  96337 96337 96095 96424 96450 96446 96049 96434 96436 96366 ...
##  $ roomcnt                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ storytypeid                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ threequarterbathnbr         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ typeconstructiontypeid      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ unitcnt                     : int  NA NA 2 NA 1 NA NA NA NA NA ...
##  $ yardbuildingsqft17          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ yardbuildingsqft26          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ yearbuilt                   : num  NA NA 1959 1948 1947 ...
##  $ numberofstories             : int  NA NA 1 1 1 1 NA 1 NA 1 ...
##  $ fireplaceflag               : chr  "" "" "" "" ...
##  $ structuretaxvaluedollarcnt  : num  NA NA 660680 580059 196751 ...
##  $ taxvaluedollarcnt           : num  9 27516 1434941 1174475 440101 ...
##  $ assessmentyear              : int  2016 2015 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ landtaxvaluedollarcnt       : num  9 27516 774261 594416 243350 ...
##  $ taxamount                   : num  NA NA 20800 14558 5725 ...
##  $ taxdelinquencyflag          : chr  "" "" "" "" ...
##  $ taxdelinquencyyear          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ censustractandblock         : num  NA NA NA NA NA NA NA NA NA NA ...

DATA TRANSFORMATION

Missing Data

prop_miss <- function(x){
  sum(is.na(x)) / length(x)*100
}

missing.bycol <- apply(properties_2017,2,prop_miss)
missing.byrow <-  apply(properties_2017,1,prop_miss)

missdata.df <- as.data.frame(missing.bycol)
setDT(missdata.df, keep.rownames = TRUE)
names(missdata.df) <- c('Col_Names', 'pct_missing')

plt1<-ggplot(data = missdata.df , aes(x= reorder(Col_Names, pct_missing), y=pct_missing)) + geom_bar(stat = "identity",aes(fill = pct_missing), position = position_stack(reverse= TRUE)) + coord_flip()
plt1

Columns with more than 20% missing value

missdata.df20 <- missdata.df %>% filter (pct_missing>=20) 
plt2<-ggplot(data = missdata.df20  , aes(x= reorder(Col_Names, pct_missing), y=pct_missing)) + geom_bar(stat = "identity",aes(fill = pct_missing), position = position_stack(reverse= TRUE)) + coord_flip()
plt2

Using the 80% rule, missing values in more than 20% of samples may be removed from the dataset.

missing_prop <- sapply(properties_2017, function(x) sum(is.na(x))/length(x))
variables_to_remove <- names(missing_prop)[missing_prop > 1 - 0.8]
variables_to_remove 
##  [1] "airconditioningtypeid"    "architecturalstyletypeid"
##  [3] "basementsqft"             "buildingclasstypeid"     
##  [5] "buildingqualitytypeid"    "decktypeid"              
##  [7] "finishedfloor1squarefeet" "finishedsquarefeet13"    
##  [9] "finishedsquarefeet15"     "finishedsquarefeet50"    
## [11] "finishedsquarefeet6"      "fireplacecnt"            
## [13] "garagecarcnt"             "garagetotalsqft"         
## [15] "heatingorsystemtypeid"    "poolcnt"                 
## [17] "poolsizesum"              "pooltypeid10"            
## [19] "pooltypeid2"              "pooltypeid7"             
## [21] "regionidneighborhood"     "storytypeid"             
## [23] "threequarterbathnbr"      "typeconstructiontypeid"  
## [25] "unitcnt"                  "yardbuildingsqft17"      
## [27] "yardbuildingsqft26"       "numberofstories"         
## [29] "taxdelinquencyyear"
# The dataset afterwards
df_transformed <- properties_2017[, !colnames(properties_2017) %in% variables_to_remove]
dim(df_transformed)
## [1] 2985217      29
str(df_transformed)
## 'data.frame':    2985217 obs. of  29 variables:
##  $ parcelid                    : int  10754147 10759547 10843547 10859147 10879947 10898347 10933547 10940747 10954547 10976347 ...
##  $ bathroomcnt                 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ bedroomcnt                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ calculatedbathnbr           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ calculatedfinishedsquarefeet: num  NA NA 73026 5068 1776 ...
##  $ finishedsquarefeet12        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ fips                        : int  6037 6037 6037 6037 6037 6037 6037 6037 6037 6037 ...
##  $ fullbathcnt                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ hashottuborspa              : chr  "" "" "" "" ...
##  $ latitude                    : int  34144442 34140430 33989359 34148863 34194168 34171873 34131929 34171345 34218210 34289776 ...
##  $ longitude                   : int  -118654084 -118625364 -118394633 -118437206 -118385816 -118380906 -118351474 -118314900 -118331311 -118432085 ...
##  $ lotsizesquarefeet           : num  85768 4083 63085 7521 8512 ...
##  $ propertycountylandusecode   : chr  "010D" "0109" "1200" "1200" ...
##  $ propertylandusetypeid       : int  269 261 47 47 31 31 260 31 269 31 ...
##  $ propertyzoningdesc          : chr  "" "LCA11*" "LAC2" "LAC2" ...
##  $ rawcensustractandblock      : num  60378002 60378001 60377030 60371412 60371232 ...
##  $ regionidcity                : int  37688 37688 51617 12447 12447 12447 12447 396054 396054 47547 ...
##  $ regionidcounty              : int  3101 3101 3101 3101 3101 3101 3101 3101 3101 3101 ...
##  $ regionidzip                 : int  96337 96337 96095 96424 96450 96446 96049 96434 96436 96366 ...
##  $ roomcnt                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ yearbuilt                   : num  NA NA 1959 1948 1947 ...
##  $ fireplaceflag               : chr  "" "" "" "" ...
##  $ structuretaxvaluedollarcnt  : num  NA NA 660680 580059 196751 ...
##  $ taxvaluedollarcnt           : num  9 27516 1434941 1174475 440101 ...
##  $ assessmentyear              : int  2016 2015 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ landtaxvaluedollarcnt       : num  9 27516 774261 594416 243350 ...
##  $ taxamount                   : num  NA NA 20800 14558 5725 ...
##  $ taxdelinquencyflag          : chr  "" "" "" "" ...
##  $ censustractandblock         : num  NA NA NA NA NA NA NA NA NA NA ...

The dataset now has 29 variables, which I will use for the rest of my analysis.

DATA ANALYSIS

Variables

#Subset of columns from original data set
col_index<-c(2:9)
working_set<-df_transformed[,col_index]
head(working_set)
##   bathroomcnt bedroomcnt calculatedbathnbr calculatedfinishedsquarefeet
## 1           0          0                NA                           NA
## 2           0          0                NA                           NA
## 3           0          0                NA                        73026
## 4           0          0                NA                         5068
## 5           0          0                NA                         1776
## 6           0          0                NA                         2400
##   finishedsquarefeet12 fips fullbathcnt hashottuborspa
## 1                   NA 6037          NA               
## 2                   NA 6037          NA               
## 3                   NA 6037          NA               
## 4                   NA 6037          NA               
## 5                   NA 6037          NA               
## 6                   NA 6037          NA
# proportion of each level under each variable using the table and summary functions

# Bedroom Count: nummber of bedrooms in a home 
table(working_set$bedroomcnt)
## 
##       0       1       2       3       4       5       6       7       8       9 
##  118705   86941  606782 1172757  731475  182765   48915   12763   13542    4279 
##      10      11      12      13      14      15      16      17      18      19 
##    1702     425     959      86      69      24      50      11       9       1 
##      20      21      23      24      25 
##       8       1       1       1       1
summary(working_set$bedroomcnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   2.000   3.000   3.093   4.000  25.000    2945
boxplot(working_set$bedroomcnt)

# Bathroom count: Number of bathrooms in home
bathroomcnt<-table(working_set$bathroomcnt)
bathroomcnt
## 
##       0     0.5       1     1.5    1.75       2     2.5       3     3.5       4 
##  113470      16  499332   45735       4 1219811  208809  633089   31835  133922 
##     4.5       5     5.5       6     6.5       7     7.5       8     8.5       9 
##   19864   38514    6275   16416    1352    6221     385    4548     113    1341 
##     9.5      10    10.5      11    11.5      12    12.5      13      14    14.5 
##      50     496      14     200       3     269       3      53      39       1 
##      15      16      17      18      19    19.5      20      31      32 
##      21      25       8      12       3       1       8       1       1
summary(working_set$bathroomcnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   2.000   2.000   2.216   3.000  32.000    2957
# Calculated Bathroom number: Number of bathrooms in home including fractional bathroom
table(working_set$calculatedbathnbr)
## 
##       1     1.5       2     2.5       3     3.5       4     4.5       5     5.5 
##  499324   45427 1219799  208578  633088   31773  133922   19811   38514    6259 
##       6     6.5       7     7.5       8     8.5       9     9.5      10    10.5 
##   16416    1340    6221     382    4548     110    1341      50     496      14 
##      11    11.5      12    12.5      13      14    14.5      15      16      17 
##     200       3     269       3      53      39       1      21      25       8 
##      18      19    19.5      20      31      32 
##      12       3       1       8       1       1
summary(working_set$calculatedbathnbr)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     1.0     2.0     2.0     2.3     3.0    32.0  117156
barplot(bathroomcnt, main = "Bathroom Count")

# Full bath count: Number of full bathrooms (sink, shower + bathtub, and toilet) present in home

fullbath<-table(working_set$fullbathcnt)
fullbath
## 
##       1       2       3       4       5       6       7       8       9      10 
##  544794 1428927  664914  153684   44721   17499    6468    4575    1347     495 
##      11      12      13      14      15      16      17      18      19      20 
##     197     268      54      39      20      25       8      12       4       8 
##      31      32 
##       1       1
summary(working_set$fullbathcnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00    2.00    2.00    2.25    3.00   32.00  117156
barplot(fullbath, main = " Full Bath Count")

boxplot(working_set$fullbathcnt)

# Hot Tub or Spa: whether or not a home has a hot tub or spa
summary(working_set$hashottuborspa)
##    Length     Class      Mode 
##   2985217 character character
#Finished Square Feet: Finished total living room area of home
summary(working_set$finishedsquarefeet12)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       1    1198    1542    1764    2075  427079  264431

Land properties_2017

# Regionidcity
properties_2017 %>% 
  count(regionidcity)
##     regionidcity      n
## 1           3491   1076
## 2           3980      6
## 3           4406  21331
## 4           5465  13499
## 5           5534  49123
## 6           6021  14769
## 7           6285     19
## 8           6395   7772
## 9           6822    781
## 10          8384  16361
## 11          9840   7659
## 12         10241  13506
## 13         10389   8429
## 14         10608  16589
## 15         10723  20243
## 16         10734   6521
## 17         10774  15676
## 18         10815    110
## 19         11626  14976
## 20         12292  23034
## 21         12447 670868
## 22         12520   7465
## 23         12773  31381
## 24         13091   7214
## 25         13150  42112
## 26         13232    717
## 27         13311    182
## 28         13693  20625
## 29         13716   3822
## 30         14111   9075
## 31         14542  25781
## 32         14634  20147
## 33         14906    425
## 34         15237   4309
## 35         15554  13313
## 36         16389    760
## 37         16677   2614
## 38         16764  59793
## 39         16961   3164
## 40         17150  11744
## 41         17597   4880
## 42         17686   9758
## 43         17882  15987
## 44         18098   3773
## 45         18874  13786
## 46         18875   1984
## 47         19177  12039
## 48         19202     24
## 49         20008  29232
## 50         21395     77
## 51         21412  22562
## 52         21778   2330
## 53         22827  11445
## 54         24174  24166
## 55         24245  19129
## 56         24384  17460
## 57         24435   5527
## 58         24797     17
## 59         24812  30356
## 60         24832  34448
## 61         25218  57013
## 62         25271    696
## 63         25458   8964
## 64         25459  21986
## 65         25468   1124
## 66         25535     23
## 67         25621    582
## 68         25953  11374
## 69         25974  10401
## 70         26483  13613
## 71         26531  12992
## 72         26964  18424
## 73         26965   6541
## 74         27103   3188
## 75         27110  37820
## 76         27183   6029
## 77         27491  17742
## 78         27512     12
## 79         29189   3773
## 80         29712   5910
## 81         30187  11269
## 82         30267   3552
## 83         30399   3539
## 84         30908   9798
## 85         31134   1225
## 86         32380  22465
## 87         32616   9075
## 88         32753   2754
## 89         32923  10396
## 90         32927    183
## 91         33252  33336
## 92         33311   4922
## 93         33312    448
## 94         33612  22267
## 95         33727   3135
## 96         33836  10226
## 97         33837  13538
## 98         33840  11260
## 99         34037    666
## 100        34278  38791
## 101        34543  32552
## 102        34636   8984
## 103        34780  18992
## 104        36078     12
## 105        36502   2861
## 106        37015  12188
## 107        37086  14527
## 108        37688   7510
## 109        37882     20
## 110        38032  20857
## 111        38980    623
## 112        39076   7986
## 113        39306   6987
## 114        39308   9167
## 115        40009   3973
## 116        40081   7673
## 117        40110   1790
## 118        40227  46279
## 119        40633      3
## 120        41484     86
## 121        41673   7685
## 122        42091    597
## 123        42150  12394
## 124        42967   5479
## 125        44116  11698
## 126        44833  13085
## 127        45398  12141
## 128        45457  36090
## 129        45602  15189
## 130        45888  19989
## 131        46080   7005
## 132        46098   6612
## 133        46178   5037
## 134        46298  93454
## 135        46314   5696
## 136        47019  33806
## 137        47198   5453
## 138        47547   4355
## 139        47568  47763
## 140        47695   3786
## 141        47762  13750
## 142        47913      7
## 143        48211     23
## 144        48424   7543
## 145        50677  16531
## 146        50749  17825
## 147        51239  24814
## 148        51617  11683
## 149        51861  15833
## 150        52650  58619
## 151        52835   6655
## 152        52842   9815
## 153        53027   4495
## 154        53162    274
## 155        53571  25035
## 156        53636  21560
## 157        53655   4998
## 158        54053  14556
## 159        54212  11924
## 160        54299   4428
## 161        54311  54302
## 162        54352   5295
## 163        54722  35618
## 164        54970   2130
## 165        55753   4391
## 166        56780   1978
## 167       113412   1782
## 168       113576   3513
## 169       114828   2402
## 170       114834    798
## 171       116042   2625
## 172       118217   3186
## 173       118225  11561
## 174       118694   4200
## 175       118875   1817
## 176       118878   6334
## 177       118880    402
## 178       118895   4420
## 179       118914   4848
## 180       118994   4122
## 181       272578   2292
## 182       396053   7985
## 183       396054  24589
## 184       396550   3675
## 185       396551   5637
## 186       396556   5412
## 187           NA  62128
# geographic interactive widget to display where regionid is in relation to a regional map

city_info <- colorFactor("Set2", properties_2017$regionidcity)

df_transformed %>% 
  group_by(regionidcity = as.factor(regionidcity)) %>% 
  summarise(avg_lng = mean(longitude/1e6, na.rm = T),
            avg_lat = mean(latitude/1e6, na.rm =T)) %>% 
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addCircleMarkers(lng = ~avg_lng, lat = ~avg_lat, color = ~city_info(regionidcity),
                   stroke = FALSE, fillOpacity = 0.8, popup = ~regionidcity)
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
col_index<-c(10,11,16,23:29)
working_dataset2<-df_transformed[,col_index]


# Landtaxvaluedollarcnt: The assessed value of the land area of the parcel
summary(working_dataset2$landtaxvaluedollarcnt)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##        1    79700   176619   268456   326100 94011079    59926
# Structuretaxvaluedollarcnt: The assessed value of the built structure on the parcel
summary(working_dataset2$structuretaxvaluedollarcnt)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##         1     77666    127066    178143    204000 255321161     46464
#Taxvaluedollarcnt: The total tax assessed value of the parcel
summary(working_dataset2$taxvaluedollarcnt)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##         1    188220    321161    443528    514072 319622473     34266
# Taxamount: The total property tax assessed for that assessment year
plt <- qplot(working_dataset2$taxamount, geom="histogram", binwidth = 1000, main = "Distribution of Tax Amount", ylab =    "Frequency", xlab       = "Tax Amount", fill=I("blue"), col=I("red"), alpha=I(.2), xlim=c(5000,30000)) + 
      theme(axis.text.y = element_text(angle=45))
suppressWarnings(print(plt))