Housing Data Seattle

library(ggplot2)
library(devtools)
library(dplyr)
library(stringr)
library(ggmap)
library(tidyverse)
library(lubridate)
library(DT)
library(caret)
library(leaflet)
library(corrplot)
library(boot)
library(tibble)
# Load the house sales data set from the working directory.
# The workspace is intentionally NOT cleared with rm(list = ls()): wiping the
# caller's global environment is a destructive side effect, and it does not
# reset attached packages or options anyway.
house <- read.csv("kc_house_data.csv")

# Row count of the raw data, as a quick sanity check on the import.
nrow(house)
## [1] 21613

Geographic Map of House Prices in Seattle

library(leaflet)
# Map centre: median longitude / latitude over all listings (NAs ignored).
center_lon <- median(house$long, na.rm = TRUE)
center_lat <- median(house$lat, na.rm = TRUE)

# Bucket sale prices into six bands; the band is stored on `house` as PriceBin
# and drives the circle colour and the legend below.
house$PriceBin <- cut(
  house$price,
  breaks = c(0, 250000, 500000, 750000, 1000000, 2000000, 10000000)
)

# Categorical palette: six colours for the six price bands.
factpal <- colorFactor(
  palette = c("red", "orange", "yellow", "green", "blue", "purple"),
  domain = house$PriceBin
)

# Interactive map: one circle per listing coloured by price band, view centred
# on the median coordinates, with a legend in the bottom-left corner.
leaflet(data = house) %>%
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~long, lat = ~lat, color = ~factpal(PriceBin)) %>%
  setView(lng = center_lon, lat = center_lat, zoom = 12) %>%
  addLegend(
    position = "bottomleft",
    pal = factpal,
    values = ~PriceBin,
    title = "House Prices - Seattle",
    opacity = 1
  )

Exploratory Data Visualization

Generally, these trends are as expected. We hypothesized that as the number of bedrooms, the number of bathrooms, the grade, and the condition increase, the median price would also increase. The visuals below agree with this hypothesis.

library(dplyr)
# Four bar charts of median sale price, each grouped by one house attribute.
# Every pipeline summarises `house` to one row per attribute value and pipes
# that summary straight into ggplot().

# Median price by number of bedrooms.
p1 <- house %>%
  group_by(bedrooms) %>%
  summarise(PriceMedian = median(price, na.rm = TRUE)) %>%
  ggplot(aes(x = bedrooms, y = PriceMedian)) +
  geom_bar(stat = "identity", colour = "white", fill = "lavender") +
  labs(
    x = "Bedrooms",
    y = "Median Price",
    title = "Bedrooms and Median Price"
  ) +
  theme_bw()

# Median price by number of bathrooms.
p2 <- house %>%
  group_by(bathrooms) %>%
  summarise(PriceMedian = median(price, na.rm = TRUE)) %>%
  ggplot(aes(x = bathrooms, y = PriceMedian)) +
  geom_bar(stat = "identity", colour = "white", fill = "light pink") +
  labs(
    x = "Bathrooms",
    y = "Median Price",
    title = "Bathrooms and Median Price"
  ) +
  theme_bw()

# Median price by grade.
p3 <- house %>%
  group_by(grade) %>%
  summarise(PriceMedian = median(price, na.rm = TRUE)) %>%
  ggplot(aes(x = grade, y = PriceMedian)) +
  geom_bar(stat = "identity", colour = "white", fill = "light blue") +
  labs(
    x = "Grade",
    y = "Median Price",
    title = "Grade and Median Price"
  ) +
  theme_bw()

# Median price by condition.
p4 <- house %>%
  group_by(condition) %>%
  summarise(PriceMedian = median(price, na.rm = TRUE)) %>%
  ggplot(aes(x = condition, y = PriceMedian)) +
  geom_bar(stat = "identity", colour = "white", fill = "light green") +
  labs(
    x = "Condition",
    y = "Median Price",
    title = "Condition and Median Price"
  ) +
  theme_bw()

# Arrange the four plots in a 2-column grid.
# Rmisc is called through `::` instead of being attached with library():
# attaching it also attaches plyr, which masks dplyr verbs such as summarise()
# and mutate() for everything that runs afterwards.
Rmisc::multiplot(p1, p2, p3, p4, cols = 2)

Correlation

From the correlation plot, we see that price is relatively highly correlated with sqft_living and grade (0.70 and 0.667, respectively).

Additionally, sqft_living and grade have a high correlation of 0.762.

This poses some questions for our investigations about the relationships between these variables.

The variables that have low correlation with price are sqft_lot, floors, waterfront, view, sqft_basement, yr_built, yr_renovated, zipcode, sqft_lot15.

# Drop the listing recorded with 33 bedrooms (presumably a data-entry outlier).
# A logical mask is used instead of `-which(...)`: when no row matches,
# `-which(...)` is `-integer(0)` and the subset would silently return ZERO rows
# instead of the full data. `%in%` never yields NA, so rows with a missing
# bedroom count are kept, exactly as the original negative index kept them.
house2 <- house[!(house$bedrooms %in% 33), ]

# Keep columns 3 to 21 (price through sqft_lot15): the numeric modelling frame
# used by every later section.
house3 <- house2[, 3:21]

# Pairwise correlation matrix of the modelling frame, drawn as a correlogram.
corrplot(cor(house3))

Variable Importance (VIP)

# Resampling scheme for caret: 5-fold cross-validation.
fitControl <- trainControl(method = "cv", number = 5)

# Linear regression of price on every other column of house3, with RMSE as the
# resampling metric.
KCHouseDataModel <- train(
  price ~ .,
  data = house3,
  method = "lm",
  trControl = fitControl,
  metric = "RMSE"
)

# Variable-importance scores for the fitted model (plotted further below).
importance <- varImp(KCHouseDataModel)

# Coefficient table and fit statistics of the final lm refit.
summary(KCHouseDataModel$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1294571   -99124    -9548    77493  4319054 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.046e+06  2.930e+06   2.405  0.01618 *  
## bedrooms      -3.911e+04  1.969e+03 -19.864  < 2e-16 ***
## bathrooms      4.227e+04  3.256e+03  12.981  < 2e-16 ***
## sqft_living    1.521e+02  4.394e+00  34.617  < 2e-16 ***
## sqft_lot       1.262e-01  4.788e-02   2.635  0.00842 ** 
## floors         6.574e+03  3.593e+03   1.830  0.06732 .  
## waterfront     5.815e+05  1.735e+04  33.524  < 2e-16 ***
## view           5.261e+04  2.139e+03  24.601  < 2e-16 ***
## condition      2.630e+04  2.350e+03  11.195  < 2e-16 ***
## grade          9.539e+04  2.153e+03  44.312  < 2e-16 ***
## sqft_above     3.108e+01  4.357e+00   7.134 1.01e-12 ***
## sqft_basement         NA         NA      NA       NA    
## yr_built      -2.631e+03  7.262e+01 -36.225  < 2e-16 ***
## yr_renovated   1.959e+01  3.653e+00   5.363 8.29e-08 ***
## zipcode       -5.852e+02  3.296e+01 -17.755  < 2e-16 ***
## lat            6.016e+05  1.073e+04  56.082  < 2e-16 ***
## long          -2.148e+05  1.312e+04 -16.366  < 2e-16 ***
## sqft_living15  2.170e+01  3.445e+00   6.300 3.03e-10 ***
## sqft_lot15    -3.883e-01  7.321e-02  -5.304 1.14e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 201100 on 21594 degrees of freedom
## Multiple R-squared:  0.7003, Adjusted R-squared:    0.7 
## F-statistic:  2968 on 17 and 21594 DF,  p-value: < 2.2e-16
# Plot the variable-importance scores obtained from varImp() on the caret model.
plot(importance)

PCA

library(factoextra)
library(robustHD)
# NOTE(review): this coerces the raw `date` column to numeric in place. If the
# column holds date strings the result will be NA (or factor codes on old R)
# rather than a usable date -- confirm the intended parsing. `date` is not part
# of house3, so the PCA below does not depend on it.
house$date <- as.numeric(house$date)

# The standardized copy of house3 that used to be computed here was never used
# before being recomputed in the clustering section, so it has been dropped.

# PCA on a hand-picked subset: location, price, size and quality variables.
PCAData <- house3 %>%
  select(lat, long, price, sqft_living, bedrooms, bathrooms, grade)

# scale. = TRUE gives every variable unit variance before the decomposition.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
pca <- prcomp(PCAData, scale. = TRUE)

# Standard deviation and (cumulative) proportion of variance per component.
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.8609 1.1204 0.9122 0.81628 0.60265 0.51550
## Proportion of Variance 0.4947 0.1793 0.1189 0.09519 0.05188 0.03796
## Cumulative Proportion  0.4947 0.6741 0.7929 0.88810 0.93998 0.97794
##                            PC7
## Standard deviation     0.39294
## Proportion of Variance 0.02206
## Cumulative Proportion  1.00000
# Print the fitted PCA object: component standard deviations and the rotation (loadings) matrix.
pca
## Standard deviations (1, .., p=7):
## [1] 1.8609259 1.1203936 0.9121597 0.8162776 0.6026476 0.5155043 0.3929373
## 
## Rotation (n x k) = (7 x 7):
##                    PC1         PC2         PC3          PC4        PC5
## lat         0.07961401 -0.71828495 -0.43041544  0.488916830 -0.1972406
## long        0.14574047  0.55939672 -0.77562657  0.192788533  0.1330083
## price       0.42128483 -0.34239729 -0.07232652 -0.276362473  0.5581812
## sqft_living 0.50236947  0.05252836  0.06079082 -0.061051836  0.1180438
## bedrooms    0.34886842  0.18620984  0.42936625  0.716229840  0.2584597
## bathrooms   0.45699923  0.11657685  0.09721982 -0.007361381 -0.7333248
## grade       0.46044312 -0.05369028 -0.10204044 -0.361455388 -0.1154889
##                     PC6         PC7
## lat          0.07467665  0.09460845
## long        -0.06337762 -0.07323312
## price       -0.44068243 -0.34337819
## sqft_living -0.04459634  0.84942313
## bedrooms     0.17113265 -0.22402758
## bathrooms   -0.43350421 -0.20582650
## grade        0.75960816 -0.23174341
# Project the selected variables onto the principal components and keep the
# scores as a data frame.
KCHouseData_pca <- as.data.frame(predict(pca, newdata = PCAData))

# Biplot of the first two components; scale = 0 is passed through unchanged.
biplot(pca, scale = 0)

# Scatter of sale price (x) against year built (y).
plot(house3$price, house3$yr_built)

# Second PCA, now on ALL columns of house3 (this overwrites `pca`).
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
# NOTE(review): the output below shows PC19 with ~zero variance, loading only
# on sqft_living, sqft_above and sqft_basement -- presumably because
# sqft_living is an exact linear combination of the other two.
pca <- prcomp(house3, scale. = TRUE)

# Standard deviation and (cumulative) proportion of variance per component.
summary(pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4    PC5     PC6
## Standard deviation     2.3852 1.5540 1.35837 1.17428 1.0984 1.00345
## Proportion of Variance 0.2994 0.1271 0.09711 0.07258 0.0635 0.05299
## Cumulative Proportion  0.2994 0.4265 0.52366 0.59624 0.6597 0.71273
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.96809 0.91448 0.81023 0.77351 0.71456 0.64769
## Proportion of Variance 0.04933 0.04401 0.03455 0.03149 0.02687 0.02208
## Cumulative Proportion  0.76206 0.80607 0.84062 0.87211 0.89899 0.92107
##                           PC13    PC14    PC15    PC16    PC17   PC18
## Standard deviation     0.57471 0.54242 0.51435 0.48458 0.43988 0.4271
## Proportion of Variance 0.01738 0.01549 0.01392 0.01236 0.01018 0.0096
## Cumulative Proportion  0.93845 0.95393 0.96786 0.98022 0.99040 1.0000
##                           PC19
## Standard deviation     2.3e-15
## Proportion of Variance 0.0e+00
## Cumulative Proportion  1.0e+00
# Print the all-variable PCA object: component standard deviations and the rotation (loadings) matrix.
pca
## Standard deviations (1, .., p=19):
##  [1] 2.385245e+00 1.554021e+00 1.358368e+00 1.174279e+00 1.098414e+00
##  [6] 1.003446e+00 9.680940e-01 9.144780e-01 8.102282e-01 7.735053e-01
## [11] 7.145557e-01 6.476915e-01 5.747091e-01 5.424195e-01 5.143519e-01
## [16] 4.845791e-01 4.398840e-01 4.270585e-01 2.300467e-15
## 
## Rotation (n x k) = (19 x 19):
##                       PC1         PC2         PC3         PC4         PC5
## price          0.29846528  0.30106632 -0.01011840  0.09632335 -0.03147272
## bedrooms       0.23988877  0.08479305  0.01630986 -0.36875920 -0.08419654
## bathrooms      0.34842170  0.01577263  0.11358832 -0.08055090 -0.02674327
## sqft_living    0.38744805  0.12397314 -0.03296871 -0.11923987 -0.06237750
## sqft_lot       0.09359546 -0.13608093 -0.54988501  0.25558231 -0.26884530
## floors         0.21801984 -0.16067798  0.33365135  0.29915870 -0.05420914
## waterfront     0.05970643  0.20610213 -0.10746681  0.32962509  0.57712162
## view           0.12774088  0.31277612 -0.15313856  0.23367725  0.41224236
## condition     -0.06905815  0.20698928 -0.23741220 -0.38194175  0.03648478
## grade          0.35971419  0.04327567  0.09464055  0.06599437 -0.05386132
## sqft_above     0.37645995 -0.06710388  0.05293391  0.05500496 -0.05129060
## sqft_basement  0.09966514  0.38282790 -0.16746004 -0.35036750 -0.03347992
## yr_built       0.21645587 -0.36433391  0.19772151  0.04330806  0.11171550
## yr_renovated   0.01276498  0.17425204 -0.04818962  0.20730760  0.03982378
## zipcode       -0.13666176  0.33545829  0.16697132  0.29742937 -0.29141613
## lat            0.01967748  0.25390695  0.16782550  0.16433558 -0.46949597
## long           0.15824672 -0.37988928 -0.17933990 -0.12960123  0.13303520
## sqft_living15  0.34487008  0.02768919 -0.04611681 -0.05485557  0.01780271
## sqft_lot15     0.09903096 -0.14472270 -0.55590324  0.24456108 -0.25232812
##                       PC6         PC7          PC8         PC9
## price          0.08426174 -0.26358012  0.014170036 -0.07372481
## bedrooms      -0.19002097  0.24097980  0.123888899  0.43143617
## bathrooms     -0.07406381  0.20448372  0.021140693  0.19063258
## sqft_living   -0.04124077  0.06209280  0.011425168 -0.05661964
## sqft_lot       0.02984584  0.14977446  0.051892308  0.08124263
## floors        -0.00554423  0.08182283  0.343373185  0.19288956
## waterfront     0.14133098 -0.04546196 -0.010272331  0.54423772
## view           0.14439986  0.10913786 -0.070284566 -0.29382348
## condition      0.21350339 -0.27693625  0.613869765  0.10849645
## grade          0.08230519 -0.06090289  0.018124064 -0.23341909
## sqft_above    -0.03560777 -0.09735712  0.240159282 -0.09615442
## sqft_basement -0.01895985  0.31101759 -0.425642049  0.06241174
## yr_built       0.15856100  0.22897936 -0.193895464  0.10642732
## yr_renovated  -0.87618634 -0.19471508  0.008471306  0.01224206
## zipcode        0.08304450  0.33494255  0.137254647 -0.03896069
## lat            0.21960991 -0.47109210 -0.341114401  0.34797371
## long          -0.03518728 -0.36281049 -0.252291012  0.13473731
## sqft_living15  0.05835512 -0.16171223 -0.068947539 -0.32620004
## sqft_lot15     0.03550735  0.12175648  0.030790944  0.06715275
##                       PC10         PC11         PC12         PC13
## price          0.013457680 -0.207669848  0.223733033 -0.218418070
## bedrooms      -0.460853500  0.329757615  0.173582189  0.220789250
## bathrooms      0.289276754  0.010119223 -0.038905161  0.001044943
## sqft_living   -0.080635441 -0.102858342 -0.101992589 -0.183560313
## sqft_lot       0.038562567 -0.008702018  0.130421963 -0.072500560
## floors         0.296731918  0.187873326  0.149755668 -0.432626954
## waterfront    -0.175655373 -0.330181593 -0.133756068  0.033067923
## view           0.085154850  0.674752307  0.164636668  0.081530361
## condition      0.388032909  0.062566750 -0.200013976  0.181413166
## grade          0.095188608 -0.205441964 -0.006621063  0.258232887
## sqft_above    -0.262097810 -0.044384932 -0.077785691 -0.056255731
## sqft_basement  0.323063131 -0.130407852 -0.066116465 -0.275671975
## yr_built       0.345022907 -0.024331748 -0.143061987  0.487399780
## yr_renovated   0.225841883  0.011938172 -0.106162647  0.201471357
## zipcode       -0.137047929  0.105074745 -0.671671399 -0.027557941
## lat            0.062774085  0.215846893  0.086371878  0.192461931
## long          -0.009861955  0.334520285 -0.488753111 -0.343689997
## sqft_living15 -0.207698103 -0.043284251 -0.209160323  0.185235498
## sqft_lot15     0.030124154 -0.022989712 -0.005020333  0.150156837
##                       PC14         PC15        PC16         PC17
## price          0.255662125  0.291744146  0.22531567  0.337924214
## bedrooms       0.001509477 -0.056599876  0.25734384  0.146405427
## bathrooms      0.182867995  0.252596528 -0.60248979  0.397253653
## sqft_living    0.008998624  0.068690695 -0.07810347 -0.495317814
## sqft_lot       0.457492485 -0.509286964 -0.07993086  0.001678487
## floors        -0.373923293 -0.247610746  0.15820301  0.015094088
## waterfront    -0.066753427 -0.095520136 -0.02665919 -0.023163046
## view           0.051662685  0.064299180 -0.02763026 -0.089486890
## condition      0.002293336 -0.057577637  0.01743831 -0.063355924
## grade          0.103814979 -0.090954732  0.51234498  0.138220541
## sqft_above     0.109518210  0.153488297 -0.16281688 -0.477531658
## sqft_basement -0.186240691 -0.144636542  0.14255721 -0.134408614
## yr_built       0.132952940  0.009544158  0.13043400 -0.185204006
## yr_renovated   0.002680327 -0.052987686  0.04178890 -0.057657454
## zipcode        0.120970810  0.038447261  0.09903622  0.113591723
## lat           -0.050669609 -0.055913557 -0.12699631 -0.169466328
## long           0.125458590  0.071632861  0.19218820  0.134193792
## sqft_living15 -0.432984572 -0.457084823 -0.28607067  0.289643166
## sqft_lot15    -0.499306229  0.481236091  0.08899405  0.045589312
##                       PC18          PC19
## price          0.520756468 -1.089575e-15
## bedrooms       0.073411919 -1.780893e-15
## bathrooms     -0.266106313  5.863089e-16
## sqft_living   -0.006110211 -6.992643e-01
## sqft_lot       0.035192671  1.037466e-16
## floors         0.041594655  9.148243e-16
## waterfront    -0.093049447  6.827248e-17
## view          -0.044151649 -4.593187e-16
## condition      0.053733426 -1.069604e-16
## grade         -0.607104257  4.679536e-16
## sqft_above    -0.013415040  6.304672e-01
## sqft_basement  0.012420227  3.369577e-01
## yr_built       0.440140107  3.213074e-16
## yr_renovated   0.076529148  9.849149e-17
## zipcode        0.086812909  1.126812e-16
## lat           -0.079826055 -1.713298e-16
## long          -0.067015517  6.731105e-17
## sqft_living15  0.221189245 -1.539254e-16
## sqft_lot15    -0.017196759 -1.602317e-16

CART Tree

library(rpart)
# Grow a regression tree for price on all other columns of house3; a node
# needs at least 6 observations before a split is attempted.
tree <- rpart(price ~ ., data = house3, control = rpart.control(minsplit = 6))

# Plot the full tree with uniform vertical spacing and label every node
# (all = TRUE) including observation counts (use.n = TRUE).
plot(tree, uniform = TRUE, main = "Regression Tree for KC House Data ")
text(tree, use.n = TRUE, all = TRUE, cex = .8)

# Simplify the tree at a complexity parameter read off the cp table / graph.
# NOTE(review): 0.010 appears to equal rpart.control()'s default cp, in which
# case this prune leaves the tree unchanged -- grow with a smaller cp (or pick
# a larger pruning cp) if real simplification is intended.
pfit <- prune(tree, cp = 0.010) # from cptable

# Plot the PRUNED tree. The original code re-plotted `tree` here, so the
# figure titled "Pruned" never showed the result of prune().
plot(pfit, uniform = TRUE, main = "Pruned Regression Tree for PRices")
text(pfit, use.n = TRUE, all = TRUE, cex = .8)

Clustering

library(factoextra)
library(robustHD)

# The repeated `house$date <- as.numeric(house$date)` was removed from this
# section: the same coercion already runs in the PCA section, so a second pass
# is a no-op, and `date` is not part of house3 / the clustering input.

# Standardize every column of the modelling frame so that no variable
# dominates the Euclidean distances used by k-means.
stdhouse <- standardize(house3)

# k-means draws its starting centres at random (25 restarts each); fix the
# seed so that cluster assignments and the plot are reproducible across runs.
set.seed(123)
k2 <- kmeans(stdhouse, centers = 8, nstart = 25)
k3 <- kmeans(stdhouse, centers = 2, nstart = 25)

# Visualise the 8-cluster solution.
fviz_cluster(k2, data = stdhouse)

# Structure of the kmeans result (component names and lengths).
summary(k2)
##              Length Class  Mode   
## cluster      21612  -none- numeric
## centers        152  -none- numeric
## totss            1  -none- numeric
## withinss         8  -none- numeric
## tot.withinss     1  -none- numeric
## betweenss        1  -none- numeric
## size             8  -none- numeric
## iter             1  -none- numeric
## ifault           1  -none- numeric
# Show the first rows of the modelling frame (price plus the predictor columns).
head(house3)
##     price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## 1  221900        3      1.00        1180     5650      1          0    0
## 2  538000        3      2.25        2570     7242      2          0    0
## 3  180000        2      1.00         770    10000      1          0    0
## 4  604000        4      3.00        1960     5000      1          0    0
## 5  510000        3      2.00        1680     8080      1          0    0
## 6 1225000        4      4.50        5420   101930      1          0    0
##   condition grade sqft_above sqft_basement yr_built yr_renovated zipcode
## 1         3     7       1180             0     1955            0   98178
## 2         3     7       2170           400     1951         1991   98125
## 3         3     6        770             0     1933            0   98028
## 4         5     7       1050           910     1965            0   98136
## 5         3     8       1680             0     1987            0   98074
## 6         3    11       3890          1530     2001            0   98053
##       lat     long sqft_living15 sqft_lot15
## 1 47.5112 -122.257          1340       5650
## 2 47.7210 -122.319          1690       7639
## 3 47.7379 -122.233          2720       8062
## 4 47.5208 -122.393          1360       5000
## 5 47.6168 -122.045          1800       7503
## 6 47.6561 -122.005          4760     101930
# One-sided formula naming the nine variables that enter the factor analysis.
house_FA <- ~ price + bedrooms + bathrooms + sqft_living + sqft_lot +
  floors + waterfront + view + grade

# Maximum-likelihood factor analysis with 4 factors and no rotation.
house_FA1 <- factanal(
  x = house_FA,
  factors = 4,
  data = house3,
  rotation = "none"
)

# Uniquenesses, loadings and the sufficiency test for the 4-factor model.
house_FA1
## 
## Call:
## factanal(x = house_FA, factors = 4, data = house3, rotation = "none")
## 
## Uniquenesses:
##       price    bedrooms   bathrooms sqft_living    sqft_lot      floors 
##       0.309       0.450       0.257       0.083       0.960       0.414 
##  waterfront        view       grade 
##       0.510       0.603       0.221 
## 
## Loadings:
##             Factor1 Factor2 Factor3 Factor4
## price        0.749   0.332          -0.120 
## bedrooms     0.582  -0.211  -0.323   0.250 
## bathrooms    0.812  -0.201           0.179 
## sqft_living  0.948          -0.133         
## sqft_lot     0.151                         
## floors       0.447  -0.291   0.530   0.147 
## waterfront   0.137   0.582   0.147   0.333 
## view         0.314   0.527           0.134 
## grade        0.833           0.220  -0.191 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      3.493   0.898   0.499   0.302
## Proportion Var   0.388   0.100   0.055   0.034
## Cumulative Var   0.388   0.488   0.543   0.577
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 419.33 on 6 degrees of freedom.
## The p-value is 1.95e-87
# Factor analysis with 3 factors and a varimax rotation.
house_FA2 <- factanal(
  x = house_FA,
  factors = 3,
  data = house3,
  rotation = "varimax"
)

# Uniquenesses, rotated loadings and the sufficiency test for the 3-factor model.
house_FA2
## 
## Call:
## factanal(x = house_FA, factors = 3, data = house3, rotation = "varimax")
## 
## Uniquenesses:
##       price    bedrooms   bathrooms sqft_living    sqft_lot      floors 
##       0.319       0.570       0.307       0.048       0.962       0.360 
##  waterfront        view       grade 
##       0.728       0.621       0.301 
## 
## Loadings:
##             Factor1 Factor2 Factor3
## price        0.532   0.296   0.557 
## bedrooms     0.627   0.171         
## bathrooms    0.610   0.560         
## sqft_living  0.876   0.361   0.233 
## sqft_lot     0.183                 
## floors               0.796         
## waterfront                   0.521 
## view         0.158           0.594 
## grade        0.570   0.531   0.304 
## 
##                Factor1 Factor2 Factor3
## SS loadings      2.205   1.478   1.101
## Proportion Var   0.245   0.164   0.122
## Cumulative Var   0.245   0.409   0.532
## 
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 2845.3 on 12 degrees of freedom.
## The p-value is 0
# Factor analysis with 3 factors and a varimax rotation.
# NOTE(review): this call is identical to the one that produces house_FA2
# (same formula, factors, data and rotation), so the output is the same --
# confirm whether a different rotation or factor count was intended here.
house_FA3 <- factanal(
  x = house_FA,
  factors = 3,
  data = house3,
  rotation = "varimax"
)

# Uniquenesses, rotated loadings and the sufficiency test for this fit.
house_FA3
## 
## Call:
## factanal(x = house_FA, factors = 3, data = house3, rotation = "varimax")
## 
## Uniquenesses:
##       price    bedrooms   bathrooms sqft_living    sqft_lot      floors 
##       0.319       0.570       0.307       0.048       0.962       0.360 
##  waterfront        view       grade 
##       0.728       0.621       0.301 
## 
## Loadings:
##             Factor1 Factor2 Factor3
## price        0.532   0.296   0.557 
## bedrooms     0.627   0.171         
## bathrooms    0.610   0.560         
## sqft_living  0.876   0.361   0.233 
## sqft_lot     0.183                 
## floors               0.796         
## waterfront                   0.521 
## view         0.158           0.594 
## grade        0.570   0.531   0.304 
## 
##                Factor1 Factor2 Factor3
## SS loadings      2.205   1.478   1.101
## Proportion Var   0.245   0.164   0.122
## Cumulative Var   0.245   0.409   0.532
## 
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 2845.3 on 12 degrees of freedom.
## The p-value is 0