library(ggplot2)
library(devtools)
library(dplyr)
library(stringr)
library(ggmap)
library(tidyverse)
library(lubridate)
library(DT)
library(caret)
library(leaflet)
library(corrplot)
library(boot)
library(tibble)
# Load the King County house-sales data from the working directory.
# The former `rm(list = ls())` call was dropped: it silently deletes every
# object in the caller's global environment, yet leaves attached packages and
# options untouched, so it never gave a truly clean session anyway. Restart R
# when a clean slate is needed.
house <- read.csv("kc_house_data.csv")
# Row count, as a quick sanity check on the import.
nrow(house)
## [1] 21613
library(leaflet)

# Bucket every sale price into one of six right-closed price bands.
house$PriceBin <- cut(
  house$price,
  breaks = c(0, 2.5e5, 5e5, 7.5e5, 1e6, 2e6, 1e7)
)

# The median coordinates serve as the initial centre of the map.
center_lon <- median(house$long, na.rm = TRUE)
center_lat <- median(house$lat, na.rm = TRUE)

# One colour per price band.
factpal <- colorFactor(
  palette = c("red", "orange", "yellow", "green", "blue", "purple"),
  domain = house$PriceBin
)

# Interactive map: one circle per sale, coloured by price band, plus a legend.
leaflet(house) %>%
  addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(
    lng = ~long,
    lat = ~lat,
    color = ~factpal(PriceBin)
  ) %>%
  # Initial viewport.
  setView(lng = center_lon, lat = center_lat, zoom = 12) %>%
  addLegend(
    position = "bottomleft",
    pal = factpal,
    values = ~PriceBin,
    title = "House Prices - Seattle",
    opacity = 1
  )
Generally, these trends are as expected. We hypothesized that as the number of bedrooms, the number of bathrooms, the grade, and the condition increase, the median price would also increase. The visuals below agree with this hypothesis.
library(dplyr)

# Build a bar chart of the median sale price for each distinct value of one
# column.
#
# data  : data frame holding a `price` column and the column named by `var`.
# var   : name (string) of the grouping column, e.g. "bedrooms".
# label : human-readable name used for the x-axis label and the plot title.
# fill  : fill colour of the bars.
#
# Returns a ggplot object.
#
# The dplyr verbs are namespace-qualified on purpose: `library(Rmisc)` further
# down attaches plyr, whose `summarise()` masks dplyr's and ignores grouping,
# so an unqualified call would collapse the data to a single row whenever this
# section is re-run in the same session.
plot_median_price <- function(data, var, label, fill) {
  medians <- data %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(var))) %>%
    dplyr::summarise(PriceMedian = median(price, na.rm = TRUE))

  ggplot(medians, aes(x = .data[[var]], y = PriceMedian)) +
    # geom_col() is the direct equivalent of geom_bar(stat = "identity").
    geom_col(colour = "white", fill = fill) +
    labs(
      x = label,
      y = "Median Price",
      title = paste(label, "and Median Price")
    ) +
    theme_bw()
}

p1 <- plot_median_price(house, "bedrooms", "Bedrooms", "lavender")
p2 <- plot_median_price(house, "bathrooms", "Bathrooms", "light pink")
p3 <- plot_median_price(house, "grade", "Grade", "light blue")
p4 <- plot_median_price(house, "condition", "Condition", "light green")

# Arrange the four charts in a 2-column grid.
library(Rmisc)
multiplot(p1, p2, p3, p4, cols = 2)
From the correlation plot, we see that price is most strongly correlated with sqft_living and grade (0.70 and 0.667, respectively).
Additionally, sqft_living and grade are themselves highly correlated (0.762).
This raises questions for our investigation about the relationships among these variables.
The variables that have a low correlation with price are sqft_lot, floors, waterfront, view, sqft_basement, yr_built, yr_renovated, zipcode, and sqft_lot15.
# Drop the record that lists 33 bedrooms (presumably a data-entry error).
# A logical mask is used instead of `-which(...)`: when nothing matches,
# `which()` returns integer(0), and negative indexing with integer(0) drops
# EVERY row. `%in%` never yields NA, so rows with a missing bedroom count are
# kept, exactly as before.
house2 <- house[!(house$bedrooms %in% 33), ]
# Keep columns 3 to 21 (price through sqft_lot15). This leaves out the first
# two columns and anything after column 21, such as the PriceBin factor.
house3 <- house2[, 3:21]
# Visualise the pairwise correlations between the retained variables.
corrplot(cor(house3))
# Linear regression of price on every other column of house3, assessed with
# 5-fold cross-validation.
# The fold assignment is random, so seed the RNG first to make the resampled
# RMSE reproducible from run to run.
set.seed(1)
fitControl <- trainControl(method = "cv", number = 5)
KCHouseDataModel <- train(
  price ~ .,
  data = house3,
  method = "lm",
  trControl = fitControl,
  metric = "RMSE"
)
# Variable-importance scores of the fitted model (plotted later on).
importance <- varImp(KCHouseDataModel)
# Coefficient table of the model refitted on the full data set.
# NOTE(review): the summary reports sqft_basement as not estimable
# (singularity); it looks linearly dependent on sqft_living and sqft_above.
# Consider dropping one of the three.
summary(KCHouseDataModel$finalModel)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1294571 -99124 -9548 77493 4319054
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.046e+06 2.930e+06 2.405 0.01618 *
## bedrooms -3.911e+04 1.969e+03 -19.864 < 2e-16 ***
## bathrooms 4.227e+04 3.256e+03 12.981 < 2e-16 ***
## sqft_living 1.521e+02 4.394e+00 34.617 < 2e-16 ***
## sqft_lot 1.262e-01 4.788e-02 2.635 0.00842 **
## floors 6.574e+03 3.593e+03 1.830 0.06732 .
## waterfront 5.815e+05 1.735e+04 33.524 < 2e-16 ***
## view 5.261e+04 2.139e+03 24.601 < 2e-16 ***
## condition 2.630e+04 2.350e+03 11.195 < 2e-16 ***
## grade 9.539e+04 2.153e+03 44.312 < 2e-16 ***
## sqft_above 3.108e+01 4.357e+00 7.134 1.01e-12 ***
## sqft_basement NA NA NA NA
## yr_built -2.631e+03 7.262e+01 -36.225 < 2e-16 ***
## yr_renovated 1.959e+01 3.653e+00 5.363 8.29e-08 ***
## zipcode -5.852e+02 3.296e+01 -17.755 < 2e-16 ***
## lat 6.016e+05 1.073e+04 56.082 < 2e-16 ***
## long -2.148e+05 1.312e+04 -16.366 < 2e-16 ***
## sqft_living15 2.170e+01 3.445e+00 6.300 3.03e-10 ***
## sqft_lot15 -3.883e-01 7.321e-02 -5.304 1.14e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 201100 on 21594 degrees of freedom
## Multiple R-squared: 0.7003, Adjusted R-squared: 0.7
## F-statistic: 2968 on 17 and 21594 DF, p-value: < 2.2e-16
# Plot the variable-importance scores computed with varImp().
plot(importance)
library(factoextra)
library(robustHD)
# Turn the sale date into a number (days since 1970-01-01).
# Calling as.numeric() directly on the raw column does not parse a date: a
# character column becomes NA (with a warning) and a factor becomes its
# arbitrary level codes.
# NOTE(review): assumes each value starts with an 8-digit YYYYMMDD stamp, as
# in the public King County data set; confirm against the CSV.
# The guard keeps the step idempotent when the section is re-run.
if (!is.numeric(house$date)) {
  house$date <- as.numeric(
    as.Date(substr(as.character(house$date), 1, 8), format = "%Y%m%d")
  )
}
# Standardised copy of the modelling data.
stdhouse <- standardize(house3)
# PCA on location, price and the main size/quality variables.
# select() is namespace-qualified so that a same-named function from another
# attached package cannot mask it.
PCAData <- house3 %>%
  dplyr::select(lat, long, price, sqft_living, bedrooms, bathrooms, grade)
# `TRUE` instead of `T`: `T` is an ordinary variable that can be reassigned.
pca <- prcomp(PCAData, scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8609 1.1204 0.9122 0.81628 0.60265 0.51550
## Proportion of Variance 0.4947 0.1793 0.1189 0.09519 0.05188 0.03796
## Cumulative Proportion 0.4947 0.6741 0.7929 0.88810 0.93998 0.97794
## PC7
## Standard deviation 0.39294
## Proportion of Variance 0.02206
## Cumulative Proportion 1.00000
# Show the component standard deviations and the loading (rotation) matrix.
print(pca)
## Standard deviations (1, .., p=7):
## [1] 1.8609259 1.1203936 0.9121597 0.8162776 0.6026476 0.5155043 0.3929373
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4 PC5
## lat 0.07961401 -0.71828495 -0.43041544 0.488916830 -0.1972406
## long 0.14574047 0.55939672 -0.77562657 0.192788533 0.1330083
## price 0.42128483 -0.34239729 -0.07232652 -0.276362473 0.5581812
## sqft_living 0.50236947 0.05252836 0.06079082 -0.061051836 0.1180438
## bedrooms 0.34886842 0.18620984 0.42936625 0.716229840 0.2584597
## bathrooms 0.45699923 0.11657685 0.09721982 -0.007361381 -0.7333248
## grade 0.46044312 -0.05369028 -0.10204044 -0.361455388 -0.1154889
## PC6 PC7
## lat 0.07467665 0.09460845
## long -0.06337762 -0.07323312
## price -0.44068243 -0.34337819
## sqft_living -0.04459634 0.84942313
## bedrooms 0.17113265 -0.22402758
## bathrooms -0.43350421 -0.20582650
## grade 0.75960816 -0.23174341
# Component scores of the selected variables, stored as a data frame.
KCHouseData_pca <- as.data.frame(predict(pca, newdata = PCAData))
# Biplot of the first two principal components.
biplot(pca, scale = 0)
# Scatter plot with sale price on the x-axis and year built on the y-axis.
plot(house3$price, house3$yr_built)
# Repeat the PCA on every column of house3 (this replaces the previous `pca`).
# `TRUE` instead of `T`: `T` is an ordinary variable that can be reassigned.
# NOTE(review): the last component comes out with numerically zero variance,
# which points to a linear dependence among sqft_living, sqft_above and
# sqft_basement.
pca <- prcomp(house3, scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.3852 1.5540 1.35837 1.17428 1.0984 1.00345
## Proportion of Variance 0.2994 0.1271 0.09711 0.07258 0.0635 0.05299
## Cumulative Proportion 0.2994 0.4265 0.52366 0.59624 0.6597 0.71273
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.96809 0.91448 0.81023 0.77351 0.71456 0.64769
## Proportion of Variance 0.04933 0.04401 0.03455 0.03149 0.02687 0.02208
## Cumulative Proportion 0.76206 0.80607 0.84062 0.87211 0.89899 0.92107
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.57471 0.54242 0.51435 0.48458 0.43988 0.4271
## Proportion of Variance 0.01738 0.01549 0.01392 0.01236 0.01018 0.0096
## Cumulative Proportion 0.93845 0.95393 0.96786 0.98022 0.99040 1.0000
## PC19
## Standard deviation 2.3e-15
## Proportion of Variance 0.0e+00
## Cumulative Proportion 1.0e+00
# Show the standard deviations and loadings of the PCA on all columns.
print(pca)
## Standard deviations (1, .., p=19):
## [1] 2.385245e+00 1.554021e+00 1.358368e+00 1.174279e+00 1.098414e+00
## [6] 1.003446e+00 9.680940e-01 9.144780e-01 8.102282e-01 7.735053e-01
## [11] 7.145557e-01 6.476915e-01 5.747091e-01 5.424195e-01 5.143519e-01
## [16] 4.845791e-01 4.398840e-01 4.270585e-01 2.300467e-15
##
## Rotation (n x k) = (19 x 19):
## PC1 PC2 PC3 PC4 PC5
## price 0.29846528 0.30106632 -0.01011840 0.09632335 -0.03147272
## bedrooms 0.23988877 0.08479305 0.01630986 -0.36875920 -0.08419654
## bathrooms 0.34842170 0.01577263 0.11358832 -0.08055090 -0.02674327
## sqft_living 0.38744805 0.12397314 -0.03296871 -0.11923987 -0.06237750
## sqft_lot 0.09359546 -0.13608093 -0.54988501 0.25558231 -0.26884530
## floors 0.21801984 -0.16067798 0.33365135 0.29915870 -0.05420914
## waterfront 0.05970643 0.20610213 -0.10746681 0.32962509 0.57712162
## view 0.12774088 0.31277612 -0.15313856 0.23367725 0.41224236
## condition -0.06905815 0.20698928 -0.23741220 -0.38194175 0.03648478
## grade 0.35971419 0.04327567 0.09464055 0.06599437 -0.05386132
## sqft_above 0.37645995 -0.06710388 0.05293391 0.05500496 -0.05129060
## sqft_basement 0.09966514 0.38282790 -0.16746004 -0.35036750 -0.03347992
## yr_built 0.21645587 -0.36433391 0.19772151 0.04330806 0.11171550
## yr_renovated 0.01276498 0.17425204 -0.04818962 0.20730760 0.03982378
## zipcode -0.13666176 0.33545829 0.16697132 0.29742937 -0.29141613
## lat 0.01967748 0.25390695 0.16782550 0.16433558 -0.46949597
## long 0.15824672 -0.37988928 -0.17933990 -0.12960123 0.13303520
## sqft_living15 0.34487008 0.02768919 -0.04611681 -0.05485557 0.01780271
## sqft_lot15 0.09903096 -0.14472270 -0.55590324 0.24456108 -0.25232812
## PC6 PC7 PC8 PC9
## price 0.08426174 -0.26358012 0.014170036 -0.07372481
## bedrooms -0.19002097 0.24097980 0.123888899 0.43143617
## bathrooms -0.07406381 0.20448372 0.021140693 0.19063258
## sqft_living -0.04124077 0.06209280 0.011425168 -0.05661964
## sqft_lot 0.02984584 0.14977446 0.051892308 0.08124263
## floors -0.00554423 0.08182283 0.343373185 0.19288956
## waterfront 0.14133098 -0.04546196 -0.010272331 0.54423772
## view 0.14439986 0.10913786 -0.070284566 -0.29382348
## condition 0.21350339 -0.27693625 0.613869765 0.10849645
## grade 0.08230519 -0.06090289 0.018124064 -0.23341909
## sqft_above -0.03560777 -0.09735712 0.240159282 -0.09615442
## sqft_basement -0.01895985 0.31101759 -0.425642049 0.06241174
## yr_built 0.15856100 0.22897936 -0.193895464 0.10642732
## yr_renovated -0.87618634 -0.19471508 0.008471306 0.01224206
## zipcode 0.08304450 0.33494255 0.137254647 -0.03896069
## lat 0.21960991 -0.47109210 -0.341114401 0.34797371
## long -0.03518728 -0.36281049 -0.252291012 0.13473731
## sqft_living15 0.05835512 -0.16171223 -0.068947539 -0.32620004
## sqft_lot15 0.03550735 0.12175648 0.030790944 0.06715275
## PC10 PC11 PC12 PC13
## price 0.013457680 -0.207669848 0.223733033 -0.218418070
## bedrooms -0.460853500 0.329757615 0.173582189 0.220789250
## bathrooms 0.289276754 0.010119223 -0.038905161 0.001044943
## sqft_living -0.080635441 -0.102858342 -0.101992589 -0.183560313
## sqft_lot 0.038562567 -0.008702018 0.130421963 -0.072500560
## floors 0.296731918 0.187873326 0.149755668 -0.432626954
## waterfront -0.175655373 -0.330181593 -0.133756068 0.033067923
## view 0.085154850 0.674752307 0.164636668 0.081530361
## condition 0.388032909 0.062566750 -0.200013976 0.181413166
## grade 0.095188608 -0.205441964 -0.006621063 0.258232887
## sqft_above -0.262097810 -0.044384932 -0.077785691 -0.056255731
## sqft_basement 0.323063131 -0.130407852 -0.066116465 -0.275671975
## yr_built 0.345022907 -0.024331748 -0.143061987 0.487399780
## yr_renovated 0.225841883 0.011938172 -0.106162647 0.201471357
## zipcode -0.137047929 0.105074745 -0.671671399 -0.027557941
## lat 0.062774085 0.215846893 0.086371878 0.192461931
## long -0.009861955 0.334520285 -0.488753111 -0.343689997
## sqft_living15 -0.207698103 -0.043284251 -0.209160323 0.185235498
## sqft_lot15 0.030124154 -0.022989712 -0.005020333 0.150156837
## PC14 PC15 PC16 PC17
## price 0.255662125 0.291744146 0.22531567 0.337924214
## bedrooms 0.001509477 -0.056599876 0.25734384 0.146405427
## bathrooms 0.182867995 0.252596528 -0.60248979 0.397253653
## sqft_living 0.008998624 0.068690695 -0.07810347 -0.495317814
## sqft_lot 0.457492485 -0.509286964 -0.07993086 0.001678487
## floors -0.373923293 -0.247610746 0.15820301 0.015094088
## waterfront -0.066753427 -0.095520136 -0.02665919 -0.023163046
## view 0.051662685 0.064299180 -0.02763026 -0.089486890
## condition 0.002293336 -0.057577637 0.01743831 -0.063355924
## grade 0.103814979 -0.090954732 0.51234498 0.138220541
## sqft_above 0.109518210 0.153488297 -0.16281688 -0.477531658
## sqft_basement -0.186240691 -0.144636542 0.14255721 -0.134408614
## yr_built 0.132952940 0.009544158 0.13043400 -0.185204006
## yr_renovated 0.002680327 -0.052987686 0.04178890 -0.057657454
## zipcode 0.120970810 0.038447261 0.09903622 0.113591723
## lat -0.050669609 -0.055913557 -0.12699631 -0.169466328
## long 0.125458590 0.071632861 0.19218820 0.134193792
## sqft_living15 -0.432984572 -0.457084823 -0.28607067 0.289643166
## sqft_lot15 -0.499306229 0.481236091 0.08899405 0.045589312
## PC18 PC19
## price 0.520756468 -1.089575e-15
## bedrooms 0.073411919 -1.780893e-15
## bathrooms -0.266106313 5.863089e-16
## sqft_living -0.006110211 -6.992643e-01
## sqft_lot 0.035192671 1.037466e-16
## floors 0.041594655 9.148243e-16
## waterfront -0.093049447 6.827248e-17
## view -0.044151649 -4.593187e-16
## condition 0.053733426 -1.069604e-16
## grade -0.607104257 4.679536e-16
## sqft_above -0.013415040 6.304672e-01
## sqft_basement 0.012420227 3.369577e-01
## yr_built 0.440140107 3.213074e-16
## yr_renovated 0.076529148 9.849149e-17
## zipcode 0.086812909 1.126812e-16
## lat -0.079826055 -1.713298e-16
## long -0.067015517 6.731105e-17
## sqft_living15 0.221189245 -1.539254e-16
## sqft_lot15 -0.017196759 -1.602317e-16
library(rpart)
# Grow a regression tree for price on all other house3 columns; a node must
# hold at least 6 observations before a split is attempted.
tree <- rpart(price ~ ., data = house3, control = rpart.control(minsplit = 6))
plot(tree, uniform = TRUE, main = "Regression Tree for KC House Data ")
text(tree, use.n = TRUE, all = TRUE, cex = .8)
# Simplify the tree at the complexity parameter picked from the cp table.
# NOTE(review): 0.01 is also rpart.control's default `cp` for growing the
# tree, so this prune probably leaves it unchanged; confirm the intended cp.
pfit <- prune(tree, cp = 0.010)
# Plot the pruned tree. The unpruned `tree` used to be drawn again here, so
# the figure titled "Pruned" did not show the pruned fit.
plot(pfit, uniform = TRUE, main = "Pruned Regression Tree for PRices")
text(pfit, use.n = TRUE, all = TRUE, cex = .8)
library(factoextra)
library(robustHD)
# The repeated `house$date <- as.numeric(house$date)` was removed: the column
# is already converted earlier in the script, and `date` is not part of
# house3, so it plays no role in the clustering.
# Standardise every column so that no variable dominates the distances.
stdhouse <- standardize(house3)
# k-means starts from randomly chosen centres; seed the RNG so the cluster
# assignment is reproducible.
set.seed(1)
k2 <- kmeans(stdhouse, centers = 8, nstart = 25)
k3 <- kmeans(stdhouse, centers = 2, nstart = 25)
# Plot the 8-cluster solution.
fviz_cluster(k2, data = stdhouse)
# summary() on a kmeans object only lists the length of each component;
# inspect k2$size and k2$centers for the clusters themselves.
summary(k2)
## Length Class Mode
## cluster 21612 -none- numeric
## centers 152 -none- numeric
## totss 1 -none- numeric
## withinss 8 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 8 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
# Peek at the first six rows of the modelling data.
head(house3, n = 6L)
## price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## 1 221900 3 1.00 1180 5650 1 0 0
## 2 538000 3 2.25 2570 7242 2 0 0
## 3 180000 2 1.00 770 10000 1 0 0
## 4 604000 4 3.00 1960 5000 1 0 0
## 5 510000 3 2.00 1680 8080 1 0 0
## 6 1225000 4 4.50 5420 101930 1 0 0
## condition grade sqft_above sqft_basement yr_built yr_renovated zipcode
## 1 3 7 1180 0 1955 0 98178
## 2 3 7 2170 400 1951 1991 98125
## 3 3 6 770 0 1933 0 98028
## 4 5 7 1050 910 1965 0 98136
## 5 3 8 1680 0 1987 0 98074
## 6 3 11 3890 1530 2001 0 98053
## lat long sqft_living15 sqft_lot15
## 1 47.5112 -122.257 1340 5650
## 2 47.7210 -122.319 1690 7639
## 3 47.7379 -122.233 2720 8062
## 4 47.5208 -122.393 1360 5000
## 5 47.6168 -122.045 1800 7503
## 6 47.6561 -122.005 4760 101930
# Variables entering the factor analysis, written as a one-sided formula.
house_FA <- ~ price + bedrooms + bathrooms + sqft_living + sqft_lot +
  floors + waterfront + view + grade
# Four-factor solution, left unrotated.
house_FA1 <- factanal(
  x = house_FA,
  factors = 4,
  data = house3,
  rotation = "none"
)
print(house_FA1)
##
## Call:
## factanal(x = house_FA, factors = 4, data = house3, rotation = "none")
##
## Uniquenesses:
## price bedrooms bathrooms sqft_living sqft_lot floors
## 0.309 0.450 0.257 0.083 0.960 0.414
## waterfront view grade
## 0.510 0.603 0.221
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## price 0.749 0.332 -0.120
## bedrooms 0.582 -0.211 -0.323 0.250
## bathrooms 0.812 -0.201 0.179
## sqft_living 0.948 -0.133
## sqft_lot 0.151
## floors 0.447 -0.291 0.530 0.147
## waterfront 0.137 0.582 0.147 0.333
## view 0.314 0.527 0.134
## grade 0.833 0.220 -0.191
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 3.493 0.898 0.499 0.302
## Proportion Var 0.388 0.100 0.055 0.034
## Cumulative Var 0.388 0.488 0.543 0.577
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 419.33 on 6 degrees of freedom.
## The p-value is 1.95e-87
# Three-factor solution with a varimax rotation.
house_FA2 <- factanal(
  x = house_FA,
  factors = 3,
  data = house3,
  rotation = "varimax"
)
print(house_FA2)
##
## Call:
## factanal(x = house_FA, factors = 3, data = house3, rotation = "varimax")
##
## Uniquenesses:
## price bedrooms bathrooms sqft_living sqft_lot floors
## 0.319 0.570 0.307 0.048 0.962 0.360
## waterfront view grade
## 0.728 0.621 0.301
##
## Loadings:
## Factor1 Factor2 Factor3
## price 0.532 0.296 0.557
## bedrooms 0.627 0.171
## bathrooms 0.610 0.560
## sqft_living 0.876 0.361 0.233
## sqft_lot 0.183
## floors 0.796
## waterfront 0.521
## view 0.158 0.594
## grade 0.570 0.531 0.304
##
## Factor1 Factor2 Factor3
## SS loadings 2.205 1.478 1.101
## Proportion Var 0.245 0.164 0.122
## Cumulative Var 0.245 0.409 0.532
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 2845.3 on 12 degrees of freedom.
## The p-value is 0
# Three-factor solution with a varimax rotation.
# NOTE(review): this call has the same arguments as the one that builds
# house_FA2, so both fits are identical. A different rotation or factor count
# may have been intended; confirm.
house_FA3 <- factanal(
  x = house_FA,
  factors = 3,
  data = house3,
  rotation = "varimax"
)
print(house_FA3)
##
## Call:
## factanal(x = house_FA, factors = 3, data = house3, rotation = "varimax")
##
## Uniquenesses:
## price bedrooms bathrooms sqft_living sqft_lot floors
## 0.319 0.570 0.307 0.048 0.962 0.360
## waterfront view grade
## 0.728 0.621 0.301
##
## Loadings:
## Factor1 Factor2 Factor3
## price 0.532 0.296 0.557
## bedrooms 0.627 0.171
## bathrooms 0.610 0.560
## sqft_living 0.876 0.361 0.233
## sqft_lot 0.183
## floors 0.796
## waterfront 0.521
## view 0.158 0.594
## grade 0.570 0.531 0.304
##
## Factor1 Factor2 Factor3
## SS loadings 2.205 1.478 1.101
## Proportion Var 0.245 0.164 0.122
## Cumulative Var 0.245 0.409 0.532
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 2845.3 on 12 degrees of freedom.
## The p-value is 0