The data for this project comes from here. The data from which this was compiled is most likely the King County Tax Assessors Website.

Data Manipulation

require(dplyr)
require(leaflet)
require(ggplot2)

houses <- read.csv("kc_house_data.csv")

#Perhaps we can combine yr_built and yr_renovated into an entity that represents both 
houses$age <- 2015 - apply(select(houses,yr_built,yr_renovated), FUN = max, MARGIN = 1)

#Fix data issues - source: https://gismaps.kingcounty.gov/parcelviewer2/
houses$bedrooms[houses$id==627300145] <- 4
houses$bathrooms[houses$id==627300145] <- 3
houses$bedrooms[houses$id==8141200080] <- 7
houses$bedrooms[houses$id==2402100895] <- 3
houses$bedrooms[houses$id==9822700190] <- 5
houses$bedrooms[houses$id==1346300150] <- 6
houses$bathrooms[houses$id==1346300150] <- 4.25
houses$bedrooms[houses$id==5566100170] <- 6
houses$bathrooms[houses$id==5566100170] <- 3.5
houses$bedrooms[houses$id==7226500100] <- 5
houses$bedrooms[houses$id==424049043] <- 2
houses$bathrooms[houses$id==424049043] <- 2

#Indiviual days would not make scalable predictors. Maybe seasons will.
houses$sold_spring <- case_when(
                            substring(houses$date,5,6) %in% c("03","04","05") ~ 1,
                            TRUE ~ 0
                        )
houses$sold_summer <- case_when(
                            substring(houses$date,5,6) %in% c("06","07","08") ~ 1,
                            TRUE ~ 0
                        )
houses$sold_fall <- case_when(
                            substring(houses$date,5,6) %in% c("09","10","11") ~ 1,
                            TRUE ~ 0
                        )
houses$sold_winter <- case_when(
                            substring(houses$date,5,6) %in% c("12","01","02") ~ 1,
                            TRUE ~ 0
                        )

#Ideally, I'd ping the google maps API for travel times to downtown, but Euclidean distance should be a decent proxy. 
distance_from_seattle <- function(lat, long){
        seattle <- data.frame(lat = 47.607512, long = -122.334231)
        return(sqrt((lat - seattle$lat)^2 + (long - seattle$long)^2))
}

#initialize vector
houses$dist_from_seattle = 100

#correct values using the newly defined function
for(i in 1:nrow(houses)){
        houses$dist_from_seattle[i] <- distance_from_seattle(houses$lat[i], houses$long[i])
}

Non-Hierarchical Clustering

#Since distance to main economic hub has been calculated, lat and long shouldn't be needed in cluster analysis
x <- select(houses, -c(id, date, lat, long))
x.scale <- scale(x, center = T, scale = T)

#After several iterations, 3 clusters consistently offered the most distinguished groups
k=3
set.seed(100, sample.kind = "Rounding")
## Warning in set.seed(100, sample.kind = "Rounding"): non-uniform 'Rounding'
## sampler used
groups = data.frame(memb = factor(kmeans(x.scale,k)$cluster))

with_group <-cbind(houses, groups)

#Create interactive map that allows for cluster toggling
leaflet() %>% 
        addProviderTiles(providers$Stamen.Toner) %>%
        setView(lng = -122, lat = 47.525, zoom = 9.5) %>%
        addCircleMarkers(data = with_group[with_group$memb == 1,],
            ~long, ~lat, radius = 2, color = "#F8766D", stroke = F,
            group = 'Group 1') %>%
        addCircleMarkers(data = with_group[with_group$memb == 2,],
            ~long, ~lat, radius = 2, color = "#00BA38", stroke = F,
            group = 'Group 2') %>%
        addCircleMarkers(data = with_group[with_group$memb == 3,],
            ~long, ~lat, radius = 2, color = "#00BFC4", stroke = F,
            group = 'Group 3') %>%
        addLayersControl(
            overlayGroups = c('Group 1', 'Group 2', 'Group 3'),
            options = layersControlOptions(collapsed = F)
        )

Group 3, the blue group, represents the high end houses that represent houses that are of excellent quality in good locations, and have a high sale price.

#These 3 variables best distinguish between the three clusters. We'll look at each pair of the 3 variables.
with_group %>% group_by(memb) %>% summarize(median(price),median(age),median(sqft_living))
ggplot(with_group, mapping = aes(x = log(sqft_living), y = age, color = memb, shape = memb, alpha = .2))+geom_point()

ggplot(with_group, mapping = aes(x = grade, y = log(sqft_living), color = memb, shape = memb, alpha = .2)) + geom_point() + geom_jitter()

ggplot(with_group, mapping = aes(x = age, y = grade, color = memb, shape = memb, alpha = .2)) + geom_point() + geom_jitter()

From these graphs, a few observations can be made:
* Square feet of living space offers a fairly clear distinction between group 3 homes and “other” homes.
* Between groups 1 and 2, group 1 homes are generally a bit cheaper, older and smaller.

num_houses_by_groups_and_zip <- with_group %>% group_by(zipcode, memb) %>% summarize(group_count = n()) %>% data.frame
num_houses_by_zip <- with_group %>% group_by(zipcode) %>% summarize(total_count = n()) %>% data.frame
top_tier_zips <- merge(x = filter(num_houses_by_groups_and_zip, memb == 3), y = num_houses_by_zip, by = "zipcode")
top_tier_zips$proportion_best <- top_tier_zips$group_count/top_tier_zips$total_count
best_zips <- top_tier_zips[order(top_tier_zips$proportion_best, decreasing = T),]
head(select(best_zips, zipcode, proportion_best))

If an investor can find a house in one of these zipcodes that is currently in group 2 (older, cheaper and smaller homes), they could make upgrades to push it into the upper tier. These homes may be the most valuable in terms of profit potential.

Principle Component Analysis

kc_houses_pc <- prcomp(x, center = T, scale = T)
vjs = kc_houses_pc$sdev^2
pve = vjs/sum(vjs)
plot(cumsum(pve), type = "o", ylab="Cumulative PVE", xlab="Principal Component")

Houses in King County vary widely in their characteristics. It takes 13 principal components to explain 90% of the variation in the data.

biplot(kc_houses_pc,scale=0)

From this biplot of the first two principal components, it seems that there are a few observations which are highly weighted on principal component 1.
Since the data with the most skew falls in the higher-end houses, it is likely that these will be a part of that group.

with_group <-cbind(PC1 = kc_houses_pc$x[,1], PC2 = kc_houses_pc$x[,2], groups)
ggplot(with_group, mapping = aes(x = PC1, y = PC2, color = memb, shape = memb, alpha = .2)) + geom_point()

By using just the first two principal components, we can draw some pretty clear boundaries between each of the three groups.

Penalized Regression

The goal of this exercise will be to explain as much of the variability in house sale price with as few predictors as possible.
First let’s examine whether or not penalized regression may help us reduce some collinearity between predictors.

cor(x.scale)
##                         price     bedrooms    bathrooms  sqft_living
## price              1.00000000  0.315438405  0.525855563  0.702035055
## bedrooms           0.31543840  1.000000000  0.528477652  0.591417207
## bathrooms          0.52585556  0.528477652  1.000000000  0.754864121
## sqft_living        0.70203505  0.591417207  0.754864121  1.000000000
## sqft_lot           0.08966086  0.033180228  0.087934345  0.172825661
## floors             0.25679389  0.181299301  0.501294896  0.353949290
## waterfront         0.26636943 -0.006522701  0.063862604  0.103817818
## view               0.39729349  0.081975230  0.187751838  0.284611186
## condition          0.03636179  0.025720172 -0.124741499 -0.058752587
## grade              0.66743426  0.368025447  0.665930815  0.762704476
## sqft_above         0.60556730  0.490684149  0.685474570  0.876596599
## sqft_basement      0.32381602  0.309214530  0.283935518  0.435042974
## yr_built           0.05401153  0.160202685  0.506187799  0.318048769
## yr_renovated       0.12643379  0.018986422  0.050884481  0.055362927
## zipcode           -0.05320285 -0.157782658 -0.204479376 -0.199430043
## sqft_living15      0.58537890  0.404285848  0.569583921  0.756420259
## sqft_lot15         0.08244715  0.030847375  0.087409392  0.183285551
## age               -0.10575463 -0.171459763 -0.537394885 -0.343744607
## sold_spring        0.02240598 -0.002197565 -0.013914356 -0.013890260
## sold_summer        0.01173582  0.010005239  0.024782742  0.025647944
## sold_fall         -0.01392345 -0.010064134 -0.001471677 -0.004864645
## sold_winter       -0.02582840  0.001905583 -0.011334465 -0.008593847
## dist_from_seattle -0.19127526  0.083796795  0.141223535  0.142216199
##                       sqft_lot       floors    waterfront          view
## price              0.089660861  0.256793888  0.2663694340  0.3972934883
## bedrooms           0.033180228  0.181299301 -0.0065227010  0.0819752299
## bathrooms          0.087934345  0.501294896  0.0638626037  0.1877518383
## sqft_living        0.172825661  0.353949290  0.1038178177  0.2846111862
## sqft_lot           1.000000000 -0.005200991  0.0216036833  0.0747101056
## floors            -0.005200991  1.000000000  0.0236983203  0.0294438202
## waterfront         0.021603683  0.023698320  1.0000000000  0.4018573507
## view               0.074710106  0.029443820  0.4018573507  1.0000000000
## condition         -0.008958250 -0.263767946  0.0166531574  0.0459897367
## grade              0.113621124  0.458182514  0.0827749139  0.2513205845
## sqft_above         0.183512281  0.523884710  0.0720745917  0.1676493441
## sqft_basement      0.015286202 -0.245704542  0.0805879390  0.2769465788
## yr_built           0.053080367  0.489319425 -0.0261610856 -0.0534398514
## yr_renovated       0.007643505  0.006338401  0.0928848367  0.1039172882
## zipcode           -0.129574486 -0.059120642  0.0302847276  0.0848269169
## sqft_living15      0.144608174  0.279885265  0.0864631361  0.2804390820
## sqft_lot15         0.718556752 -0.011269187  0.0307032831  0.0725745678
## age               -0.052910604 -0.505407861  0.0005369308  0.0182644960
## sold_spring        0.006934013 -0.013888482 -0.0048614645  0.0014919895
## sold_summer       -0.013552321  0.021563419 -0.0020527723 -0.0045605560
## sold_fall          0.003681790  0.003907385  0.0098694692  0.0036668967
## sold_winter        0.003785517 -0.013524796 -0.0026935644 -0.0004309767
## dist_from_seattle  0.265812037  0.051095402 -0.0161179602 -0.0696950081
##                      condition        grade   sqft_above sqft_basement
## price              0.036361789  0.667434256  0.605567298   0.323816021
## bedrooms           0.025720172  0.368025447  0.490684149   0.309214530
## bathrooms         -0.124741499  0.665930815  0.685474570   0.283935518
## sqft_living       -0.058752587  0.762704476  0.876596599   0.435042974
## sqft_lot          -0.008958250  0.113621124  0.183512281   0.015286202
## floors            -0.263767946  0.458182514  0.523884710  -0.245704542
## waterfront         0.016653157  0.082774914  0.072074592   0.080587939
## view               0.045989737  0.251320585  0.167649344   0.276946579
## condition          1.000000000 -0.144673671 -0.158213616   0.174104914
## grade             -0.144673671  1.000000000  0.755922938   0.168391825
## sqft_above        -0.158213616  0.755922938  1.000000000  -0.051943307
## sqft_basement      0.174104914  0.168391825 -0.051943307   1.000000000
## yr_built          -0.361416562  0.446963205  0.423898352  -0.133124099
## yr_renovated      -0.060617787  0.014414281  0.023284688   0.071322902
## zipcode            0.003025524 -0.184862093 -0.261189977   0.074844608
## sqft_living15     -0.092824268  0.713202093  0.731870292   0.200354983
## sqft_lot15        -0.003405523  0.119247897  0.194049862   0.017276181
## age                0.396358406 -0.460795598 -0.435904116   0.102263245
## sold_spring       -0.028888310 -0.008474336 -0.016052927   0.001210873
## sold_summer        0.038105636  0.031685159  0.025855985   0.004846665
## sold_fall          0.005046186 -0.011687023 -0.003264508  -0.003987076
## sold_winter       -0.016514660 -0.014817006 -0.008009106  -0.002848492
## dist_from_seattle -0.093697861  0.082014119  0.262349448  -0.195745421
##                       yr_built  yr_renovated      zipcode sqft_living15
## price              0.054011531  0.1264337934 -0.053202854   0.585378904
## bedrooms           0.160202685  0.0189864224 -0.157782658   0.404285848
## bathrooms          0.506187799  0.0508844812 -0.204479376   0.569583921
## sqft_living        0.318048769  0.0553629269 -0.199430043   0.756420259
## sqft_lot           0.053080367  0.0076435050 -0.129574486   0.144608174
## floors             0.489319425  0.0063384008 -0.059120642   0.279885265
## waterfront        -0.026161086  0.0928848367  0.030284728   0.086463136
## view              -0.053439851  0.1039172882  0.084826917   0.280439082
## condition         -0.361416562 -0.0606177866  0.003025524  -0.092824268
## grade              0.446963205  0.0144142807 -0.184862093   0.713202093
## sqft_above         0.423898352  0.0232846879 -0.261189977   0.731870292
## sqft_basement     -0.133124099  0.0713229017  0.074844608   0.200354983
## yr_built           1.000000000 -0.2248735185 -0.346869178   0.326228900
## yr_renovated      -0.224873518  1.0000000000  0.064357057  -0.002672555
## zipcode           -0.346869178  0.0643570573  1.000000000  -0.279032997
## sqft_living15      0.326228900 -0.0026725554 -0.279032997   1.000000000
## sqft_lot15         0.070957926  0.0078537650 -0.147221069   0.183191749
## age               -0.909923801 -0.1645770730  0.320619685  -0.324579358
## sold_spring       -0.003735654  0.0041929993  0.008059186  -0.008288260
## sold_summer        0.010987597 -0.0008559311 -0.009759017   0.032327234
## sold_fall         -0.014081004  0.0134274661  0.006149419  -0.012929024
## sold_winter        0.007109955 -0.0191748383 -0.004944122  -0.014422960
## dist_from_seattle  0.426438618 -0.0811719194 -0.565685633   0.218608502
##                      sqft_lot15           age   sold_spring   sold_summer
## price              0.0824471525 -0.1057546314  0.0224059779  0.0117358210
## bedrooms           0.0308473753 -0.1714597626 -0.0021975648  0.0100052389
## bathrooms          0.0874093924 -0.5373948849 -0.0139143556  0.0247827423
## sqft_living        0.1832855513 -0.3437446070 -0.0138902596  0.0256479441
## sqft_lot           0.7185567524 -0.0529106035  0.0069340133 -0.0135523205
## floors            -0.0112691866 -0.5054078612 -0.0138884820  0.0215634194
## waterfront         0.0307032831  0.0005369308 -0.0048614645 -0.0020527723
## view               0.0725745678  0.0182644960  0.0014919895 -0.0045605560
## condition         -0.0034055230  0.3963584056 -0.0288883097  0.0381056358
## grade              0.1192478972 -0.4607955983 -0.0084743362  0.0316851588
## sqft_above         0.1940498619 -0.4359041155 -0.0160529275  0.0258559851
## sqft_basement      0.0172761806  0.1022632450  0.0012108725  0.0048466645
## yr_built           0.0709579264 -0.9099238009 -0.0037356544  0.0109875972
## yr_renovated       0.0078537650 -0.1645770730  0.0041929993 -0.0008559311
## zipcode           -0.1472210687  0.3206196851  0.0080591857 -0.0097590175
## sqft_living15      0.1831917487 -0.3245793581 -0.0082882604  0.0323272343
## sqft_lot15         1.0000000000 -0.0702188718  0.0066577167  0.0004691805
## age               -0.0702188718  1.0000000000  0.0002933791 -0.0118835672
## sold_spring        0.0066577167  0.0002933791  1.0000000000 -0.4230406209
## sold_summer        0.0004691805 -0.0118835672 -0.4230406209  1.0000000000
## sold_fall         -0.0034914521  0.0112611062 -0.3635306027 -0.3560008408
## sold_winter       -0.0047540612  0.0013385622 -0.2986633228 -0.2924771484
## dist_from_seattle  0.2965790496 -0.3924683605 -0.0073888692  0.0054626867
##                      sold_fall   sold_winter dist_from_seattle
## price             -0.013923447 -0.0258283960      -0.191275255
## bedrooms          -0.010064134  0.0019055832       0.083796795
## bathrooms         -0.001471677 -0.0113344651       0.141223535
## sqft_living       -0.004864645 -0.0085938466       0.142216199
## sqft_lot           0.003681790  0.0037855173       0.265812037
## floors             0.003907385 -0.0135247964       0.051095402
## waterfront         0.009869469 -0.0026935644      -0.016117960
## view               0.003666897 -0.0004309767      -0.069695008
## condition          0.005046186 -0.0165146598      -0.093697861
## grade             -0.011687023 -0.0148170056       0.082014119
## sqft_above        -0.003264508 -0.0080091065       0.262349448
## sqft_basement     -0.003987076 -0.0028484917      -0.195745421
## yr_built          -0.014081004  0.0071099545       0.426438618
## yr_renovated       0.013427466 -0.0191748383      -0.081171919
## zipcode            0.006149419 -0.0049441219      -0.565685633
## sqft_living15     -0.012929024 -0.0144229605       0.218608502
## sqft_lot15        -0.003491452 -0.0047540612       0.296579050
## age                0.011261106  0.0013385622      -0.392468361
## sold_spring       -0.363530603 -0.2986633228      -0.007388869
## sold_summer       -0.356000841 -0.2924771484       0.005462687
## sold_fall          1.000000000 -0.2513337698      -0.007765573
## sold_winter       -0.251333770  1.0000000000       0.011136050
## dist_from_seattle -0.007765573  0.0111360502       1.000000000

Consistent the cumulative variation plot we saw in the Principle Component Analysis, there is not much repeated information across the variables in this dataset. To see if there are a few variables that we can trim out, let’s continue anyway.

summary(lm(scale(log(houses$price),center=T,scale=T)~.-price-1, data = data.frame(x.scale)))
## 
## Call:
## lm(formula = scale(log(houses$price), center = T, scale = T) ~ 
##     . - price - 1, data = data.frame(x.scale))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2734 -0.3598  0.0213  0.3635  2.7410 
## 
## Coefficients: (2 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## bedrooms          -0.040605   0.004749  -8.550  < 2e-16 ***
## bathrooms          0.095593   0.006701  14.266  < 2e-16 ***
## sqft_living        0.245362   0.010759  22.805  < 2e-16 ***
## sqft_lot           0.048360   0.005283   9.153  < 2e-16 ***
## floors             0.060509   0.005222  11.587  < 2e-16 ***
## waterfront         0.059884   0.003999  14.975  < 2e-16 ***
## view               0.057837   0.004341  13.324  < 2e-16 ***
## condition          0.067040   0.004086  16.409  < 2e-16 ***
## grade              0.355950   0.006766  52.609  < 2e-16 ***
## sqft_above         0.019774   0.009696   2.039  0.04142 *  
## sqft_basement            NA         NA      NA       NA    
## yr_built          -0.159450   0.022015  -7.243 4.54e-13 ***
## yr_renovated       0.025048   0.009257   2.706  0.00682 ** 
## zipcode           -0.095176   0.004610 -20.646  < 2e-16 ***
## sqft_living15      0.182719   0.006217  29.388  < 2e-16 ***
## sqft_lot15         0.016365   0.005343   3.063  0.00219 ** 
## age               -0.005762   0.021960  -0.262  0.79304    
## sold_spring        0.045093   0.005058   8.915  < 2e-16 ***
## sold_summer        0.010910   0.005049   2.161  0.03073 *  
## sold_fall          0.002340   0.004906   0.477  0.63342    
## sold_winter              NA         NA      NA       NA    
## dist_from_seattle -0.337431   0.005173 -65.235  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5351 on 21593 degrees of freedom
## Multiple R-squared:  0.7139, Adjusted R-squared:  0.7137 
## F-statistic:  2694 on 20 and 21593 DF,  p-value: < 2.2e-16

Additionally, a simple linear model shows that most of our variables have low standard errors compared to their coefficient estimates - meaning that penalized regression likely won’t be too useful here.

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
x = model.matrix(price~.,data=data.frame(x.scale))[,-1]
y = scale(log(houses$price), center = T, scale = T)

#Set seed and initialize parameters
set.seed(100, sample.kind = "Rounding")
n = nrow(x)
ncv = 10
groups=c(rep(1:ncv,floor(n/ncv)),1:(n%%ncv))
cvgroups = sample(groups,n)
lambdalist = seq(from = 1, to = .001, by = -.001)
alphalist = seq(from = 0, to = 1, by = .01)
cv_df = data.frame(alpha = alphalist, cv = rep(NA, length(alphalist)), lambda = rep(NA, length(alphalist)))

#Since cv.glmnet inherintly uses cross validation, we'll tune the alpha parameter of the elastic net for double cross-validation 
for (a in 1:length(alphalist)){
        cv_enet = cv.glmnet(x, y, lambda=lambdalist, alpha = alphalist[a], nfolds=ncv, foldid=cvgroups)
        cv_df$cv[a] <- min(cv_enet$cvm)
        #Since we have a preference for a simplified model, let's opt for a greater penalty in hopes of reducing the number of predictors
        cv_df$lambda[a] <- cv_enet$lambda.1se
}
#This record has the lowest cross validation score
cv_df[which.min(cv_df$cv),]

With an alpha close to 1, the model appears to favor LASSO regression.

#Re-examine model with best value of alpha
cv_enet = cv.glmnet(x, y, lambda=lambdalist, alpha = cv_df$alpha[which.min(cv_df$cv)], nfolds=ncv, foldid=cvgroups)
plot(cv_enet)

There is not much change in mean squared error for the first several lambda penalties. In order to choose a simplified model with fewer predictors, let’s choose a penalty sufficient enough to eliminate a few predictors. The dashed line on the right eliminates some redundant predictors without sacrificing much in terms of price variance explained, but it looks like we probably have the best tradeoff between model simplicity and accuracy around a lambda value of e^(-2.6).

final_enet = glmnet(x, y, lambda=exp(-2.6), alpha = cv_df$alpha[which.min(cv_df$cv)])
coef(final_enet)
## 23 x 1 sparse Matrix of class "dgCMatrix"
##                              s0
## (Intercept)       -1.083927e-15
## bedrooms           .           
## bathrooms          7.040157e-02
## sqft_living        2.221151e-01
## sqft_lot           2.702731e-02
## floors             1.661264e-02
## waterfront         4.095023e-02
## view               5.865523e-02
## condition          4.742310e-02
## grade              3.283180e-01
## sqft_above         4.284874e-02
## sqft_basement      .           
## yr_built          -9.180636e-02
## yr_renovated       1.783131e-02
## zipcode           -2.132682e-02
## sqft_living15      1.706933e-01
## sqft_lot15         .           
## age                .           
## sold_spring        1.168250e-02
## sold_summer        .           
## sold_fall          .           
## sold_winter        .           
## dist_from_seattle -2.709695e-01
final_enet$dev.ratio
## [1] 0.7004216

Without the variables that have been penalized to the point where they have no impact, we can still explain more than 98% of the variation in home sale price that is explained by a model with all predictors.