Note: the data is available from https://www.kaggle.com/harlfoxem/housesalesprediction
## Warning in if (renovated == 0) {: the condition has length > 1 and only the
## first element will be used
## Rows: 21,613
## Columns: 18
## $ price <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, ...
## $ bedrooms <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3...
## $ bathrooms <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1...
## $ sqft_living <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 17...
## $ sqft_lot <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711...
## $ floors <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0,...
## $ waterfront <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ view <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0...
## $ condition <dbl> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3...
## $ grade <dbl> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, ...
## $ sqft_above <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 10...
## $ sqft_basement <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300,...
## $ zipcode <fct> 98178, 98125, 98028, 98136, 98074, 98053, 98003, ...
## $ lat <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6...
## $ long <dbl> -122.257, -122.319, -122.233, -122.393, -122.045,...
## $ sqft_living15 <dbl> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1...
## $ sqft_lot15 <dbl> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711,...
## $ `perceived_age()` <dbl> 65, 69, 87, 55, 33, 19, 25, 57, 60, 17, 55, 78, 9...
#limit to 1 mil dollars since it is a minority of the data that heavily skews the hist
df %>%
filter(price > 1000000) %>%
count() / nrow(df)
## n
## 1 0.06778328
limit.df <- df %>%
filter(price < 1000000)
price.hist <- ggplot(limit.df, aes(price)) +
geom_histogram(col = "black", fill = "pink", binwidth = 10000) +
ylab("Count") +
ggtitle("Distribution of Housing Prices in King County")
#get rid of scientific notation in x axis
price.hist + scale_x_continuous(labels = function(x) format(x, scientific = FALSE))
#suggests many outliers
boxplot <- ggplot(df, aes(price)) +
geom_boxplot(fill = "orange", color = "purple") +
coord_flip() +
ggtitle("Boxplot of Housing Prices")
boxplot
#https://rpubs.com/Mentors_Ubiqum/removing_outliers
#https://www.displayr.com/how-to-create-a-correlation-matrix-in-r/
response.df <- df %>%
select(price)
#only consider continuous vars for linear regression
predictors.df <- df %>%
select(-c(price, zipcode, waterfront, view))
#create a correlation matrix to see which (if any) variables are correlated and plot
mydata.cor = cor(predictors.df, method = c("spearman"))
mydata.rcorr = rcorr(as.matrix(predictors.df))
mydata.rcorr
## bedrooms bathrooms sqft_living sqft_lot floors condition grade
## bedrooms 1.00 0.52 0.58 0.03 0.18 0.03 0.36
## bathrooms 0.52 1.00 0.75 0.09 0.50 -0.12 0.66
## sqft_living 0.58 0.75 1.00 0.17 0.35 -0.06 0.76
## sqft_lot 0.03 0.09 0.17 1.00 -0.01 -0.01 0.11
## floors 0.18 0.50 0.35 -0.01 1.00 -0.26 0.46
## condition 0.03 -0.12 -0.06 -0.01 -0.26 1.00 -0.14
## grade 0.36 0.66 0.76 0.11 0.46 -0.14 1.00
## sqft_above 0.48 0.69 0.88 0.18 0.52 -0.16 0.76
## sqft_basement 0.30 0.28 0.44 0.02 -0.25 0.17 0.17
## lat -0.01 0.02 0.05 -0.09 0.05 -0.01 0.11
## long 0.13 0.22 0.24 0.23 0.13 -0.11 0.20
## sqft_living15 0.39 0.57 0.76 0.14 0.28 -0.09 0.71
## sqft_lot15 0.03 0.09 0.18 0.72 -0.01 0.00 0.12
## perceived_age() -0.15 -0.51 -0.32 -0.05 -0.49 0.36 -0.45
## sqft_above sqft_basement lat long sqft_living15 sqft_lot15
## bedrooms 0.48 0.30 -0.01 0.13 0.39 0.03
## bathrooms 0.69 0.28 0.02 0.22 0.57 0.09
## sqft_living 0.88 0.44 0.05 0.24 0.76 0.18
## sqft_lot 0.18 0.02 -0.09 0.23 0.14 0.72
## floors 0.52 -0.25 0.05 0.13 0.28 -0.01
## condition -0.16 0.17 -0.01 -0.11 -0.09 0.00
## grade 0.76 0.17 0.11 0.20 0.71 0.12
## sqft_above 1.00 -0.05 0.00 0.34 0.73 0.19
## sqft_basement -0.05 1.00 0.11 -0.14 0.20 0.02
## lat 0.00 0.11 1.00 -0.14 0.05 -0.09
## long 0.34 -0.14 -0.14 1.00 0.33 0.25
## sqft_living15 0.73 0.20 0.05 0.33 1.00 0.18
## sqft_lot15 0.19 0.02 -0.09 0.25 0.18 1.00
## perceived_age() -0.42 0.13 0.15 -0.41 -0.33 -0.07
## perceived_age()
## bedrooms -0.15
## bathrooms -0.51
## sqft_living -0.32
## sqft_lot -0.05
## floors -0.49
## condition 0.36
## grade -0.45
## sqft_above -0.42
## sqft_basement 0.13
## lat 0.15
## long -0.41
## sqft_living15 -0.33
## sqft_lot15 -0.07
## perceived_age() 1.00
##
## n= 21613
##
##
## P
## bedrooms bathrooms sqft_living sqft_lot floors condition grade
## bedrooms 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## bathrooms 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_living 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_lot 0.0000 0.0000 0.0000 0.4445 0.1879 0.0000
## floors 0.0000 0.0000 0.0000 0.4445 0.0000 0.0000
## condition 0.0000 0.0000 0.0000 0.1879 0.0000 0.0000
## grade 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_above 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_basement 0.0000 0.0000 0.0000 0.0246 0.0000 0.0000 0.0000
## lat 0.1892 0.0003 0.0000 0.0000 0.0000 0.0281 0.0000
## long 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_living15 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_lot15 0.0000 0.0000 0.0000 0.0000 0.0976 0.6166 0.0000
## perceived_age() 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_above sqft_basement lat long sqft_living15 sqft_lot15
## bedrooms 0.0000 0.0000 0.1892 0.0000 0.0000 0.0000
## bathrooms 0.0000 0.0000 0.0003 0.0000 0.0000 0.0000
## sqft_living 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_lot 0.0000 0.0246 0.0000 0.0000 0.0000 0.0000
## floors 0.0000 0.0000 0.0000 0.0000 0.0000 0.0976
## condition 0.0000 0.0000 0.0281 0.0000 0.0000 0.6166
## grade 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_above 0.0000 0.9045 0.0000 0.0000 0.0000
## sqft_basement 0.0000 0.0000 0.0000 0.0000 0.0111
## lat 0.9045 0.0000 0.0000 0.0000 0.0000
## long 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_living15 0.0000 0.0000 0.0000 0.0000 0.0000
## sqft_lot15 0.0000 0.0111 0.0000 0.0000 0.0000
## perceived_age() 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## perceived_age()
## bedrooms 0.0000
## bathrooms 0.0000
## sqft_living 0.0000
## sqft_lot 0.0000
## floors 0.0000
## condition 0.0000
## grade 0.0000
## sqft_above 0.0000
## sqft_basement 0.0000
## lat 0.0000
## long 0.0000
## sqft_living15 0.0000
## sqft_lot15 0.0000
## perceived_age()
corrplot(mydata.cor)
ggpairs(predictors.df)
#ggscatmat(predictors.df, columns = 1:ncol(predictors.df))
From the correlation matrix and plot, we can see that price is highly correlated with \(sqft_living\), \(grade\), \(sqfoot_{above}\) and \(bathrooms\). Specifically, having more of these explanatory variables can lead to a higher price. We should, however, check for multicollineariy and use model selection criteria to see the best linear model we can create.
response.df <- df %>%
select(price)
#only consider continuous vars
predictors.df <- df %>%
select(-c(price, zipcode, waterfront, view))
df2 <- cbind(response.df, predictors.df)
#work in progress; build a linear regression model
# RegBest(y=df$price, x = predictors.df)
#
# # stepwise
# model <- lm(price ~ ., data = df2)
# ols_step_both_p(model, pent = 0.1, prem = 0.3, details = FALSE)
# ols_step_best_subset(model)
#
# # forward
# model <- lm(price ~ ., data = df2)
# ols_step_forward_p(model)
# k <- ols_step_forward_p(model)
# plot(k)
#
# #stepwise
# intercept.only.model <- lm(price ~ 1, data = df)
# big.model <- lm(price ~ ., data = df)
# step.mod <- step(intercept.only.model, direction = 'both', scope = formula(big.model))
#
# step.mod$coefficients
#lets see if we can predict if a house is waterfront or not
#set up for a logit model
#https://rstatisticsblog.com/data-science-in-action/machine-learning/binary-logistic-regression-with-r/
table(df$waterfront)
##
## 0 1
## 21450 163
ggplot(df, aes(price)) +
geom_histogram(aes(fill=waterfront), color = "black") +
ggtitle("Distribution of price, by waterfront")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#create new df to include only what we're interested in
subset.df <- df %>%
select(price:condition, `perceived_age()`)
index <- createDataPartition(subset.df$waterfront, p = .70, list = FALSE)
train <- subset.df[index, ]
test <- subset.df[-index, ]
#build logit model
logit.model <- glm(waterfront ~ ., family = binomial(), train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#suggests some of these variables are significant
summary(logit.model)
##
## Call:
## glm(formula = waterfront ~ ., family = binomial(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.046 0.000 0.000 0.000 3.243
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.151e+01 6.530e+02 -0.033 0.973721
## price 1.594e-06 2.582e-07 6.173 6.71e-10 ***
## bedrooms -6.131e-01 1.630e-01 -3.761 0.000169 ***
## bathrooms -9.435e-02 2.265e-01 -0.417 0.677042
## sqft_living -6.642e-04 2.122e-04 -3.130 0.001747 **
## sqft_lot 1.631e-06 1.106e-06 1.475 0.140149
## floors 2.704e-01 2.353e-01 1.149 0.250553
## view1 1.798e+01 6.530e+02 0.028 0.978027
## view2 1.850e+01 6.530e+02 0.028 0.977395
## view3 2.028e+01 6.530e+02 0.031 0.975220
## view4 2.295e+01 6.530e+02 0.035 0.971963
## condition -1.501e-01 1.842e-01 -0.815 0.415146
## `perceived_age()` 9.398e-03 5.010e-03 1.876 0.060674 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1351.41 on 15129 degrees of freedom
## Residual deviance: 421.27 on 15117 degrees of freedom
## AIC: 447.27
##
## Number of Fisher Scoring iterations: 22
pred.prob <- predict(logit.model, test, type = "response")
# Converting from probability to actual output
train$pred.class <- ifelse(logit.model$fitted.values >= 0.5, "Waterfront", "Not Waterfront")
test$pred.class <- ifelse(pred.prob >= 0.5, "Waterfront", "Not Waterfront")
# Generating the classification table
ctab_train <- table(train$waterfront, train$pred.class)
ctab_train
##
## Not Waterfront Waterfront
## 0 14986 29
## 1 59 56
ctab_test <- table(test$waterfront, test$pred.class)
ctab_test
##
## Not Waterfront Waterfront
## 0 6422 13
## 1 27 21
#check accuracy on training; add diagonals
accuracy.train <- (ctab_train[1,1] + ctab_train[2,2])/(sum(ctab_train))
cat("The accuracy on the training data is:", accuracy.train*100, "%")
## The accuracy on the training data is: 99.41837 %
#now check testing accuracy
accuracy.test <- (ctab_test[1,1] + ctab_test[2,2])/(sum(ctab_test))
cat("The accuracy on the testing data is:", accuracy.test*100, "%")
## The accuracy on the testing data is: 99.383 %
Our data is very imbalanced (i.e., there are many more homes that do NOT have a waterfront view than those that do). Despite this, our logistic regression model was able to perform very well on both training and testing data.