Note: the data is available from https://www.kaggle.com/harlfoxem/housesalesprediction

## Warning in if (renovated == 0) {: the condition has length > 1 and only the
## first element will be used
## Rows: 21,613
## Columns: 18
## $ price             <dbl> 221900, 538000, 180000, 604000, 510000, 1225000, ...
## $ bedrooms          <dbl> 3, 3, 2, 4, 3, 4, 3, 3, 3, 3, 3, 2, 3, 3, 5, 4, 3...
## $ bathrooms         <dbl> 1.00, 2.25, 1.00, 3.00, 2.00, 4.50, 2.25, 1.50, 1...
## $ sqft_living       <dbl> 1180, 2570, 770, 1960, 1680, 5420, 1715, 1060, 17...
## $ sqft_lot          <dbl> 5650, 7242, 10000, 5000, 8080, 101930, 6819, 9711...
## $ floors            <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0,...
## $ waterfront        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ view              <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0...
## $ condition         <dbl> 3, 3, 3, 5, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3...
## $ grade             <dbl> 7, 7, 6, 7, 8, 11, 7, 7, 7, 7, 8, 7, 7, 7, 7, 9, ...
## $ sqft_above        <dbl> 1180, 2170, 770, 1050, 1680, 3890, 1715, 1060, 10...
## $ sqft_basement     <dbl> 0, 400, 0, 910, 0, 1530, 0, 0, 730, 0, 1700, 300,...
## $ zipcode           <fct> 98178, 98125, 98028, 98136, 98074, 98053, 98003, ...
## $ lat               <dbl> 47.5112, 47.7210, 47.7379, 47.5208, 47.6168, 47.6...
## $ long              <dbl> -122.257, -122.319, -122.233, -122.393, -122.045,...
## $ sqft_living15     <dbl> 1340, 1690, 2720, 1360, 1800, 4760, 2238, 1650, 1...
## $ sqft_lot15        <dbl> 5650, 7639, 8062, 5000, 7503, 101930, 6819, 9711,...
## $ `perceived_age()` <dbl> 65, 69, 87, 55, 33, 19, 25, 57, 60, 17, 55, 78, 9...
#limit to 1 mil dollars since it is a minority of the data that heavily skews the hist
df %>% 
  filter(price > 1000000) %>%
  count() / nrow(df)
##            n
## 1 0.06778328
limit.df <- df %>% 
  filter(price < 1000000)
 
price.hist <- ggplot(limit.df, aes(price)) + 
  geom_histogram(col = "black", fill = "pink", binwidth = 10000) +
  ylab("Count") +
  ggtitle("Distribution of Housing Prices in King County")

#get rid of scientific notation in x axis
price.hist + scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) 

#suggests many outliers
boxplot <- ggplot(df, aes(price)) + 
  geom_boxplot(fill = "orange", color = "purple") +
  coord_flip() + 
  ggtitle("Boxplot of Housing Prices")
boxplot

#https://rpubs.com/Mentors_Ubiqum/removing_outliers
#https://www.displayr.com/how-to-create-a-correlation-matrix-in-r/

response.df <- df %>% 
  select(price)

#only consider continuous vars for linear regression
predictors.df <- df %>% 
  select(-c(price, zipcode, waterfront, view))

#create a correlation matrix to see which (if any) variables are correlated and plot
mydata.cor = cor(predictors.df, method = c("spearman"))
mydata.rcorr = rcorr(as.matrix(predictors.df))
mydata.rcorr
##                 bedrooms bathrooms sqft_living sqft_lot floors condition grade
## bedrooms            1.00      0.52        0.58     0.03   0.18      0.03  0.36
## bathrooms           0.52      1.00        0.75     0.09   0.50     -0.12  0.66
## sqft_living         0.58      0.75        1.00     0.17   0.35     -0.06  0.76
## sqft_lot            0.03      0.09        0.17     1.00  -0.01     -0.01  0.11
## floors              0.18      0.50        0.35    -0.01   1.00     -0.26  0.46
## condition           0.03     -0.12       -0.06    -0.01  -0.26      1.00 -0.14
## grade               0.36      0.66        0.76     0.11   0.46     -0.14  1.00
## sqft_above          0.48      0.69        0.88     0.18   0.52     -0.16  0.76
## sqft_basement       0.30      0.28        0.44     0.02  -0.25      0.17  0.17
## lat                -0.01      0.02        0.05    -0.09   0.05     -0.01  0.11
## long                0.13      0.22        0.24     0.23   0.13     -0.11  0.20
## sqft_living15       0.39      0.57        0.76     0.14   0.28     -0.09  0.71
## sqft_lot15          0.03      0.09        0.18     0.72  -0.01      0.00  0.12
## perceived_age()    -0.15     -0.51       -0.32    -0.05  -0.49      0.36 -0.45
##                 sqft_above sqft_basement   lat  long sqft_living15 sqft_lot15
## bedrooms              0.48          0.30 -0.01  0.13          0.39       0.03
## bathrooms             0.69          0.28  0.02  0.22          0.57       0.09
## sqft_living           0.88          0.44  0.05  0.24          0.76       0.18
## sqft_lot              0.18          0.02 -0.09  0.23          0.14       0.72
## floors                0.52         -0.25  0.05  0.13          0.28      -0.01
## condition            -0.16          0.17 -0.01 -0.11         -0.09       0.00
## grade                 0.76          0.17  0.11  0.20          0.71       0.12
## sqft_above            1.00         -0.05  0.00  0.34          0.73       0.19
## sqft_basement        -0.05          1.00  0.11 -0.14          0.20       0.02
## lat                   0.00          0.11  1.00 -0.14          0.05      -0.09
## long                  0.34         -0.14 -0.14  1.00          0.33       0.25
## sqft_living15         0.73          0.20  0.05  0.33          1.00       0.18
## sqft_lot15            0.19          0.02 -0.09  0.25          0.18       1.00
## perceived_age()      -0.42          0.13  0.15 -0.41         -0.33      -0.07
##                 perceived_age()
## bedrooms                  -0.15
## bathrooms                 -0.51
## sqft_living               -0.32
## sqft_lot                  -0.05
## floors                    -0.49
## condition                  0.36
## grade                     -0.45
## sqft_above                -0.42
## sqft_basement              0.13
## lat                        0.15
## long                      -0.41
## sqft_living15             -0.33
## sqft_lot15                -0.07
## perceived_age()            1.00
## 
## n= 21613 
## 
## 
## P
##                 bedrooms bathrooms sqft_living sqft_lot floors condition grade 
## bedrooms                 0.0000    0.0000      0.0000   0.0000 0.0000    0.0000
## bathrooms       0.0000             0.0000      0.0000   0.0000 0.0000    0.0000
## sqft_living     0.0000   0.0000                0.0000   0.0000 0.0000    0.0000
## sqft_lot        0.0000   0.0000    0.0000               0.4445 0.1879    0.0000
## floors          0.0000   0.0000    0.0000      0.4445          0.0000    0.0000
## condition       0.0000   0.0000    0.0000      0.1879   0.0000           0.0000
## grade           0.0000   0.0000    0.0000      0.0000   0.0000 0.0000          
## sqft_above      0.0000   0.0000    0.0000      0.0000   0.0000 0.0000    0.0000
## sqft_basement   0.0000   0.0000    0.0000      0.0246   0.0000 0.0000    0.0000
## lat             0.1892   0.0003    0.0000      0.0000   0.0000 0.0281    0.0000
## long            0.0000   0.0000    0.0000      0.0000   0.0000 0.0000    0.0000
## sqft_living15   0.0000   0.0000    0.0000      0.0000   0.0000 0.0000    0.0000
## sqft_lot15      0.0000   0.0000    0.0000      0.0000   0.0976 0.6166    0.0000
## perceived_age() 0.0000   0.0000    0.0000      0.0000   0.0000 0.0000    0.0000
##                 sqft_above sqft_basement lat    long   sqft_living15 sqft_lot15
## bedrooms        0.0000     0.0000        0.1892 0.0000 0.0000        0.0000    
## bathrooms       0.0000     0.0000        0.0003 0.0000 0.0000        0.0000    
## sqft_living     0.0000     0.0000        0.0000 0.0000 0.0000        0.0000    
## sqft_lot        0.0000     0.0246        0.0000 0.0000 0.0000        0.0000    
## floors          0.0000     0.0000        0.0000 0.0000 0.0000        0.0976    
## condition       0.0000     0.0000        0.0281 0.0000 0.0000        0.6166    
## grade           0.0000     0.0000        0.0000 0.0000 0.0000        0.0000    
## sqft_above                 0.0000        0.9045 0.0000 0.0000        0.0000    
## sqft_basement   0.0000                   0.0000 0.0000 0.0000        0.0111    
## lat             0.9045     0.0000               0.0000 0.0000        0.0000    
## long            0.0000     0.0000        0.0000        0.0000        0.0000    
## sqft_living15   0.0000     0.0000        0.0000 0.0000               0.0000    
## sqft_lot15      0.0000     0.0111        0.0000 0.0000 0.0000                  
## perceived_age() 0.0000     0.0000        0.0000 0.0000 0.0000        0.0000    
##                 perceived_age()
## bedrooms        0.0000         
## bathrooms       0.0000         
## sqft_living     0.0000         
## sqft_lot        0.0000         
## floors          0.0000         
## condition       0.0000         
## grade           0.0000         
## sqft_above      0.0000         
## sqft_basement   0.0000         
## lat             0.0000         
## long            0.0000         
## sqft_living15   0.0000         
## sqft_lot15      0.0000         
## perceived_age()
corrplot(mydata.cor)

ggpairs(predictors.df)

#ggscatmat(predictors.df, columns = 1:ncol(predictors.df))

From the correlation matrix and plot, we can see that price is highly correlated with \(sqft_living\), \(grade\), \(sqfoot_{above}\) and \(bathrooms\). Specifically, having more of these explanatory variables can lead to a higher price. We should, however, check for multicollineariy and use model selection criteria to see the best linear model we can create.

response.df <- df %>% 
  select(price)

#only consider continuous vars
predictors.df <- df %>% 
  select(-c(price, zipcode, waterfront, view))

df2 <- cbind(response.df, predictors.df)

#work in progress; build a linear regression model
# RegBest(y=df$price, x = predictors.df)
# 
# # stepwise 
# model <- lm(price ~ ., data = df2)
# ols_step_both_p(model, pent = 0.1, prem = 0.3, details = FALSE)
# ols_step_best_subset(model)
# 
# # forward 
# model <- lm(price ~ ., data = df2)
# ols_step_forward_p(model)
# k <- ols_step_forward_p(model)
# plot(k)
# 
# #stepwise
# intercept.only.model <- lm(price ~ 1, data = df)
# big.model <- lm(price ~ ., data = df)
# step.mod <- step(intercept.only.model, direction = 'both', scope = formula(big.model))
# 
# step.mod$coefficients
#lets see if we can predict if a house is waterfront or not
#set up for a logit model

#https://rstatisticsblog.com/data-science-in-action/machine-learning/binary-logistic-regression-with-r/
table(df$waterfront)
## 
##     0     1 
## 21450   163
ggplot(df, aes(price)) + 
  geom_histogram(aes(fill=waterfront), color = "black") +
  ggtitle("Distribution of price, by waterfront")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#create new df to include only what we're interested in
subset.df <- df %>% 
  select(price:condition, `perceived_age()`)

index <- createDataPartition(subset.df$waterfront, p = .70, list = FALSE)
train <- subset.df[index, ]
test <- subset.df[-index, ]

#build logit model
logit.model <- glm(waterfront ~ ., family = binomial(), train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#suggests some of these variables are significant
summary(logit.model)
## 
## Call:
## glm(formula = waterfront ~ ., family = binomial(), data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.046   0.000   0.000   0.000   3.243  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.151e+01  6.530e+02  -0.033 0.973721    
## price              1.594e-06  2.582e-07   6.173 6.71e-10 ***
## bedrooms          -6.131e-01  1.630e-01  -3.761 0.000169 ***
## bathrooms         -9.435e-02  2.265e-01  -0.417 0.677042    
## sqft_living       -6.642e-04  2.122e-04  -3.130 0.001747 ** 
## sqft_lot           1.631e-06  1.106e-06   1.475 0.140149    
## floors             2.704e-01  2.353e-01   1.149 0.250553    
## view1              1.798e+01  6.530e+02   0.028 0.978027    
## view2              1.850e+01  6.530e+02   0.028 0.977395    
## view3              2.028e+01  6.530e+02   0.031 0.975220    
## view4              2.295e+01  6.530e+02   0.035 0.971963    
## condition         -1.501e-01  1.842e-01  -0.815 0.415146    
## `perceived_age()`  9.398e-03  5.010e-03   1.876 0.060674 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1351.41  on 15129  degrees of freedom
## Residual deviance:  421.27  on 15117  degrees of freedom
## AIC: 447.27
## 
## Number of Fisher Scoring iterations: 22
pred.prob <- predict(logit.model, test, type = "response")

# Converting from probability to actual output
train$pred.class <- ifelse(logit.model$fitted.values >= 0.5, "Waterfront", "Not Waterfront")

test$pred.class <- ifelse(pred.prob >= 0.5, "Waterfront", "Not Waterfront")
 
# Generating the classification table
ctab_train <- table(train$waterfront, train$pred.class)
ctab_train
##    
##     Not Waterfront Waterfront
##   0          14986         29
##   1             59         56
ctab_test <- table(test$waterfront, test$pred.class)
ctab_test
##    
##     Not Waterfront Waterfront
##   0           6422         13
##   1             27         21
#check accuracy on training; add diagonals
accuracy.train <- (ctab_train[1,1] + ctab_train[2,2])/(sum(ctab_train))
cat("The accuracy on the training data is:", accuracy.train*100, "%")
## The accuracy on the training data is: 99.41837 %
#now check testing accuracy
accuracy.test <- (ctab_test[1,1] + ctab_test[2,2])/(sum(ctab_test))
cat("The accuracy on the testing data is:", accuracy.test*100, "%")
## The accuracy on the testing data is: 99.383 %

Our data is very imbalanced (i.e., there are many more homes that do NOT have a waterfront view than those that do). Despite this, our logistic regression model was able to perform very well on both training and testing data.