The data for the housing price prediction model was downloaded from redfin.com for the Puyallup, WA area.
library(XLConnect)
## Loading required package: XLConnectJars
## XLConnect 0.2-14 by Mirai Solutions GmbH [aut],
## Martin Studer [cre],
## The Apache Software Foundation [ctb, cph] (Apache POI),
## Graph Builder [ctb, cph] (Curvesapi Java library)
## http://www.mirai-solutions.com
## https://github.com/miraisolutions/xlconnect
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.3.0
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(corrgram)
library(chemCal)
library(forecast)
library(urca)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(readr)
# Load the Redfin export used to fit the models and list its columns.
redfin2 <- read.csv(file = "~/1 UW Tacoma/560 data mining/data/redfin.csv")
colnames(redfin2)
## [1] "price" "beds" "baths" "squarefeet"
## [5] "lotsize" "year" "neighborhood" "propertytype"
# Does the raw data contain any missing cell at all?
any(is.na(redfin2))
## [1] TRUE
# Keep only the rows that have a value in every column the models use.
modified <- redfin2[
  complete.cases(
    redfin2[, c(
      "neighborhood", "beds", "baths", "squarefeet",
      "propertytype", "lotsize", "price", "year"
    )]
  ),
]
# Summary of the raw (unfiltered) data, so the NA counts per column stay visible.
summary(redfin2)
## price beds baths squarefeet
## Min. :105000 Min. :0.000 Min. :1.000 Min. : 710
## 1st Qu.:285000 1st Qu.:3.000 1st Qu.:2.250 1st Qu.:1768
## Median :366900 Median :4.000 Median :2.500 Median :2250
## Mean :367423 Mean :3.602 Mean :2.394 Mean :2240
## 3rd Qu.:424999 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.:2707
## Max. :749950 Max. :7.000 Max. :3.750 Max. :4516
## NA's :3
## lotsize year neighborhood
## Min. : 34 Min. :1900 Puyallup :150
## 1st Qu.: 5000 1st Qu.:1989 South Hill : 61
## Median : 6698 Median :2004 : 17
## Mean : 13660 Mean :1998 English Ridge : 12
## 3rd Qu.: 14062 3rd Qu.:2017 Summerwood Park : 12
## Max. :179032 Max. :2017 Arborvue at Fruitland: 9
## NA's :25 NA's :19 (Other) : 8
## propertytype
## Condo/Co-op : 8
## Single Family Residential:249
## Townhouse : 12
##
##
##
##
# Correlation matrix of the numeric columns (columns 7-8, neighborhood and
# propertytype, are dropped because they are categorical).
# baths, lotsize and year contain missing values; by default cor() returns NA
# for every pair that involves a column with an NA, which blanks out most of
# the matrix. use = "complete.obs" computes the correlations on the rows that
# have all six values present.
cor1 <- cor(redfin2[, -c(7:8)], use = "complete.obs")
cor1
## price beds baths squarefeet lotsize year
## price 1.0000000 0.4991457 NA 0.8506723 NA NA
## beds 0.4991457 1.0000000 NA 0.6072977 NA NA
## baths NA NA 1 NA NA NA
## squarefeet 0.8506723 0.6072977 NA 1.0000000 NA NA
## lotsize NA NA NA NA 1 NA
## year NA NA NA NA NA 1
I identified a parsimonious model from the Categorical Variables Lab and the Interaction Variables Lab:
# Multiple regression of price on every predictor except neighborhood,
# fitted on the complete-case rows held in `modified`.
test1 <- lm(
  formula = price ~ beds + baths + squarefeet + propertytype + lotsize + year,
  data = modified
)
summary(test1)
##
## Call:
## lm(formula = price ~ beds + baths + squarefeet + propertytype +
## lotsize + year, data = modified)
##
## Residuals:
## Min 1Q Median 3Q Max
## -305362 -30428 -3673 29586 164824
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -1.307e+06 4.351e+05 -3.005
## beds -1.438e+04 6.264e+03 -2.296
## baths 4.398e+03 1.055e+04 0.417
## squarefeet 1.366e+02 8.950e+00 15.267
## propertytypeSingle Family Residential 4.096e+04 2.343e+04 1.748
## propertytypeTownhouse -7.914e+03 2.707e+04 -0.292
## lotsize 8.926e-01 2.154e-01 4.145
## year 6.807e+02 2.205e+02 3.087
## Pr(>|t|)
## (Intercept) 0.00295 **
## beds 0.02258 *
## baths 0.67717
## squarefeet < 2e-16 ***
## propertytypeSingle Family Residential 0.08170 .
## propertytypeTownhouse 0.77029
## lotsize 4.77e-05 ***
## year 0.00226 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56530 on 233 degrees of freedom
## Multiple R-squared: 0.7437, Adjusted R-squared: 0.736
## F-statistic: 96.57 on 7 and 233 DF, p-value: < 2.2e-16
I used the commands below to predict prices manually by changing the explanatory variable of interest (for example, beds = 1, beds = 2, and so on).
# Copy the numeric predictors out of redfin2 into standalone vectors. The
# formula below refers to them by these names, so the data frame passed to
# predict() has to use exactly the same names.
beds2 <- redfin2[["beds"]]
bath2 <- redfin2[["baths"]]
sq <- redfin2[["squarefeet"]]
lott <- redfin2[["lotsize"]]
year <- redfin2[["year"]]

# Main-effects model of price on the five numeric predictors.
test2 <- lm(
  formula = price ~ beds2 + lott + year + sq + bath2,
  data = redfin2
)

# One hypothetical house; change a single value here to see how the
# predicted price responds.
newdata2 <- data.frame(
  year = 2010,
  beds2 = 4,
  bath2 = 3,
  sq = 2400,
  lott = 6000
)
predict(test2, newdata = newdata2)
## 1
## 388094.2
# Single-predictor fit of price on square footage, then inverted:
# inverse.predict() (chemCal) estimates the squarefeet value at which the
# fitted line reaches the target price.
# 388094.2 is the price printed by the predict.lm() call on test2 above.
pricefit2 <- lm(redfin2$price ~ redfin2$squarefeet )
inverse.predict( pricefit2, 388094.2)
## $Prediction
## [1] 2378.597
##
## $`Standard Error`
## [1] 437.0672
##
## $Confidence
## [1] 862.0135
##
## $`Confidence Limits`
## [1] 1516.583 3240.610
# Same inverse prediction with the number of bedrooms as the only predictor:
# estimates the beds value at which the fitted line reaches 388094.2.
# NOTE(review): this reuses the name pricefit2, so the square-footage fit
# assigned earlier is overwritten from here on.
pricefit2 <- lm(redfin2$price ~ redfin2$beds )
inverse.predict( pricefit2, 388094.2)
## $Prediction
## [1] 3.914278
##
## $`Standard Error`
## [1] 2.863666
##
## $Confidence
## [1] 7.007139
##
## $`Confidence Limits`
## [1] -3.092861 10.921416
# Same inverse prediction with the number of bathrooms as the only predictor:
# estimates the baths value at which the fitted line reaches 388094.2.
pricefit3 <- lm(redfin2$price ~ redfin2$baths )
inverse.predict( pricefit3, 388094.2)
## $Prediction
## [1] 2.558189
##
## $`Standard Error`
## [1] 0.7374787
##
## $Confidence
## [1] 1.668293
##
## $`Confidence Limits`
## [1] 0.8898967 4.2264821
I used the predict function to predict prices for listings from a different zip code.
library(readr)
# Load the listings from the other zip code.
redfintest <- read.csv(
  file = "~/1 UW Tacoma/560 data mining/data/redfintest.csv"
)
# Complete-case subset of the test listings.
# NOTE: this reassigns `modified`, replacing the complete-case subset of
# redfin2 that was built earlier in the script.
modified <- redfintest[
  complete.cases(
    redfintest[, c(
      "location", "zip", "beds", "baths", "squarefeet",
      "propertytype", "lotsize", "price", "yearbuilt"
    )]
  ),
]
colnames(redfintest)
## [1] "propertytype" "city" "zip" "price"
## [5] "beds" "baths" "location" "squarefeet"
## [9] "lotsize" "yearbuilt"
# Predict prices for the other zip code with a model trained on the Puyallup
# data (redfin2). The model is deliberately NOT fitted on redfintest: that
# file is too small to estimate six coefficients, so a fit on it is
# rank-deficient, merely reproduces the observed prices, and yields NaN
# interval bounds.
test3 <- lm(price ~ beds + lotsize + year + squarefeet + baths, data = redfin2)

# predict() matches newdata columns to the model terms by name. The training
# data calls the construction year `year`, the test file calls it
# `yearbuilt`, so rename it on a copy and leave redfintest itself untouched.
redfintest_newdata <- redfintest
names(redfintest_newdata)[names(redfintest_newdata) == "yearbuilt"] <- "year"

# Fitted price plus a confidence interval for each test listing; rows with a
# missing predictor come back as NA.
predict(test3, newdata = redfintest_newdata, interval = "confidence")
## Warning in predict.lm(test3, newdata = redfintest, interval =
## "confidence"): prediction from a rank-deficient fit may be misleading
## Warning in qt((1 - level)/2, df): NaNs produced
## fit lwr upr
## 1 399999 NaN NaN
## 2 295000 NaN NaN
## 3 332000 NaN NaN
## 4 545900 NaN NaN
## 5 495000 NaN NaN
Sources: