The data for the housing price prediction model was downloaded from redfin.com for the Puyallup, WA area.

I used a predict function to predict the price for a certain area using the model for a given area code.

library(XLConnect)

## Loading required package: XLConnectJars

## XLConnect 0.2-14 by Mirai Solutions GmbH [aut],
##   Martin Studer [cre],
##   The Apache Software Foundation [ctb, cph] (Apache POI),
##   Graph Builder [ctb, cph] (Curvesapi Java library)

## http://www.mirai-solutions.com
## https://github.com/miraisolutions/xlconnect

library(tidyverse)

## -- Attaching packages ------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.3.0
## v readr   1.1.1     v forcats 0.3.0

## Warning: package 'ggplot2' was built under R version 3.4.4

## -- Conflicts ---------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(corrgram)
library(chemCal)
library(forecast)
library(urca)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

library(readr)
# Read the Redfin export used to train the models.
redfin2 <- read.csv(file = "~/1 UW Tacoma/560 data mining/data/redfin.csv")

# List the columns available in the data set.
colnames(redfin2)

## [1] "price"        "beds"         "baths"        "squarefeet"  
## [5] "lotsize"      "year"         "neighborhood" "propertytype"

# Check whether any cell in the raw data is missing.
sum(is.na(redfin2)) > 0

## [1] TRUE

# Keep only the listings that have a value in every modelling column.
modified <- redfin2[
  complete.cases(
    redfin2[, c("neighborhood", "beds", "baths", "squarefeet",
                "propertytype", "lotsize", "price", "year")]
  ),
]

# Summarise the raw (unfiltered) data, so the NA counts per column are shown.
summary(redfin2)

##      price             beds           baths         squarefeet  
##  Min.   :105000   Min.   :0.000   Min.   :1.000   Min.   : 710  
##  1st Qu.:285000   1st Qu.:3.000   1st Qu.:2.250   1st Qu.:1768  
##  Median :366900   Median :4.000   Median :2.500   Median :2250  
##  Mean   :367423   Mean   :3.602   Mean   :2.394   Mean   :2240  
##  3rd Qu.:424999   3rd Qu.:4.000   3rd Qu.:2.500   3rd Qu.:2707  
##  Max.   :749950   Max.   :7.000   Max.   :3.750   Max.   :4516  
##                                   NA's   :3                     
##     lotsize            year                     neighborhood
##  Min.   :    34   Min.   :1900   Puyallup             :150  
##  1st Qu.:  5000   1st Qu.:1989   South Hill           : 61  
##  Median :  6698   Median :2004                        : 17  
##  Mean   : 13660   Mean   :1998   English Ridge        : 12  
##  3rd Qu.: 14062   3rd Qu.:2017   Summerwood Park      : 12  
##  Max.   :179032   Max.   :2017   Arborvue at Fruitland:  9  
##  NA's   :25       NA's   :19     (Other)              :  8  
##                     propertytype
##  Condo/Co-op              :  8  
##  Single Family Residential:249  
##  Townhouse                : 12  
##                                 
##                                 
##                                 
##

# Correlation matrix of the numeric columns; columns 7:8 (neighborhood and
# propertytype) are categorical and are dropped.
# baths, lotsize and year contain missing values, so without `use =` every
# coefficient involving them is returned as NA. "pairwise.complete.obs"
# computes each coefficient from the rows where both variables are present;
# coefficients between columns without NAs (price, beds, squarefeet) are
# unaffected.
# NOTE: the NA cells in the matrix printed below predate this change;
# re-knit the document to refresh the output.
cor1 <- cor(redfin2[, -c(7:8)], use = "pairwise.complete.obs")
cor1

##                price      beds baths squarefeet lotsize year
## price      1.0000000 0.4991457    NA  0.8506723      NA   NA
## beds       0.4991457 1.0000000    NA  0.6072977      NA   NA
## baths             NA        NA     1         NA      NA   NA
## squarefeet 0.8506723 0.6072977    NA  1.0000000      NA   NA
## lotsize           NA        NA    NA         NA       1   NA
## year              NA        NA    NA         NA      NA    1

I identified a parsimonious model based on the Categorical Variables Lab and the Interaction Variables Lab.

# Multiple linear regression of price on beds, baths, square footage,
# property type, lot size and year, fitted on `modified` (the rows of redfin2
# with no missing values in the modelling columns).
# propertytype enters as a categorical predictor, so summary() reports one
# coefficient per non-reference level.
test1 <- lm(price ~ beds + baths + squarefeet + propertytype + lotsize + year, data = modified)
# Print the coefficient table, residual summary and fit statistics.
summary(test1)

## 
## Call:
## lm(formula = price ~ beds + baths + squarefeet + propertytype + 
##     lotsize + year, data = modified)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -305362  -30428   -3673   29586  164824 
## 
## Coefficients:
##                                         Estimate Std. Error t value
## (Intercept)                           -1.307e+06  4.351e+05  -3.005
## beds                                  -1.438e+04  6.264e+03  -2.296
## baths                                  4.398e+03  1.055e+04   0.417
## squarefeet                             1.366e+02  8.950e+00  15.267
## propertytypeSingle Family Residential  4.096e+04  2.343e+04   1.748
## propertytypeTownhouse                 -7.914e+03  2.707e+04  -0.292
## lotsize                                8.926e-01  2.154e-01   4.145
## year                                   6.807e+02  2.205e+02   3.087
##                                       Pr(>|t|)    
## (Intercept)                            0.00295 ** 
## beds                                   0.02258 *  
## baths                                  0.67717    
## squarefeet                             < 2e-16 ***
## propertytypeSingle Family Residential  0.08170 .  
## propertytypeTownhouse                  0.77029    
## lotsize                               4.77e-05 ***
## year                                   0.00226 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56530 on 233 degrees of freedom
## Multiple R-squared:  0.7437, Adjusted R-squared:  0.736 
## F-statistic: 96.57 on 7 and 233 DF,  p-value: < 2.2e-16

I used the command below to predict prices manually by changing the value of the explanatory variable I was interested in (for example, beds = 1, beds = 2, and so on).

# Short aliases for the predictor columns of redfin2; the model formula and
# the newdata frame below both refer to these names.
beds2 <- redfin2[["beds"]]
bath2 <- redfin2[["baths"]]
sq <- redfin2[["squarefeet"]]
lott <- redfin2[["lotsize"]]
year <- redfin2[["year"]]

# Price as a linear function of the five numeric predictors.
test2 <- lm(price ~ beds2 + lott + year + sq + bath2, data = redfin2)

# One hypothetical house; edit a value here to see how the predicted price
# responds to that explanatory variable.
newdata2 <- data.frame(
  year = 2010,
  beds2 = 4,
  bath2 = 3,
  sq = 2400,
  lott = 6000
)
predict(test2, newdata = newdata2)

##        1 
## 388094.2

I used inverse prediction to estimate the number of beds, the number of baths, and the square footage that correspond to the price obtained in part b.

I entered the predicted price and switched the explanatory variable between beds, baths, and square feet.

# Simple regression of price on square footage, then solve it backwards:
# which square footage corresponds to a price of 388094.2 (the value
# predicted above)?
pricefit2 <- lm(price ~ squarefeet, data = redfin2)
inverse.predict(pricefit2, newdata = 388094.2)

## $Prediction
## [1] 2378.597
## 
## $`Standard Error`
## [1] 437.0672
## 
## $Confidence
## [1] 862.0135
## 
## $`Confidence Limits`
## [1] 1516.583 3240.610

# Simple regression of price on number of beds, inverted at the same price.
# This reuses the name pricefit2, replacing the square-footage fit.
pricefit2 <- lm(price ~ beds, data = redfin2)
inverse.predict(pricefit2, newdata = 388094.2)

## $Prediction
## [1] 3.914278
## 
## $`Standard Error`
## [1] 2.863666
## 
## $Confidence
## [1] 7.007139
## 
## $`Confidence Limits`
## [1] -3.092861 10.921416

# Simple regression of price on number of baths, inverted at the same price.
pricefit3 <- lm(price ~ baths, data = redfin2)
inverse.predict(pricefit3, newdata = 388094.2)

## $Prediction
## [1] 2.558189
## 
## $`Standard Error`
## [1] 0.7374787
## 
## $Confidence
## [1] 1.668293
## 
## $`Confidence Limits`
## [1] 0.8898967 4.2264821

I used the predict function to predict prices for data from a different zip code.

library(readr)
# Read the listings for the second zip code.
redfintest <- read.csv(file = "~/1 UW Tacoma/560 data mining/data/redfintest.csv")

# Keep only the listings with a value in every column of interest.
# NOTE(review): this reassigns `modified`, which previously held the cleaned
# training rows of redfin2.
modified <- redfintest[
  complete.cases(
    redfintest[, c("location", "zip", "beds", "baths", "squarefeet",
                   "propertytype", "lotsize", "price", "yearbuilt")]
  ),
]
# List the columns available in the new data set.
colnames(redfintest)

##  [1] "propertytype" "city"         "zip"          "price"       
##  [5] "beds"         "baths"        "location"     "squarefeet"  
##  [9] "lotsize"      "yearbuilt"

# Score the new zip code with the model that was trained on the Puyallup data
# (test2). The earlier version refit the model on redfintest and then
# predicted those same rows, which produced the rank-deficient-fit warning and
# NaN confidence limits.
# test2 was fitted with the aliases beds2, lott, year, sq and bath2, so the
# new data must expose its columns under those names; redfintest stores the
# construction year as `yearbuilt`.
newdata3 <- data.frame(
  beds2 = redfintest$beds,
  lott = redfintest$lotsize,
  year = redfintest$yearbuilt,
  sq = redfintest$squarefeet,
  bath2 = redfintest$baths
)

# Predicted price for each new listing, with confidence limits.
# NOTE: the output printed below came from the earlier version of this chunk;
# re-knit the document to refresh it.
predict(test2, newdata = newdata3, interval = "confidence")

## Warning in predict.lm(test3, newdata = redfintest, interval =
## "confidence"): prediction from a rank-deficient fit may be misleading

## Warning in qt((1 - level)/2, df): NaNs produced

##      fit lwr upr
## 1 399999 NaN NaN
## 2 295000 NaN NaN
## 3 332000 NaN NaN
## 4 545900 NaN NaN
## 5 495000 NaN NaN

Sources:

https://www.datascience.com/blog/introduction-to-forecasting-with-arima-in-r-learn-data-science-tutorials

http://pareonline.net/getvn.asp?v=7&n=2

Redfin Housing Price Prediction

Dev Shrestha

December 6, 2017

I used a predict function to predict the price for a certain area using the model for a given area code.

I used inverse prediction to estimate the number of beds, the number of baths, and the square footage that correspond to the price obtained in part b.

I entered the predicted price and switched the explanatory variable between beds, baths, and square feet.