The Data

Data was obtained from Redfin using the following parameters:

Data Cleansing

After loading our data we want to run the following steps to cleanse it:

  1. Remove unneeded columns
  2. Remove incomplete rows
#load data
alldata <- read.csv("./redfindata.csv")

#select columns
data <- alldata %>% select(PRICE,BEDS,BATHS,SQUARE.FEET,LOT.SIZE,YEAR.BUILT)

#remove missing data
df <-  data[complete.cases(data), ]

Basic Analysis

Let’s take a look at the distribution of our data by each variable mathmatically and graphically:

#compute mean, median, min, max, and standard deviation
DSUM<-data.frame(mean=sapply(df,mean,na.rm=TRUE)
                ,median=sapply(df,median,na.rm=TRUE)
                ,min=sapply(df,min,na.rm=TRUE)
                ,max=sapply(df, max,na.rm=TRUE)
                ,sd=sapply(df, sd,na.rm=TRUE))
colnames(DSUM)=c("mean","Median","Min","Max","Standard Deviation")
DSUM
                    mean   Median    Min      Max Standard Deviation
PRICE       2.866545e+05 293400.0 170000 475000.0       6.025266e+04
BEDS        3.761364e+00      4.0      2      7.0       8.022142e-01
BATHS       2.343750e+00      2.0      1      4.5       7.687433e-01
SQUARE.FEET 1.710261e+03   1476.0    816   4868.0       7.256928e+02
LOT.SIZE    1.088049e+04   9996.5   3515  23907.0       4.541870e+03
YEAR.BUILT  1.962159e+03   1959.0   1935   2016.0       1.771932e+01
#plot histograms for each variable
ggplot(gather(df),aes(value))+
  geom_histogram(bins=10)+facet_wrap(~key,scales="free_x")

Correlations

Next let’s check out how each variable correlates to each other.

#correlation numbers
cmdf=cor(df,use="complete.obs")
round(cmdf,2)
            PRICE BEDS BATHS SQUARE.FEET LOT.SIZE YEAR.BUILT
PRICE        1.00 0.11  0.66        0.65     0.11       0.37
BEDS         0.11 1.00  0.30        0.23     0.00       0.01
BATHS        0.66 0.30  1.00        0.55     0.04       0.44
SQUARE.FEET  0.65 0.23  0.55        1.00     0.13       0.25
LOT.SIZE     0.11 0.00  0.04        0.13     1.00      -0.09
YEAR.BUILT   0.37 0.01  0.44        0.25    -0.09       1.00
#correlation matrix
corrplot(cmdf,type="lower",main="Correlation matrix",mar=c(0,0,1,0),tl.cex=0.8,tl.col="black", tl.srt=45,
         col=brewer.pal(n=8, name="PuOr"))

Impacts on Price

While Square Feet and # of Bathrooms appear to have the largest correlation with price, since we have so few variables we can view how all of them impact the sale price

ggplot(data=df, aes(x=df$BEDS, y=df$PRICE)) + geom_point(size=2)+geom_smooth()

ggplot(data=df, aes(x=df$BATHS, y=df$PRICE)) + geom_point(size=2)+geom_smooth()

ggplot(data=df, aes(x=df$SQUARE.FEET, y=df$PRICE)) + geom_point(size=2)+geom_smooth()

ggplot(data=df, aes(x=df$LOT.SIZE, y=df$PRICE)) + geom_point(size=2)+geom_smooth()

ggplot(data=df, aes(x=df$YEAR.BUILT, y=df$PRICE)) + geom_point(size=2)+geom_smooth()

Building our Model

Before we can predict how much a home will sell for, we have to build our model.

# build training/testing sets
smp_size <- floor(0.75 * nrow(df))
# set the seed to make your partition reproductible
set.seed(22)
train_ind <- sample(seq_len(nrow(df)), size = smp_size)
train <- df[train_ind, ]
test <- df[-train_ind, ]

#create a full model first
tm<- train
tm<- lm(PRICE~.,data = tm)
smr <- summary(tm)
cat(paste(paste("Multiple R-squared: ",round(smr$r.squared,digits=2)), 
      paste("Adjusted R-squared: ",round(smr$adj.r.squared,digits=2)),
      sep='\n'))
Multiple R-squared:  0.56
Adjusted R-squared:  0.52
#run stepwise regression to find most accurate model
tmstep <- lm(PRICE~.,data=train)
step(tmstep, direction = "both")
Start:  AIC=1411.63
PRICE ~ BEDS + BATHS + SQUARE.FEET + LOT.SIZE + YEAR.BUILT

              Df  Sum of Sq        RSS    AIC
- LOT.SIZE     1 5.1871e+08 1.0752e+11 1409.9
- YEAR.BUILT   1 2.9618e+09 1.0996e+11 1411.4
<none>                      1.0700e+11 1411.6
- BEDS         1 5.3498e+09 1.1235e+11 1412.8
- BATHS        1 1.6101e+10 1.2310e+11 1418.9
- SQUARE.FEET  1 3.3804e+10 1.4080e+11 1427.7

Step:  AIC=1409.94
PRICE ~ BEDS + BATHS + SQUARE.FEET + YEAR.BUILT

              Df  Sum of Sq        RSS    AIC
- YEAR.BUILT   1 3.0764e+09 1.1060e+11 1409.8
<none>                      1.0752e+11 1409.9
- BEDS         1 5.3176e+09 1.1284e+11 1411.1
+ LOT.SIZE     1 5.1871e+08 1.0700e+11 1411.6
- BATHS        1 1.5971e+10 1.2349e+11 1417.1
- SQUARE.FEET  1 3.5110e+10 1.4263e+11 1426.6

Step:  AIC=1409.81
PRICE ~ BEDS + BATHS + SQUARE.FEET

              Df  Sum of Sq        RSS    AIC
<none>                      1.1060e+11 1409.8
+ YEAR.BUILT   1 3.0764e+09 1.0752e+11 1409.9
+ LOT.SIZE     1 6.3331e+08 1.0996e+11 1411.4
- BEDS         1 8.2545e+09 1.1885e+11 1412.6
- BATHS        1 3.1094e+10 1.4169e+11 1424.2
- SQUARE.FEET  1 3.9201e+10 1.4980e+11 1427.8

Call:
lm(formula = PRICE ~ BEDS + BATHS + SQUARE.FEET, data = train)

Coefficients:
(Intercept)         BEDS        BATHS  SQUARE.FEET  
  197979.76    -13626.66     32937.41        37.99  
#select model
tm2 <- lm(formula = PRICE ~ BEDS + BATHS + SQUARE.FEET, data = train)
smr2 <- summary(tm2)
cat(paste(paste("Multiple R-squared: ",round(smr2$r.squared,digits=2)), 
      paste("Adjusted R-squared: ",round(smr2$adj.r.squared,digits=2)),
      sep='\n'))
Multiple R-squared:  0.54
Adjusted R-squared:  0.52

Predicting Price

Now that we have our models, we will predict the price of a home with the following parameters:

# create test data record
new.record <- data.frame(BEDS = 3,
                         BATHS = 2,
                         SQUARE.FEET = 1937,
                         YEAR.BUILT = 1930,
                         LOT.SIZE = 22215)
#run in both models
price.tr <- predict(tm,newdata = new.record)
price.tr2 <- predict(tm2,newdata = new.record)

cat(paste("Regression Price Estimate 1: ",scales::dollar(price.tr,0.01)),
    paste("Regression Price Estimate 2: ",scales::dollar(price.tr2,0.01)),
    sep='\n')
Regression Price Estimate 1:  $288,795.03
Regression Price Estimate 2:  $296,552.09

An important item to note is that this model does not take into account any renovations and/or upgrades that may have been done to a home (as it was not included in the Redfin data). If you are using this model to extimate how much a home is worth you would need to adjust up for those factors and down for any major issues that would necessitate repairs.