Data was obtained from Redfin using the following parameters:
After loading our data we want to run the following steps to cleanse it:
#load data
alldata <- read.csv("./redfindata.csv")
#select columns
data <- alldata %>% select(PRICE,BEDS,BATHS,SQUARE.FEET,LOT.SIZE,YEAR.BUILT)
#remove missing data
df <- data[complete.cases(data), ]
Let’s take a look at the distribution of our data by each variable mathmatically and graphically:
#compute mean, median, min, max, and standard deviation
DSUM<-data.frame(mean=sapply(df,mean,na.rm=TRUE)
,median=sapply(df,median,na.rm=TRUE)
,min=sapply(df,min,na.rm=TRUE)
,max=sapply(df, max,na.rm=TRUE)
,sd=sapply(df, sd,na.rm=TRUE))
colnames(DSUM)=c("mean","Median","Min","Max","Standard Deviation")
DSUM
mean Median Min Max Standard Deviation
PRICE 2.866545e+05 293400.0 170000 475000.0 6.025266e+04
BEDS 3.761364e+00 4.0 2 7.0 8.022142e-01
BATHS 2.343750e+00 2.0 1 4.5 7.687433e-01
SQUARE.FEET 1.710261e+03 1476.0 816 4868.0 7.256928e+02
LOT.SIZE 1.088049e+04 9996.5 3515 23907.0 4.541870e+03
YEAR.BUILT 1.962159e+03 1959.0 1935 2016.0 1.771932e+01
#plot histograms for each variable
ggplot(gather(df),aes(value))+
geom_histogram(bins=10)+facet_wrap(~key,scales="free_x")
Next let’s check out how each variable correlates to each other.
#correlation numbers
cmdf=cor(df,use="complete.obs")
round(cmdf,2)
PRICE BEDS BATHS SQUARE.FEET LOT.SIZE YEAR.BUILT
PRICE 1.00 0.11 0.66 0.65 0.11 0.37
BEDS 0.11 1.00 0.30 0.23 0.00 0.01
BATHS 0.66 0.30 1.00 0.55 0.04 0.44
SQUARE.FEET 0.65 0.23 0.55 1.00 0.13 0.25
LOT.SIZE 0.11 0.00 0.04 0.13 1.00 -0.09
YEAR.BUILT 0.37 0.01 0.44 0.25 -0.09 1.00
#correlation matrix
corrplot(cmdf,type="lower",main="Correlation matrix",mar=c(0,0,1,0),tl.cex=0.8,tl.col="black", tl.srt=45,
col=brewer.pal(n=8, name="PuOr"))
While Square Feet and # of Bathrooms appear to have the largest correlation with price, since we have so few variables we can view how all of them impact the sale price
ggplot(data=df, aes(x=df$BEDS, y=df$PRICE)) + geom_point(size=2)+geom_smooth()
ggplot(data=df, aes(x=df$BATHS, y=df$PRICE)) + geom_point(size=2)+geom_smooth()
ggplot(data=df, aes(x=df$SQUARE.FEET, y=df$PRICE)) + geom_point(size=2)+geom_smooth()
ggplot(data=df, aes(x=df$LOT.SIZE, y=df$PRICE)) + geom_point(size=2)+geom_smooth()
ggplot(data=df, aes(x=df$YEAR.BUILT, y=df$PRICE)) + geom_point(size=2)+geom_smooth()
Before we can predict how much a home will sell for, we have to build our model.
# build training/testing sets
smp_size <- floor(0.75 * nrow(df))
# set the seed to make your partition reproductible
set.seed(22)
train_ind <- sample(seq_len(nrow(df)), size = smp_size)
train <- df[train_ind, ]
test <- df[-train_ind, ]
#create a full model first
tm<- train
tm<- lm(PRICE~.,data = tm)
smr <- summary(tm)
cat(paste(paste("Multiple R-squared: ",round(smr$r.squared,digits=2)),
paste("Adjusted R-squared: ",round(smr$adj.r.squared,digits=2)),
sep='\n'))
Multiple R-squared: 0.56
Adjusted R-squared: 0.52
#run stepwise regression to find most accurate model
tmstep <- lm(PRICE~.,data=train)
step(tmstep, direction = "both")
Start: AIC=1411.63
PRICE ~ BEDS + BATHS + SQUARE.FEET + LOT.SIZE + YEAR.BUILT
Df Sum of Sq RSS AIC
- LOT.SIZE 1 5.1871e+08 1.0752e+11 1409.9
- YEAR.BUILT 1 2.9618e+09 1.0996e+11 1411.4
<none> 1.0700e+11 1411.6
- BEDS 1 5.3498e+09 1.1235e+11 1412.8
- BATHS 1 1.6101e+10 1.2310e+11 1418.9
- SQUARE.FEET 1 3.3804e+10 1.4080e+11 1427.7
Step: AIC=1409.94
PRICE ~ BEDS + BATHS + SQUARE.FEET + YEAR.BUILT
Df Sum of Sq RSS AIC
- YEAR.BUILT 1 3.0764e+09 1.1060e+11 1409.8
<none> 1.0752e+11 1409.9
- BEDS 1 5.3176e+09 1.1284e+11 1411.1
+ LOT.SIZE 1 5.1871e+08 1.0700e+11 1411.6
- BATHS 1 1.5971e+10 1.2349e+11 1417.1
- SQUARE.FEET 1 3.5110e+10 1.4263e+11 1426.6
Step: AIC=1409.81
PRICE ~ BEDS + BATHS + SQUARE.FEET
Df Sum of Sq RSS AIC
<none> 1.1060e+11 1409.8
+ YEAR.BUILT 1 3.0764e+09 1.0752e+11 1409.9
+ LOT.SIZE 1 6.3331e+08 1.0996e+11 1411.4
- BEDS 1 8.2545e+09 1.1885e+11 1412.6
- BATHS 1 3.1094e+10 1.4169e+11 1424.2
- SQUARE.FEET 1 3.9201e+10 1.4980e+11 1427.8
Call:
lm(formula = PRICE ~ BEDS + BATHS + SQUARE.FEET, data = train)
Coefficients:
(Intercept) BEDS BATHS SQUARE.FEET
197979.76 -13626.66 32937.41 37.99
#select model
tm2 <- lm(formula = PRICE ~ BEDS + BATHS + SQUARE.FEET, data = train)
smr2 <- summary(tm2)
cat(paste(paste("Multiple R-squared: ",round(smr2$r.squared,digits=2)),
paste("Adjusted R-squared: ",round(smr2$adj.r.squared,digits=2)),
sep='\n'))
Multiple R-squared: 0.54
Adjusted R-squared: 0.52
Now that we have our models, we will predict the price of a home with the following parameters:
# create test data record
new.record <- data.frame(BEDS = 3,
BATHS = 2,
SQUARE.FEET = 1937,
YEAR.BUILT = 1930,
LOT.SIZE = 22215)
#run in both models
price.tr <- predict(tm,newdata = new.record)
price.tr2 <- predict(tm2,newdata = new.record)
cat(paste("Regression Price Estimate 1: ",scales::dollar(price.tr,0.01)),
paste("Regression Price Estimate 2: ",scales::dollar(price.tr2,0.01)),
sep='\n')
Regression Price Estimate 1: $288,795.03
Regression Price Estimate 2: $296,552.09
An important item to note is that this model does not take into account any renovations and/or upgrades that may have been done to a home (as it was not included in the Redfin data). If you are using this model to extimate how much a home is worth you would need to adjust up for those factors and down for any major issues that would necessitate repairs.