Data is From the county property appraiser. Clean up steps are listed here:
library("ggplot2")
library("viridis")
Loading required package: viridisLite
library("viridisLite")
#Load data from the Property Appraiser's website
data <- read.csv("~/Downloads/3bed.csv")
data$beds <- rep(x=3, times=dim(data)[1])
data2 <- read.csv("~/Downloads/4bed.csv")
data2$beds <- rep(x=4, times=dim(data2)[1])
data<-rbind(data, data2)
invalid factor level, NA generated
# Set date format
data$Sale_Date <-as.Date(data$Sale_Date, format="%m/%d/%Y")
# filter out records with NA's
data<-data[is.na(data$HtdSqFt) ==FALSE,]
data<-data[data$HtdSqFt > 0,]
data<-data[is.na(data$Bldg_Value)==FALSE,]
data<-data[is.na(data$Sale_Price)==FALSE,]
data<-data[data$Sale_Price<800000,]
data<-data[is.na(data$Sale_Date)==FALSE,]
# Calculate cost per heated square foot
data$costPerSqFt<-data$Sale_Price/data$HtdSqFt
# Add land and building value together
data$appraised <- data$Land_Value + data$Bldg_Value
#filter out data before 2010
data<-data[data$Sale_Date>"2010-01-01",]
# Filter out extreamly low value sales that are likely to be interfamily transfers etc.
data<-data[data$Sale_Price>50000,]
# Plot the Log odds ratio of Sale Price / Appraised price to find outliers
ggplot(data = data, aes(x=log2(Sale_Price/appraised))) +
geom_histogram(bins=50) +
xlab("log2 odds ratio of sale prise vs appraised price ")
# filter outliers
data<-data[abs(log2(data$Sale_Price/data$appraised))<1, ]
hoi <- data.frame(Sale_Date=as.Date("2017-08-01"),appraised=239600, HtdSqFt=2629,costPerSqFt=310000/2629, Sale_Price=310000, Land_Value=36000, Bldg_Value=193600 )
ggplot(data= data, aes(x=Sale_Price, y= appraised)) + geom_point(aes(color=log10(HtdSqFt)), size=0.5) + scale_color_viridis() +
geom_smooth(method ="lm") +
ylab("Appraised value") +
xlab("Sale price") +
geom_point(data = hoi, colour = "red")
del <- 5000
housevalue <-hoi$appraised
slice <- data[(data$appraised > (housevalue - del) & data$appraised < (housevalue +del)),]
ggplot(data=slice, aes(x=Sale_Price)) + geom_histogram(bins=20)
# Plot the Price per Square Foot over time
#datasubset<- data[(data$Sale_Price>180000 & data$Sale_Price < 350000), ]
ggplot(data=data, aes(x=Sale_Date, y=costPerSqFt)) +
geom_point(aes(color=log10(Sale_Price)), size=0.5) +
scale_color_viridis() +
geom_smooth(method="loess") +
ylab("Dollars per Square Foot") +
xlab("Year") +
geom_point(data = hoi, colour = "red")
ggplot(data=data, aes(x=Sale_Price, y=costPerSqFt)) +
scale_x_log10(minor=c(1e5,2e5,3e5,4e5,6e5,7e5)) +
geom_point(aes(color=log10(HtdSqFt)),size=0.5) +
scale_color_viridis() +
geom_smooth(method ="lm") +
ylab("Dollars per Square Foot") +
xlab("Sale price on a log scale") +
geom_point(data = hoi, colour = "red")
ggplot(data=data, aes(x=Sale_Price, y=HtdSqFt)) +
geom_point(aes(color=costPerSqFt), size=0.5) +
scale_color_viridis() +
geom_smooth(method = "lm") +
ylab("Square Footage") +
xlab("Sale Price") +
geom_point(data = hoi, colour = "red")
mod0<- lm(Sale_Price ~ appraised, data = data)
summary(mod1)
Call:
glm(formula = Sale_Price ~ appraised, data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-166686 -14993 -480 13487 152306
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.802e+03 1.138e+03 3.341 0.000844 ***
appraised 1.206e+00 7.709e-03 156.454 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 748637182)
Null deviance: 2.1096e+13 on 3702 degrees of freedom
Residual deviance: 2.7707e+12 on 3701 degrees of freedom
AIC: 86179
Number of Fisher Scoring iterations: 2
p.df<- data.frame(appraised=housevalue)
predict(mod0, p.df,se.fit=T, level=.9, interval = "prediction")
$fit
fit lwr upr
1 292766.5 247724.6 337808.4
$se.fit
[1] 918.9558
$df
[1] 3701
$residual.scale
[1] 27361.24
mod2<-glm(Sale_Price ~ HtdSqFt + Land_Value + Bldg_Value, data=data)
summary(mod2)
Call:
glm(formula = Sale_Price ~ HtdSqFt + Land_Value + Bldg_Value,
data = data)
Deviance Residuals:
Min 1Q Median 3Q Max
-152396 -15132 -59 13827 146972
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.529e+03 1.473e+03 -1.717 0.0861 .
HtdSqFt 8.220e+00 1.360e+00 6.045 1.64e-09 ***
Land_Value 1.328e+00 3.036e-02 43.726 < 2e-16 ***
Bldg_Value 1.099e+00 1.412e-02 77.804 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 731541312)
Null deviance: 2.1096e+13 on 3702 degrees of freedom
Residual deviance: 2.7060e+12 on 3699 degrees of freedom
AIC: 86095
Number of Fisher Scoring iterations: 2
BIC(mod2)
[1] 86126.43
preds<-predict(mod2, hoi, se.fit=T,type="link")
preds
$fit
1
279575.7
$se.fit
[1] 973.5242
$residual.scale
[1] 27047.02