#read in file
houseprices <- read.csv('house_prices.csv')
houseprices <- houseprices[complete.cases(houseprices),]
# a simple linear regression model
library(ggplot2)
#plot(houseprices$distance_MRT,houseprices$house_price)
model = lm(house_price ~ distance_MRT, data = houseprices)Midterm solution
A
3 Assumptions Tests:
library(MASS)
library(lmtest)Loading required package: zoo
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
#linearity test
model_full = lm(house_price ~ as.factor(distance_MRT), data = houseprices)
anova(model, model_full)Analysis of Variance Table
Model 1: house_price ~ distance_MRT
Model 2: house_price ~ as.factor(distance_MRT)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 273 179001556
2 88 15314845 185 163686710 5.0841 3.276e-15 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#homoscedasticity test
bptest(model)
studentized Breusch-Pagan test
data: model
BP = 4.2303, df = 1, p-value = 0.03971
#normal error dist test
shapiro.test(model$residuals)
Shapiro-Wilk normality test
data: model$residuals
W = 0.94274, p-value = 7.271e-09
The model does not satisfy the linear relationship assumption based on the low p-value of the F-test.
Using a significance level of 5%, the model does not satisfy the homoscedasticity assumption based on the p-value of the Breusch-Pagan test. It is of note, however, that it is close (p = 0.03971) to satisfying the assumption.
The model does not satisfy the normal distribution of errors assumption based on the p-value of the Shapiro-Wilk test.
First, I will use a log transform on the x-values to improve the fit. I have selected a log transform based on general trend of the scatter plot of distance_MRT vs. house_price.
#log transform
model_transformed <- lm(house_price ~ log(distance_MRT), data = houseprices)
#linearity test
model_full = lm(house_price ~ as.factor(log(distance_MRT)), data = houseprices)
anova(model_transformed, model_full)Analysis of Variance Table
Model 1: house_price ~ log(distance_MRT)
Model 2: house_price ~ as.factor(log(distance_MRT))
Res.Df RSS Df Sum of Sq F Pr(>F)
1 273 132008845
2 88 15314845 185 116693999 3.6245 1.073e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#homoscedasticity test
bptest(model_transformed)
studentized Breusch-Pagan test
data: model_transformed
BP = 12.04, df = 1, p-value = 0.0005208
#normal error dist test
shapiro.test(model_transformed$residuals)
Shapiro-Wilk normality test
data: model_transformed$residuals
W = 0.92304, p-value = 1.021e-10
This transformed model still does not fulfill any of the assumptions, so I will use Box-Cox to find an appropriate transformation for the y-values (house_price).
boxcox(model_transformed, lambda = seq(0,1,0.05))#lambda = 0.25 seems appropriate from the plot
model_boxcox <- lm(house_price^{0.25} ~ log(distance_MRT), data = houseprices)
#linearity test
model_full = lm(house_price^{0.25} ~ as.factor(log(distance_MRT)), data = houseprices)
anova(model_boxcox, model_full)Analysis of Variance Table
Model 1: house_price^{
0.25
} ~ log(distance_MRT)
Model 2: house_price^{
0.25
} ~ as.factor(log(distance_MRT))
Res.Df RSS Df Sum of Sq F Pr(>F)
1 273 114.991
2 88 11.201 185 103.79 4.4075 3.103e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#homoscedasticity test
bptest(model_boxcox)
studentized Breusch-Pagan test
data: model_boxcox
BP = 0.38054, df = 1, p-value = 0.5373
#normal error dist test
shapiro.test(model_boxcox$residuals)
Shapiro-Wilk normality test
data: model_boxcox$residuals
W = 0.98857, p-value = 0.0287
This model fails the linearity and normal distribution of errors assumptions, but fulfills the homoscedasticity assumption (p-value = 0.5373 > 0.05). This y-transformation did not work, so I will continue to iterate and try a log transform of the y-values.
try3 <- lm(log(house_price) ~ log(distance_MRT), data=houseprices)
#linearity test
model_full = lm(log(house_price) ~ as.factor(log(distance_MRT)), data = houseprices)
anova(try3, model_full)Analysis of Variance Table
Model 1: log(house_price) ~ log(distance_MRT)
Model 2: log(house_price) ~ as.factor(log(distance_MRT))
Res.Df RSS Df Sum of Sq F Pr(>F)
1 273 56.097
2 88 5.330 185 50.768 4.5311 1.308e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#homoscedasticity test
bptest(try3)
studentized Breusch-Pagan test
data: try3
BP = 3.3683, df = 1, p-value = 0.06646
#normal error dist test
shapiro.test(try3$residuals)
Shapiro-Wilk normality test
data: try3$residuals
W = 0.99103, p-value = 0.09097
This model fulfills the homoscedasticity assumption (p-value = 0.6646 > 0.05) and the normal error distribution assumption (p-value = 0.09097 > 0.05). However, it still does not satisfy the linearity assumption. In order to resolve this issue, the predictor values will be put into bins in order to increase the number of replicates for each distinct value of the predictor.
MRT_bins <- cut(houseprices$distance_MRT, breaks=10)
binned_full_model <- lm(log(house_price) ~ MRT_bins, data=houseprices)
anova(try3,binned_full_model)Analysis of Variance Table
Model 1: log(house_price) ~ log(distance_MRT)
Model 2: log(house_price) ~ MRT_bins
Res.Df RSS Df Sum of Sq F Pr(>F)
1 273 56.097
2 266 55.555 7 0.54234 0.371 0.9187
The p-value of the F-test for linearity is now 0.9187, which is greater than 0.05. This means that we can no longer reject the null hypothesis that there is a linear relationship between distance_MRT and house_price, and the linearity assumption is fulfilled.
B
Plotting the regression model
library(ggplot2)
#computing prediction interval
pred_int <- predict(try3, newdata=houseprices, interval="prediction", level=0.95)
pred_int <- data.frame(cbind(pred_int, distance_MRT = houseprices$distance_MRT))
#computing confidence interval
conf_int <- predict(try3, newdata=houseprices, interval="confidence", level=0.95)
conf_int <- data.frame(cbind(conf_int, distance_MRT = houseprices$distance_MRT))
colors <- c("Full Model (binned predictor)" = "red", "Power Model" = "blue", "Confidence Interval" = "green", "Prediction Interval" = "yellow")
ggplot(data = houseprices, aes(x = distance_MRT, y = house_price))+ geom_point()+ geom_line(aes(y = exp(try3$fitted.values), color = "Power Model"))+ geom_line(aes(y = exp(binned_full_model$fitted.values), color = "Full model (binned predictor)"))+ geom_line(data=pred_int, aes(x=distance_MRT, y=exp(lwr), color="Prediction Interval")) + geom_line(data=pred_int, aes(x=distance_MRT, y=exp(upr), color="Prediction Interval")) + geom_line(data=conf_int, aes(x=distance_MRT, y=exp(lwr), color="Confidence Interval")) + geom_line(data=conf_int, aes(x=distance_MRT, y=exp(upr), color="Confidence Interval")) + scale_color_manual(values = colors)+ theme(legend.title = element_blank(),legend.position = c(0.75,0.85))