library("ggplot2")
library("readr")
# Load the house-price data set into `H`. readr reports the parsed columns
# (price, size, distance, garage) in the column specification echoed below.
H <- read_csv(
  file = "E:/Google Drive/University/STAT7055/House price (data set).csv"
)
## Parsed with column specification:
## cols(
## price = col_double(),
## size = col_double(),
## distance = col_double(),
## garage = col_double()
## )
Scatterplot of house price against house size
# Scatterplot with house size on the x-axis and price on the y-axis;
# semi-transparent points make overlapping observations visible.
ggplot(H, aes(x = size, y = price)) +
  geom_point(colour = "red", alpha = 0.4) +
  theme_classic()
Scatterplot of price against distance
# Scatterplot with distance on the x-axis and price on the y-axis;
# semi-transparent points make overlapping observations visible.
ggplot(H, aes(x = distance, y = price)) +
  geom_point(colour = "dark green", alpha = 0.4) +
  theme_classic()
# Multiple linear regression: price is the response, with size and distance
# as the two explanatory variables.
model.1 <- lm(formula = price ~ size + distance, data = H)

# Print the coefficient table, residual summary and overall fit statistics.
summary(model.1)
##
## Call:
## lm(formula = price ~ size + distance, data = H)
##
## Residuals:
## Min 1Q Median 3Q Max
## -256.725 -53.037 -3.662 58.242 193.875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.1016 105.2765 0.238 0.812
## size 0.1846 1.1355 0.163 0.871
## distance 5.4105 4.9065 1.103 0.273
##
## Residual standard error: 92.55 on 102 degrees of freedom
## Multiple R-squared: 0.06897, Adjusted R-squared: 0.05071
## F-statistic: 3.778 on 2 and 102 DF, p-value: 0.02613
# Scatterplot of price against size with a least-squares line and its
# confidence band.
# price is the response in model.1 and in the earlier scatterplot, so it
# belongs on the y-axis: geom_smooth() regresses y on x, and with the axes
# the other way round it would fit size on price instead.
# Note that this is the simple regression of price on size only; it does not
# adjust for distance the way model.1 does.
ggplot(data = H, aes(x = size, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_classic()
We will now perform some regression diagnostics, starting with a histogram of the residuals. We are looking for a roughly normal (bell-shaped) distribution.
# Histogram of the residuals of model.1, used to judge whether they look
# roughly normal.
# The residuals are extracted with residuals() into an explicit data frame so
# that aes() maps a real column, rather than passing the lm object as `data`
# and indexing it with `$` inside aes().
ggplot(
  data.frame(residual = residuals(model.1)),
  aes(x = residual)
) +
  geom_histogram(binwidth = 10, alpha = 0.5, color = "red", fill = "white") +
  labs(x = "Residual") +
  theme_classic()
We now plot the residuals against the fitted values to check for possible heteroscedasticity (a spread of the residuals that changes with the fitted value).
# Residuals of model.1 plotted against its fitted values, used to look for
# heteroscedasticity.
# Both series are extracted into an explicit data frame so that aes() maps
# real columns, rather than passing the lm object as `data` and indexing it
# with `$` inside aes().
ggplot(
  data.frame(
    fitted = fitted(model.1),
    residual = residuals(model.1)
  ),
  aes(x = fitted, y = residual)
) +
  geom_point(alpha = 0.5, color = "blue") +
  # Horizontal reference line at zero: residuals should scatter evenly
  # around it.
  geom_hline(yintercept = 0, colour = "red") +
  labs(x = "Fitted value", y = "Residual") +
  theme_classic()