library("ggplot2")
library("readr")
# Load the house-price data set into `H`; readr guesses the column types and
# reports the specification it chose.
H <- read_csv(
  file = "E:/Google Drive/University/STAT7055/House price (data set).csv"
)
## Parsed with column specification:
## cols(
##   price = col_double(),
##   size = col_double(),
##   distance = col_double(),
##   garage = col_double()
## )

Scatterplot of house price against house size

# House price (y) against house size (x), drawn as semi-transparent red points.
ggplot(H, aes(x = size, y = price)) +
  geom_point(colour = "red", alpha = 0.4) +
  theme_classic()

Scatterplot of price against distance

# House price (y) against distance (x), drawn as semi-transparent green points.
ggplot(H, aes(x = distance, y = price)) +
  geom_point(colour = "dark green", alpha = 0.4) +
  theme_classic()

# Fit a linear model of price on size and distance, then print the coefficient
# table and fit statistics.
model.1 <- lm(formula = price ~ size + distance, data = H)
summary(model.1)
## 
## Call:
## lm(formula = price ~ size + distance, data = H)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -256.725  -53.037   -3.662   58.242  193.875 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  25.1016   105.2765   0.238    0.812
## size          0.1846     1.1355   0.163    0.871
## distance      5.4105     4.9065   1.103    0.273
## 
## Residual standard error: 92.55 on 102 degrees of freedom
## Multiple R-squared:  0.06897,    Adjusted R-squared:  0.05071 
## F-statistic: 3.778 on 2 and 102 DF,  p-value: 0.02613
# Price against size with a fitted least-squares line. Price is the response
# in model.1, so it goes on the y axis; with the axes the other way round,
# geom_smooth() would regress size on price instead.
ggplot(H, aes(x = size, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_classic()

We will now perform some regression diagnostics, starting with a histogram of the residuals. We are looking for a roughly normal shape.

# Histogram of the residuals of model.1, using bins 10 units wide.
# The residuals are copied into their own data frame so that aes() maps a
# column by name rather than reaching into the model object with `$`, and so
# the x axis gets a readable title.
residual_data <- data.frame(residual = residuals(model.1))
ggplot(residual_data, aes(x = residual)) +
  geom_histogram(binwidth = 10, alpha = 0.5, colour = "red", fill = "white") +
  labs(x = "Residual", y = "Count") +
  theme_classic()

We now create a graph of the residuals plotted against the fitted values to check for possible heteroscedasticity

# Residuals of model.1 against its fitted values, with a red reference line at
# zero. Both series are gathered into one data frame so that aes() maps
# columns by name rather than using `$` on the model object.
fit_diagnostics <- data.frame(
  fitted = fitted(model.1),
  residual = residuals(model.1)
)
ggplot(fit_diagnostics, aes(x = fitted, y = residual)) +
  geom_point(alpha = 0.5, colour = "blue") +
  geom_hline(yintercept = 0, colour = "red") +
  labs(x = "Fitted value", y = "Residual") +
  theme_classic()