# NOTE(review): hard-coded absolute path on one user's machine. Relative file
# reads later in the script (e.g. the read.csv call) resolve against this
# directory, so the script only runs where this folder exists.
setwd("C:/Users/gabeg/Documents/Uni/Stat 5003/Week 2")
# Attach the tidyverse; the startup message echoed below lists the packages it
# loads (ggplot2, dplyr, tidyr, readr, ...), which supply %>%, filter(),
# group_by(), summarise() and ggplot() used in this script.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## 1.1
# Read the Melbourne housing CSV from the current working directory.
melb <- read.csv(file = "Melbourne_housing_FULL.csv")
##1.2
# Keep only the rows whose Suburb is one of the three suburbs being compared.
melb_filter <- melb %>%
  filter(Suburb %in% c("Hawthorn", "Brunswick", "Craigieburn"))
## get Mean and Median Price
# Per-suburb summary table: number of rows, plus the mean and median of Price
# and BuildingArea with missing values ignored.
summarise(
  group_by(melb_filter, Suburb),
  Num_properties = n(),
  Mean_price = mean(Price, na.rm = TRUE),
  Median_price = median(Price, na.rm = TRUE),
  Mean_Building_Area = mean(BuildingArea, na.rm = TRUE),
  Median_Building_Area = median(BuildingArea, na.rm = TRUE)
)
## # A tibble: 3 x 6
## Suburb Num_properties Mean_price Median_price Mean_Building_Area
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Brunswick 444 977989. 950000 122.
## 2 Craigieburn 255 566173. 562500 161.
## 3 Hawthorn 428 1238074. 750500 144.
## # ... with 1 more variable: Median_Building_Area <dbl>
# Make the black-and-white theme the default for every ggplot drawn after this.
theme_set(theme_bw())

# Scatter plot of price (in thousands) against building area, coloured by
# suburb.
melb_filter %>%
  ggplot(mapping = aes(x = BuildingArea, y = Price / 1000, colour = Suburb)) +
  geom_point() +
  labs(
    title = "Property Price vs. Building Area by Suburb",
    x = "Building Area in metres",
    y = "Price in thousands"
  )
## Warning: Removed 709 rows containing missing values (geom_point).
## Let’s have a look at the spread of Building size per Suburb
# Base-graphics boxplot: one box of BuildingArea per suburb.
boxplot(
  BuildingArea ~ as.factor(Suburb),
  data = melb_filter,
  xlab = "Suburbs",
  ylab = "Building Area",
  main = "Spread of Building Area Across Selected Suburbs"
)
# Draw four per-suburb boxplots (price, land size, car parks, rooms) on a
# 2 x 2 grid. par() returns the previous settings, which are kept so the
# layout can be restored once the panel is complete instead of leaking into
# any later base-graphics plot.
old_par <- par(mfrow = c(2, 2))

# Price is divided by 1000 so the axis reads in thousands.
boxplot(
  Price / 1000 ~ as.factor(Suburb),
  data = melb_filter,
  main = "Price per Suburbs",
  ylab = "Price in thousands",
  xlab = "Suburbs"
)

boxplot(
  Landsize ~ as.factor(Suburb),
  data = melb_filter,
  main = "Landsize per Suburbs",
  ylab = "Landsize",
  xlab = "Suburbs"
)

# The y-axis label previously read "Building Area" (copied from another plot);
# this panel shows the Car variable.
boxplot(
  Car ~ as.factor(Suburb),
  data = melb_filter,
  main = "Carparks per Suburbs",
  ylab = "Carparks",
  xlab = "Suburbs"
)

boxplot(
  Rooms ~ as.factor(Suburb),
  data = melb_filter,
  main = "Rooms per Suburbs",
  ylab = "Rooms",
  xlab = "Suburbs"
)

# Put the graphics layout back to what it was before the panel.
par(old_par)
## 1.3
# Simple linear regression of Price on BuildingArea for the selected suburbs.
model <- lm(
  formula = Price ~ BuildingArea,
  data = melb_filter
)
# Fitted intercept and slope.
coef(model)
## (Intercept) BuildingArea
## 518192.115 3800.746
# Full fit report: residual quantiles, coefficient table, R-squared, F-test.
summary(model)
##
## Call:
## lm(formula = Price ~ BuildingArea, data = melb_filter)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3421769 -463874 -148030 259071 6052396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 518192.1 66595.6 7.781 5.74e-14 ***
## BuildingArea 3800.7 408.2 9.311 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 730000 on 416 degrees of freedom
## (709 observations deleted due to missingness)
## Multiple R-squared: 0.1725, Adjusted R-squared: 0.1705
## F-statistic: 86.69 on 1 and 416 DF, p-value: < 2.2e-16
##1.4
# Regress price (in thousands) on suburb, entered as a factor, plus building
# area.
multi_model1 <- lm(
  formula = Price / 1000 ~ factor(Suburb) + BuildingArea,
  data = melb_filter
)
# Fitted coefficients: intercept, the two suburb terms and the area slope.
coef(multi_model1)
## (Intercept) factor(Suburb)Craigieburn factor(Suburb)Hawthorn
## 509.756694 -660.277321 400.715877
## BuildingArea
## 4.435441
# Full fit report for the suburb + building-area model.
summary(multi_model1)
##
## Call:
## lm(formula = Price/1000 ~ factor(Suburb) + BuildingArea, data = melb_filter)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4074.1 -248.5 -26.7 166.0 5479.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 509.7567 62.4510 8.163 3.99e-15 ***
## factor(Suburb)Craigieburn -660.2773 72.9115 -9.056 < 2e-16 ***
## factor(Suburb)Hawthorn 400.7159 72.5435 5.524 5.87e-08 ***
## BuildingArea 4.4354 0.3469 12.786 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 609.9 on 414 degrees of freedom
## (709 observations deleted due to missingness)
## Multiple R-squared: 0.425, Adjusted R-squared: 0.4208
## F-statistic: 102 on 3 and 414 DF, p-value: < 2.2e-16
## Visualisation for 1.4a
# Same scatter as before, with a straight-line fit per suburb overlaid.
# se = FALSE hides the confidence band; fullrange = TRUE extends each line
# across the whole x-axis rather than only that suburb's data range.
ggplot(melb_filter, aes(BuildingArea, Price / 1000, colour = Suburb)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
  labs(
    title = "Property Price vs. Building Area by Suburb",
    x = "Building Area in metres",
    y = "Price in thousands"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 709 rows containing non-finite values (stat_smooth).
## Warning: Removed 709 rows containing missing values (geom_point).
# Extend the suburb + building-area model with the Car predictor; the response
# is price in thousands.
multi_model2 <- lm(
  formula = Price / 1000 ~ Suburb + BuildingArea + Car,
  data = melb_filter
)
# Full fit report for the three-predictor model.
summary(multi_model2)
##
## Call:
## lm(formula = Price/1000 ~ Suburb + BuildingArea + Car, data = melb_filter)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3417.3 -273.4 -59.2 252.5 5001.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 333.5607 67.0114 4.978 9.56e-07 ***
## SuburbCraigieburn -781.1077 73.7764 -10.588 < 2e-16 ***
## SuburbHawthorn 363.4368 71.1395 5.109 5.02e-07 ***
## BuildingArea 3.7617 0.3513 10.708 < 2e-16 ***
## Car 220.7405 34.6168 6.377 4.98e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 588.5 on 402 degrees of freedom
## (720 observations deleted due to missingness)
## Multiple R-squared: 0.477, Adjusted R-squared: 0.4718
## F-statistic: 91.66 on 4 and 402 DF, p-value: < 2.2e-16
After adding Car spaces to the linear regression model, the p-values show that Car is significant at the 1% level, and the multiple R-squared increases from 0.425 to 0.477. Because multiple R-squared can never decrease when a predictor is added, the adjusted R-squared is the fairer comparison; it also increases, from 0.4208 to 0.4718, so this model does a better job of explaining the variability of the actual data. However, an R-squared of 0.477 is not a very high score: the model still leaves more than half of the variability in property prices unexplained, even though it is slightly better than before.