1.1

# Use the author's local data folder when it exists. On any other machine keep
# the current working directory (and say so) instead of aborting, so the
# relative data path can still resolve when the script is launched from the
# folder that holds the CSV file.
data_dir <- "C:/Users/gabeg/Documents/Uni/Stat 5003/Week 2"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Data directory not found; using working directory: ", getwd())
}
# tidyverse supplies the dplyr verbs, the %>% pipe and ggplot2.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## 1.1
# Read the complete Melbourne housing listing into a data frame.
melb <- read.csv(file = "Melbourne_housing_FULL.csv")

1.2

## 1.2
# Restrict the data to the three suburbs compared in this exercise.
melb_filter <- melb %>%
  filter(Suburb %in% c("Hawthorn", "Brunswick", "Craigieburn"))

## Per-suburb listing count plus mean/median of price and building area.
## Missing values are dropped inside each statistic, not from the row count.
melb_filter %>%
  group_by(Suburb) %>%
  summarise(
    Num_properties = n(),
    Mean_price = mean(Price, na.rm = TRUE),
    Median_price = median(Price, na.rm = TRUE),
    Mean_Building_Area = mean(BuildingArea, na.rm = TRUE),
    Median_Building_Area = median(BuildingArea, na.rm = TRUE)
  )
## # A tibble: 3 x 6
##   Suburb      Num_properties Mean_price Median_price Mean_Building_Area
##   <chr>                <int>      <dbl>        <dbl>              <dbl>
## 1 Brunswick              444    977989.       950000               122.
## 2 Craigieburn            255    566173.       562500               161.
## 3 Hawthorn               428   1238074.       750500               144.
## # ... with 1 more variable: Median_Building_Area <dbl>

Let’s visualise the relationship between Price and Building Area for each Suburb

# Make the black-and-white theme the default for every ggplot drawn afterwards.
theme_set(theme_bw())

# Scatter of price (divided by 1000) against building area, coloured by suburb.
ggplot(
  data = melb_filter,
  mapping = aes(x = BuildingArea, y = Price / 1000, color = Suburb)
) +
  geom_point() +
  labs(
    title = "Property Price vs. Building Area by Suburb",
    x = "Building Area in metres",
    y = "Price in thousands"
  )
## Warning: Removed 709 rows containing missing values (geom_point).

## Let’s have a look at the spread of Building size per Suburb

# Base-graphics box plot of building area, one box per suburb.
boxplot(
  BuildingArea ~ as.factor(Suburb),
  data = melb_filter,
  xlab = "Suburbs",
  ylab = "Building Area",
  main = "Spread of Building Area Across Selected Suburbs"
)

Let’s have a look at the spread across our other variables

# Draw four per-suburb box plots on a single 2 x 2 panel. The previous
# graphics settings are saved so the layout can be restored afterwards and
# later base plots are not squeezed into a quarter of the device.
old_par <- par(mfrow = c(2, 2))

# Sale price, divided by 1000 to keep the axis readable.
boxplot(
  Price / 1000 ~ as.factor(Suburb),
  data = melb_filter,
  main = "Price per Suburbs",
  ylab = "Price in thousands",
  xlab = "Suburbs"
)

# Land size.
boxplot(
  Landsize ~ as.factor(Suburb),
  data = melb_filter,
  main = "Landsize per Suburbs",
  ylab = "Landsize",
  xlab = "Suburbs"
)

# Car spaces. The y label previously read "Building Area" (copy-paste slip).
boxplot(
  Car ~ as.factor(Suburb),
  data = melb_filter,
  main = "Carparks per Suburbs",
  ylab = "Car spaces",
  xlab = "Suburbs"
)

# Number of rooms.
boxplot(
  Rooms ~ as.factor(Suburb),
  data = melb_filter,
  main = "Rooms per Suburbs",
  ylab = "Rooms",
  xlab = "Suburbs"
)

# Put the graphics layout back to what it was before the panel.
par(old_par)

1.3

## 1.3
# Simple linear regression of the unscaled Price on BuildingArea.
model <- lm(formula = Price ~ BuildingArea, data = melb_filter)

# Fitted intercept and slope.
coef(model)
##  (Intercept) BuildingArea 
##   518192.115     3800.746
# Full fit report: residual summary, coefficient table and R-squared.
summary(model)
## 
## Call:
## lm(formula = Price ~ BuildingArea, data = melb_filter)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3421769  -463874  -148030   259071  6052396 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  518192.1    66595.6   7.781 5.74e-14 ***
## BuildingArea   3800.7      408.2   9.311  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 730000 on 416 degrees of freedom
##   (709 observations deleted due to missingness)
## Multiple R-squared:  0.1725, Adjusted R-squared:  0.1705 
## F-statistic: 86.69 on 1 and 416 DF,  p-value: < 2.2e-16

1.4a

## 1.4
# Multiple regression of price (divided by 1000) on suburb, entered as a
# factor, and building area.
multi_model1 <- lm(
  formula = Price / 1000 ~ factor(Suburb) + BuildingArea,
  data = melb_filter
)

# Fitted coefficients: intercept, one dummy per non-baseline suburb, slope.
coef(multi_model1)
##               (Intercept) factor(Suburb)Craigieburn    factor(Suburb)Hawthorn 
##                509.756694               -660.277321                400.715877 
##              BuildingArea 
##                  4.435441
# Full fit report: residual summary, coefficient table and R-squared.
summary(multi_model1)
## 
## Call:
## lm(formula = Price/1000 ~ factor(Suburb) + BuildingArea, data = melb_filter)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4074.1  -248.5   -26.7   166.0  5479.9 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                509.7567    62.4510   8.163 3.99e-15 ***
## factor(Suburb)Craigieburn -660.2773    72.9115  -9.056  < 2e-16 ***
## factor(Suburb)Hawthorn     400.7159    72.5435   5.524 5.87e-08 ***
## BuildingArea                 4.4354     0.3469  12.786  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 609.9 on 414 degrees of freedom
##   (709 observations deleted due to missingness)
## Multiple R-squared:  0.425,  Adjusted R-squared:  0.4208 
## F-statistic:   102 on 3 and 414 DF,  p-value: < 2.2e-16

Interpreting the regression coefficients:

  • All variables are statistically significant at the 1% level.
  • The base case in our dummy-variable regression is the suburb Brunswick. The Hawthorn coefficient is positive, which means that, holding Building Area constant, the average price of a property in Hawthorn is $400,715.90 (400.7159 * 1000) higher than in Brunswick. The opposite is true for Craigieburn, whose coefficient is negative: holding Building Area constant, the average price of a property in Craigieburn is $660,277.30 (660.2773 * 1000) lower than in Brunswick.
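    • This could be intuited by looking at the graph. If we compare the fitted lines of the 3 suburbs, we see that Hawthorn’s is noticeably steeper and Craigieburn’s is slightly flatter than Brunswick’s. This would also mean that an increase (decrease) in Building Area leads to a larger increase (decrease) in Property Price in Hawthorn than in the other two suburbs. Note, however, that the lines in the graph are fitted separately for each suburb, whereas the regression above assumes one common Building Area slope (4.4354) and lets only the intercept differ between suburbs.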
  • The intercept can be thought of as the lowest price to enter the property market, i.e. the predicted price (509.7567 * 1000 = $509,756.70) when a property in the base suburb, Brunswick, has Building Area = 0. This is not practical, but it is a rough proxy for a floor price in the Melbourne property market.
## Visualisation for 1.4a
# Scatter of price (divided by 1000) against building area, coloured by suburb,
# with one straight-line fit per colour group. The lines are extended across
# the full x range and drawn without confidence bands.
ggplot(
  data = melb_filter,
  mapping = aes(x = BuildingArea, y = Price / 1000, color = Suburb)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
  labs(
    title = "Property Price vs. Building Area by Suburb",
    x = "Building Area in metres",
    y = "Price in thousands"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 709 rows containing non-finite values (stat_smooth).
## Warning: Removed 709 rows containing missing values (geom_point).

1.4b

# 1.4b: extend the suburb + building-area model with Car as a further
# explanatory variable.
multi_model2 <- lm(
  formula = Price / 1000 ~ Suburb + BuildingArea + Car,
  data = melb_filter
)

# Full fit report: residual summary, coefficient table and R-squared.
summary(multi_model2)
## 
## Call:
## lm(formula = Price/1000 ~ Suburb + BuildingArea + Car, data = melb_filter)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3417.3  -273.4   -59.2   252.5  5001.7 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        333.5607    67.0114   4.978 9.56e-07 ***
## SuburbCraigieburn -781.1077    73.7764 -10.588  < 2e-16 ***
## SuburbHawthorn     363.4368    71.1395   5.109 5.02e-07 ***
## BuildingArea         3.7617     0.3513  10.708  < 2e-16 ***
## Car                220.7405    34.6168   6.377 4.98e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 588.5 on 402 degrees of freedom
##   (720 observations deleted due to missingness)
## Multiple R-squared:  0.477,  Adjusted R-squared:  0.4718 
## F-statistic: 91.66 on 4 and 402 DF,  p-value: < 2.2e-16

Interpreting the addition of Cars as an explanatory variable

Looking at the p-values after adding Car spaces to the linear regression model, we see that Car is significant at the 1% level and that R-squared increases from 0.425 to 0.477 (adjusted R-squared rises from 0.4208 to 0.4718). This means that this model does a better job of explaining the variability of the actual data. However, an R-squared of 0.477 is not a very high score, meaning our model does not do a great job of explaining the variability of property prices in general, even if it is slightly better than before.

References