Import libraries

library(dplyr)
## 
## 载入程序包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Necessary code

# Load the Melbourne housing data and compute the mean sale price for the
# three suburbs analysed in the questions below.
#
# The CSV location can be overridden with the MELBOURNE_HOUSING_CSV
# environment variable. When the variable is unset, the original local path is
# used, so an existing setup behaves exactly as before.
data_path <- Sys.getenv(
  "MELBOURNE_HOUSING_CSV",
  unset = "E:/悉尼大学/S2/5003/Melbourne_housing_FULL.csv"
)
melbourne_data <- read.csv(data_path)

# Keep only the rows belonging to the suburbs of interest.
target_suburbs <- c("Brunswick", "Craigieburn", "Hawthorn")
suburbs_data <- subset(melbourne_data, Suburb %in% target_suburbs)

# Mean price per suburb; na.rm = TRUE ignores rows whose Price is NA.
avg_prices <- aggregate(
  suburbs_data$Price,
  by = list(suburbs_data$Suburb),
  FUN = mean,
  na.rm = TRUE
)
colnames(avg_prices) <- c("Suburb", "Average_Price")
print(avg_prices)
##        Suburb Average_Price
## 1   Brunswick      977988.8
## 2 Craigieburn      566173.5
## 3    Hawthorn     1238074.2

1.4 (a)

# Regress sale price on building area plus a suburb effect. Suburb enters as a
# categorical predictor, so the fit has one shared BuildingArea slope and an
# intercept shift for each non-reference suburb.
model_extended <- lm(
  formula = Price ~ BuildingArea + Suburb,
  data = suburbs_data
)
# Show the coefficient table and goodness-of-fit statistics.
print(summary(model_extended))
## 
## Call:
## lm(formula = Price ~ BuildingArea + Suburb, data = suburbs_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4074050  -248461   -26674   166018  5479862 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        509756.7    62451.0   8.163 3.99e-15 ***
## BuildingArea         4435.4      346.9  12.786  < 2e-16 ***
## SuburbCraigieburn -660277.3    72911.5  -9.056  < 2e-16 ***
## SuburbHawthorn     400715.9    72543.5   5.524 5.87e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 609900 on 414 degrees of freedom
##   (因为不存在,709个观察量被删除了)
## Multiple R-squared:  0.425,  Adjusted R-squared:  0.4208 
## F-statistic:   102 on 3 and 414 DF,  p-value: < 2.2e-16
 # Plot the observations together with the fitted lines of model_extended.
 # geom_smooth(method = "lm") would fit a separate regression (with its own
 # slope) for every suburb, which is not the model summarised above:
 # model_extended has a single BuildingArea slope and one intercept shift per
 # suburb, so its three fitted lines are parallel.
 # model.frame() returns exactly the rows lm() used, so no missing values
 # reach ggplot2.
 plot_data <- model.frame(model_extended)
 plot_data$Fitted_Price <- predict(model_extended, newdata = plot_data)

 ggplot(plot_data, aes(x = BuildingArea, y = Price, color = Suburb)) +
     geom_point() +
     geom_line(aes(y = Fitted_Price), linewidth = 1) +  # three regression lines
     labs(title = "House Prices vs. Building Area by Suburb",
          x = "Building Area (m²)",
          y = "Price (AUD)") +
     theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 709 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 709 rows containing missing values or values outside the scale range
## (`geom_point()`).

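Intercept : The predicted house price (about 509,757 AUD) in the reference suburb when the building area is zero. R uses the alphabetically first level, Brunswick, as the reference. Because a building area of zero lies outside the observed data, the intercept is mainly a baseline for the other coefficients rather than a realistic price.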

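BuildingArea : The average change in house price for each additional square metre of building area, holding the suburb fixed. The estimate is about 4,435 AUD per square metre, and the same slope applies to all three suburbs.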

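Suburb : The average difference in house price between the given suburb and the reference suburb (Brunswick) for houses with the same building area. Craigieburn is about 660,277 AUD cheaper and Hawthorn about 400,716 AUD more expensive than Brunswick.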

1.4 (b)

# Extend the previous model with the Car variable as an additional numeric
# predictor, keeping BuildingArea and the suburb effect.
model_with_car <- lm(
  formula = Price ~ BuildingArea + Suburb + Car,
  data = suburbs_data
)
# Show the coefficient table and goodness-of-fit statistics.
print(summary(model_with_car))
## 
## Call:
## lm(formula = Price ~ BuildingArea + Suburb + Car, data = suburbs_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3417281  -273352   -59191   252474  5001704 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        333560.7    67011.4   4.978 9.56e-07 ***
## BuildingArea         3761.7      351.3  10.708  < 2e-16 ***
## SuburbCraigieburn -781107.7    73776.4 -10.588  < 2e-16 ***
## SuburbHawthorn     363436.8    71139.5   5.109 5.02e-07 ***
## Car                220740.5    34616.8   6.377 4.98e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 588500 on 402 degrees of freedom
##   (因为不存在,720个观察量被删除了)
## Multiple R-squared:  0.477,  Adjusted R-squared:  0.4718 
## F-statistic: 91.66 on 4 and 402 DF,  p-value: < 2.2e-16

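The adjusted R-squared rose from 0.4208 to 0.4718 (multiple R-squared from 0.425 to 0.477), the residual standard error fell from 609,900 to 588,500, and the Car coefficient is highly significant (p = 4.98e-10), so adding Car improves the model. The comparison is only approximate, however: because of missing Car values the second model was fitted on 407 observations rather than 418, so the two models do not use exactly the same sample.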