library(dplyr)
##
## 载入程序包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the full Melbourne housing data and keep only the three suburbs that
# the assessable questions focus on.
melbourne_data <- read.csv("E:/悉尼大学/S2/5003/Melbourne_housing_FULL.csv")
suburbs_data <- melbourne_data[
  melbourne_data$Suburb %in% c("Brunswick", "Craigieburn", "Hawthorn"),
]

# Mean sale price per suburb. Listings with a missing Price are ignored
# (na.rm = TRUE) rather than turning the whole group mean into NA.
avg_prices <- setNames(
  aggregate(
    suburbs_data$Price,
    by = list(suburbs_data$Suburb),
    FUN = mean,
    na.rm = TRUE
  ),
  c("Suburb", "Average_Price")
)
print(avg_prices)
## Suburb Average_Price
## 1 Brunswick 977988.8
## 2 Craigieburn 566173.5
## 3 Hawthorn 1238074.2
# Linear model of price on building area plus a suburb effect. The formula
# has no interaction term, so every suburb shares one BuildingArea slope and
# differs only in its intercept.
model_extended <- lm(
  formula = Price ~ BuildingArea + Suburb,
  data = suburbs_data
)
print(summary(model_extended))
##
## Call:
## lm(formula = Price ~ BuildingArea + Suburb, data = suburbs_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4074050 -248461 -26674 166018 5479862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 509756.7 62451.0 8.163 3.99e-15 ***
## BuildingArea 4435.4 346.9 12.786 < 2e-16 ***
## SuburbCraigieburn -660277.3 72911.5 -9.056 < 2e-16 ***
## SuburbHawthorn 400715.9 72543.5 5.524 5.87e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 609900 on 414 degrees of freedom
## (因为不存在,709个观察量被删除了)
## Multiple R-squared: 0.425, Adjusted R-squared: 0.4208
## F-statistic: 102 on 3 and 414 DF, p-value: < 2.2e-16
# Scatter plot of price against building area, coloured by suburb.
ggplot(
  data = suburbs_data,
  mapping = aes(x = BuildingArea, y = Price, colour = Suburb)
) +
  geom_point() +
  # Draw the three regression lines (one straight-line fit per suburb,
  # without confidence bands).
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("House Prices vs. Building Area by Suburb") +
  xlab("Building Area (m²)") +
  ylab("Price (AUD)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 709 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 709 rows containing missing values or values outside the scale range
## (`geom_point()`).
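Intercept : Represents the predicted house price in the benchmark
suburb when the building area is zero. R chooses the benchmark
automatically; here it is Brunswick, the only suburb without its own
coefficient in the output above. A building area of zero lies outside the
observed data, so the intercept (about AUD 509,757) is best read as a
baseline level rather than as a realistic price.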
BuildingArea : Average change in house price per additional square metre of building area, holding the suburb fixed (about AUD 4,435 per m²).
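Suburb : Average difference in house price between the corresponding suburb and the benchmark suburb (Brunswick) for houses with the same building area: about AUD 660,277 lower in Craigieburn and about AUD 400,716 higher in Hawthorn.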
# Extend the previous model with the number of car spaces (Car) as an extra
# predictor. lm() drops every row with a missing value in any model variable,
# so rows lacking Car are excluded here in addition to those lacking Price
# or BuildingArea.
model_with_car <- lm(
  formula = Price ~ BuildingArea + Suburb + Car,
  data = suburbs_data
)
print(summary(model_with_car))
##
## Call:
## lm(formula = Price ~ BuildingArea + Suburb + Car, data = suburbs_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3417281 -273352 -59191 252474 5001704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 333560.7 67011.4 4.978 9.56e-07 ***
## BuildingArea 3761.7 351.3 10.708 < 2e-16 ***
## SuburbCraigieburn -781107.7 73776.4 -10.588 < 2e-16 ***
## SuburbHawthorn 363436.8 71139.5 5.109 5.02e-07 ***
## Car 220740.5 34616.8 6.377 4.98e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 588500 on 402 degrees of freedom
## (因为不存在,720个观察量被删除了)
## Multiple R-squared: 0.477, Adjusted R-squared: 0.4718
## F-statistic: 91.66 on 4 and 402 DF, p-value: < 2.2e-16
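The adjusted R-squared went from 0.4208 to 0.4718 (multiple R-squared from 0.425 to 0.477), the residual standard error fell from 609,900 to 588,500, and the Car coefficient is highly significant (p = 4.98e-10), so I think the prediction model has improved. Note that the second model was fitted on 11 fewer observations (720 rather than 709 rows were dropped for missing values), so the two fits are not based on exactly the same sample.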