Loading libraries
library(ggplot2)
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Loading data
# Read the Melbourne housing data (file expected in the working directory)
# and display the resulting data frame
df <- read.csv(file = "melb_data.csv")
df
Suburb: Name of the suburb where the property is located. Address:
Address of the property. Rooms: Number of rooms in the property. Price:
Price of the property in Australian dollars (the target variable of this
analysis). Method: Method of selling the property (e.g. S - property
sold, SP - property sold prior, PI - property passed in, VB - vendor bid,
SA - sold after auction, W - withdrawn prior to auction, PN - passed in
at auction, SN - sold not disclosed). Type: Type of property (h - house,
u - unit, t - townhouse). SellerG: Real estate agent handling the
property sale. Date: Date of sale. Distance: Distance of the property
from Melbourne’s central business district (in kilometers). Postcode:
Postal code of the suburb. Bedroom2: Number of bedrooms in the property
(secondary to the main one). Bathroom: Number of bathrooms in the
property. Car: Number of car spaces in the property. Landsize: Land size
of the property in square meters. BuildingArea: Building size of the
property in square meters. YearBuilt: Year the property was built.
CouncilArea: Local government area the property is in. Lattitude:
Latitude of the property location (the column name is spelled this way
in the data). Longtitude: Longitude of the property location (the column
name is spelled this way in the data). Regionname: General region the
property is located in (Eastern, Northern, South-Eastern, Southern or
Western Metropolitan; Eastern, Northern or Western Victoria).
Propertycount: Number of properties in the suburb.
Structure of data
str(df)
## 'data.frame': 13580 obs. of 21 variables:
## $ Suburb : chr "Abbotsford" "Abbotsford" "Abbotsford" "Abbotsford" ...
## $ Address : chr "85 Turner St" "25 Bloomburg St" "5 Charles St" "40 Federation La" ...
## $ Rooms : int 2 2 3 3 4 2 3 2 1 2 ...
## $ Type : chr "h" "h" "h" "h" ...
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Method : chr "S" "S" "SP" "PI" ...
## $ SellerG : chr "Biggin" "Biggin" "Biggin" "Biggin" ...
## $ Date : chr "3/12/2016" "4/02/2016" "4/03/2017" "4/03/2017" ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Postcode : num 3067 3067 3067 3067 3067 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ CouncilArea : chr "Yarra" "Yarra" "Yarra" "Yarra" ...
## $ Lattitude : num -37.8 -37.8 -37.8 -37.8 -37.8 ...
## $ Longtitude : num 145 145 145 145 145 ...
## $ Regionname : chr "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" "Northern Metropolitan" ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
Checking for missing values
# Count the missing values in every column; the result is a one-row data
# frame with one count per original column.
# across() replaces the superseded summarise_all().
df %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
Storing the number of NAs per column
# Keep the per-column NA counts in `x` (one row, one count per column) and
# confirm the class of the result.
# across() replaces the superseded summarise_all().
x <- df %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
class(x)
## [1] "data.frame"
Summary (central tendency of continuous features and count of
categorical features)
summary(df)
## Suburb Address Rooms Type
## Length:13580 Length:13580 Min. : 1.000 Length:13580
## Class :character Class :character 1st Qu.: 2.000 Class :character
## Mode :character Mode :character Median : 3.000 Mode :character
## Mean : 2.938
## 3rd Qu.: 3.000
## Max. :10.000
##
## Price Method SellerG Date
## Min. : 85000 Length:13580 Length:13580 Length:13580
## 1st Qu.: 650000 Class :character Class :character Class :character
## Median : 903000 Mode :character Mode :character Mode :character
## Mean :1075684
## 3rd Qu.:1330000
## Max. :9000000
##
## Distance Postcode Bedroom2 Bathroom
## Min. : 0.00 Min. :3000 Min. : 0.000 Min. :0.000
## 1st Qu.: 6.10 1st Qu.:3044 1st Qu.: 2.000 1st Qu.:1.000
## Median : 9.20 Median :3084 Median : 3.000 Median :1.000
## Mean :10.14 Mean :3105 Mean : 2.915 Mean :1.534
## 3rd Qu.:13.00 3rd Qu.:3148 3rd Qu.: 3.000 3rd Qu.:2.000
## Max. :48.10 Max. :3977 Max. :20.000 Max. :8.000
##
## Car Landsize BuildingArea YearBuilt
## Min. : 0.00 Min. : 0.0 Min. : 0 Min. :1196
## 1st Qu.: 1.00 1st Qu.: 177.0 1st Qu.: 93 1st Qu.:1940
## Median : 2.00 Median : 440.0 Median : 126 Median :1970
## Mean : 1.61 Mean : 558.4 Mean : 152 Mean :1965
## 3rd Qu.: 2.00 3rd Qu.: 651.0 3rd Qu.: 174 3rd Qu.:1999
## Max. :10.00 Max. :433014.0 Max. :44515 Max. :2018
## NA's :62 NA's :6450 NA's :5375
## CouncilArea Lattitude Longtitude Regionname
## Length:13580 Min. :-38.18 Min. :144.4 Length:13580
## Class :character 1st Qu.:-37.86 1st Qu.:144.9 Class :character
## Mode :character Median :-37.80 Median :145.0 Mode :character
## Mean :-37.81 Mean :145.0
## 3rd Qu.:-37.76 3rd Qu.:145.1
## Max. :-37.41 Max. :145.5
##
## Propertycount
## Min. : 249
## 1st Qu.: 4380
## Median : 6555
## Mean : 7454
## 3rd Qu.:10331
## Max. :21650
##
Convert categorical variables to factors
# Convert the categorical (character) columns to factors so that lm() and
# model.matrix() treat them as categorical predictors.
factor_cols <- c(
  "Suburb", "Method", "Type", "SellerG", "CouncilArea", "Regionname"
)
df[factor_cols] <- lapply(df[factor_cols], factor)
# NOTE(review): CouncilArea ends up with an empty-string level (""), which
# presumably marks missing council areas; confirm before modelling with it.
# str() prints the structure itself and returns NULL invisibly, so wrapping it
# in print() only adds a spurious "NULL" line to the output.
str(df)
## 'data.frame': 13580 obs. of 21 variables:
## $ Suburb : Factor w/ 314 levels "Abbotsford","Aberfeldie",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Address : chr "85 Turner St" "25 Bloomburg St" "5 Charles St" "40 Federation La" ...
## $ Rooms : int 2 2 3 3 4 2 3 2 1 2 ...
## $ Type : Factor w/ 3 levels "h","t","u": 1 1 1 1 1 1 1 1 3 1 ...
## $ Price : num 1480000 1035000 1465000 850000 1600000 ...
## $ Method : Factor w/ 5 levels "PI","S","SA",..: 2 2 4 1 5 2 2 2 2 2 ...
## $ SellerG : Factor w/ 268 levels "@Realty","Abercromby's",..: 24 24 24 24 165 114 165 165 24 24 ...
## $ Date : chr "3/12/2016" "4/02/2016" "4/03/2017" "4/03/2017" ...
## $ Distance : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ Postcode : num 3067 3067 3067 3067 3067 ...
## $ Bedroom2 : num 2 2 3 3 3 2 4 2 1 3 ...
## $ Bathroom : num 1 1 2 2 1 1 2 1 1 1 ...
## $ Car : num 1 0 0 1 2 0 0 2 1 2 ...
## $ Landsize : num 202 156 134 94 120 181 245 256 0 220 ...
## $ BuildingArea : num NA 79 150 NA 142 NA 210 107 NA 75 ...
## $ YearBuilt : num NA 1900 1900 NA 2014 ...
## $ CouncilArea : Factor w/ 34 levels "","Banyule","Bayside",..: 33 33 33 33 33 33 33 33 33 33 ...
## $ Lattitude : num -37.8 -37.8 -37.8 -37.8 -37.8 ...
## $ Longtitude : num 145 145 145 145 145 ...
## $ Regionname : Factor w/ 8 levels "Eastern Metropolitan",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Propertycount: num 4019 4019 4019 4019 4019 ...
## NULL
Main features chosen for predicting the target: Price
Location based
1. Distance 2. Region name
House size based
- Rooms
- Type
- Bathroom
- Bedroom2
# Keep the chosen location/size predictors together with the target (Price).
# The original call contained a stray empty argument ("Type,,Rooms").
new_df <- df %>%
  select(Distance, Regionname, Type, Rooms, Bathroom, Bedroom2, Price)
# Choosing only the numeric features from new_df (same columns as selecting
# them from df, but now consistent with the comment and with new_df)
int_df <- new_df %>%
  select(Distance, Rooms, Bathroom, Bedroom2, Price)
int_df
Correlation and significance
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(int_df)

Correlations between variables ( Assumption)
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
# Correlation matrix of the numeric features (cor() with its defaults)
cor_matrix <- int_df %>% cor()
# Long format for plotting: one row per variable pair, with the columns
# X1, X2 and value that the heatmap code expects
melted_matrix <- cor_matrix %>% melt()
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by the
## caller; using TRUE
# Plot the heatmap: one tile per variable pair, coloured by the correlation
# and labelled with its value rounded to two decimals
ggplot(melted_matrix, aes(X1, X2, fill = value)) +
  geom_tile() +
  # `limits` is spelled out in full; the original `limit` only worked through
  # partial argument matching
  scale_fill_gradient2(
    low = "yellow", mid = "orange", high = "red",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Correlation\nCoefficient"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "bottom"
  ) +
  # x and y are inherited from the ggplot() mapping; only the label is added
  geom_text(
    aes(label = round(value, 2)),
    color = "white", size = 4, fontface = "bold", show.legend = FALSE
  ) +
  ggtitle("Correlation Heatmap")

Regression Analysis
# Baseline linear regression of Price on every chosen predictor; Type and
# Regionname enter as factors, so lm() creates their indicator terms itself
model <- lm(
  Price ~ Rooms + Bathroom + Bedroom2 + Distance + Type + Regionname,
  data = new_df
)
summary(model)
##
## Call:
## lm(formula = Price ~ Rooms + Bathroom + Bedroom2 + Distance +
## Type + Regionname, data = new_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1959702 -241020 -47132 169202 7997255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 826530.4 21514.1 38.418 < 2e-16 ***
## Rooms 165779.8 11698.0 14.172 < 2e-16 ***
## Bathroom 203899.7 6655.4 30.637 < 2e-16 ***
## Bedroom2 24383.3 11279.9 2.162 0.03066 *
## Distance -43660.5 830.2 -52.587 < 2e-16 ***
## Typet -347458.0 13553.6 -25.636 < 2e-16 ***
## Typeu -536049.1 10747.2 -49.878 < 2e-16 ***
## RegionnameEastern Victoria 382229.0 60401.0 6.328 2.56e-10 ***
## RegionnameNorthern Metropolitan -236383.1 13610.3 -17.368 < 2e-16 ***
## RegionnameNorthern Victoria 186660.8 67678.4 2.758 0.00582 **
## RegionnameSouth-Eastern Metropolitan 222643.4 23792.4 9.358 < 2e-16 ***
## RegionnameSouthern Metropolitan 262502.4 13114.6 20.016 < 2e-16 ***
## RegionnameWestern Metropolitan -294776.0 13628.7 -21.629 < 2e-16 ***
## RegionnameWestern Victoria -33531.0 75419.0 -0.445 0.65662
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 414800 on 13566 degrees of freedom
## Multiple R-squared: 0.5793, Adjusted R-squared: 0.5789
## F-statistic: 1437 on 13 and 13566 DF, p-value: < 2.2e-16
One hot encoding categorical vars: Type and Regionname
# Build indicator columns for Type and Regionname (intercept column removed)
encoded_df <- model.matrix(~ Type + Regionname - 1, data = new_df)
# Tidy the indicator names: insert "_" after the "Type" / "Regionname" prefix
# and turn spaces and dashes into underscores so the names are valid in
# formulas
dummy_names <- colnames(encoded_df)
name_fixes <- c(
  "Type" = "Type_", "Regionname" = "Regionname_", " " = "_", "-" = "_"
)
for (old in names(name_fixes)) {
  dummy_names <- gsub(old, name_fixes[[old]], dummy_names, fixed = TRUE)
}
colnames(encoded_df) <- dummy_names
# Append the indicator columns to the selected features.
# NOTE(review): the name `t` shadows base::t(); it is kept because later
# chunks refer to it.
t <- cbind(new_df, encoded_df)
t
colnames(t)
## [1] "Distance"
## [2] "Regionname"
## [3] "Type"
## [4] "Rooms"
## [5] "Bathroom"
## [6] "Bedroom2"
## [7] "Price"
## [8] "Type_h"
## [9] "Type_t"
## [10] "Type_u"
## [11] "Regionname_Eastern_Victoria"
## [12] "Regionname_Northern_Metropolitan"
## [13] "Regionname_Northern_Victoria"
## [14] "Regionname_South_Eastern_Metropolitan"
## [15] "Regionname_Southern_Metropolitan"
## [16] "Regionname_Western_Metropolitan"
## [17] "Regionname_Western_Victoria"
Refined model1
# Reduced model: size, distance, property-type indicators and a single
# region indicator
model1 <- lm(
  Price ~ Rooms + Bathroom + Distance + Type_t + Type_u +
    Regionname_Eastern_Victoria,
  data = t
)
summary(model1)
##
## Call:
## lm(formula = Price ~ Rooms + Bathroom + Distance + Type_t + Type_u +
## Regionname_Eastern_Victoria, data = t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2419447 -276017 -68549 188179 8272155
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 622603.4 18165.7 34.273 < 2e-16 ***
## Rooms 185619.8 6412.4 28.947 < 2e-16 ***
## Bathroom 268956.9 7522.5 35.753 < 2e-16 ***
## Distance -38328.4 759.9 -50.439 < 2e-16 ***
## Type_t -324711.8 15559.8 -20.869 < 2e-16 ***
## Type_u -408977.6 12137.1 -33.696 < 2e-16 ***
## Regionname_Eastern_Victoria 294147.3 68039.4 4.323 1.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 476900 on 13573 degrees of freedom
## Multiple R-squared: 0.4437, Adjusted R-squared: 0.4435
## F-statistic: 1804 on 6 and 13573 DF, p-value: < 2.2e-16
Refined model2
# model1 extended with five further region indicators
model2 <- lm(
  Price ~ Rooms + Bathroom + Distance + Type_t + Type_u +
    Regionname_Eastern_Victoria + Regionname_Northern_Metropolitan +
    Regionname_Northern_Victoria + Regionname_South_Eastern_Metropolitan +
    Regionname_Southern_Metropolitan + Regionname_Western_Metropolitan,
  data = t
)
summary(model2)
##
## Call:
## lm(formula = Price ~ Rooms + Bathroom + Distance + Type_t + Type_u +
## Regionname_Eastern_Victoria + Regionname_Northern_Metropolitan +
## Regionname_Northern_Victoria + Regionname_South_Eastern_Metropolitan +
## Regionname_Southern_Metropolitan + Regionname_Western_Metropolitan,
## data = t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1927854 -241571 -46877 169143 7997553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 830007.1 21455.2 38.686 < 2e-16 ***
## Rooms 188053.3 5594.9 33.611 < 2e-16 ***
## Bathroom 205424.5 6619.5 31.033 < 2e-16 ***
## Distance -43645.5 815.6 -53.512 < 2e-16 ***
## Type_t -348280.2 13546.9 -25.709 < 2e-16 ***
## Type_u -537059.3 10738.0 -50.015 < 2e-16 ***
## Regionname_Eastern_Victoria 382699.0 60226.3 6.354 2.16e-10 ***
## Regionname_Northern_Metropolitan -236493.3 13588.1 -17.405 < 2e-16 ***
## Regionname_Northern_Victoria 187363.7 67531.8 2.774 0.00554 **
## Regionname_South_Eastern_Metropolitan 223390.5 23597.1 9.467 < 2e-16 ***
## Regionname_Southern_Metropolitan 262233.2 13085.4 20.040 < 2e-16 ***
## Regionname_Western_Metropolitan -294906.4 13588.1 -21.703 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 414900 on 13568 degrees of freedom
## Multiple R-squared: 0.5792, Adjusted R-squared: 0.5788
## F-statistic: 1698 on 11 and 13568 DF, p-value: < 2.2e-16
Comparing the models
# Sequential F-tests between nested models. The models are listed from the
# smallest to the largest (6, 11 and 13 predictors) so that each row tests the
# terms added relative to the previous model.
anova(model1, model2, model)
Using step method to add parameters
model4.stepwise.0 <- lm(Price~1,data=t)
Adding a new variable ( Which one to choose ?)
# Candidate terms offered to add1(); terms already in the model are skipped
step_scope <- ~ Rooms + Bathroom + Distance + Type_t + Type_u + Type_h +
  Regionname_Eastern_Victoria + Regionname_Northern_Metropolitan +
  Regionname_Northern_Victoria + Regionname_South_Eastern_Metropolitan +
  Regionname_Southern_Metropolitan + Regionname_Western_Metropolitan
# Score each candidate as a single addition to the intercept-only model
result <- add1(model4.stepwise.0, scope = step_scope)
# Name of the row with the smallest AIC
min_row <- which.min(result$AIC)
row.names(result)[min_row]
## [1] "Rooms"
Adding the new variable : Rooms ( as it has least AIC)
model4.stepwise.1 <- lm(Price~Rooms,data=t)
Adding a new variable ( Which one to choose ?)
# Candidate terms offered to add1(); terms already in the model are skipped
step_scope <- ~ Rooms + Bathroom + Distance + Type_t + Type_u + Type_h +
  Regionname_Eastern_Victoria + Regionname_Northern_Metropolitan +
  Regionname_Northern_Victoria + Regionname_South_Eastern_Metropolitan +
  Regionname_Southern_Metropolitan + Regionname_Western_Metropolitan
# Score each remaining candidate as a single addition to Price ~ Rooms
result <- add1(model4.stepwise.1, scope = step_scope)
# Name of the row with the smallest AIC
min_row <- which.min(result$AIC)
row.names(result)[min_row]
## [1] "Regionname_Southern_Metropolitan"
Adding the new variable ( as it has least AIC)
# Second forward step: Rooms plus the Southern Metropolitan indicator
model4.stepwise.2 <- lm(
  Price ~ Rooms + Regionname_Southern_Metropolitan,
  data = t
)
# Candidate terms offered to add1(); terms already in the model are skipped
step_scope <- ~ Rooms + Bathroom + Distance + Type_t + Type_u + Type_h +
  Regionname_Eastern_Victoria + Regionname_Northern_Metropolitan +
  Regionname_Northern_Victoria + Regionname_South_Eastern_Metropolitan +
  Regionname_Southern_Metropolitan + Regionname_Western_Metropolitan
result <- add1(model4.stepwise.2, scope = step_scope)
# Name of the row with the smallest AIC
min_row <- which.min(result$AIC)
row.names(result)[min_row]
## [1] "Distance"
# Third forward step: Distance joins the model
model4.stepwise.3 <- lm(
  Price ~ Rooms + Regionname_Southern_Metropolitan + Distance,
  data = t
)
# Candidate terms offered to add1(); terms already in the model are skipped
step_scope <- ~ Rooms + Bathroom + Distance + Type_t + Type_u + Type_h +
  Regionname_Eastern_Victoria + Regionname_Northern_Metropolitan +
  Regionname_Northern_Victoria + Regionname_South_Eastern_Metropolitan +
  Regionname_Southern_Metropolitan + Regionname_Western_Metropolitan
result <- add1(model4.stepwise.3, scope = step_scope)
result
# Name of the row with the smallest AIC
min_row <- which.min(result$AIC)
row.names(result)[min_row]
## [1] "Type_h"
# Fourth forward step: the house-type indicator joins the model
model4.stepwise.4 <- lm(
  Price ~ Rooms + Regionname_Southern_Metropolitan + Distance + Type_h,
  data = t
)
# Candidate terms offered to add1(); terms already in the model are skipped
step_scope <- ~ Rooms + Bathroom + Distance + Type_t + Type_u + Type_h +
  Regionname_Eastern_Victoria + Regionname_Northern_Metropolitan +
  Regionname_Northern_Victoria + Regionname_South_Eastern_Metropolitan +
  Regionname_Southern_Metropolitan + Regionname_Western_Metropolitan
result <- add1(model4.stepwise.4, scope = step_scope)
result
# Name of the row with the smallest AIC
min_row <- which.min(result$AIC)
row.names(result)[min_row]
## [1] "Bathroom"
# All column names of the encoded data frame (predictors, target and the
# original factor columns alike)
potential_vars <- names(t)
potential_vars
## [1] "Distance"
## [2] "Regionname"
## [3] "Type"
## [4] "Rooms"
## [5] "Bathroom"
## [6] "Bedroom2"
## [7] "Price"
## [8] "Type_h"
## [9] "Type_t"
## [10] "Type_u"
## [11] "Regionname_Eastern_Victoria"
## [12] "Regionname_Northern_Metropolitan"
## [13] "Regionname_Northern_Victoria"
## [14] "Regionname_South_Eastern_Metropolitan"
## [15] "Regionname_Southern_Metropolitan"
## [16] "Regionname_Western_Metropolitan"
## [17] "Regionname_Western_Victoria"
Automating the add step using the loop
# Forward selection by AIC, starting from the intercept-only model.
# At every step add1() scores each candidate not yet in the model; the
# candidate with the smallest AIC is added, and the search stops as soon as
# "<none>" (adding nothing) has the smallest AIC.

# Candidate predictors, in the order they are offered to add1()
candidate_vars <- c(
  "Rooms", "Bathroom", "Distance", "Type_t", "Type_u", "Type_h",
  "Regionname_Eastern_Victoria", "Regionname_Northern_Metropolitan",
  "Regionname_Northern_Victoria", "Regionname_South_Eastern_Metropolitan",
  "Regionname_Southern_Metropolitan", "Regionname_Western_Metropolitan",
  "Regionname_Western_Victoria"
)
scope_formula <- reformulate(candidate_vars)

# Initial model with intercept only
model5 <- lm(Price ~ 1, data = t)
# Variables selected so far, in order of selection
selected_vars <- character(0)

# The bound is tied to the candidate list (not to the columns of `t`), and the
# loop ends cleanly once every candidate is in the model instead of letting
# add1() fail with "no terms in scope".
while (length(selected_vars) < length(candidate_vars)) {
  result <- add1(model5, scope = scope_formula)
  min_row <- which.min(result[, "AIC"])
  best_var <- row.names(result)[min_row]
  if (best_var == "<none>") {
    break
  }
  print(best_var)
  selected_vars <- c(selected_vars, best_var)
  model_formula <- reformulate(selected_vars, response = "Price")
  # do.call() stores the formula itself in the model's call, so the fitted
  # model no longer depends on the global `model_formula`, which later chunks
  # overwrite
  model5 <- do.call("lm", list(formula = model_formula, data = quote(t)))
}
## [1] "Rooms"
## [1] "Regionname_Southern_Metropolitan"
## [1] "Distance"
## [1] "Type_h"
## [1] "Bathroom"
## [1] "Regionname_Western_Metropolitan"
## [1] "Regionname_Northern_Metropolitan"
## [1] "Type_u"
## [1] "Regionname_South_Eastern_Metropolitan"
## [1] "Regionname_Eastern_Victoria"
## [1] "Regionname_Northern_Victoria"
# Final model with selected variables
final_model <- model5
summary(final_model)
##
## Call:
## lm(formula = model_formula, data = t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1927854 -241571 -46877 169143 7997553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 481726.9 23554.3 20.452 < 2e-16 ***
## Rooms 188053.3 5594.9 33.611 < 2e-16 ***
## Regionname_Southern_Metropolitan 262233.2 13085.4 20.040 < 2e-16 ***
## Distance -43645.5 815.6 -53.512 < 2e-16 ***
## Type_h 348280.2 13546.9 25.709 < 2e-16 ***
## Bathroom 205424.5 6619.5 31.033 < 2e-16 ***
## Regionname_Western_Metropolitan -294906.4 13588.1 -21.703 < 2e-16 ***
## Regionname_Northern_Metropolitan -236493.3 13588.1 -17.405 < 2e-16 ***
## Type_u -188779.1 15232.2 -12.393 < 2e-16 ***
## Regionname_South_Eastern_Metropolitan 223390.5 23597.1 9.467 < 2e-16 ***
## Regionname_Eastern_Victoria 382699.0 60226.3 6.354 2.16e-10 ***
## Regionname_Northern_Victoria 187363.7 67531.8 2.774 0.00554 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 414900 on 13568 degrees of freedom
## Multiple R-squared: 0.5792, Adjusted R-squared: 0.5788
## F-statistic: 1698 on 11 and 13568 DF, p-value: < 2.2e-16
Testing assumptions of models
Assumptions:
1. All predictor variables must be quantitative or categorical, and the
outcome must be quantitative, continuous, and unbounded.
2. Non-zero variance.
3. No perfect multicollinearity (predictor variables should not correlate
highly).
4. Predictors are uncorrelated with external variables.
5. Residuals are homoscedastic (constant variance), independent (test
with Durbin-Watson), and normally distributed.
6. Linearity (outcome variable means lie on a straight line).
# Model whose assumptions are checked in the diagnostics that follow
model_name <- model5
Testing multicollinearity
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 1.0.1
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks reshape::expand()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ reshape::rename() masks dplyr::rename()
## ✖ purrr::set_names() masks magrittr::set_names()
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
print("VIFS")
## [1] "VIFS"
vif(model_name)
## Rooms Regionname_Southern_Metropolitan
## 2.255679 3.055643
## Distance Type_h
## 1.807471 3.064501
## Bathroom Regionname_Western_Metropolitan
## 1.653894 2.475693
## Regionname_Northern_Metropolitan Type_u
## 2.977313 3.163185
## Regionname_South_Eastern_Metropolitan Regionname_Eastern_Victoria
## 1.407441 1.112464
## Regionname_Northern_Victoria
## 1.082987
print("Tolerance")
## [1] "Tolerance"
1/vif(model_name)
## Rooms Regionname_Southern_Metropolitan
## 0.4433254 0.3272634
## Distance Type_h
## 0.5532592 0.3263174
## Bathroom Regionname_Western_Metropolitan
## 0.6046337 0.4039272
## Regionname_Northern_Metropolitan Type_u
## 0.3358734 0.3161371
## Regionname_South_Eastern_Metropolitan Regionname_Eastern_Victoria
## 0.7105096 0.8989058
## Regionname_Northern_Victoria
## 0.9233720
print("Mean VIF")
## [1] "Mean VIF"
mean(vif(model_name))
## [1] 2.186934
If the VIFs are all lower than 10, there is probably no cause for
concern. If the average VIF is close to 1, then there’s no cause for
concern either.
The tolerance (1/VIF) is compared to 0.1 and lower (which indicates a
serious problem) and 0.2 and lower (which indicates a potential problem).
These thresholds tend to be reached only when the correlation between
predictors is very high (0.8 or 0.9, for example).
Testing residuals
Independence of residuals
durbinWatsonTest(model_name)
## lag Autocorrelation D-W Statistic p-value
## 1 0.1766647 1.646595 0
## Alternative hypothesis: rho != 0
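If the p-value is less than 0.05, then the autocorrelation is
significant and we reject the null hypothesis that rho == 0. Here the
p-value is reported as 0, so the residuals are significantly
autocorrelated (lag-1 autocorrelation of about 0.18).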
Normality, Homoscedasticity and Linearity
plot(model_name)



### Outliers
# Attach fitted values and residual diagnostics to a copy of the model data
x <- t
# The accessor functions replace `model_name$fitted`, which only worked
# through partial matching of the `fitted.values` component
x$fitted_vals <- fitted(model_name)
x$residual_vals <- residuals(model_name)
x$std_residual_vals <- rstandard(model_name)
x
# Rows whose standardized residual lies outside +/- 1.96
possible.outliers <- subset(x, abs(std_residual_vals) > 1.96)
possible.outliers
# Percentage of observations flagged as possible outliers
nrow(possible.outliers) / nrow(x) * 100
## [1] 3.895434
Influential Points
# Cook's distance of every observation in the chosen model
cooks_dist <- cooks.distance(model_name)
# Plot the distances from largest to smallest; explicit labels replace the
# default y label, which would be the raw sort() expression
plot(
  sort(cooks_dist, decreasing = TRUE),
  xlab = "Observation rank (largest first)",
  ylab = "Cook's distance",
  main = "Cook's distance, sorted in decreasing order"
)

max(cooks_dist)
## [1] 0.07495043
Comparison of all the previous models
# Sequential comparison of the reduced model, the extended model and the
# forward-selected model.
# NOTE(review): the summaries of model2 and model5 report the same residual
# standard error, degrees of freedom and R-squared, so the model2-vs-model5 row
# presumably compares two equivalent fits and carries no information; confirm.
anova(model1,model2,model5)
The Rsquared values of the 3 models
summary(model1)
##
## Call:
## lm(formula = Price ~ Rooms + Bathroom + Distance + Type_t + Type_u +
## Regionname_Eastern_Victoria, data = t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2419447 -276017 -68549 188179 8272155
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 622603.4 18165.7 34.273 < 2e-16 ***
## Rooms 185619.8 6412.4 28.947 < 2e-16 ***
## Bathroom 268956.9 7522.5 35.753 < 2e-16 ***
## Distance -38328.4 759.9 -50.439 < 2e-16 ***
## Type_t -324711.8 15559.8 -20.869 < 2e-16 ***
## Type_u -408977.6 12137.1 -33.696 < 2e-16 ***
## Regionname_Eastern_Victoria 294147.3 68039.4 4.323 1.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 476900 on 13573 degrees of freedom
## Multiple R-squared: 0.4437, Adjusted R-squared: 0.4435
## F-statistic: 1804 on 6 and 13573 DF, p-value: < 2.2e-16
summary(model2)
##
## Call:
## lm(formula = Price ~ Rooms + Bathroom + Distance + Type_t + Type_u +
## Regionname_Eastern_Victoria + Regionname_Northern_Metropolitan +
## Regionname_Northern_Victoria + Regionname_South_Eastern_Metropolitan +
## Regionname_Southern_Metropolitan + Regionname_Western_Metropolitan,
## data = t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1927854 -241571 -46877 169143 7997553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 830007.1 21455.2 38.686 < 2e-16 ***
## Rooms 188053.3 5594.9 33.611 < 2e-16 ***
## Bathroom 205424.5 6619.5 31.033 < 2e-16 ***
## Distance -43645.5 815.6 -53.512 < 2e-16 ***
## Type_t -348280.2 13546.9 -25.709 < 2e-16 ***
## Type_u -537059.3 10738.0 -50.015 < 2e-16 ***
## Regionname_Eastern_Victoria 382699.0 60226.3 6.354 2.16e-10 ***
## Regionname_Northern_Metropolitan -236493.3 13588.1 -17.405 < 2e-16 ***
## Regionname_Northern_Victoria 187363.7 67531.8 2.774 0.00554 **
## Regionname_South_Eastern_Metropolitan 223390.5 23597.1 9.467 < 2e-16 ***
## Regionname_Southern_Metropolitan 262233.2 13085.4 20.040 < 2e-16 ***
## Regionname_Western_Metropolitan -294906.4 13588.1 -21.703 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 414900 on 13568 degrees of freedom
## Multiple R-squared: 0.5792, Adjusted R-squared: 0.5788
## F-statistic: 1698 on 11 and 13568 DF, p-value: < 2.2e-16
summary(model5)
##
## Call:
## lm(formula = model_formula, data = t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1927854 -241571 -46877 169143 7997553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 481726.9 23554.3 20.452 < 2e-16 ***
## Rooms 188053.3 5594.9 33.611 < 2e-16 ***
## Regionname_Southern_Metropolitan 262233.2 13085.4 20.040 < 2e-16 ***
## Distance -43645.5 815.6 -53.512 < 2e-16 ***
## Type_h 348280.2 13546.9 25.709 < 2e-16 ***
## Bathroom 205424.5 6619.5 31.033 < 2e-16 ***
## Regionname_Western_Metropolitan -294906.4 13588.1 -21.703 < 2e-16 ***
## Regionname_Northern_Metropolitan -236493.3 13588.1 -17.405 < 2e-16 ***
## Type_u -188779.1 15232.2 -12.393 < 2e-16 ***
## Regionname_South_Eastern_Metropolitan 223390.5 23597.1 9.467 < 2e-16 ***
## Regionname_Eastern_Victoria 382699.0 60226.3 6.354 2.16e-10 ***
## Regionname_Northern_Victoria 187363.7 67531.8 2.774 0.00554 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 414900 on 13568 degrees of freedom
## Multiple R-squared: 0.5792, Adjusted R-squared: 0.5788
## F-statistic: 1698 on 11 and 13568 DF, p-value: < 2.2e-16
Adding YearBuilt to the data frame and removing rows with missing values
# Same feature set as before, now including YearBuilt
new_df_1 <- df %>%
  select(Distance, Regionname, Type, Rooms, Bathroom, Bedroom2, YearBuilt, Price)
# Indicator columns for Type and Regionname (intercept column removed)
encoded_df <- model.matrix(~ Type + Regionname - 1, data = new_df_1)
# Tidy the indicator names: insert "_" after the "Type" / "Regionname" prefix
# and turn spaces and dashes into underscores
dummy_names <- colnames(encoded_df)
name_fixes <- c(
  "Type" = "Type_", "Regionname" = "Regionname_", " " = "_", "-" = "_"
)
for (old in names(name_fixes)) {
  dummy_names <- gsub(old, name_fixes[[old]], dummy_names, fixed = TRUE)
}
colnames(encoded_df) <- dummy_names
# Append the indicator columns to the selected features
u <- cbind(new_df_1, encoded_df)
u
Removing NA values
# Keep only the rows that have no missing value in any column
u <- drop_na(u)
u
Automating the add step using the loop
# Forward selection by AIC on the YearBuilt data, starting from the
# intercept-only model. At every step add1() scores each candidate not yet in
# the model; the candidate with the smallest AIC is added, and the search
# stops as soon as "<none>" (adding nothing) has the smallest AIC.

# Candidate predictors, in the order they are offered to add1()
candidate_vars <- c(
  "Rooms", "Bedroom2", "Bathroom", "Distance",
  "Type_t", "Type_u", "Type_h", "YearBuilt",
  "Regionname_Eastern_Victoria", "Regionname_Northern_Metropolitan",
  "Regionname_Northern_Victoria", "Regionname_South_Eastern_Metropolitan",
  "Regionname_Southern_Metropolitan", "Regionname_Western_Metropolitan",
  "Regionname_Western_Victoria"
)
scope_formula <- reformulate(candidate_vars)

# Initial model with intercept only
model6 <- lm(Price ~ 1, data = u)
# Variables selected so far, in order of selection
selected_vars <- character(0)

# The bound is tied to this candidate list (the original reused the column
# names of `t`), and the loop ends cleanly once every candidate is in the
# model instead of letting add1() fail with "no terms in scope".
while (length(selected_vars) < length(candidate_vars)) {
  result <- add1(model6, scope = scope_formula)
  min_row <- which.min(result[, "AIC"])
  best_var <- row.names(result)[min_row]
  if (best_var == "<none>") {
    break
  }
  print(best_var)
  selected_vars <- c(selected_vars, best_var)
  model_formula <- reformulate(selected_vars, response = "Price")
  # do.call() stores the formula itself in the model's call, so the fitted
  # model does not depend on the shared global `model_formula`
  model6 <- do.call("lm", list(formula = model_formula, data = quote(u)))
}
## [1] "Rooms"
## [1] "Regionname_Southern_Metropolitan"
## [1] "Distance"
## [1] "Type_h"
## [1] "Bathroom"
## [1] "YearBuilt"
## [1] "Regionname_Western_Metropolitan"
## [1] "Regionname_Northern_Metropolitan"
## [1] "Type_t"
## [1] "Regionname_South_Eastern_Metropolitan"
## [1] "Regionname_Eastern_Victoria"
## [1] "Regionname_Northern_Victoria"
# Final model with selected variables
final_model <- model6
summary(final_model)
##
## Call:
## lm(formula = model_formula, data = u)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2288378 -229250 -45040 161012 8003217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5965193.0 299875.4 19.892 < 2e-16 ***
## Rooms 194727.0 7288.5 26.717 < 2e-16 ***
## Regionname_Southern_Metropolitan 232201.9 17257.9 13.455 < 2e-16 ***
## Distance -37840.0 1112.5 -34.014 < 2e-16 ***
## Type_h 393315.9 14999.9 26.221 < 2e-16 ***
## Bathroom 262964.6 8802.3 29.875 < 2e-16 ***
## YearBuilt -2926.1 153.3 -19.084 < 2e-16 ***
## Regionname_Western_Metropolitan -279181.2 17970.6 -15.535 < 2e-16 ***
## Regionname_Northern_Metropolitan -218171.9 17900.0 -12.188 < 2e-16 ***
## Type_t 174530.6 18632.3 9.367 < 2e-16 ***
## Regionname_South_Eastern_Metropolitan 237749.1 32131.1 7.399 1.50e-13 ***
## Regionname_Eastern_Victoria 318544.8 79043.9 4.030 5.63e-05 ***
## Regionname_Northern_Victoria 140937.4 83645.2 1.685 0.092 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 415200 on 8192 degrees of freedom
## Multiple R-squared: 0.6121, Adjusted R-squared: 0.6116
## F-statistic: 1077 on 12 and 8192 DF, p-value: < 2.2e-16
Simple models
# Simple model on the complete-case data: size and age of the property only
model7 <- lm(
  Price ~ Rooms + Bathroom + YearBuilt,
  data = u
)
summary(model7)
##
## Call:
## lm(formula = Price ~ Rooms + Bathroom + YearBuilt, data = u)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4710183 -264286 -68636 179398 8108223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12863495.7 297140.4 43.29 <2e-16 ***
## Rooms 175530.9 7192.3 24.41 <2e-16 ***
## Bathroom 376133.7 10095.6 37.26 <2e-16 ***
## YearBuilt -6568.6 150.8 -43.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 492400 on 8201 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.4538
## F-statistic: 2273 on 3 and 8201 DF, p-value: < 2.2e-16
# model7 extended with the Regionname and Type factors (lm() builds their
# indicator terms itself)
model8 <- lm(
  Price ~ Rooms + Bathroom + YearBuilt + Regionname + Type,
  data = u
)
summary(model8)
##
## Call:
## lm(formula = Price ~ Rooms + Bathroom + YearBuilt + Regionname +
## Type, data = u)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3321654 -255320 -44088 178911 8217746
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9394961.7 298256.5 31.500 < 2e-16 ***
## Rooms 149317.2 7631.9 19.565 < 2e-16 ***
## Bathroom 306043.0 9277.0 32.990 < 2e-16 ***
## YearBuilt -4705.1 153.2 -30.705 < 2e-16 ***
## RegionnameEastern Victoria -438907.6 81012.3 -5.418 6.21e-08 ***
## RegionnameNorthern Metropolitan -34856.7 18188.8 -1.916 0.0554 .
## RegionnameNorthern Victoria -550206.5 86633.2 -6.351 2.25e-10 ***
## RegionnameSouth-Eastern Metropolitan -144702.0 32372.1 -4.470 7.93e-06 ***
## RegionnameSouthern Metropolitan 372051.9 17887.4 20.800 < 2e-16 ***
## RegionnameWestern Metropolitan -143077.2 18712.5 -7.646 2.31e-14 ***
## RegionnameWestern Victoria -615136.9 93681.7 -6.566 5.47e-11 ***
## Typet -144643.9 18956.5 -7.630 2.60e-14 ***
## Typeu -294371.1 15665.9 -18.791 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 442400 on 8192 degrees of freedom
## Multiple R-squared: 0.5597, Adjusted R-squared: 0.559
## F-statistic: 867.7 on 12 and 8192 DF, p-value: < 2.2e-16