The real estate markets, like those in Sydney and Melbourne, present an interesting opportunity for data analysts to analyze and predict where property prices are moving towards. Prediction of property prices is becoming increasingly important and beneficial. Property prices are a good indicator of both the overall market condition and the economic health of a country. Considering the data provided, we are wrangling a large set of property sales records stored in an unknown format and with unknown data quality issues.
This report provides house price prediction using regression algorithms. The dataset used in this report for modelling house price in Australia. The dataset link: Here
The report is structured as follows:
Data Extraction
Exploratory Data Analysis (EDA)
Data Preprocessing
Modelling
Evaluations
Slides Link
Import Libraries
rm(list=ls())
library(ggplot2)
house_df <- read.csv("house.csv")
str(house_df)
## 'data.frame': 4600 obs. of 18 variables:
## $ date : chr "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
## $ price : num 313000 2384000 342000 420000 550000 ...
## $ bedrooms : num 3 5 3 3 4 2 2 4 3 4 ...
## $ bathrooms : num 1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
## $ sqft_living : int 1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
## $ sqft_lot : int 7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
## $ floors : num 1.5 2 1 1 1 1 1 2 1 1.5 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 4 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 5 4 4 4 3 3 3 4 3 ...
## $ sqft_above : int 1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
## $ sqft_basement: int 0 280 0 1000 800 0 0 0 860 0 ...
## $ yr_built : int 1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
## $ yr_renovated : int 2005 0 0 0 1992 1994 0 0 0 2010 ...
## $ street : chr "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
## $ city : chr "Shoreline" "Seattle" "Kent" "Bellevue" ...
## $ statezip : chr "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
dim(house_df)
## [1] 4600 18
The dataset contains 4600 rows and 18 columns. The target variable is PRICE
## date price bedrooms bathrooms
## Length:4600 Min. : 0 Min. :0.000 Min. :0.000
## Class :character 1st Qu.: 322875 1st Qu.:3.000 1st Qu.:1.750
## Mode :character Median : 460943 Median :3.000 Median :2.250
## Mean : 551963 Mean :3.401 Mean :2.161
## 3rd Qu.: 654962 3rd Qu.:4.000 3rd Qu.:2.500
## Max. :26590000 Max. :9.000 Max. :8.000
## sqft_living sqft_lot floors waterfront
## Min. : 370 Min. : 638 Min. :1.000 Min. :0.000000
## 1st Qu.: 1460 1st Qu.: 5001 1st Qu.:1.000 1st Qu.:0.000000
## Median : 1980 Median : 7683 Median :1.500 Median :0.000000
## Mean : 2139 Mean : 14852 Mean :1.512 Mean :0.007174
## 3rd Qu.: 2620 3rd Qu.: 11001 3rd Qu.:2.000 3rd Qu.:0.000000
## Max. :13540 Max. :1074218 Max. :3.500 Max. :1.000000
## view condition sqft_above sqft_basement
## Min. :0.0000 Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:1190 1st Qu.: 0.0
## Median :0.0000 Median :3.000 Median :1590 Median : 0.0
## Mean :0.2407 Mean :3.452 Mean :1827 Mean : 312.1
## 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.:2300 3rd Qu.: 610.0
## Max. :4.0000 Max. :5.000 Max. :9410 Max. :4820.0
## yr_built yr_renovated street city
## Min. :1900 Min. : 0.0 Length:4600 Length:4600
## 1st Qu.:1951 1st Qu.: 0.0 Class :character Class :character
## Median :1976 Median : 0.0 Mode :character Mode :character
## Mean :1971 Mean : 808.6
## 3rd Qu.:1997 3rd Qu.:1999.0
## Max. :2014 Max. :2014.0
## statezip country
## Length:4600 Length:4600
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Plot distribution of price using Boxplot
ggplot(data=house_df, aes(y=price)) +
geom_boxplot() +
scale_y_continuous(limits = c(0,2000000))
Convert the categorical variables to factor
house_df$bedrooms2 <- factor(house_df$bedrooms)
house_df$city2 <- factor(house_df$city)
house_df$statezip2 <- factor(house_df$statezip)
house_df$street2 <- factor(house_df$street)
house_df$country2 <- factor(house_df$country)
ggplot(data=house_df, aes(x=bedrooms2,
y=price)) +
geom_boxplot()+
scale_y_continuous(limits = c(0,2000000))
house_df_num <- house_df[,2:12]
r <- cor(house_df_num)
library(corrgram)
corrgram(house_df_num, order=TRUE,
upper.panel=panel.pie)
Remove rows with incorrect value Prices
idx <- which(house_df_num$price %in% c(0))
house_df_num <- house_df_num[-idx,]
summary(house_df_num)
## price bedrooms bathrooms sqft_living
## Min. : 7800 Min. :0.000 Min. :0.000 Min. : 370
## 1st Qu.: 326264 1st Qu.:3.000 1st Qu.:1.750 1st Qu.: 1460
## Median : 465000 Median :3.000 Median :2.250 Median : 1970
## Mean : 557906 Mean :3.395 Mean :2.155 Mean : 2132
## 3rd Qu.: 657500 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.: 2610
## Max. :26590000 Max. :9.000 Max. :8.000 Max. :13540
## sqft_lot floors waterfront view
## Min. : 638 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5000 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7680 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 14835 Mean :1.512 Mean :0.006592 Mean :0.2347
## 3rd Qu.: 10978 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1074218 Max. :3.500 Max. :1.000000 Max. :4.0000
## condition sqft_above sqft_basement
## Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:3.000 1st Qu.:1190 1st Qu.: 0.0
## Median :3.000 Median :1590 Median : 0.0
## Mean :3.449 Mean :1822 Mean : 310.2
## 3rd Qu.:4.000 3rd Qu.:2300 3rd Qu.: 600.0
## Max. :5.000 Max. :9410 Max. :4820.0
Remove rows with outlier in Prices
out_price <- boxplot.stats(house_df_num$price)$out
out_idx <- which(house_df_num$price %in% c(out_price))
house_df_num <- house_df_num[-out_idx,]
summary(house_df_num$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7800 320000 450000 487457 615000 1150000
house_df <- house_df[rownames(house_df_num),]
statezip <- house_df$statezip2
statezip_df <- data.frame(statezip)
colnames(statezip_df) <- c("loc.")
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
df1 <- dummyVars("~.", data=statezip_df)
df2 <- data.frame(predict(df1, newdata = statezip_df))
house_df_num <- cbind(house_df_num, df2)
d <- dim(house_df_num)
m <- d[1] #jumlah baris
n <- d[2] #jumlah kolom
set.seed(2022)
train_idx <- sample(m, 0.8*m)
train_data <- house_df_num[train_idx,]
test_data <- house_df_num[-train_idx,]
dim(train_data)
## [1] 3448 88
dim(test_data)
## [1] 863 88
train_data contains 3448 rows and 88 columns
test_data contains 863 rows and 88 columns
mymodel <- lm(formula = price~. +
I(sqft_living^2) +
sqft_living:bedrooms,
data=train_data)
summary(mymodel)
##
## Call:
## lm(formula = price ~ . + I(sqft_living^2) + sqft_living:bedrooms,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -827044 -51604 -627 48278 451427
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.141e+05 7.253e+04 -1.573 0.115710
## bedrooms 1.099e+04 6.834e+03 1.607 0.108084
## bathrooms 1.574e+04 3.923e+03 4.011 6.17e-05 ***
## sqft_living 1.773e+02 1.101e+01 16.104 < 2e-16 ***
## sqft_lot 1.864e-01 4.923e-02 3.786 0.000156 ***
## floors -2.555e+04 4.761e+03 -5.365 8.63e-08 ***
## waterfront 1.528e+05 2.949e+04 5.182 2.32e-07 ***
## view 3.396e+04 2.998e+03 11.328 < 2e-16 ***
## condition 2.420e+04 2.737e+03 8.841 < 2e-16 ***
## sqft_above 7.963e+01 5.794e+00 13.744 < 2e-16 ***
## sqft_basement NA NA NA NA
## loc..WA.98001 -9.925e+04 7.105e+04 -1.397 0.162538
## loc..WA.98002 -9.947e+04 7.213e+04 -1.379 0.167962
## loc..WA.98003 -8.481e+04 7.158e+04 -1.185 0.236197
## loc..WA.98004 4.663e+05 7.184e+04 6.490 9.83e-11 ***
## loc..WA.98005 2.549e+05 7.259e+04 3.511 0.000452 ***
## loc..WA.98006 2.093e+05 7.074e+04 2.959 0.003106 **
## loc..WA.98007 2.308e+05 7.236e+04 3.189 0.001439 **
## loc..WA.98008 1.550e+05 7.195e+04 2.154 0.031304 *
## loc..WA.98010 1.552e+04 8.252e+04 0.188 0.850847
## loc..WA.98011 5.293e+04 7.233e+04 0.732 0.464326
## loc..WA.98014 4.264e+04 7.340e+04 0.581 0.561349
## loc..WA.98019 -1.506e+04 7.171e+04 -0.210 0.833693
## loc..WA.98022 -1.006e+05 7.255e+04 -1.386 0.165822
## loc..WA.98023 -1.102e+05 7.067e+04 -1.560 0.118939
## loc..WA.98024 7.590e+04 7.715e+04 0.984 0.325279
## loc..WA.98027 1.154e+05 7.073e+04 1.631 0.103015
## loc..WA.98028 5.800e+04 7.110e+04 0.816 0.414724
## loc..WA.98029 1.676e+05 7.090e+04 2.363 0.018164 *
## loc..WA.98030 -8.965e+04 7.181e+04 -1.248 0.211971
## loc..WA.98031 -7.969e+04 7.102e+04 -1.122 0.261914
## loc..WA.98032 -1.272e+05 7.426e+04 -1.712 0.086917 .
## loc..WA.98033 2.503e+05 7.082e+04 3.535 0.000413 ***
## loc..WA.98034 1.128e+05 7.055e+04 1.599 0.110001
## loc..WA.98038 -6.468e+04 7.069e+04 -0.915 0.360262
## loc..WA.98039 -3.712e+05 1.208e+05 -3.073 0.002137 **
## loc..WA.98040 3.023e+05 7.158e+04 4.223 2.48e-05 ***
## loc..WA.98042 -8.852e+04 7.055e+04 -1.255 0.209640
## loc..WA.98045 7.072e+03 7.143e+04 0.099 0.921140
## loc..WA.98047 -5.699e+04 8.246e+04 -0.691 0.489556
## loc..WA.98050 3.062e+04 1.212e+05 0.253 0.800614
## loc..WA.98051 4.679e+03 8.279e+04 0.057 0.954933
## loc..WA.98052 2.005e+05 7.035e+04 2.850 0.004393 **
## loc..WA.98053 1.696e+05 7.080e+04 2.396 0.016647 *
## loc..WA.98055 -3.077e+04 7.314e+04 -0.421 0.673979
## loc..WA.98056 2.264e+04 7.077e+04 0.320 0.749024
## loc..WA.98057 -7.399e+04 7.491e+04 -0.988 0.323352
## loc..WA.98058 -4.490e+04 7.060e+04 -0.636 0.524854
## loc..WA.98059 2.822e+04 7.053e+04 0.400 0.689081
## loc..WA.98065 4.594e+04 7.111e+04 0.646 0.518278
## loc..WA.98068 NA NA NA NA
## loc..WA.98070 3.100e+03 7.305e+04 0.042 0.966151
## loc..WA.98072 1.072e+05 7.126e+04 1.505 0.132461
## loc..WA.98074 1.455e+05 7.078e+04 2.056 0.039878 *
## loc..WA.98075 1.898e+05 7.103e+04 2.672 0.007585 **
## loc..WA.98077 1.131e+05 7.154e+04 1.581 0.113866
## loc..WA.98092 -9.293e+04 7.074e+04 -1.314 0.189072
## loc..WA.98102 3.513e+05 7.467e+04 4.705 2.64e-06 ***
## loc..WA.98103 2.464e+05 7.040e+04 3.499 0.000472 ***
## loc..WA.98105 3.270e+05 7.228e+04 4.524 6.28e-06 ***
## loc..WA.98106 3.883e+04 7.110e+04 0.546 0.584952
## loc..WA.98107 2.507e+05 7.120e+04 3.520 0.000436 ***
## loc..WA.98108 5.293e+04 7.182e+04 0.737 0.461161
## loc..WA.98109 4.051e+05 7.462e+04 5.428 6.10e-08 ***
## loc..WA.98112 3.555e+05 7.173e+04 4.956 7.55e-07 ***
## loc..WA.98115 2.190e+05 7.051e+04 3.106 0.001915 **
## loc..WA.98116 2.327e+05 7.115e+04 3.271 0.001083 **
## loc..WA.98117 2.247e+05 7.045e+04 3.190 0.001435 **
## loc..WA.98118 9.206e+04 7.088e+04 1.299 0.194092
## loc..WA.98119 3.626e+05 7.199e+04 5.037 4.99e-07 ***
## loc..WA.98122 2.617e+05 7.101e+04 3.686 0.000232 ***
## loc..WA.98125 9.594e+04 7.093e+04 1.353 0.176260
## loc..WA.98126 1.182e+05 7.081e+04 1.670 0.095066 .
## loc..WA.98133 6.455e+04 7.066e+04 0.913 0.361047
## loc..WA.98136 1.882e+05 7.128e+04 2.641 0.008309 **
## loc..WA.98144 1.869e+05 7.104e+04 2.632 0.008537 **
## loc..WA.98146 4.802e+04 7.121e+04 0.674 0.500173
## loc..WA.98148 -2.976e+04 7.636e+04 -0.390 0.696731
## loc..WA.98155 5.246e+04 7.065e+04 0.742 0.457855
## loc..WA.98166 2.783e+04 7.134e+04 0.390 0.696482
## loc..WA.98168 -3.975e+04 7.106e+04 -0.559 0.575906
## loc..WA.98177 1.350e+05 7.197e+04 1.876 0.060718 .
## loc..WA.98178 -6.123e+04 7.156e+04 -0.856 0.392274
## loc..WA.98188 -8.010e+04 7.315e+04 -1.095 0.273627
## loc..WA.98198 -7.887e+04 7.138e+04 -1.105 0.269212
## loc..WA.98199 2.628e+05 7.130e+04 3.685 0.000232 ***
## loc..WA.98288 -3.630e+04 9.002e+04 -0.403 0.686759
## loc..WA.98354 NA NA NA NA
## I(sqft_living^2) -8.225e-03 2.415e-03 -3.405 0.000669 ***
## bedrooms:sqft_living -1.168e+01 2.878e+00 -4.057 5.08e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 98520 on 3361 degrees of freedom
## Multiple R-squared: 0.7915, Adjusted R-squared: 0.7862
## F-statistic: 148.4 on 86 and 3361 DF, p-value: < 2.2e-16
actual <- test_data$price
pred.mymodel <- predict(mymodel, test_data)
## Warning in predict.lm(mymodel, test_data): prediction from a rank-deficient fit
## may be misleading
price_df <- data.frame(actual, pred.mymodel)
ggplot(data = price_df, aes(x = actual,
y = pred.mymodel)) +
geom_point() +
scale_x_continuous(limits = c(0,1500000)) +
scale_y_continuous(limits = c(0,1500000))
performance <- function(prediction, actual, method) {
error <- prediction - actual
squared_error <- error^2
sum_squared_error <- sum(squared_error)
mean_squared_error <- mean(squared_error)
root_mean_squared_error <- sqrt(mean_squared_error)
result <- paste("Method:", method,
"\n SSE =", round(sum_squared_error,3),
"\n MSE =",round(mean_squared_error,3),
"\n RMSE =" ,round(root_mean_squared_error,3))
cat(result)
}
performance(pred.mymodel, actual, "Polynomial Regression")
## Method: Polynomial Regression
## SSE = 9710277801637.52
## MSE = 11251770337.935
## RMSE = 106074.362