This report presents house price prediction using regression algorithms. The dataset used for modeling is house sales data from Washington State, USA.
The dataset Link: Here
Import Libraries
Read house datasets
getwd()
## [1] "D:/Document/R/Final Project"
# Load the raw house data. The file is addressed through an explicit path
# instead of calling setwd(), so the session's working directory is left
# unchanged.
data_dir <- "D:/Document/R"
house_df <- read.csv(file.path(data_dir, "house.csv"))
# Inspect the column names and types of the loaded data.
str(house_df)
## 'data.frame': 4600 obs. of 18 variables:
## $ date : chr "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
## $ price : num 313000 2384000 342000 420000 550000 ...
## $ bedrooms : num 3 5 3 3 4 2 2 4 3 4 ...
## $ bathrooms : num 1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
## $ sqft_living : int 1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
## $ sqft_lot : int 7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
## $ floors : num 1.5 2 1 1 1 1 1 2 1 1.5 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 4 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 5 4 4 4 3 3 3 4 3 ...
## $ sqft_above : int 1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
## $ sqft_basement: int 0 280 0 1000 800 0 0 0 860 0 ...
## $ yr_built : int 1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
## $ yr_renovated : int 2005 0 0 0 1992 1994 0 0 0 2010 ...
## $ street : chr "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
## $ city : chr "Shoreline" "Seattle" "Kent" "Bellevue" ...
## $ statezip : chr "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
This dataset contains 4600 rows and 18 columns. The target variable is price.
# Boxplot of all sale prices, with the y axis limited to 0-2,000,000.
ggplot(data = house_df, mapping = aes(y = price)) +
  geom_boxplot() +
  ylim(0, 2000000)
## Warning: Removed 47 rows containing non-finite values (stat_boxplot).
# Add factor copies of the bedroom count and the location columns; each copy
# is stored under the original column name with a "2" suffix.
house_df[c("bedrooms2", "city2", "statezip2", "street2", "country2")] <-
  lapply(
    house_df[c("bedrooms", "city", "statezip", "street", "country")],
    factor
  )
# Price distribution per bedroom count, with the y axis limited to
# 0-2,000,000.
ggplot(house_df, aes(bedrooms2, price)) +
  geom_boxplot() +
  ylim(0, 2000000)
Remove rows with incorrect prices
# Keep columns 2 to 12 of the raw data (price through sqft_basement).
house_df_num <- house_df[, 2:12]
# Positions of rows whose price is 0.
idx <- which(house_df_num$price %in% c(0))
# Only drop rows when something matched: indexing with -integer(0) selects
# nothing, so an unguarded house_df_num[-idx, ] would remove every row.
if (length(idx) > 0) {
  house_df_num <- house_df_num[-idx, ]
}
summary(house_df_num$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7800 326264 465000 557906 657500 26590000
Remove rows with outliers
# Price values that boxplot.stats() reports as outliers.
out_price <- boxplot.stats(house_df_num$price)$out
# Positions of the rows carrying one of those outlier prices.
out_idx <- which(house_df_num$price %in% c(out_price))
# Only drop rows when outliers exist: indexing with -integer(0) selects
# nothing, so an unguarded house_df_num[-out_idx, ] would remove every row.
if (length(out_idx) > 0) {
  house_df_num <- house_df_num[-out_idx, ]
}
summary(house_df_num)
## price bedrooms bathrooms sqft_living
## Min. : 7800 Min. :0.000 Min. :0.000 Min. : 370
## 1st Qu.: 320000 1st Qu.:3.000 1st Qu.:1.750 1st Qu.:1430
## Median : 450000 Median :3.000 Median :2.250 Median :1920
## Mean : 487457 Mean :3.352 Mean :2.094 Mean :2031
## 3rd Qu.: 615000 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.:2510
## Max. :1150000 Max. :9.000 Max. :5.750 Max. :7320
## sqft_lot floors waterfront view
## Min. : 638 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5000 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7566 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 14599 Mean :1.495 Mean :0.003711 Mean :0.1737
## 3rd Qu.: 10696 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1074218 Max. :3.500 Max. :1.000000 Max. :4.0000
## condition sqft_above sqft_basement
## Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:3.000 1st Qu.:1170 1st Qu.: 0.0
## Median :3.000 Median :1540 Median : 0.0
## Mean :3.444 Mean :1747 Mean : 283.7
## 3rd Qu.:4.000 3rd Qu.:2190 3rd Qu.: 560.0
## Max. :5.000 Max. :7320 Max. :2300.0
After preprocessing, the data has 4311 rows.
One-hot encoding for location features
# Restrict the full data frame to the rows still present in house_df_num,
# matching them by row name.
house_df <- house_df[rownames(house_df_num),]
### 1. dataframe statezip
# Put the statezip values into a single-column data frame named "loc.".
statezip <- house_df$statezip
statezip_df <- data.frame(statezip)
colnames(statezip_df) <- c("loc.")
### 2. One HOT encoding statezip dataframe
library(caret)
## Loading required package: lattice
# Build the caret dummyVars encoder on the statezip data frame, then apply it
# to the same data to obtain the encoded columns.
df1 <- dummyVars("~.", data = statezip_df)
df2 <- data.frame(predict(df1, newdata = statezip_df))
# Append the encoded location columns to the numeric features.
house_df_num <- cbind(house_df_num, df2)
### 3. Training and testing split
# m is the number of rows in the modelling data.
d <- dim(house_df_num)
m <- d[1]
# Fix the RNG seed so the sampled split is reproducible.
set.seed(2022)
# Draw a sample of size 0.85 * m from the row positions 1..m for training.
train_idx <- sample(m, 0.85 * m)
train_idx[1:5]
## [1] 1459 2871 3915 708 2751
# Sampled rows form the training set; all remaining rows form the test set.
train_data <- house_df_num[train_idx,]
test_data <- house_df_num[-train_idx,]
Create Regression Models
# Fit a linear model of price on every other column of the training data.
mymodel <- lm(formula = price~., data= train_data)
# Observed test prices and the model's predictions for the same rows.
actual <- test_data$price
pred.mymodel <- predict(mymodel, test_data)
price_df <- data.frame(actual, pred.mymodel)
# Scatter plot of predicted against actual price, with both axes limited to
# 0-2,000,000.
ggplot(price_df, aes(x = actual,
y = pred.mymodel)) +
geom_point() +
scale_x_continuous(limits = c(0, 2000000)) +
scale_y_continuous(limits = c(0, 2000000))
### Function Performance
# Print and return error metrics for a set of predictions.
#
# Args:
#   pred:   numeric vector of predicted values.
#   actual: numeric vector of observed values, same length as `pred`.
#   method: label printed on the "Method" line.
#
# Prints the method label, the root mean squared error (RMSE) and the
# correlation between `pred` and `actual`, both rounded to 3 decimals.
# Invisibly returns the unrounded metrics as list(rmse = , r = ).
performance <- function(pred, actual, method){
  error <- pred - actual
  rmse <- sqrt(mean(error^2))
  r <- cor(pred, actual)
  # Report the RMSE itself: the value printed under the "RMSE" label used to
  # be the mean squared error, i.e. the square of the intended metric.
  result <- paste("Method : ", method,
                  "\n RMSE : ", round(rmse, 3),
                  "\n R : ", round(r, 3))
  cat(result)
  invisible(list(rmse = rmse, r = r))
}
# Report the test-set metrics for the fitted model.
# NOTE(review): mymodel is fit with lm(price ~ .), so the "Polynomial
# Regression" label passed here looks inaccurate - confirm the intended name.
performance(pred.mymodel, actual, "Polynomial Regression")
## Method : Polynomial Regression
## RMSE : 11139923787.583
## R : 0.886