# Description

This project predicts house prices using regression. The dataset used for modeling is available [here](https://www.kaggle.com/shree1992/housedata).

The project is structured as follows :

  1. Data Understanding

  2. Data Visualizations / EDA

  3. Data Preprocessing

  4. Modeling

  5. Performance Evaluations

1. Data Understanding

# Load the plotting library and read the raw housing data.
# The workspace is deliberately not cleared here: wiping every object in the
# caller's session is a side effect a script should not have.
library(ggplot2)
house_df <- read.csv("house.csv")

# Inspect the column types and a preview of the values
str(house_df)
## 'data.frame':    4600 obs. of  18 variables:
##  $ date         : chr  "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
##  $ price        : num  313000 2384000 342000 420000 550000 ...
##  $ bedrooms     : num  3 5 3 3 4 2 2 4 3 4 ...
##  $ bathrooms    : num  1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
##  $ sqft_living  : int  1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
##  $ sqft_lot     : int  7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
##  $ floors       : num  1.5 2 1 1 1 1 1 2 1 1.5 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 4 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 5 4 4 4 3 3 3 4 3 ...
##  $ sqft_above   : int  1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
##  $ sqft_basement: int  0 280 0 1000 800 0 0 0 860 0 ...
##  $ yr_built     : int  1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
##  $ yr_renovated : int  2005 0 0 0 1992 1994 0 0 0 2010 ...
##  $ street       : chr  "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
##  $ city         : chr  "Shoreline" "Seattle" "Kent" "Bellevue" ...
##  $ statezip     : chr  "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
##  $ country      : chr  "USA" "USA" "USA" "USA" ...

The dataset contains 4600 rows and 18 columns. The target variable is `price`.

2. Data Visualizations / EDA

2.1 Univariate Data Analysis

Plot distribution of Price with Boxplot

# Boxplot of the sale price, with the y axis limited to 0 - 2,000,000
ggplot(house_df, aes(y = price)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 2e6))

2.2 Bivariate Data Analysis

# Add factor copies (suffix "2") of the bedroom count and location columns
house_df[c("bedrooms2", "city2", "statezip2", "street2", "country2")] <-
  lapply(
    house_df[c("bedrooms", "city", "statezip", "street", "country")],
    factor
  )

# Price distribution per bedroom count, y axis limited to 0 - 2,000,000
ggplot(house_df, aes(x = bedrooms2, y = price)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 2e6))

2.3 Multivariate Data Analysis

# Keep columns 2 to 12 (price through sqft_basement), which are all numeric
house_df_num <- house_df[2:12]

# Pairwise correlation matrix of the numeric columns
corre <- cor(house_df_num)

library(corrgram)

# Correlogram with reordered variables and pie panels above the diagonal
corrgram(house_df_num,
         upper.panel = panel.pie,
         order = TRUE)

3. Data Preprocessing

3.1 Data Cleansing

Remove rows with invalid prices (price equal to 0)

# Collect the row positions where the price is 0
idx <- which(house_df_num$price %in% 0)

# Drop those rows only when at least one was found: with an empty index,
# `house_df_num[-idx, ]` selects zero rows and would wipe the whole data frame
if (length(idx) > 0) {
  house_df_num <- house_df_num[-idx, ]
}
summary(house_df_num$price)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     7800   326264   465000   557906   657500 26590000

Remove rows with outlier prices

# Prices reported as outliers by boxplot.stats()
out_price <- boxplot.stats(house_df_num$price)$out

# Row positions whose price is one of those outlier values
out_idx <- which(house_df_num$price %in% out_price)

# Guard against an empty index: `house_df_num[-out_idx, ]` with no matches
# would select zero rows instead of leaving the data untouched
if (length(out_idx) > 0) {
  house_df_num <- house_df_num[-out_idx, ]
}
summary(house_df_num$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7800  320000  450000  487457  615000 1150000

3.2 Feature Engineering

One Hot Encoding for Location Features

# Keep only the rows of house_df that survived cleansing, matched by row name
house_df <- house_df[rownames(house_df_num), ]

# 1. Create a one-column data frame for statezip, with the column named "loc."

statezip <- house_df$statezip
statezip_df <- setNames(data.frame(statezip), "loc.")

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
## 
##     panel.fill
# One-hot encode the zip codes. fullRank = TRUE drops one reference level so
# the dummy columns are not perfectly collinear with the intercept of the
# linear model fitted on this data (avoids a rank-deficient fit)
df1 <- dummyVars("~.", data = statezip_df, fullRank = TRUE)
df2 <- data.frame(predict(df1, newdata = statezip_df))

# Append the dummy columns to the numeric features
house_df_num <- cbind(house_df_num, df2)
dim(house_df_num)
## [1] 4311   87

3.3 Training and Testing Split

# Fix the RNG seed so the split is reproducible
set.seed(2022)
row <- nrow(house_df_num)

# Draw 70% of the row positions for training
train_idx <- sample.int(row, 0.7 * row)

# Sampled rows form the training set; the remaining rows form the test set
train_data <- house_df_num[train_idx, ]
test_data <- house_df_num[-train_idx, ]

4. Modeling

Create Regression Model

# Linear regression of price on every other column of train_data, plus
# quadratic terms for sqft_living, bathrooms and view
mymodel <- lm(
  formula = price ~ . + I(sqft_living^2) + I(bathrooms^2) + I(view^2),
  data = train_data
)

5. Performance Evaluations

# Observed prices of the test rows
actual <- test_data$price

# Model predictions for the same rows
pred.myModel <- predict(mymodel, test_data)

price_df <- data.frame(actual, pred.myModel)

# Predicted vs. actual prices, both axes limited to 0 - 1,500,000
ggplot(price_df, aes(x = actual, y = pred.myModel)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 1500000)) +
  scale_y_continuous(limits = c(0, 1500000))

#' Print regression error metrics for a set of predictions
#'
#' Computes the mean squared error, root mean squared error, mean absolute
#' error and the correlation between `prediction` and `actual`, rounds each to
#' two decimals and prints them on a single line with `cat()`.
#'
#' @param prediction Numeric vector of predicted values.
#' @param actual Numeric vector of observed values, same length as
#'   `prediction`.
#' @param method Label for the model; included in the printed line.
#' @return `NULL`, invisibly (the value of `cat()`); called for its printed
#'   output.
performance <- function(prediction, actual, method) {
  error <- prediction - actual
  squared_error <- error^2
  mean_squared_error <- mean(squared_error)
  root_mean_squared_error <- sqrt(mean_squared_error)
  # MAE is the mean of the absolute errors, not abs() of the summed squares
  mean_absolute_error <- mean(abs(error))

  r <- cor(prediction, actual)

  result <- paste(
    "Method", method,
    "Mean Squared Error:", round(mean_squared_error, 2),
    "Root Mean Squared Error:", round(root_mean_squared_error, 2),
    "Mean Absolute Error:", round(mean_absolute_error, 2),
    "Correlation:", round(r, 2)
  )

  cat(result)
}

# Report the test-set metrics of the regression model
performance(pred.myModel, actual, "Regression Model")