Description

This report predicts house prices using regression algorithms.

The dataset used for modeling in this report is real data from the US. The dataset is hosted on Kaggle and can be downloaded here: https://www.kaggle.com/datasets/gabrielsantello/cars-purchase-decision-dataset

# Remove every object that ls() reports in the current environment (names
# starting with "." are not listed by default, so they are kept).
# NOTE(review): this does not detach packages or reset options(), so it is not
# a clean-session guarantee -- consider restarting R instead of relying on it.
rm(list = ls())
library(ggplot2)
library(dplyr)

1. Data Extraction

# Read the house CSV (path relative to the working directory) into a data frame.
house_df <- read.csv(file = "data/house.csv")

2. Exploratory Data Analysis

## 2.1. Univariate Analysis
# Box plot of price, zoomed to the 0-2,000,000 range.
# The zoom is done with coord_cartesian() rather than scale limits: limits on
# the scale drop out-of-range rows *before* the box statistics are computed,
# which would distort the median/quartiles; coord_cartesian() only crops the
# view and keeps the statistics based on all rows.
ggplot(data = house_df, aes(y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 2000000))

# Box plot of sqft_living over its full range (no axis limits applied).
ggplot(house_df) +
  geom_boxplot(mapping = aes(y = sqft_living))

# Histogram of sqft_living.
# bins is spelled out with ggplot2's default value (30): the plot is unchanged,
# but stat_bin() no longer prints its "Pick better value" message and the
# bin count is visible to the reader for later tuning.
ggplot(data = house_df, aes(x = sqft_living)) +
  geom_histogram(bins = 30)

## 2.2. Bivariate Analysis
# Scatter plot of price (y) against sqft_living (x), full data range.
ggplot(house_df, aes(sqft_living, price)) +
  geom_point()

# Scatter plot of price against sqft_living with a fitted linear trend,
# zoomed to x in [0, 7500] and y in [0, 2,000,000].
# coord_cartesian() is used for the zoom because limits set on the scales
# remove out-of-range rows before geom_smooth() fits the model, so the line
# would be estimated from a truncated sample. The method is named as a string
# and the formula is stated explicitly (it is the default for lm) to avoid the
# "using formula" message.
ggplot(data = house_df, aes(x = sqft_living,
                            y = price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  coord_cartesian(xlim = c(0, 7500), ylim = c(0, 2000000))

# Scatter plot of price against the numeric bedrooms column.
# The y-scale limits remove rows whose price lies outside 0-2,000,000.
ggplot(house_df) +
  geom_point(mapping = aes(x = bedrooms, y = price)) +
  scale_y_continuous(limits = c(0, 2e6))

## Add a factor copy of the bedrooms column so that each bedroom count is
## treated as its own category (one box per level) in the plot below.

house_df$bedrooms2 <- factor(house_df$bedrooms)

# Box plot of price per bedrooms2 level, zoomed to 0-2,000,000.
# coord_cartesian() crops the view only; limits on the y scale would discard
# rows above the limit before the per-group box statistics are computed.
ggplot(data = house_df, aes(x = bedrooms2,
                            y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 2000000))

## 2.3. Multivariate Analysis

### Challenge: plot sqft_living, bedrooms, price
### Hint: try geom_point, with x, y, color 

# Scatter plot of price against sqft_living, colored by the bedrooms2 factor.
# Both scales carry limits, so rows outside x in [0, 7500] or
# y in [0, 2,000,000] are removed from the plot.
ggplot(house_df, aes(sqft_living, price, color = bedrooms2)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 7500)) +
  scale_y_continuous(limits = c(0, 2e6))

2.1 Univariate Analysis
2.2 Bivariate Analysis
2.3 Multivariate Analysis

3. Data Preparation