This report provides house price prediction using regression algorithms.
The dataset used for modeling in this report is real data from the US. It is hosted on Kaggle and can be downloaded here: https://www.kaggle.com/datasets/gabrielsantello/cars-purchase-decision-dataset
# Session setup ----
# Deliberately no `rm(list = ls())` here: it would delete every object in the
# workspace of whoever runs this script, yet leave attached packages and
# options untouched, so it does not produce a clean session anyway.
# Restart R to get a fresh state instead.
library(ggplot2)
library(dplyr)

# Load the house data; the path is resolved against the working directory.
house_df <- read.csv("data/house.csv")
## 2.1. Univariate Analysis
# Box plot of price, zoomed to the 0 - 2,000,000 range.
# coord_cartesian() only crops the view. Limits set on the y scale would
# discard every row above 2,000,000 before the box plot statistics are
# computed, so the median, quartiles and whiskers would no longer describe
# the full data.
ggplot(data = house_df, aes(y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 2000000))
# Box plot of the sqft_living column over its full range.
ggplot(house_df) +
  geom_boxplot(aes(y = sqft_living))

# Histogram of the sqft_living column using ggplot2's default binning.
ggplot(house_df) +
  geom_histogram(aes(x = sqft_living))
## 2.2. Bivariate Analysis
# price against sqft_living: every row, default axes.
ggplot(data = house_df, aes(x = sqft_living, y = price)) +
  geom_point()

# Same scatter zoomed to sqft_living 0 - 7500 and price 0 - 2,000,000, with a
# linear trend line on top of the points.
# The zoom uses coord_cartesian() so that rows outside the window still take
# part in the lm fit. Limits set on the scales would drop those rows first,
# and the line would then describe a sample truncated on the response.
# `formula` is spelled out so geom_smooth() does not have to announce the
# default it picked.
ggplot(data = house_df, aes(x = sqft_living, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  coord_cartesian(xlim = c(0, 7500), ylim = c(0, 2000000))
# price against the raw bedrooms column. The y scale is limited to
# 0 - 2,000,000, so rows with a price outside that range are not drawn.
ggplot(house_df, aes(bedrooms, price)) +
  geom_point() +
  ylim(0, 2000000)
## add bedrooms columns as factor
# A factor copy of bedrooms gives the plot a discrete x axis, i.e. one box
# per distinct bedrooms value instead of a single box over a continuous axis.
house_df$bedrooms2 <- factor(house_df$bedrooms)

# price distribution per bedrooms level, zoomed to 0 - 2,000,000.
# coord_cartesian() crops the view only. Scale limits would remove rows above
# 2,000,000 before the per-group box plot statistics are computed.
ggplot(data = house_df, aes(x = bedrooms2, y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 2000000))
## 2.3. Multivariate Analysis
### Challenge: plot sqft_living, bedrooms, price
### Hint: try geom_point, with x, y, color
# Three variables in one scatter: sqft_living on x, price on y, and the
# bedrooms2 factor (built earlier in this script) as the point colour.
# Both axes carry scale limits, so rows outside 0 - 7500 on x or
# 0 - 2,000,000 on y are not drawn.
ggplot(house_df, aes(sqft_living, price, colour = bedrooms2)) +
  geom_point() +
  xlim(0, 7500) +
  ylim(0, 2000000)