Introduction

This project analyzes a simulated housing dataset in R to explore which factors are associated with house prices. It covers data cleaning, exploratory data analysis, visualization, and predictive modeling with linear regression.

Load Packages

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.3     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

── Attaching packages ────────────────────────────────────── tidymodels 1.5.0 ──
✔ broom        1.0.12     ✔ rsample      1.3.2 
✔ dials        1.4.3      ✔ tailor       0.1.0 
✔ infer        1.1.0      ✔ tune         2.1.0 
✔ modeldata    1.5.1      ✔ workflows    1.3.0 
✔ parsnip      1.5.0      ✔ workflowsets 1.1.1 
✔ recipes      1.3.2      ✔ yardstick    1.4.0 
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()

library(corrplot)

corrplot 0.95 loaded

Create Sample Housing Dataset

# Fix the random seed so the simulated values below are the same on every run.
set.seed(123)

# Build a 200-row simulated dataset. Every column is drawn on its own, in the
# order listed, so Sale_Price has no built-in relationship with any predictor;
# patterns seen in the later summaries, plots, and model come from sampling
# noise only. Changing the order of these calls changes every value.
housing <- tibble(
  Sale_Price = round(rnorm(200, mean = 180000, sd = 50000)),  # normal draw, rounded to a whole number
  Gr_Liv_Area = round(rnorm(200, mean = 1500, sd = 400)),     # normal draw, rounded to a whole number
  Overall_Qual = sample(1:10, 200, replace = TRUE),           # integer drawn uniformly from 1 to 10
  Garage_Cars = sample(0:4, 200, replace = TRUE),             # integer drawn uniformly from 0 to 4
  Year_Built = sample(1990:2023, 200, replace = TRUE)         # year drawn uniformly from 1990 to 2023
)

# Print a compact overview: row/column counts, column types, and first values.
glimpse(housing)

Rows: 200
Columns: 5
$ Sale_Price   <dbl> 151976, 168491, 257935, 183525, 186464, 265753, 203046, 1…
$ Gr_Liv_Area  <dbl> 2380, 2025, 1394, 1717, 1334, 1310, 1185, 1262, 2160, 147…
$ Overall_Qual <int> 9, 8, 7, 9, 3, 1, 7, 3, 7, 10, 3, 6, 7, 4, 10, 1, 8, 5, 3…
$ Garage_Cars  <int> 2, 0, 0, 0, 0, 2, 0, 1, 2, 3, 3, 2, 3, 1, 2, 3, 2, 3, 4, …
$ Year_Built   <int> 2008, 2014, 2006, 1998, 2012, 2020, 2022, 2005, 1997, 199…

Data Cleaning

# Total number of missing cells across every column of the dataset.
housing %>%
  is.na() %>%
  sum()

[1] 0

# Keep only rows with no missing value in any column.
housing <- drop_na(housing)

Exploratory Data Analysis

Summary Statistics

# One-row summary of the sale price: mean, maximum, and minimum.
summarise(
  housing,
  avg_price = mean(Sale_Price),
  max_price = max(Sale_Price),
  min_price = min(Sale_Price)
)

# A tibble: 1 × 3
  avg_price max_price min_price
      <dbl>     <dbl>     <dbl>
1   179571.    342052     64542

Price and Living Area by Overall Quality Using dplyr

# Per quality rating: mean sale price, mean living area, and number of houses.
# group_by() is used so the result rows come back sorted by Overall_Qual.
summarise(
  group_by(housing, Overall_Qual),
  average_price = mean(Sale_Price),
  average_area = mean(Gr_Liv_Area),
  houses = n()
)

# A tibble: 10 × 4
   Overall_Qual average_price average_area houses
          <int>         <dbl>        <dbl>  <int>
 1            1       187978.        1583.     16
 2            2       172552.        1290.     22
 3            3       173797.        1569.     21
 4            4       196524.        1514.     16
 5            5       170890.        1328.     20
 6            6       190235.        1498.     15
 7            7       179290.        1489.     25
 8            8       175654.        1603.     24
 9            9       171627.        1734.     17
10           10       184560.        1595.     24

Data Visualization

House Price Distribution

# Histogram of sale prices using 20 bins.
housing %>%
  ggplot(mapping = aes(x = Sale_Price)) +
  geom_histogram(bins = 20, fill = "skyblue") +
  ggtitle("Distribution of House Prices") +
  xlab("Sale Price") +
  ylab("Count")

Living Area vs Sale Price

# Scatter plot of living area against sale price with a fitted straight line.
# The smoothing formula is left at its default, so ggplot2 reports which one
# it picked.
housing %>%
  ggplot(mapping = aes(x = Gr_Liv_Area, y = Sale_Price)) +
  geom_point(color = "blue") +
  geom_smooth(color = "red", method = "lm") +
  labs(
    x = "Living Area",
    y = "Sale Price",
    title = "Living Area vs Sale Price"
  )

`geom_smooth()` using formula = 'y ~ x'

Correlation Analysis

# Columns included in the correlation matrix (Year_Built is left out).
numeric_data <- select(
  housing,
  Sale_Price,
  Gr_Liv_Area,
  Overall_Qual,
  Garage_Cars
)

# Pairwise correlations between the selected columns.
cor_matrix <- numeric_data %>%
  cor()

# Draw the matrix as a grid of colored tiles.
corrplot(corr = cor_matrix, method = "color")

Train-Test Split

# Randomly assign 80% of the rows to training and the rest to testing.
# No seed is set here; the split depends on the random-number state left by
# the earlier steps of the document.
split_data <- housing %>%
  initial_split(prop = 0.8)

train_data <- split_data %>%
  training()

test_data <- split_data %>%
  testing()

Linear Regression Model

# Model specification: linear regression fitted with the "lm" engine.
model <- set_engine(linear_reg(), engine = "lm")

# Fit on the training rows only, using living area, quality rating, and
# garage capacity as predictors of sale price.
model_fit <- fit(
  model,
  Sale_Price ~ Gr_Liv_Area + Overall_Qual + Garage_Cars,
  data = train_data
)

# Show the fitted call and coefficients.
model_fit

parsnip model object


Call:
stats::lm(formula = Sale_Price ~ Gr_Liv_Area + Overall_Qual + 
    Garage_Cars, data = data)

Coefficients:
 (Intercept)   Gr_Liv_Area  Overall_Qual   Garage_Cars  
   1.624e+05     4.661e+00     5.369e+02     3.338e+03

Predictions

# Predict sale prices for the held-out rows and place the .pred column in
# front of the original test columns.
predictions <- bind_cols(
  predict(model_fit, new_data = test_data),
  test_data
)

# Preview the first six rows.
predictions %>%
  head()

# A tibble: 6 × 6
    .pred Sale_Price Gr_Liv_Area Overall_Qual Garage_Cars Year_Built
    <dbl>      <dbl>       <dbl>        <int>       <int>      <int>
1 176085.     168491        2025            8           0       2014
2 185413.     200039        1993            7           3       2008
3 179537.     152208        1103           10           2       2005
4 172417.     187669        1213            2           1       1993
5 177328.     223907        1205            5           2       2001
6 188231.     248430        1536           10           4       1994

Actual vs Predicted Prices

# Compare predictions with the true prices on the test rows. The red line is
# y = x: a point on it is a prediction that exactly matches the actual price.
predictions %>%
  ggplot(mapping = aes(x = Sale_Price, y = .pred)) +
  geom_point(color = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(
    x = "Actual Prices",
    y = "Predicted Prices",
    title = "Actual vs Predicted Prices"
  )

Conclusion

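This project explored a simulated housing dataset in R and applied predictive analytics techniques to predict house prices. It demonstrated data cleaning, exploratory analysis, visualization, and machine learning using linear regression. Because the sale prices were generated independently of the other variables, the fitted model is not expected to predict well; the same workflow applied to real housing data would give more meaningful results.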