library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(readr)
#Load in Data
path <- "C:/Users/nsepe/Downloads/apartments_data_cleaned.csv"
df <- read.csv(path,
stringsAsFactors = FALSE,
na.strings = c("", "NA"))
glimpse(df)
## Rows: 10,000
## Columns: 21
## $ id <dbl> 5668626895, 5664597177, 5668626833, 5659918074, 56686267…
## $ category <chr> "housing/rent/apartment", "housing/rent/apartment", "hou…
## $ title <chr> "Studio apartment 2nd St NE, Uhland Terrace NE, Washingt…
## $ body <chr> "This unit is located at second St NE, Uhland Terrace NE…
## $ amenities <chr> "nan", "nan", "nan", "nan", "nan", "Dishwasher,Elevator,…
## $ bathrooms <dbl> NA, NA, 1, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, 1, 1…
## $ bedrooms <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,…
## $ fee <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "N…
## $ has_photo <chr> "Thumbnail", "Thumbnail", "Thumbnail", "Thumbnail", "Thu…
## $ pets_allowed <chr> "None", "None", "None", "None", "None", "nan", "None", "…
## $ price <int> 790, 425, 1390, 925, 880, 2475, 1800, 840, 1495, 890, 99…
## $ price_display <chr> "$790", "$425", "$1,390", "$925", "$880", "$2,475", "$1,…
## $ price_type <chr> "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "…
## $ square_feet <int> 101, 106, 107, 116, 125, 130, 132, 136, 138, 141, 146, 1…
## $ address <chr> "nan", "814 Schutte Rd", "nan", "1717 12th Avenue", "nan…
## $ cityname <chr> "Washington", "Evansville", "Arlington", "Seattle", "Arl…
## $ state <chr> "DC", "IN", "VA", "WA", "VA", "NY", "CA", "DC", "CA", "D…
## $ latitude <dbl> 38.9057, 37.9680, 38.8910, 47.6160, 38.8738, 40.7629, 33…
## $ longitude <dbl> -76.9861, -87.6621, -77.0816, -122.3275, -77.1055, -73.9…
## $ source <chr> "RentLingo", "RentLingo", "RentLingo", "RentLingo", "Ren…
## $ time <chr> "2019-12-26 11:23:35", "2019-12-22 12:17:43", "2019-12-2…
skim(df)
Name | df |
Number of rows | 10000 |
Number of columns | 21 |
_______________________ | |
Column type frequency: | |
character | 14 |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
category | 0 | 1 | 17 | 23 | 0 | 3 | 0 |
title | 0 | 1 | 5 | 80 | 0 | 9359 | 0 |
body | 0 | 1 | 6 | 1039 | 0 | 9961 | 0 |
amenities | 0 | 1 | 2 | 184 | 0 | 2272 | 0 |
fee | 0 | 1 | 2 | 2 | 0 | 1 | 0 |
has_photo | 0 | 1 | 2 | 9 | 0 | 3 | 0 |
pets_allowed | 0 | 1 | 3 | 10 | 0 | 6 | 0 |
price_display | 0 | 1 | 4 | 19 | 0 | 1734 | 0 |
price_type | 0 | 1 | 6 | 14 | 0 | 3 | 0 |
address | 0 | 1 | 3 | 76 | 0 | 6659 | 0 |
cityname | 0 | 1 | 3 | 22 | 0 | 1575 | 0 |
state | 0 | 1 | 2 | 3 | 0 | 52 | 0 |
source | 0 | 1 | 8 | 17 | 0 | 12 | 0 |
time | 1 | 1 | 19 | 19 | 0 | 6309 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 5623395652.88 | 70210252.04 | 5508654087.00 | 5509248467.00 | 5.66861e+09 | 5668626447.25 | 5668662559.00 | ▃▁▁▁▇ |
bathrooms | 34 | 1 | 1.38 | 0.62 | 1.00 | 1.00 | 1.00000e+00 | 2.00 | 8.50 | ▇▁▁▁▁ |
bedrooms | 7 | 1 | 1.74 | 0.94 | 0.00 | 1.00 | 2.00000e+00 | 2.00 | 9.00 | ▇▇▁▁▁ |
price | 0 | 1 | 1486.28 | 1076.51 | 200.00 | 949.00 | 1.27000e+03 | 1695.00 | 52500.00 | ▇▁▁▁▁ |
square_feet | 0 | 1 | 945.81 | 655.76 | 101.00 | 649.00 | 8.02000e+02 | 1100.00 | 40000.00 | ▇▁▁▁▁ |
latitude | 10 | 1 | 37.70 | 5.50 | 21.32 | 33.68 | 3.88100e+01 | 41.35 | 61.59 | ▁▇▇▂▁ |
longitude | 10 | 1 | -94.65 | 15.76 | -158.02 | -101.30 | -9.36500e+01 | -82.21 | -70.19 | ▁▁▅▇▇ |
#Cleaning Data
library(tidyverse)
library(skimr)
library(here)
## here() starts at C:/Users/nsepe/Documents
#Remove duplicate rows
df <- df |> distinct()
#Impute missing values
#Numeric missing values are replaced by the median of the column they belong
#Text missing columns will read as "Unknown"
df <- df |>
mutate(
across(where(is.numeric),
~ if_else(is.na(.),
median(., na.rm = TRUE),
.)),
across(where(is.character),
~ replace_na(., "Unknown"))
)
#Amenity‐count column
df <- df |>
mutate(
n_amenities = if_else(
amenities == "Unknown" | amenities == "",
0,
str_count(amenities, ",") + 1
)
)
#Binarize pets_alowed
#1 = allowed
#0 = not allowed
df <- df |>
mutate(pets_allowed = str_replace_all(pets_allowed, '"', '')) |>
mutate(pet_flag = if_else(
pets_allowed %in% c("None","Unknown",""), 0, 1
))
#Drop redundant columns (fee, price_display, address)
#Fee: This column is always no
#price_display: Appears to be a string version of price
#Address: Will use other columns if we want information regarding location
df <- df |>
select(-c(fee, price_display, address))
#Create log(price)
df <- df |>
mutate(
log_price = log1p(price)
)
glimpse(df)
## Rows: 10,000
## Columns: 21
## $ id <dbl> 5668626895, 5664597177, 5668626833, 5659918074, 566862675…
## $ category <chr> "housing/rent/apartment", "housing/rent/apartment", "hous…
## $ title <chr> "Studio apartment 2nd St NE, Uhland Terrace NE, Washingto…
## $ body <chr> "This unit is located at second St NE, Uhland Terrace NE,…
## $ amenities <chr> "nan", "nan", "nan", "nan", "nan", "Dishwasher,Elevator,P…
## $ bathrooms <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ bedrooms <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, …
## $ has_photo <chr> "Thumbnail", "Thumbnail", "Thumbnail", "Thumbnail", "Thum…
## $ pets_allowed <chr> "None", "None", "None", "None", "None", "nan", "None", "N…
## $ price <dbl> 790, 425, 1390, 925, 880, 2475, 1800, 840, 1495, 890, 990…
## $ price_type <chr> "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "M…
## $ square_feet <dbl> 101, 106, 107, 116, 125, 130, 132, 136, 138, 141, 146, 15…
## $ cityname <chr> "Washington", "Evansville", "Arlington", "Seattle", "Arli…
## $ state <chr> "DC", "IN", "VA", "WA", "VA", "NY", "CA", "DC", "CA", "DC…
## $ latitude <dbl> 38.9057, 37.9680, 38.8910, 47.6160, 38.8738, 40.7629, 33.…
## $ longitude <dbl> -76.9861, -87.6621, -77.0816, -122.3275, -77.1055, -73.98…
## $ source <chr> "RentLingo", "RentLingo", "RentLingo", "RentLingo", "Rent…
## $ time <chr> "2019-12-26 11:23:35", "2019-12-22 12:17:43", "2019-12-26…
## $ n_amenities <dbl> 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 1, 1, …
## $ pet_flag <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ log_price <dbl> 6.673298, 6.054439, 7.237778, 6.830874, 6.781058, 7.81440…
skim(df)
Name | df |
Number of rows | 10000 |
Number of columns | 21 |
_______________________ | |
Column type frequency: | |
character | 11 |
numeric | 10 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
category | 0 | 1 | 17 | 23 | 0 | 3 | 0 |
title | 0 | 1 | 5 | 80 | 0 | 9359 | 0 |
body | 0 | 1 | 6 | 1039 | 0 | 9961 | 0 |
amenities | 0 | 1 | 2 | 184 | 0 | 2272 | 0 |
has_photo | 0 | 1 | 2 | 9 | 0 | 3 | 0 |
pets_allowed | 0 | 1 | 3 | 9 | 0 | 5 | 0 |
price_type | 0 | 1 | 6 | 14 | 0 | 3 | 0 |
cityname | 0 | 1 | 3 | 22 | 0 | 1575 | 0 |
state | 0 | 1 | 2 | 3 | 0 | 52 | 0 |
source | 0 | 1 | 8 | 17 | 0 | 12 | 0 |
time | 0 | 1 | 7 | 19 | 0 | 6310 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 5623395652.88 | 70210252.04 | 5508654087.00 | 5509248467.00 | 5.66861e+09 | 5668626447.25 | 5668662559.00 | ▃▁▁▁▇ |
bathrooms | 0 | 1 | 1.38 | 0.61 | 1.00 | 1.00 | 1.00000e+00 | 2.00 | 8.50 | ▇▁▁▁▁ |
bedrooms | 0 | 1 | 1.74 | 0.94 | 0.00 | 1.00 | 2.00000e+00 | 2.00 | 9.00 | ▇▇▁▁▁ |
price | 0 | 1 | 1486.28 | 1076.51 | 200.00 | 949.00 | 1.27000e+03 | 1695.00 | 52500.00 | ▇▁▁▁▁ |
square_feet | 0 | 1 | 945.81 | 655.76 | 101.00 | 649.00 | 8.02000e+02 | 1100.00 | 40000.00 | ▇▁▁▁▁ |
latitude | 0 | 1 | 37.70 | 5.49 | 21.32 | 33.69 | 3.88100e+01 | 41.35 | 61.59 | ▁▇▇▂▁ |
longitude | 0 | 1 | -94.65 | 15.75 | -158.02 | -101.30 | -9.36500e+01 | -82.29 | -70.19 | ▁▁▅▇▇ |
n_amenities | 0 | 1 | 3.47 | 3.13 | 1.00 | 1.00 | 2.00000e+00 | 5.00 | 18.00 | ▇▂▁▁▁ |
pet_flag | 0 | 1 | 0.76 | 0.43 | 0.00 | 1.00 | 1.00000e+00 | 1.00 | 1.00 | ▂▁▁▁▇ |
log_price | 0 | 1 | 7.18 | 0.47 | 5.30 | 6.86 | 7.15000e+00 | 7.44 | 10.87 | ▁▇▂▁▁ |
#EDA: Looking over numeric variables
df |>
select(log_price, square_feet, bathrooms, bedrooms, n_amenities) |>
pivot_longer(everything(), names_to="feature", values_to="value") |>
ggplot(aes(x = value)) +
geom_histogram(bins = 50) +
facet_wrap(~ feature, scales = "free") +
labs(title="Distributions of Core Numeric Features")
#The log price was the right move, price is now going to be prepped for modeling
#Square feet appears to be right skewed, might want to tryu log transfor as well
#Maybe make bedrooms and bathrooms categorical since most of it is skewed to lower numbers
#Square Feet(size) v. Rent
#Positive Relationship but not linear, likely need to log transform sqrft.
#Log transform before modeling this varaible
ggplot(df, aes(x = square_feet, y = log_price)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "loess", se = FALSE, color = "steelblue") +
labs(
title = "Log(Price) vs. Square Feet",
x = "Square Feet",
y = "Log(1 + Price)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#Amenities v. Rent
#Almost flat, suggests this to be a weak predictor of rent
ggplot(df, aes(x = n_amenities, y = log_price)) +
geom_jitter(width = 0.2, alpha = 0.2) +
geom_smooth(method = "lm", se = FALSE, color = "tomato") +
labs(
title = "Log(Price) vs. # Amenities",
x = "# Amenities",
y = "Log(1 + Price)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#Pet-Friendly Premium
#At best, there is a modest pet premium
ggplot(df, aes(x = factor(pet_flag), y = log_price)) +
geom_boxplot(fill = c("#D3D3D3", "#A6CEE3")) +
labs(
title = "Log(Price) by Pet-Friendly Status",
x = "Pets Allowed? (0 = No, 1 = Yes)",
y = "Log(1 + Price)"
) +
theme_minimal()
#Log Transformed Size
#Create log_sqft
#Observe a positive linear relationship
df <- df |>
mutate(log_sqft = log1p(square_feet))
#Plot log_price vs. log_sqft
ggplot(df, aes(x = log_sqft, y = log_price)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", se = FALSE, color = "darkblue") +
labs(
title = "Log(Price) vs. Log(Square Feet)",
x = "Log(1 + Square Feet)",
y = "Log(1 + Price)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#EDA on Bedrooms and Bathrooms
#Bedroom Summary
bed_stats <- df |>
group_by(bedrooms) |>
summarize(
count = n(),
mean_lp = mean(log_price),
median_lp = median(log_price),
sd_lp = sd(log_price)
) |>
arrange(bedrooms)
#Bathroom Summary
bath_stats <- df |>
group_by(bathrooms) |>
summarize(
count = n(),
mean_lp = mean(log_price),
median_lp = median(log_price),
sd_lp = sd(log_price)
) |>
arrange(bathrooms)
bed_stats
## # A tibble: 10 × 5
## bedrooms count mean_lp median_lp sd_lp
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 0 198 7.16 7.17 0.514
## 2 1 4607 7.03 7.00 0.404
## 3 2 3405 7.22 7.19 0.449
## 4 3 1276 7.37 7.31 0.447
## 5 4 404 7.61 7.49 0.482
## 6 5 89 8.05 8.01 0.485
## 7 6 15 8.37 8.01 0.804
## 8 7 3 8.88 8.95 0.266
## 9 8 2 8.66 8.66 0.520
## 10 9 1 8.07 8.07 NA
bath_stats
## # A tibble: 14 × 5
## bathrooms count mean_lp median_lp sd_lp
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 1 6673 7.05 7.01 0.411
## 2 1.5 282 7.23 7.19 0.450
## 3 2 2418 7.38 7.31 0.413
## 4 2.5 315 7.62 7.52 0.417
## 5 3 174 7.79 7.74 0.505
## 6 3.5 66 7.88 7.78 0.530
## 7 4 46 8.09 7.94 0.656
## 8 4.5 12 8.23 8.26 0.577
## 9 5 8 8.52 8.41 0.664
## 10 5.5 1 8.78 8.78 NA
## 11 6 1 9.16 9.16 NA
## 12 7 2 9.03 9.03 0.110
## 13 8 1 10.1 10.1 NA
## 14 8.5 1 9.31 9.31 NA
#Bedrooms v Price
ggplot(df, aes(x = factor(bedrooms), y = log_price)) +
geom_boxplot(fill = "lightsteelblue") +
labs(
title = "Log(Price) by # of Bedrooms",
x = "Bedrooms",
y = "Log(1 + Price)"
) +
theme_minimal()
#Bathrooms v price
ggplot(df, aes(x = factor(bathrooms), y = log_price)) +
geom_boxplot(fill = "lightcoral") +
labs(
title = "Log(Price) by # of Bathrooms",
x = "Bathrooms",
y = "Log(1 + Price)"
) +
theme_minimal()
-Predictive or Inferential???
-I feel like predictive is going to be the most straight forward
-Right now I only worked with numeric variables, we could also attempt to bring city name into the mix
-What Models:
1.) Multiple Linear Regression: I feel like it is a must
2.) Lasso: Addresses multicolin between bed and baths
3.) Random Forest: Go beyond Linearity