library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(readr) 

#Load in Data
path <- "C:/Users/nsepe/Downloads/apartments_data_cleaned.csv"
df   <- read.csv(path, 
                 stringsAsFactors = FALSE,
                 na.strings = c("", "NA"))

glimpse(df)
## Rows: 10,000
## Columns: 21
## $ id            <dbl> 5668626895, 5664597177, 5668626833, 5659918074, 56686267…
## $ category      <chr> "housing/rent/apartment", "housing/rent/apartment", "hou…
## $ title         <chr> "Studio apartment 2nd St NE, Uhland Terrace NE, Washingt…
## $ body          <chr> "This unit is located at second St NE, Uhland Terrace NE…
## $ amenities     <chr> "nan", "nan", "nan", "nan", "nan", "Dishwasher,Elevator,…
## $ bathrooms     <dbl> NA, NA, 1, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, 1, 1…
## $ bedrooms      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,…
## $ fee           <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "N…
## $ has_photo     <chr> "Thumbnail", "Thumbnail", "Thumbnail", "Thumbnail", "Thu…
## $ pets_allowed  <chr> "None", "None", "None", "None", "None", "nan", "None", "…
## $ price         <int> 790, 425, 1390, 925, 880, 2475, 1800, 840, 1495, 890, 99…
## $ price_display <chr> "$790", "$425", "$1,390", "$925", "$880", "$2,475", "$1,…
## $ price_type    <chr> "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "…
## $ square_feet   <int> 101, 106, 107, 116, 125, 130, 132, 136, 138, 141, 146, 1…
## $ address       <chr> "nan", "814 Schutte Rd", "nan", "1717 12th Avenue", "nan…
## $ cityname      <chr> "Washington", "Evansville", "Arlington", "Seattle", "Arl…
## $ state         <chr> "DC", "IN", "VA", "WA", "VA", "NY", "CA", "DC", "CA", "D…
## $ latitude      <dbl> 38.9057, 37.9680, 38.8910, 47.6160, 38.8738, 40.7629, 33…
## $ longitude     <dbl> -76.9861, -87.6621, -77.0816, -122.3275, -77.1055, -73.9…
## $ source        <chr> "RentLingo", "RentLingo", "RentLingo", "RentLingo", "Ren…
## $ time          <chr> "2019-12-26 11:23:35", "2019-12-22 12:17:43", "2019-12-2…
skim(df)
Data summary
Name df
Number of rows 10000
Number of columns 21
_______________________
Column type frequency:
character 14
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
category 0 1 17 23 0 3 0
title 0 1 5 80 0 9359 0
body 0 1 6 1039 0 9961 0
amenities 0 1 2 184 0 2272 0
fee 0 1 2 2 0 1 0
has_photo 0 1 2 9 0 3 0
pets_allowed 0 1 3 10 0 6 0
price_display 0 1 4 19 0 1734 0
price_type 0 1 6 14 0 3 0
address 0 1 3 76 0 6659 0
cityname 0 1 3 22 0 1575 0
state 0 1 2 3 0 52 0
source 0 1 8 17 0 12 0
time 1 1 19 19 0 6309 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 5623395652.88 70210252.04 5508654087.00 5509248467.00 5.66861e+09 5668626447.25 5668662559.00 ▃▁▁▁▇
bathrooms 34 1 1.38 0.62 1.00 1.00 1.00000e+00 2.00 8.50 ▇▁▁▁▁
bedrooms 7 1 1.74 0.94 0.00 1.00 2.00000e+00 2.00 9.00 ▇▇▁▁▁
price 0 1 1486.28 1076.51 200.00 949.00 1.27000e+03 1695.00 52500.00 ▇▁▁▁▁
square_feet 0 1 945.81 655.76 101.00 649.00 8.02000e+02 1100.00 40000.00 ▇▁▁▁▁
latitude 10 1 37.70 5.50 21.32 33.68 3.88100e+01 41.35 61.59 ▁▇▇▂▁
longitude 10 1 -94.65 15.76 -158.02 -101.30 -9.36500e+01 -82.21 -70.19 ▁▁▅▇▇
#Cleaning Data
library(tidyverse)
library(skimr)
library(here)
## here() starts at C:/Users/nsepe/Documents
#Remove duplicate rows
df <- df |> distinct()

#Impute missing values
#Numeric missing values are replaced by the median of the column they belong
#Text missing columns will read as "Unknown"
df <- df |>
  mutate(
    across(where(is.numeric),
           ~ if_else(is.na(.),
                     median(., na.rm = TRUE),
                     .)),
    across(where(is.character),
           ~ replace_na(., "Unknown"))
  )

#Amenity‐count column
df <- df |>
  mutate(
    n_amenities = if_else(
      amenities == "Unknown" | amenities == "",
      0,
      str_count(amenities, ",") + 1
    )
  )

#Binarize pets_alowed
#1 = allowed
#0 = not allowed
df <- df |>
  mutate(pets_allowed = str_replace_all(pets_allowed, '"', '')) |>
  mutate(pet_flag = if_else(
    pets_allowed %in% c("None","Unknown",""), 0, 1
  ))

#Drop redundant columns (fee, price_display, address)
#Fee: This column is always no
#price_display: Appears to be a string version of price
#Address: Will use other columns if we want information regarding location
df <- df |>
  select(-c(fee, price_display, address))

#Create log(price)
df <- df |>
  mutate(
    log_price = log1p(price)
  )

glimpse(df)
## Rows: 10,000
## Columns: 21
## $ id           <dbl> 5668626895, 5664597177, 5668626833, 5659918074, 566862675…
## $ category     <chr> "housing/rent/apartment", "housing/rent/apartment", "hous…
## $ title        <chr> "Studio apartment 2nd St NE, Uhland Terrace NE, Washingto…
## $ body         <chr> "This unit is located at second St NE, Uhland Terrace NE,…
## $ amenities    <chr> "nan", "nan", "nan", "nan", "nan", "Dishwasher,Elevator,P…
## $ bathrooms    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ bedrooms     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, …
## $ has_photo    <chr> "Thumbnail", "Thumbnail", "Thumbnail", "Thumbnail", "Thum…
## $ pets_allowed <chr> "None", "None", "None", "None", "None", "nan", "None", "N…
## $ price        <dbl> 790, 425, 1390, 925, 880, 2475, 1800, 840, 1495, 890, 990…
## $ price_type   <chr> "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "M…
## $ square_feet  <dbl> 101, 106, 107, 116, 125, 130, 132, 136, 138, 141, 146, 15…
## $ cityname     <chr> "Washington", "Evansville", "Arlington", "Seattle", "Arli…
## $ state        <chr> "DC", "IN", "VA", "WA", "VA", "NY", "CA", "DC", "CA", "DC…
## $ latitude     <dbl> 38.9057, 37.9680, 38.8910, 47.6160, 38.8738, 40.7629, 33.…
## $ longitude    <dbl> -76.9861, -87.6621, -77.0816, -122.3275, -77.1055, -73.98…
## $ source       <chr> "RentLingo", "RentLingo", "RentLingo", "RentLingo", "Rent…
## $ time         <chr> "2019-12-26 11:23:35", "2019-12-22 12:17:43", "2019-12-26…
## $ n_amenities  <dbl> 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 1, 1, …
## $ pet_flag     <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ log_price    <dbl> 6.673298, 6.054439, 7.237778, 6.830874, 6.781058, 7.81440…
skim(df)
Data summary
Name df
Number of rows 10000
Number of columns 21
_______________________
Column type frequency:
character 11
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
category 0 1 17 23 0 3 0
title 0 1 5 80 0 9359 0
body 0 1 6 1039 0 9961 0
amenities 0 1 2 184 0 2272 0
has_photo 0 1 2 9 0 3 0
pets_allowed 0 1 3 9 0 5 0
price_type 0 1 6 14 0 3 0
cityname 0 1 3 22 0 1575 0
state 0 1 2 3 0 52 0
source 0 1 8 17 0 12 0
time 0 1 7 19 0 6310 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 5623395652.88 70210252.04 5508654087.00 5509248467.00 5.66861e+09 5668626447.25 5668662559.00 ▃▁▁▁▇
bathrooms 0 1 1.38 0.61 1.00 1.00 1.00000e+00 2.00 8.50 ▇▁▁▁▁
bedrooms 0 1 1.74 0.94 0.00 1.00 2.00000e+00 2.00 9.00 ▇▇▁▁▁
price 0 1 1486.28 1076.51 200.00 949.00 1.27000e+03 1695.00 52500.00 ▇▁▁▁▁
square_feet 0 1 945.81 655.76 101.00 649.00 8.02000e+02 1100.00 40000.00 ▇▁▁▁▁
latitude 0 1 37.70 5.49 21.32 33.69 3.88100e+01 41.35 61.59 ▁▇▇▂▁
longitude 0 1 -94.65 15.75 -158.02 -101.30 -9.36500e+01 -82.29 -70.19 ▁▁▅▇▇
n_amenities 0 1 3.47 3.13 1.00 1.00 2.00000e+00 5.00 18.00 ▇▂▁▁▁
pet_flag 0 1 0.76 0.43 0.00 1.00 1.00000e+00 1.00 1.00 ▂▁▁▁▇
log_price 0 1 7.18 0.47 5.30 6.86 7.15000e+00 7.44 10.87 ▁▇▂▁▁
#EDA: Looking over numeric variables
df |>
  select(log_price, square_feet, bathrooms, bedrooms, n_amenities) |>
  pivot_longer(everything(), names_to="feature", values_to="value") |>
  ggplot(aes(x = value)) +
    geom_histogram(bins = 50) +
    facet_wrap(~ feature, scales = "free") +
    labs(title="Distributions of Core Numeric Features")

#The log price was the right move, price is now going to be prepped for modeling
#Square feet appears to be right skewed, might want to tryu log transfor as well
#Maybe make bedrooms and bathrooms categorical since most of it is skewed to lower numbers
#Square Feet(size) v. Rent
#Positive Relationship but not linear, likely need to log transform sqrft.
#Log transform before modeling this varaible
ggplot(df, aes(x = square_feet, y = log_price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = FALSE, color = "steelblue") +
  labs(
    title = "Log(Price) vs. Square Feet",
    x     = "Square Feet",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#Amenities v. Rent 
#Almost flat, suggests this to be a weak predictor of rent
ggplot(df, aes(x = n_amenities, y = log_price)) +
  geom_jitter(width = 0.2, alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE, color = "tomato") +
  labs(
    title = "Log(Price) vs. # Amenities",
    x     = "# Amenities",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#Pet-Friendly Premium
#At best, there is a modest pet premium
ggplot(df, aes(x = factor(pet_flag), y = log_price)) +
  geom_boxplot(fill = c("#D3D3D3", "#A6CEE3")) +
  labs(
    title = "Log(Price) by Pet-Friendly Status",
    x     = "Pets Allowed? (0 = No, 1 = Yes)",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()

#Log Transformed Size
#Create log_sqft
#Observe a positive linear relationship 
df <- df |>
  mutate(log_sqft = log1p(square_feet))

#Plot log_price vs. log_sqft
ggplot(df, aes(x = log_sqft, y = log_price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE, color = "darkblue") +
  labs(
    title = "Log(Price) vs. Log(Square Feet)",
    x     = "Log(1 + Square Feet)",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#EDA on Bedrooms and Bathrooms

#Bedroom Summary
bed_stats <- df |>
  group_by(bedrooms) |>
  summarize(
    count     = n(),
    mean_lp   = mean(log_price),
    median_lp = median(log_price),
    sd_lp     = sd(log_price)
  ) |>
  arrange(bedrooms)

#Bathroom Summary
bath_stats <- df |>
  group_by(bathrooms) |>
  summarize(
    count     = n(),
    mean_lp   = mean(log_price),
    median_lp = median(log_price),
    sd_lp     = sd(log_price)
  ) |>
  arrange(bathrooms)

bed_stats
## # A tibble: 10 × 5
##    bedrooms count mean_lp median_lp  sd_lp
##       <dbl> <int>   <dbl>     <dbl>  <dbl>
##  1        0   198    7.16      7.17  0.514
##  2        1  4607    7.03      7.00  0.404
##  3        2  3405    7.22      7.19  0.449
##  4        3  1276    7.37      7.31  0.447
##  5        4   404    7.61      7.49  0.482
##  6        5    89    8.05      8.01  0.485
##  7        6    15    8.37      8.01  0.804
##  8        7     3    8.88      8.95  0.266
##  9        8     2    8.66      8.66  0.520
## 10        9     1    8.07      8.07 NA
bath_stats
## # A tibble: 14 × 5
##    bathrooms count mean_lp median_lp  sd_lp
##        <dbl> <int>   <dbl>     <dbl>  <dbl>
##  1       1    6673    7.05      7.01  0.411
##  2       1.5   282    7.23      7.19  0.450
##  3       2    2418    7.38      7.31  0.413
##  4       2.5   315    7.62      7.52  0.417
##  5       3     174    7.79      7.74  0.505
##  6       3.5    66    7.88      7.78  0.530
##  7       4      46    8.09      7.94  0.656
##  8       4.5    12    8.23      8.26  0.577
##  9       5       8    8.52      8.41  0.664
## 10       5.5     1    8.78      8.78 NA    
## 11       6       1    9.16      9.16 NA    
## 12       7       2    9.03      9.03  0.110
## 13       8       1   10.1      10.1  NA    
## 14       8.5     1    9.31      9.31 NA
#Bedrooms v Price
ggplot(df, aes(x = factor(bedrooms), y = log_price)) +
  geom_boxplot(fill = "lightsteelblue") +
  labs(
    title = "Log(Price) by # of Bedrooms",
    x     = "Bedrooms",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()

#Bathrooms v price
ggplot(df, aes(x = factor(bathrooms), y = log_price)) +
  geom_boxplot(fill = "lightcoral") +
  labs(
    title = "Log(Price) by # of Bathrooms",
    x     = "Bathrooms",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()

Meeting:

-Predictive or Inferential???

-I feel like predictive is going to be the most straight forward

-Right now I only worked with numeric variables, we could also attempt to bring city name into the mix

-What Models:

1.) Multiple Linear Regression: I feel like it is a must

2.) Lasso: Addresses multicolin between bed and baths

3.) Random Forest: Go beyond Linearity