Stats_Final

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(skimr)
library(readr) 

#Load in Data
path <- "C:/Users/nsepe/Downloads/apartments_data_cleaned.csv"
df   <- read.csv(path, 
                 stringsAsFactors = FALSE,
                 na.strings = c("", "NA"))

glimpse(df)

## Rows: 10,000
## Columns: 21
## $ id            <dbl> 5668626895, 5664597177, 5668626833, 5659918074, 56686267…
## $ category      <chr> "housing/rent/apartment", "housing/rent/apartment", "hou…
## $ title         <chr> "Studio apartment 2nd St NE, Uhland Terrace NE, Washingt…
## $ body          <chr> "This unit is located at second St NE, Uhland Terrace NE…
## $ amenities     <chr> "nan", "nan", "nan", "nan", "nan", "Dishwasher,Elevator,…
## $ bathrooms     <dbl> NA, NA, 1, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, 1, 1…
## $ bedrooms      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,…
## $ fee           <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "N…
## $ has_photo     <chr> "Thumbnail", "Thumbnail", "Thumbnail", "Thumbnail", "Thu…
## $ pets_allowed  <chr> "None", "None", "None", "None", "None", "nan", "None", "…
## $ price         <int> 790, 425, 1390, 925, 880, 2475, 1800, 840, 1495, 890, 99…
## $ price_display <chr> "$790", "$425", "$1,390", "$925", "$880", "$2,475", "$1,…
## $ price_type    <chr> "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "…
## $ square_feet   <int> 101, 106, 107, 116, 125, 130, 132, 136, 138, 141, 146, 1…
## $ address       <chr> "nan", "814 Schutte Rd", "nan", "1717 12th Avenue", "nan…
## $ cityname      <chr> "Washington", "Evansville", "Arlington", "Seattle", "Arl…
## $ state         <chr> "DC", "IN", "VA", "WA", "VA", "NY", "CA", "DC", "CA", "D…
## $ latitude      <dbl> 38.9057, 37.9680, 38.8910, 47.6160, 38.8738, 40.7629, 33…
## $ longitude     <dbl> -76.9861, -87.6621, -77.0816, -122.3275, -77.1055, -73.9…
## $ source        <chr> "RentLingo", "RentLingo", "RentLingo", "RentLingo", "Ren…
## $ time          <chr> "2019-12-26 11:23:35", "2019-12-22 12:17:43", "2019-12-2…

skim(df)

Data summary
Name	df
Number of rows	10000
Number of columns	21
_______________________
Column type frequency:
character	14
numeric	7
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
category	0	1	17	23	3
title	0	1	5	80	9359
body	0	1	6	1039	9961
amenities	0	1	2	184	2272
fee	0	1	2	2	1
has_photo	0	1	2	9	3
pets_allowed	0	1	3	10	6
price_display	0	1	4	19	1734
price_type	0	1	6	14	3
address	0	1	3	76	6659
cityname	0	1	3	22	1575
state	0	1	2	3	52
source	0	1	8	17	12
time	1	1	19	19	6309

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1	5623395652.88	70210252.04	5508654087.00	5509248467.00	5.66861e+09	5668626447.25	5668662559.00	▃▁▁▁▇
bathrooms	34	1	1.38	0.62	1.00	1.00	1.00000e+00	2.00	8.50	▇▁▁▁▁
bedrooms	7	1	1.74	0.94	0.00	1.00	2.00000e+00	2.00	9.00	▇▇▁▁▁
price	0	1	1486.28	1076.51	200.00	949.00	1.27000e+03	1695.00	52500.00	▇▁▁▁▁
square_feet	0	1	945.81	655.76	101.00	649.00	8.02000e+02	1100.00	40000.00	▇▁▁▁▁
latitude	10	1	37.70	5.50	21.32	33.68	3.88100e+01	41.35	61.59	▁▇▇▂▁
longitude	10	1	-94.65	15.76	-158.02	-101.30	-9.36500e+01	-82.21	-70.19	▁▁▅▇▇

#Cleaning Data
library(tidyverse)
library(skimr)
library(here)

## here() starts at C:/Users/nsepe/Documents

#Remove duplicate rows
df <- df |> distinct()

#Impute missing values
#Numeric missing values are replaced by the median of the column they belong
#Text missing columns will read as "Unknown"
df <- df |>
  mutate(
    across(where(is.numeric),
           ~ if_else(is.na(.),
                     median(., na.rm = TRUE),
                     .)),
    across(where(is.character),
           ~ replace_na(., "Unknown"))
  )

#Amenity‐count column
df <- df |>
  mutate(
    n_amenities = if_else(
      amenities == "Unknown" | amenities == "",
      0,
      str_count(amenities, ",") + 1
    )
  )

#Binarize pets_alowed
#1 = allowed
#0 = not allowed
df <- df |>
  mutate(pets_allowed = str_replace_all(pets_allowed, '"', '')) |>
  mutate(pet_flag = if_else(
    pets_allowed %in% c("None","Unknown",""), 0, 1
  ))

#Drop redundant columns (fee, price_display, address)
#Fee: This column is always no
#price_display: Appears to be a string version of price
#Address: Will use other columns if we want information regarding location
df <- df |>
  select(-c(fee, price_display, address))

#Create log(price)
df <- df |>
  mutate(
    log_price = log1p(price)
  )

glimpse(df)

## Rows: 10,000
## Columns: 21
## $ id           <dbl> 5668626895, 5664597177, 5668626833, 5659918074, 566862675…
## $ category     <chr> "housing/rent/apartment", "housing/rent/apartment", "hous…
## $ title        <chr> "Studio apartment 2nd St NE, Uhland Terrace NE, Washingto…
## $ body         <chr> "This unit is located at second St NE, Uhland Terrace NE,…
## $ amenities    <chr> "nan", "nan", "nan", "nan", "nan", "Dishwasher,Elevator,P…
## $ bathrooms    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ bedrooms     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, …
## $ has_photo    <chr> "Thumbnail", "Thumbnail", "Thumbnail", "Thumbnail", "Thum…
## $ pets_allowed <chr> "None", "None", "None", "None", "None", "nan", "None", "N…
## $ price        <dbl> 790, 425, 1390, 925, 880, 2475, 1800, 840, 1495, 890, 990…
## $ price_type   <chr> "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "M…
## $ square_feet  <dbl> 101, 106, 107, 116, 125, 130, 132, 136, 138, 141, 146, 15…
## $ cityname     <chr> "Washington", "Evansville", "Arlington", "Seattle", "Arli…
## $ state        <chr> "DC", "IN", "VA", "WA", "VA", "NY", "CA", "DC", "CA", "DC…
## $ latitude     <dbl> 38.9057, 37.9680, 38.8910, 47.6160, 38.8738, 40.7629, 33.…
## $ longitude    <dbl> -76.9861, -87.6621, -77.0816, -122.3275, -77.1055, -73.98…
## $ source       <chr> "RentLingo", "RentLingo", "RentLingo", "RentLingo", "Rent…
## $ time         <chr> "2019-12-26 11:23:35", "2019-12-22 12:17:43", "2019-12-26…
## $ n_amenities  <dbl> 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 1, 1, …
## $ pet_flag     <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ log_price    <dbl> 6.673298, 6.054439, 7.237778, 6.830874, 6.781058, 7.81440…

skim(df)

Data summary
Name	df
Number of rows	10000
Number of columns	21
_______________________
Column type frequency:
character	11
numeric	10
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
category	1	17	23	3
title	1	5	80	9359
body	1	6	1039	9961
amenities	1	2	184	2272
has_photo	1	2	9	3
pets_allowed	1	3	9	5
price_type	1	6	14	3
cityname	1	3	22	1575
state	1	2	3	52
source	1	8	17	12
time	1	7	19	6310

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	5623395652.88	70210252.04	5508654087.00	5509248467.00	5.66861e+09	5668626447.25	5668662559.00	▃▁▁▁▇
bathrooms	1	1.38	0.61	1.00	1.00	1.00000e+00	2.00	8.50	▇▁▁▁▁
bedrooms	1	1.74	0.94	0.00	1.00	2.00000e+00	2.00	9.00	▇▇▁▁▁
price	1	1486.28	1076.51	200.00	949.00	1.27000e+03	1695.00	52500.00	▇▁▁▁▁
square_feet	1	945.81	655.76	101.00	649.00	8.02000e+02	1100.00	40000.00	▇▁▁▁▁
latitude	1	37.70	5.49	21.32	33.69	3.88100e+01	41.35	61.59	▁▇▇▂▁
longitude	1	-94.65	15.75	-158.02	-101.30	-9.36500e+01	-82.29	-70.19	▁▁▅▇▇
n_amenities	1	3.47	3.13	1.00	1.00	2.00000e+00	5.00	18.00	▇▂▁▁▁
pet_flag	1	0.76	0.43	0.00	1.00	1.00000e+00	1.00	1.00	▂▁▁▁▇
log_price	1	7.18	0.47	5.30	6.86	7.15000e+00	7.44	10.87	▁▇▂▁▁

#EDA: Looking over numeric variables
df |>
  select(log_price, square_feet, bathrooms, bedrooms, n_amenities) |>
  pivot_longer(everything(), names_to="feature", values_to="value") |>
  ggplot(aes(x = value)) +
    geom_histogram(bins = 50) +
    facet_wrap(~ feature, scales = "free") +
    labs(title="Distributions of Core Numeric Features")

#The log price was the right move, price is now going to be prepped for modeling
#Square feet appears to be right skewed, might want to tryu log transfor as well
#Maybe make bedrooms and bathrooms categorical since most of it is skewed to lower numbers

#Square Feet(size) v. Rent
#Positive Relationship but not linear, likely need to log transform sqrft.
#Log transform before modeling this varaible
ggplot(df, aes(x = square_feet, y = log_price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = FALSE, color = "steelblue") +
  labs(
    title = "Log(Price) vs. Square Feet",
    x     = "Square Feet",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

#Amenities v. Rent 
#Almost flat, suggests this to be a weak predictor of rent
ggplot(df, aes(x = n_amenities, y = log_price)) +
  geom_jitter(width = 0.2, alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE, color = "tomato") +
  labs(
    title = "Log(Price) vs. # Amenities",
    x     = "# Amenities",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

#Pet-Friendly Premium
#At best, there is a modest pet premium
ggplot(df, aes(x = factor(pet_flag), y = log_price)) +
  geom_boxplot(fill = c("#D3D3D3", "#A6CEE3")) +
  labs(
    title = "Log(Price) by Pet-Friendly Status",
    x     = "Pets Allowed? (0 = No, 1 = Yes)",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()

#Log Transformed Size
#Create log_sqft
#Observe a positive linear relationship 
df <- df |>
  mutate(log_sqft = log1p(square_feet))

#Plot log_price vs. log_sqft
ggplot(df, aes(x = log_sqft, y = log_price)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", se = FALSE, color = "darkblue") +
  labs(
    title = "Log(Price) vs. Log(Square Feet)",
    x     = "Log(1 + Square Feet)",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

#EDA on Bedrooms and Bathrooms

#Bedroom Summary
bed_stats <- df |>
  group_by(bedrooms) |>
  summarize(
    count     = n(),
    mean_lp   = mean(log_price),
    median_lp = median(log_price),
    sd_lp     = sd(log_price)
  ) |>
  arrange(bedrooms)

#Bathroom Summary
bath_stats <- df |>
  group_by(bathrooms) |>
  summarize(
    count     = n(),
    mean_lp   = mean(log_price),
    median_lp = median(log_price),
    sd_lp     = sd(log_price)
  ) |>
  arrange(bathrooms)

bed_stats

## # A tibble: 10 × 5
##    bedrooms count mean_lp median_lp  sd_lp
##       <dbl> <int>   <dbl>     <dbl>  <dbl>
##  1        0   198    7.16      7.17  0.514
##  2        1  4607    7.03      7.00  0.404
##  3        2  3405    7.22      7.19  0.449
##  4        3  1276    7.37      7.31  0.447
##  5        4   404    7.61      7.49  0.482
##  6        5    89    8.05      8.01  0.485
##  7        6    15    8.37      8.01  0.804
##  8        7     3    8.88      8.95  0.266
##  9        8     2    8.66      8.66  0.520
## 10        9     1    8.07      8.07 NA

bath_stats

## # A tibble: 14 × 5
##    bathrooms count mean_lp median_lp  sd_lp
##        <dbl> <int>   <dbl>     <dbl>  <dbl>
##  1       1    6673    7.05      7.01  0.411
##  2       1.5   282    7.23      7.19  0.450
##  3       2    2418    7.38      7.31  0.413
##  4       2.5   315    7.62      7.52  0.417
##  5       3     174    7.79      7.74  0.505
##  6       3.5    66    7.88      7.78  0.530
##  7       4      46    8.09      7.94  0.656
##  8       4.5    12    8.23      8.26  0.577
##  9       5       8    8.52      8.41  0.664
## 10       5.5     1    8.78      8.78 NA    
## 11       6       1    9.16      9.16 NA    
## 12       7       2    9.03      9.03  0.110
## 13       8       1   10.1      10.1  NA    
## 14       8.5     1    9.31      9.31 NA

#Bedrooms v Price
ggplot(df, aes(x = factor(bedrooms), y = log_price)) +
  geom_boxplot(fill = "lightsteelblue") +
  labs(
    title = "Log(Price) by # of Bedrooms",
    x     = "Bedrooms",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()

#Bathrooms v price
ggplot(df, aes(x = factor(bathrooms), y = log_price)) +
  geom_boxplot(fill = "lightcoral") +
  labs(
    title = "Log(Price) by # of Bathrooms",
    x     = "Bathrooms",
    y     = "Log(1 + Price)"
  ) +
  theme_minimal()

Meeting:

-Predictive or Inferential???

-I feel like predictive is going to be the most straight forward

-Right now I only worked with numeric variables, we could also attempt to bring city name into the mix

-What Models:

1.) Multiple Linear Regression: I feel like it is a must

2.) Lasso: Addresses multicolin between bed and baths

3.) Random Forest: Go beyond Linearity

Stats_Final_Proj

2025-04-29

Meeting: