Goal: predict rental prices in the San Francisco (SF) rental market. Click here for the data.

Import Data

# Download the TidyTuesday (2022-07-05) rental listings straight from GitHub.
rent <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-07-05/rent.csv"
)

# Per-column overview of the raw table (types, missingness, distributions).
skimr::skim(rent)

Data summary
Name	rent
Number of rows	200796
Number of columns	17
_______________________
Column type frequency:
character	8
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
post_id	0	1.00	9	14	200796
nhood	0	1.00	4	43	167
city	0	1.00	5	19	104
county	1394	0.99	4	13	10
address	196888	0.02	1	38	2869
title	2517	0.99	2	298	184961
descr	197542	0.02	13	16975	3025
details	192780	0.04	4	595	7667

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
date	0	1.00	20095718.38	44694.07	20000902.00	20050227.00	20110924.00	20120805.0	20180717.00	▁▇▁▆▃
year	0	1.00	2009.51	4.48	2000.00	2005.00	2011.00	2012.0	2018.00	▁▇▁▆▃
price	0	1.00	2135.36	1427.75	220.00	1295.00	1800.00	2505.0	40000.00	▇▁▁▁▁
beds	6608	0.97	1.89	1.08	0.00	1.00	2.00	3.0	12.00	▇▂▁▁▁
baths	158121	0.21	1.68	0.69	1.00	1.00	2.00	2.0	8.00	▇▁▁▁▁
sqft	136117	0.32	1201.83	5000.22	80.00	750.00	1000.00	1360.0	900000.00	▇▁▁▁▁
room_in_apt	0	1.00	0.00	0.04	0.00	0.00	0.00	0.0	1.00	▇▁▁▁▁
lat	193145	0.04	37.67	0.35	33.57	37.40	37.76	37.8	40.43	▁▁▅▇▁
lon	196484	0.02	-122.21	0.78	-123.20	-122.42	-122.26	-122.0	-74.20	▇▁▁▁▁

# Build the analysis table from the raw listings.
data <- rent %>%

  # Remove the columns that are almost entirely missing (address, descr,
  # details, lat, lon) together with date, year and room_in_apt
  select(-c(address, descr, details, lat, lon, date, year, room_in_apt)) %>%

  # Keep only the rows that are complete in every remaining column
  na.omit() %>%

  # price is positively skewed, so continue on the natural-log scale
  mutate(price = log(price))

Explore Data

Identify good predictors.

sqft

# Scatter plot: log price on x, square footage on y (y axis on a log10 scale)
ggplot(data, aes(x = price, y = sqft)) +
  geom_point() +
  scale_y_log10()

beds

# Distribution of log price for each bedroom count (one box per count)
ggplot(data, aes(x = price, y = as.factor(beds))) +
  geom_boxplot()

title

# Which words in the listing title go with the highest average log price?
data %>%

  # Split each title into one row per word
  unnest_tokens(word, title) %>%

  # Frequency and mean log price of every word
  group_by(word) %>%
  summarise(
    n = n(),
    price = mean(price)
  ) %>%
  ungroup() %>%

  # Keep words seen more than 10 times that contain no digits
  filter(n > 10) %>%
  filter(!str_detect(word, "\\d")) %>%

  # Top 20 words by mean log price
  slice_max(price, n = 20) %>%

  # Dot plot with words ordered by their mean log price
  ggplot(aes(x = price, y = fct_reorder(word, price))) +
  geom_point() +
  labs(y = "words in title")

EDA shortcut

# Step 1: drop the identifier and free-text columns, then binarize what is left
data_binarized_tbl <- binarize(
  select(data, -c(post_id, title))
)

# Step 2: correlate every binarized feature with the target column.
# NOTE(review): the target looks like the top log-price bin produced by
# binarize(); its cut point is embedded in the column name, so this name
# presumably needs updating whenever the data changes. Confirm before reuse.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  target = price__8.07868822922987_Inf
)

# Step 3: draw the correlation funnel
plot_correlation_funnel(data_corr_tbl)

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the correlationfunnel package.
##   Please report the issue at
##   <https://github.com/business-science/correlationfunnel/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: ggrepel: 69 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Code Along 1: SF Rental Market

Tyler Roden

2026-02-04

Import Data

Explore Data

Preprocess Data

Build Data

Evaluate Data

Make Predictions