Import Data

# Download the TidyTuesday (2022-07-05) rental listings and parse them
# into a tibble.
rent <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-07-05/rent.csv"
)

## Rows: 200796 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): post_id, nhood, city, county, address, title, descr, details
## dbl (9): date, year, price, beds, baths, sqft, room_in_apt, lat, lon
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Per-column overview: missingness, uniqueness and distribution summaries.
skimr::skim(data = rent)

Data summary
Name	rent
Number of rows	200796
Number of columns	17
_______________________
Column type frequency:
character	8
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
post_id	0	1.00	9	14	200796
nhood	0	1.00	4	43	167
city	0	1.00	5	19	104
county	1394	0.99	4	13	10
address	196888	0.02	1	38	2869
title	2517	0.99	2	298	184961
descr	197542	0.02	13	16975	3025
details	192780	0.04	4	595	7667

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
date	0	1.00	20095718.38	44694.07	20000902.00	20050227.00	20110924.00	20120805.0	20180717.00	▁▇▁▆▃
year	0	1.00	2009.51	4.48	2000.00	2005.00	2011.00	2012.0	2018.00	▁▇▁▆▃
price	0	1.00	2135.36	1427.75	220.00	1295.00	1800.00	2505.0	40000.00	▇▁▁▁▁
beds	6608	0.97	1.89	1.08	0.00	1.00	2.00	3.0	12.00	▇▂▁▁▁
baths	158121	0.21	1.68	0.69	1.00	1.00	2.00	2.0	8.00	▇▁▁▁▁
sqft	136117	0.32	1201.83	5000.22	80.00	750.00	1000.00	1360.0	900000.00	▇▁▁▁▁
room_in_apt	0	1.00	0.00	0.04	0.00	0.00	0.00	0.0	1.00	▇▁▁▁▁
lat	193145	0.04	37.67	0.35	33.57	37.40	37.76	37.8	40.43	▁▁▅▇▁
lon	196484	0.02	-122.21	0.78	-123.20	-122.42	-122.26	-122.0	-74.20	▇▁▁▁▁

# Build the analysis table from the raw listings.
data <- rent %>%

    # Remove the columns excluded from the analysis (several of them are
    # almost entirely missing), then keep only fully observed rows.
    select(
        -c(address, descr, details, lat, lon, date, year, room_in_apt)
    ) %>%
    na.omit() %>%

    # price is strongly right-skewed, so work with its natural log.
    mutate(price = log(price))

Explore Data

Identify good predictors of rental price.

sqft

# Scatter plot: log price on x, square footage on a log10 y axis.
data %>%
    ggplot(mapping = aes(x = price, y = sqft)) +
    geom_point() +
    scale_y_log10()

beds

# Box plots of log price, one box per bedroom count.
data %>%
    ggplot(mapping = aes(x = price, y = as.factor(beds))) +
    geom_boxplot()

title

# Which words in a listing title go with the highest (log) prices?
data %>%

    # One row per word appearing in a title
    unnest_tokens(output = word, input = title) %>%

    # Mean log price and number of occurrences for every word
    group_by(word) %>%
    summarise(
        price = mean(price),
        n = n()
    ) %>%
    ungroup() %>%

    # Keep words seen more than 10 times that contain no digits,
    # then take the 20 with the highest mean price
    filter(n > 10) %>%
    filter(!str_detect(word, "\\d")) %>%
    slice_max(price, n = 20) %>%

    # Dot plot with words ordered by their mean price
    ggplot(mapping = aes(x = price, y = fct_reorder(word, price))) +
    geom_point() +
    labs(y = "Words in Title")

Code Along 1: SF Rental Market

Cam Paquette

2025-02-04

Import Data

Explore Data

Preprocess data

Build Models

Evaluate Models

Make Predictions