Code Along 4: SF Rental Market

rent <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-07-05/rent.csv')

skimr::skim(rent)

Data summary
Name	rent
Number of rows	200796
Number of columns	17
_______________________
Column type frequency:
character	8
numeric	9
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
post_id	0	1.00	9	14	200796
nhood	0	1.00	4	43	167
city	0	1.00	5	19	104
county	1394	0.99	4	13	10
address	196888	0.02	1	38	2869
title	2517	0.99	2	298	184961
descr	197542	0.02	13	16975	3025
details	192780	0.04	4	595	7667

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
date	0	1.00	20095718.38	44694.07	20000902.00	20050227.00	20110924.00	20120805.0	20180717.00	▁▇▁▆▃
year	0	1.00	2009.51	4.48	2000.00	2005.00	2011.00	2012.0	2018.00	▁▇▁▆▃
price	0	1.00	2135.36	1427.75	220.00	1295.00	1800.00	2505.0	40000.00	▇▁▁▁▁
beds	6608	0.97	1.89	1.08	0.00	1.00	2.00	3.0	12.00	▇▂▁▁▁
baths	158121	0.21	1.68	0.69	1.00	1.00	2.00	2.0	8.00	▇▁▁▁▁
sqft	136117	0.32	1201.83	5000.22	80.00	750.00	1000.00	1360.0	900000.00	▇▁▁▁▁
room_in_apt	0	1.00	0.00	0.04	0.00	0.00	0.00	0.0	1.00	▇▁▁▁▁
lat	193145	0.04	37.67	0.35	33.57	37.40	37.76	37.8	40.43	▁▁▅▇▁
lon	196484	0.02	-122.21	0.78	-123.20	-122.42	-122.26	-122.0	-74.20	▇▁▁▁▁

data <- rent %>%
    
    # Treat missing values
    select(-address, -descr, -details, -lat, -lon, -date, -year, -room_in_apt) %>%
    na.omit()

data %>%
    ggplot(aes(price)) +
    geom_histogram(bins = 20)

library(tidytext)

tidy_data <-
    data %>%
    unnest_tokens(description, title)

tidy_data %>%
    count(description, sort = TRUE)

## # A tibble: 8,928 × 2
##    description     n
##    <chr>       <int>
##  1 2            7097
##  2 bath         5722
##  3 2br          5044
##  4 1            4363
##  5 in           3450
##  6 3br          3121
##  7 bedroom      2893
##  8 this         2541
##  9 3            2476
## 10 home         2190
## # ℹ 8,918 more rows

tidy_data %>%
    group_by(description) %>%
    summarise(n = n(),
              price = mean(price)) %>%
    ggplot(aes(n, price)) +
    geom_hline(yintercept = mean(data$price),
               lty = 2, color = "gray50", linewidth = 1.2) +
    geom_point(color = "midnightblue", alpha = 0.7) +
    geom_text(aes(label = description), check_overlap = TRUE, vjust = "top", hjust = "left") +
    scale_x_log10()

library(tidymodels)

## Warning: package 'broom' was built under R version 4.3.3

## Warning: package 'modeldata' was built under R version 4.3.3

set.seed(123)
place_split <- initial_split(data, strata = price)
place_train <- training(place_split)
place_test <- testing(place_split)

set.seed(234)
place_folds <- vfold_cv(place_train, strata = price)
place_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits              id    
##    <list>              <chr> 
##  1 <split [9714/1080]> Fold01
##  2 <split [9714/1080]> Fold02
##  3 <split [9714/1080]> Fold03
##  4 <split [9714/1080]> Fold04
##  5 <split [9714/1080]> Fold05
##  6 <split [9715/1079]> Fold06
##  7 <split [9715/1079]> Fold07
##  8 <split [9715/1079]> Fold08
##  9 <split [9715/1079]> Fold09
## 10 <split [9716/1078]> Fold10

library(textrecipes)
data_recipe <- 
    recipe(formula = price ~ title, data = place_train) %>%
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title)

ranger_spec <-
    rand_forest(trees = 500) %>%
    set_mode("regression")

ranger_spec

## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 500
## 
## Computational engine: ranger

svm_spec <-
    svm_linear() %>%
    set_mode("regression")

svm_spec

## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR

ranger_wf <-  workflow(data_recipe, ranger_spec)
glm_wf <- workflow(data_recipe, svm_spec)

doParallel :: registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

glm_rs <- fit_resamples(
    glm_wf,
    resamples = place_folds,
    control = contrl_preds
)

## Warning: package 'LiblineaR' was built under R version 4.3.3

ranger_rs <- fit_resamples(
    ranger_wf,
    resamples = place_folds,
    control = contrl_preds
)

collect_metrics(glm_rs)

## # A tibble: 2 × 6
##   .metric .estimator     mean     n  std_err .config             
##   <chr>   <chr>         <dbl> <int>    <dbl> <chr>               
## 1 rmse    standard   1397.       10 29.6     Preprocessor1_Model1
## 2 rsq     standard      0.237    10  0.00733 Preprocessor1_Model1

collect_metrics(ranger_rs)

## # A tibble: 2 × 6
##   .metric .estimator     mean     n std_err .config             
##   <chr>   <chr>         <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1314.       10 29.2    Preprocessor1_Model1
## 2 rsq     standard      0.327    10  0.0126 Preprocessor1_Model1

bind_rows(
    collect_predictions(glm_rs) %>%
        mutate(mod= "GLM"),
    collect_predictions(ranger_rs) %>%
       mutate(mod = "ranger")
 ) %>%
    ggplot(aes(price, .pred, color = id)) +
    geom_abline(lty = 2, color = "gray50", size = 1.2) +
    facet_wrap(vars(mod)) +
    coord_fixed()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

final_fitted <- last_fit(glm_wf, place_split)
collect_metrics(final_fitted)

## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard    1331.    Preprocessor1_Model1
## 2 rsq     standard       0.193 Preprocessor1_Model1

final_wf <- extract_workflow(final_fitted)
predict(final_wf, place_test[55,])

## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1 3557.

extract_workflow(final_fitted) %>%
    tidy() %>%
    filter(term != "Bias") %>%
    group_by(estimate > 0) %>%
    slice_max(abs(estimate), n = 10) %>%
    ungroup() %>%
    mutate(term = str_remove(term, "tf_title")) %>%
    ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
    geom_col(alpha = 0.8)

collect_metrics(glm_rs)

## # A tibble: 2 × 6
##   .metric .estimator     mean     n  std_err .config             
##   <chr>   <chr>         <dbl> <int>    <dbl> <chr>               
## 1 rmse    standard   1397.       10 29.6     Preprocessor1_Model1
## 2 rsq     standard      0.237    10  0.00733 Preprocessor1_Model1

collect_metrics(ranger_rs)

## # A tibble: 2 × 6
##   .metric .estimator     mean     n std_err .config             
##   <chr>   <chr>         <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1314.       10 29.2    Preprocessor1_Model1
## 2 rsq     standard      0.327    10  0.0126 Preprocessor1_Model1

Code Along 4: SF Rental Market

Erin McEvoy

2024-10-01