rent <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-07-05/rent.csv')

skimr::skim(rent)
Data summary
Name rent
Number of rows 200796
Number of columns 17
_______________________
Column type frequency:
character 8
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
post_id 0 1.00 9 14 0 200796 0
nhood 0 1.00 4 43 0 167 0
city 0 1.00 5 19 0 104 0
county 1394 0.99 4 13 0 10 0
address 196888 0.02 1 38 0 2869 0
title 2517 0.99 2 298 0 184961 0
descr 197542 0.02 13 16975 0 3025 0
details 192780 0.04 4 595 0 7667 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
date 0 1.00 20095718.38 44694.07 20000902.00 20050227.00 20110924.00 20120805.0 20180717.00 ▁▇▁▆▃
year 0 1.00 2009.51 4.48 2000.00 2005.00 2011.00 2012.0 2018.00 ▁▇▁▆▃
price 0 1.00 2135.36 1427.75 220.00 1295.00 1800.00 2505.0 40000.00 ▇▁▁▁▁
beds 6608 0.97 1.89 1.08 0.00 1.00 2.00 3.0 12.00 ▇▂▁▁▁
baths 158121 0.21 1.68 0.69 1.00 1.00 2.00 2.0 8.00 ▇▁▁▁▁
sqft 136117 0.32 1201.83 5000.22 80.00 750.00 1000.00 1360.0 900000.00 ▇▁▁▁▁
room_in_apt 0 1.00 0.00 0.04 0.00 0.00 0.00 0.0 1.00 ▇▁▁▁▁
lat 193145 0.04 37.67 0.35 33.57 37.40 37.76 37.8 40.43 ▁▁▅▇▁
lon 196484 0.02 -122.21 0.78 -123.20 -122.42 -122.26 -122.0 -74.20 ▇▁▁▁▁
data <- rent %>%
    
    # Treat missing values
    select(-address, -descr, -details, -lat, -lon, -date, -year, -room_in_apt) %>%
    na.omit()
data %>%
    ggplot(aes(price)) +
    geom_histogram(bins = 20) 

library(tidytext)

tidy_data <-
    data %>%
    unnest_tokens(description, title)

tidy_data %>%
    count(description, sort = TRUE)
## # A tibble: 8,928 × 2
##    description     n
##    <chr>       <int>
##  1 2            7097
##  2 bath         5722
##  3 2br          5044
##  4 1            4363
##  5 in           3450
##  6 3br          3121
##  7 bedroom      2893
##  8 this         2541
##  9 3            2476
## 10 home         2190
## # ℹ 8,918 more rows
tidy_data %>%
    group_by(description) %>%
    summarise(n = n(),
              price = mean(price)) %>%
    ggplot(aes(n, price)) +
    geom_hline(yintercept = mean(data$price),
               lty = 2, color = "gray50", linewidth = 1.2) +
    geom_point(color = "midnightblue", alpha = 0.7) +
    geom_text(aes(label = description), check_overlap = TRUE, vjust = "top", hjust = "left") +
    scale_x_log10()

library(tidymodels)
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
set.seed(123)
place_split <- initial_split(data, strata = price)
place_train <- training(place_split)
place_test <- testing(place_split)

set.seed(234)
place_folds <- vfold_cv(place_train, strata = price)
place_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits              id    
##    <list>              <chr> 
##  1 <split [9714/1080]> Fold01
##  2 <split [9714/1080]> Fold02
##  3 <split [9714/1080]> Fold03
##  4 <split [9714/1080]> Fold04
##  5 <split [9714/1080]> Fold05
##  6 <split [9715/1079]> Fold06
##  7 <split [9715/1079]> Fold07
##  8 <split [9715/1079]> Fold08
##  9 <split [9715/1079]> Fold09
## 10 <split [9716/1078]> Fold10
library(textrecipes)
data_recipe <- 
    recipe(formula = price ~ title, data = place_train) %>%
    step_tokenize(title) %>%
    step_tokenfilter(title, max_tokens = 100) %>%
    step_tfidf(title)
ranger_spec <-
    rand_forest(trees = 500) %>%
    set_mode("regression")

ranger_spec
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 500
## 
## Computational engine: ranger
svm_spec <-
    svm_linear() %>%
    set_mode("regression")

svm_spec
## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR
ranger_wf <-  workflow(data_recipe, ranger_spec)
glm_wf <- workflow(data_recipe, svm_spec)
doParallel :: registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

glm_rs <- fit_resamples(
    glm_wf,
    resamples = place_folds,
    control = contrl_preds
)
## Warning: package 'LiblineaR' was built under R version 4.3.3
ranger_rs <- fit_resamples(
    ranger_wf,
    resamples = place_folds,
    control = contrl_preds
)
collect_metrics(glm_rs)
## # A tibble: 2 × 6
##   .metric .estimator     mean     n  std_err .config             
##   <chr>   <chr>         <dbl> <int>    <dbl> <chr>               
## 1 rmse    standard   1397.       10 29.6     Preprocessor1_Model1
## 2 rsq     standard      0.237    10  0.00733 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric .estimator     mean     n std_err .config             
##   <chr>   <chr>         <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1314.       10 29.2    Preprocessor1_Model1
## 2 rsq     standard      0.327    10  0.0126 Preprocessor1_Model1
bind_rows(
    collect_predictions(glm_rs) %>%
        mutate(mod= "GLM"),
    collect_predictions(ranger_rs) %>%
       mutate(mod = "ranger")
 ) %>%
    ggplot(aes(price, .pred, color = id)) +
    geom_abline(lty = 2, color = "gray50", size = 1.2) +
    facet_wrap(vars(mod)) +
    coord_fixed()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

final_fitted <- last_fit(glm_wf, place_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard    1331.    Preprocessor1_Model1
## 2 rsq     standard       0.193 Preprocessor1_Model1
final_wf <- extract_workflow(final_fitted)
predict(final_wf, place_test[55,])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1 3557.
extract_workflow(final_fitted) %>%
    tidy() %>%
    filter(term != "Bias") %>%
    group_by(estimate > 0) %>%
    slice_max(abs(estimate), n = 10) %>%
    ungroup() %>%
    mutate(term = str_remove(term, "tf_title")) %>%
    ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
    geom_col(alpha = 0.8)

collect_metrics(glm_rs)
## # A tibble: 2 × 6
##   .metric .estimator     mean     n  std_err .config             
##   <chr>   <chr>         <dbl> <int>    <dbl> <chr>               
## 1 rmse    standard   1397.       10 29.6     Preprocessor1_Model1
## 2 rsq     standard      0.237    10  0.00733 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric .estimator     mean     n std_err .config             
##   <chr>   <chr>         <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   1314.       10 29.2    Preprocessor1_Model1
## 2 rsq     standard      0.327    10  0.0126 Preprocessor1_Model1