rent <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-07-05/rent.csv')
skimr::skim(rent)
Data summary
Name |
rent |
Number of rows |
200796 |
Number of columns |
17 |
_______________________ |
|
Column type frequency: |
|
character |
8 |
numeric |
9 |
________________________ |
|
Group variables |
None |
Variable type: character
post_id |
0 |
1.00 |
9 |
14 |
0 |
200796 |
0 |
nhood |
0 |
1.00 |
4 |
43 |
0 |
167 |
0 |
city |
0 |
1.00 |
5 |
19 |
0 |
104 |
0 |
county |
1394 |
0.99 |
4 |
13 |
0 |
10 |
0 |
address |
196888 |
0.02 |
1 |
38 |
0 |
2869 |
0 |
title |
2517 |
0.99 |
2 |
298 |
0 |
184961 |
0 |
descr |
197542 |
0.02 |
13 |
16975 |
0 |
3025 |
0 |
details |
192780 |
0.04 |
4 |
595 |
0 |
7667 |
0 |
Variable type: numeric
date |
0 |
1.00 |
20095718.38 |
44694.07 |
20000902.00 |
20050227.00 |
20110924.00 |
20120805.0 |
20180717.00 |
▁▇▁▆▃ |
year |
0 |
1.00 |
2009.51 |
4.48 |
2000.00 |
2005.00 |
2011.00 |
2012.0 |
2018.00 |
▁▇▁▆▃ |
price |
0 |
1.00 |
2135.36 |
1427.75 |
220.00 |
1295.00 |
1800.00 |
2505.0 |
40000.00 |
▇▁▁▁▁ |
beds |
6608 |
0.97 |
1.89 |
1.08 |
0.00 |
1.00 |
2.00 |
3.0 |
12.00 |
▇▂▁▁▁ |
baths |
158121 |
0.21 |
1.68 |
0.69 |
1.00 |
1.00 |
2.00 |
2.0 |
8.00 |
▇▁▁▁▁ |
sqft |
136117 |
0.32 |
1201.83 |
5000.22 |
80.00 |
750.00 |
1000.00 |
1360.0 |
900000.00 |
▇▁▁▁▁ |
room_in_apt |
0 |
1.00 |
0.00 |
0.04 |
0.00 |
0.00 |
0.00 |
0.0 |
1.00 |
▇▁▁▁▁ |
lat |
193145 |
0.04 |
37.67 |
0.35 |
33.57 |
37.40 |
37.76 |
37.8 |
40.43 |
▁▁▅▇▁ |
lon |
196484 |
0.02 |
-122.21 |
0.78 |
-123.20 |
-122.42 |
-122.26 |
-122.0 |
-74.20 |
▇▁▁▁▁ |
data <- rent %>%
# Treat missing values
select(-address, -descr, -details, -lat, -lon, -date, -year, -room_in_apt) %>%
na.omit()
data %>%
ggplot(aes(price)) +
geom_histogram(bins = 20)

library(tidytext)
tidy_data <-
data %>%
unnest_tokens(description, title)
tidy_data %>%
count(description, sort = TRUE)
## # A tibble: 8,928 × 2
## description n
## <chr> <int>
## 1 2 7097
## 2 bath 5722
## 3 2br 5044
## 4 1 4363
## 5 in 3450
## 6 3br 3121
## 7 bedroom 2893
## 8 this 2541
## 9 3 2476
## 10 home 2190
## # ℹ 8,918 more rows
tidy_data %>%
group_by(description) %>%
summarise(n = n(),
price = mean(price)) %>%
ggplot(aes(n, price)) +
geom_hline(yintercept = mean(data$price),
lty = 2, color = "gray50", linewidth = 1.2) +
geom_point(color = "midnightblue", alpha = 0.7) +
geom_text(aes(label = description), check_overlap = TRUE, vjust = "top", hjust = "left") +
scale_x_log10()

library(tidymodels)
## Warning: package 'broom' was built under R version 4.3.3
## Warning: package 'modeldata' was built under R version 4.3.3
set.seed(123)
place_split <- initial_split(data, strata = price)
place_train <- training(place_split)
place_test <- testing(place_split)
set.seed(234)
place_folds <- vfold_cv(place_train, strata = price)
place_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [9714/1080]> Fold01
## 2 <split [9714/1080]> Fold02
## 3 <split [9714/1080]> Fold03
## 4 <split [9714/1080]> Fold04
## 5 <split [9714/1080]> Fold05
## 6 <split [9715/1079]> Fold06
## 7 <split [9715/1079]> Fold07
## 8 <split [9715/1079]> Fold08
## 9 <split [9715/1079]> Fold09
## 10 <split [9716/1078]> Fold10
library(textrecipes)
data_recipe <-
recipe(formula = price ~ title, data = place_train) %>%
step_tokenize(title) %>%
step_tokenfilter(title, max_tokens = 100) %>%
step_tfidf(title)
ranger_spec <-
rand_forest(trees = 500) %>%
set_mode("regression")
ranger_spec
## Random Forest Model Specification (regression)
##
## Main Arguments:
## trees = 500
##
## Computational engine: ranger
svm_spec <-
svm_linear() %>%
set_mode("regression")
svm_spec
## Linear Support Vector Machine Model Specification (regression)
##
## Computational engine: LiblineaR
ranger_wf <- workflow(data_recipe, ranger_spec)
glm_wf <- workflow(data_recipe, svm_spec)
doParallel :: registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)
glm_rs <- fit_resamples(
glm_wf,
resamples = place_folds,
control = contrl_preds
)
## Warning: package 'LiblineaR' was built under R version 4.3.3
ranger_rs <- fit_resamples(
ranger_wf,
resamples = place_folds,
control = contrl_preds
)
collect_metrics(glm_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1397. 10 29.6 Preprocessor1_Model1
## 2 rsq standard 0.237 10 0.00733 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1314. 10 29.2 Preprocessor1_Model1
## 2 rsq standard 0.327 10 0.0126 Preprocessor1_Model1
bind_rows(
collect_predictions(glm_rs) %>%
mutate(mod= "GLM"),
collect_predictions(ranger_rs) %>%
mutate(mod = "ranger")
) %>%
ggplot(aes(price, .pred, color = id)) +
geom_abline(lty = 2, color = "gray50", size = 1.2) +
facet_wrap(vars(mod)) +
coord_fixed()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

final_fitted <- last_fit(glm_wf, place_split)
collect_metrics(final_fitted)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1331. Preprocessor1_Model1
## 2 rsq standard 0.193 Preprocessor1_Model1
final_wf <- extract_workflow(final_fitted)
predict(final_wf, place_test[55,])
## # A tibble: 1 × 1
## .pred
## <dbl>
## 1 3557.
extract_workflow(final_fitted) %>%
tidy() %>%
filter(term != "Bias") %>%
group_by(estimate > 0) %>%
slice_max(abs(estimate), n = 10) %>%
ungroup() %>%
mutate(term = str_remove(term, "tf_title")) %>%
ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
geom_col(alpha = 0.8)

collect_metrics(glm_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1397. 10 29.6 Preprocessor1_Model1
## 2 rsq standard 0.237 10 0.00733 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1314. 10 29.2 Preprocessor1_Model1
## 2 rsq standard 0.327 10 0.0126 Preprocessor1_Model1