library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ recipes 1.0.8
## ✔ dials 1.2.0 ✔ rsample 1.2.0
## ✔ dplyr 1.1.3 ✔ tibble 3.2.1
## ✔ ggplot2 3.4.3 ✔ tidyr 1.3.0
## ✔ infer 1.0.4 ✔ tune 1.1.2
## ✔ modeldata 1.2.0 ✔ workflows 1.1.3
## ✔ parsnip 1.1.1 ✔ workflowsets 1.0.1
## ✔ purrr 1.0.2 ✔ yardstick 1.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
rent <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-07-05/rent.csv')
## Rows: 200796 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): post_id, nhood, city, county, address, title, descr, details
## dbl (9): date, year, price, beds, baths, sqft, room_in_apt, lat, lon
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(rent)
## Rows: 200,796
## Columns: 17
## $ post_id <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127", "pre…
## $ date <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 20060411…
## $ year <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 2009, 2006…
## $ nhood <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ city <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ county <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ price <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 450, 1395, 1…
## $ beds <dbl> 2, 2, 2, 1, 1, 1, 1, 3, NA, 2, 2, 5, 4, 0, 4, 1, 3, 3, 1, …
## $ baths <dbl> 2, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA, 3, NA, NA, NA…
## $ sqft <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, NA, 2581, 1756, N…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ address <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lat <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.53494, …
## $ lon <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ title <chr> "$1250 / 2br - 2BR/2BA 1145 ALAMEDA DE LAS PULGAS", "$12…
## $ descr <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ details <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p class=…
myData <- rent[!is.na(rent$nhood) & rent$price <= 9000 & !is.na(rent$year) & !is.na(rent$beds), ]
glimpse(myData)
## Rows: 193,098
## Columns: 17
## $ post_id <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127", "pre…
## $ date <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 20060411…
## $ year <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 2006, 2006…
## $ nhood <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ city <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ county <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ price <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 1395, 1555, …
## $ beds <dbl> 2, 2, 2, 1, 1, 1, 1, 3, 2, 2, 5, 4, 0, 4, 1, 3, 3, 1, 1, 3…
## $ baths <dbl> 2, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 3, NA, NA, NA, N…
## $ sqft <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, 2581, 1756, NA, 2…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ address <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lat <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.53494, NA, …
## $ lon <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ title <chr> "$1250 / 2br - 2BR/2BA 1145 ALAMEDA DE LAS PULGAS", "$12…
## $ descr <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ details <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p class=\"ro…
myData %>%
ggplot(aes(price)) +
geom_histogram(bins = 100)
# dplyr is already attached via tidymodels, but loading it explicitly is harmless
library(dplyr)
# Helper that pastes a character vector into a single space-separated string;
# applied element-wise below, it leaves each neighborhood name unchanged
combine_tokens <- function(text) {
  paste(text, collapse = " ")
}
# Copy "nhood" into a "single_token_nhood" column to use as the text feature
tidy_rent <- myData %>%
  mutate(single_token_nhood = sapply(nhood, combine_tokens))
tidy_rent %>%
count(single_token_nhood, sort = TRUE)
## # A tibble: 167 × 2
## single_token_nhood n
## <chr> <int>
## 1 santa rosa 6095
## 2 santa cruz 5231
## 3 san mateo 4930
## 4 sunnyvale 4408
## 5 mountain view 4295
## 6 santa clara 4076
## 7 SOMA / south beach 4007
## 8 palo alto 3649
## 9 san jose south 3582
## 10 nob hill 3510
## # ℹ 157 more rows
# The "single_token_nhood" column now mirrors "nhood"; it will be tokenized in the recipe below
glimpse(tidy_rent)
## Rows: 193,098
## Columns: 18
## $ post_id <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127…
## $ date <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 2…
## $ year <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 200…
## $ nhood <chr> "alameda", "alameda", "alameda", "alameda", "alamed…
## $ city <chr> "alameda", "alameda", "alameda", "alameda", "alamed…
## $ county <chr> "alameda", "alameda", "alameda", "alameda", "alamed…
## $ price <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 1395,…
## $ beds <dbl> 2, 2, 2, 1, 1, 1, 1, 3, 2, 2, 5, 4, 0, 4, 1, 3, 3, …
## $ baths <dbl> 2, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 3, NA, NA…
## $ sqft <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, 2581, 1756…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ address <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ lat <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.5349…
## $ lon <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ title <chr> "$1250 / 2br - 2BR/2BA 1145 ALAMEDA DE LAS PULGAS…
## $ descr <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ details <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p cla…
## $ single_token_nhood <chr> "alameda", "alameda", "alameda", "alameda", "alamed…
tidy_rent %>%
group_by(single_token_nhood) %>%
summarise(n = n(),
price = mean(price)) %>%
ggplot(aes(n, price)) +
geom_hline(yintercept = mean(myData$price),
             lty = 2, color = "gray50", linewidth = 1.5) +
geom_point(color = "midnightblue", alpha = 0.7) +
geom_text(aes(label = single_token_nhood), check_overlap = TRUE, vjust = "top", hjust = "left") +
scale_x_log10()
library(tidymodels)
set.seed(123)
rent_split <- initial_split(tidy_rent, strata = price)
rent_train <- training(rent_split)
rent_test <- testing(rent_split)
set.seed(234)
rent_folds <- vfold_cv(rent_train, strata = price)
rent_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [130337/14485]> Fold01
## 2 <split [130338/14484]> Fold02
## 3 <split [130339/14483]> Fold03
## 4 <split [130340/14482]> Fold04
## 5 <split [130340/14482]> Fold05
## 6 <split [130340/14482]> Fold06
## 7 <split [130341/14481]> Fold07
## 8 <split [130341/14481]> Fold08
## 9 <split [130341/14481]> Fold09
## 10 <split [130341/14481]> Fold10
library(textrecipes)
rent_rec <-
recipe(price ~ single_token_nhood + beds, data = rent_train) %>%
step_tokenize(single_token_nhood) %>%
  # step_tokenfilter(single_token_nhood, max_tokens = 500) %>%
step_tf(single_token_nhood)
rf_spec <-
rand_forest(trees = 50) %>%
set_mode("regression")
rf_spec
## Random Forest Model Specification (regression)
##
## Main Arguments:
## trees = 50
##
## Computational engine: ranger
svm_spec <-
svm_linear() %>%
set_mode("regression")
svm_spec
## Linear Support Vector Machine Model Specification (regression)
##
## Computational engine: LiblineaR
svm_wf <- workflow(rent_rec, svm_spec)
rf_wf <- workflow(rent_rec, rf_spec)
doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)
svm_rs <- fit_resamples(
svm_wf,
resamples = rent_folds,
control = contrl_preds
)
ranger_rs <- fit_resamples(
rf_wf,
resamples = rent_folds,
control = contrl_preds
)
collect_metrics(svm_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 894. 10 5.46 Preprocessor1_Model1
## 2 rsq standard 0.410 10 0.00542 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 866. 10 2.28 Preprocessor1_Model1
## 2 rsq standard 0.457 10 0.00280 Preprocessor1_Model1
bind_rows(
collect_predictions(svm_rs) %>%
mutate(mod = "SVM"),
collect_predictions(ranger_rs) %>%
mutate(mod = "ranger")
) %>%
ggplot(aes(price, .pred, color = id)) +
  geom_abline(lty = 2, color = "gray50", linewidth = 1.2) +
geom_jitter(width = 0.5, alpha = 0.5) +
facet_wrap(vars(mod)) +
coord_fixed()
final_fitted <- last_fit(svm_wf, rent_split)
collect_metrics(final_fitted) ## metrics evaluated on the *testing* data
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 897. Preprocessor1_Model1
## 2 rsq standard 0.413 Preprocessor1_Model1
This object contains a fitted workflow that we can use for prediction.
final_wf <- extract_workflow(final_fitted)
predict(final_wf, rent_test[55, ])
## # A tibble: 1 × 1
## .pred
## <dbl>
## 1 1423.
library(stringr)
##
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
##
## fixed
library(forcats)
extract_workflow(final_fitted) %>%
tidy() %>%
filter(term != "Bias") %>%
group_by(estimate > 0) %>%
slice_max(abs(estimate), n = 10) %>%
ungroup() %>%
mutate(term = str_remove(term, "tf_single_token_nhood_")) %>%
ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
geom_col(alpha = 0.8) +
scale_fill_discrete(labels = c("Low price", "High price")) +
labs(y = NULL, fill = "More from...")
The dataset is made up of 17 columns and 200,796 observations (193,098 after filtering out missing neighborhoods, years, and bed counts, and prices above $9,000). The key variable used for this prediction was, as mentioned above, the "nhood" column; it was chosen because of the role location plays in the price of real estate.
The original data contains raw rental listings, with neighborhoods stored as text, while the data used for modeling represents those neighborhoods numerically, after tokenization and a term-frequency transformation, alongside other predictors such as the number of bedrooms. These transformations are essential: they convert the text into a numerical format that a machine learning model can work with, keep the dimensionality manageable, and let the models learn from the textual information when making predictions.
The data preparation relied on the textrecipes package. I tokenized the neighborhood text (the single_token_nhood column) with step_tokenize(), which converts text into a form suitable for machine learning, and then applied a term-frequency transformation with step_tf(), which turns those tokens into numeric count columns. These steps were crucial in preparing the text data for the models, as sketched below.
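To make the transformation concrete, here is a small sketch (not run above) that preps the recipe and bakes a few training rows so the resulting tf_single_token_nhood_* count columns are visible; it reuses rent_rec and rent_train as defined earlier.
# Prep the recipe on the training data and inspect the baked features
rent_rec %>%
  prep() %>%
  bake(new_data = head(rent_train)) %>%
  glimpse()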
I employed two machine learning models in my analysis: Support Vector Machine (SVM) and Random Forest (RF) regression models. These models were used to predict rental prices based on the transformed text and numerical features in the dataset.
The model evaluation uses the standard regression metrics that tidymodels reports by default: Root Mean Squared Error (RMSE) and R-squared (R²), which assess prediction accuracy and the share of variance explained. Other metrics, such as Mean Absolute Error (MAE), could be added to the resampling results as well, as sketched below.
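As a sketch of how MAE could be requested alongside the defaults, a yardstick metric set can be passed to fit_resamples(); the names rent_metrics and svm_rs_mae below are illustrative, and this was not run for the results shown here.
# Define a metric set that includes MAE in addition to RMSE and R-squared
rent_metrics <- metric_set(rmse, rsq, mae)
svm_rs_mae <- fit_resamples(
  svm_wf,
  resamples = rent_folds,
  metrics = rent_metrics,
  control = contrl_preds
)
collect_metrics(svm_rs_mae)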
In summary, I fit two models, a random forest and a linear SVM, to predict rent prices in San Francisco, after preparing the data with tokenization and a term-frequency transformation, and I evaluated both with RMSE and R-squared across ten cross-validation folds.
Overall, both models performed reasonably, with the random forest slightly outperforming the SVM on the resamples (RMSE of roughly 866 versus 894 and R² of roughly 0.46 versus 0.41), although neither model performed especially well in my opinion; the side-by-side comparison below makes this easier to see.
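For reference, a quick sketch that lines the resample metrics up side by side, reusing svm_rs and ranger_rs from above:
# Combine the cross-validated metrics of both models into one table
bind_rows(
  collect_metrics(svm_rs) %>% mutate(mod = "SVM"),
  collect_metrics(ranger_rs) %>% mutate(mod = "ranger")
) %>%
  select(mod, .metric, mean, std_err) %>%
  arrange(.metric, mean)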