library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ recipes      1.0.8
## ✔ dials        1.2.0     ✔ rsample      1.2.0
## ✔ dplyr        1.1.3     ✔ tibble       3.2.1
## ✔ ggplot2      3.4.3     ✔ tidyr        1.3.0
## ✔ infer        1.0.4     ✔ tune         1.1.2
## ✔ modeldata    1.2.0     ✔ workflows    1.1.3
## ✔ parsnip      1.1.1     ✔ workflowsets 1.0.1
## ✔ purrr        1.0.2     ✔ yardstick    1.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
rent <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-07-05/rent.csv')
## Rows: 200796 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): post_id, nhood, city, county, address, title, descr, details
## dbl (9): date, year, price, beds, baths, sqft, room_in_apt, lat, lon
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(rent)
## Rows: 200,796
## Columns: 17
## $ post_id     <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127", "pre…
## $ date        <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 20060411…
## $ year        <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 2009, 2006…
## $ nhood       <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ city        <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ county      <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ price       <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 450, 1395, 1…
## $ beds        <dbl> 2, 2, 2, 1, 1, 1, 1, 3, NA, 2, 2, 5, 4, 0, 4, 1, 3, 3, 1, …
## $ baths       <dbl> 2, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA, 3, NA, NA, NA…
## $ sqft        <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, NA, 2581, 1756, N…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ address     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lat         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.53494, …
## $ lon         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ title       <chr> "$1250 / 2br - 2BR/2BA   1145 ALAMEDA DE LAS PULGAS", "$12…
## $ descr       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ details     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p class=…
myData <- rent[!is.na(rent$nhood) & rent$price <= 9000 & !is.na(rent$year) & !is.na(rent$beds), ]

glimpse(myData)
## Rows: 193,098
## Columns: 17
## $ post_id     <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127", "pre…
## $ date        <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 20060411…
## $ year        <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 2006, 2006…
## $ nhood       <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ city        <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ county      <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ price       <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 1395, 1555, …
## $ beds        <dbl> 2, 2, 2, 1, 1, 1, 1, 3, 2, 2, 5, 4, 0, 4, 1, 3, 3, 1, 1, 3…
## $ baths       <dbl> 2, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 3, NA, NA, NA, N…
## $ sqft        <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, 2581, 1756, NA, 2…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ address     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lat         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.53494, NA, …
## $ lon         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ title       <chr> "$1250 / 2br - 2BR/2BA   1145 ALAMEDA DE LAS PULGAS", "$12…
## $ descr       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ details     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p class=\"ro…

Explore data

myData %>%
  ggplot(aes(price)) +
  geom_histogram(bins = 100)

# Load the necessary library if not already loaded
library(dplyr)

# Define a custom function to combine "nhood" values into single tokens
combine_tokens <- function(text) {
  return(paste(text, collapse = " "))
}

# Apply the custom function to create the "single_token_nhood" column
tidy_rent <- myData %>%
  mutate(single_token_nhood = sapply(nhood, combine_tokens))

tidy_rent %>%
  count(single_token_nhood, sort = TRUE)
## # A tibble: 167 × 2
##    single_token_nhood     n
##    <chr>              <int>
##  1 santa rosa          6095
##  2 santa cruz          5231
##  3 san mateo           4930
##  4 sunnyvale           4408
##  5 mountain view       4295
##  6 santa clara         4076
##  7 SOMA / south beach  4007
##  8 palo alto           3649
##  9 san jose south      3582
## 10 nob hill            3510
## # ℹ 157 more rows
# Now, the "single_token_nhood" column contains the "nhood" values as single tokens

glimpse(tidy_rent)
## Rows: 193,098
## Columns: 18
## $ post_id            <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127…
## $ date               <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 2…
## $ year               <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 200…
## $ nhood              <chr> "alameda", "alameda", "alameda", "alameda", "alamed…
## $ city               <chr> "alameda", "alameda", "alameda", "alameda", "alamed…
## $ county             <chr> "alameda", "alameda", "alameda", "alameda", "alamed…
## $ price              <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 1395,…
## $ beds               <dbl> 2, 2, 2, 1, 1, 1, 1, 3, 2, 2, 5, 4, 0, 4, 1, 3, 3, …
## $ baths              <dbl> 2, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 3, NA, NA…
## $ sqft               <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, 2581, 1756…
## $ room_in_apt        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ address            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ lat                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.5349…
## $ lon                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ title              <chr> "$1250 / 2br - 2BR/2BA   1145 ALAMEDA DE LAS PULGAS…
## $ descr              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ details            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p cla…
## $ single_token_nhood <chr> "alameda", "alameda", "alameda", "alameda", "alamed…
tidy_rent %>%
  group_by(single_token_nhood) %>%
  summarise(n = n(),
            price = mean(price)) %>%
  ggplot(aes(n, price)) +
  geom_hline(yintercept = mean(myData$price),
             lty = 2, color = "gray50", size = 1.5) +
  geom_point(color = "midnightblue", alpha = 0.7) +
  geom_text(aes(label = single_token_nhood), check_overlap = TRUE, vjust = "top", hjust = "left") +
  scale_x_log10()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(tidymodels)
set.seed(123)
rent_split <- initial_split(tidy_rent, strata = price)
rent_train <- training(rent_split)
rent_test <- testing(rent_split)

set.seed(234)
rent_folds <- vfold_cv(rent_train, strata = price)
rent_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits                 id    
##    <list>                 <chr> 
##  1 <split [130337/14485]> Fold01
##  2 <split [130338/14484]> Fold02
##  3 <split [130339/14483]> Fold03
##  4 <split [130340/14482]> Fold04
##  5 <split [130340/14482]> Fold05
##  6 <split [130340/14482]> Fold06
##  7 <split [130341/14481]> Fold07
##  8 <split [130341/14481]> Fold08
##  9 <split [130341/14481]> Fold09
## 10 <split [130341/14481]> Fold10

Preprocessing

library(textrecipes)

rent_rec <-
  recipe(price ~ single_token_nhood + beds, data = rent_train) %>%
  step_tokenize(single_token_nhood) %>%
  #step_tokenfilter(nhood, max_tokens = 500) %>%
  step_tf(single_token_nhood)

Model Specification

rf_spec <-
  rand_forest(trees = 50) %>%
  set_mode("regression")

rf_spec
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   trees = 50
## 
## Computational engine: ranger
svm_spec <-
  svm_linear() %>%
  set_mode("regression")

svm_spec
## Linear Support Vector Machine Model Specification (regression)
## 
## Computational engine: LiblineaR

Model Workflow

svm_wf <- workflow(rent_rec, svm_spec)
rf_wf <- workflow(rent_rec, rf_spec)

Evaluate models

doParallel::registerDoParallel()
contrl_preds <- control_resamples(save_pred = TRUE)

svm_rs <- fit_resamples(
  svm_wf,
  resamples = rent_folds,
  control = contrl_preds
)

ranger_rs <- fit_resamples(
  rf_wf,
  resamples = rent_folds,
  control = contrl_preds
)

Model comparison

collect_metrics(svm_rs)
## # A tibble: 2 × 6
##   .metric .estimator    mean     n std_err .config             
##   <chr>   <chr>        <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   894.       10 5.46    Preprocessor1_Model1
## 2 rsq     standard     0.410    10 0.00542 Preprocessor1_Model1
collect_metrics(ranger_rs)
## # A tibble: 2 × 6
##   .metric .estimator    mean     n std_err .config             
##   <chr>   <chr>        <dbl> <int>   <dbl> <chr>               
## 1 rmse    standard   866.       10 2.28    Preprocessor1_Model1
## 2 rsq     standard     0.457    10 0.00280 Preprocessor1_Model1

Vizualize the results

bind_rows(
  collect_predictions(svm_rs) %>%
    mutate(mod = "SVM"),
  collect_predictions(ranger_rs) %>%
    mutate(mod = "ranger")
) %>%
  ggplot(aes(price, .pred, color = id)) +
  geom_abline(lty = 2, color = "gray50", size = 1.2) +
  geom_jitter(width = 0.5, alpha = 0.5) +
  facet_wrap(vars(mod)) +
  coord_fixed()

final_fitted <- last_fit(svm_wf, rent_split)
collect_metrics(final_fitted) ## metrics evaluated on the *testing* data
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard     897.    Preprocessor1_Model1
## 2 rsq     standard       0.413 Preprocessor1_Model1

This object contains a fitted workflow that we can use for prediction.

final_wf <- extract_workflow(final_fitted)
predict(final_wf, rent_test[55, ])
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1 1423.
library(stringr)
## 
## Attaching package: 'stringr'
## The following object is masked from 'package:recipes':
## 
##     fixed
library(forcats)


extract_workflow(final_fitted) %>%
  tidy() %>%
  filter(term != "Bias") %>%
  group_by(estimate > 0) %>%
  slice_max(abs(estimate), n = 10) %>%
  ungroup() %>%
  mutate(term = str_remove(term, "tf_single_token_nhood_")) %>%
  ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
  geom_col(alpha = 0.8) +
  scale_fill_discrete(labels = c("Low price", "High price")) +
  labs(y = NULL, fill = "More from...")

  1. Using the new dataset I am trying to see if I can build a model to predict the rental price for apartments in the San Fransisco area using input variables that are characters instead of numberical.

The dataset is made up of 17 columns with 198098 observations with they key variables used for this model being the “nhood”column.

They key variable used for this prediction was like I mentioned above the “nhood”column, this was chosen because of the factor location plays in the price of real estate.

  1. The original data contains raw rental information, including neighborhood names, while the transformed data for modeling includes numerical representations of neighborhoods after tokenization and term frequency transformation and other predictors like the number of bedrooms. These transformations are essential as they convert text data into numerical format, making sure that the data is formatted in a way that makes sense for a machine learning model and potential feature relationships. They also have the benefit of reducing dimensionality and enabling models to learn from textual information to make predictions effectively.

  2. In my code, I did the data preparation using the textrecipes package. I began by tokenizing the “nhood” column using the step_tokenize function, which is essential for converting text data into a format suitable for machine learning. Next, I applied the term frequency transformation using the step_tf function, which further processed the data for modeling. These steps were crucial in preparing the text data for subsequent analysis and model building.

I employed two machine learning models in my analysis: Support Vector Machine (SVM) and Random Forest (RF) regression models. These models were used to predict rental prices based on the transformed text and numerical features in the dataset.

  1. The model evaluation employs standard regression metrics, including Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and R-squared (R²). These metrics assess prediction accuracy and the model’s ability to explain variance.

  2. In my analysis, I employed two machine learning models, namely Random Forest and Support Vector Machine (SVM), to predict rent prices in San Fransisco. To evaluate model performance, I used common regression metrics, Root Mean Squared Error, and R-squared. These metrics helped assess prediction accuracy.

Data preparation involved tokenization and term frequency transformation to make the dataset suitable for modeling.

Overall, both models performed reasonably well, with SVM slightly outperforming Random Forest although neither of the models performed very well in my opinion.