library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import data

myData <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-07-05/rent.csv')
## Rows: 200796 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): post_id, nhood, city, county, address, title, descr, details
## dbl (9): date, year, price, beds, baths, sqft, room_in_apt, lat, lon
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
myData
## # A tibble: 200,796 × 17
##    post_id     date  year nhood city  county price  beds baths  sqft room_in_apt
##    <chr>      <dbl> <dbl> <chr> <chr> <chr>  <dbl> <dbl> <dbl> <dbl>       <dbl>
##  1 pre2013_… 2.01e7  2005 alam… alam… alame…  1250     2     2    NA           0
##  2 pre2013_… 2.01e7  2005 alam… alam… alame…  1295     2    NA    NA           0
##  3 pre2013_… 2.00e7  2004 alam… alam… alame…  1100     2    NA    NA           0
##  4 pre2013_… 2.01e7  2012 alam… alam… alame…  1425     1    NA   735           0
##  5 pre2013_… 2.00e7  2004 alam… alam… alame…   890     1    NA    NA           0
##  6 pre2013_… 2.01e7  2006 alam… alam… alame…   825     1    NA    NA           0
##  7 pre2013_… 2.01e7  2007 alam… alam… alame…  1500     1     1    NA           0
##  8 63790969… 2.02e7  2017 alam… alam… alame…  2925     3    NA    NA           0
##  9 pre2013_… 2.01e7  2009 alam… alam… alame…   450    NA     1    NA           0
## 10 pre2013_… 2.01e7  2006 alam… alam… alame…  1395     2    NA    NA           0
## # ℹ 200,786 more rows
## # ℹ 6 more variables: address <chr>, lat <dbl>, lon <dbl>, title <chr>,
## #   descr <chr>, details <chr>

Data Cleaning

myData %>% na.omit() %>% skimr::skim()
Data summary

Name                    Piped data
Number of rows          1534
Number of columns       17
Column type frequency:
  character             8
  numeric               9
Group variables         None

Variable type: character

skim_variable  n_missing  complete_rate  min   max  empty  n_unique  whitespace
post_id                0              1   10    10      0      1534           0
nhood                  0              1    4    38      0       120           0
city                   0              1    5    14      0        72           0
county                 0              1    4    13      0        10           0
address                0              1    1     4      0      1175           0
title                  0              1    6   148      0      1494           0
descr                  0              1   44  8880      0      1406           0
details                0              1   11   219      0      1390           0

Variable type: numeric

skim_variable  n_missing  complete_rate         mean        sd           p0          p25          p50          p75         p100  hist
date                   0              1  20150526.54  11357.57  20140102.00  20141012.00  20150120.00  20150623.00  20180201.00  ▇▇▁▂▁
year                   0              1      2014.99      1.16      2014.00      2014.00      2015.00      2015.00      2018.00  ▇▇▁▂▁
price                  0              1      3035.27   1896.77       795.00      2000.00      2647.00      3415.50     24000.00  ▇▁▁▁▁
beds                   0              1         2.01      1.13         0.00         1.00         2.00         3.00         7.00  ▇▇▆▁▁
baths                  0              1         1.51      0.68         1.00         1.00         1.00         2.00         5.00  ▇▅▁▁▁
sqft                   0              1      1190.60    676.16       117.00       783.25      1017.50      1412.00      7900.00  ▇▁▁▁▁
room_in_apt            0              1         0.01      0.09         0.00         0.00         0.00         0.00         1.00  ▇▁▁▁▁
lat                    0              1        37.63      0.32        36.91        37.36        37.58        37.80        39.07  ▃▇▅▁▁
lon                    0              1      -122.15      0.25      -123.02      -122.37      -122.08      -121.95      -121.21  ▁▅▇▃▁
clean_data <- myData %>% select(post_id, nhood, price, beds, baths, sqft, room_in_apt) %>% na.omit()

Data Exploration

clean_data %>% glimpse()
## Rows: 14,629
## Columns: 7
## $ post_id     <chr> "4168358289", "pre2013_59350", "pre2013_72024", "pre2013_6…
## $ nhood       <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ price       <dbl> 2595, 1375, 1950, 1640, 3100, 1000, 1555, 2150, 2100, 4200…
## $ beds        <dbl> 4, 2, 3, 2, 2, 2, 3, 3, 1, 3, 3, 2, 2, 2, 2, 2, 4, 2, 2, 1…
## $ baths       <dbl> 3.0, 1.0, 2.0, 1.5, 1.0, 1.0, 2.5, 3.0, 1.0, 2.5, 2.5, 2.5…
## $ sqft        <dbl> 1756, 700, 1400, 895, 1200, 1185, 1626, 1014, 752, 2205, 1…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
clean_data %>% skimr::skim()
Data summary

Name                    Piped data
Number of rows          14629
Number of columns       7
Column type frequency:
  character             2
  numeric               5
Group variables         None

Variable type: character

skim_variable  n_missing  complete_rate  min  max  empty  n_unique  whitespace
post_id                0              1   10   14      0     14629           0
nhood                  0              1    4   38      0       146           0

Variable type: numeric

skim_variable  n_missing  complete_rate     mean       sd   p0   p25   p50   p75   p100  hist
price                  0              1  2786.57  1584.30  350  1850  2450  3250  24000  ▇▁▁▁▁
beds                   0              1     2.32     1.03    0     2     2     3      8  ▂▇▁▁▁
baths                  0              1     1.77     0.74    1     1     2     2      8  ▇▂▁▁▁
sqft                   0              1  1273.26   697.79  110   887  1100  1500  22000  ▇▁▁▁▁
room_in_apt            0              1     0.00     0.05    0     0     0     0      1  ▇▁▁▁▁

Model Building

library(rsample)

set.seed(123)
data_split <- initial_split(clean_data)
data_train <- training(data_split)
data_test <- testing(data_split)

set.seed(234)
data_folds <- bootstraps(data_train, strata = price)
data_folds
## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
##    splits               id         
##    <list>               <chr>      
##  1 <split [10971/4034]> Bootstrap01
##  2 <split [10971/4057]> Bootstrap02
##  3 <split [10971/4063]> Bootstrap03
##  4 <split [10971/4036]> Bootstrap04
##  5 <split [10971/4006]> Bootstrap05
##  6 <split [10971/4092]> Bootstrap06
##  7 <split [10971/4047]> Bootstrap07
##  8 <split [10971/4046]> Bootstrap08
##  9 <split [10971/3997]> Bootstrap09
## 10 <split [10971/3970]> Bootstrap10
## # ℹ 15 more rows
library(usemodels)
use_xgboost(price ~ sqft + baths, data = data_train)
## xgboost_recipe <- 
##   recipe(formula = price ~ sqft + baths, data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(49960)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
xgboost_recipe <- 
  recipe(formula = price ~ ., data = data_train) %>%
    recipes::update_role(post_id, new_role = "id") %>%
    step_other(nhood) %>%
    # step_dummy(all_nominal_predictors()) %>%   # for factors
    step_log(price, sqft, baths) %>%
    step_nzv(all_predictors()) 

xgboost_recipe %>% prep() %>% bake(new_data = NULL)
## # A tibble: 10,971 × 5
##    post_id        beds baths  sqft price
##    <fct>         <dbl> <dbl> <dbl> <dbl>
##  1 pre2013_71000     3 0.916  7.31  7.82
##  2 pre2013_71048     2 0.693  7.01  7.73
##  3 pre2013_22055     1 0      6.26  7.33
##  4 5507189650        2 0.693  6.93  8.07
##  5 4656280635        3 0.693  7.05  7.58
##  6 pre2013_39309     1 0      6.48  7.11
##  7 4977611777        1 0      6.62  7.75
##  8 5066102049        2 0      6.66  7.85
##  9 pre2013_40935     3 0.916  7.32  7.94
## 10 4710858332        3 0.693  7.05  7.09
## # ℹ 10,961 more rows
library(parsnip)
library(workflows)
library(tune)


xgboost_spec <- 
  boost_tree(trees = tune(), min_n = tune()) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost") 

xgboost_workflow <- 
  workflow() %>% 
  add_recipe(xgboost_recipe) %>% 
  add_model(xgboost_spec) 

set.seed(15793)
doParallel::registerDoParallel()
xgboost_tune <-
  tune_grid(xgboost_workflow, 
            resamples = data_folds, 
            grid = 11)

Exploring The Results

show_best(xgboost_tune, metric = "rmse")
## # A tibble: 5 × 8
##   trees min_n .metric .estimator  mean     n  std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>    <dbl> <chr>                
## 1   144    38 rmse    standard   0.384    25 0.000755 Preprocessor1_Model11
## 2   250    21 rmse    standard   0.387    25 0.000763 Preprocessor1_Model06
## 3   754    31 rmse    standard   0.391    25 0.000798 Preprocessor1_Model09
## 4   421    17 rmse    standard   0.391    25 0.000805 Preprocessor1_Model05
## 5  1510    34 rmse    standard   0.396    25 0.000894 Preprocessor1_Model10
autoplot(xgboost_tune)

final_xgb <- xgboost_workflow %>% 
    finalize_workflow(select_best(xgboost_tune, metric = "rmse"))

data_fit <- last_fit(final_xgb, data_split)
data_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits               id              .metrics .notes   .predictions .workflow 
##   <list>               <chr>           <list>   <list>   <list>       <list>    
## 1 <split [10971/3658]> train/test spl… <tibble> <tibble> <tibble>     <workflow>

Model Evaluation

collect_metrics(data_fit)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.376 Preprocessor1_Model1
## 2 rsq     standard       0.320 Preprocessor1_Model1
collect_predictions(data_fit)
## # A tibble: 3,658 × 5
##    id               .pred  .row price .config             
##    <chr>            <dbl> <int> <dbl> <chr>               
##  1 train/test split  7.90     3  7.58 Preprocessor1_Model1
##  2 train/test split  7.60     4  7.40 Preprocessor1_Model1
##  3 train/test split  7.78     8  7.67 Preprocessor1_Model1
##  4 train/test split  7.63    19  7.55 Preprocessor1_Model1
##  5 train/test split  7.80    22  7.92 Preprocessor1_Model1
##  6 train/test split  7.59    24  7.84 Preprocessor1_Model1
##  7 train/test split  7.83    28  7.55 Preprocessor1_Model1
##  8 train/test split  7.59    34  7.67 Preprocessor1_Model1
##  9 train/test split  7.72    35  7.28 Preprocessor1_Model1
## 10 train/test split  7.91    36  7.78 Preprocessor1_Model1
## # ℹ 3,648 more rows
collect_predictions(data_fit) %>%
    ggplot(aes(price, .pred)) +
    geom_point(alpha = 0.5, color = "midnightblue") +
    geom_abline(lty = 2, color = "gray50") +
    coord_fixed()

1

The research question I am trying to answer with this dataset is whether we can build a model that predicts the rent price of apartments in San Francisco.

The data is made up of 17 variables and 200,796 observations. One important observation is that the dataset contains a large number of missing values; more complete data would likely have improved the performance of the model.
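
To make the missingness concrete, a small sketch (using only the tidyverse packages loaded above) that counts NA values per column could look like this:

# Sketch: count missing values in each column and sort, to see which
# variables are least complete before any cleaning is done.
myData %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") %>%
  arrange(desc(n_missing))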

2

The original dataset, as mentioned above, had many NA values, which were removed with na.omit(). The cleaned dataset also drops the free-text character variables (city, county, address, title, descr, and details), keeping post_id as an identifier and nhood alongside the numeric columns, since the model can only train on numeric predictors unless character variables are first encoded. I had to make these changes for the model to be able to train and test on the data, even though it meant using less data for both training and testing.

3

Data Preparation and Modeling:

Names of Data Preparation Steps:

Omitting NA values: The na.omit() function is used to remove rows with missing values in the dataset.

Selection: Selecting specific columns (post_id, nhood, price, beds, baths, sqft, room_in_apt) for further analysis.

Setting Role: The update_role() function is used to update the role of post_id to act as an identifier.

Grouping Rare Factors: The step_other() function is applied to the nhood variable. It groups infrequent levels into a new level, commonly named "other".

Log Transformation: The step_log() function is used to apply a logarithmic transformation to the price, sqft, and baths columns. Log transformation is commonly used to normalize skewed distributions and make patterns in the data more interpretable.

Removing Near Zero Variance Predictors: The step_nzv() function is used to remove predictors that have one unique value (i.e., are constant) or predictors that are dominated by one repeated value. Such predictors often do not contribute to the predictive performance of a model.

A small sketch for inspecting what each recipe step did is shown below.
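
As referenced above, one way to check what each preparation step actually did is to prep the recipe and tidy it; this sketch assumes the xgboost_recipe defined earlier in this post:

# Sketch: prep the recipe and list its steps; tidy(number = i) shows the
# terms affected by step i (here step 1 is step_other, so it lists which
# nhood levels were retained rather than pooled into "other").
prepped_recipe <- prep(xgboost_recipe)
tidy(prepped_recipe)
tidy(prepped_recipe, number = 1)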

Name of the Machine Learning Model Used in the Analysis: The machine learning model used in the analysis is XGBoost (Extreme Gradient Boosting) regression. This is evident from the use of set_engine("xgboost") and the use_xgboost() helper. XGBoost is an optimized, distributed gradient-boosting library designed to be highly efficient, flexible, and portable.

4

The metrics used to assess the performance of the XGBoost regression model on the dataset are:

RMSE (Root Mean Square Error): This metric gives the square root of the average squared differences between the predicted and actual values. It measures the magnitude of error between these two sets of values. A smaller RMSE value indicates a better fit to the data.

Evidence: The call show_best(xgboost_tune, metric = "rmse") specifically asks for the models with the best (smallest) RMSE values from the tuning results.

RSQ (R-squared, or coefficient of determination): While it is not requested explicitly in the code, RSQ is a standard metric reported alongside RMSE by collect_metrics() and other tuning and modeling functions in the tidymodels framework. It represents the proportion of the variance of the dependent variable (in this case, price) that is explained by the independent variables in the model, and so measures how well the model's predictions match the actual data. A value of 1 means the model perfectly predicts the actual values, while a value of 0 indicates the model does no better than simply predicting the mean of the target variable for all observations.
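
As a check, both metrics can be recomputed directly from the held-out predictions; a sketch using yardstick (part of tidymodels) on the collect_predictions() output above:

library(yardstick)

# Sketch: recompute the metrics from the test-set predictions. Note that
# price and .pred are on the log scale here because of step_log in the recipe.
preds <- collect_predictions(data_fit)

preds %>% metrics(truth = price, estimate = .pred)   # returns rmse, rsq (and mae)

# RMSE by hand: square root of the mean squared difference between
# the actual and predicted values.
sqrt(mean((preds$price - preds$.pred)^2))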

5

Some major findings from this new dataset are that it is very incomplete, which keeps the model from making the best possible predictions. I was also not able to figure out how to incorporate the nhood variable into my analysis, which in my opinion would have improved the performance of the model, since the price of real estate often comes down to location in combination with the other variables I used for this prediction; one possible way to include it is sketched below. There were some similarities with the dataset from the CA assignment, but that dataset was more complete and therefore produced a better end result than this one.
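
A minimal sketch of how nhood could be kept as a predictor, assuming xgboost needs numeric inputs: pool rare neighborhoods with step_other() and then create indicator columns with step_dummy(). The threshold value here is an illustrative choice, not something tuned above.

# Sketch: a recipe variant that keeps nhood as a predictor.
nhood_recipe <- recipe(price ~ ., data = data_train) %>%
  update_role(post_id, new_role = "id") %>%
  step_other(nhood, threshold = 0.01) %>%    # pool neighborhoods below ~1% of rows into "other"
  step_dummy(all_nominal_predictors()) %>%   # indicator columns that xgboost can use
  step_log(price, sqft, baths) %>%
  step_nzv(all_predictors())

nhood_recipe %>% prep() %>% bake(new_data = NULL) %>% glimpse()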