- Groups: Final project teams (if team members here)
- Respond to worksheet questions on final projects
2026-5-11
tidymodelstidymodelsSource: Breiman, 2002
Source: Breiman, 2002
Source: Breiman, 2002
Source: Breiman, 2002
Source: Breiman, 2002
## | | | 0% | |= | 2% | |=== | 4% | |==== | 6% | |===== | 7% | |====== | 8% | |======= | 10% | |======== | 12% | |========= | 12% | |========== | 15% | |============ | 17% | |============= | 18% | |============== | 20% | |================ | 22% | |================ | 23% | |================= | 24% | |================= | 25% | |==================== | 28% | |===================== | 30% | |======================= | 32% | |======================== | 34% | |========================= | 36% | |=========================== | 38% | |============================= | 41% | |============================== | 43% | |================================ | 45% | |================================= | 47% | |=================================== | 49% | |==================================== | 52% | |===================================== | 53% | |======================================= | 55% | |======================================== | 57% | |========================================= | 59% | |=========================================== | 61% | |============================================ | 63% | |============================================== | 65% | |=============================================== | 67% | |================================================ | 69% | |================================================== | 71% | |=================================================== | 73% | |===================================================== | 75% | |====================================================== | 77% | |======================================================= | 79% | |========================================================= | 81% | |========================================================== | 83% | |=========================================================== | 85% | |============================================================= | 87% | |============================================================== | 89% | |================================================================ | 91% | |================================================================= | 93% | |================================================================== | 95% | |==================================================================== | 97% | |===================================================================== | 99% | |======================================================================| 100%
ames data with View()Latitude and Longitude columnslibrary(sf) library(tidymodels) library(ggplot2) library(tidyverse) library(magrittr) data(ames)
st_as_sf!# make ames data spatial
ames_spatial <- ames %>%
st_as_sf(coords = c("Longitude", "Latitude"))
ggplot(ames_spatial) + geom_sf()
kclust <- kmeans(ames %>%
select(Longitude, Latitude),
centers = 3)
# add clusters ames_spatial %<>% mutate(cluster = as.character(kclust$cluster)) ggplot(ames_spatial) + geom_sf(aes(col = cluster))
kclust <- kmeans(ames %>%
select(Longitude, Latitude, Sale_Price),
centers = 3)
kmeans settingsNeighborhood identifiers in Ames data.
# examine ames housing data ggplot(ames, aes(x = Sale_Price)) + geom_histogram(bins = 50, col= "white")
strata = Sale_Price we ensure that high sales prices are in both train and test datalibrary(tidymodels) # split data ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price) ames_train <- training(ames_split) ames_test <- testing(ames_split)
# create a decision tree
tree_model <-
decision_tree(min_n = 2) %>%
set_engine("rpart") %>%
set_mode("regression")
# define the tree's model fit
tree_fit <-
tree_model %>%
fit(Sale_Price ~ Longitude + Latitude + Sale_Price,
data = ames_train)
ames_test_small <- ames_test %>% slice(1:10) # combine predictions with our data ames_test_small %>% select(Sale_Price) %>% bind_cols(predict(tree_fit, ames_test_small))
## # A tibble: 10 × 2 ## Sale_Price .pred ## <int> <dbl> ## 1 213500 210904. ## 2 191500 210904. ## 3 189000 210904. ## 4 185000 210904. ## 5 141000 145929. ## 6 210000 210904. ## 7 146000 145929. ## 8 376162 313213. ## 9 320000 407439. ## 10 215200 210904.
# create a random forest
rf_model <-
rand_forest(trees = 1000) %>%
set_engine("ranger") %>%
set_mode("regression")
# define the random forest workflow
rf_wflow <-
workflow() %>%
add_formula(
Sale_Price ~ Gr_Liv_Area + Year_Built + Bldg_Type +
Latitude + Longitude) %>%
add_model(rf_model)
# fit the random forest model
rf_fit <- rf_wflow %>% fit(data = ames_train)
estimate_perf <- function(model, dat) {
# Capture the names of the `model` and `dat` objects
cl <- match.call()
obj_name <- as.character(cl$model)
data_name <- as.character(cl$dat)
data_name <- gsub("ames_", "", data_name)
# Estimate these metrics:
reg_metrics <- metric_set(rmse, rsq)
# output our model
output <- model %>%
predict(dat) %>%
bind_cols(dat %>% select(Sale_Price)) %>%
reg_metrics(Sale_Price, .pred) %>%
select(-.estimator) %>%
mutate(object = obj_name, data = data_name)
return(output)
}
# first examine tree performance estimate_perf(tree_fit, ames_train)
## # A tibble: 2 × 4 ## .metric .estimate object data ## <chr> <dbl> <chr> <chr> ## 1 rmse 50318. tree_fit train ## 2 rsq 0.603 tree_fit train
# now examine random forest performance estimate_perf(rf_fit, ames_train)
## # A tibble: 2 × 4 ## .metric .estimate object data ## <chr> <dbl> <chr> <chr> ## 1 rmse 15019. rf_fit train ## 2 rsq 0.968 rf_fit train
# first examine tree performance estimate_perf(tree_fit, ames_train) # now examine random forest performance estimate_perf(rf_fit, ames_train)
last_fit and collect_metrics to evaluate models on test data# final rf model final_rf_res <- last_fit(rf_wflow, ames_split) # get model metrics collect_metrics(final_rf_res)
## # A tibble: 2 × 4 ## .metric .estimator .estimate .config ## <chr> <chr> <dbl> <chr> ## 1 rmse standard 29146. pre0_mod0_post0 ## 2 rsq standard 0.867 pre0_mod0_post0
Cross validation. Source: Kuhn and Silge, 2023.