library(tidyverse)
library(janitor)
library(tidymodels)
library(rsample)
library(yardstick)

# ECON 465 Stage 2

# 1. DATA IMPORT ----
# read_csv() parses a comma-delimited file; read_csv2() parses a
# semicolon-delimited file (with "," as the decimal mark).
airbnb <- read_csv("Airbnb_data.csv")
bank <- read_csv2("Bank_data.csv")

# Standardise the column names, then drop every row that has an NA in any
# column (drop_na() with no column arguments checks all columns).
airbnb <- airbnb %>%
  clean_names() %>%
  drop_na()
bank <- bank %>%
  clean_names() %>%
  drop_na()

# 2. DATA SPLITTING ----
# Fix the RNG seed so both train/test splits are reproducible.
set.seed(465)

# Airbnb: 80/20 train/test split.
airbnb_split <- initial_split(airbnb, prop = 0.8)
airbnb_train <- training(airbnb_split)
airbnb_test <- testing(airbnb_split)

# Bank: 80/20 train/test split, stratified on the outcome column `y`.
bank_split <- initial_split(bank, prop = 0.8, strata = y)
bank_train <- training(bank_split)
bank_test <- testing(bank_split)

# 🔴 IMPORTANT FIX (CLASSIFICATION FORMAT) ----
# Convert the outcome to a factor with an explicit level order in both
# partitions: "no" is the first level, "yes" the second. Any value of `y`
# other than "no"/"yes" becomes NA under factor(levels = ...).
bank_train <- bank_train %>%
  mutate(y = factor(y, levels = c("no", "yes")))
bank_test <- bank_test %>%
  mutate(y = factor(y, levels = c("no", "yes")))

# 3. AIRBNB REGRESSION ----
# Model 1 ----
# Linear regression of price on three predictors, fitted on the training set.
model1 <- lm(
  price ~ minimum_nights + number_of_reviews + calculated_host_listings_count,
  data = airbnb_train
)

# Predict on the held-out test set and score the predictions.
pred1 <- predict(model1, newdata = airbnb_test)
rmse1 <- rmse_vec(truth = airbnb_test$price, estimate = pred1)
rsq1 <- rsq_vec(truth = airbnb_test$price, estimate = pred1)

# Model 2 ----
# Linear regression of price on five predictors, fitted on the training set.
model2 <- lm(
  price ~ minimum_nights + number_of_reviews + availability_365 +
    calculated_host_listings_count + reviews_per_month,
  data = airbnb_train
)

# Predict on the held-out test set and score the predictions.
pred2 <- predict(model2, newdata = airbnb_test)
rmse2 <- rmse_vec(truth = airbnb_test$price, estimate = pred2)
rsq2 <- rsq_vec(truth = airbnb_test$price, estimate = pred2)

# AIRBNB COMPARISON ----
# Side-by-side test-set metrics for the two Airbnb regressions.
tibble(
  Model = c("Model 1", "Model 2"),
  RMSE = c(rmse1, rmse2),
  R2 = c(rsq1, rsq2)
)
# Output recorded from a previous run:
#> # A tibble: 2 × 3
#>   Model    RMSE      R2
#>   <chr>   <dbl>   <dbl>
#> 1 Model 1  199. 0.00391
#> 2 Model 2  198. 0.0126
# 4. BANK CLASSIFICATION ----
# Both models below expect `y` to be a factor with levels c("no", "yes"),
# as set earlier in the script. With family = binomial and a factor response,
# glm() treats the first level ("no") as failure, so the fitted probabilities
# are P(y == "yes").

# Model 1 ----
bank_model1 <- glm(
  y ~ age + duration + campaign,
  data = bank_train,
  family = binomial
)

# type = "response" returns probabilities; classify as "yes" above 0.5.
prob1 <- predict(bank_model1, newdata = bank_test, type = "response")
pred1 <- ifelse(prob1 > 0.5, "yes", "no")
pred1 <- factor(pred1, levels = c("no", "yes"))

# Rows are predicted classes, columns are actual classes.
confusion1 <- table(pred1, bank_test$y)
confusion1
# Output recorded from a previous run:
#> pred1   no  yes
#>   no  7209  761
#>   yes  101  167

# yardstick treats the FIRST factor level as the event by default, which here
# would score the "no" class. event_level = "second" makes precision and
# recall describe the "yes" class. Accuracy does not depend on the event level.
acc1 <- accuracy_vec(truth = bank_test$y, estimate = pred1)
prec1 <- precision_vec(
  truth = bank_test$y,
  estimate = pred1,
  event_level = "second"
)
rec1 <- recall_vec(
  truth = bank_test$y,
  estimate = pred1,
  event_level = "second"
)

# Model 2 ----
bank_model2 <- glm(
  y ~ age + duration + campaign + previous + pdays,
  data = bank_train,
  family = binomial
)

prob2 <- predict(bank_model2, newdata = bank_test, type = "response")
pred2 <- ifelse(prob2 > 0.5, "yes", "no")
pred2 <- factor(pred2, levels = c("no", "yes"))

confusion2 <- table(pred2, bank_test$y)
confusion2
# Output recorded from a previous run:
#> pred2   no  yes
#>   no  7168  626
#>   yes  142  302

acc2 <- accuracy_vec(truth = bank_test$y, estimate = pred2)
prec2 <- precision_vec(
  truth = bank_test$y,
  estimate = pred2,
  event_level = "second"
)
rec2 <- recall_vec(
  truth = bank_test$y,
  estimate = pred2,
  event_level = "second"
)

# BANK COMPARISON ----
tibble(
  Model = c("Model 1", "Model 2"),
  Accuracy = c(acc1, acc2),
  Precision = c(prec1, prec2),
  Recall = c(rec1, rec2)
)
# Expected output. Precision and Recall are recomputed for the "yes" class
# from the confusion matrices recorded above (e.g. Model 1 precision =
# 167 / (167 + 101), recall = 167 / (167 + 761)). The earlier recorded run
# scored the "no" class instead and printed Precision/Recall of 0.905/0.986
# (Model 1) and 0.920/0.981 (Model 2).
#> # A tibble: 2 × 4
#>   Model   Accuracy Precision Recall
#>   <chr>      <dbl>     <dbl>  <dbl>
#> 1 Model 1    0.895     0.623  0.180
#> 2 Model 2    0.907     0.680  0.325
# 5. CROSS VALIDATION ----

# Airbnb ----
# 5-fold cross-validation on the training set, using the same five predictors
# as the Model 2 formula. The seed makes the fold assignment reproducible.
set.seed(465)
cv_airbnb <- vfold_cv(airbnb_train, v = 5)

fit_resamples(
  linear_reg() %>% set_engine("lm"),
  price ~ minimum_nights + number_of_reviews +
    availability_365 + calculated_host_listings_count + reviews_per_month,
  resamples = cv_airbnb,
  metrics = metric_set(rmse, rsq)
) %>%
  collect_metrics()
# Output recorded from a previous run:
#> # A tibble: 2 × 6
#>   .metric .estimator     mean     n std_err .config
#>   <chr>   <chr>         <dbl> <int>   <dbl> <chr>
#> 1 rmse    standard   192.         5 19.0    pre0_mod0_post0
#> 2 rsq     standard     0.0119     5  0.00314 pre0_mod0_post0
# Bank ----
# 5-fold cross-validation on the training set, using the same five predictors
# as the bank Model 2 formula. The seed makes the fold assignment reproducible.
# NOTE(review): the folds are not stratified on `y` and only accuracy is
# collected; consider `strata = y` and class-specific metrics if the outcome
# is imbalanced. Confirm before changing, since it alters the recorded output.
set.seed(465)
cv_bank <- vfold_cv(bank_train, v = 5)

fit_resamples(
  logistic_reg() %>% set_engine("glm"),
  y ~ age + duration + campaign + previous + pdays,
  resamples = cv_bank,
  metrics = metric_set(accuracy)
) %>%
  collect_metrics()
# Output recorded from a previous run:
#> # A tibble: 1 × 6
#>   .metric  .estimator  mean     n std_err .config
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>
#> 1 accuracy binary     0.904     5 0.00131 pre0_mod0_post0
# 6. AI LOG ----
# Prompt: How to build regression and classification models in R using
#   tidymodels?
# Use: Structured full modeling pipeline.
# Reflection: Improved understanding of train/test split, GLM, and evaluation
#   metrics.