# ECON 465 Stage 2

library(tidyverse)
library(janitor)
library(tidymodels)
library(rsample)
library(yardstick)

# 1. DATA IMPORT ----

# Read each raw file, normalise its column names with janitor::clean_names(),
# and keep only complete rows.
#
# NOTE(review): drop_na() with no column arguments removes every row that has
# an NA in ANY column, including columns the models below never use.
airbnb <- read_csv("Airbnb_data.csv") %>%
  clean_names() %>%
  drop_na()

# NOTE(review): read_csv2() expects ";" as the field delimiter and "," as the
# decimal mark. If Bank_data.csv stores decimals with ".", those columns will
# be mis-parsed -- confirm, and switch to read_delim(delim = ";") if so.
bank <- read_csv2("Bank_data.csv") %>%
  clean_names() %>%
  drop_na()

# 2. DATA SPLITTING ----

# Fixed seed so both random splits are reproducible.
set.seed(465)

# 80% training / 20% testing. The Airbnb split is drawn first and the bank
# split second; the bank split is stratified on the outcome `y`.
airbnb_split <- airbnb %>% initial_split(prop = 0.8)
bank_split <- bank %>% initial_split(prop = 0.8, strata = y)

airbnb_train <- airbnb_split %>% training()
airbnb_test <- airbnb_split %>% testing()

bank_train <- bank_split %>% training()
bank_test <- bank_split %>% testing()

# IMPORTANT FIX (CLASSIFICATION FORMAT): the outcome must be a factor ----

# Encode the outcome as a factor with the same explicit level order in both
# partitions: "no" is the first level, "yes" the second.
bank_train$y <- factor(bank_train$y, levels = c("no", "yes"))
bank_test$y <- factor(bank_test$y, levels = c("no", "yes"))

# 3. AIRBNB REGRESSION ----

# Model 1 ----

# Baseline linear regression of price on three listing/host predictors,
# fitted on the training partition.
model1 <- lm(
  price ~ minimum_nights +
    number_of_reviews +
    calculated_host_listings_count,
  data = airbnb_train
)

# Out-of-sample predictions for the held-out listings.
pred1 <- predict(model1, newdata = airbnb_test)

# Test-set RMSE and R-squared (observed price vs. predicted price).
rmse1 <- rmse_vec(truth = pull(airbnb_test, price), estimate = pred1)
rsq1 <- rsq_vec(truth = pull(airbnb_test, price), estimate = pred1)

# Model 2 ----

# Extended linear regression: the Model 1 predictors plus availability_365
# and reviews_per_month, fitted on the training partition.
model2 <- lm(
  price ~ minimum_nights +
    number_of_reviews +
    availability_365 +
    calculated_host_listings_count +
    reviews_per_month,
  data = airbnb_train
)

# Out-of-sample predictions for the held-out listings.
pred2 <- predict(model2, newdata = airbnb_test)

# Test-set RMSE and R-squared (observed price vs. predicted price).
rmse2 <- rmse_vec(truth = pull(airbnb_test, price), estimate = pred2)
rsq2 <- rsq_vec(truth = pull(airbnb_test, price), estimate = pred2)

# AIRBNB COMPARISON ----

# Side-by-side test-set metrics for the two Airbnb regressions
# (one row per model; printed, not stored).
tribble(
  ~Model,    ~RMSE, ~R2,
  "Model 1", rmse1, rsq1,
  "Model 2", rmse2, rsq2
)
#> # A tibble: 2 × 3
#>   Model    RMSE      R2
#>   <chr>   <dbl>   <dbl>
#> 1 Model 1  199. 0.00391
#> 2 Model 2  198. 0.0126

# 4. BANK CLASSIFICATION ----

# Model 1 ----

# Logistic regressions for the bank outcome `y`, scored on the held-out test
# set at a 0.5 probability cut-off.
#
# `y` is a factor with levels c("no", "yes"). yardstick treats the FIRST
# factor level as the event by default, so precision and recall must be
# requested with `event_level = "second"` to describe the "yes" class.
# Without it they describe how well the model finds "no", which is why the
# earlier table showed precision/recall above 0.9 while most "yes" cases were
# missed. Accuracy does not depend on the event level.
#
# NOTE: `pred1` / `pred2` reuse the names of the Airbnb regression
# predictions and overwrite them from this point on.

bank_model1 <- glm(
  y ~ age + duration + campaign,
  data = bank_train,
  family = binomial
)

# With a factor response, binomial glm() models the probability of the
# second level, i.e. P(y == "yes").
prob1 <- predict(bank_model1, newdata = bank_test, type = "response")

# Hard class labels, using the same level order as the truth column.
pred1 <- ifelse(prob1 > 0.5, "yes", "no")
pred1 <- factor(pred1, levels = c("no", "yes"))

# Rows = predicted class, columns = actual class.
confusion1 <- table(pred1, bank_test$y)
confusion1
#>
#> pred1   no  yes
#>   no  7209  761
#>   yes  101  167

acc1 <- accuracy_vec(truth = bank_test$y, estimate = pred1)
prec1 <- precision_vec(
  truth = bank_test$y, estimate = pred1, event_level = "second"
)
rec1 <- recall_vec(
  truth = bank_test$y, estimate = pred1, event_level = "second"
)

# Model 2 ----

bank_model2 <- glm(
  y ~ age + duration + campaign + previous + pdays,
  data = bank_train,
  family = binomial
)

prob2 <- predict(bank_model2, newdata = bank_test, type = "response")

pred2 <- ifelse(prob2 > 0.5, "yes", "no")
pred2 <- factor(pred2, levels = c("no", "yes"))

# Rows = predicted class, columns = actual class.
confusion2 <- table(pred2, bank_test$y)
confusion2
#>
#> pred2   no  yes
#>   no  7168  626
#>   yes  142  302

acc2 <- accuracy_vec(truth = bank_test$y, estimate = pred2)
prec2 <- precision_vec(
  truth = bank_test$y, estimate = pred2, event_level = "second"
)
rec2 <- recall_vec(
  truth = bank_test$y, estimate = pred2, event_level = "second"
)

# BANK COMPARISON ----

# Precision and Recall are for the "yes" class.
tibble(
  Model = c("Model 1", "Model 2"),
  Accuracy = c(acc1, acc2),
  Precision = c(prec1, prec2),
  Recall = c(rec1, rec2)
)
# Values below are recomputed from the confusion matrices above, e.g.
# Model 1 precision = 167 / (167 + 101), recall = 167 / (167 + 761).
#> # A tibble: 2 × 4
#>   Model   Accuracy Precision Recall
#>   <chr>      <dbl>     <dbl>  <dbl>
#> 1 Model 1    0.895     0.623  0.180
#> 2 Model 2    0.907     0.680  0.325

# 5. CROSS VALIDATION ----

# Airbnb ----

# Reset the seed so the fold assignment is reproducible on its own.
set.seed(465)

# 5-fold cross-validation on the Airbnb training partition.
cv_airbnb <- airbnb_train %>% vfold_cv(v = 5)

# Refit the Model 2 specification on each fold and report the
# cross-validated RMSE and R-squared (printed, not stored).
linear_reg() %>%
  set_engine("lm") %>%
  fit_resamples(
    preprocessor = price ~ minimum_nights +
      number_of_reviews +
      availability_365 +
      calculated_host_listings_count +
      reviews_per_month,
    resamples = cv_airbnb,
    metrics = metric_set(rmse, rsq)
  ) %>%
  collect_metrics()
#> # A tibble: 2 × 6
#>   .metric .estimator     mean     n  std_err .config
#>   <chr>   <chr>         <dbl> <int>    <dbl> <chr>
#> 1 rmse    standard   192.         5 19.0     pre0_mod0_post0
#> 2 rsq     standard     0.0119     5  0.00314 pre0_mod0_post0

# Bank ----

# Reset the seed so the fold assignment is reproducible on its own.
set.seed(465)

# 5-fold cross-validation on the bank training partition.
# NOTE(review): the folds are not stratified, although the initial split used
# strata = y -- consider vfold_cv(v = 5, strata = y) for consistency.
cv_bank <- bank_train %>% vfold_cv(v = 5)

# Refit the bank Model 2 specification on each fold and report the
# cross-validated accuracy (printed, not stored).
logistic_reg() %>%
  set_engine("glm") %>%
  fit_resamples(
    preprocessor = y ~ age + duration + campaign + previous + pdays,
    resamples = cv_bank,
    metrics = metric_set(accuracy)
  ) %>%
  collect_metrics()
#> # A tibble: 1 × 6
#>   .metric  .estimator  mean     n std_err .config
#>   <chr>    <chr>      <dbl> <int>   <dbl> <chr>
#> 1 accuracy binary     0.904     5 0.00131 pre0_mod0_post0

# 6. AI LOG ----
#
# Prompt: How do I build regression and classification models in R using
# tidymodels?
#
# Use: Structured full modeling pipeline.
#
# Reflection: Improved understanding of the train/test split, GLM, and
# evaluation metrics.