各縣市垃圾產生量及資源回收比例之影響因素分析

數據科學專題期末報告

Author

張翊如林景淇蔡孟儒

一般廢棄物產生量

未調整比例

一般廢棄物產生量<- data|>
  select(年,縣市,一般廢棄物產生量)

ggplot(一般廢棄物產生量,
       mapping = aes(x=年,y=一般廢棄物產生量))+
  geom_line(size = 1)+
  facet_wrap(~縣市)+
  labs(title = "一般廢棄物產生量(比例調整前)")+
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

已調整比例

ggplot(一般廢棄物產生量,
       mapping = aes(x=年,y=一般廢棄物產生量))+
  geom_line(size = 1)+
  geom_smooth(method = "lm", color = "#F8766D",size = 1) +  
  facet_wrap(~縣市, scales = "free_y")+
  labs(title = "一般廢棄物產生量(比例調整後)")+
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

加入線性模型

ggplot(一般廢棄物產生量,
       mapping = aes(x=年,y=一般廢棄物產生量))+
  geom_line(size = 1)+
  geom_smooth(method = "lm", color = "#F8766D",size = 1) +  
  facet_wrap(~縣市, scales = "free_y")+
  labs(title = "一般廢棄物產生量(比例調整後)")+
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

補充：臺北市人口趨勢

ggplot(data|>
         filter(縣市%in%c("臺北市")),
       mapping = aes(x=年, y=人口)
       )+
  geom_line()+
  labs(title = "臺北市人口趨勢")+
  theme_minimal() +
  theme(
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

一般廢棄物產生量及各縣市垃圾回收清除車數量

全台比較(各縣市92年~112年)

一般廢棄物產生量及各縣市垃圾回收清除車數量<- data|>
  select(年,縣市,一般廢棄物產生量,各縣市垃圾回收清除車數量)|>
  mutate(一般廢棄物產生量_千噸 = 一般廢棄物產生量/1000)

ggplot(一般廢棄物產生量及各縣市垃圾回收清除車數量, aes(x = 各縣市垃圾回收清除車數量, y = 一般廢棄物產生量_千噸)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", color = "#F8766D", size = 1) +
  labs(
    title = "一般廢棄物產生量及各縣市垃圾回收清除車數量",
    x = "各縣市垃圾回收清除車數量",
    y = "一般廢棄物產生量_千噸"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

依時間進行分類

ggplot(一般廢棄物產生量及各縣市垃圾回收清除車數量, aes(x = 各縣市垃圾回收清除車數量, y = 一般廢棄物產生量_千噸)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", color = "#F8766D", size = 1) +
  facet_wrap(~年)+
  labs(
    title = "一般廢棄物產生量及各縣市垃圾回收清除車數量",
    x = "各縣市垃圾回收清除車數量",
    y = "一般廢棄物產生量"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

一般廢棄物產生量及焚化爐量能

依縣市進行分類(未調整比例)

一般廢棄物產生量及焚化爐量能<- data|>
  select(年,縣市,一般廢棄物產生量,焚化爐量能)|>
  filter(!縣市 %in% c("金門縣", "南投縣", "連江縣", "雲林縣", "彰化縣", "澎湖縣"))

change_years <- 一般廢棄物產生量及焚化爐量能 |>
  group_by(縣市) |>
  mutate(焚化爐變化 = ifelse(焚化爐量能 != dplyr::lag(焚化爐量能, order_by = 年), 年, NA)) |>
  filter(!is.na(焚化爐變化)) |>
  select(縣市, 焚化爐變化)

ggplot(一般廢棄物產生量及焚化爐量能) +
  geom_line(mapping = aes(x = 年, y = 一般廢棄物產生量, col = "一般廢棄物產生量"), size = 1) +
  geom_point(mapping = aes(x = 年, y = 焚化爐量能, col = "焚化爐量能"), size = 1) +
  geom_vline(data = change_years, aes(xintercept = 焚化爐變化), 
             linetype = "dashed", color = "red", size = 0.8) +
  facet_wrap(~縣市) +
  labs(
    title = "一般廢棄物產生量及焚化爐量能 (未調整比例)",
    x = "年份",
    y = "量能與產生量",
    color = "指標"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    legend.position = "bottom",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

依時間進行分類

ggplot(一般廢棄物產生量及焚化爐量能, aes(x = 焚化爐量能/10000, y = 一般廢棄物產生量)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", color = "#F8766D", size = 1) +
  facet_wrap(~年)+
  labs(
    title = "一般廢棄物產生量與焚化爐量能的關係",
    x = "焚化爐量能_萬噸",
    y = "一般廢棄物產生量"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

依縣市進行分類(已調整比例)

ggplot(一般廢棄物產生量及焚化爐量能) +
  geom_line(mapping = aes(x = 年, y = 一般廢棄物產生量, col = "一般廢棄物產生量"), size = 1) +
  geom_point(mapping = aes(x = 年, y = 焚化爐量能, col = "焚化爐量能"), size = 1) +
  geom_vline(data = change_years, aes(xintercept = 焚化爐變化), 
             linetype = "dashed", color = "red", size = 0.8) +
  facet_wrap(~縣市, scales = "free_y") +
  labs(
    title = "一般廢棄物產生量及焚化爐量能 (已調整比例)",
    x = "年份",
    y = "量能與產生量",
    color = "指標"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    legend.position = "bottom",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5")
  )

一般廢棄物產生量及人口

依時間進行分類

一般廢棄物產生量及人口<- data|>
  select(年,縣市,一般廢棄物產生量,人口)|>
  mutate(一般廢棄物產生量_十萬噸 = 一般廢棄物產生量/100000)|>
  mutate(人口_十萬人 = 人口/100000)

ggplot(一般廢棄物產生量及人口,mapping = aes(x = 人口_十萬人, y = 一般廢棄物產生量_十萬噸)) +
  geom_point() +
  geom_smooth(method = "lm", color = "#F8766D",size = 1) +  
  facet_wrap(~年) +
  labs(title = "一般廢棄物產生量及人口")+
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

資源回收比例

資源回收比例<- data|>
  select(年,縣市,資源回收比例)
ggplot(資源回收比例,
       mapping = aes(x=年,y=資源回收比例))+
  geom_line(size = 1)+
  facet_wrap(~縣市)+
  labs(title = "各縣市資源回收比例")+
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

加入線性模型

資源回收比例<- data|>
  select(年,縣市,資源回收比例)
ggplot(資源回收比例,
       mapping = aes(x=年,y=資源回收比例))+
  geom_line(size = 1)+
  geom_smooth(method = "lm", color = "#F8766D",size = 1) + 
  facet_wrap(~縣市)+
  labs(title = "各縣市資源回收比例")+
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

資源回收比例及高教育程度比例

全台比較(各縣市92年~112年)

資源回收比例及高教育程度比例<- data|>
  select(年,縣市,資源回收比例,高教育程度比例)

ggplot(資源回收比例及高教育程度比例, aes(x = 高教育程度比例, y = 資源回收比例)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", color = "#F8766D", size = 1) +
  labs(
    title = "資源回收比例及高教育程度比例",
    x = "高教育程度比例",
    y = "資源回收比例"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

依縣市進行分類

ggplot() +
  geom_line(資源回收比例及高教育程度比例,
            mapping = aes(x = 年, y = 資源回收比例, col = "資源回收比例"), size = 1) +
  geom_line(資源回收比例及高教育程度比例,
            mapping = aes(x = 年, y = 高教育程度比例, col = "高教育程度比例"), size = 1) +
  facet_wrap(~縣市) +
  scale_color_manual(
    values = c("高教育程度比例" = "#00bfc4", 
               "資源回收比例" = "black") 
  ) +
  labs(
    title = "資源回收比例及高教育程度比例",
    color = "指標",
    y = "資源回收比例/高教育程度比例"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    legend.position = "bottom",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5")
  )

依時間進行分類

ggplot(資源回收比例及高教育程度比例, aes(x = 高教育程度比例, y = 資源回收比例)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", color = "#F8766D", size = 1) +
  facet_wrap(~年)+
  labs(
    title = "資源回收比例及高教育程度比例",
    x = "高教育程度比例",
    y = "資源回收比例"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

比較92年及112年的斜率改變

lm_92 <- lm(資源回收比例 ~ 高教育程度比例, data = 資源回收比例及高教育程度比例 |> filter(年 == 92))
slope_92 <- coef(lm_92)[2]

lm_112 <- lm(資源回收比例 ~ 高教育程度比例, data = 資源回收比例及高教育程度比例 |> filter(年 == 112))
slope_112 <- coef(lm_112)[2]

ggplot(
  資源回收比例及高教育程度比例 |>
    filter(年 == 92 | 年 == 112), 
  aes(x = 高教育程度比例, y = 資源回收比例, col = as.factor(年))
) +
  geom_point(size = 2) +
  geom_smooth(method = "lm", aes(color = as.factor(年)), size = 1) +
  labs(
    title = "92及112資源回收比例及高教育程度比例",
    x = "高教育程度比例",
    y = "資源回收比例",
    color = "年份" 
  ) +
  annotate("text", 
           x = 0.2,  
           y = 0.1,  
           label = paste0("Slope_92 = ", round(slope_92, 2)), 
           color = "#F8766D") +
  annotate("text", 
           x = 0.42,  
           y = 0.45,
           label = paste0("Slope_112 = ", round(slope_112, 2)), 
           color = "#00bfc4") +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"),
    legend.position = "bottom"
  )

資源回收比例及環保稽查次數

全台比較(各縣市92年~112年)

資源回收比例及環保稽查次數 <- data |>
  select(年, 縣市, 資源回收比例, 環保稽查次數)

ggplot(資源回收比例及環保稽查次數, aes(x = 環保稽查次數/10000, y = 資源回收比例)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", color = "#F8766D", size = 1) +
  labs(
    title = "資源回收比例及環保稽查次數",
    x = "環保稽查次數(萬)",
    y = "資源回收比例"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"))

依時間進行分類

ggplot(資源回收比例及環保稽查次數, aes(x = 環保稽查次數/10000, y = 資源回收比例)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", color = "#F8766D", size = 1) +
  facet_wrap(~年)+
  labs(
    title = "資源回收比例及環保稽查次數",
    x = "環保稽查次數(萬)",
    y = "資源回收比例"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

資源回收比例及各縣市垃圾回收清除車數量

全台比較(各縣市92年~112年)

資源回收比例及各縣市垃圾回收清除車數量<- data|>
  select(年,縣市,資源回收比例,各縣市垃圾回收清除車數量)

ggplot(資源回收比例及各縣市垃圾回收清除車數量, aes(x = 各縣市垃圾回收清除車數量, y = 資源回收比例)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", color = "#F8766D", size = 1) +
  labs(
    title = "資源回收比例及各縣市垃圾回收清除車數量",
    x = "各縣市垃圾回收清除車數量",
    y = "資源回收比例"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"))

依時間進行分類

ggplot(資源回收比例及各縣市垃圾回收清除車數量, aes(x = 各縣市垃圾回收清除車數量, y = 資源回收比例)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", color = "#F8766D", size = 1) +
  facet_wrap(~年)+
  labs(
    title = "資源回收比例及各縣市垃圾回收清除車數量",
    x = "各縣市垃圾回收清除車數量",
    y = "資源回收比例"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

資源回收比例及專用垃圾袋

設有專用袋政策之縣市

資源回收比例及專用垃圾袋<- data|>
  select(年,縣市,資源回收比例,專用垃圾袋)

ggplot(資源回收比例及專用垃圾袋|>
         filter(縣市 == c("新北市","臺北市")),
       mapping = aes(x=年,y=資源回收比例))+
  geom_line(size = 1)+
  geom_vline(data = 資源回收比例及專用垃圾袋 |>
               filter(專用垃圾袋 == 1) |>
               group_by(縣市) |>
               summarize(first_year = min(年)),
             aes(xintercept = first_year),
             linetype = "dashed", color = "#F8766D",size = 1) +
  facet_wrap(~縣市)+
  labs(title = "資源回收比例及專用垃圾袋")+
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))

model1_一般廢棄物產生量linear regression

解釋變數之間的關係圖

data |> 
  select(一般廢棄物產生量, 各縣市垃圾回收清除車數量, 焚化爐量能, 人口) |> 
  ggpairs()

data |> 
  mutate(log_一般廢棄物產生量 = log(一般廢棄物產生量),log_各縣市垃圾回收清除車數量 = log(各縣市垃圾回收清除車數量), log_焚化爐量能 = log(焚化爐量能), log_人口 = log(人口)) |> 
  select(log_一般廢棄物產生量, log_各縣市垃圾回收清除車數量,log_焚化爐量能,log_人口) |> 
  ggpairs()

觀察變數間的線性關係

ord_data <- data |> 
  mutate(log_一般廢棄物產生量 = log(一般廢棄物產生量),
         log_各縣市垃圾回收清除車數量 = log(各縣市垃圾回收清除車數量),
         log_焚化爐量能 = log(焚化爐量能),
         log_人口 = log(人口))

ord_data |>
  pivot_longer(cols = c(log_各縣市垃圾回收清除車數量, log_焚化爐量能, log_人口),
               names_to = "variable", values_to = "value") |>
  ggplot(aes(x = value, y = log_一般廢棄物產生量, color = variable)) +
  geom_point(size = 0.5, alpha = 0.1) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "predictor", color = NULL) +
  theme(legend.position = "bottom")

linear regression

ord_data <- ord_data |> 
  select(log_一般廢棄物產生量,log_各縣市垃圾回收清除車數量,log_焚化爐量能,log_人口)

set.seed(1234)
ord_data_split <- initial_split(ord_data, prop = 0.8, strata = log_一般廢棄物產生量) # data splitting
ord_data_train <- training(ord_data_split) # training data
ord_data_test <- testing(ord_data_split) # testing data

#set recipe & model 
log_ord_recipe <- recipe(log_一般廢棄物產生量 ~ ., data = ord_data_train) 
model_ord_lm <- linear_reg() |> set_engine("lm") 
workflow_ord_lm <- workflow() |> add_recipe(log_ord_recipe) |> add_model(model_ord_lm)

#model fitting
fit_ord_lm <- workflow_ord_lm |> fit(ord_data_train) # fit the model to the training data
fit_ord_lm |> tidy()

# A tibble: 4 × 5
  term                         estimate std.error statistic  p.value
  <chr>                           <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)                   0.684      0.400      1.71  8.82e- 2
2 log_各縣市垃圾回收清除車數量  0.180      0.0626     2.87  4.48e- 3
3 log_焚化爐量能                0.00626    0.0371     0.169 8.66e- 1
4 log_人口                      0.792      0.0692    11.5   1.06e-24

#get the importance of each predictors
fit_ord_lm |> vip()

result_ord_lm <- fit_ord_lm |> augment(ord_data_test)
metrics(result_ord_lm, truth = log_一般廢棄物產生量, estimate = .pred) # performance metrics

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.200
2 rsq     standard       0.956
3 mae     standard       0.157

#predicted VS. observed outcomes#
result_ord_lm |>
  ggplot(aes(x = log_一般廢棄物產生量, y = .pred)) +
  geom_abline(linetype = "dashed") +
  geom_point(alpha = 0.2) +
  labs(x = "Observed outcome", y = "Predicted outcome") +
  coord_obs_pred()

model2_資源回收比例linear regression

解釋變數之間的關係圖

data |> select(資源回收比例,高教育程度比例,環保稽查次數,各縣市垃圾回收清除車數量,專用垃圾袋) |> 
  ggpairs()

data |> 
  mutate(log_環保稽查次數 = log(環保稽查次數),log_各縣市垃圾回收清除車數量 = log(各縣市垃圾回收清除車數量)) |> 
  select(資源回收比例,高教育程度比例,log_環保稽查次數,log_各縣市垃圾回收清除車數量,專用垃圾袋) |> 
  ggpairs()

觀察變數間的線性關係

recycle_data <- data |> mutate(log_環保稽查次數 = log(環保稽查次數),log_各縣市垃圾回收清除車數量 = log(各縣市垃圾回收清除車數量))

recycle_data |>
  pivot_longer(cols = c(高教育程度比例, log_環保稽查次數, log_各縣市垃圾回收清除車數量, 專用垃圾袋),
               names_to = "variable", values_to = "value") |>
  ggplot(aes(x = value, y = 資源回收比例, color = variable)) +
  geom_point(size = 0.5, alpha = 0.1) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "predictor", color = NULL) +
  theme(legend.position = "bottom")

linear regression

recycle_data <- recycle_data |> 
  select(資源回收比例, 高教育程度比例, log_環保稽查次數, log_各縣市垃圾回收清除車數量, 專用垃圾袋)

set.seed(1234)
recycle_data_split <- initial_split(recycle_data, prop = 0.8, strata = 資源回收比例)
recycle_data_train <- training(recycle_data_split)
recycle_data_test <- testing(recycle_data_split)

#set recipe & workflow
recycle_recipe <- recipe(資源回收比例 ~ ., data = recycle_data_train) 
model_recycle_lm <- linear_reg() |> set_engine("lm") 
workflow_recycle_lm <- workflow() |> 
  add_recipe(recycle_recipe) |>
  add_model(model_recycle_lm)

#model fitting
fit_recycle_lm <- workflow_recycle_lm |> fit(recycle_data_train)
fit_recycle_lm |> tidy()

# A tibble: 5 × 5
  term                         estimate std.error statistic  p.value
  <chr>                           <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)                   0.245     0.0370      6.61  1.43e-10
2 高教育程度比例                1.42      0.0595     23.9   4.97e-76
3 log_環保稽查次數             -0.0162    0.00477    -3.40  7.62e- 4
4 log_各縣市垃圾回收清除車數量  0.0147    0.00801     1.83  6.75e- 2
5 專用垃圾袋                   -0.00826   0.0199     -0.416 6.78e- 1

#get the importance of each variables
fit_recycle_lm |> vip()

(result_recycle_lm <- fit_recycle_lm |> augment(recycle_data_test))

# A tibble: 94 × 7
   .pred  .resid 資源回收比例 高教育程度比例 log_環保稽查次數
   <dbl>   <dbl>        <dbl>          <dbl>            <dbl>
 1 0.271 -0.109         0.163         0.0827             10.5
 2 0.253 -0.143         0.109         0.0697             10.6
 3 0.251 -0.0794        0.172         0.0674             10.1
 4 0.306 -0.117         0.188         0.111              10.6
 5 0.494 -0.136         0.357         0.250              12.3
 6 0.295 -0.0188        0.276         0.0997             10.7
 7 0.274 -0.0986        0.176         0.0999             12.0
 8 0.279 -0.0158        0.263         0.0914             10.8
 9 0.270 -0.0871        0.183         0.0865             11.1
10 0.289 -0.111         0.178         0.0921             10.7
# ℹ 84 more rows
# ℹ 2 more variables: log_各縣市垃圾回收清除車數量 <dbl>, 專用垃圾袋 <dbl>

metrics(result_recycle_lm, truth = 資源回收比例, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.0912
2 rsq     standard      0.653 
3 mae     standard      0.0745

#predicted outcomes VS. observed outcomes
result_recycle_lm |>
  ggplot(aes(x = 資源回收比例, y = .pred)) +
  geom_abline(linetype = "dashed") +
  geom_point(alpha = 0.2) +
  labs(x = "觀察值", y = "預測結果") +
  coord_obs_pred()

model3_資源回收比例random forest

random forest

model_recycle_rf <- rand_forest(tree = 1000, min_n = 5, mtry = 2) |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("regression")
wflow_recycle_rf <- workflow() |> add_model(model_recycle_rf) |> add_formula(資源回收比例 ~ .)
fit_recycle_rf <- wflow_recycle_rf |> fit(recycle_data_train)

#get the importance of each variables & outcomes
fit_recycle_rf |> vip()

res_recycle_rf <- fit_recycle_rf |> augment(recycle_data_test)
metrics(res_recycle_rf, truth = 資源回收比例, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.0751
2 rsq     standard      0.770 
3 mae     standard      0.0536

#predicted VS. observed outcomes
res_recycle_rf |>
  ggplot(aes(x = 資源回收比例, y = .pred)) +
  geom_abline(linetype = "dashed") +
  geom_point(alpha = 0.2) +
  labs(x = "Observed", y = "Predicted") +
  coord_obs_pred()

model4_資源回收比例Hyperparameter Tuning

Hyperparameter Tuning

model_recycle_ht <- rand_forest(tree = 1000, min_n = tune(), mtry = tune()) |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("regression")
wflow_recycle_ht <- workflow() |> add_model(model_recycle_ht) |> add_formula(資源回收比例 ~ .)

#settings
set.seed(456)
(folds <- vfold_cv(recycle_data_train, v = 5))

#  5-fold cross-validation 
# A tibble: 5 × 2
  splits           id   
  <list>           <chr>
1 <split [294/74]> Fold1
2 <split [294/74]> Fold2
3 <split [294/74]> Fold3
4 <split [295/73]> Fold4
5 <split [295/73]> Fold5

(grids <- grid_regular(min_n(range = c(2, 10)),mtry(range = c(2, 6)),levels = 5))

# A tibble: 25 × 2
   min_n  mtry
   <int> <int>
 1     2     2
 2     4     2
 3     6     2
 4     8     2
 5    10     2
 6     2     3
 7     4     3
 8     6     3
 9     8     3
10    10     3
# ℹ 15 more rows

cl <- makePSOCKcluster(detectCores() - 1)
registerDoParallel(cl)

set.seed(789)
tune <- wflow_recycle_ht |>
  tune_grid(resamples = folds, grid = grids)
tune

# Tuning results
# 5-fold cross-validation 
# A tibble: 5 × 4
  splits           id    .metrics          .notes           
  <list>           <chr> <list>            <list>           
1 <split [294/74]> Fold1 <tibble [50 × 6]> <tibble [10 × 3]>
2 <split [294/74]> Fold2 <tibble [50 × 6]> <tibble [10 × 3]>
3 <split [294/74]> Fold3 <tibble [50 × 6]> <tibble [10 × 3]>
4 <split [295/73]> Fold4 <tibble [50 × 6]> <tibble [10 × 3]>
5 <split [295/73]> Fold5 <tibble [50 × 6]> <tibble [10 × 3]>

There were issues with some computations:

  - Warning(s) x25: 5 columns were requested but there were 4 predictors in the data....
  - Warning(s) x25: 6 columns were requested but there were 4 predictors in the data....

Run `show_notes(.Last.tune.result)` for more information.

stopCluster(cl)

(best <- tune |> select_best(metric = "rmse"))

# A tibble: 1 × 3
   mtry min_n .config              
  <int> <int> <chr>                
1     2     2 Preprocessor1_Model01

(model_best <- finalize_model(model_recycle_ht, best))

Random Forest Model Specification (regression)

Main Arguments:
  mtry = 2
  trees = 1000
  min_n = 2

Engine-Specific Arguments:
  importance = impurity

Computational engine: ranger

fit_recycle_ht <- workflow() |> add_model(model_best) |>
  add_formula(資源回收比例 ~ .) |> fit(recycle_data_train)

#get the importance of each variables
fit_recycle_ht |> vip()

(res_recycle_ht <- fit_recycle_ht |> augment(recycle_data_test))

# A tibble: 94 × 6
   .pred 資源回收比例 高教育程度比例 log_環保稽查次數 log_各縣市垃圾回收清除車…¹
   <dbl>        <dbl>          <dbl>            <dbl>                      <dbl>
 1 0.187        0.163         0.0827             10.5                       5.37
 2 0.187        0.109         0.0697             10.6                       5.51
 3 0.222        0.172         0.0674             10.1                       5.04
 4 0.242        0.188         0.111              10.6                       5.09
 5 0.449        0.357         0.250              12.3                       6.79
 6 0.267        0.276         0.0997             10.7                       5.54
 7 0.220        0.176         0.0999             12.0                       5.55
 8 0.212        0.263         0.0914             10.8                       5.41
 9 0.189        0.183         0.0865             11.1                       5.62
10 0.177        0.178         0.0921             10.7                       5.91
# ℹ 84 more rows
# ℹ abbreviated name: ¹log_各縣市垃圾回收清除車數量
# ℹ 1 more variable: 專用垃圾袋 <dbl>

metrics(res_recycle_ht, truth = 資源回收比例, estimate = .pred)

# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.0730
2 rsq     standard      0.782 
3 mae     standard      0.0520

#predicted VS. observed outcomes
res_recycle_ht |>
  ggplot(aes(x = 資源回收比例, y = .pred)) +
  geom_abline(linetype = "dashed") +
  geom_point(alpha = 0.2) +
  labs(x = "Observed", y = "Predicted") +
  coord_obs_pred()