一般廢棄物產生量<- data|>
select(年,縣市,一般廢棄物產生量)
ggplot(一般廢棄物產生量,
mapping = aes(x=年,y=一般廢棄物產生量))+
geom_line(size = 1)+
facet_wrap(~縣市)+
labs(title = "一般廢棄物產生量(比例調整前)")+
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))各縣市垃圾產生量及資源回收比例之影響因素分析
數據科學專題期末報告
一般廢棄物產生量
未調整比例
已調整比例
ggplot(一般廢棄物產生量,
mapping = aes(x=年,y=一般廢棄物產生量))+
geom_line(size = 1)+
geom_smooth(method = "lm", color = "#F8766D",size = 1) +
facet_wrap(~縣市, scales = "free_y")+
labs(title = "一般廢棄物產生量(比例調整後)")+
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))加入線性模型
ggplot(一般廢棄物產生量,
mapping = aes(x=年,y=一般廢棄物產生量))+
geom_line(size = 1)+
geom_smooth(method = "lm", color = "#F8766D",size = 1) +
facet_wrap(~縣市, scales = "free_y")+
labs(title = "一般廢棄物產生量(比例調整後)")+
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))補充:臺北市人口趨勢
ggplot(data|>
filter(縣市%in%c("臺北市")),
mapping = aes(x=年, y=人口)
)+
geom_line()+
labs(title = "臺北市人口趨勢")+
theme_minimal() +
theme(
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))一般廢棄物產生量及各縣市垃圾回收清除車數量
全台比較(各縣市92年~112年)
一般廢棄物產生量及各縣市垃圾回收清除車數量<- data|>
select(年,縣市,一般廢棄物產生量,各縣市垃圾回收清除車數量)|>
mutate(一般廢棄物產生量_千噸 = 一般廢棄物產生量/1000)
ggplot(一般廢棄物產生量及各縣市垃圾回收清除車數量, aes(x = 各縣市垃圾回收清除車數量, y = 一般廢棄物產生量_千噸)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", color = "#F8766D", size = 1) +
labs(
title = "一般廢棄物產生量及各縣市垃圾回收清除車數量",
x = "各縣市垃圾回收清除車數量",
y = "一般廢棄物產生量_千噸"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))依時間進行分類
ggplot(一般廢棄物產生量及各縣市垃圾回收清除車數量, aes(x = 各縣市垃圾回收清除車數量, y = 一般廢棄物產生量_千噸)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", color = "#F8766D", size = 1) +
facet_wrap(~年)+
labs(
title = "一般廢棄物產生量及各縣市垃圾回收清除車數量",
x = "各縣市垃圾回收清除車數量",
y = "一般廢棄物產生量"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))一般廢棄物產生量及焚化爐量能
依縣市進行分類(未調整比例)
一般廢棄物產生量及焚化爐量能<- data|>
select(年,縣市,一般廢棄物產生量,焚化爐量能)|>
filter(!縣市 %in% c("金門縣", "南投縣", "連江縣", "雲林縣", "彰化縣", "澎湖縣"))
change_years <- 一般廢棄物產生量及焚化爐量能 |>
group_by(縣市) |>
mutate(焚化爐變化 = ifelse(焚化爐量能 != dplyr::lag(焚化爐量能, order_by = 年), 年, NA)) |>
filter(!is.na(焚化爐變化)) |>
select(縣市, 焚化爐變化)
ggplot(一般廢棄物產生量及焚化爐量能) +
geom_line(mapping = aes(x = 年, y = 一般廢棄物產生量, col = "一般廢棄物產生量"), size = 1) +
geom_point(mapping = aes(x = 年, y = 焚化爐量能, col = "焚化爐量能"), size = 1) +
geom_vline(data = change_years, aes(xintercept = 焚化爐變化),
linetype = "dashed", color = "red", size = 0.8) +
facet_wrap(~縣市) +
labs(
title = "一般廢棄物產生量及焚化爐量能 (未調整比例)",
x = "年份",
y = "量能與產生量",
color = "指標"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
legend.position = "bottom",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))依時間進行分類
ggplot(一般廢棄物產生量及焚化爐量能, aes(x = 焚化爐量能/10000, y = 一般廢棄物產生量)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", color = "#F8766D", size = 1) +
facet_wrap(~年)+
labs(
title = "一般廢棄物產生量與焚化爐量能的關係",
x = "焚化爐量能_萬噸",
y = "一般廢棄物產生量"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))依縣市進行分類(已調整比例)
ggplot(一般廢棄物產生量及焚化爐量能) +
geom_line(mapping = aes(x = 年, y = 一般廢棄物產生量, col = "一般廢棄物產生量"), size = 1) +
geom_point(mapping = aes(x = 年, y = 焚化爐量能, col = "焚化爐量能"), size = 1) +
geom_vline(data = change_years, aes(xintercept = 焚化爐變化),
linetype = "dashed", color = "red", size = 0.8) +
facet_wrap(~縣市, scales = "free_y") +
labs(
title = "一般廢棄物產生量及焚化爐量能 (已調整比例)",
x = "年份",
y = "量能與產生量",
color = "指標"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
legend.position = "bottom",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5")
)一般廢棄物產生量及人口
依時間進行分類
一般廢棄物產生量及人口<- data|>
select(年,縣市,一般廢棄物產生量,人口)|>
mutate(一般廢棄物產生量_十萬噸 = 一般廢棄物產生量/100000)|>
mutate(人口_十萬人 = 人口/100000)
ggplot(一般廢棄物產生量及人口,mapping = aes(x = 人口_十萬人, y = 一般廢棄物產生量_十萬噸)) +
geom_point() +
geom_smooth(method = "lm", color = "#F8766D",size = 1) +
facet_wrap(~年) +
labs(title = "一般廢棄物產生量及人口")+
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))資源回收比例
資源回收比例<- data|>
select(年,縣市,資源回收比例)
ggplot(資源回收比例,
mapping = aes(x=年,y=資源回收比例))+
geom_line(size = 1)+
facet_wrap(~縣市)+
labs(title = "各縣市資源回收比例")+
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))加入線性模型
資源回收比例<- data|>
select(年,縣市,資源回收比例)
ggplot(資源回收比例,
mapping = aes(x=年,y=資源回收比例))+
geom_line(size = 1)+
geom_smooth(method = "lm", color = "#F8766D",size = 1) +
facet_wrap(~縣市)+
labs(title = "各縣市資源回收比例")+
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))資源回收比例及高教育程度比例
全台比較(各縣市92年~112年)
資源回收比例及高教育程度比例<- data|>
select(年,縣市,資源回收比例,高教育程度比例)
ggplot(資源回收比例及高教育程度比例, aes(x = 高教育程度比例, y = 資源回收比例)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", color = "#F8766D", size = 1) +
labs(
title = "資源回收比例及高教育程度比例",
x = "高教育程度比例",
y = "資源回收比例"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))依縣市進行分類
ggplot() +
geom_line(資源回收比例及高教育程度比例,
mapping = aes(x = 年, y = 資源回收比例, col = "資源回收比例"), size = 1) +
geom_line(資源回收比例及高教育程度比例,
mapping = aes(x = 年, y = 高教育程度比例, col = "高教育程度比例"), size = 1) +
facet_wrap(~縣市) +
scale_color_manual(
values = c("高教育程度比例" = "#00bfc4",
"資源回收比例" = "black")
) +
labs(
title = "資源回收比例及高教育程度比例",
color = "指標",
y = "資源回收比例/高教育程度比例"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
legend.position = "bottom",
legend.title = element_text(size = 12),
legend.text = element_text(size = 10),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5")
)依時間進行分類
ggplot(資源回收比例及高教育程度比例, aes(x = 高教育程度比例, y = 資源回收比例)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", color = "#F8766D", size = 1) +
facet_wrap(~年)+
labs(
title = "資源回收比例及高教育程度比例",
x = "高教育程度比例",
y = "資源回收比例"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))比較92年及112年的斜率改變
lm_92 <- lm(資源回收比例 ~ 高教育程度比例, data = 資源回收比例及高教育程度比例 |> filter(年 == 92))
slope_92 <- coef(lm_92)[2]
lm_112 <- lm(資源回收比例 ~ 高教育程度比例, data = 資源回收比例及高教育程度比例 |> filter(年 == 112))
slope_112 <- coef(lm_112)[2]
ggplot(
資源回收比例及高教育程度比例 |>
filter(年 == 92 | 年 == 112),
aes(x = 高教育程度比例, y = 資源回收比例, col = as.factor(年))
) +
geom_point(size = 2) +
geom_smooth(method = "lm", aes(color = as.factor(年)), size = 1) +
labs(
title = "92及112資源回收比例及高教育程度比例",
x = "高教育程度比例",
y = "資源回收比例",
color = "年份"
) +
annotate("text",
x = 0.2,
y = 0.1,
label = paste0("Slope_92 = ", round(slope_92, 2)),
color = "#F8766D") +
annotate("text",
x = 0.42,
y = 0.45,
label = paste0("Slope_112 = ", round(slope_112, 2)),
color = "#00bfc4") +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"),
legend.position = "bottom"
)資源回收比例及環保稽查次數
全台比較(各縣市92年~112年)
資源回收比例及環保稽查次數 <- data |>
select(年, 縣市, 資源回收比例, 環保稽查次數)
ggplot(資源回收比例及環保稽查次數, aes(x = 環保稽查次數/10000, y = 資源回收比例)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", color = "#F8766D", size = 1) +
labs(
title = "資源回收比例及環保稽查次數",
x = "環保稽查次數(萬)",
y = "資源回收比例"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"))依時間進行分類
ggplot(資源回收比例及環保稽查次數, aes(x = 環保稽查次數/10000, y = 資源回收比例)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", color = "#F8766D", size = 1) +
facet_wrap(~年)+
labs(
title = "資源回收比例及環保稽查次數",
x = "環保稽查次數(萬)",
y = "資源回收比例"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))資源回收比例及各縣市垃圾回收清除車數量
全台比較(各縣市92年~112年)
資源回收比例及各縣市垃圾回收清除車數量<- data|>
select(年,縣市,資源回收比例,各縣市垃圾回收清除車數量)
ggplot(資源回收比例及各縣市垃圾回收清除車數量, aes(x = 各縣市垃圾回收清除車數量, y = 資源回收比例)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", color = "#F8766D", size = 1) +
labs(
title = "資源回收比例及各縣市垃圾回收清除車數量",
x = "各縣市垃圾回收清除車數量",
y = "資源回收比例"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"))依時間進行分類
ggplot(資源回收比例及各縣市垃圾回收清除車數量, aes(x = 各縣市垃圾回收清除車數量, y = 資源回收比例)) +
geom_point(alpha = 0.5, size = 2) +
geom_smooth(method = "lm", color = "#F8766D", size = 1) +
facet_wrap(~年)+
labs(
title = "資源回收比例及各縣市垃圾回收清除車數量",
x = "各縣市垃圾回收清除車數量",
y = "資源回收比例"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))資源回收比例及專用垃圾袋
設有專用袋政策之縣市
資源回收比例及專用垃圾袋<- data|>
select(年,縣市,資源回收比例,專用垃圾袋)
ggplot(資源回收比例及專用垃圾袋|>
filter(縣市 == c("新北市","臺北市")),
mapping = aes(x=年,y=資源回收比例))+
geom_line(size = 1)+
geom_vline(data = 資源回收比例及專用垃圾袋 |>
filter(專用垃圾袋 == 1) |>
group_by(縣市) |>
summarize(first_year = min(年)),
aes(xintercept = first_year),
linetype = "dashed", color = "#F8766D",size = 1) +
facet_wrap(~縣市)+
labs(title = "資源回收比例及專用垃圾袋")+
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.background = element_rect(fill = "#d7dfe5", color = "#d7dfe5"))model1_一般廢棄物產生量linear regression
解釋變數之間的關係圖
data |>
select(一般廢棄物產生量, 各縣市垃圾回收清除車數量, 焚化爐量能, 人口) |>
ggpairs()data |>
mutate(log_一般廢棄物產生量 = log(一般廢棄物產生量),log_各縣市垃圾回收清除車數量 = log(各縣市垃圾回收清除車數量), log_焚化爐量能 = log(焚化爐量能), log_人口 = log(人口)) |>
select(log_一般廢棄物產生量, log_各縣市垃圾回收清除車數量,log_焚化爐量能,log_人口) |>
ggpairs()觀察變數間的線性關係
ord_data <- data |>
mutate(log_一般廢棄物產生量 = log(一般廢棄物產生量),
log_各縣市垃圾回收清除車數量 = log(各縣市垃圾回收清除車數量),
log_焚化爐量能 = log(焚化爐量能),
log_人口 = log(人口))
ord_data |>
pivot_longer(cols = c(log_各縣市垃圾回收清除車數量, log_焚化爐量能, log_人口),
names_to = "variable", values_to = "value") |>
ggplot(aes(x = value, y = log_一般廢棄物產生量, color = variable)) +
geom_point(size = 0.5, alpha = 0.1) +
geom_smooth(method = "lm", se = FALSE) +
labs(x = "predictor", color = NULL) +
theme(legend.position = "bottom")linear regression
ord_data <- ord_data |>
select(log_一般廢棄物產生量,log_各縣市垃圾回收清除車數量,log_焚化爐量能,log_人口)
set.seed(1234)
ord_data_split <- initial_split(ord_data, prop = 0.8, strata = log_一般廢棄物產生量) # data splitting
ord_data_train <- training(ord_data_split) # training data
ord_data_test <- testing(ord_data_split) # testing data
#set recipe & model
log_ord_recipe <- recipe(log_一般廢棄物產生量 ~ ., data = ord_data_train)
model_ord_lm <- linear_reg() |> set_engine("lm")
workflow_ord_lm <- workflow() |> add_recipe(log_ord_recipe) |> add_model(model_ord_lm)
#model fitting
fit_ord_lm <- workflow_ord_lm |> fit(ord_data_train) # fit the model to the training data
fit_ord_lm |> tidy()# A tibble: 4 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.684 0.400 1.71 8.82e- 2
2 log_各縣市垃圾回收清除車數量 0.180 0.0626 2.87 4.48e- 3
3 log_焚化爐量能 0.00626 0.0371 0.169 8.66e- 1
4 log_人口 0.792 0.0692 11.5 1.06e-24
#get the importance of each predictors
fit_ord_lm |> vip() result_ord_lm <- fit_ord_lm |> augment(ord_data_test)
metrics(result_ord_lm, truth = log_一般廢棄物產生量, estimate = .pred) # performance metrics# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.200
2 rsq standard 0.956
3 mae standard 0.157
#predicted VS. observed outcomes#
result_ord_lm |>
ggplot(aes(x = log_一般廢棄物產生量, y = .pred)) +
geom_abline(linetype = "dashed") +
geom_point(alpha = 0.2) +
labs(x = "Observed outcome", y = "Predicted outcome") +
coord_obs_pred()model2_資源回收比例linear regression
解釋變數之間的關係圖
data |> select(資源回收比例,高教育程度比例,環保稽查次數,各縣市垃圾回收清除車數量,專用垃圾袋) |>
ggpairs()data |>
mutate(log_環保稽查次數 = log(環保稽查次數),log_各縣市垃圾回收清除車數量 = log(各縣市垃圾回收清除車數量)) |>
select(資源回收比例,高教育程度比例,log_環保稽查次數,log_各縣市垃圾回收清除車數量,專用垃圾袋) |>
ggpairs()觀察變數間的線性關係
recycle_data <- data |> mutate(log_環保稽查次數 = log(環保稽查次數),log_各縣市垃圾回收清除車數量 = log(各縣市垃圾回收清除車數量))
recycle_data |>
pivot_longer(cols = c(高教育程度比例, log_環保稽查次數, log_各縣市垃圾回收清除車數量, 專用垃圾袋),
names_to = "variable", values_to = "value") |>
ggplot(aes(x = value, y = 資源回收比例, color = variable)) +
geom_point(size = 0.5, alpha = 0.1) +
geom_smooth(method = "lm", se = FALSE) +
labs(x = "predictor", color = NULL) +
theme(legend.position = "bottom")linear regression
recycle_data <- recycle_data |>
select(資源回收比例, 高教育程度比例, log_環保稽查次數, log_各縣市垃圾回收清除車數量, 專用垃圾袋)
set.seed(1234)
recycle_data_split <- initial_split(recycle_data, prop = 0.8, strata = 資源回收比例)
recycle_data_train <- training(recycle_data_split)
recycle_data_test <- testing(recycle_data_split)
#set recipe & workflow
recycle_recipe <- recipe(資源回收比例 ~ ., data = recycle_data_train)
model_recycle_lm <- linear_reg() |> set_engine("lm")
workflow_recycle_lm <- workflow() |>
add_recipe(recycle_recipe) |>
add_model(model_recycle_lm)
#model fitting
fit_recycle_lm <- workflow_recycle_lm |> fit(recycle_data_train)
fit_recycle_lm |> tidy()# A tibble: 5 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.245 0.0370 6.61 1.43e-10
2 高教育程度比例 1.42 0.0595 23.9 4.97e-76
3 log_環保稽查次數 -0.0162 0.00477 -3.40 7.62e- 4
4 log_各縣市垃圾回收清除車數量 0.0147 0.00801 1.83 6.75e- 2
5 專用垃圾袋 -0.00826 0.0199 -0.416 6.78e- 1
#get the importance of each variables
fit_recycle_lm |> vip() (result_recycle_lm <- fit_recycle_lm |> augment(recycle_data_test))# A tibble: 94 × 7
.pred .resid 資源回收比例 高教育程度比例 log_環保稽查次數
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0.271 -0.109 0.163 0.0827 10.5
2 0.253 -0.143 0.109 0.0697 10.6
3 0.251 -0.0794 0.172 0.0674 10.1
4 0.306 -0.117 0.188 0.111 10.6
5 0.494 -0.136 0.357 0.250 12.3
6 0.295 -0.0188 0.276 0.0997 10.7
7 0.274 -0.0986 0.176 0.0999 12.0
8 0.279 -0.0158 0.263 0.0914 10.8
9 0.270 -0.0871 0.183 0.0865 11.1
10 0.289 -0.111 0.178 0.0921 10.7
# ℹ 84 more rows
# ℹ 2 more variables: log_各縣市垃圾回收清除車數量 <dbl>, 專用垃圾袋 <dbl>
metrics(result_recycle_lm, truth = 資源回收比例, estimate = .pred)# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.0912
2 rsq standard 0.653
3 mae standard 0.0745
#predicted outcomes VS. observed outcomes
result_recycle_lm |>
ggplot(aes(x = 資源回收比例, y = .pred)) +
geom_abline(linetype = "dashed") +
geom_point(alpha = 0.2) +
labs(x = "觀察值", y = "預測結果") +
coord_obs_pred()model3_資源回收比例random forest
random forest
model_recycle_rf <- rand_forest(tree = 1000, min_n = 5, mtry = 2) |>
set_engine("ranger", importance = "impurity") |>
set_mode("regression")
wflow_recycle_rf <- workflow() |> add_model(model_recycle_rf) |> add_formula(資源回收比例 ~ .)
fit_recycle_rf <- wflow_recycle_rf |> fit(recycle_data_train)
#get the importance of each variables & outcomes
fit_recycle_rf |> vip()res_recycle_rf <- fit_recycle_rf |> augment(recycle_data_test)
metrics(res_recycle_rf, truth = 資源回收比例, estimate = .pred)# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.0751
2 rsq standard 0.770
3 mae standard 0.0536
#predicted VS. observed outcomes
res_recycle_rf |>
ggplot(aes(x = 資源回收比例, y = .pred)) +
geom_abline(linetype = "dashed") +
geom_point(alpha = 0.2) +
labs(x = "Observed", y = "Predicted") +
coord_obs_pred()model4_資源回收比例Hyperparameter Tuning
Hyperparameter Tuning
model_recycle_ht <- rand_forest(tree = 1000, min_n = tune(), mtry = tune()) |>
set_engine("ranger", importance = "impurity") |>
set_mode("regression")
wflow_recycle_ht <- workflow() |> add_model(model_recycle_ht) |> add_formula(資源回收比例 ~ .)
#settings
set.seed(456)
(folds <- vfold_cv(recycle_data_train, v = 5))# 5-fold cross-validation
# A tibble: 5 × 2
splits id
<list> <chr>
1 <split [294/74]> Fold1
2 <split [294/74]> Fold2
3 <split [294/74]> Fold3
4 <split [295/73]> Fold4
5 <split [295/73]> Fold5
(grids <- grid_regular(min_n(range = c(2, 10)),mtry(range = c(2, 6)),levels = 5))# A tibble: 25 × 2
min_n mtry
<int> <int>
1 2 2
2 4 2
3 6 2
4 8 2
5 10 2
6 2 3
7 4 3
8 6 3
9 8 3
10 10 3
# ℹ 15 more rows
cl <- makePSOCKcluster(detectCores() - 1)
registerDoParallel(cl)
set.seed(789)
tune <- wflow_recycle_ht |>
tune_grid(resamples = folds, grid = grids)
tune# Tuning results
# 5-fold cross-validation
# A tibble: 5 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [294/74]> Fold1 <tibble [50 × 6]> <tibble [10 × 3]>
2 <split [294/74]> Fold2 <tibble [50 × 6]> <tibble [10 × 3]>
3 <split [294/74]> Fold3 <tibble [50 × 6]> <tibble [10 × 3]>
4 <split [295/73]> Fold4 <tibble [50 × 6]> <tibble [10 × 3]>
5 <split [295/73]> Fold5 <tibble [50 × 6]> <tibble [10 × 3]>
There were issues with some computations:
- Warning(s) x25: 5 columns were requested but there were 4 predictors in the data....
- Warning(s) x25: 6 columns were requested but there were 4 predictors in the data....
Run `show_notes(.Last.tune.result)` for more information.
stopCluster(cl)
(best <- tune |> select_best(metric = "rmse"))# A tibble: 1 × 3
mtry min_n .config
<int> <int> <chr>
1 2 2 Preprocessor1_Model01
(model_best <- finalize_model(model_recycle_ht, best))Random Forest Model Specification (regression)
Main Arguments:
mtry = 2
trees = 1000
min_n = 2
Engine-Specific Arguments:
importance = impurity
Computational engine: ranger
fit_recycle_ht <- workflow() |> add_model(model_best) |>
add_formula(資源回收比例 ~ .) |> fit(recycle_data_train)
#get the importance of each variables
fit_recycle_ht |> vip()(res_recycle_ht <- fit_recycle_ht |> augment(recycle_data_test))# A tibble: 94 × 6
.pred 資源回收比例 高教育程度比例 log_環保稽查次數 log_各縣市垃圾回收清除車…¹
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0.187 0.163 0.0827 10.5 5.37
2 0.187 0.109 0.0697 10.6 5.51
3 0.222 0.172 0.0674 10.1 5.04
4 0.242 0.188 0.111 10.6 5.09
5 0.449 0.357 0.250 12.3 6.79
6 0.267 0.276 0.0997 10.7 5.54
7 0.220 0.176 0.0999 12.0 5.55
8 0.212 0.263 0.0914 10.8 5.41
9 0.189 0.183 0.0865 11.1 5.62
10 0.177 0.178 0.0921 10.7 5.91
# ℹ 84 more rows
# ℹ abbreviated name: ¹log_各縣市垃圾回收清除車數量
# ℹ 1 more variable: 專用垃圾袋 <dbl>
metrics(res_recycle_ht, truth = 資源回收比例, estimate = .pred)# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.0730
2 rsq standard 0.782
3 mae standard 0.0520
#predicted VS. observed outcomes
res_recycle_ht |>
ggplot(aes(x = 資源回收比例, y = .pred)) +
geom_abline(linetype = "dashed") +
geom_point(alpha = 0.2) +
labs(x = "Observed", y = "Predicted") +
coord_obs_pred()