survey_data <- read_csv("survey_data.csv")
internal_data <- read_csv("internal_data.csv")
full_data <- survey_data %>%
left_join(internal_data, by = "user_id") %>%
mutate(# 目標變數:把 0/1 轉成 No/Yes,方便繪圖與解讀
is_loyal = factor(is_loyal, levels = c(0, 1), labels = c("No", "Yes")),
credit_card_vendor = factor(credit_card_vendor),
register_method = factor(register_method),
class = factor(class))
glimpse(full_data)
## Rows: 1,000
## Columns: 21
## $ user_id <chr> "c8c41c4a18675a74e01c8a20e8a0f662", "9f396fe44e7c05…
## $ is_loyal <fct> No, No, Yes, Yes, No, Yes, Yes, No, Yes, Yes, Yes, …
## $ depart_on_time <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, …
## $ arrive_on_time <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, …
## $ register_method <fct> others, others, website, others, others, website, o…
## $ register_rate <dbl> 2, 3, 3, 3, 4, 3, 2, 3, 2, 1, 3, 4, 4, 5, 2, 1, 3, …
## $ class <fct> 3, 3, 2, 3, 3, 1, 3, 3, 3, 1, 2, 3, 2, 3, 2, 2, 3, …
## $ seat_rate <dbl> 1, 3, 3, 2, 2, 4, 2, 2, 3, 3, 3, 2, 2, 2, 4, 4, 2, …
## $ meal_rate <dbl> 1, 1, 2, 2, 2, 3, 3, 1, 2, 4, 4, 2, 2, 2, 4, 3, 2, …
## $ flight_rate <dbl> 1, 3, 2, 4, 4, 2, 1, 2, 3, 1, 2, 2, 4, 1, 2, 4, 4, …
## $ package_rate <dbl> 2, 2, 3, 3, 3, 3, 2, 1, 2, 3, 3, 2, 3, 3, 4, 4, 2, …
## $ tv_ad <dbl> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, …
## $ youtube_ad_1 <dbl> 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ youtube_ad_2 <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
## $ youtube_ad_3 <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, …
## $ dm_message <dbl> 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, …
## $ dm_post <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, …
## $ dm_email <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, …
## $ credit_card_vendor <fct> Vendor C, Vendor B, Vendor B, Vendor C, Vendor B, V…
## $ credit_card_bonus <dbl> 1, 1, 1, 3, 1, 3, 3, 1, 3, 3, 1, 2, 3, 3, 3, 1, 3, …
## $ coupon <dbl> 65.7, 23.5, 43.6, 119.3, 196.1, 10.1, 144.7, 104.9,…
# 用 na.rm = TRUE 確保若有缺漏值不會報錯
ggplot(full_data, aes(x = is_loyal, y = coupon, fill = is_loyal)) +
geom_boxplot(alpha = 0.7) +
theme_minimal() +
scale_fill_manual(values = c("gray70", "steelblue")) +
labs(
title = "折扣金額對客戶忠誠度的影響",
x = "是否忠誠 (Loyal)",
y = "折扣金額 (Coupon Value)"
)
# 3. 建立模型 ## 關鍵變數: ## 服務滿意度:seat_rate, meal_rate ##
準點狀況:depart_on_time ## 行銷誘因:coupon, credit_card_vendor
model_logit <- glm(is_loyal ~ seat_rate + meal_rate + depart_on_time + coupon + credit_card_vendor, data = full_data, family = "binomial")
model_summary <- tidy(model_logit)
knitr::kable(model_summary, digits = 4, caption = "邏輯迴歸模型係數表")
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -5.0082 | 0.4543 | -11.0252 | 0.0000 |
| seat_rate | 0.8403 | 0.1057 | 7.9514 | 0.0000 |
| meal_rate | 0.5877 | 0.1159 | 5.0712 | 0.0000 |
| depart_on_time | 1.5605 | 0.1778 | 8.7754 | 0.0000 |
| coupon | 0.0085 | 0.0015 | 5.7847 | 0.0000 |
| credit_card_vendorVendor B | -0.2679 | 0.1992 | -1.3452 | 0.1785 |
| credit_card_vendorVendor C | 0.1628 | 0.2119 | 0.7683 | 0.4423 |
# 計算 Odds Ratio 與 信賴區間
model_interpretation <- tidy(model_logit, conf.int = TRUE) %>%
mutate(
Odds_Ratio = exp(estimate), # 還原係數為勝率比
Conf_Low = exp(conf.low), # 信賴區間下限
Conf_High = exp(conf.high), # 信賴區間上限
Impact_Pct = (Odds_Ratio - 1) * 100 # 轉換成百分比影響力
) %>%
select(term, Odds_Ratio, Impact_Pct, p.value, Conf_Low, Conf_High) %>%
filter(term != "(Intercept)") # 移除截距項,只看變數
model_interpretation %>%
filter(p.value < 0.05) %>%
arrange(desc(Odds_Ratio)) %>% # 依影響力排序
knitr::kable(digits = 2, caption = "顯著變數的商業影響力分析")
| term | Odds_Ratio | Impact_Pct | p.value | Conf_Low | Conf_High |
|---|---|---|---|---|---|
| depart_on_time | 4.76 | 376.14 | 0 | 3.37 | 6.78 |
| seat_rate | 2.32 | 131.70 | 0 | 1.89 | 2.86 |
| meal_rate | 1.80 | 79.98 | 0 | 1.44 | 2.26 |
| coupon | 1.01 | 0.86 | 0 | 1.01 | 1.01 |
# 1. 預測:算出每個人的忠誠機率
full_data_pred <- full_data %>%
mutate(
pred_prob = predict(model_logit, type = "response"), # 預測機率
pred_class = ifelse(pred_prob > 0.5, "Yes", "No") # 設定門檻 0.5
)
# 2. 建立混淆矩陣
conf_matrix <- table(Predicted = full_data_pred$pred_class, Actual = full_data$is_loyal)
# 顯示矩陣
conf_matrix
## Actual
## Predicted No Yes
## No 206 95
## Yes 163 536
# 3. 計算準確度
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("模型整體準確度 (Accuracy):", round(accuracy * 100, 2), "%")
## 模型整體準確度 (Accuracy): 74.2 %
ggplot(full_data_pred, aes(x = coupon, y = pred_prob)) +
# 畫出每個客戶的預測點 (藍色)
geom_point(alpha = 0.3, color = "steelblue") +
# 畫出趨勢線 (S-Curve)
geom_smooth(method = "glm", method.args = list(family = "binomial"),
color = "darkred", se = FALSE) +
theme_minimal() +
labs(
title = "折扣金額與客戶忠誠度機率之關係",
subtitle = "呈現邏輯迴歸經典的 S 型曲線 (Logistic Sigmoid)",
x = "折扣金額 (USD)",
y = "預測忠誠機率 (Predicted Probability)"
) +
scale_y_continuous(labels = scales::percent) # Y軸顯示百分比
# 預測機率
full_data_pred <- full_data %>%
mutate(pred_prob = predict(model_logit, type = "response"))
ggplot(full_data_pred, aes(x = coupon, y = pred_prob, color = factor(depart_on_time))) +
geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE, size = 1.5) +
theme_minimal() +
scale_color_manual(values = c("gray", "red"), labels = c("誤點 (0)", "準點 (1)")) +
labs(
title = "關鍵洞察:準點的影響力遠大於折扣",
subtitle = "紅色線 (準點) 的起始機率遠高於灰色線 (誤點),即使折扣增加,誤點的挽回效果也有限。",
x = "折扣金額 (Coupon Value)",
y = "預測忠誠機率",
color = "出發準點與否"
)
# 7.總結 ##
根據邏輯迴歸模型的分析結果,我們得出以下關鍵結論:準點率是核心關鍵:depart_on_time
的勝率比 (Odds Ratio) 高達 4.76,是所有變數中影響力最強的。 ##
折扣效果有限:雖然折扣金額 (coupon) 統計上顯著,但其邊際效益極低 (OR
\(\approx\) 1.01)。 ##
策略建議:建議公司將行銷預算轉移至營運優化,優先確保航班準點率。對於發生誤點的航班,單純發放小額折價券可能無法有效挽回客戶信心。