library(tidyverse)
## Warning: package 'stringr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 1. Load the data ----
# Experiment log; column types are guessed by readr from the file contents.
test_table <- readr::read_csv(file = "test_table.csv")
## Rows: 100000 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): date, device
## dbl (3): user_id, test, purchase_amount
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# User attributes, keyed by user_id; joined onto the experiment log later.
user_table <- readr::read_csv(file = "user_table.csv")
## Rows: 20000 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): gender, service, country
## dbl (2): user_id, age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the column types and leading values of the experiment log.
test_table %>% glimpse()
## Rows: 100,000
## Columns: 5
## $ user_id <dbl> 1983, 4704, 14995, 15089, 17522, 186, 6654, 935, 16459…
## $ date <chr> "2017/1/1", "2017/1/1", "2017/1/1", "2017/1/1", "2017/…
## $ device <chr> "pc_web", "android_app", "ios_app", "pc_web", "pc_web"…
## $ test <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ purchase_amount <dbl> 33.7, 30.2, 37.2, 49.8, 19.7, 33.6, 45.1, 12.2, 34.4, …
# Inspect the column types and leading values of the user attributes.
user_table %>% glimpse()
## Rows: 20,000
## Columns: 5
## $ user_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,…
## $ gender <chr> "male", "male", "female", "male", "male", "female", "female", …
## $ service <chr> "golden", "premium", "premium", "golden", "normal", "golden", …
## $ age <dbl> 20, 53, 44, 44, 20, 26, 17, 29, 42, 26, 19, 21, 48, 26, 33, 27…
## $ country <chr> "US", "US", "TW", "UK", "US", "UK", "UK", "US", "US", "US", "U…
# 2. Merge the data ----
# Attach the user attributes to every experiment row. Each experiment row must
# match at most one user: `relationship = "many-to-one"` makes the join fail
# loudly if user_table ever contains a duplicated user_id, instead of silently
# duplicating rows and inflating the sample size used by the tests below.
full_table <- test_table %>%
  left_join(user_table, by = "user_id", relationship = "many-to-one")
# Confirm the row count is unchanged and the user columns were added.
glimpse(full_table)
## Rows: 100,000
## Columns: 9
## $ user_id <dbl> 1983, 4704, 14995, 15089, 17522, 186, 6654, 935, 16459…
## $ date <chr> "2017/1/1", "2017/1/1", "2017/1/1", "2017/1/1", "2017/…
## $ device <chr> "pc_web", "android_app", "ios_app", "pc_web", "pc_web"…
## $ test <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ purchase_amount <dbl> 33.7, 30.2, 37.2, 49.8, 19.7, 33.6, 45.1, 12.2, 34.4, …
## $ gender <chr> "male", "male", "male", "male", "male", "female", "fem…
## $ service <chr> "golden", "golden", "golden", "golden", "normal", "gol…
## $ age <dbl> 37, 33, 31, 38, 35, 50, 37, 39, 50, 22, 34, 14, 29, 40…
## $ country <chr> "US", "UK", "US", "US", "UK", "BR", "US", "BR", "US", …
# 3. t-test ----
# Compare mean purchase_amount between the two values of `test`.
# var.equal = FALSE is the default (Welch's test); it is spelled out here so
# the unequal-variance assumption is visible to the reader.
t.test(purchase_amount ~ test, data = full_table, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: purchase_amount by test
## t = -100.88, df = 99961, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -7.363126 -7.082470
## sample estimates:
## mean in group 0 mean in group 1
## 36.15099 43.37379
# 4. ANOVA ----
# One-way ANOVA of purchase_amount across the levels of `device`.
model_anova <- aov(formula = purchase_amount ~ device, data = full_table)
# Print the ANOVA table (Df, sums of squares, F statistic, p-value).
summary(model_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## device 2 467742 233871 1713 <2e-16 ***
## Residuals 99997 13651494 137
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-hoc pairwise device comparisons at the default 95% family-wise level.
TukeyHSD(model_anova, conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = purchase_amount ~ device, data = full_table)
##
## $device
## diff lwr upr p adj
## ios_app-android_app 5.905261 5.668809 6.141713 0
## pc_web-android_app 3.946331 3.709934 4.182729 0
## pc_web-ios_app -1.958930 -2.152823 -1.765037 0
# 5. Visualize the result: purchase amount differences across devices ----
library(ggplot2)
ggplot(full_table, aes(x = device, y = purchase_amount, fill = device)) +
  # Bar chart of the mean purchase amount per device.
  stat_summary(fun = mean, geom = "bar", alpha = 0.8) +
  # Error bars showing an approximate 95% confidence interval of the mean.
  # mean_se() defaults to mean +/- 1 standard error (mult = 1), which is not a
  # confidence interval, so scale it by the normal 97.5% quantile (~1.96).
  stat_summary(
    fun.data = mean_se,
    fun.args = list(mult = qnorm(0.975)),
    geom = "errorbar",
    width = 0.2
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") + # qualitative ColorBrewer palette
  labs(
    title = "各裝置平均消費金額比較 (Average Purchase by Device)",
    subtitle = "經 ANOVA 檢定證實:iOS 用戶消費力顯著最高",
    x = "使用裝置 (Device)",
    y = "平均消費金額 (USD)"
  )
# 6. Try other variables ----
# Re-inspect the merged table to pick another grouping column.
full_table %>% glimpse()
## Rows: 100,000
## Columns: 9
## $ user_id <dbl> 1983, 4704, 14995, 15089, 17522, 186, 6654, 935, 16459…
## $ date <chr> "2017/1/1", "2017/1/1", "2017/1/1", "2017/1/1", "2017/…
## $ device <chr> "pc_web", "android_app", "ios_app", "pc_web", "pc_web"…
## $ test <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, …
## $ purchase_amount <dbl> 33.7, 30.2, 37.2, 49.8, 19.7, 33.6, 45.1, 12.2, 34.4, …
## $ gender <chr> "male", "male", "male", "male", "male", "female", "fem…
## $ service <chr> "golden", "golden", "golden", "golden", "normal", "gol…
## $ age <dbl> 37, 33, 31, 38, 35, 50, 37, 39, 50, 22, 34, 14, 29, 40…
## $ country <chr> "US", "UK", "US", "US", "UK", "BR", "US", "BR", "US", …
# One-way ANOVA of purchase_amount across the levels of `country`.
model_country <- aov(formula = purchase_amount ~ country, data = full_table)
# Print the ANOVA table (Df, sums of squares, F statistic, p-value).
summary(model_country)
## Df Sum Sq Mean Sq F value Pr(>F)
## country 4 6728442 1682110 22758 <2e-16 ***
## Residuals 99995 7390794 74
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-hoc pairwise country comparisons at the default 95% family-wise level.
TukeyHSD(model_country, conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = purchase_amount ~ country, data = full_table)
##
## $country
## diff lwr upr p adj
## JP-BR 19.9135755 19.5058423 20.3213086 0.0000000
## TW-BR 10.9160206 10.4465844 11.3854568 0.0000000
## UK-BR 10.6705596 10.2982537 11.0428654 0.0000000
## US-BR 26.6288791 26.2822400 26.9755182 0.0000000
## TW-JP -8.9975548 -9.4032526 -8.5918570 0.0000000
## UK-JP -9.2430159 -9.5308214 -8.9552105 0.0000000
## US-JP 6.7153036 6.4615734 6.9690338 0.0000000
## UK-TW -0.2454611 -0.6155368 0.1246146 0.3680842
## US-TW 15.7128584 15.3686157 16.0571012 0.0000000
## US-UK 15.9583195 15.7666314 16.1500077 0.0000000
綜合 t 檢定與 ANOVA(變異數分析)的結果,我們得出以下關鍵結論: