library(showtext);library(systemfonts);library(dplyr);library(sjPlot);library(ggplot2)
## Loading required package: sysfonts
## Loading required package: showtextdb
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:sjPlot':
##
## set_theme
# 1. 找到該字體的精確檔案路徑
font_info <- system_fonts() %>%
filter(family == "DFPYuanBold-B5") %>%
slice(1) # 只取第一筆
# 2. 用路徑來新增字體
# font_info$path 會直接指向 C:/Windows/Fonts/...
font_add("df_round", regular = font_info$path)
# 3. 啟動 showtext
showtext_auto()
# 3. 設定 ggplot2 全域字體
ggplot2::theme_set(theme_minimal(base_family = "df_round"))
# 4. 設定 sjPlot 全域字體
sjPlot::set_theme(base = theme_minimal(),
theme.font = "df_round")
但這個模型的建立仰賴於沒有交互作用的假設!! 甚麼是交互作用所謂的交互作用interaction,就是x因為I會對Y有不一樣的影響。 比如說體力(X)會透過穿的鞋子(I)對跑步速度有不一樣的影響,穿到拖鞋你體力怎麼上升都沒用的話,代表體力不能夠等比例的提升跑步速度了,那就是鞋子和體力就有交互作用。
data(mtcars)
df <- mtcars
df$stamina <- factor(df$am, labels = c("Automatic", "Manual"))
#am Transmission (0 = automatic, 1 = manual)
df$weight <- df$wt
df$speed <- df$mpg
#mpg Miles/(US) gallon
library(ggplot2)
ggplot(df, aes(weight, speed, color = stamina))+
geom_point()+
geom_smooth(method ="lm", se = F )+
labs(title = "車子重量對速度 是否受手自排的交互作用?",
x = "車子重量(x)",
y = "速度(y)")
## `geom_smooth()` using formula = 'y ~ x'
可以看到不用做檢定就可以看到有交互作用了 還是做個檢定
simple_model <- lm(speed ~ stamina *weight, data = df)
summary(simple_model)
##
## Call:
## lm(formula = speed ~ stamina * weight, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6004 -1.5446 -0.5325 0.9012 6.0909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.4161 3.0201 10.402 4.00e-11 ***
## staminaManual 14.8784 4.2640 3.489 0.00162 **
## weight -3.7859 0.7856 -4.819 4.55e-05 ***
## staminaManual:weight -5.2984 1.4447 -3.667 0.00102 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.591 on 28 degrees of freedom
## Multiple R-squared: 0.833, Adjusted R-squared: 0.8151
## F-statistic: 46.57 on 3 and 28 DF, p-value: 5.209e-11
首先看到weight, stamina, interaction term三個變項就已經對速度有0.833的Rsquare。 以及每一個變項都有顯著意義。
到這裡要知道,這筆數據的手自排和重量真的有交互作用效果,所以如果你原本要用ANCOVA是不可以的!
如果忽略這個交互作用,強迫手排自排的效果平均,就會高估自排的速度,低估手排的。
library(emmeans) # least square means
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
emtrends(simple_model, ~ stamina, var = 'weight')|>as_tibble()
## # A tibble: 2 × 6
## stamina weight.trend SE df lower.CL upper.CL
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Automatic -3.79 0.786 28 -5.40 -2.18
## 2 Manual -9.08 1.21 28 -11.6 -6.60
# spec 放交互作用項
# var是X
可以知道手排對車子重量很敏感,車子重量一變重就變慢很多,但自排較沒有受影響。
他們的上下界沒有overlap所以是顯著。
complex_model <- lm(speed ~ weight * stamina + cyl + hp, data = df)
summary(complex_model)
##
## Call:
## lm(formula = speed ~ weight * stamina + cyl + hp, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4534 -1.5557 -0.6668 1.2997 5.1000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.22945 3.01690 11.014 2.73e-11 ***
## weight -2.19536 0.84638 -2.594 0.0154 *
## staminaManual 11.26038 3.91986 2.873 0.0080 **
## cyl -0.83651 0.52826 -1.584 0.1254
## hp -0.01246 0.01322 -0.943 0.3545
## weight:staminaManual -3.72353 1.40714 -2.646 0.0136 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.27 on 26 degrees of freedom
## Multiple R-squared: 0.8811, Adjusted R-squared: 0.8582
## F-statistic: 38.52 on 5 and 26 DF, p-value: 3.234e-11
可以看到0.8811的r square。
emtrends(complex_model, ~stamina, var = "weight")|>as_tibble()
## # A tibble: 2 × 6
## stamina weight.trend SE df lower.CL upper.CL
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Automatic -2.20 0.846 26 -3.94 -0.456
## 2 Manual -5.92 1.50 26 -9.01 -2.83
可以看到他們各自的coefficient都變小了,代表剛剛的coef裡面有貢獻by cyl和hp。
car::vif(simple_model)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## stamina weight stamina:weight
## 20.901259 2.728248 15.366853
car::vif(complex_model)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## weight stamina cyl hp weight:stamina
## 4.127380 23.024464 5.356564 4.940686 19.002930
會發現有交互作用項VIF會爆錶(>15)的問題,所以可以用去中心化並且
type = 'predictor'的方式去計算 會漂亮很多。 這個vif
function裡面的 type是拿來看有interaction時的vif => gvif
df$weight_c <- df$weight - mean(df$weight)
model_centered <- lm(speed ~ (weight_c * stamina) + cyl + hp, data = df)
car::vif(model_centered, type = 'predictor')
## GVIFs computed for predictors
## GVIF Df GVIF^(1/(2*Df)) Interacts With Other Predictors
## weight_c 4.228652 3 1.271648 stamina cyl, hp
## stamina 4.228652 3 1.271648 weight_c cyl, hp
## cyl 5.356564 1 2.314425 -- weight_c, stamina, hp
## hp 4.940686 1 2.222765 -- weight_c, stamina, cyl
anova(simple_model, complex_model)
## Analysis of Variance Table
##
## Model 1: speed ~ stamina * weight
## Model 2: speed ~ weight * stamina + cyl + hp
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 28 188.01
## 2 26 133.93 2 54.079 5.2493 0.01216 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ggeffects)
(predictions <- ggpredict(complex_model, terms = c("weight", "stamina")))
## # Predicted values of speed
##
## stamina: Automatic
##
## weight | Predicted | 95% CI
## ---------------------------------
## 1.40 | 23.15 | 19.12, 27.18
## 2.00 | 21.84 | 18.79, 24.89
## 2.80 | 20.08 | 18.22, 21.94
## 3.40 | 18.76 | 17.50, 20.03
## 4.00 | 17.44 | 16.06, 18.83
## 5.40 | 14.37 | 11.03, 17.71
##
## stamina: Manual
##
## weight | Predicted | 95% CI
## ---------------------------------
## 1.40 | 29.20 | 25.41, 32.99
## 2.00 | 25.65 | 23.42, 27.88
## 2.80 | 20.91 | 19.14, 22.69
## 3.40 | 17.36 | 14.23, 20.49
## 4.00 | 13.81 | 8.98, 18.64
## 5.40 | 5.52 | -3.51, 14.56
##
## Adjusted for:
## * cyl = 6.19
## * hp = 146.69
##
## Not all rows are shown in the output. Use `print(..., n = Inf)` to show
## all rows.
A <- plot(predictions) +
labs(title = "add other CF",
x = "Weight",
y = "Speed")+ylim(0,40)
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
(predictions2 <- ggpredict(simple_model, terms = c("weight", "stamina")))
## # Predicted values of speed
##
## stamina: Automatic
##
## weight | Predicted | 95% CI
## ---------------------------------
## 1.40 | 26.12 | 22.11, 30.12
## 2.00 | 23.84 | 20.75, 26.94
## 2.80 | 20.82 | 18.84, 22.79
## 3.40 | 18.54 | 17.19, 19.90
## 4.00 | 16.27 | 15.00, 17.55
## 5.40 | 10.97 | 8.08, 13.87
##
## stamina: Manual
##
## weight | Predicted | 95% CI
## ----------------------------------
## 1.40 | 33.58 | 30.67, 36.49
## 2.00 | 28.13 | 26.33, 29.92
## 2.80 | 20.86 | 19.10, 22.62
## 3.40 | 15.41 | 12.54, 18.27
## 4.00 | 9.96 | 5.75, 14.17
## 5.40 | -2.76 | -10.33, 4.81
##
## Not all rows are shown in the output. Use `print(..., n = Inf)` to show
## all rows.
B <- plot(predictions2) +
labs(title = "only interaction",
x = "Weight",
y = "Speed")+ylim(0,40)
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
library(patchwork)
(B+A)+ plot_layout(guides = 'collect') + theme(legend.position = 'right')