加载宏包
# Load libraries
library(tidyverse) # Easily Install and Load the 'Tidyverse'
library(modelr) # Modelling Functions that Work with the Pipe
# library(showtext) # Using Fonts More Easily in R Graphs
# showtext_auto()
建模方法
以mtcars数据为例。
AMC Javelin |
15.2 |
8 |
304.0 |
150 |
3.15 |
3.435 |
17.30 |
0 |
0 |
3 |
2 |
Fiat 128 |
32.4 |
4 |
78.7 |
66 |
4.08 |
2.200 |
19.47 |
1 |
1 |
4 |
1 |
Hornet 4 Drive |
21.4 |
6 |
258.0 |
110 |
3.08 |
3.215 |
19.44 |
1 |
0 |
3 |
1 |
Merc 280 |
19.2 |
6 |
167.6 |
123 |
3.92 |
3.440 |
18.30 |
1 |
0 |
4 |
4 |
Hornet Sportabout |
18.7 |
8 |
360.0 |
175 |
3.15 |
3.440 |
17.02 |
0 |
0 |
3 |
2 |
Dodge Challenger |
15.5 |
8 |
318.0 |
150 |
2.76 |
3.520 |
16.87 |
0 |
0 |
3 |
2 |
Valiant |
18.1 |
6 |
225.0 |
105 |
2.76 |
3.460 |
20.22 |
1 |
0 |
3 |
1 |
Honda Civic |
30.4 |
4 |
75.7 |
52 |
4.93 |
1.615 |
18.52 |
1 |
1 |
4 |
2 |
Merc 450SE |
16.4 |
8 |
275.8 |
180 |
3.07 |
4.070 |
17.40 |
0 |
0 |
3 |
3 |
Toyota Corolla |
33.9 |
4 |
71.1 |
65 |
4.22 |
1.835 |
19.90 |
1 |
1 |
4 |
1 |
变量概况
mtcars %>%
select(mpg, wt) %>%
# report::report(median = TRUE, range = FALSE) %>%
report::report_table()
2 |
mpg |
32 |
20.09062 |
6.0269481 |
19.200 |
5.4114900 |
10.400 |
33.900 |
0.6723771 |
-0.0220063 |
0 |
1 |
wt |
32 |
3.21725 |
0.9784574 |
3.325 |
0.7672455 |
1.513 |
5.424 |
0.4659161 |
0.4165947 |
0 |
# report::table_long(r)
总体数据建模
模型设定
library(equatiomatic)
mod <- lm(mpg ~ wt, mtcars)
extract_eq(mod, wrap = T)
## $$
## \begin{aligned}
## \operatorname{mpg} &= \alpha + \beta_{1}(\operatorname{wt}) + \epsilon
## \end{aligned}
## $$
拟以车自重wt为自变量,油耗mpg为因变量进行线性拟合。方程为: \[
\operatorname{mpg} = \alpha + \beta_{1}(\operatorname{wt}) + \epsilon
\]
拟合后的实际方程应为: \[
\operatorname{mpg} = 37.29 - 5.34(\operatorname{wt}) + \epsilon
\]
模型参数
查看模型系数估计值
- 含term, estimate, std.error, statistic, p.value等参数
## 注意data = . 与 data = cur_data()的区别
#### 无分组时一致,有分组时后者为分组后的各子集
mtcars %>%
as_tibble() %>%
summarise(
## 以下四个表达式等价
# lm(mpg ~ wt, data = cur_data()) %>% broom::tidy()
# lm(mpg ~ wt, data = .) %>% broom::tidy()
# lm(mpg ~ wt) %>% broom::tidy()
broom::tidy(lm(mpg ~ wt), conf.int = T)
)
(Intercept) |
37.285126 |
1.877627 |
19.857575 |
0 |
33.450500 |
41.119753 |
wt |
-5.344472 |
0.559101 |
-9.559044 |
0 |
-6.486308 |
-4.202635 |
#或者
mtcars %>%
lm(mpg ~ wt, data = .) %>%
broom::tidy(conf.int = T)
(Intercept) |
37.285126 |
1.877627 |
19.857575 |
0 |
33.450500 |
41.119753 |
wt |
-5.344472 |
0.559101 |
-9.559044 |
0 |
-6.486308 |
-4.202635 |
- 查看摘要统计量
- 含r.squared, adj.r.squared, sigma, statistic, p.value, df, logLik, AIC, BIC, deviance, df.residual, nobs等参数
mtcars %>%
lm(mpg ~ wt, data = .) %>%
broom::glance()
0.7528328 |
0.7445939 |
3.045882 |
91.37533 |
0 |
1 |
-80.01471 |
166.0294 |
170.4266 |
278.3219 |
30 |
32 |
- 查看拟合结果残差等
- 含.rownames, mpg, wt, .fitted, .resid, .hat, .sigma, .cooksd, .std.resid等参数
mtcars %>%
lm(mpg ~ wt, data = .) %>%
broom::augment() %>%
slice_head(n = 6)
Mazda RX4 |
21.0 |
2.620 |
23.28261 |
-2.2826106 |
0.0432690 |
3.067494 |
0.0132741 |
-0.7661677 |
Mazda RX4 Wag |
21.0 |
2.875 |
21.91977 |
-0.9197704 |
0.0351968 |
3.093068 |
0.0017240 |
-0.3074305 |
Datsun 710 |
22.8 |
2.320 |
24.88595 |
-2.0859521 |
0.0583757 |
3.072127 |
0.0154394 |
-0.7057525 |
Hornet 4 Drive |
21.4 |
3.215 |
20.10265 |
1.2973499 |
0.0312502 |
3.088268 |
0.0030206 |
0.4327511 |
Hornet Sportabout |
18.7 |
3.440 |
18.90014 |
-0.2001440 |
0.0329218 |
3.097722 |
0.0000760 |
-0.0668188 |
Valiant |
18.1 |
3.460 |
18.79325 |
-0.6932545 |
0.0332355 |
3.095184 |
0.0009211 |
-0.2314831 |
模型检验及可视化
可视化
library(statsExpressions)
mtcars %>%
ggplot(aes(wt, mpg))+
geom_point()+
geom_smooth(method = "lm", formula = "y ~ x")+
ggpmisc::stat_poly_eq(formula = "y ~ x",
aes(label = paste(..eq.label.., ..rr.label.., ..p.value.label..,
sep = "*\", \"*")),
parse = TRUE,
label.x.npc = "right",
label.y.npc = "top",
size = 5)+
labs(title = "32种汽车自重与油耗关系图",
subtitle = expr_corr_test(mtcars, mpg, wt, type = "parametric"),
x = "汽车自重(千磅)",
y = "油耗(英哩/加仑)")+
theme_classic()

分组批量建模
可视化
mtcars %>%
mutate(cyl = as.factor(cyl)) %>%
ggplot(aes(wt, mpg, group = cyl, color = cyl))+
geom_point()+
geom_smooth(method = "lm", formula = "y ~ x")+
ggpmisc::stat_poly_eq(formula = "y ~ x",
aes(label = paste(..eq.label.., ..rr.label.., ..p.value.label..,
sep = "*\", \"*")),
parse = TRUE,
label.x.npc = "right",
label.y.npc = "top",
size = 3)+
labs(title = "32种汽车自重与油耗关系图",
x = "汽车自重(千磅)",
y = "油耗(英哩/加仑)",
color = "汽缸数")+
theme_classic()

Method1: group_by + summarise
mtcars %>%
group_by(cyl) %>%
summarise(
#以下三种表达等价
# lm(mpg ~ wt, data = cur_data()) %>% broom::tidy()
# lm(mpg ~ wt) %>% broom::tidy()
broom::tidy(lm(mpg ~ wt))
## 该表达产生总体建模效果
# lm(mpg ~ wt, data = .) %>% broom::tidy()
)
4 |
(Intercept) |
39.571196 |
4.3465820 |
9.103980 |
0.0000078 |
4 |
wt |
-5.647025 |
1.8501185 |
-3.052250 |
0.0137428 |
6 |
(Intercept) |
28.408845 |
4.1843688 |
6.789278 |
0.0010548 |
6 |
wt |
-2.780106 |
1.3349173 |
-2.082605 |
0.0917577 |
8 |
(Intercept) |
23.868029 |
3.0054619 |
7.941551 |
0.0000041 |
8 |
wt |
-2.192438 |
0.7392393 |
-2.965803 |
0.0117928 |
Method2: nest_by + summarise
## Method5
mtcars %>%
nest_by(cyl) %>%
summarise(
#注意这里使用的data与group_by 不同
broom::tidy(lm(mpg ~ wt, data = data))
)
4 |
(Intercept) |
39.571196 |
4.3465820 |
9.103980 |
0.0000078 |
4 |
wt |
-5.647025 |
1.8501185 |
-3.052250 |
0.0137428 |
6 |
(Intercept) |
28.408845 |
4.1843688 |
6.789278 |
0.0010548 |
6 |
wt |
-2.780106 |
1.3349173 |
-2.082605 |
0.0917577 |
8 |
(Intercept) |
23.868029 |
3.0054619 |
7.941551 |
0.0000041 |
8 |
wt |
-2.192438 |
0.7392393 |
-2.965803 |
0.0117928 |
Method3: group_by + group_modify + ~
mtcars %>%
group_by(cyl) %>%
group_modify(
# 以下两种等价
# ~ lm(mpg ~ wt, data = .) %>% broom::tidy()
~broom::tidy(lm(mpg ~ wt, data = .))
## cur_data()只能用于dplyr verbs,如mutate, summrise等
# ~ broom::tidy(lm(mpg ~ wt, data = cur_data()))
)
4 |
(Intercept) |
39.571196 |
4.3465820 |
9.103980 |
0.0000078 |
4 |
wt |
-5.647025 |
1.8501185 |
-3.052250 |
0.0137428 |
6 |
(Intercept) |
28.408845 |
4.1843688 |
6.789278 |
0.0010548 |
6 |
wt |
-2.780106 |
1.3349173 |
-2.082605 |
0.0917577 |
8 |
(Intercept) |
23.868029 |
3.0054619 |
7.941551 |
0.0000041 |
8 |
wt |
-2.192438 |
0.7392393 |
-2.965803 |
0.0117928 |
Method4: group_nest + mutata + map + ~ + unnest
mtcars %>%
group_nest(cyl) %>%
mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
select(cyl, result) %>%
unnest(result)
4 |
(Intercept) |
39.571196 |
4.3465820 |
9.103980 |
0.0000078 |
4 |
wt |
-5.647025 |
1.8501185 |
-3.052250 |
0.0137428 |
6 |
(Intercept) |
28.408845 |
4.1843688 |
6.789278 |
0.0010548 |
6 |
wt |
-2.780106 |
1.3349173 |
-2.082605 |
0.0917577 |
8 |
(Intercept) |
23.868029 |
3.0054619 |
7.941551 |
0.0000041 |
8 |
wt |
-2.192438 |
0.7392393 |
-2.965803 |
0.0117928 |
Method5: nest_by + mutate + list + summarise
mtcars %>%
nest_by(cyl) %>%
mutate(model = list(lm(mpg ~ wt, data = data))) %>%
summarise(broom::tidy(model))
4 |
(Intercept) |
39.571196 |
4.3465820 |
9.103980 |
0.0000078 |
4 |
wt |
-5.647025 |
1.8501185 |
-3.052250 |
0.0137428 |
6 |
(Intercept) |
28.408845 |
4.1843688 |
6.789278 |
0.0010548 |
6 |
wt |
-2.780106 |
1.3349173 |
-2.082605 |
0.0917577 |
8 |
(Intercept) |
23.868029 |
3.0054619 |
7.941551 |
0.0000041 |
8 |
wt |
-2.192438 |
0.7392393 |
-2.965803 |
0.0117928 |
小结
基本以下四种形式,推荐使用前两种
mtcars %>%
group_by(cyl) %>%
summarise(broom::tidy(lm(mpg ~ wt)))
mtcars %>%
nest_by(cyl) %>%
summarise(
broom::tidy(lm(mpg ~ wt, data = data)))
mtcars %>%
group_by(cyl) %>%
group_modify(
~ broom::tidy(lm(mpg ~ wt, data = .))
)
mtcars %>%
group_nest(cyl) %>%
mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
select(cyl, result) %>%
tidyr::unnest(result)
预测
提供预测自变量集
pred_wt <- tibble(
##此坑需注意,本变量名须与模型里的自变量一致
wt = as.numeric(seq(1, 10, by = 1))
)
总体模型预测
总体模型参数
pred_t <-
modelr::add_predictions(pred_wt, lm(mpg ~ wt, data = mtcars))
pred_t
1 |
31.9406546 |
2 |
26.5961830 |
3 |
21.2517114 |
4 |
15.9072399 |
5 |
10.5627683 |
6 |
5.2182967 |
7 |
-0.1261748 |
8 |
-5.4706464 |
9 |
-10.8151180 |
10 |
-16.1595896 |
分组预测后可视化
m <- pred_t %>%
ggplot(aes(wt, pred))+
geom_line()+
geom_point(data = mtcars,
aes(wt, mpg))+
theme_classic()
plotly::ggplotly(m)
分组模型预测
分组模型参数
pred_m <- mtcars %>%
group_nest(cyl) %>%
mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
mutate(pred_mpg = purrr::map(model, ~ modelr::add_predictions(pred_wt, .))) %>%
select(cyl, pred_mpg) %>%
unnest(pred_mpg) %>%
filter(pred > 3)
pred_m
4 |
1 |
33.924171 |
4 |
2 |
28.277145 |
4 |
3 |
22.630120 |
4 |
4 |
16.983095 |
4 |
5 |
11.336070 |
4 |
6 |
5.689044 |
6 |
1 |
25.628739 |
6 |
2 |
22.848633 |
6 |
3 |
20.068527 |
6 |
4 |
17.288421 |
6 |
5 |
14.508315 |
6 |
6 |
11.728209 |
6 |
7 |
8.948103 |
6 |
8 |
6.167997 |
6 |
9 |
3.387891 |
8 |
1 |
21.675591 |
8 |
2 |
19.483153 |
8 |
3 |
17.290715 |
8 |
4 |
15.098277 |
8 |
5 |
12.905839 |
8 |
6 |
10.713401 |
8 |
7 |
8.520964 |
8 |
8 |
6.328526 |
8 |
9 |
4.136088 |
分组预测后可视化
m1 <- pred_m %>%
filter(pred > 3) %>%
mutate(cyl = factor(cyl)) %>%
ggplot(aes(wt, pred, group = cyl, color = cyl))+
geom_line()+
geom_point(data = mtcars %>% mutate(cyl = factor(cyl)),
aes(wt, mpg, color = cyl))+
theme_classic()
plotly::ggplotly(m1)
- 鼠标滑到拟合曲线上会互动显示具体值;
- 数据为模拟数据,以例证为主,不具实际意义