1 引言

本文介绍批量建模操作、结果规整、可视化。

2 加载宏包

# Load libraries
library(tidyverse) # Easily Install and Load the 'Tidyverse' 
library(modelr) # Modelling Functions that Work with the Pipe
# library(showtext) # Using Fonts More Easily in R Graphs
# showtext_auto()

3 建模方法

以mtcars数据为例。

	mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
AMC Javelin	15.2	8	304.0	150	3.15	3.435	17.30	0	0	3	2
Fiat 128	32.4	4	78.7	66	4.08	2.200	19.47	1	1	4	1
Hornet 4 Drive	21.4	6	258.0	110	3.08	3.215	19.44	1	0	3	1
Merc 280	19.2	6	167.6	123	3.92	3.440	18.30	1	0	4	4
Hornet Sportabout	18.7	8	360.0	175	3.15	3.440	17.02	0	0	3	2
Dodge Challenger	15.5	8	318.0	150	2.76	3.520	16.87	0	0	3	2
Valiant	18.1	6	225.0	105	2.76	3.460	20.22	1	0	3	1
Honda Civic	30.4	4	75.7	52	4.93	1.615	18.52	1	1	4	2
Merc 450SE	16.4	8	275.8	180	3.07	4.070	17.40	0	0	3	3
Toyota Corolla	33.9	4	71.1	65	4.22	1.835	19.90	1	1	4	1

变量概况

mtcars %>%
  select(mpg, wt) %>% 
  # report::report(median = TRUE, range = FALSE) %>%
  report::report_table()

	Variable	n_Obs	Mean	SD	Median	MAD	Min	Max	Skewness	Kurtosis	n_Missing
2	mpg	32	20.09062	6.0269481	19.200	5.4114900	10.400	33.900	0.6723771	-0.0220063	0
1	wt	32	3.21725	0.9784574	3.325	0.7672455	1.513	5.424	0.4659161	0.4165947	0

  # report::table_long(r)

3.1 总体数据建模

3.1.1 模型设定

library(equatiomatic)
mod <- lm(mpg ~ wt, mtcars)
extract_eq(mod, wrap = T)

## $$
## \begin{aligned}
## \operatorname{mpg} &= \alpha + \beta_{1}(\operatorname{wt}) + \epsilon
## \end{aligned}
## $$

拟以车自重wt为自变量，油耗mpg为因变量进行线性拟合。方程为： \[ \operatorname{mpg} = \alpha + \beta_{1}(\operatorname{wt}) + \epsilon \]

拟合后的实际方程应为： \[ \operatorname{mpg} = 37.29 - 5.34(\operatorname{wt}) + \epsilon \]

3.1.2 模型参数

查看模型系数估计值
- 含term, estimate, std.error, statistic, p.value等参数

## 注意data = . 与 data = cur_data()的区别
#### 无分组时一致，有分组时后者为分组后的各子集

mtcars %>%
  as_tibble() %>% 
  summarise(
    ## 以下四个表达式等价
    # lm(mpg ~ wt, data = cur_data()) %>% broom::tidy()
    # lm(mpg ~ wt, data = .) %>% broom::tidy()
    # lm(mpg ~ wt) %>% broom::tidy()
    broom::tidy(lm(mpg ~ wt), conf.int = T)
  )

term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	37.285126	1.877627	19.857575	0	33.450500	41.119753
wt	-5.344472	0.559101	-9.559044	0	-6.486308	-4.202635

#或者

mtcars %>% 
 lm(mpg ~ wt, data = .) %>% 
 broom::tidy(conf.int = T)

term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	37.285126	1.877627	19.857575	0	33.450500	41.119753
wt	-5.344472	0.559101	-9.559044	0	-6.486308	-4.202635

查看摘要统计量
- 含r.squared, adj.r.squared, sigma, statistic, p.value, df, logLik, AIC, BIC, deviance, df.residual, nobs等参数

mtcars %>% 
  lm(mpg ~ wt, data = .) %>% 
  broom::glance()

r.squared	adj.r.squared	sigma	statistic	p.value	df	logLik	AIC	BIC	deviance	df.residual	nobs
0.7528328	0.7445939	3.045882	91.37533	0	1	-80.01471	166.0294	170.4266	278.3219	30	32

查看拟合结果残差等
- 含.rownames, mpg, wt, .fitted, .resid, .hat, .sigma, .cooksd, .std.resid等参数

mtcars %>% 
  lm(mpg ~ wt, data = .) %>% 
  broom::augment() %>% 
  slice_head(n = 6)

.rownames	mpg	wt	.fitted	.resid	.hat	.sigma	.cooksd	.std.resid
Mazda RX4	21.0	2.620	23.28261	-2.2826106	0.0432690	3.067494	0.0132741	-0.7661677
Mazda RX4 Wag	21.0	2.875	21.91977	-0.9197704	0.0351968	3.093068	0.0017240	-0.3074305
Datsun 710	22.8	2.320	24.88595	-2.0859521	0.0583757	3.072127	0.0154394	-0.7057525
Hornet 4 Drive	21.4	3.215	20.10265	1.2973499	0.0312502	3.088268	0.0030206	0.4327511
Hornet Sportabout	18.7	3.440	18.90014	-0.2001440	0.0329218	3.097722	0.0000760	-0.0668188
Valiant	18.1	3.460	18.79325	-0.6932545	0.0332355	3.095184	0.0009211	-0.2314831

3.1.3 模型检验及可视化

3.1.4 可视化

library(statsExpressions)
mtcars %>% 
  ggplot(aes(wt, mpg))+
  geom_point()+
  geom_smooth(method = "lm", formula = "y ~ x")+
  ggpmisc::stat_poly_eq(formula = "y ~ x",
                        aes(label = paste(..eq.label.., ..rr.label.., ..p.value.label.., 
                                          sep = "*\", \"*")),
                        parse = TRUE,
                        label.x.npc = "right", 
                        label.y.npc = "top",
                        size = 5)+
  labs(title = "32种汽车自重与油耗关系图",
       subtitle = expr_corr_test(mtcars, mpg, wt, type = "parametric"),
       x = "汽车自重(千磅)",
       y = "油耗(英哩/加仑)")+
  theme_classic()

3.2 分组批量建模

3.2.1 可视化

mtcars %>%
  mutate(cyl = as.factor(cyl)) %>% 
  ggplot(aes(wt, mpg, group = cyl, color = cyl))+
  geom_point()+
  geom_smooth(method = "lm", formula = "y ~ x")+
  ggpmisc::stat_poly_eq(formula = "y ~ x",
                        aes(label = paste(..eq.label..,    ..rr.label.., ..p.value.label.., 
                                          sep = "*\", \"*")),
                        parse = TRUE,
                        label.x.npc = "right", 
                        label.y.npc = "top",
                        size = 3)+
  labs(title = "32种汽车自重与油耗关系图",
       x = "汽车自重(千磅)",
       y = "油耗(英哩/加仑)",
       color = "汽缸数")+
  theme_classic()

3.2.2 Method1: group_by + summarise

mtcars %>%
  group_by(cyl) %>%
  summarise(
    #以下三种表达等价
    # lm(mpg ~ wt, data = cur_data()) %>% broom::tidy()
    # lm(mpg ~ wt) %>% broom::tidy()
    broom::tidy(lm(mpg ~ wt))
            ## 该表达产生总体建模效果
            # lm(mpg ~ wt, data = .) %>% broom::tidy()
  )

cyl	term	estimate	std.error	statistic	p.value
4	(Intercept)	39.571196	4.3465820	9.103980	0.0000078
4	wt	-5.647025	1.8501185	-3.052250	0.0137428
6	(Intercept)	28.408845	4.1843688	6.789278	0.0010548
6	wt	-2.780106	1.3349173	-2.082605	0.0917577
8	(Intercept)	23.868029	3.0054619	7.941551	0.0000041
8	wt	-2.192438	0.7392393	-2.965803	0.0117928

3.2.3 Method2: nest_by + summarise

## Method5
mtcars %>%
  nest_by(cyl) %>%
  summarise(
    #注意这里使用的data与group_by 不同
    broom::tidy(lm(mpg ~ wt, data = data))
  )

cyl	term	estimate	std.error	statistic	p.value
4	(Intercept)	39.571196	4.3465820	9.103980	0.0000078
4	wt	-5.647025	1.8501185	-3.052250	0.0137428
6	(Intercept)	28.408845	4.1843688	6.789278	0.0010548
6	wt	-2.780106	1.3349173	-2.082605	0.0917577
8	(Intercept)	23.868029	3.0054619	7.941551	0.0000041
8	wt	-2.192438	0.7392393	-2.965803	0.0117928

3.2.4 Method3: group_by + group_modify + ~

mtcars %>% 
  group_by(cyl) %>% 
  group_modify(
    # 以下两种等价
    # ~ lm(mpg ~ wt, data = .) %>% broom::tidy()
     ~broom::tidy(lm(mpg ~ wt, data = .))
          ## cur_data()只能用于dplyr verbs,如mutate, summrise等
           # ~ broom::tidy(lm(mpg ~ wt, data = cur_data()))
  )

cyl	term	estimate	std.error	statistic	p.value
4	(Intercept)	39.571196	4.3465820	9.103980	0.0000078
4	wt	-5.647025	1.8501185	-3.052250	0.0137428
6	(Intercept)	28.408845	4.1843688	6.789278	0.0010548
6	wt	-2.780106	1.3349173	-2.082605	0.0917577
8	(Intercept)	23.868029	3.0054619	7.941551	0.0000041
8	wt	-2.192438	0.7392393	-2.965803	0.0117928

3.2.5 Method4: group_nest + mutata + map + ~ + unnest

mtcars %>%
  group_nest(cyl) %>%
  mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
  mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
  select(cyl, result) %>% 
  unnest(result)

cyl	term	estimate	std.error	statistic	p.value
4	(Intercept)	39.571196	4.3465820	9.103980	0.0000078
4	wt	-5.647025	1.8501185	-3.052250	0.0137428
6	(Intercept)	28.408845	4.1843688	6.789278	0.0010548
6	wt	-2.780106	1.3349173	-2.082605	0.0917577
8	(Intercept)	23.868029	3.0054619	7.941551	0.0000041
8	wt	-2.192438	0.7392393	-2.965803	0.0117928

3.2.6 Method5: nest_by + mutate + list + summarise

mtcars %>%
  nest_by(cyl) %>%
  mutate(model = list(lm(mpg ~ wt, data = data))) %>%
  summarise(broom::tidy(model))

cyl	term	estimate	std.error	statistic	p.value
4	(Intercept)	39.571196	4.3465820	9.103980	0.0000078
4	wt	-5.647025	1.8501185	-3.052250	0.0137428
6	(Intercept)	28.408845	4.1843688	6.789278	0.0010548
6	wt	-2.780106	1.3349173	-2.082605	0.0917577
8	(Intercept)	23.868029	3.0054619	7.941551	0.0000041
8	wt	-2.192438	0.7392393	-2.965803	0.0117928

4 小结

基本以下四种形式，推荐使用前两种

mtcars %>%
  group_by(cyl) %>%
  summarise(broom::tidy(lm(mpg ~ wt)))

mtcars %>%
  nest_by(cyl) %>% 
  summarise(
    broom::tidy(lm(mpg ~ wt, data = data)))
    
mtcars %>%
  group_by(cyl) %>%
  group_modify(
    ~ broom::tidy(lm(mpg ~ wt, data = .))
  )
    
mtcars %>%
  group_nest(cyl) %>%
  mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
  mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
  select(cyl, result) %>% 
  tidyr::unnest(result)

5 预测

5.1 提供预测自变量集

pred_wt <- tibble(
  ##此坑需注意，本变量名须与模型里的自变量一致
  wt = as.numeric(seq(1, 10, by = 1))
)

5.2 总体模型预测

5.2.1 总体模型参数

pred_t <-
  modelr::add_predictions(pred_wt, lm(mpg ~ wt, data = mtcars)) 
pred_t

wt	pred
1	31.9406546
2	26.5961830
3	21.2517114
4	15.9072399
5	10.5627683
6	5.2182967
7	-0.1261748
8	-5.4706464
9	-10.8151180
10	-16.1595896

5.2.2 分组预测后可视化

m <- pred_t %>% 
  ggplot(aes(wt, pred))+
  geom_line()+
  geom_point(data = mtcars,
             aes(wt, mpg))+
  theme_classic()
  plotly::ggplotly(m)

5.3 分组模型预测

5.3.1 分组模型参数

pred_m <- mtcars %>%
  group_nest(cyl) %>%
  mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
  mutate(pred_mpg = purrr::map(model, ~ modelr::add_predictions(pred_wt, .))) %>% 
  select(cyl, pred_mpg) %>% 
  unnest(pred_mpg) %>% 
  filter(pred > 3)
pred_m

cyl	wt	pred
4	1	33.924171
4	2	28.277145
4	3	22.630120
4	4	16.983095
4	5	11.336070
4	6	5.689044
6	1	25.628739
6	2	22.848633
6	3	20.068527
6	4	17.288421
6	5	14.508315
6	6	11.728209
6	7	8.948103
6	8	6.167997
6	9	3.387891
8	1	21.675591
8	2	19.483153
8	3	17.290715
8	4	15.098277
8	5	12.905839
8	6	10.713401
8	7	8.520964
8	8	6.328526
8	9	4.136088

5.3.2 分组预测后可视化

m1 <- pred_m %>%
  filter(pred > 3) %>% 
  mutate(cyl = factor(cyl)) %>% 
  ggplot(aes(wt, pred, group = cyl, color = cyl))+
  geom_line()+
  geom_point(data = mtcars %>% mutate(cyl = factor(cyl)),
             aes(wt, mpg, color = cyl))+
  theme_classic() 

  plotly::ggplotly(m1)

1. 鼠标滑到拟合曲线上会互动显示具体值；
1. 数据为模拟数据，以例证为主，不具实际意义

批量建模

Source: Batch_model_building.Rmd

Tony

2021-07-25