1 引言

本文介绍批量建模操作、结果规整、可视化。

2 加载宏包

# Load libraries
library(tidyverse) # Easily Install and Load the 'Tidyverse' 
library(modelr) # Modelling Functions that Work with the Pipe
# library(showtext) # Using Fonts More Easily in R Graphs
# showtext_auto() 

3 建模方法

以mtcars数据为例。
mpg cyl disp hp drat wt qsec vs am gear carb
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1

变量概况

mtcars %>%
  select(mpg, wt) %>% 
  # report::report(median = TRUE, range = FALSE) %>%
  report::report_table()
Variable n_Obs Mean SD Median MAD Min Max Skewness Kurtosis n_Missing
2 mpg 32 20.09062 6.0269481 19.200 5.4114900 10.400 33.900 0.6723771 -0.0220063 0
1 wt 32 3.21725 0.9784574 3.325 0.7672455 1.513 5.424 0.4659161 0.4165947 0
  # report::table_long(r)

3.1 总体数据建模

3.1.1 模型设定

library(equatiomatic)
mod <- lm(mpg ~ wt, mtcars)
extract_eq(mod, wrap = T)
## $$
## \begin{aligned}
## \operatorname{mpg} &= \alpha + \beta_{1}(\operatorname{wt}) + \epsilon
## \end{aligned}
## $$

拟以车自重wt为自变量,油耗mpg为因变量进行线性拟合。方程为: \[ \operatorname{mpg} = \alpha + \beta_{1}(\operatorname{wt}) + \epsilon \]

拟合后的实际方程应为: \[ \operatorname{mpg} = 37.29 - 5.34(\operatorname{wt}) + \epsilon \]

3.1.2 模型参数

  • 查看模型系数估计值

    • 含term, estimate, std.error, statistic, p.value等参数
## 注意data = . 与 data = cur_data()的区别
#### 无分组时一致,有分组时后者为分组后的各子集

mtcars %>%
  as_tibble() %>% 
  summarise(
    ## 以下四个表达式等价
    # lm(mpg ~ wt, data = cur_data()) %>% broom::tidy()
    # lm(mpg ~ wt, data = .) %>% broom::tidy()
    # lm(mpg ~ wt) %>% broom::tidy()
    broom::tidy(lm(mpg ~ wt), conf.int = T)
  )
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 37.285126 1.877627 19.857575 0 33.450500 41.119753
wt -5.344472 0.559101 -9.559044 0 -6.486308 -4.202635
#或者

mtcars %>% 
 lm(mpg ~ wt, data = .) %>% 
 broom::tidy(conf.int = T)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 37.285126 1.877627 19.857575 0 33.450500 41.119753
wt -5.344472 0.559101 -9.559044 0 -6.486308 -4.202635
  • 查看摘要统计量
    • 含r.squared, adj.r.squared, sigma, statistic, p.value, df, logLik, AIC, BIC, deviance, df.residual, nobs等参数
mtcars %>% 
  lm(mpg ~ wt, data = .) %>% 
  broom::glance()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.7528328 0.7445939 3.045882 91.37533 0 1 -80.01471 166.0294 170.4266 278.3219 30 32
  • 查看拟合结果残差等
    • 含.rownames, mpg, wt, .fitted, .resid, .hat, .sigma, .cooksd, .std.resid等参数
mtcars %>% 
  lm(mpg ~ wt, data = .) %>% 
  broom::augment() %>% 
  slice_head(n = 6)
.rownames mpg wt .fitted .resid .hat .sigma .cooksd .std.resid
Mazda RX4 21.0 2.620 23.28261 -2.2826106 0.0432690 3.067494 0.0132741 -0.7661677
Mazda RX4 Wag 21.0 2.875 21.91977 -0.9197704 0.0351968 3.093068 0.0017240 -0.3074305
Datsun 710 22.8 2.320 24.88595 -2.0859521 0.0583757 3.072127 0.0154394 -0.7057525
Hornet 4 Drive 21.4 3.215 20.10265 1.2973499 0.0312502 3.088268 0.0030206 0.4327511
Hornet Sportabout 18.7 3.440 18.90014 -0.2001440 0.0329218 3.097722 0.0000760 -0.0668188
Valiant 18.1 3.460 18.79325 -0.6932545 0.0332355 3.095184 0.0009211 -0.2314831

3.1.3 模型检验及可视化

3.1.4 可视化

library(statsExpressions)
mtcars %>% 
  ggplot(aes(wt, mpg))+
  geom_point()+
  geom_smooth(method = "lm", formula = "y ~ x")+
  ggpmisc::stat_poly_eq(formula = "y ~ x",
                        aes(label = paste(..eq.label.., ..rr.label.., ..p.value.label.., 
                                          sep = "*\", \"*")),
                        parse = TRUE,
                        label.x.npc = "right", 
                        label.y.npc = "top",
                        size = 5)+
  labs(title = "32种汽车自重与油耗关系图",
       subtitle = expr_corr_test(mtcars, mpg, wt, type = "parametric"),
       x = "汽车自重(千磅)",
       y = "油耗(英哩/加仑)")+
  theme_classic()

3.2 分组批量建模

3.2.1 可视化

mtcars %>%
  mutate(cyl = as.factor(cyl)) %>% 
  ggplot(aes(wt, mpg, group = cyl, color = cyl))+
  geom_point()+
  geom_smooth(method = "lm", formula = "y ~ x")+
  ggpmisc::stat_poly_eq(formula = "y ~ x",
                        aes(label = paste(..eq.label..,    ..rr.label.., ..p.value.label.., 
                                          sep = "*\", \"*")),
                        parse = TRUE,
                        label.x.npc = "right", 
                        label.y.npc = "top",
                        size = 3)+
  labs(title = "32种汽车自重与油耗关系图",
       x = "汽车自重(千磅)",
       y = "油耗(英哩/加仑)",
       color = "汽缸数")+
  theme_classic()

3.2.2 Method1: group_by + summarise

mtcars %>%
  group_by(cyl) %>%
  summarise(
    #以下三种表达等价
    # lm(mpg ~ wt, data = cur_data()) %>% broom::tidy()
    # lm(mpg ~ wt) %>% broom::tidy()
    broom::tidy(lm(mpg ~ wt))
            ## 该表达产生总体建模效果
            # lm(mpg ~ wt, data = .) %>% broom::tidy()
  )
cyl term estimate std.error statistic p.value
4 (Intercept) 39.571196 4.3465820 9.103980 0.0000078
4 wt -5.647025 1.8501185 -3.052250 0.0137428
6 (Intercept) 28.408845 4.1843688 6.789278 0.0010548
6 wt -2.780106 1.3349173 -2.082605 0.0917577
8 (Intercept) 23.868029 3.0054619 7.941551 0.0000041
8 wt -2.192438 0.7392393 -2.965803 0.0117928

3.2.3 Method2: nest_by + summarise

## Method5
mtcars %>%
  nest_by(cyl) %>%
  summarise(
    #注意这里使用的data与group_by 不同
    broom::tidy(lm(mpg ~ wt, data = data))
  )
cyl term estimate std.error statistic p.value
4 (Intercept) 39.571196 4.3465820 9.103980 0.0000078
4 wt -5.647025 1.8501185 -3.052250 0.0137428
6 (Intercept) 28.408845 4.1843688 6.789278 0.0010548
6 wt -2.780106 1.3349173 -2.082605 0.0917577
8 (Intercept) 23.868029 3.0054619 7.941551 0.0000041
8 wt -2.192438 0.7392393 -2.965803 0.0117928

3.2.4 Method3: group_by + group_modify + ~

mtcars %>% 
  group_by(cyl) %>% 
  group_modify(
    # 以下两种等价
    # ~ lm(mpg ~ wt, data = .) %>% broom::tidy()
     ~broom::tidy(lm(mpg ~ wt, data = .))
          ## cur_data()只能用于dplyr verbs,如mutate, summrise等
           # ~ broom::tidy(lm(mpg ~ wt, data = cur_data()))
  )
cyl term estimate std.error statistic p.value
4 (Intercept) 39.571196 4.3465820 9.103980 0.0000078
4 wt -5.647025 1.8501185 -3.052250 0.0137428
6 (Intercept) 28.408845 4.1843688 6.789278 0.0010548
6 wt -2.780106 1.3349173 -2.082605 0.0917577
8 (Intercept) 23.868029 3.0054619 7.941551 0.0000041
8 wt -2.192438 0.7392393 -2.965803 0.0117928

3.2.5 Method4: group_nest + mutata + map + ~ + unnest

mtcars %>%
  group_nest(cyl) %>%
  mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
  mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
  select(cyl, result) %>% 
  unnest(result)
cyl term estimate std.error statistic p.value
4 (Intercept) 39.571196 4.3465820 9.103980 0.0000078
4 wt -5.647025 1.8501185 -3.052250 0.0137428
6 (Intercept) 28.408845 4.1843688 6.789278 0.0010548
6 wt -2.780106 1.3349173 -2.082605 0.0917577
8 (Intercept) 23.868029 3.0054619 7.941551 0.0000041
8 wt -2.192438 0.7392393 -2.965803 0.0117928

3.2.6 Method5: nest_by + mutate + list + summarise

mtcars %>%
  nest_by(cyl) %>%
  mutate(model = list(lm(mpg ~ wt, data = data))) %>%
  summarise(broom::tidy(model))
cyl term estimate std.error statistic p.value
4 (Intercept) 39.571196 4.3465820 9.103980 0.0000078
4 wt -5.647025 1.8501185 -3.052250 0.0137428
6 (Intercept) 28.408845 4.1843688 6.789278 0.0010548
6 wt -2.780106 1.3349173 -2.082605 0.0917577
8 (Intercept) 23.868029 3.0054619 7.941551 0.0000041
8 wt -2.192438 0.7392393 -2.965803 0.0117928

4 小结

基本以下四种形式,推荐使用前两种

mtcars %>%
  group_by(cyl) %>%
  summarise(broom::tidy(lm(mpg ~ wt)))

mtcars %>%
  nest_by(cyl) %>% 
  summarise(
    broom::tidy(lm(mpg ~ wt, data = data)))
    
mtcars %>%
  group_by(cyl) %>%
  group_modify(
    ~ broom::tidy(lm(mpg ~ wt, data = .))
  )
    
mtcars %>%
  group_nest(cyl) %>%
  mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
  mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
  select(cyl, result) %>% 
  tidyr::unnest(result)

5 预测

5.1 提供预测自变量集

pred_wt <- tibble(
  ##此坑需注意,本变量名须与模型里的自变量一致
  wt = as.numeric(seq(1, 10, by = 1))
)

5.2 总体模型预测

5.2.1 总体模型参数

pred_t <-
  modelr::add_predictions(pred_wt, lm(mpg ~ wt, data = mtcars)) 
pred_t
wt pred
1 31.9406546
2 26.5961830
3 21.2517114
4 15.9072399
5 10.5627683
6 5.2182967
7 -0.1261748
8 -5.4706464
9 -10.8151180
10 -16.1595896

5.2.2 分组预测后可视化

m <- pred_t %>% 
  ggplot(aes(wt, pred))+
  geom_line()+
  geom_point(data = mtcars,
             aes(wt, mpg))+
  theme_classic()
  plotly::ggplotly(m)

5.3 分组模型预测

5.3.1 分组模型参数

pred_m <- mtcars %>%
  group_nest(cyl) %>%
  mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
  mutate(pred_mpg = purrr::map(model, ~ modelr::add_predictions(pred_wt, .))) %>% 
  select(cyl, pred_mpg) %>% 
  unnest(pred_mpg) %>% 
  filter(pred > 3)
pred_m
cyl wt pred
4 1 33.924171
4 2 28.277145
4 3 22.630120
4 4 16.983095
4 5 11.336070
4 6 5.689044
6 1 25.628739
6 2 22.848633
6 3 20.068527
6 4 17.288421
6 5 14.508315
6 6 11.728209
6 7 8.948103
6 8 6.167997
6 9 3.387891
8 1 21.675591
8 2 19.483153
8 3 17.290715
8 4 15.098277
8 5 12.905839
8 6 10.713401
8 7 8.520964
8 8 6.328526
8 9 4.136088

5.3.2 分组预测后可视化

m1 <- pred_m %>%
  filter(pred > 3) %>% 
  mutate(cyl = factor(cyl)) %>% 
  ggplot(aes(wt, pred, group = cyl, color = cyl))+
  geom_line()+
  geom_point(data = mtcars %>% mutate(cyl = factor(cyl)),
             aes(wt, mpg, color = cyl))+
  theme_classic() 

  plotly::ggplotly(m1)
    1. 鼠标滑到拟合曲线上会互动显示具体值;
    1. 数据为模拟数据,以例证为主,不具实际意义