modelr training

The purpose of this notebook is to illustrate how the modelr package can be used to model a continuous response variable using linear regression.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
library(ggplot2)
library(modelr)
sim3
## # A tibble: 120 x 5
## x1 x2 rep y sd
## <int> <fct> <int> <dbl> <dbl>
## 1 1 a 1 -0.571 2
## 2 1 a 2 1.18 2
## 3 1 a 3 2.24 2
## 4 1 b 1 7.44 2
## 5 1 b 2 8.52 2
## 6 1 b 3 7.72 2
## 7 1 c 1 6.51 2
## 8 1 c 2 5.79 2
## 9 1 c 3 6.07 2
## 10 1 d 1 2.11 2
## # ... with 110 more rows
glimpse(sim3)
## Observations: 120
## Variables: 5
## $ x1 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2...
## $ x2 <fct> a, a, a, b, b, b, c, c, c, d, d, d, a, a, a, b, b, b, c, c...
## $ rep <int> 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2...
## $ y <dbl> -0.5707363, 1.1841503, 2.2373204, 7.4366963, 8.5182934, 7....
## $ sd <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
First visualise the distribution of the response variable, then visualise how it relates to each explanatory variable.
ggplot(data = sim3, aes(x = y)) +
  geom_histogram()
ggplot(data = sim3, aes(x = x1, y = y)) +
  geom_point()
ggplot(data = sim3, aes(x = x2, y = y)) +
  geom_boxplot()
ggplot(data = sim3, aes(x = x1, y = y, color = x2)) +
  geom_point()
mod_lm_x1 <- lm(formula = y ~ x1, data = sim3)
mod_lm_x1
##
## Call:
## lm(formula = y ~ x1, data = sim3)
##
## Coefficients:
## (Intercept) x1
## 4.3849 -0.1967
mod_lm_x2 <- lm(formula = y ~ x2, data = sim3)
mod_lm_x2
##
## Call:
## lm(formula = y ~ x2, data = sim3)
##
## Coefficients:
## (Intercept) x2b x2c x2d
## 0.7896 2.8878 4.8057 2.3596
mod_lm_x1_x2 <- lm(formula = y ~ x1 + x2, data = sim3)
mod_lm_x1_x2
##
## Call:
## lm(formula = y ~ x1 + x2, data = sim3)
##
## Coefficients:
## (Intercept) x1 x2b x2c x2d
## 1.8717 -0.1967 2.8878 4.8057 2.3596
mod_lm_x1_x2_x1x2 <- lm(formula = y ~ x1 * x2, data = sim3) #x1 * x2 expands to x1 + x2 + x1:x2
mod_lm_x1_x2_x1x2
##
## Call:
## lm(formula = y ~ x1 * x2, data = sim3)
##
## Coefficients:
## (Intercept) x1 x2b x2c x2d
## 1.30124 -0.09302 7.06938 4.43090 0.83455
## x1:x2b x1:x2c x1:x2d
## -0.76029 0.06815 0.27728
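As an aside that is not part of the original output, a nested-model F-test with base R's anova() is one way to check whether the interaction terms are worth keeping; a minimal sketch, assuming the models fitted above:
#does adding the x1:x2 interaction improve significantly on the additive model?
anova(mod_lm_x1_x2, mod_lm_x1_x2_x1x2)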
sim3 %>%
  add_predictions(mod_lm_x1) #for just one model
## # A tibble: 120 x 6
## x1 x2 rep y sd pred
## <int> <fct> <int> <dbl> <dbl> <dbl>
## 1 1 a 1 -0.571 2 4.19
## 2 1 a 2 1.18 2 4.19
## 3 1 a 3 2.24 2 4.19
## 4 1 b 1 7.44 2 4.19
## 5 1 b 2 8.52 2 4.19
## 6 1 b 3 7.72 2 4.19
## 7 1 c 1 6.51 2 4.19
## 8 1 c 2 5.79 2 4.19
## 9 1 c 3 6.07 2 4.19
## 10 1 d 1 2.11 2 4.19
## # ... with 110 more rows
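modelr also provides add_residuals() as the single-model counterpart for residuals; a minimal sketch (output not shown), adding a resid column to sim3:
sim3 %>%
  add_residuals(mod_lm_x1) #for just one model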
sim3_pred <- sim3 %>%
  gather_predictions(mod_lm_x1, mod_lm_x2, mod_lm_x1_x2, mod_lm_x1_x2_x1x2) #for many models
sim3_pred
## # A tibble: 480 x 7
## model x1 x2 rep y sd pred
## <chr> <int> <fct> <int> <dbl> <dbl> <dbl>
## 1 mod_lm_x1 1 a 1 -0.571 2 4.19
## 2 mod_lm_x1 1 a 2 1.18 2 4.19
## 3 mod_lm_x1 1 a 3 2.24 2 4.19
## 4 mod_lm_x1 1 b 1 7.44 2 4.19
## 5 mod_lm_x1 1 b 2 8.52 2 4.19
## 6 mod_lm_x1 1 b 3 7.72 2 4.19
## 7 mod_lm_x1 1 c 1 6.51 2 4.19
## 8 mod_lm_x1 1 c 2 5.79 2 4.19
## 9 mod_lm_x1 1 c 3 6.07 2 4.19
## 10 mod_lm_x1 1 d 1 2.11 2 4.19
## # ... with 470 more rows
sim3_resid <- sim3 %>%
  gather_residuals(mod_lm_x1, mod_lm_x2, mod_lm_x1_x2, mod_lm_x1_x2_x1x2)
sim3_resid
## # A tibble: 480 x 7
## model x1 x2 rep y sd resid
## <chr> <int> <fct> <int> <dbl> <dbl> <dbl>
## 1 mod_lm_x1 1 a 1 -0.571 2 -4.76
## 2 mod_lm_x1 1 a 2 1.18 2 -3.00
## 3 mod_lm_x1 1 a 3 2.24 2 -1.95
## 4 mod_lm_x1 1 b 1 7.44 2 3.25
## 5 mod_lm_x1 1 b 2 8.52 2 4.33
## 6 mod_lm_x1 1 b 3 7.72 2 3.54
## 7 mod_lm_x1 1 c 1 6.51 2 2.32
## 8 mod_lm_x1 1 c 2 5.79 2 1.60
## 9 mod_lm_x1 1 c 3 6.07 2 1.88
## 10 mod_lm_x1 1 d 1 2.11 2 -2.08
## # ... with 470 more rows
ggplot(sim3_pred, aes(x = x1, y = y, color = x2)) +
  geom_point() +
  geom_line(aes(y = pred), size = 1) +
  facet_wrap(~model)
ggplot(data = sim3_resid, aes(x = x1, y = resid, color = x2)) +
  geom_point() +
  facet_grid(model ~ x2)
Adjusted R-squared can be used to compare the models, since it penalises predictors that do not improve the fit.
summary(mod_lm_x1)
##
## Call:
## lm(formula = y ~ x1, data = sim3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7589 -1.9213 -0.1823 2.0902 4.5755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.38495 0.45283 9.684 < 2e-16 ***
## x1 -0.19674 0.07298 -2.696 0.00805 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.296 on 118 degrees of freedom
## Multiple R-squared: 0.05801, Adjusted R-squared: 0.05003
## F-statistic: 7.267 on 1 and 118 DF, p-value: 0.008048
summary(mod_lm_x2)
##
## Call:
## lm(formula = y ~ x2, data = sim3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3527 -0.7792 0.0437 0.7227 4.8409
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7896 0.2977 2.652 0.00911 **
## x2b 2.8878 0.4210 6.859 3.58e-10 ***
## x2c 4.8057 0.4210 11.415 < 2e-16 ***
## x2d 2.3596 0.4210 5.604 1.43e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.631 on 116 degrees of freedom
## Multiple R-squared: 0.533, Adjusted R-squared: 0.521
## F-statistic: 44.14 on 3 and 116 DF, p-value: < 2.2e-16
summary(mod_lm_x1_x2)
##
## Call:
## lm(formula = y ~ x1 + x2, data = sim3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4674 -0.8524 -0.0729 0.7886 4.3005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.87167 0.38738 4.832 4.22e-06 ***
## x1 -0.19674 0.04871 -4.039 9.72e-05 ***
## x2b 2.88781 0.39571 7.298 4.07e-11 ***
## x2c 4.80574 0.39571 12.145 < 2e-16 ***
## x2d 2.35959 0.39571 5.963 2.79e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.533 on 115 degrees of freedom
## Multiple R-squared: 0.5911, Adjusted R-squared: 0.5768
## F-statistic: 41.55 on 4 and 115 DF, p-value: < 2.2e-16
summary(mod_lm_x1_x2_x1x2)
##
## Call:
## lm(formula = y ~ x1 * x2, data = sim3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.87634 -0.67655 0.04837 0.69963 2.58607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.30124 0.40400 3.221 0.00167 **
## x1 -0.09302 0.06511 -1.429 0.15587
## x2b 7.06938 0.57134 12.373 < 2e-16 ***
## x2c 4.43090 0.57134 7.755 4.41e-12 ***
## x2d 0.83455 0.57134 1.461 0.14690
## x1:x2b -0.76029 0.09208 -8.257 3.30e-13 ***
## x1:x2c 0.06815 0.09208 0.740 0.46076
## x1:x2d 0.27728 0.09208 3.011 0.00322 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.024 on 112 degrees of freedom
## Multiple R-squared: 0.8221, Adjusted R-squared: 0.811
## F-statistic: 73.93 on 7 and 112 DF, p-value: < 2.2e-16
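The same comparison can be made more compactly with modelr's summary metrics. A minimal sketch (not run in the original notebook) computes RMSE and R-squared for each model on sim3; models is a helper list introduced here purely for illustration:
models <- list(mod_lm_x1 = mod_lm_x1,
               mod_lm_x2 = mod_lm_x2,
               mod_lm_x1_x2 = mod_lm_x1_x2,
               mod_lm_x1_x2_x1x2 = mod_lm_x1_x2_x1x2)
#RMSE and R-squared for each candidate model, evaluated on sim3 itself
sapply(models, function(m) c(rmse = rmse(m, sim3), rsquare = rsquare(m, sim3)))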
If the dataset has sufficient observations and you are fitting a complex model, you should train your preferred model on only 80% of the data, and then confirm that it generalises using the remaining 20% set aside from the dataset. This guards against overfitting, where the noise is modelled as well as the signal.
sim3 <- sim3 %>%
  mutate(id = row_number()) %>%
  select(id, everything())
sim3_train <- sim3 %>%
  sample_frac(0.8) %>%
  arrange(id) #dataset to train model
sim3_train
## # A tibble: 96 x 6
## id x1 x2 rep y sd
## <int> <int> <fct> <int> <dbl> <dbl>
## 1 2 1 a 2 1.18 2
## 2 4 1 b 1 7.44 2
## 3 5 1 b 2 8.52 2
## 4 7 1 c 1 6.51 2
## 5 8 1 c 2 5.79 2
## 6 11 1 d 2 4.12 2
## 7 12 1 d 3 3.50 2
## 8 14 2 a 2 1.36 2
## 9 16 2 b 1 7.51 2
## 10 17 2 b 2 3.79 2
## # ... with 86 more rows
sim3_confirm <- anti_join(sim3, sim3_train) #dataset to confirm model
## Joining, by = c("id", "x1", "x2", "rep", "y", "sd")
## Warning: Column `y` has different attributes on LHS and RHS of join
sim3_confirm
## # A tibble: 24 x 6
## id x1 x2 rep y sd
## <int> <int> <fct> <int> <dbl> <dbl>
## 1 1 1 a 1 -0.571 2
## 2 3 1 a 3 2.24 2
## 3 6 1 b 3 7.72 2
## 4 9 1 c 3 6.07 2
## 5 10 1 d 1 2.11 2
## 6 13 2 a 1 1.55 2
## 7 15 2 a 3 0.252 2
## 8 23 2 d 2 3.45 2
## 9 27 3 a 3 0.346 2
## 10 35 3 d 2 0.844 2
## # ... with 14 more rows
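To close the loop, a minimal sketch (not part of the original output) refits the preferred interaction model on the training set and compares its RMSE on the training and confirmation sets; mod_train is a name introduced here for illustration. A much larger error on sim3_confirm than on sim3_train would suggest overfitting.
mod_train <- lm(formula = y ~ x1 * x2, data = sim3_train)
rmse(mod_train, sim3_train) #error on the data used to fit the model
rmse(mod_train, sim3_confirm) #error on the set-aside data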