Exercise 1 - Simple linear regression

  1. Load the data biomass and plot HHV as a function of carbon.
library(tidymodels)
# or run in console table with "remotes::install_github("tidymodels/parsnip")"
data("biomass") # in model.data package
head(biomass)
##                   sample  dataset carbon hydrogen oxygen nitrogen sulfur    HHV
## 1           Akhrot Shell Training  49.81     5.64  42.94     0.41   0.00 20.008
## 2 Alabama Oak Wood Waste Training  49.50     5.70  41.30     0.20   0.00 19.228
## 3                  Alder Training  47.82     5.80  46.25     0.11   0.02 18.299
## 4                Alfalfa Training  45.10     4.97  35.60     3.30   0.16 18.151
## 5     Alfalfa Seed Straw Training  46.76     5.40  40.72     1.00   0.02 18.450
## 6         Alfalfa Stalks Training  45.40     5.75  40.20     2.04   0.10 18.465
# plot
biomass %>%
ggplot(aes(x = carbon, y = HHV)) +
  geom_point() 

  1. Fit a simple linear regression model to the data. With HHV as the response and carbon as the predictor. Is it a good fit?
# create a model specification
linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm") -> lm_spec

# fit the model
fit(lm_spec, HHV ~ carbon, data = biomass)  -> simple_lm_fit

# see the summary table
simple_lm_fit %>%
  pluck("fit") %>%
  summary()
## 
## Call:
## stats::lm(formula = HHV ~ carbon, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.9357  -0.5228  -0.0704   0.4119   9.4552 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.767605   0.308779   8.963   <2e-16 ***
## carbon      0.339371   0.006256  54.243   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.473 on 534 degrees of freedom
## Multiple R-squared:  0.8464, Adjusted R-squared:  0.8461 
## F-statistic:  2942 on 1 and 534 DF,  p-value: < 2.2e-16
# another way to call summary table
# simple_lm_fit$fit %>% summary()
# using `tidy` to make a table as a tibble
simple_lm_fit %>% 
  tidy()
## # A tibble: 2 x 5
##   term        estimate std.error statistic   p.value
##   <chr>          <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)    2.77    0.309        8.96 5.29e- 18
## 2 carbon         0.339   0.00626     54.2  2.23e-219
# another way to extract the r-square
simple_lm_fit %>% 
  glance() %>%
  pull(r.squared)
## [1] 0.8463907
  1. Use the model to predict what the HHV of samples with carbon = 10, 20, …, 80.
# create a new HHV data frame
new_hhv <- tibble(carbon = seq(from = 10, to = 80, by = 10))
new_hhv
## # A tibble: 8 x 1
##   carbon
##    <dbl>
## 1     10
## 2     20
## 3     30
## 4     40
## 5     50
## 6     60
## 7     70
## 8     80
predict(simple_lm_fit, new_data = new_hhv) %>%
  rename(predictHHV = .pred)
## # A tibble: 8 x 1
##   predictHHV
##        <dbl>
## 1       6.16
## 2       9.56
## 3      12.9 
## 4      16.3 
## 5      19.7 
## 6      23.1 
## 7      26.5 
## 8      29.9
  1. Produce diagnostics plots. You can use plot() on the $fit object to produce some diagnostics.
par(mfrow = c(2, 2))
simple_lm_fit %>%
  pluck("fit") %>%
  plot() 

Exercise 2 - Multiple linear regression

  1. Fit a linear regression model to the data. With HHV as the response and carbon and hydrogen as the predictor. How is the fit compared to the simple linear model?
fit(lm_spec, HHV ~ carbon + hydrogen, data = biomass)  -> multiple_lm_fit

summary(multiple_lm_fit$fit)
## 
## Call:
## stats::lm(formula = HHV ~ carbon + hydrogen, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1393  -0.4984  -0.1074   0.3934  10.6733 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.02853    0.49815   2.065   0.0394 *  
## carbon       0.34805    0.00646  53.882  < 2e-16 ***
## hydrogen     0.24180    0.05491   4.403 1.29e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.448 on 533 degrees of freedom
## Multiple R-squared:  0.8518, Adjusted R-squared:  0.8512 
## F-statistic:  1532 on 2 and 533 DF,  p-value: < 2.2e-16
  1. Fit a linear regression model to the data. With HHV as the response and all the molecules as the predictor. How is the fit compared to the previous models?
fit(lm_spec, HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur, data = biomass)  -> full_lm_fit
summary(full_lm_fit$fit)
## 
## Call:
## stats::lm(formula = HHV ~ carbon + hydrogen + oxygen + nitrogen + 
##     sulfur, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.3185  -0.4913  -0.0703   0.4308   9.9135 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.366432   0.700763   0.523 0.601260    
## carbon       0.352207   0.008360  42.130  < 2e-16 ***
## hydrogen     0.254543   0.062828   4.051 5.85e-05 ***
## oxygen       0.007578   0.009212   0.823 0.411108    
## nitrogen    -0.007530   0.058594  -0.129 0.897796    
## sulfur       0.460805   0.138858   3.319 0.000967 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.437 on 530 degrees of freedom
## Multiple R-squared:  0.8548, Adjusted R-squared:  0.8535 
## F-statistic: 624.3 on 5 and 530 DF,  p-value: < 2.2e-16
# find out the three model's MSE, which are close to 2
anova(simple_lm_fit$fit) %>% tidy()
## # A tibble: 2 x 6
##   term         df sumsq  meansq statistic    p.value
##   <chr>     <int> <dbl>   <dbl>     <dbl>      <dbl>
## 1 carbon        1 6385. 6385.       2942.  2.23e-219
## 2 Residuals   534 1159.    2.17       NA  NA
anova(multiple_lm_fit$fit) %>% tidy
## # A tibble: 3 x 6
##   term         df  sumsq  meansq statistic    p.value
##   <chr>     <int>  <dbl>   <dbl>     <dbl>      <dbl>
## 1 carbon        1 6385.  6385.      3044.   1.75e-222
## 2 hydrogen      1   40.7   40.7       19.4  1.29e-  5
## 3 Residuals   533 1118.     2.10      NA   NA
anova(full_lm_fit$fit) %>% tidy()
## # A tibble: 6 x 6
##   term         df      sumsq     meansq  statistic    p.value
##   <chr>     <int>      <dbl>      <dbl>      <dbl>      <dbl>
## 1 carbon        1 6385.      6385.      3090.       2.73e-223
## 2 hydrogen      1   40.7       40.7       19.7      1.11e-  5
## 3 oxygen        1    0.00433    0.00433    0.00210  9.63e-  1
## 4 nitrogen      1    0.344      0.344      0.166    6.83e-  1
## 5 sulfur        1   22.8       22.8       11.0      9.67e-  4
## 6 Residuals   530 1095.         2.07      NA       NA