ML Regression_More Examples

Kate C

2022-01-16

Load Packages and Dataset

Packages used to import and analyse the data include

  • googlesheet4 - for loading in data (dataset not provided by the trainer)
  • dplyr - for data manipulation
  • vtreat - for cross-validation on data

Readlised that the types of the columns - sex and alcohol were in char and not in factor as shown in the example so have made below changes.

alcohol <- alcohol %>% 
  mutate(Sex = factor(Sex, levels = c("Female", "Male")), Alcohol = factor(Alcohol, levels = c("Alcoholic", "Non-alcoholic")))

str(alcohol)
## tibble [32 × 5] (S3: tbl_df/tbl/data.frame)
##  $ Subject: num [1:32] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Metabol: num [1:32] 0.6 0.6 1.5 0.4 0.1 0.2 0.3 0.3 0.4 1 ...
##  $ Gastric: num [1:32] 1 1.6 1.5 2.2 1.1 1.2 0.9 0.8 1.5 0.9 ...
##  $ Sex    : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol: Factor w/ 2 levels "Alcoholic","Non-alcoholic": 1 1 1 2 2 2 2 2 2 2 ...

Some Coding

Modelling an interaction

summary of the dataset

summary(alcohol)
##     Subject         Metabol          Gastric          Sex    
##  Min.   : 1.00   Min.   : 0.100   Min.   :0.800   Female:18  
##  1st Qu.: 8.75   1st Qu.: 0.600   1st Qu.:1.200   Male  :14  
##  Median :16.50   Median : 1.700   Median :1.600              
##  Mean   :16.50   Mean   : 2.422   Mean   :1.863              
##  3rd Qu.:24.25   3rd Qu.: 2.925   3rd Qu.:2.200              
##  Max.   :32.00   Max.   :12.300   Max.   :5.200              
##           Alcohol  
##  Alcoholic    : 8  
##  Non-alcoholic:24  
##                    
##                    
##                    
## 

Create the formula with main effects only - here we do not include interactions

(fmla_add <- as.formula(Metabol ~ Gastric + Sex))
## Metabol ~ Gastric + Sex

Create the formula with interactions - note that we added Gastric as a main effect but not sex. If we use * but not colon below it would be:

x * y = x + y + x:y

(fmla_interaction <- as.formula(Metabol ~ Gastric + Gastric:Sex ) )
## Metabol ~ Gastric + Gastric:Sex

Fit both models - one without interactions and one with interaction

model_add <- lm(fmla_add, data = alcohol)
model_interaction <- lm(fmla_interaction, data = alcohol)

Call summary on both models and compare

summary(model_add)
## 
## Call:
## lm(formula = fmla_add, data = alcohol)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2779 -0.6328 -0.0966  0.5783  4.5703 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.9466     0.5198  -3.745 0.000796 ***
## Gastric       1.9656     0.2674   7.352 4.24e-08 ***
## SexMale       1.6174     0.5114   3.163 0.003649 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.331 on 29 degrees of freedom
## Multiple R-squared:  0.7654, Adjusted R-squared:  0.7492 
## F-statistic: 47.31 on 2 and 29 DF,  p-value: 7.41e-10
summary(model_interaction)
## 
## Call:
## lm(formula = fmla_interaction, data = alcohol)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4656 -0.5091  0.0143  0.5660  4.0668 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.7504     0.5310  -1.413 0.168236    
## Gastric           1.1489     0.3450   3.331 0.002372 ** 
## Gastric:SexMale   1.0422     0.2412   4.321 0.000166 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.204 on 29 degrees of freedom
## Multiple R-squared:  0.8081, Adjusted R-squared:  0.7948 
## F-statistic: 61.05 on 2 and 29 DF,  p-value: 4.033e-11

Check out-of-sample performance

creating the splitting plan for a 3-fold cross validation.

set.seed(34245)
splitPlan <- kWayCrossValidation(nrow(alcohol), 3, NULL, NULL)
splitPlan
## [[1]]
## [[1]]$train
##  [1]  3  5  8  9 10 11 13 14 15 16 17 18 19 21 22 24 25 26 29 30 31 32
## 
## [[1]]$app
##  [1]  7  4  6  1  2 27 20 28 12 23
## 
## 
## [[2]]
## [[2]]$train
##  [1]  1  2  4  5  6  7 10 12 14 16 17 20 21 22 23 24 25 26 27 28 32
## 
## [[2]]$app
##  [1]  8 13 30 31 29  3 19  9 15 11 18
## 
## 
## [[3]]
## [[3]]$train
##  [1]  1  2  3  4  6  7  8  9 11 12 13 15 18 19 20 23 27 28 29 30 31
## 
## [[3]]$app
##  [1]  5 25 32 10 16 26 21 17 22 24 14
## 
## 
## attr(,"splitmethod")
## [1] "kwaycross"

check if splitPlan makes sense. split the dataset into train and app in three different ways (3-fold)

splitPlan[[1]]
## $train
##  [1]  3  5  8  9 10 11 13 14 15 16 17 18 19 21 22 24 25 26 29 30 31 32
## 
## $app
##  [1]  7  4  6  1  2 27 20 28 12 23
splitPlan[[2]]
## $train
##  [1]  1  2  4  5  6  7 10 12 14 16 17 20 21 22 23 24 25 26 27 28 32
## 
## $app
##  [1]  8 13 30 31 29  3 19  9 15 11 18
splitPlan[[3]]
## $train
##  [1]  1  2  3  4  6  7  8  9 11 12 13 15 18 19 20 23 27 28 29 30 31
## 
## $app
##  [1]  5 25 32 10 16 26 21 17 22 24 14

get cross-val predictions for main-effects only model

alcohol$pred_add <- 0  # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_add <- lm(fmla_add, data = alcohol[split$train, ]) # fit on training data splitted 
  alcohol$pred_add[split$app] <- predict(model_add, newdata = alcohol[split$app, ])
}

get the cross-val predictions for the model with interactions

alcohol$pred_interaction <- 0 # initialize the prediction vector
for (i in 1:3) {
  split <- splitPlan[[i]]
  model_interaction <- lm(fmla_interaction, data = alcohol[split$train, ])
  alcohol$pred_interaction[split$app] <- predict(model_interaction, newdata = alcohol[split$app, ])
}

get RMSE. Instead of using gather function (which is discontinued) we are using pivot_longer here,

alcohol %>% 
  pivot_longer(
    cols = starts_with("pred"),
    names_to = "modeltype", 
    values_to = "pred"
  ) %>% 
  mutate(residuals = Metabol - pred) %>%
  group_by(modeltype) %>%
  summarize(rmse = sqrt(mean(residuals^2)))
## # A tibble: 2 × 2
##   modeltype         rmse
##   <chr>            <dbl>
## 1 pred_add          1.38
## 2 pred_interaction  1.30

it looks like the model with interaction gives better predictions as the rmse is lower.

Transforming the response before modelling

For example - some usually heavily skewed data

  • log transform for monetary data. lm(log(y) ~ x. usually the log-transformed model will give a slightly bigger RMSE but a smaller RMS-relative error

Wrangling the dataset (a bit actually quite a bit and dataname is getting uglier)

first of all, some house-cleaning on the dataset never provided in the exercise (why though?)

fdata_clean <- fdata %>% 
  slice(-1) %>% 
  rename(y = X, pred = X.1, label = X.2) %>% 
  select(y, pred, label)

check on data structure and make sure it is the same as in the exercise

dim(fdata_clean)
## [1] 100   3
str(fdata_clean)
## 'data.frame':    100 obs. of  3 variables:
##  $ y    : chr  "9.1496941" "1.9025206" "3.8598260" "2.3889703" ...
##  $ pred : chr  "6.430583" "3.473332" "1.594509" "3.764175" ...
##  $ label: chr  "small purchases" "small purchases" "small purchases" "small purchases" ...

revised the stringasfactor to false in the read.csv so now the columns are imported as chr.

options(digits = 7)
fdata_clean$y <- as.numeric(fdata_clean$y)
fdata_clean$pred <- as.numeric(fdata_clean$pred)

now need to revise the “label” column from chr to factor

check on the available levels

unique(fdata_clean$label)
## [1] "small purchases" "large purchases"

convert from char to factor with levels and somehow found out numbers in Macbook dropped negative sign from four numbers.

fdata_right <-fdata_clean %>%   
 mutate(label = factor(label, levels = c("small purchases", "large purchases"))) %>% 
 mutate(label = as.factor(label))
 
fdata_right$y <- replace(fdata_right$y, 3, -3.8598260)
fdata_right$y <- replace(fdata_right$y, 14, -1.9750566)
fdata_right$y <- replace(fdata_right$y, 32, -0.8775657)
fdata_right$y <- replace(fdata_right$y, 40, -5.8934988)
fdata_right$y <- replace(fdata_right$y, 6, 13.5618785)
fdata_right$y <- replace(fdata_right$y, 7, 10.1987859)
fdata_right$y <- replace(fdata_right$y, 12, 15.7238410)
fdata_right$y <- replace(fdata_right$y, 16, 18.6282945)
fdata_right$y <- replace(fdata_right$y, 22, 14.7850358)
fdata_right$y <- replace(fdata_right$y, 27, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 29, 13.0587206)
fdata_right$y <- replace(fdata_right$y, 31, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 37, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 38, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 39, 16.1650339)
fdata_right$y <- replace(fdata_right$y, 41, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 50, 10.8161739)

check if the data type is correct for label.

str(fdata_right)
## 'data.frame':    100 obs. of  3 variables:
##  $ y    : num  9.15 1.9 -3.86 2.39 1.54 ...
##  $ pred : num  6.43 3.47 1.59 3.76 9.51 ...
##  $ label: Factor w/ 2 levels "small purchases",..: 1 1 1 1 1 1 1 1 1 1 ...

otherwise we can try to convert all character columns of dataframe to factor - seems more generic.

use the as.data.frame function in combination with the unclass function to covert all character columns to factor in R.

fdata_correct <- as.data.frame(unclass(fdata_clean), stringsAsFactors = TRUE)
str(fdata_correct)
## 'data.frame':    100 obs. of  3 variables:
##  $ y    : num  9.15 1.9 3.86 2.39 1.54 ...
##  $ pred : num  6.43 3.47 1.59 3.76 9.51 ...
##  $ label: Factor w/ 2 levels "large purchases",..: 2 2 2 2 2 2 2 2 2 2 ...

for easier use, we will rename the corrected data_frame below.

fdata1 <- fdata_right

Relative error

Examine the data - generate the summaries for the groups large and small

fdata1 %>% 
  group_by(label) %>% 
  summarize(min = min(y), 
            mean = mean(y), 
            max = max(y))
## # A tibble: 2 × 4
##   label             min   mean    max
##   <fct>           <dbl>  <dbl>  <dbl>
## 1 small purchases -5.89   6.30   18.6
## 2 large purchases 96.1  606.   1102.

Add error columns

fdata2 <- fdata1 %>% 
  group_by(label) %>% 
  mutate(residual = y - pred, 
         relerr = residual / y)

Compare the rmse and rmse.rel of the large and small groups. note that RMSE is root mean squared residual

fdata2 %>% 
  group_by(label) %>% 
  summarize(rmse = sqrt(mean(residual ^ 2)), 
            rmse.rel = sqrt(mean(relerr ^2)))
## # A tibble: 2 × 3
##   label            rmse rmse.rel
##   <fct>           <dbl>    <dbl>
## 1 small purchases  3.84   1.25  
## 2 large purchases  5.54   0.0147

Plot the predictions for both groups of purchases

ggplot(fdata2, aes(x = pred, y = y, color = label)) +
  geom_point() +
  geom_abline() +
  facet_wrap(~label, ncol = 1, scales = "free") +
  ggtitle("outcome vs prediction")

An good example shows that

  • though absolute RMSE of large purchase is bigger than of small purchase

  • the relative RMSE results is the opposite

  • plotting shows the similar - the large purchase model is fitted better than small one

  • hence a model with larger RMSE might still be better if the relative errors are more important than absolute errors

  • also the virtue of the story here is plotting and visualization are essential in model evaluation

Log-transformed monetary output

load the income data.

load("~/Documents/R programming/Datacamp/DC_ML/Data/Regression/Income.RData")
income_train <- incometrain
income_test <- incometest

Examine the Income 2005 in the training set

summary(income_train$Income2005)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      63   23000   39000   49894   61500  703637

Fit the lm model and make predictions

(fmla.log <- log(Income2005) ~ Arith + Word + Parag + Math + AFQT)
## log(Income2005) ~ Arith + Word + Parag + Math + AFQT
(fmla.abs <- Income2005 ~ Arith + Word + Parag + Math + AFQT)
## Income2005 ~ Arith + Word + Parag + Math + AFQT
model.log <-  lm(fmla.log, data = income_train)

income_test$logpred <- predict(model.log, income_test)
summary(income_test$logpred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   9.766  10.133  10.423  10.419  10.705  11.006
model.abs <- lm(formula = fmla.abs, data = income_train)

convert predictions back to monetary units

income_test$pred.income <- exp(income_test$logpred)
summary(income_test$pred.income)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17432   25167   33615   35363   44566   60217

plot predicted income vs income

ggplot(income_test, aes(x = pred.income, y = Income2005)) +
  geom_point() +
  geom_abline(color = "skyblue")

Compare RMSE and RMSE relative error

income_test <- income_test %>%
  mutate(pred.absmodel = predict(model.abs, income_test),        # predictions from model.abs
         pred.logmodel = exp(predict(model.log, income_test)))

Gather predictions and calculate residuals and relative error

income_long <- income_test %>% 
  pivot_longer(cols = starts_with("pred"),
               names_to = "modeltype", 
               values_to = "pred") %>% 
  mutate(residual = Income2005 - pred, relerr = residual / Income2005 )
income_long %>% 
  group_by(modeltype) %>%      # group by modeltype
  summarize(rmse = sqrt(mean(residual ^ 2)),    # RMSE
            rmse.rel = sqrt(mean(relerr ^2)))    # Root mean squared relative error
## # A tibble: 3 × 3
##   modeltype       rmse rmse.rel
##   <chr>          <dbl>    <dbl>
## 1 pred.absmodel 37448.     3.18
## 2 pred.income   39235.     2.22
## 3 pred.logmodel 39235.     2.22

Transforming inputs before modeling

  • knowledge based relation

  • log transformation

  • key note - when raise a variable (here meant independent variable) to a power in a formula, should use the I function to treat the expression mathematically, not as an interaction.

Data - loaded the “houseprice” dataset

  • summary of the dataset
summary(houseprice)
##       size           price      
##  Min.   : 44.0   Min.   : 42.0  
##  1st Qu.: 73.5   1st Qu.:164.5  
##  Median : 91.0   Median :203.5  
##  Mean   : 94.3   Mean   :249.2  
##  3rd Qu.:118.5   3rd Qu.:287.8  
##  Max.   :150.0   Max.   :573.0
  • formula for price as a function of squared size and fit a lm model from it
fmla_sqr <- price ~ I(size ^2)
model_sqr <- lm(fmla_sqr, data = houseprice)
  • for comparison, formula for price as size and fit a lm model from it
model_lin <- lm(price ~ size, data = houseprice)
  • make predictions and compare
houseprice %>% 
  mutate(pred_lin = predict(model_lin, data = houseprice), 
         pred_sqr = predict(model_sqr, data = houseprice)) %>% 
  pivot_longer(cols = starts_with("pred"),
               names_to = "modeltype", 
               values_to = "pred") %>% 
  ggplot(aes(x = size)) +
  geom_point(aes(y = price)) +
  geom_line(aes(y = pred, color = modeltype)) +
  scale_color_brewer(palette = "Dark2")

  • check residual and RMSE
# Create a splitting plan for 3-fold cross validation
set.seed(34245)  # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(houseprice), 3, NULL, NULL)

# Sample code: get cross-val predictions for price ~ size
houseprice$pred_lin <- 0  # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_lin <- lm(price ~ size, data = houseprice[split$train,])
  houseprice$pred_lin[split$app] <- predict(model_lin, newdata = houseprice[split$app,])
}

# Get cross-val predictions for price as a function of size^2 (use fmla_sqr)
houseprice$pred_sqr <- 0 # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_sqr <- lm(fmla_sqr, data = houseprice[split$train, ])
  houseprice$pred_sqr[split$app] <- predict(model_sqr, newdata = houseprice[split$app, ])
}

# Gather the predictions and calculate the residuals
houseprice_long <- houseprice %>%
  gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>%
  mutate(residuals = price - pred)

# Compare the cross-validated RMSE for the two models
houseprice_long %>% 
  group_by(modeltype) %>% # group by modeltype
  summarize(rmse = sqrt(mean(residuals ^2)))
## # A tibble: 2 × 2
##   modeltype  rmse
##   <chr>     <dbl>
## 1 pred_lin   73.1
## 2 pred_sqr   60.3

conclusion - quadratic input improved the model.

Some additional not needed and not useful note

  • pivot longer and wider are not intuitive - same as its predecessor.

  • below is a simple explanation and hopefully next time do not have to google for solution again.

pivot_longer(dataframe, cols = columns to pivot long, names_to = “desired name for category column”, values_to = “desired name for value column”