ML Regression_More Examples

Load Packages and Dataset

Packages used to import and analyse the data include

googlesheet4 - for loading in data (dataset not provided by the trainer)
dplyr - for data manipulation
vtreat - for cross-validation on data

Readlised that the types of the columns - sex and alcohol were in char and not in factor as shown in the example so have made below changes.

alcohol <- alcohol %>% 
  mutate(Sex = factor(Sex, levels = c("Female", "Male")), Alcohol = factor(Alcohol, levels = c("Alcoholic", "Non-alcoholic")))

str(alcohol)

## tibble [32 × 5] (S3: tbl_df/tbl/data.frame)
##  $ Subject: num [1:32] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Metabol: num [1:32] 0.6 0.6 1.5 0.4 0.1 0.2 0.3 0.3 0.4 1 ...
##  $ Gastric: num [1:32] 1 1.6 1.5 2.2 1.1 1.2 0.9 0.8 1.5 0.9 ...
##  $ Sex    : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Alcohol: Factor w/ 2 levels "Alcoholic","Non-alcoholic": 1 1 1 2 2 2 2 2 2 2 ...

Some Coding

Modelling an interaction

summary of the dataset

summary(alcohol)

##     Subject         Metabol          Gastric          Sex    
##  Min.   : 1.00   Min.   : 0.100   Min.   :0.800   Female:18  
##  1st Qu.: 8.75   1st Qu.: 0.600   1st Qu.:1.200   Male  :14  
##  Median :16.50   Median : 1.700   Median :1.600              
##  Mean   :16.50   Mean   : 2.422   Mean   :1.863              
##  3rd Qu.:24.25   3rd Qu.: 2.925   3rd Qu.:2.200              
##  Max.   :32.00   Max.   :12.300   Max.   :5.200              
##           Alcohol  
##  Alcoholic    : 8  
##  Non-alcoholic:24  
##                    
##                    
##                    
##

Create the formula with main effects only - here we do not include interactions

(fmla_add <- as.formula(Metabol ~ Gastric + Sex))

## Metabol ~ Gastric + Sex

Create the formula with interactions - note that we added Gastric as a main effect but not sex. If we use * but not colon below it would be:

x * y = x + y + x:y

(fmla_interaction <- as.formula(Metabol ~ Gastric + Gastric:Sex ) )

## Metabol ~ Gastric + Gastric:Sex

Fit both models - one without interactions and one with interaction

model_add <- lm(fmla_add, data = alcohol)
model_interaction <- lm(fmla_interaction, data = alcohol)

Call summary on both models and compare

summary(model_add)

## 
## Call:
## lm(formula = fmla_add, data = alcohol)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2779 -0.6328 -0.0966  0.5783  4.5703 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.9466     0.5198  -3.745 0.000796 ***
## Gastric       1.9656     0.2674   7.352 4.24e-08 ***
## SexMale       1.6174     0.5114   3.163 0.003649 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.331 on 29 degrees of freedom
## Multiple R-squared:  0.7654, Adjusted R-squared:  0.7492 
## F-statistic: 47.31 on 2 and 29 DF,  p-value: 7.41e-10

summary(model_interaction)

## 
## Call:
## lm(formula = fmla_interaction, data = alcohol)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4656 -0.5091  0.0143  0.5660  4.0668 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.7504     0.5310  -1.413 0.168236    
## Gastric           1.1489     0.3450   3.331 0.002372 ** 
## Gastric:SexMale   1.0422     0.2412   4.321 0.000166 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.204 on 29 degrees of freedom
## Multiple R-squared:  0.8081, Adjusted R-squared:  0.7948 
## F-statistic: 61.05 on 2 and 29 DF,  p-value: 4.033e-11

Check out-of-sample performance

creating the splitting plan for a 3-fold cross validation.

set.seed(34245)
splitPlan <- kWayCrossValidation(nrow(alcohol), 3, NULL, NULL)
splitPlan

## [[1]]
## [[1]]$train
##  [1]  3  5  8  9 10 11 13 14 15 16 17 18 19 21 22 24 25 26 29 30 31 32
## 
## [[1]]$app
##  [1]  7  4  6  1  2 27 20 28 12 23
## 
## 
## [[2]]
## [[2]]$train
##  [1]  1  2  4  5  6  7 10 12 14 16 17 20 21 22 23 24 25 26 27 28 32
## 
## [[2]]$app
##  [1]  8 13 30 31 29  3 19  9 15 11 18
## 
## 
## [[3]]
## [[3]]$train
##  [1]  1  2  3  4  6  7  8  9 11 12 13 15 18 19 20 23 27 28 29 30 31
## 
## [[3]]$app
##  [1]  5 25 32 10 16 26 21 17 22 24 14
## 
## 
## attr(,"splitmethod")
## [1] "kwaycross"

check if splitPlan makes sense. split the dataset into train and app in three different ways (3-fold)

splitPlan[[1]]

## $train
##  [1]  3  5  8  9 10 11 13 14 15 16 17 18 19 21 22 24 25 26 29 30 31 32
## 
## $app
##  [1]  7  4  6  1  2 27 20 28 12 23

splitPlan[[2]]

## $train
##  [1]  1  2  4  5  6  7 10 12 14 16 17 20 21 22 23 24 25 26 27 28 32
## 
## $app
##  [1]  8 13 30 31 29  3 19  9 15 11 18

splitPlan[[3]]

## $train
##  [1]  1  2  3  4  6  7  8  9 11 12 13 15 18 19 20 23 27 28 29 30 31
## 
## $app
##  [1]  5 25 32 10 16 26 21 17 22 24 14

get cross-val predictions for main-effects only model

alcohol$pred_add <- 0  # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_add <- lm(fmla_add, data = alcohol[split$train, ]) # fit on training data splitted 
  alcohol$pred_add[split$app] <- predict(model_add, newdata = alcohol[split$app, ])
}

get the cross-val predictions for the model with interactions

alcohol$pred_interaction <- 0 # initialize the prediction vector
for (i in 1:3) {
  split <- splitPlan[[i]]
  model_interaction <- lm(fmla_interaction, data = alcohol[split$train, ])
  alcohol$pred_interaction[split$app] <- predict(model_interaction, newdata = alcohol[split$app, ])
}

get RMSE. Instead of using gather function (which is discontinued) we are using pivot_longer here,

alcohol %>% 
  pivot_longer(
    cols = starts_with("pred"),
    names_to = "modeltype", 
    values_to = "pred"
  ) %>% 
  mutate(residuals = Metabol - pred) %>%
  group_by(modeltype) %>%
  summarize(rmse = sqrt(mean(residuals^2)))

## # A tibble: 2 × 2
##   modeltype         rmse
##   <chr>            <dbl>
## 1 pred_add          1.38
## 2 pred_interaction  1.30

it looks like the model with interaction gives better predictions as the rmse is lower.

Transforming the response before modelling

For example - some usually heavily skewed data

log transform for monetary data. lm(log(y) ~ x. usually the log-transformed model will give a slightly bigger RMSE but a smaller RMS-relative error

Wrangling the dataset (a bit actually quite a bit and dataname is getting uglier)

first of all, some house-cleaning on the dataset never provided in the exercise (why though?)

fdata_clean <- fdata %>% 
  slice(-1) %>% 
  rename(y = X, pred = X.1, label = X.2) %>% 
  select(y, pred, label)

check on data structure and make sure it is the same as in the exercise

dim(fdata_clean)

## [1] 100   3

str(fdata_clean)

## 'data.frame':    100 obs. of  3 variables:
##  $ y    : chr  "9.1496941" "1.9025206" "3.8598260" "2.3889703" ...
##  $ pred : chr  "6.430583" "3.473332" "1.594509" "3.764175" ...
##  $ label: chr  "small purchases" "small purchases" "small purchases" "small purchases" ...

revised the stringasfactor to false in the read.csv so now the columns are imported as chr.

options(digits = 7)
fdata_clean$y <- as.numeric(fdata_clean$y)
fdata_clean$pred <- as.numeric(fdata_clean$pred)

now need to revise the “label” column from chr to factor

check on the available levels

unique(fdata_clean$label)

## [1] "small purchases" "large purchases"

convert from char to factor with levels and somehow found out numbers in Macbook dropped negative sign from four numbers.

fdata_right <-fdata_clean %>%   
 mutate(label = factor(label, levels = c("small purchases", "large purchases"))) %>% 
 mutate(label = as.factor(label))
 
fdata_right$y <- replace(fdata_right$y, 3, -3.8598260)
fdata_right$y <- replace(fdata_right$y, 14, -1.9750566)
fdata_right$y <- replace(fdata_right$y, 32, -0.8775657)
fdata_right$y <- replace(fdata_right$y, 40, -5.8934988)
fdata_right$y <- replace(fdata_right$y, 6, 13.5618785)
fdata_right$y <- replace(fdata_right$y, 7, 10.1987859)
fdata_right$y <- replace(fdata_right$y, 12, 15.7238410)
fdata_right$y <- replace(fdata_right$y, 16, 18.6282945)
fdata_right$y <- replace(fdata_right$y, 22, 14.7850358)
fdata_right$y <- replace(fdata_right$y, 27, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 29, 13.0587206)
fdata_right$y <- replace(fdata_right$y, 31, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 37, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 38, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 39, 16.1650339)
fdata_right$y <- replace(fdata_right$y, 41, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 50, 10.8161739)

check if the data type is correct for label.

str(fdata_right)

## 'data.frame':    100 obs. of  3 variables:
##  $ y    : num  9.15 1.9 -3.86 2.39 1.54 ...
##  $ pred : num  6.43 3.47 1.59 3.76 9.51 ...
##  $ label: Factor w/ 2 levels "small purchases",..: 1 1 1 1 1 1 1 1 1 1 ...

otherwise we can try to convert all character columns of dataframe to factor - seems more generic.

use the as.data.frame function in combination with the unclass function to covert all character columns to factor in R.

fdata_correct <- as.data.frame(unclass(fdata_clean), stringsAsFactors = TRUE)
str(fdata_correct)

## 'data.frame':    100 obs. of  3 variables:
##  $ y    : num  9.15 1.9 3.86 2.39 1.54 ...
##  $ pred : num  6.43 3.47 1.59 3.76 9.51 ...
##  $ label: Factor w/ 2 levels "large purchases",..: 2 2 2 2 2 2 2 2 2 2 ...

for easier use, we will rename the corrected data_frame below.

fdata1 <- fdata_right

Relative error

Examine the data - generate the summaries for the groups large and small

fdata1 %>% 
  group_by(label) %>% 
  summarize(min = min(y), 
            mean = mean(y), 
            max = max(y))

## # A tibble: 2 × 4
##   label             min   mean    max
##   <fct>           <dbl>  <dbl>  <dbl>
## 1 small purchases -5.89   6.30   18.6
## 2 large purchases 96.1  606.   1102.

Add error columns

fdata2 <- fdata1 %>% 
  group_by(label) %>% 
  mutate(residual = y - pred, 
         relerr = residual / y)

Compare the rmse and rmse.rel of the large and small groups. note that RMSE is root mean squared residual

fdata2 %>% 
  group_by(label) %>% 
  summarize(rmse = sqrt(mean(residual ^ 2)), 
            rmse.rel = sqrt(mean(relerr ^2)))

## # A tibble: 2 × 3
##   label            rmse rmse.rel
##   <fct>           <dbl>    <dbl>
## 1 small purchases  3.84   1.25  
## 2 large purchases  5.54   0.0147

Plot the predictions for both groups of purchases

ggplot(fdata2, aes(x = pred, y = y, color = label)) +
  geom_point() +
  geom_abline() +
  facet_wrap(~label, ncol = 1, scales = "free") +
  ggtitle("outcome vs prediction")

An good example shows that

though absolute RMSE of large purchase is bigger than of small purchase
the relative RMSE results is the opposite
plotting shows the similar - the large purchase model is fitted better than small one
hence a model with larger RMSE might still be better if the relative errors are more important than absolute errors
also the virtue of the story here is plotting and visualization are essential in model evaluation

Log-transformed monetary output

load the income data.

load("~/Documents/R programming/Datacamp/DC_ML/Data/Regression/Income.RData")

income_train <- incometrain
income_test <- incometest

Examine the Income 2005 in the training set

summary(income_train$Income2005)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      63   23000   39000   49894   61500  703637

Fit the lm model and make predictions

(fmla.log <- log(Income2005) ~ Arith + Word + Parag + Math + AFQT)

## log(Income2005) ~ Arith + Word + Parag + Math + AFQT

(fmla.abs <- Income2005 ~ Arith + Word + Parag + Math + AFQT)

## Income2005 ~ Arith + Word + Parag + Math + AFQT

model.log <-  lm(fmla.log, data = income_train)

income_test$logpred <- predict(model.log, income_test)
summary(income_test$logpred)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   9.766  10.133  10.423  10.419  10.705  11.006

model.abs <- lm(formula = fmla.abs, data = income_train)

convert predictions back to monetary units

income_test$pred.income <- exp(income_test$logpred)
summary(income_test$pred.income)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17432   25167   33615   35363   44566   60217

plot predicted income vs income

ggplot(income_test, aes(x = pred.income, y = Income2005)) +
  geom_point() +
  geom_abline(color = "skyblue")

Compare RMSE and RMSE relative error

income_test <- income_test %>%
  mutate(pred.absmodel = predict(model.abs, income_test),        # predictions from model.abs
         pred.logmodel = exp(predict(model.log, income_test)))

Gather predictions and calculate residuals and relative error

income_long <- income_test %>% 
  pivot_longer(cols = starts_with("pred"),
               names_to = "modeltype", 
               values_to = "pred") %>% 
  mutate(residual = Income2005 - pred, relerr = residual / Income2005 )

income_long %>% 
  group_by(modeltype) %>%      # group by modeltype
  summarize(rmse = sqrt(mean(residual ^ 2)),    # RMSE
            rmse.rel = sqrt(mean(relerr ^2)))    # Root mean squared relative error

## # A tibble: 3 × 3
##   modeltype       rmse rmse.rel
##   <chr>          <dbl>    <dbl>
## 1 pred.absmodel 37448.     3.18
## 2 pred.income   39235.     2.22
## 3 pred.logmodel 39235.     2.22

Transforming inputs before modeling

knowledge based relation
log transformation
key note - when raise a variable (here meant independent variable) to a power in a formula, should use the I function to treat the expression mathematically, not as an interaction.

Data - loaded the “houseprice” dataset

summary of the dataset

summary(houseprice)

##       size           price      
##  Min.   : 44.0   Min.   : 42.0  
##  1st Qu.: 73.5   1st Qu.:164.5  
##  Median : 91.0   Median :203.5  
##  Mean   : 94.3   Mean   :249.2  
##  3rd Qu.:118.5   3rd Qu.:287.8  
##  Max.   :150.0   Max.   :573.0

formula for price as a function of squared size and fit a lm model from it

fmla_sqr <- price ~ I(size ^2)
model_sqr <- lm(fmla_sqr, data = houseprice)

for comparison, formula for price as size and fit a lm model from it

model_lin <- lm(price ~ size, data = houseprice)

make predictions and compare

houseprice %>% 
  mutate(pred_lin = predict(model_lin, data = houseprice), 
         pred_sqr = predict(model_sqr, data = houseprice)) %>% 
  pivot_longer(cols = starts_with("pred"),
               names_to = "modeltype", 
               values_to = "pred") %>% 
  ggplot(aes(x = size)) +
  geom_point(aes(y = price)) +
  geom_line(aes(y = pred, color = modeltype)) +
  scale_color_brewer(palette = "Dark2")

check residual and RMSE

# Create a splitting plan for 3-fold cross validation
set.seed(34245)  # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(houseprice), 3, NULL, NULL)

# Sample code: get cross-val predictions for price ~ size
houseprice$pred_lin <- 0  # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_lin <- lm(price ~ size, data = houseprice[split$train,])
  houseprice$pred_lin[split$app] <- predict(model_lin, newdata = houseprice[split$app,])
}

# Get cross-val predictions for price as a function of size^2 (use fmla_sqr)
houseprice$pred_sqr <- 0 # initialize the prediction vector
for(i in 1:3) {
  split <- splitPlan[[i]]
  model_sqr <- lm(fmla_sqr, data = houseprice[split$train, ])
  houseprice$pred_sqr[split$app] <- predict(model_sqr, newdata = houseprice[split$app, ])
}

# Gather the predictions and calculate the residuals
houseprice_long <- houseprice %>%
  gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>%
  mutate(residuals = price - pred)

# Compare the cross-validated RMSE for the two models
houseprice_long %>% 
  group_by(modeltype) %>% # group by modeltype
  summarize(rmse = sqrt(mean(residuals ^2)))

## # A tibble: 2 × 2
##   modeltype  rmse
##   <chr>     <dbl>
## 1 pred_lin   73.1
## 2 pred_sqr   60.3

conclusion - quadratic input improved the model.

Some additional not needed and not useful note

pivot longer and wider are not intuitive - same as its predecessor.
below is a simple explanation and hopefully next time do not have to google for solution again.

pivot_longer(dataframe, cols = columns to pivot long, names_to = “desired name for category column”, values_to = “desired name for value column”