Load Packages and Dataset
Packages used to import and analyse the data include
- googlesheet4 - for loading in data (dataset not provided by the trainer)
- dplyr - for data manipulation
- vtreat - for cross-validation on data
Readlised that the types of the columns - sex and alcohol were in char and not in factor as shown in the example so have made below changes.
alcohol <- alcohol %>%
mutate(Sex = factor(Sex, levels = c("Female", "Male")), Alcohol = factor(Alcohol, levels = c("Alcoholic", "Non-alcoholic")))
str(alcohol)## tibble [32 × 5] (S3: tbl_df/tbl/data.frame)
## $ Subject: num [1:32] 1 2 3 4 5 6 7 8 9 10 ...
## $ Metabol: num [1:32] 0.6 0.6 1.5 0.4 0.1 0.2 0.3 0.3 0.4 1 ...
## $ Gastric: num [1:32] 1 1.6 1.5 2.2 1.1 1.2 0.9 0.8 1.5 0.9 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol: Factor w/ 2 levels "Alcoholic","Non-alcoholic": 1 1 1 2 2 2 2 2 2 2 ...
Some Coding
Modelling an interaction
summary of the dataset
summary(alcohol)## Subject Metabol Gastric Sex
## Min. : 1.00 Min. : 0.100 Min. :0.800 Female:18
## 1st Qu.: 8.75 1st Qu.: 0.600 1st Qu.:1.200 Male :14
## Median :16.50 Median : 1.700 Median :1.600
## Mean :16.50 Mean : 2.422 Mean :1.863
## 3rd Qu.:24.25 3rd Qu.: 2.925 3rd Qu.:2.200
## Max. :32.00 Max. :12.300 Max. :5.200
## Alcohol
## Alcoholic : 8
## Non-alcoholic:24
##
##
##
##
Create the formula with main effects only - here we do not include interactions
(fmla_add <- as.formula(Metabol ~ Gastric + Sex))## Metabol ~ Gastric + Sex
Create the formula with interactions - note that we added Gastric as a main effect but not sex. If we use * but not colon below it would be:
x * y = x + y + x:y
(fmla_interaction <- as.formula(Metabol ~ Gastric + Gastric:Sex ) )## Metabol ~ Gastric + Gastric:Sex
Fit both models - one without interactions and one with interaction
model_add <- lm(fmla_add, data = alcohol)
model_interaction <- lm(fmla_interaction, data = alcohol)Call summary on both models and compare
summary(model_add)##
## Call:
## lm(formula = fmla_add, data = alcohol)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2779 -0.6328 -0.0966 0.5783 4.5703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.9466 0.5198 -3.745 0.000796 ***
## Gastric 1.9656 0.2674 7.352 4.24e-08 ***
## SexMale 1.6174 0.5114 3.163 0.003649 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.331 on 29 degrees of freedom
## Multiple R-squared: 0.7654, Adjusted R-squared: 0.7492
## F-statistic: 47.31 on 2 and 29 DF, p-value: 7.41e-10
summary(model_interaction)##
## Call:
## lm(formula = fmla_interaction, data = alcohol)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4656 -0.5091 0.0143 0.5660 4.0668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.7504 0.5310 -1.413 0.168236
## Gastric 1.1489 0.3450 3.331 0.002372 **
## Gastric:SexMale 1.0422 0.2412 4.321 0.000166 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.204 on 29 degrees of freedom
## Multiple R-squared: 0.8081, Adjusted R-squared: 0.7948
## F-statistic: 61.05 on 2 and 29 DF, p-value: 4.033e-11
Check out-of-sample performance
creating the splitting plan for a 3-fold cross validation.
set.seed(34245)
splitPlan <- kWayCrossValidation(nrow(alcohol), 3, NULL, NULL)
splitPlan## [[1]]
## [[1]]$train
## [1] 3 5 8 9 10 11 13 14 15 16 17 18 19 21 22 24 25 26 29 30 31 32
##
## [[1]]$app
## [1] 7 4 6 1 2 27 20 28 12 23
##
##
## [[2]]
## [[2]]$train
## [1] 1 2 4 5 6 7 10 12 14 16 17 20 21 22 23 24 25 26 27 28 32
##
## [[2]]$app
## [1] 8 13 30 31 29 3 19 9 15 11 18
##
##
## [[3]]
## [[3]]$train
## [1] 1 2 3 4 6 7 8 9 11 12 13 15 18 19 20 23 27 28 29 30 31
##
## [[3]]$app
## [1] 5 25 32 10 16 26 21 17 22 24 14
##
##
## attr(,"splitmethod")
## [1] "kwaycross"
check if splitPlan makes sense. split the dataset into train and app in three different ways (3-fold)
splitPlan[[1]]## $train
## [1] 3 5 8 9 10 11 13 14 15 16 17 18 19 21 22 24 25 26 29 30 31 32
##
## $app
## [1] 7 4 6 1 2 27 20 28 12 23
splitPlan[[2]]## $train
## [1] 1 2 4 5 6 7 10 12 14 16 17 20 21 22 23 24 25 26 27 28 32
##
## $app
## [1] 8 13 30 31 29 3 19 9 15 11 18
splitPlan[[3]]## $train
## [1] 1 2 3 4 6 7 8 9 11 12 13 15 18 19 20 23 27 28 29 30 31
##
## $app
## [1] 5 25 32 10 16 26 21 17 22 24 14
get cross-val predictions for main-effects only model
alcohol$pred_add <- 0 # initialize the prediction vector
for(i in 1:3) {
split <- splitPlan[[i]]
model_add <- lm(fmla_add, data = alcohol[split$train, ]) # fit on training data splitted
alcohol$pred_add[split$app] <- predict(model_add, newdata = alcohol[split$app, ])
}get the cross-val predictions for the model with interactions
alcohol$pred_interaction <- 0 # initialize the prediction vector
for (i in 1:3) {
split <- splitPlan[[i]]
model_interaction <- lm(fmla_interaction, data = alcohol[split$train, ])
alcohol$pred_interaction[split$app] <- predict(model_interaction, newdata = alcohol[split$app, ])
}get RMSE. Instead of using gather function (which is discontinued) we are using pivot_longer here,
alcohol %>%
pivot_longer(
cols = starts_with("pred"),
names_to = "modeltype",
values_to = "pred"
) %>%
mutate(residuals = Metabol - pred) %>%
group_by(modeltype) %>%
summarize(rmse = sqrt(mean(residuals^2)))## # A tibble: 2 × 2
## modeltype rmse
## <chr> <dbl>
## 1 pred_add 1.38
## 2 pred_interaction 1.30
it looks like the model with interaction gives better predictions as the rmse is lower.
Transforming the response before modelling
For example - some usually heavily skewed data
- log transform for monetary data. lm(log(y) ~ x. usually the log-transformed model will give a slightly bigger RMSE but a smaller RMS-relative error
Wrangling the dataset (a bit actually quite a bit and dataname is getting uglier)
first of all, some house-cleaning on the dataset never provided in the exercise (why though?)
fdata_clean <- fdata %>%
slice(-1) %>%
rename(y = X, pred = X.1, label = X.2) %>%
select(y, pred, label)check on data structure and make sure it is the same as in the exercise
dim(fdata_clean)## [1] 100 3
str(fdata_clean)## 'data.frame': 100 obs. of 3 variables:
## $ y : chr "9.1496941" "1.9025206" "3.8598260" "2.3889703" ...
## $ pred : chr "6.430583" "3.473332" "1.594509" "3.764175" ...
## $ label: chr "small purchases" "small purchases" "small purchases" "small purchases" ...
revised the stringasfactor to false in the read.csv so now the columns are imported as chr.
options(digits = 7)
fdata_clean$y <- as.numeric(fdata_clean$y)
fdata_clean$pred <- as.numeric(fdata_clean$pred)now need to revise the “label” column from chr to factor
check on the available levels
unique(fdata_clean$label)## [1] "small purchases" "large purchases"
convert from char to factor with levels and somehow found out numbers in Macbook dropped negative sign from four numbers.
fdata_right <-fdata_clean %>%
mutate(label = factor(label, levels = c("small purchases", "large purchases"))) %>%
mutate(label = as.factor(label))
fdata_right$y <- replace(fdata_right$y, 3, -3.8598260)
fdata_right$y <- replace(fdata_right$y, 14, -1.9750566)
fdata_right$y <- replace(fdata_right$y, 32, -0.8775657)
fdata_right$y <- replace(fdata_right$y, 40, -5.8934988)
fdata_right$y <- replace(fdata_right$y, 6, 13.5618785)
fdata_right$y <- replace(fdata_right$y, 7, 10.1987859)
fdata_right$y <- replace(fdata_right$y, 12, 15.7238410)
fdata_right$y <- replace(fdata_right$y, 16, 18.6282945)
fdata_right$y <- replace(fdata_right$y, 22, 14.7850358)
fdata_right$y <- replace(fdata_right$y, 27, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 29, 13.0587206)
fdata_right$y <- replace(fdata_right$y, 31, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 37, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 38, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 39, 16.1650339)
fdata_right$y <- replace(fdata_right$y, 41, 11.9348659)
fdata_right$y <- replace(fdata_right$y, 50, 10.8161739)check if the data type is correct for label.
str(fdata_right)## 'data.frame': 100 obs. of 3 variables:
## $ y : num 9.15 1.9 -3.86 2.39 1.54 ...
## $ pred : num 6.43 3.47 1.59 3.76 9.51 ...
## $ label: Factor w/ 2 levels "small purchases",..: 1 1 1 1 1 1 1 1 1 1 ...
otherwise we can try to convert all character columns of dataframe to factor - seems more generic.
use the as.data.frame function in combination with the unclass function to covert all character columns to factor in R.
fdata_correct <- as.data.frame(unclass(fdata_clean), stringsAsFactors = TRUE)
str(fdata_correct)## 'data.frame': 100 obs. of 3 variables:
## $ y : num 9.15 1.9 3.86 2.39 1.54 ...
## $ pred : num 6.43 3.47 1.59 3.76 9.51 ...
## $ label: Factor w/ 2 levels "large purchases",..: 2 2 2 2 2 2 2 2 2 2 ...
for easier use, we will rename the corrected data_frame below.
fdata1 <- fdata_rightRelative error
Examine the data - generate the summaries for the groups large and small
fdata1 %>%
group_by(label) %>%
summarize(min = min(y),
mean = mean(y),
max = max(y))## # A tibble: 2 × 4
## label min mean max
## <fct> <dbl> <dbl> <dbl>
## 1 small purchases -5.89 6.30 18.6
## 2 large purchases 96.1 606. 1102.
Add error columns
fdata2 <- fdata1 %>%
group_by(label) %>%
mutate(residual = y - pred,
relerr = residual / y)Compare the rmse and rmse.rel of the large and small groups. note that RMSE is root mean squared residual
fdata2 %>%
group_by(label) %>%
summarize(rmse = sqrt(mean(residual ^ 2)),
rmse.rel = sqrt(mean(relerr ^2)))## # A tibble: 2 × 3
## label rmse rmse.rel
## <fct> <dbl> <dbl>
## 1 small purchases 3.84 1.25
## 2 large purchases 5.54 0.0147
Plot the predictions for both groups of purchases
ggplot(fdata2, aes(x = pred, y = y, color = label)) +
geom_point() +
geom_abline() +
facet_wrap(~label, ncol = 1, scales = "free") +
ggtitle("outcome vs prediction")An good example shows that
though absolute RMSE of large purchase is bigger than of small purchase
the relative RMSE results is the opposite
plotting shows the similar - the large purchase model is fitted better than small one
hence a model with larger RMSE might still be better if the relative errors are more important than absolute errors
also the virtue of the story here is plotting and visualization are essential in model evaluation
Log-transformed monetary output
load the income data.
load("~/Documents/R programming/Datacamp/DC_ML/Data/Regression/Income.RData")income_train <- incometrain
income_test <- incometestExamine the Income 2005 in the training set
summary(income_train$Income2005)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 63 23000 39000 49894 61500 703637
Fit the lm model and make predictions
(fmla.log <- log(Income2005) ~ Arith + Word + Parag + Math + AFQT)## log(Income2005) ~ Arith + Word + Parag + Math + AFQT
(fmla.abs <- Income2005 ~ Arith + Word + Parag + Math + AFQT)## Income2005 ~ Arith + Word + Parag + Math + AFQT
model.log <- lm(fmla.log, data = income_train)
income_test$logpred <- predict(model.log, income_test)
summary(income_test$logpred)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.766 10.133 10.423 10.419 10.705 11.006
model.abs <- lm(formula = fmla.abs, data = income_train)convert predictions back to monetary units
income_test$pred.income <- exp(income_test$logpred)
summary(income_test$pred.income)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17432 25167 33615 35363 44566 60217
plot predicted income vs income
ggplot(income_test, aes(x = pred.income, y = Income2005)) +
geom_point() +
geom_abline(color = "skyblue")Compare RMSE and RMSE relative error
income_test <- income_test %>%
mutate(pred.absmodel = predict(model.abs, income_test), # predictions from model.abs
pred.logmodel = exp(predict(model.log, income_test)))Gather predictions and calculate residuals and relative error
income_long <- income_test %>%
pivot_longer(cols = starts_with("pred"),
names_to = "modeltype",
values_to = "pred") %>%
mutate(residual = Income2005 - pred, relerr = residual / Income2005 )income_long %>%
group_by(modeltype) %>% # group by modeltype
summarize(rmse = sqrt(mean(residual ^ 2)), # RMSE
rmse.rel = sqrt(mean(relerr ^2))) # Root mean squared relative error## # A tibble: 3 × 3
## modeltype rmse rmse.rel
## <chr> <dbl> <dbl>
## 1 pred.absmodel 37448. 3.18
## 2 pred.income 39235. 2.22
## 3 pred.logmodel 39235. 2.22
Transforming inputs before modeling
knowledge based relation
log transformation
key note - when raise a variable (here meant independent variable) to a power in a formula, should use the I function to treat the expression mathematically, not as an interaction.
Data - loaded the “houseprice” dataset
- summary of the dataset
summary(houseprice)## size price
## Min. : 44.0 Min. : 42.0
## 1st Qu.: 73.5 1st Qu.:164.5
## Median : 91.0 Median :203.5
## Mean : 94.3 Mean :249.2
## 3rd Qu.:118.5 3rd Qu.:287.8
## Max. :150.0 Max. :573.0
- formula for price as a function of squared size and fit a lm model from it
fmla_sqr <- price ~ I(size ^2)
model_sqr <- lm(fmla_sqr, data = houseprice)- for comparison, formula for price as size and fit a lm model from it
model_lin <- lm(price ~ size, data = houseprice)- make predictions and compare
houseprice %>%
mutate(pred_lin = predict(model_lin, data = houseprice),
pred_sqr = predict(model_sqr, data = houseprice)) %>%
pivot_longer(cols = starts_with("pred"),
names_to = "modeltype",
values_to = "pred") %>%
ggplot(aes(x = size)) +
geom_point(aes(y = price)) +
geom_line(aes(y = pred, color = modeltype)) +
scale_color_brewer(palette = "Dark2")- check residual and RMSE
# Create a splitting plan for 3-fold cross validation
set.seed(34245) # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(houseprice), 3, NULL, NULL)
# Sample code: get cross-val predictions for price ~ size
houseprice$pred_lin <- 0 # initialize the prediction vector
for(i in 1:3) {
split <- splitPlan[[i]]
model_lin <- lm(price ~ size, data = houseprice[split$train,])
houseprice$pred_lin[split$app] <- predict(model_lin, newdata = houseprice[split$app,])
}
# Get cross-val predictions for price as a function of size^2 (use fmla_sqr)
houseprice$pred_sqr <- 0 # initialize the prediction vector
for(i in 1:3) {
split <- splitPlan[[i]]
model_sqr <- lm(fmla_sqr, data = houseprice[split$train, ])
houseprice$pred_sqr[split$app] <- predict(model_sqr, newdata = houseprice[split$app, ])
}
# Gather the predictions and calculate the residuals
houseprice_long <- houseprice %>%
gather(key = modeltype, value = pred, pred_lin, pred_sqr) %>%
mutate(residuals = price - pred)
# Compare the cross-validated RMSE for the two models
houseprice_long %>%
group_by(modeltype) %>% # group by modeltype
summarize(rmse = sqrt(mean(residuals ^2)))## # A tibble: 2 × 2
## modeltype rmse
## <chr> <dbl>
## 1 pred_lin 73.1
## 2 pred_sqr 60.3
conclusion - quadratic input improved the model.
Some additional not needed and not useful note
pivot longer and wider are not intuitive - same as its predecessor.
below is a simple explanation and hopefully next time do not have to google for solution again.
pivot_longer(dataframe, cols = columns to pivot long, names_to = “desired name for category column”, values_to = “desired name for value column”