Importing the libraries
install.packages('pROC', repos = 'https://cran.r-project.org/')
## Installing package into 'C:/Users/adrij/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'pROC' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\adrij\AppData\Local\Temp\RtmpELNKrN\downloaded_packages
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
library(randomForestSRC)
##
## randomForestSRC 3.2.1
##
## Type rfsrc.news() to see new features, changes, and bug fixes.
##
library(parallel)
library(mlr)
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
##
## Attaching package: 'mlr'
## The following objects are masked from 'package:randomForestSRC':
##
## impute, subsample
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ purrr::partial() masks randomForestSRC::partial()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(parallelMap)
library(kknn)
library(dplyr)
library(ggplot2)
library(readxl)
Importing the Data
nutrition.df <- read_excel("Final_Nutrition2.xlsx")
head(nutrition.df)
# Remove all unecessary columns
nutrition.df <- subset(nutrition.df,
select=-c(NDB_No, Shrt_Desc, Long_Desc,
FdGrp_Desc,GmWt_Desc1, GmWt_Desc2))
# Remove all null rows
nutrition.df <- na.omit(nutrition.df)
set.seed(45210)
nutrition.df <- nutrition.df[,c(1:4,8,10,16)] %>%
sample_n(size = 500)
Regression using LASSO
# Building a task
iowaTask <- makeRegrTask(data = nutrition.df,
target = "Energ_Kcal")
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
# CReate a learner
lasso <- makeLearner("regr.glmnet", alpha = 1, id = "lasso")
lassoParamSpace <- makeParamSet( makeNumericParam("s", lower = 0, upper = 12))
randSearch <- makeTuneControlRandom(maxit = 130)
# Cross validation of tuning process
cvForTuning <- makeResampleDesc("RepCV", folds = 3, reps = 10)
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=8.
# Tune the parameters
tunedLassoPars <- tuneParams(lasso, task = iowaTask,
resampling = cvForTuning,
par.set = lassoParamSpace,
control = randSearch)
## [Tune] Started tuning learner lasso for parameter set:
## Type len Def Constr Req Tunable Trafo
## s numeric - - 0 to 12 - TRUE -
## With control class: TuneControlRandom
## Imputation value: Inf
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 8; elements = 130.
## [Tune] Result: s=0.0686 : mse.test.mean=509.9903410
parallelStop()
## Stopped parallelization. All cleaned up.
# Obtain the hyper parameters
lassoTuningData <- generateHyperParsEffectData(tunedLassoPars)
plotHyperParsEffect(lassoTuningData, x = "s",
y = "mse.test.mean", plot.type = "line") + theme_minimal()

# set hyper parameters
tunedLasso <- setHyperPars(lasso, par.vals = tunedLassoPars$x)
tunedLassoModel <- train(tunedLasso, iowaTask)
lassoModelData <- getLearnerModel(tunedLassoModel)
lassoCoefs <- coef(lassoModelData, s = tunedLassoPars$x$s)
lassoCoefs
## 7 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 370.39074923
## Water_g -3.67928452
## Protein_g 0.37063175
## Lipid_Tot_g 4.95553240
## Iron_mg -2.02448666
## Phosphorus_mg -0.02071408
## Selenium_g 0.24162660
coefTib <- tibble(Coef = rownames(lassoCoefs)[-1])
coefTib$LASSO <- as.vector(lassoCoefs)[-1]
coefUntidy <- gather(coefTib, key = Model, value = Beta, -Coef)
# Plot the coefficients
ggplot(coefUntidy, aes(reorder(Coef, Beta), Beta, fill = Model)) +
geom_bar(stat = "identity", col = "black") +
facet_wrap(~ Model) +
theme_minimal() +
theme(legend.position = "none")

lassoWrapper <- makeTuneWrapper(lasso,
resampling = cvForTuning,
par.set = lassoParamSpace,
control = randSearch)
learners = list(lassoWrapper)
kFold3 <- makeResampleDesc("CV", iters = 3)
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=8.
bench <- benchmark(learners, iowaTask, kFold3)
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.benchmark; cpus = 8; elements = 1.
parallelStop()
## Stopped parallelization. All cleaned up.
bench
## task.id learner.id mse.test.mean
## 1 nutrition.df lasso.tuned 551.6674
## From the LASSO regression model mean squared error rate is 551.6674
kNN model
# Create a task
nutritionTask <- makeRegrTask(data = nutrition.df,
target = "Energ_Kcal")
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
kknn <- makeLearner("regr.kknn")
#tune k-hyperparameter
kknnParamSpace <- makeParamSet(makeDiscreteParam("k", values = 1:12))
gridSearch <- makeTuneControlGrid()
kFold <- makeResampleDesc("CV", iters = 10)
tunedK <- tuneParams(kknn, task = nutritionTask, resampling = kFold, par.set = kknnParamSpace, control = gridSearch)
## [Tune] Started tuning learner regr.kknn for parameter set:
## Type len Def Constr Req Tunable Trafo
## k discrete - - 1,2,3,4,5,6,7,8,9,10,11,12 - TRUE -
## With control class: TuneControlGrid
## Imputation value: Inf
## [Tune-x] 1: k=1
## [Tune-y] 1: mse.test.mean=1629.8080000; time: 0.0 min
## [Tune-x] 2: k=2
## [Tune-y] 2: mse.test.mean=1352.8684822; time: 0.0 min
## [Tune-x] 3: k=3
## [Tune-y] 3: mse.test.mean=1211.3983963; time: 0.0 min
## [Tune-x] 4: k=4
## [Tune-y] 4: mse.test.mean=1133.3707414; time: 0.0 min
## [Tune-x] 5: k=5
## [Tune-y] 5: mse.test.mean=1071.9745988; time: 0.0 min
## [Tune-x] 6: k=6
## [Tune-y] 6: mse.test.mean=1031.1464253; time: 0.0 min
## [Tune-x] 7: k=7
## [Tune-y] 7: mse.test.mean=1006.6449404; time: 0.0 min
## [Tune-x] 8: k=8
## [Tune-y] 8: mse.test.mean=990.2507070; time: 0.0 min
## [Tune-x] 9: k=9
## [Tune-y] 9: mse.test.mean=983.5500055; time: 0.0 min
## [Tune-x] 10: k=10
## [Tune-y] 10: mse.test.mean=984.1405422; time: 0.0 min
## [Tune-x] 11: k=11
## [Tune-y] 11: mse.test.mean=988.9315744; time: 0.0 min
## [Tune-x] 12: k=12
## [Tune-y] 12: mse.test.mean=995.9347340; time: 0.0 min
## [Tune] Result: k=9 : mse.test.mean=983.5500055
knnTuningData <- generateHyperParsEffectData(tunedK)
plotHyperParsEffect(knnTuningData, x = "k",
y = "mse.test.mean",
plot.type = "line") +
theme_minimal()

tunedKnn <- setHyperPars(makeLearner("regr.kknn"),
par.vals = tunedK$x)
tunedKnnModel <- train(tunedKnn, nutritionTask)
tunedKnnModel
## Model for learner.id=regr.kknn; learner.class=regr.kknn
## Trained on: task.id = nutrition.df; obs = 500; features = 6
## Hyperparameters: k=9
kknnWrapper <- makeTuneWrapper(kknn, resampling = kFold,
par.set = kknnParamSpace,
control = gridSearch)
learners = list(kknnWrapper)
holdout <- makeResampleDesc("Holdout")
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=8.
bench <- benchmark(learners, nutritionTask, holdout)
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.benchmark; cpus = 8; elements = 1.
parallelStop()
## Stopped parallelization. All cleaned up.
bench
## task.id learner.id mse.test.mean
## 1 nutrition.df regr.kknn.tuned 1467.902
## From the KNN model mean squared error rate is 1467.902