R Markdown

Importing the libraries

install.packages('pROC', repos = 'https://cran.r-project.org/')
## Installing package into 'C:/Users/adrij/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'pROC' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\adrij\AppData\Local\Temp\RtmpELNKrN\downloaded_packages
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
library(randomForestSRC)
## 
##  randomForestSRC 3.2.1 
##  
##  Type rfsrc.news() to see new features, changes, and bug fixes. 
## 
library(parallel)
library(mlr)
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
## 
## Attaching package: 'mlr'
## The following objects are masked from 'package:randomForestSRC':
## 
##     impute, subsample
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand()  masks Matrix::expand()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ tidyr::pack()    masks Matrix::pack()
## ✖ purrr::partial() masks randomForestSRC::partial()
## ✖ tidyr::unpack()  masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(parallelMap)
library(kknn)
library(dplyr)
library(ggplot2)
library(readxl)

Importing the Data

nutrition.df <- read_excel("Final_Nutrition2.xlsx")
head(nutrition.df)
# Remove all unecessary columns
nutrition.df <- subset(nutrition.df, 
                       select=-c(NDB_No, Shrt_Desc, Long_Desc,
                                 FdGrp_Desc,GmWt_Desc1, GmWt_Desc2)) 

# Remove all null rows
nutrition.df <- na.omit(nutrition.df)
set.seed(45210)
nutrition.df <- nutrition.df[,c(1:4,8,10,16)] %>% 
  sample_n(size = 500)

Regression using LASSO

# Building a task
iowaTask  <-  makeRegrTask(data = nutrition.df, 
                           target = "Energ_Kcal") 
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
# CReate a learner
lasso  <-  makeLearner("regr.glmnet", alpha = 1, id = "lasso") 
lassoParamSpace <- makeParamSet( makeNumericParam("s", lower = 0, upper = 12))

randSearch  <-  makeTuneControlRandom(maxit = 130) 

# Cross validation of tuning process
cvForTuning  <-  makeResampleDesc("RepCV", folds = 3, reps = 10)
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=8.
# Tune the parameters
tunedLassoPars  <-  tuneParams(lasso, task = iowaTask, 
                               resampling = cvForTuning, 
                               par.set = lassoParamSpace, 
                               control = randSearch)
## [Tune] Started tuning learner lasso for parameter set:
##      Type len Def  Constr Req Tunable Trafo
## s numeric   -   - 0 to 12   -    TRUE     -
## With control class: TuneControlRandom
## Imputation value: Inf
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.tuneParams; cpus = 8; elements = 130.
## [Tune] Result: s=0.0686 : mse.test.mean=509.9903410
parallelStop()
## Stopped parallelization. All cleaned up.
# Obtain the hyper parameters
lassoTuningData  <-  generateHyperParsEffectData(tunedLassoPars) 
plotHyperParsEffect(lassoTuningData, x = "s", 
                    y = "mse.test.mean", plot.type = "line") + theme_minimal()

# set hyper parameters
tunedLasso  <-  setHyperPars(lasso, par.vals = tunedLassoPars$x) 
tunedLassoModel  <-  train(tunedLasso, iowaTask) 
lassoModelData  <-  getLearnerModel(tunedLassoModel) 
lassoCoefs  <-  coef(lassoModelData, s = tunedLassoPars$x$s) 
lassoCoefs
## 7 x 1 sparse Matrix of class "dgCMatrix"
##                         s1
## (Intercept)   370.39074923
## Water_g        -3.67928452
## Protein_g       0.37063175
## Lipid_Tot_g     4.95553240
## Iron_mg        -2.02448666
## Phosphorus_mg  -0.02071408
## Selenium_g      0.24162660
coefTib  <-  tibble(Coef = rownames(lassoCoefs)[-1])
coefTib$LASSO  <-  as.vector(lassoCoefs)[-1] 
coefUntidy  <-  gather(coefTib, key = Model, value = Beta, -Coef) 
# Plot the coefficients
ggplot(coefUntidy, aes(reorder(Coef, Beta), Beta, fill = Model)) +
  geom_bar(stat = "identity", col = "black") + 
  facet_wrap(~ Model) + 
  theme_minimal() + 
  theme(legend.position = "none")

lassoWrapper  <-  makeTuneWrapper(lasso, 
                                  resampling = cvForTuning,
                                  par.set = lassoParamSpace,
                                  control = randSearch) 
learners = list(lassoWrapper) 
kFold3  <-  makeResampleDesc("CV", iters = 3) 
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=8.
bench  <-  benchmark(learners, iowaTask, kFold3)
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.benchmark; cpus = 8; elements = 1.
parallelStop()
## Stopped parallelization. All cleaned up.
bench
##        task.id  learner.id mse.test.mean
## 1 nutrition.df lasso.tuned      551.6674
## From the LASSO regression model mean squared error rate is  551.6674

kNN model

# Create a task
nutritionTask  <-  makeRegrTask(data = nutrition.df, 
                                target = "Energ_Kcal") 
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
kknn  <-  makeLearner("regr.kknn")
#tune k-hyperparameter 
kknnParamSpace  <-  makeParamSet(makeDiscreteParam("k", values = 1:12)) 
gridSearch  <-  makeTuneControlGrid() 
kFold  <-  makeResampleDesc("CV", iters = 10) 
tunedK  <-  tuneParams(kknn, task = nutritionTask, resampling = kFold, par.set = kknnParamSpace, control = gridSearch)
## [Tune] Started tuning learner regr.kknn for parameter set:
##       Type len Def                     Constr Req Tunable Trafo
## k discrete   -   - 1,2,3,4,5,6,7,8,9,10,11,12   -    TRUE     -
## With control class: TuneControlGrid
## Imputation value: Inf
## [Tune-x] 1: k=1
## [Tune-y] 1: mse.test.mean=1629.8080000; time: 0.0 min
## [Tune-x] 2: k=2
## [Tune-y] 2: mse.test.mean=1352.8684822; time: 0.0 min
## [Tune-x] 3: k=3
## [Tune-y] 3: mse.test.mean=1211.3983963; time: 0.0 min
## [Tune-x] 4: k=4
## [Tune-y] 4: mse.test.mean=1133.3707414; time: 0.0 min
## [Tune-x] 5: k=5
## [Tune-y] 5: mse.test.mean=1071.9745988; time: 0.0 min
## [Tune-x] 6: k=6
## [Tune-y] 6: mse.test.mean=1031.1464253; time: 0.0 min
## [Tune-x] 7: k=7
## [Tune-y] 7: mse.test.mean=1006.6449404; time: 0.0 min
## [Tune-x] 8: k=8
## [Tune-y] 8: mse.test.mean=990.2507070; time: 0.0 min
## [Tune-x] 9: k=9
## [Tune-y] 9: mse.test.mean=983.5500055; time: 0.0 min
## [Tune-x] 10: k=10
## [Tune-y] 10: mse.test.mean=984.1405422; time: 0.0 min
## [Tune-x] 11: k=11
## [Tune-y] 11: mse.test.mean=988.9315744; time: 0.0 min
## [Tune-x] 12: k=12
## [Tune-y] 12: mse.test.mean=995.9347340; time: 0.0 min
## [Tune] Result: k=9 : mse.test.mean=983.5500055
knnTuningData  <-  generateHyperParsEffectData(tunedK) 
plotHyperParsEffect(knnTuningData, x = "k", 
                    y = "mse.test.mean", 
                    plot.type = "line") + 
  theme_minimal()

tunedKnn  <-  setHyperPars(makeLearner("regr.kknn"), 
                           par.vals = tunedK$x) 
tunedKnnModel  <-  train(tunedKnn, nutritionTask)
tunedKnnModel
## Model for learner.id=regr.kknn; learner.class=regr.kknn
## Trained on: task.id = nutrition.df; obs = 500; features = 6
## Hyperparameters: k=9
kknnWrapper  <-  makeTuneWrapper(kknn, resampling = kFold, 
                               par.set = kknnParamSpace, 
                               control = gridSearch) 
learners = list(kknnWrapper) 
holdout  <-  makeResampleDesc("Holdout") 
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=8.
bench  <-  benchmark(learners, nutritionTask, holdout)
## Exporting objects to slaves for mode socket: .mlr.slave.options
## Mapping in parallel: mode = socket; level = mlr.benchmark; cpus = 8; elements = 1.
parallelStop()
## Stopped parallelization. All cleaned up.
bench
##        task.id      learner.id mse.test.mean
## 1 nutrition.df regr.kknn.tuned      1467.902
## From the KNN model mean squared error rate is  1467.902