caret Package Exploration

Reem

2024-12-09

## Introduction

In this exercise, we will explore the caret package, build a machine learning model with its functions, and extend its functionality with a custom function for evaluating model performance. caret is widely used in R to streamline the process of building and evaluating machine learning models.

## Step 1: Package Discovery and Installation

### Installing and Loading the Package

# Install caret (if not already installed), then attach it.
# In a non-interactive session (e.g. while knitting) install.packages() stops
# with "trying to use CRAN without setting a mirror" when the CRAN entry of
# the `repos` option is still the "@CRAN@" placeholder, so fall back to the
# CRAN cloud mirror only in that case; a mirror the user configured is kept.
if (!requireNamespace("caret", quietly = TRUE)) {
  repos <- getOption("repos")
  if (is.null(repos) || identical(unname(repos["CRAN"]), "@CRAN@")) {
    repos["CRAN"] <- "https://cloud.r-project.org"
  }
  install.packages("caret", repos = repos)
}
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice

### Exploring the Package Documentation

# Display package documentation.
# `?caret` looks for a help topic literally named "caret" and finds none, so
# open the package's help index instead.
help(package = "caret")
# List vignettes for the package
browseVignettes("caret")
## starting httpd help server ... done

## Step 2: Analyzing the Package Structure

### Listing Functions and Datasets

# Names of every object on caret's attached search-path entry, sorted
ls(name = "package:caret", sorted = TRUE)
##   [1] "anovaScores"           "avNNet"                "bag"                  
##   [4] "bagControl"            "bagEarth"              "bagEarthStats"        
##   [7] "bagFDA"                "best"                  "BoxCoxTrans"          
##  [10] "calibration"           "caretFuncs"            "caretGA"              
##  [13] "caretSA"               "caretSBF"              "caretTheme"           
##  [16] "cforestStats"          "checkConditionalX"     "checkInstall"         
##  [19] "checkResamples"        "class2ind"             "classDist"            
##  [22] "cluster"               "compare_models"        "confusionMatrix"      
##  [25] "confusionMatrix.train" "contr.dummy"           "contr.ltfr"           
##  [28] "createDataPartition"   "createFolds"           "createModel"          
##  [31] "createMultiFolds"      "createResample"        "createTimeSlices"     
##  [34] "ctreeBag"              "defaultSummary"        "dotPlot"              
##  [37] "downSample"            "dummyVars"             "expandParameters"     
##  [40] "expoTrans"             "extractPrediction"     "extractProb"          
##  [43] "F_meas"                "featurePlot"           "filterVarImp"         
##  [46] "findCorrelation"       "findLinearCombos"      "flatTable"            
##  [49] "gafs"                  "gafs.default"          "gafs_initial"         
##  [52] "gafs_lrSelection"      "gafs_raMutation"       "gafs_rwSelection"     
##  [55] "gafs_spCrossover"      "gafs_tourSelection"    "gafs_uCrossover"      
##  [58] "gafsControl"           "gamFormula"            "gamFuncs"             
##  [61] "gamScores"             "getModelInfo"          "getSamplingInfo"      
##  [64] "getTrainPerf"          "ggplot.gafs"           "ggplot.safs"          
##  [67] "groupKFold"            "hasTerms"              "icr"                  
##  [70] "index2vec"             "ipredStats"            "knn3"                 
##  [73] "knn3Train"             "knnreg"                "knnregTrain"          
##  [76] "ldaBag"                "ldaFuncs"              "ldaSBF"               
##  [79] "learning_curve_dat"    "lift"                  "lmFuncs"              
##  [82] "lmSBF"                 "LPH07_1"               "LPH07_2"              
##  [85] "lrFuncs"               "MAE"                   "maxDissim"            
##  [88] "MeanSD"                "minDiss"               "mnLogLoss"            
##  [91] "modelCor"              "modelLookup"           "multiClassSummary"    
##  [94] "nbBag"                 "nbFuncs"               "nbSBF"                
##  [97] "nearZeroVar"           "negPredValue"          "nnetBag"              
## [100] "nullModel"             "nzv"                   "oneSE"                
## [103] "outcome_conversion"    "panel.calibration"     "panel.lift"           
## [106] "panel.lift2"           "panel.needle"          "pcaNNet"              
## [109] "pickSizeBest"          "pickSizeTolerance"     "pickVars"             
## [112] "plot.gafs"             "plot.rfe"              "plot.safs"            
## [115] "plot.train"            "plotClassProbs"        "plotObsVsPred"        
## [118] "plsBag"                "plsda"                 "posPredValue"         
## [121] "postResample"          "precision"             "predict.bagEarth"     
## [124] "predict.gafs"          "predict.train"         "predictionFunction"   
## [127] "predictors"            "preProcess"            "print.train"          
## [130] "probFunction"          "progress"              "prSummary"            
## [133] "R2"                    "recall"                "resampleHist"         
## [136] "resamples"             "resampleSummary"       "resampleWrapper"      
## [139] "rfe"                   "rfeControl"            "rfeIter"              
## [142] "rfFuncs"               "rfGA"                  "rfSA"                 
## [145] "rfSBF"                 "rfStats"               "RMSE"                 
## [148] "safs"                  "safs_initial"          "safs_perturb"         
## [151] "safs_prob"             "safsControl"           "sbf"                  
## [154] "sbfControl"            "sbfIter"               "sensitivity"          
## [157] "SLC14_1"               "SLC14_2"               "sortImp"              
## [160] "spatialSign"           "specificity"           "splsda"               
## [163] "sumDiss"               "summary.bagEarth"      "svmBag"               
## [166] "thresholder"           "tolerance"             "train"                
## [169] "trainControl"          "treebagFuncs"          "treebagGA"            
## [172] "treebagSA"             "treebagSBF"            "twoClassSim"          
## [175] "twoClassSummary"       "upSample"              "var_seq"              
## [178] "varImp"                "well_numbered"
# Show the index of datasets that ship with caret
utils::data(package = "caret")

## Step 3: Solving a Practical Problem

### Building a Predictive Model

We will build a linear regression model using the mtcars dataset to predict mpg based on the other variables.

  1. Split Data into Training and Testing Sets
# Partition mtcars on mpg with p = 0.8: selected rows train, the rest test.
# The seed is fixed before partitioning so the split is reproducible.
set.seed(seed = 123)
trainIndex <- createDataPartition(y = mtcars[["mpg"]], p = 0.8, list = FALSE)
trainData <- mtcars[trainIndex, , drop = FALSE]
testData <- mtcars[-trainIndex, , drop = FALSE]
  2. Train a Linear Regression Model
# Fit mpg against every other column of the training set via caret's "lm"
# method, then print the coefficient table of the fitted model
model <- train(
  mpg ~ .,
  data = trainData,
  method = "lm"
)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
summary(object = model)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2742 -1.3609 -0.2707  1.1921  4.9877 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.81069   22.93545  -0.123    0.904
## cyl          0.75593    1.21576   0.622    0.542
## disp         0.01172    0.01674   0.700    0.494
## hp          -0.01386    0.02197  -0.631    0.536
## drat         2.24007    1.77251   1.264    0.223
## wt          -2.73273    1.87954  -1.454    0.164
## qsec         0.53957    0.71812   0.751    0.463
## vs           1.21640    2.02623   0.600    0.556
## am           1.73662    2.08358   0.833    0.416
## gear         2.95127    1.88459   1.566    0.136
## carb        -1.19910    0.98232  -1.221    0.239
## 
## Residual standard error: 2.431 on 17 degrees of freedom
## Multiple R-squared:  0.8861, Adjusted R-squared:  0.8191 
## F-statistic: 13.23 on 10 and 17 DF,  p-value: 3.719e-06
  3. Make Predictions on the Test Set
# Generate mpg predictions for the held-out rows
predictions <- predict(object = model, newdata = testData)
  4. Evaluate the Model
# Score the held-out predictions: root-mean-squared error and the squared
# correlation between predicted and observed mpg
rmse <- sqrt(mean((predictions - testData[["mpg"]])^2))
r_squared <- cor(x = predictions, y = testData[["mpg"]])^2
cat("RMSE:", rmse, "\n")
## RMSE: 4.808981
cat("R-squared:", r_squared, "\n")
## R-squared: 0.6297763

## Step 4: Extending the Package

### Creating a Custom Function for Model Evaluation

We will define a utility function to calculate and return performance metrics.

# Custom function for model evaluation
#' Compute RMSE and R-squared for a set of predictions
#'
#' @param actual Vector of observed values.
#' @param predicted Vector of predicted values; must have the same length as
#'   `actual`.
#' @param na.rm If `TRUE`, pairs where either value is `NA` are dropped before
#'   the metrics are computed. Defaults to `FALSE`, in which case an `NA` in
#'   either input makes both metrics `NA`.
#' @return A list with two elements: `RMSE`, the root-mean-squared error, and
#'   `R_Squared`, the squared correlation between `predicted` and `actual`.
evaluate_model <- function(actual, predicted, na.rm = FALSE) {
  # Fail early with a clear message instead of letting cor() complain about
  # "incompatible dimensions" after the subtraction has already recycled.
  if (length(actual) != length(predicted)) {
    stop("`actual` and `predicted` must have the same length.", call. = FALSE)
  }
  if (isTRUE(na.rm)) {
    # Drop a pair when either side is missing so both metrics use the same rows
    keep <- !(is.na(actual) | is.na(predicted))
    actual <- actual[keep]
    predicted <- predicted[keep]
  }
  rmse <- sqrt(mean((predicted - actual)^2))
  r_squared <- cor(predicted, actual)^2
  list(RMSE = rmse, R_Squared = r_squared)
}

# Score the test-set predictions with the helper and display the result
metrics <- evaluate_model(actual = testData$mpg, predicted = predictions)
print(metrics)
## $RMSE
## [1] 4.808981
## 
## $R_Squared
## [1] 0.6297763

## Conclusion

In this exercise, we:

- Explored the caret package and its documentation.
- Built a predictive model using linear regression.
- Extended functionality with a custom evaluation function.

This demonstrates how caret simplifies machine learning workflows and how custom functions can enhance its capabilities.