Reem
2024-12-09
## Introduction

In this exercise, we will explore the caret package, create a machine learning model using its functions, and extend its functionality with a custom function for evaluating model performance. caret is widely used in R for streamlining the process of building and evaluating machine learning models.
## Step 1: Package Discovery and Installation

### Installing and Loading the Package
# Install caret on first use, then attach it for the rest of the session
if (isFALSE(requireNamespace("caret", quietly = TRUE))) {
  install.packages("caret")
}
library("caret")
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
### Exploring the Package Documentation
## No documentation for 'caret' in specified packages and libraries:
## you could try '??caret'
## starting httpd help server ... done
## Step 2: Analyzing the Package Structure

### Listing Functions and Datasets
## [1] "anovaScores" "avNNet" "bag"
## [4] "bagControl" "bagEarth" "bagEarthStats"
## [7] "bagFDA" "best" "BoxCoxTrans"
## [10] "calibration" "caretFuncs" "caretGA"
## [13] "caretSA" "caretSBF" "caretTheme"
## [16] "cforestStats" "checkConditionalX" "checkInstall"
## [19] "checkResamples" "class2ind" "classDist"
## [22] "cluster" "compare_models" "confusionMatrix"
## [25] "confusionMatrix.train" "contr.dummy" "contr.ltfr"
## [28] "createDataPartition" "createFolds" "createModel"
## [31] "createMultiFolds" "createResample" "createTimeSlices"
## [34] "ctreeBag" "defaultSummary" "dotPlot"
## [37] "downSample" "dummyVars" "expandParameters"
## [40] "expoTrans" "extractPrediction" "extractProb"
## [43] "F_meas" "featurePlot" "filterVarImp"
## [46] "findCorrelation" "findLinearCombos" "flatTable"
## [49] "gafs" "gafs.default" "gafs_initial"
## [52] "gafs_lrSelection" "gafs_raMutation" "gafs_rwSelection"
## [55] "gafs_spCrossover" "gafs_tourSelection" "gafs_uCrossover"
## [58] "gafsControl" "gamFormula" "gamFuncs"
## [61] "gamScores" "getModelInfo" "getSamplingInfo"
## [64] "getTrainPerf" "ggplot.gafs" "ggplot.safs"
## [67] "groupKFold" "hasTerms" "icr"
## [70] "index2vec" "ipredStats" "knn3"
## [73] "knn3Train" "knnreg" "knnregTrain"
## [76] "ldaBag" "ldaFuncs" "ldaSBF"
## [79] "learning_curve_dat" "lift" "lmFuncs"
## [82] "lmSBF" "LPH07_1" "LPH07_2"
## [85] "lrFuncs" "MAE" "maxDissim"
## [88] "MeanSD" "minDiss" "mnLogLoss"
## [91] "modelCor" "modelLookup" "multiClassSummary"
## [94] "nbBag" "nbFuncs" "nbSBF"
## [97] "nearZeroVar" "negPredValue" "nnetBag"
## [100] "nullModel" "nzv" "oneSE"
## [103] "outcome_conversion" "panel.calibration" "panel.lift"
## [106] "panel.lift2" "panel.needle" "pcaNNet"
## [109] "pickSizeBest" "pickSizeTolerance" "pickVars"
## [112] "plot.gafs" "plot.rfe" "plot.safs"
## [115] "plot.train" "plotClassProbs" "plotObsVsPred"
## [118] "plsBag" "plsda" "posPredValue"
## [121] "postResample" "precision" "predict.bagEarth"
## [124] "predict.gafs" "predict.train" "predictionFunction"
## [127] "predictors" "preProcess" "print.train"
## [130] "probFunction" "progress" "prSummary"
## [133] "R2" "recall" "resampleHist"
## [136] "resamples" "resampleSummary" "resampleWrapper"
## [139] "rfe" "rfeControl" "rfeIter"
## [142] "rfFuncs" "rfGA" "rfSA"
## [145] "rfSBF" "rfStats" "RMSE"
## [148] "safs" "safs_initial" "safs_perturb"
## [151] "safs_prob" "safsControl" "sbf"
## [154] "sbfControl" "sbfIter" "sensitivity"
## [157] "SLC14_1" "SLC14_2" "sortImp"
## [160] "spatialSign" "specificity" "splsda"
## [163] "sumDiss" "summary.bagEarth" "svmBag"
## [166] "thresholder" "tolerance" "train"
## [169] "trainControl" "treebagFuncs" "treebagGA"
## [172] "treebagSA" "treebagSBF" "twoClassSim"
## [175] "twoClassSummary" "upSample" "var_seq"
## [178] "varImp" "well_numbered"
## Step 3: Solving a Practical Problem

### Building a Predictive Model

We will build a linear regression model using the mtcars dataset to predict mpg from the other variables.
# Partition mtcars into an 80% training set and a 20% test set.
# The seed is fixed first so the partition is reproducible.
set.seed(123)
trainIndex <- createDataPartition(y = mtcars[["mpg"]], p = 0.8, list = FALSE)
# Rows selected by the partition form the training set ...
trainData <- mtcars[trainIndex, ]
# ... and every remaining row forms the test set
testData <- mtcars[-trainIndex, ]
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2742 -1.3609 -0.2707 1.1921 4.9877
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.81069 22.93545 -0.123 0.904
## cyl 0.75593 1.21576 0.622 0.542
## disp 0.01172 0.01674 0.700 0.494
## hp -0.01386 0.02197 -0.631 0.536
## drat 2.24007 1.77251 1.264 0.223
## wt -2.73273 1.87954 -1.454 0.164
## qsec 0.53957 0.71812 0.751 0.463
## vs 1.21640 2.02623 0.600 0.556
## am 1.73662 2.08358 0.833 0.416
## gear 2.95127 1.88459 1.566 0.136
## carb -1.19910 0.98232 -1.221 0.239
##
## Residual standard error: 2.431 on 17 degrees of freedom
## Multiple R-squared: 0.8861, Adjusted R-squared: 0.8191
## F-statistic: 13.23 on 10 and 17 DF, p-value: 3.719e-06
# Evaluate the model on the held-out test set.
# `predictions` is expected to hold the model's predictions for `testData`
# (presumably produced by an earlier predict() call not shown here).
# RMSE: square root of the mean squared prediction error
rmse <- sqrt(mean((predictions - testData$mpg)^2))
# R-squared: squared correlation between predicted and observed mpg
r_squared <- cor(predictions, testData$mpg)^2
cat("RMSE:", rmse, "\n")
# Report R-squared as well: it was computed above but never printed,
# even though the recorded output includes an "R-squared:" line
cat("R-squared:", r_squared, "\n")
## RMSE: 4.808981
## R-squared: 0.6297763
## Step 4: Extending the Package

### Creating a Custom Function for Model Evaluation

We will define a utility function to calculate and return performance metrics.
# Custom function for model evaluation
#
# Computes two performance metrics for a set of predictions:
#   RMSE      - root mean squared error between predicted and actual values
#   R_Squared - squared correlation between predicted and actual values
#
# Arguments:
#   actual    - vector of observed values
#   predicted - vector of predicted values, one per element of `actual`
#   na.rm     - if TRUE, pairs in which either value is NA are dropped before
#               the metrics are computed (intended for plain vectors);
#               defaults to FALSE, in which case NA values propagate
#
# Returns a list with elements `RMSE` and `R_Squared`.
# Stops with an error when `actual` and `predicted` differ in length.
evaluate_model <- function(actual, predicted, na.rm = FALSE) {
  # Fail early with a clear message; otherwise the subtraction below may
  # recycle the shorter input before cor() rejects the mismatch
  if (NROW(actual) != NROW(predicted)) {
    stop("`actual` and `predicted` must have the same length", call. = FALSE)
  }
  if (isTRUE(na.rm)) {
    # Keep only the pairs where both values are present
    complete <- !(is.na(actual) | is.na(predicted))
    actual <- actual[complete]
    predicted <- predicted[complete]
  }
  rmse <- sqrt(mean((predicted - actual)^2))
  r_squared <- cor(predicted, actual)^2
  list(RMSE = rmse, R_Squared = r_squared)
}
# Compute the metrics for the test-set predictions and display them
metrics <- evaluate_model(actual = testData$mpg, predicted = predictions)
print(metrics)
## $RMSE
## [1] 4.808981
##
## $R_Squared
## [1] 0.6297763
## Conclusion

In this exercise, we:

- Explored the caret package and its documentation.
- Built a predictive model using linear regression.
- Extended functionality with a custom evaluation function.

This demonstrates how caret simplifies machine learning workflows and how custom functions can enhance its capabilities.