#Loading the necessary Libraries for processing the Dataframe

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom        1.0.8     ✔ rsample      1.3.0
## ✔ dials        1.4.0     ✔ tune         1.3.0
## ✔ infer        1.0.8     ✔ workflows    1.2.0
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.3.1     ✔ yardstick    1.3.2
## ✔ recipes      1.3.0
## Warning: package 'broom' was built under R version 4.4.3
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()

#Loading the Dataframe cdc2

load("C:\\Users\\pearl\\Downloads\\cdc2.Rdata")
str(cdc2)
## 'data.frame':    19997 obs. of  15 variables:
##  $ genhlth    : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany    : num  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100   : num  0 1 1 0 0 0 0 0 1 0 ...
##  $ height     : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight     : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire   : int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age        : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender     : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
##  $ BMI        : num  25.1 21.5 20.5 21.3 28.3 ...
##  $ BMIDes     : num  25.1 19.7 20.5 20 24.6 ...
##  $ DesActRatio: num  1 0.92 1 0.939 0.867 ...
##  $ BMICat     : Factor w/ 5 levels "Underweight",..: 3 2 2 2 3 2 3 3 3 3 ...
##  $ BMIDesCat  : Factor w/ 5 levels "Underweight",..: 3 2 2 2 2 2 3 3 2 2 ...
##  $ ageCat     : Factor w/ 4 levels "18-31","32-43",..: 4 2 3 2 3 3 1 3 1 3 ...

##Creating Training and Testing Material from the Dataframe

set.seed(123)
cdc2_split <- initial_split(cdc2,prop =0.7, strata = BMI)

#Creating my training and testing datasets
cdc2_training <- training(cdc2_split)

cdc2_testing <- testing(cdc2_split)


#validating that the datasets created properly
str(cdc2_training)
## 'data.frame':    13996 obs. of  15 variables:
##  $ genhlth    : Factor w/ 5 levels "excellent","very good",..: 3 3 3 2 2 2 2 1 3 1 ...
##  $ exerany    : num  0 1 1 1 0 1 1 1 1 1 ...
##  $ hlthplan   : num  1 1 1 1 0 1 1 0 1 1 ...
##  $ smoke100   : num  1 1 0 0 1 0 1 0 0 1 ...
##  $ height     : num  64 60 66 67 64 65 65 64 68 66 ...
##  $ weight     : int  125 105 132 125 105 125 124 118 130 112 ...
##  $ wtdesire   : int  115 105 124 120 120 125 118 118 130 115 ...
##  $ age        : int  33 49 42 33 27 29 52 35 47 47 ...
##  $ gender     : Factor w/ 2 levels "m","f": 2 2 2 2 2 2 2 2 2 2 ...
##  $ BMI        : num  21.5 20.5 21.3 19.6 18 ...
##  $ BMIDes     : num  19.7 20.5 20 18.8 20.6 ...
##  $ DesActRatio: num  0.92 1 0.939 0.96 1.143 ...
##  $ BMICat     : Factor w/ 5 levels "Underweight",..: 2 2 2 2 1 2 2 2 2 1 ...
##  $ BMIDesCat  : Factor w/ 5 levels "Underweight",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ ageCat     : Factor w/ 4 levels "18-31","32-43",..: 2 3 2 2 1 1 3 2 3 3 ...
str(cdc2_testing)
## 'data.frame':    6001 obs. of  15 variables:
##  $ genhlth    : Factor w/ 5 levels "excellent","very good",..: 2 3 1 2 3 2 1 1 2 1 ...
##  $ exerany    : num  1 1 1 0 1 1 1 0 1 1 ...
##  $ hlthplan   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100   : num  0 0 1 1 1 0 0 0 0 1 ...
##  $ height     : num  64 70 69 73 67 61 63 59 68 69 ...
##  $ weight     : int  114 180 186 160 165 115 117 145 150 172 ...
##  $ wtdesire   : int  114 170 175 160 158 105 110 110 150 172 ...
##  $ age        : int  55 44 46 43 30 32 44 36 43 58 ...
##  $ gender     : Factor w/ 2 levels "m","f": 2 1 1 1 1 2 2 2 1 1 ...
##  $ BMI        : num  19.6 25.8 27.5 21.1 25.8 ...
##  $ BMIDes     : num  19.6 24.4 25.8 21.1 24.7 ...
##  $ DesActRatio: num  1 0.944 0.941 1 0.958 ...
##  $ BMICat     : Factor w/ 5 levels "Underweight",..: 2 3 3 2 3 2 2 3 2 3 ...
##  $ BMIDesCat  : Factor w/ 5 levels "Underweight",..: 2 2 3 2 2 2 2 2 2 3 ...
##  $ ageCat     : Factor w/ 4 levels "18-31","32-43",..: 3 3 3 2 1 2 3 2 2 4 ...
#Verifying the split worked as anticipated
.7 * nrow(cdc2)
## [1] 13997.9
nrow(cdc2)
## [1] 19997
print(" ")
## [1] " "
.3 * nrow(cdc2)
## [1] 5999.1
nrow(cdc2)
## [1] 19997
print(" ")
## [1] " "
summary(cdc2_training$BMI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.75   22.71   25.60   26.32   28.89   73.09
summary(cdc2_testing$BMI)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.40   22.71   25.60   26.28   28.89   61.57

##Linear Modeling #Initializing the linear model specifications

linear_model <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

##Linear Model One #Model designed to predict weight using height

lmOne <- linear_model %>%
  fit(weight ~ height, data = cdc2_training) # training data only
print(lmOne)
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = weight ~ height, data = data)
## 
## Coefficients:
## (Intercept)       height  
##    -196.056        5.445
tidy(lmOne)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)  -196.      4.63       -42.3       0
## 2 height          5.45    0.0688      79.2       0

##Linear Model Two #Model Designed to predict weight using age

lmTwo <- linear_model %>%
  fit(weight ~ age, data = cdc2_training) # training data only
print(lmTwo)
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = weight ~ age, data = data)
## 
## Coefficients:
## (Intercept)          age  
##   169.36650      0.01009
tidy(lmTwo)
## # A tibble: 2 × 5
##   term        estimate std.error statistic p.value
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept) 169.        0.956    177.      0    
## 2 age           0.0101    0.0198     0.511   0.610

##Linear Model Three #Model designed to predict weight based on combination of height and age

lmThree <- linear_model %>%
  fit(weight ~ height + age, data = cdc2_training) # training data only
print(lmThree)
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = weight ~ height + age, data = data)
## 
## Coefficients:
## (Intercept)       height          age  
##   -209.6622       5.5324       0.1712
tidy(lmThree)
## # A tibble: 3 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept) -210.       4.80       -43.7 0       
## 2 height         5.53     0.0690      80.1 0       
## 3 age            0.171    0.0165      10.4 3.76e-25

Linear Model Four

Model designed to predict weight based upon 3 variables: height, age and gender

lmFour <- linear_model %>%
  fit(weight ~ height + age + gender, data = cdc2_training) # training data only
print(lmFour)
## parsnip model object
## 
## 
## Call:
## stats::lm(formula = weight ~ height + age + gender, data = data)
## 
## Coefficients:
## (Intercept)       height          age      genderf  
##   -138.0023       4.5622       0.1553     -11.0792
tidy(lmFour)
## # A tibble: 4 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept) -138.       7.04      -19.6  1.59e-84
## 2 height         4.56     0.0981     46.5  0       
## 3 age            0.155    0.0164      9.46 3.60e-21
## 4 genderf      -11.1      0.801     -13.8  2.87e-43

##Predictions and Evaluation of Models

#Model Predictions are made utilizing the testing material from the database

#Predictions for ModelOne

predictionsOne <-predict(lmOne, new_data = cdc2_testing)
str(predictionsOne)
## tibble [6,001 × 1] (S3: tbl_df/tbl/data.frame)
##  $ .pred: num [1:6001] 152 185 180 201 169 ...

#Predictions for ModelTwo

predictionsTwo <-predict(lmTwo, new_data = cdc2_testing)
str(predictionsTwo)
## tibble [6,001 × 1] (S3: tbl_df/tbl/data.frame)
##  $ .pred: num [1:6001] 170 170 170 170 170 ...

#Predictions for ModelThree

predictionsThree <-predict(lmThree, new_data = cdc2_testing)
str(predictionsThree)
## tibble [6,001 × 1] (S3: tbl_df/tbl/data.frame)
##  $ .pred: num [1:6001] 154 185 180 202 166 ...

#Predictions for ModelFour

predictionsFour <-predict(lmFour, new_data = cdc2_testing)
str(predictionsFour)
## tibble [6,001 × 1] (S3: tbl_df/tbl/data.frame)
##  $ .pred: num [1:6001] 151 188 184 202 172 ...

##Evaluating the Model #Comparing the Distributions of the actual and predicted values

#Summarizing the predictions and test data
summary(predictionsOne$.pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   81.64  152.43  168.76  169.59  185.10  250.44
summary(predictionsTwo$.pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   169.5   169.7   169.8   169.8   169.9   170.3
summary(predictionsThree$.pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   79.68  151.94  168.03  169.51  187.42  247.42
summary(predictionsFour$.pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   90.11  150.45  167.45  169.45  189.17  239.20
#Summarizing the actual values from the test data
summary(cdc2_testing$weight) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   140.0   165.0   169.3   190.0   405.0

Results of Models One through Four

#


resultsOne <- cdc2_testing %>%
  select(weight,height) %>%
  bind_cols(predictionsOne)  


resultsTwo <- cdc2_testing %>%
  select(weight,age) %>%
  bind_cols(predictionsTwo)  

resultsThree <- cdc2_testing %>%
  select(weight,height, age) %>%
  bind_cols(predictionsThree)  

resultsFour <- cdc2_testing %>%
  select(weight,height, age, gender) %>%
  bind_cols(predictionsFour)  


#validate structure of Dataframes
str(resultsOne)
## 'data.frame':    6001 obs. of  3 variables:
##  $ weight: int  114 180 186 160 165 115 117 145 150 172 ...
##  $ height: num  64 70 69 73 67 61 63 59 68 69 ...
##  $ .pred : num  152 185 180 201 169 ...
head(resultsOne)
##    weight height    .pred
## 6     114     64 152.4297
## 10    180     70 185.1002
## 11    186     69 179.6551
## 23    160     73 201.4354
## 24    165     67 168.7649
## 29    115     61 136.0944
str(resultsTwo)
## 'data.frame':    6001 obs. of  3 variables:
##  $ weight: int  114 180 186 160 165 115 117 145 150 172 ...
##  $ age   : int  55 44 46 43 30 32 44 36 43 58 ...
##  $ .pred : num  170 170 170 170 170 ...
head(resultsTwo)
##    weight age    .pred
## 6     114  55 169.9216
## 10    180  44 169.8106
## 11    186  46 169.8308
## 23    160  43 169.8005
## 24    165  30 169.6693
## 29    115  32 169.6895
str(resultsThree)
## 'data.frame':    6001 obs. of  4 variables:
##  $ weight: int  114 180 186 160 165 115 117 145 150 172 ...
##  $ height: num  64 70 69 73 67 61 63 59 68 69 ...
##  $ age   : int  55 44 46 43 30 32 44 36 43 58 ...
##  $ .pred : num  154 185 180 202 166 ...
head(resultsThree)
##    weight height age    .pred
## 6     114     64  55 153.8272
## 10    180     70  44 185.1392
## 11    186     69  46 179.9491
## 23    160     73  43 201.5654
## 24    165     67  30 166.1458
## 29    115     61  32 133.2934
str(resultsFour)
## 'data.frame':    6001 obs. of  5 variables:
##  $ weight: int  114 180 186 160 165 115 117 145 150 172 ...
##  $ height: num  64 70 69 73 67 61 63 59 68 69 ...
##  $ age   : int  55 44 46 43 30 32 44 36 43 58 ...
##  $ gender: Factor w/ 2 levels "m","f": 2 1 1 1 1 2 2 2 1 1 ...
##  $ .pred : num  151 188 184 202 172 ...
head(resultsFour)
##    weight height age gender    .pred
## 6     114     64  55      f 151.4389
## 10    180     70  44      m 188.1830
## 11    186     69  46      m 183.9314
## 23    160     73  43      m 201.7143
## 24    165     67  30      m 172.3225
## 29    115     61  32      f 134.1807

Calculating and Printing RMSE and R squared for the Models

# Calculating the RMSE and R squared for each model
rmseOne <- rmse(resultsOne, truth = weight, estimate = .pred)
rsqOne <- rsq(resultsOne, truth = weight, estimate = .pred)

rmseTwo <- rmse(resultsTwo, truth = weight, estimate = .pred)
rsqTwo <- rsq(resultsTwo, truth = weight, estimate = .pred)

rmseThree <- rmse(resultsThree, truth = weight, estimate = .pred)
rsqThree <- rsq(resultsThree, truth = weight, estimate = .pred)

rmseFour <- rmse(resultsFour, truth = weight, estimate = .pred)
rsqFour <- rsq(resultsFour, truth = weight, estimate = .pred)
print(paste("RMSE for Model One [height alone]:", rmseOne$.estimate))
## [1] "RMSE for Model One [height alone]: 33.0873007460516"
print(paste("RMSE for Model Two [age alone]:", rmseTwo$.estimate))
## [1] "RMSE for Model Two [age alone]: 39.7675369215992"
print(paste("RMSE for Model Three [height and age]:", rmseThree$.estimate))
## [1] "RMSE for Model Three [height and age]: 32.9729131989734"
print(paste("RMSE for Model Four [height, age, and gender]:", rmseFour$.estimate))
## [1] "RMSE for Model Four [height, age, and gender]: 32.6908208152921"

##Plotting the Models Performace via R Squared #Plot for Model One [weight predictions using height alone]

# Plot for Model 1
ggplot(resultsOne, aes(x = weight, y = .pred)) +
  geom_point(alpha = 0.3, color = 'pink') +
  geom_abline(slope = 1, intercept = 0, color = 'blue', linetype = "dashed") +
  labs(
    title = 'Actual vs Predicted Weight [Model 1: Weight predicted by Height]',
    x = 'Actual Weight',
    y = 'Predicted Weight'
  ) +
  theme_minimal()

#Plot for Model Two [weight predictions using age alone]

# Plot for Model Two
ggplot(resultsTwo, aes(x = weight, y = .pred)) +
  geom_point(alpha = 0.3, color = 'green') +
  geom_abline(slope = 1, intercept = 0, color = 'blue', linetype = "dashed") +
  labs(
    title = 'Actual vs Predicted Weight [Model 2: Weight predicted by Age]',
    x = 'Actual Weight',
    y = 'Predicted Weight'
  ) +
  theme_minimal()

#Plot for Model Three [weight predictions using height and age in combination]

# Plot for Model Threee
ggplot(resultsThree, aes(x = weight, y = .pred)) +
  geom_point(alpha = 0.3, color = 'purple') +
  geom_abline(slope = 1, intercept = 0, color = 'blue', linetype = "dashed") +
  labs(
    title = 'Actual vs Predicted Weight [Model 2: Weight predicted by Height and Age]',
    x = 'Actual Weight',
    y = 'Predicted Weight'
  ) +
  theme_minimal()

#Plot for Model Four [weight predictions using height, age and gender together]

# Plot for Model Four
ggplot(resultsFour, aes(x = weight, y = .pred)) +
  geom_point(alpha = 0.3, color = 'orange') +
  geom_abline(slope = 1, intercept = 0, color = 'blue', linetype = "dashed") +
  labs(
    title = 'Actual vs Predicted Weight [Model 2: Weight predicted by Height, Age, and Gender]',
    x = 'Actual Weight',
    y = 'Predicted Weight'
  ) +
  theme_minimal()

##Results — The model with the lowest RMSE is Model Four, showing that model four has the most accurate weight predictions of the four models.

“RMSE for Model One [height alone]: 33.0873007460516” “RMSE for Model Two [age alone]: 39.7675369215992” “RMSE for Model Three [height and age]: 32.9729131989734” “RMSE for Model Four [height, age, and gender]: 32.6908208152921”