#Loading the necessary Libraries for processing the Dataframe
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.4.3
## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
## ✔ broom 1.0.8 ✔ rsample 1.3.0
## ✔ dials 1.4.0 ✔ tune 1.3.0
## ✔ infer 1.0.8 ✔ workflows 1.2.0
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.3.1 ✔ yardstick 1.3.2
## ✔ recipes 1.3.0
## Warning: package 'broom' was built under R version 4.4.3
## Warning: package 'dials' was built under R version 4.4.3
## Warning: package 'scales' was built under R version 4.4.3
## Warning: package 'infer' was built under R version 4.4.3
## Warning: package 'modeldata' was built under R version 4.4.3
## Warning: package 'parsnip' was built under R version 4.4.3
## Warning: package 'recipes' was built under R version 4.4.3
## Warning: package 'rsample' was built under R version 4.4.3
## Warning: package 'tune' was built under R version 4.4.3
## Warning: package 'workflows' was built under R version 4.4.3
## Warning: package 'workflowsets' was built under R version 4.4.3
## Warning: package 'yardstick' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
#Loading the Dataframe cdc2
load("C:\\Users\\pearl\\Downloads\\cdc2.Rdata")
str(cdc2)
## 'data.frame': 19997 obs. of 15 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan : num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100 : num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire : int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
## $ BMI : num 25.1 21.5 20.5 21.3 28.3 ...
## $ BMIDes : num 25.1 19.7 20.5 20 24.6 ...
## $ DesActRatio: num 1 0.92 1 0.939 0.867 ...
## $ BMICat : Factor w/ 5 levels "Underweight",..: 3 2 2 2 3 2 3 3 3 3 ...
## $ BMIDesCat : Factor w/ 5 levels "Underweight",..: 3 2 2 2 2 2 3 3 2 2 ...
## $ ageCat : Factor w/ 4 levels "18-31","32-43",..: 4 2 3 2 3 3 1 3 1 3 ...
##Creating Training and Testing Material from the Dataframe
set.seed(123)
cdc2_split <- initial_split(cdc2,prop =0.7, strata = BMI)
#Creating my training and testing datasets
cdc2_training <- training(cdc2_split)
cdc2_testing <- testing(cdc2_split)
#validating that the datasets created properly
str(cdc2_training)
## 'data.frame': 13996 obs. of 15 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 2 2 2 2 1 3 1 ...
## $ exerany : num 0 1 1 1 0 1 1 1 1 1 ...
## $ hlthplan : num 1 1 1 1 0 1 1 0 1 1 ...
## $ smoke100 : num 1 1 0 0 1 0 1 0 0 1 ...
## $ height : num 64 60 66 67 64 65 65 64 68 66 ...
## $ weight : int 125 105 132 125 105 125 124 118 130 112 ...
## $ wtdesire : int 115 105 124 120 120 125 118 118 130 115 ...
## $ age : int 33 49 42 33 27 29 52 35 47 47 ...
## $ gender : Factor w/ 2 levels "m","f": 2 2 2 2 2 2 2 2 2 2 ...
## $ BMI : num 21.5 20.5 21.3 19.6 18 ...
## $ BMIDes : num 19.7 20.5 20 18.8 20.6 ...
## $ DesActRatio: num 0.92 1 0.939 0.96 1.143 ...
## $ BMICat : Factor w/ 5 levels "Underweight",..: 2 2 2 2 1 2 2 2 2 1 ...
## $ BMIDesCat : Factor w/ 5 levels "Underweight",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ ageCat : Factor w/ 4 levels "18-31","32-43",..: 2 3 2 2 1 1 3 2 3 3 ...
str(cdc2_testing)
## 'data.frame': 6001 obs. of 15 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 2 3 1 2 3 2 1 1 2 1 ...
## $ exerany : num 1 1 1 0 1 1 1 0 1 1 ...
## $ hlthplan : num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100 : num 0 0 1 1 1 0 0 0 0 1 ...
## $ height : num 64 70 69 73 67 61 63 59 68 69 ...
## $ weight : int 114 180 186 160 165 115 117 145 150 172 ...
## $ wtdesire : int 114 170 175 160 158 105 110 110 150 172 ...
## $ age : int 55 44 46 43 30 32 44 36 43 58 ...
## $ gender : Factor w/ 2 levels "m","f": 2 1 1 1 1 2 2 2 1 1 ...
## $ BMI : num 19.6 25.8 27.5 21.1 25.8 ...
## $ BMIDes : num 19.6 24.4 25.8 21.1 24.7 ...
## $ DesActRatio: num 1 0.944 0.941 1 0.958 ...
## $ BMICat : Factor w/ 5 levels "Underweight",..: 2 3 3 2 3 2 2 3 2 3 ...
## $ BMIDesCat : Factor w/ 5 levels "Underweight",..: 2 2 3 2 2 2 2 2 2 3 ...
## $ ageCat : Factor w/ 4 levels "18-31","32-43",..: 3 3 3 2 1 2 3 2 2 4 ...
#Verifying the split worked as anticipated
.7 * nrow(cdc2)
## [1] 13997.9
nrow(cdc2)
## [1] 19997
print(" ")
## [1] " "
.3 * nrow(cdc2)
## [1] 5999.1
nrow(cdc2)
## [1] 19997
print(" ")
## [1] " "
summary(cdc2_training$BMI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.75 22.71 25.60 26.32 28.89 73.09
summary(cdc2_testing$BMI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.40 22.71 25.60 26.28 28.89 61.57
##Linear Modeling #Initializing the linear model specifications
linear_model <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
##Linear Model One #Model designed to predict weight using height
lmOne <- linear_model %>%
fit(weight ~ height, data = cdc2_training) # training data only
print(lmOne)
## parsnip model object
##
##
## Call:
## stats::lm(formula = weight ~ height, data = data)
##
## Coefficients:
## (Intercept) height
## -196.056 5.445
tidy(lmOne)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -196. 4.63 -42.3 0
## 2 height 5.45 0.0688 79.2 0
##Linear Model Two #Model Designed to predict weight using age
lmTwo <- linear_model %>%
fit(weight ~ age, data = cdc2_training) # training data only
print(lmTwo)
## parsnip model object
##
##
## Call:
## stats::lm(formula = weight ~ age, data = data)
##
## Coefficients:
## (Intercept) age
## 169.36650 0.01009
tidy(lmTwo)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 169. 0.956 177. 0
## 2 age 0.0101 0.0198 0.511 0.610
##Linear Model Three #Model designed to predict weight based on combination of height and age
lmThree <- linear_model %>%
fit(weight ~ height + age, data = cdc2_training) # training data only
print(lmThree)
## parsnip model object
##
##
## Call:
## stats::lm(formula = weight ~ height + age, data = data)
##
## Coefficients:
## (Intercept) height age
## -209.6622 5.5324 0.1712
tidy(lmThree)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -210. 4.80 -43.7 0
## 2 height 5.53 0.0690 80.1 0
## 3 age 0.171 0.0165 10.4 3.76e-25
lmFour <- linear_model %>%
fit(weight ~ height + age + gender, data = cdc2_training) # training data only
print(lmFour)
## parsnip model object
##
##
## Call:
## stats::lm(formula = weight ~ height + age + gender, data = data)
##
## Coefficients:
## (Intercept) height age genderf
## -138.0023 4.5622 0.1553 -11.0792
tidy(lmFour)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -138. 7.04 -19.6 1.59e-84
## 2 height 4.56 0.0981 46.5 0
## 3 age 0.155 0.0164 9.46 3.60e-21
## 4 genderf -11.1 0.801 -13.8 2.87e-43
##Predictions and Evaluation of Models
#Model Predictions are made utilizing the testing material from the database
#Predictions for ModelOne
predictionsOne <-predict(lmOne, new_data = cdc2_testing)
str(predictionsOne)
## tibble [6,001 × 1] (S3: tbl_df/tbl/data.frame)
## $ .pred: num [1:6001] 152 185 180 201 169 ...
#Predictions for ModelTwo
predictionsTwo <-predict(lmTwo, new_data = cdc2_testing)
str(predictionsTwo)
## tibble [6,001 × 1] (S3: tbl_df/tbl/data.frame)
## $ .pred: num [1:6001] 170 170 170 170 170 ...
#Predictions for ModelThree
predictionsThree <-predict(lmThree, new_data = cdc2_testing)
str(predictionsThree)
## tibble [6,001 × 1] (S3: tbl_df/tbl/data.frame)
## $ .pred: num [1:6001] 154 185 180 202 166 ...
#Predictions for ModelFour
predictionsFour <-predict(lmFour, new_data = cdc2_testing)
str(predictionsFour)
## tibble [6,001 × 1] (S3: tbl_df/tbl/data.frame)
## $ .pred: num [1:6001] 151 188 184 202 172 ...
##Evaluating the Model #Comparing the Distributions of the actual and predicted values
#Summarizing the predictions and test data
summary(predictionsOne$.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 81.64 152.43 168.76 169.59 185.10 250.44
summary(predictionsTwo$.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 169.5 169.7 169.8 169.8 169.9 170.3
summary(predictionsThree$.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 79.68 151.94 168.03 169.51 187.42 247.42
summary(predictionsFour$.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 90.11 150.45 167.45 169.45 189.17 239.20
#Summarizing the actual values from the test data
summary(cdc2_testing$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.0 140.0 165.0 169.3 190.0 405.0
#
resultsOne <- cdc2_testing %>%
select(weight,height) %>%
bind_cols(predictionsOne)
resultsTwo <- cdc2_testing %>%
select(weight,age) %>%
bind_cols(predictionsTwo)
resultsThree <- cdc2_testing %>%
select(weight,height, age) %>%
bind_cols(predictionsThree)
resultsFour <- cdc2_testing %>%
select(weight,height, age, gender) %>%
bind_cols(predictionsFour)
#validate structure of Dataframes
str(resultsOne)
## 'data.frame': 6001 obs. of 3 variables:
## $ weight: int 114 180 186 160 165 115 117 145 150 172 ...
## $ height: num 64 70 69 73 67 61 63 59 68 69 ...
## $ .pred : num 152 185 180 201 169 ...
head(resultsOne)
## weight height .pred
## 6 114 64 152.4297
## 10 180 70 185.1002
## 11 186 69 179.6551
## 23 160 73 201.4354
## 24 165 67 168.7649
## 29 115 61 136.0944
str(resultsTwo)
## 'data.frame': 6001 obs. of 3 variables:
## $ weight: int 114 180 186 160 165 115 117 145 150 172 ...
## $ age : int 55 44 46 43 30 32 44 36 43 58 ...
## $ .pred : num 170 170 170 170 170 ...
head(resultsTwo)
## weight age .pred
## 6 114 55 169.9216
## 10 180 44 169.8106
## 11 186 46 169.8308
## 23 160 43 169.8005
## 24 165 30 169.6693
## 29 115 32 169.6895
str(resultsThree)
## 'data.frame': 6001 obs. of 4 variables:
## $ weight: int 114 180 186 160 165 115 117 145 150 172 ...
## $ height: num 64 70 69 73 67 61 63 59 68 69 ...
## $ age : int 55 44 46 43 30 32 44 36 43 58 ...
## $ .pred : num 154 185 180 202 166 ...
head(resultsThree)
## weight height age .pred
## 6 114 64 55 153.8272
## 10 180 70 44 185.1392
## 11 186 69 46 179.9491
## 23 160 73 43 201.5654
## 24 165 67 30 166.1458
## 29 115 61 32 133.2934
str(resultsFour)
## 'data.frame': 6001 obs. of 5 variables:
## $ weight: int 114 180 186 160 165 115 117 145 150 172 ...
## $ height: num 64 70 69 73 67 61 63 59 68 69 ...
## $ age : int 55 44 46 43 30 32 44 36 43 58 ...
## $ gender: Factor w/ 2 levels "m","f": 2 1 1 1 1 2 2 2 1 1 ...
## $ .pred : num 151 188 184 202 172 ...
head(resultsFour)
## weight height age gender .pred
## 6 114 64 55 f 151.4389
## 10 180 70 44 m 188.1830
## 11 186 69 46 m 183.9314
## 23 160 73 43 m 201.7143
## 24 165 67 30 m 172.3225
## 29 115 61 32 f 134.1807
# Calculating the RMSE and R squared for each model
rmseOne <- rmse(resultsOne, truth = weight, estimate = .pred)
rsqOne <- rsq(resultsOne, truth = weight, estimate = .pred)
rmseTwo <- rmse(resultsTwo, truth = weight, estimate = .pred)
rsqTwo <- rsq(resultsTwo, truth = weight, estimate = .pred)
rmseThree <- rmse(resultsThree, truth = weight, estimate = .pred)
rsqThree <- rsq(resultsThree, truth = weight, estimate = .pred)
rmseFour <- rmse(resultsFour, truth = weight, estimate = .pred)
rsqFour <- rsq(resultsFour, truth = weight, estimate = .pred)
print(paste("RMSE for Model One [height alone]:", rmseOne$.estimate))
## [1] "RMSE for Model One [height alone]: 33.0873007460516"
print(paste("RMSE for Model Two [age alone]:", rmseTwo$.estimate))
## [1] "RMSE for Model Two [age alone]: 39.7675369215992"
print(paste("RMSE for Model Three [height and age]:", rmseThree$.estimate))
## [1] "RMSE for Model Three [height and age]: 32.9729131989734"
print(paste("RMSE for Model Four [height, age, and gender]:", rmseFour$.estimate))
## [1] "RMSE for Model Four [height, age, and gender]: 32.6908208152921"
##Plotting the Models Performace via R Squared #Plot for Model One [weight predictions using height alone]
# Plot for Model 1
ggplot(resultsOne, aes(x = weight, y = .pred)) +
geom_point(alpha = 0.3, color = 'pink') +
geom_abline(slope = 1, intercept = 0, color = 'blue', linetype = "dashed") +
labs(
title = 'Actual vs Predicted Weight [Model 1: Weight predicted by Height]',
x = 'Actual Weight',
y = 'Predicted Weight'
) +
theme_minimal()
#Plot for Model Two [weight predictions using age alone]
# Plot for Model Two
ggplot(resultsTwo, aes(x = weight, y = .pred)) +
geom_point(alpha = 0.3, color = 'green') +
geom_abline(slope = 1, intercept = 0, color = 'blue', linetype = "dashed") +
labs(
title = 'Actual vs Predicted Weight [Model 2: Weight predicted by Age]',
x = 'Actual Weight',
y = 'Predicted Weight'
) +
theme_minimal()
#Plot for Model Three [weight predictions using height and age in combination]
# Plot for Model Threee
ggplot(resultsThree, aes(x = weight, y = .pred)) +
geom_point(alpha = 0.3, color = 'purple') +
geom_abline(slope = 1, intercept = 0, color = 'blue', linetype = "dashed") +
labs(
title = 'Actual vs Predicted Weight [Model 2: Weight predicted by Height and Age]',
x = 'Actual Weight',
y = 'Predicted Weight'
) +
theme_minimal()
#Plot for Model Four [weight predictions using height, age and gender together]
# Plot for Model Four
ggplot(resultsFour, aes(x = weight, y = .pred)) +
geom_point(alpha = 0.3, color = 'orange') +
geom_abline(slope = 1, intercept = 0, color = 'blue', linetype = "dashed") +
labs(
title = 'Actual vs Predicted Weight [Model 2: Weight predicted by Height, Age, and Gender]',
x = 'Actual Weight',
y = 'Predicted Weight'
) +
theme_minimal()
##Results — The model with the lowest RMSE is Model Four, showing that
model four has the most accurate weight predictions of the four
models.
“RMSE for Model One [height alone]: 33.0873007460516” “RMSE for Model Two [age alone]: 39.7675369215992” “RMSE for Model Three [height and age]: 32.9729131989734” “RMSE for Model Four [height, age, and gender]: 32.6908208152921”