Exploration

  1. Print the structure of your dataset
# Compact overview: dimensions plus each column's type and leading values.
utils::str(df)
## tibble [100,000 × 48] (S3: tbl_df/tbl/data.frame)
##  $ survey_code             : num [1:100000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ age                     : num [1:100000] 56 69 46 32 60 25 78 38 56 75 ...
##  $ gender                  : chr [1:100000] "Male" "Female" "Male" "Female" ...
##  $ height                  : num [1:100000] 173 163 177 172 164 ...
##  $ weight                  : num [1:100000] 56.9 97.8 80.7 63.1 40 ...
##  $ bmi                     : num [1:100000] 18.9 36.7 25.7 21.3 14.9 ...
##  $ bmi_estimated           : num [1:100000] 18.9 36.7 25.7 21.3 14.9 ...
##  $ bmi_scaled              : num [1:100000] 56.7 110.1 77 64 44.8 ...
##  $ bmi_corrected           : num [1:100000] 19 36.5 25.6 21.2 14.8 ...
##  $ waist_size              : num [1:100000] 72.2 85.6 90.3 100.5 69 ...
##  $ blood_pressure          : num [1:100000] 118 118 123 148 151 ...
##  $ heart_rate              : num [1:100000] 60.7 66.5 76 68.8 92.3 ...
##  $ cholesterol             : num [1:100000] 215 116 138 203 200 ...
##  $ glucose                 : num [1:100000] 103 116.9 89.2 128.4 94.8 ...
##  $ insulin                 : num [1:100000] NA 10.1 NA 18.7 16 ...
##  $ sleep_hours             : num [1:100000] 6.48 8.43 5.7 5.19 7.91 ...
##  $ sleep_quality           : chr [1:100000] "Fair" "Good" "Poor" "Good" ...
##  $ work_hours              : num [1:100000] 7.67 9.52 5.83 9.49 7.28 ...
##  $ physical_activity       : num [1:100000] 0.357 0.568 3.764 0.889 2.902 ...
##  $ daily_steps             : num [1:100000] 13321 11911 2974 5322 9791 ...
##  $ calorie_intake          : num [1:100000] 2674 2650 1747 2034 2386 ...
##  $ sugar_intake            : num [1:100000] 44.5 74.7 19.7 82.6 46 ...
##  $ alcohol_consumption     : chr [1:100000] NA "Regularly" "Regularly" "Occasionally" ...
##  $ smoking_level           : chr [1:100000] "Non-smoker" "Light" "Heavy" "Heavy" ...
##  $ water_intake            : num [1:100000] 1.694 0.716 2.488 2.643 1.968 ...
##  $ screen_time             : num [1:100000] 5 5.93 4.37 4.12 3.18 ...
##  $ stress_level            : num [1:100000] 2 3 0 10 9 7 7 7 2 10 ...
##  $ mental_health_score     : num [1:100000] 8 9 1 4 7 6 1 2 9 9 ...
##  $ mental_health_support   : chr [1:100000] "No" "No" "No" "No" ...
##  $ education_level         : chr [1:100000] "PhD" "High School" "Master" "Master" ...
##  $ job_type                : chr [1:100000] "Tech" "Office" "Office" "Labor" ...
##  $ occupation              : chr [1:100000] "Farmer" "Engineer" "Teacher" "Teacher" ...
##  $ income                  : num [1:100000] 6760 6241 3429 2619 3662 ...
##  $ diet_type               : chr [1:100000] "Vegan" "Vegan" "Vegan" "Vegetarian" ...
##  $ exercise_type           : chr [1:100000] "Strength" "Cardio" "Cardio" "Mixed" ...
##  $ device_usage            : chr [1:100000] "High" "Moderate" "High" "Low" ...
##  $ healthcare_access       : chr [1:100000] "Poor" "Moderate" "Good" "Moderate" ...
##  $ insurance               : chr [1:100000] "No" "No" "Yes" "No" ...
##  $ sunlight_exposure       : chr [1:100000] "High" "High" "High" "High" ...
##  $ meals_per_day           : num [1:100000] 5 5 4 1 1 4 2 3 2 1 ...
##  $ caffeine_intake         : chr [1:100000] "Moderate" "High" "Moderate" "None" ...
##  $ family_history          : chr [1:100000] "No" "Yes" "No" "No" ...
##  $ pet_owner               : chr [1:100000] "Yes" "No" "No" "Yes" ...
##  $ electrolyte_level       : num [1:100000] 0 0 0 0 0 0 0 0 0 0 ...
##  $ gene_marker_flag        : num [1:100000] 1 1 1 1 1 1 NA 1 NA 1 ...
##  $ environmental_risk_score: num [1:100000] 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
##  $ daily_supplement_dosage : num [1:100000] -2.276 6.239 5.424 8.389 0.333 ...
##  $ target                  : chr [1:100000] "healthy" "healthy" "healthy" "healthy" ...
  2. List the variables in your dataset
# Column names of the raw dataset, in their stored order.
names(df)
##  [1] "survey_code"              "age"                     
##  [3] "gender"                   "height"                  
##  [5] "weight"                   "bmi"                     
##  [7] "bmi_estimated"            "bmi_scaled"              
##  [9] "bmi_corrected"            "waist_size"              
## [11] "blood_pressure"           "heart_rate"              
## [13] "cholesterol"              "glucose"                 
## [15] "insulin"                  "sleep_hours"             
## [17] "sleep_quality"            "work_hours"              
## [19] "physical_activity"        "daily_steps"             
## [21] "calorie_intake"           "sugar_intake"            
## [23] "alcohol_consumption"      "smoking_level"           
## [25] "water_intake"             "screen_time"             
## [27] "stress_level"             "mental_health_score"     
## [29] "mental_health_support"    "education_level"         
## [31] "job_type"                 "occupation"              
## [33] "income"                   "diet_type"               
## [35] "exercise_type"            "device_usage"            
## [37] "healthcare_access"        "insurance"               
## [39] "sunlight_exposure"        "meals_per_day"           
## [41] "caffeine_intake"          "family_history"          
## [43] "pet_owner"                "electrolyte_level"       
## [45] "gene_marker_flag"         "environmental_risk_score"
## [47] "daily_supplement_dosage"  "target"
  3. Print the top 15 rows of your dataset
# Preview the first 15 rows of the raw dataset.
utils::head(df, n = 15)

Quality Check

  4. Remove missing values in your dataset
# Listwise deletion: keep only the rows that have no NA in any column.
df_no_na <- stats::na.omit(df)
  5. Identify and remove duplicated data
# Drop rows that exactly repeat an earlier row; the first occurrence is kept.
df_cleaned <- unique(df_no_na)

5.1 Check row count before and after cleaning

# Report the row count before cleaning and after NA removal + de-duplication.
cat("Original rows:", nrow(df), "\n", sep = " ")
## Original rows: 100000
cat("After removing NA and duplicates:", nrow(df_cleaned), "\n", sep = " ")
## After removing NA and duplicates: 43247

Data Wrangling

  6. Write a user-defined function
#' Bucket water intake values into a hydration category.
#'
#' Vectorised: accepts a single value or a whole column, so it can be applied
#' directly to a data-frame column as well as element by element.
#'
#' @param water Numeric vector of water intake values (may contain `NA`).
#' @return Character vector the same length as `water`: "Very Low" (< 1),
#'   "Low" (1 to < 2), "Moderate" (2 to < 3), "High" (>= 3), and
#'   `NA_character_` wherever `water` is missing.
categorize_water_intake <- function(water) {
  labels <- c("Very Low", "Low", "Moderate", "High")
  # findInterval() yields 0..3: how many of the lower bounds 1, 2, 3 each
  # value has reached. NA stays NA and therefore indexes to NA_character_.
  labels[findInterval(water, c(1, 2, 3)) + 1L]
}

# Derive the hydration category for every row, then spot-check it next to the
# raw intake values.
df_cleaned[["Water_Intake_Category"]] <- sapply(
  df_cleaned[["water_intake"]],
  categorize_water_intake
)
head(df_cleaned[, c("water_intake", "Water_Intake_Category")], n = 10)
  7. Filter rows based on a condition
# Keep the rows whose hydration category is "Very Low" or "Low".
low_water_df <- filter(
  df_cleaned,
  Water_Intake_Category == "Very Low" | Water_Intake_Category == "Low"
)

cat("Low water intake count:", nrow(low_water_df), "\n", sep = " ")
## Low water intake count: 21546
low_water_df %>%
  select(age, gender, water_intake, Water_Intake_Category) %>%
  head(n = 10)

Feature Engineering

  8. Reorder rows in descending order
# Guard against missing values in the columns used below before sorting.
df_cleaned <- filter(
  df_cleaned,
  !is.na(smoking_level),
  !is.na(gender),
  !is.na(age),
  !is.na(bmi)
)

# Highest age first; rows with equal age are ordered by highest BMI first.
df_sorted <- arrange(df_cleaned, desc(age), desc(bmi))

head(df_sorted)
  9. Rename columns
# Give three columns more descriptive names (new_name = old_name).
df_sorted <- rename(
  df_sorted,
  Body_Mass_Index = bmi,
  Hours_of_Sleep = sleep_hours,
  Daily_Calories = calorie_intake
)
names(df_sorted)
##  [1] "survey_code"              "age"                     
##  [3] "gender"                   "height"                  
##  [5] "weight"                   "Body_Mass_Index"         
##  [7] "bmi_estimated"            "bmi_scaled"              
##  [9] "bmi_corrected"            "waist_size"              
## [11] "blood_pressure"           "heart_rate"              
## [13] "cholesterol"              "glucose"                 
## [15] "insulin"                  "Hours_of_Sleep"          
## [17] "sleep_quality"            "work_hours"              
## [19] "physical_activity"        "daily_steps"             
## [21] "Daily_Calories"           "sugar_intake"            
## [23] "alcohol_consumption"      "smoking_level"           
## [25] "water_intake"             "screen_time"             
## [27] "stress_level"             "mental_health_score"     
## [29] "mental_health_support"    "education_level"         
## [31] "job_type"                 "occupation"              
## [33] "income"                   "diet_type"               
## [35] "exercise_type"            "device_usage"            
## [37] "healthcare_access"        "insurance"               
## [39] "sunlight_exposure"        "meals_per_day"           
## [41] "caffeine_intake"          "family_history"          
## [43] "pet_owner"                "electrolyte_level"       
## [45] "gene_marker_flag"         "environmental_risk_score"
## [47] "daily_supplement_dosage"  "target"                  
## [49] "Water_Intake_Category"
  10. Add new variables
# Derive two extra columns from existing measurements.
df_sorted <- mutate(
  df_sorted,
  weight_to_height_ratio = weight / height,
  # NOTE(review): the 1.1 multiplier is undocumented; confirm its rationale.
  adjusted_bmi = Body_Mass_Index * 1.1
)
head(df_sorted, n = 15)

Insight Analyst

  11. Identify dependent & independent variables
# Dependent variable: the outcome label.
dv <- df_sorted[["target"]]

# Independent variables, grouped by theme.
demographic_iv <- select(
  df_sorted,
  age, gender, education_level, job_type, occupation, income
)
health_metrics_iv <- select(
  df_sorted,
  height, weight, Body_Mass_Index, blood_pressure, heart_rate,
  cholesterol, glucose, insulin, mental_health_score
)
lifestyle_iv <- select(
  df_sorted,
  Hours_of_Sleep, sleep_quality, work_hours, physical_activity,
  daily_steps, exercise_type, screen_time
)
dietary_iv <- select(
  df_sorted,
  Daily_Calories, sugar_intake, water_intake, diet_type,
  meals_per_day, caffeine_intake, daily_supplement_dosage
)
risk_iv <- select(
  df_sorted,
  alcohol_consumption, smoking_level, stress_level, family_history,
  gene_marker_flag, environmental_risk_score
)
access_support <- select(
  df_sorted,
  mental_health_support, healthcare_access, insurance,
  sunlight_exposure, pet_owner
)

# Preview the first three rows of each group.
head(demographic_iv, n = 3)
head(health_metrics_iv, n = 3)
head(lifestyle_iv, n = 3)
head(dietary_iv, n = 3)
head(risk_iv, n = 3)
head(access_support, n = 3)
  12. Reshape/join data
# Place the outcome next to the dietary predictors in one data frame.
# check.names = FALSE keeps the column names exactly as given.
dietary_with_target <- data.frame(
  Health_Status = dv,
  dietary_iv,
  check.names = FALSE
)
head(dietary_with_target, n = 5)

Statistical Analysis

  13. Summary statistics
# Keep only the numeric columns. vapply() guarantees a logical mask with one
# entry per column, whereas sapply() can return a list for degenerate input.
numeric_data <- df_sorted[vapply(df_sorted, is.numeric, logical(1))]
summary(numeric_data)
##   survey_code         age            height          weight      
##  Min.   :    2   Min.   :18.00   Min.   :140.0   Min.   : 40.00  
##  1st Qu.:25220   1st Qu.:33.00   1st Qu.:163.3   1st Qu.: 59.91  
##  Median :50222   Median :48.00   Median :170.0   Median : 69.97  
##  Mean   :50110   Mean   :48.52   Mean   :170.0   Mean   : 70.06  
##  3rd Qu.:75078   3rd Qu.:64.00   3rd Qu.:176.8   3rd Qu.: 79.95  
##  Max.   :99999   Max.   :79.00   Max.   :210.0   Max.   :133.80  
##  Body_Mass_Index bmi_estimated     bmi_scaled     bmi_corrected  
##  Min.   :10.13   Min.   :10.13   Min.   : 30.40   Min.   :10.13  
##  1st Qu.:20.27   1st Qu.:20.27   1st Qu.: 60.81   1st Qu.:20.27  
##  Median :24.19   Median :24.19   Median : 72.56   Median :24.17  
##  Mean   :24.49   Mean   :24.49   Mean   : 73.47   Mean   :24.49  
##  3rd Qu.:28.22   3rd Qu.:28.22   3rd Qu.: 84.65   3rd Qu.:28.22  
##  Max.   :59.23   Max.   :59.23   Max.   :177.70   Max.   :59.14  
##    waist_size     blood_pressure     heart_rate      cholesterol    
##  Min.   : 34.09   Min.   : 59.13   Min.   : 36.89   Min.   : 71.96  
##  1st Qu.: 76.69   1st Qu.:109.86   1st Qu.: 68.22   1st Qu.:169.58  
##  Median : 84.91   Median :119.96   Median : 75.04   Median :189.81  
##  Mean   : 84.90   Mean   :120.02   Mean   : 74.95   Mean   :189.93  
##  3rd Qu.: 93.03   3rd Qu.:130.11   3rd Qu.: 81.68   3rd Qu.:210.21  
##  Max.   :133.15   Max.   :180.72   Max.   :112.79   Max.   :314.93  
##     glucose          insulin       Hours_of_Sleep     work_hours    
##  Min.   : 21.91   Min.   :-6.794   Min.   : 3.000   Min.   : 0.000  
##  1st Qu.: 86.41   1st Qu.:11.651   1st Qu.: 5.981   1st Qu.: 6.663  
##  Median : 99.88   Median :15.014   Median : 7.009   Median : 8.009  
##  Mean   : 99.97   Mean   :15.009   Mean   : 7.006   Mean   : 8.004  
##  3rd Qu.:113.51   3rd Qu.:18.406   3rd Qu.: 8.028   3rd Qu.: 9.348  
##  Max.   :183.88   Max.   :34.418   Max.   :12.000   Max.   :16.000  
##  physical_activity  daily_steps    Daily_Calories    sugar_intake   
##  Min.   : 0.000    Min.   : 1000   Min.   : 527.2   Min.   :-25.13  
##  1st Qu.: 1.632    1st Qu.: 5321   1st Qu.:1931.3   1st Qu.: 46.44  
##  Median : 2.977    Median : 7014   Median :2202.9   Median : 60.09  
##  Mean   : 3.043    Mean   : 7019   Mean   :2201.1   Mean   : 59.96  
##  3rd Qu.: 4.335    3rd Qu.: 8719   3rd Qu.:2472.8   3rd Qu.: 73.30  
##  Max.   :11.306    Max.   :18065   Max.   :3825.0   Max.   :135.85  
##   water_intake    screen_time      stress_level    mental_health_score
##  Min.   :0.500   Min.   : 0.000   Min.   : 0.000   Min.   : 0         
##  1st Qu.:1.526   1st Qu.: 3.957   1st Qu.: 2.000   1st Qu.: 2         
##  Median :2.003   Median : 5.994   Median : 5.000   Median : 5         
##  Mean   :2.009   Mean   : 6.012   Mean   : 5.006   Mean   : 5         
##  3rd Qu.:2.482   3rd Qu.: 8.005   3rd Qu.: 8.000   3rd Qu.: 8         
##  Max.   :5.000   Max.   :16.000   Max.   :10.000   Max.   :10         
##      income      meals_per_day   electrolyte_level gene_marker_flag
##  Min.   :  500   Min.   :1.000   Min.   :0         Min.   :1       
##  1st Qu.: 2661   1st Qu.:2.000   1st Qu.:0         1st Qu.:1       
##  Median : 3998   Median :3.000   Median :0         Median :1       
##  Mean   : 4039   Mean   :3.004   Mean   :0         Mean   :1       
##  3rd Qu.: 5357   3rd Qu.:4.000   3rd Qu.:0         3rd Qu.:1       
##  Max.   :11865   Max.   :5.000   Max.   :0         Max.   :1       
##  environmental_risk_score daily_supplement_dosage weight_to_height_ratio
##  Min.   :5.5              Min.   :-9.99873        Min.   :0.2013        
##  1st Qu.:5.5              1st Qu.:-5.01646        1st Qu.:0.3503        
##  Median :5.5              Median :-0.02642        Median :0.4119        
##  Mean   :5.5              Mean   :-0.01843        Mean   :0.4135        
##  3rd Qu.:5.5              3rd Qu.: 4.97760        3rd Qu.:0.4729        
##  Max.   :5.5              Max.   : 9.99954        Max.   :0.8364        
##   adjusted_bmi  
##  Min.   :11.15  
##  1st Qu.:22.30  
##  Median :26.61  
##  Mean   :26.94  
##  3rd Qu.:31.04  
##  Max.   :65.16
  14. Mean, Median, Mode, Range
#' Most frequent value of a vector (statistical mode).
#'
#' @param v Atomic vector.
#' @param na.rm If `TRUE` (the default), missing values are dropped before
#'   counting, matching the `na.rm = TRUE` used for the mean, median and range
#'   in this report. Set to `FALSE` to let `NA` compete as a value.
#' @return A length-one vector of the same type as `v`. When several values
#'   tie for the highest count, the one that appears first in `v` is returned.
#'   `NA` is returned when there is nothing left to count.
get_mode <- function(v, na.rm = TRUE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  uniq_vals <- unique(v)
  # match() maps each element to its position in uniq_vals; tabulate() then
  # counts how often each distinct value occurs.
  counts <- tabulate(match(v, uniq_vals))
  uniq_vals[which.max(counts)]
}

# One statistic per numeric column. vapply() enforces that every column
# produces exactly one number, so the four vectors always line up.
modes <- vapply(numeric_data, get_mode, numeric(1))
ranges <- vapply(
  numeric_data,
  function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
  numeric(1)
)
means <- vapply(numeric_data, mean, numeric(1), na.rm = TRUE)
medians <- vapply(numeric_data, median, numeric(1), na.rm = TRUE)

# Rows are the numeric columns of the data; columns are the statistics.
summary_table <- data.frame(
  Mean = means,
  Median = medians,
  Mode = modes,
  Range = ranges
)

round(summary_table, 2)

Data Prep for Modeling

  15. Correlation between variables
# Pearson correlation between age and BMI, using only complete pairs.
with(
  df_sorted,
  cor(age, Body_Mass_Index, method = "pearson", use = "complete.obs")
)
## [1] 0.008330876
# Pearson correlation matrix of all numeric columns, rounded to 2 decimals.
# Columns with a single constant value have zero standard deviation, so cor()
# warns and reports NA for them (see the NA rows/columns in the output).
round(
  cor(numeric_data, method = "pearson", use = "complete.obs"),
  digits = 2
)
## Warning in cor(numeric_data, method = "pearson", use = "complete.obs"): the
## standard deviation is zero
##                          survey_code   age height weight Body_Mass_Index
## survey_code                     1.00  0.00   0.00   0.00            0.00
## age                             0.00  1.00  -0.01   0.00            0.01
## height                          0.00 -0.01   1.00   0.00           -0.49
## weight                          0.00  0.00   0.00   1.00            0.86
## Body_Mass_Index                 0.00  0.01  -0.49   0.86            1.00
## bmi_estimated                   0.00  0.01  -0.49   0.86            1.00
## bmi_scaled                      0.00  0.01  -0.49   0.86            1.00
## bmi_corrected                   0.00  0.01  -0.49   0.86            1.00
## waist_size                     -0.01  0.00   0.00   0.01            0.00
## blood_pressure                  0.01  0.00   0.01   0.00            0.00
## heart_rate                      0.00 -0.01   0.00   0.00            0.00
## cholesterol                     0.00  0.01   0.00   0.00            0.00
## glucose                         0.00  0.00  -0.01   0.00            0.01
## insulin                         0.00  0.00   0.00   0.00            0.00
## Hours_of_Sleep                  0.00  0.00  -0.01   0.00            0.01
## work_hours                      0.01  0.01   0.00   0.00            0.00
## physical_activity               0.00 -0.01   0.00   0.00            0.00
## daily_steps                     0.00  0.00   0.00   0.00            0.00
## Daily_Calories                 -0.01 -0.01   0.00   0.00            0.00
## sugar_intake                   -0.01  0.00  -0.01   0.01            0.01
## water_intake                    0.00  0.00   0.00   0.01            0.01
## screen_time                     0.00 -0.01   0.00   0.00            0.00
## stress_level                    0.00  0.00  -0.01   0.00            0.00
## mental_health_score            -0.01  0.00   0.00  -0.01           -0.01
## income                         -0.01  0.00  -0.01   0.00            0.00
## meals_per_day                   0.00  0.00   0.00   0.00            0.00
## electrolyte_level                 NA    NA     NA     NA              NA
## gene_marker_flag                  NA    NA     NA     NA              NA
## environmental_risk_score          NA    NA     NA     NA              NA
## daily_supplement_dosage         0.01  0.00   0.00   0.00            0.00
## weight_to_height_ratio          0.00  0.01  -0.27   0.96            0.97
## adjusted_bmi                    0.00  0.01  -0.49   0.86            1.00
##                          bmi_estimated bmi_scaled bmi_corrected waist_size
## survey_code                       0.00       0.00          0.00      -0.01
## age                               0.01       0.01          0.01       0.00
## height                           -0.49      -0.49         -0.49       0.00
## weight                            0.86       0.86          0.86       0.01
## Body_Mass_Index                   1.00       1.00          1.00       0.00
## bmi_estimated                     1.00       1.00          1.00       0.00
## bmi_scaled                        1.00       1.00          1.00       0.00
## bmi_corrected                     1.00       1.00          1.00       0.00
## waist_size                        0.00       0.00          0.00       1.00
## blood_pressure                    0.00       0.00          0.00       0.01
## heart_rate                        0.00       0.00          0.00       0.00
## cholesterol                       0.00       0.00          0.00       0.00
## glucose                           0.01       0.01          0.01       0.00
## insulin                           0.00       0.00          0.00      -0.01
## Hours_of_Sleep                    0.01       0.01          0.01       0.00
## work_hours                        0.00       0.00          0.00       0.01
## physical_activity                 0.00       0.00          0.00       0.00
## daily_steps                       0.00       0.00          0.00       0.00
## Daily_Calories                    0.00       0.00          0.00       0.00
## sugar_intake                      0.01       0.01          0.01       0.00
## water_intake                      0.01       0.01          0.01       0.00
## screen_time                       0.00       0.00          0.00       0.00
## stress_level                      0.00       0.00          0.00       0.00
## mental_health_score              -0.01      -0.01         -0.01      -0.01
## income                            0.00       0.00          0.00       0.01
## meals_per_day                     0.00       0.00          0.00       0.00
## electrolyte_level                   NA         NA            NA         NA
## gene_marker_flag                    NA         NA            NA         NA
## environmental_risk_score            NA         NA            NA         NA
## daily_supplement_dosage           0.00       0.00          0.00      -0.01
## weight_to_height_ratio            0.97       0.97          0.97       0.01
## adjusted_bmi                      1.00       1.00          1.00       0.00
##                          blood_pressure heart_rate cholesterol glucose insulin
## survey_code                        0.01       0.00        0.00    0.00    0.00
## age                                0.00      -0.01        0.01    0.00    0.00
## height                             0.01       0.00        0.00   -0.01    0.00
## weight                             0.00       0.00        0.00    0.00    0.00
## Body_Mass_Index                    0.00       0.00        0.00    0.01    0.00
## bmi_estimated                      0.00       0.00        0.00    0.01    0.00
## bmi_scaled                         0.00       0.00        0.00    0.01    0.00
## bmi_corrected                      0.00       0.00        0.00    0.01    0.00
## waist_size                         0.01       0.00        0.00    0.00   -0.01
## blood_pressure                     1.00       0.00        0.00    0.00    0.01
## heart_rate                         0.00       1.00        0.00    0.00    0.00
## cholesterol                        0.00       0.00        1.00   -0.01    0.00
## glucose                            0.00       0.00       -0.01    1.00    0.00
## insulin                            0.01       0.00        0.00    0.00    1.00
## Hours_of_Sleep                     0.00       0.00       -0.01   -0.01    0.00
## work_hours                         0.01       0.00        0.00    0.00    0.00
## physical_activity                  0.00       0.00       -0.01    0.00    0.00
## daily_steps                       -0.01      -0.01        0.00    0.00    0.00
## Daily_Calories                     0.00       0.01        0.00   -0.01    0.01
## sugar_intake                       0.00       0.00        0.00    0.00    0.00
## water_intake                       0.00       0.00        0.00    0.00    0.00
## screen_time                        0.01       0.00        0.00    0.00    0.00
## stress_level                       0.00       0.00        0.00    0.00    0.00
## mental_health_score                0.00       0.00        0.01    0.00    0.00
## income                             0.00       0.00        0.01    0.00    0.00
## meals_per_day                      0.00       0.00        0.00    0.00    0.00
## electrolyte_level                    NA         NA          NA      NA      NA
## gene_marker_flag                     NA         NA          NA      NA      NA
## environmental_risk_score             NA         NA          NA      NA      NA
## daily_supplement_dosage            0.01       0.00       -0.01    0.00    0.01
## weight_to_height_ratio             0.00       0.00        0.00    0.01    0.00
## adjusted_bmi                       0.00       0.00        0.00    0.01    0.00
##                          Hours_of_Sleep work_hours physical_activity
## survey_code                        0.00       0.01              0.00
## age                                0.00       0.01             -0.01
## height                            -0.01       0.00              0.00
## weight                             0.00       0.00              0.00
## Body_Mass_Index                    0.01       0.00              0.00
## bmi_estimated                      0.01       0.00              0.00
## bmi_scaled                         0.01       0.00              0.00
## bmi_corrected                      0.01       0.00              0.00
## waist_size                         0.00       0.01              0.00
## blood_pressure                     0.00       0.01              0.00
## heart_rate                         0.00       0.00              0.00
## cholesterol                       -0.01       0.00             -0.01
## glucose                           -0.01       0.00              0.00
## insulin                            0.00       0.00              0.00
## Hours_of_Sleep                     1.00       0.00              0.00
## work_hours                         0.00       1.00              0.01
## physical_activity                  0.00       0.01              1.00
## daily_steps                        0.00       0.00              0.00
## Daily_Calories                     0.00       0.00              0.00
## sugar_intake                       0.00       0.01              0.00
## water_intake                       0.00       0.00              0.00
## screen_time                        0.01       0.00              0.01
## stress_level                       0.00       0.00              0.00
## mental_health_score                0.00      -0.01             -0.01
## income                             0.00       0.00              0.00
## meals_per_day                      0.00      -0.01              0.00
## electrolyte_level                    NA         NA                NA
## gene_marker_flag                     NA         NA                NA
## environmental_risk_score             NA         NA                NA
## daily_supplement_dosage            0.00       0.00             -0.01
## weight_to_height_ratio             0.00       0.00              0.00
## adjusted_bmi                       0.01       0.00              0.00
##                          daily_steps Daily_Calories sugar_intake water_intake
## survey_code                     0.00          -0.01        -0.01         0.00
## age                             0.00          -0.01         0.00         0.00
## height                          0.00           0.00        -0.01         0.00
## weight                          0.00           0.00         0.01         0.01
## Body_Mass_Index                 0.00           0.00         0.01         0.01
## bmi_estimated                   0.00           0.00         0.01         0.01
## bmi_scaled                      0.00           0.00         0.01         0.01
## bmi_corrected                   0.00           0.00         0.01         0.01
## waist_size                      0.00           0.00         0.00         0.00
## blood_pressure                 -0.01           0.00         0.00         0.00
## heart_rate                     -0.01           0.01         0.00         0.00
## cholesterol                     0.00           0.00         0.00         0.00
## glucose                         0.00          -0.01         0.00         0.00
## insulin                         0.00           0.01         0.00         0.00
## Hours_of_Sleep                  0.00           0.00         0.00         0.00
## work_hours                      0.00           0.00         0.01         0.00
## physical_activity               0.00           0.00         0.00         0.00
## daily_steps                     1.00           0.00         0.00         0.01
## Daily_Calories                  0.00           1.00         0.00        -0.02
## sugar_intake                    0.00           0.00         1.00        -0.01
## water_intake                    0.01          -0.02        -0.01         1.00
## screen_time                     0.00           0.00         0.01         0.01
## stress_level                    0.01          -0.01         0.00         0.01
## mental_health_score             0.01           0.00         0.00         0.00
## income                          0.00           0.00         0.00         0.00
## meals_per_day                   0.00          -0.01         0.00         0.01
## electrolyte_level                 NA             NA           NA           NA
## gene_marker_flag                  NA             NA           NA           NA
## environmental_risk_score          NA             NA           NA           NA
## daily_supplement_dosage         0.00           0.00         0.01         0.00
## weight_to_height_ratio          0.00           0.00         0.01         0.01
## adjusted_bmi                    0.00           0.00         0.01         0.01
##                          screen_time stress_level mental_health_score income
## survey_code                     0.00         0.00               -0.01  -0.01
## age                            -0.01         0.00                0.00   0.00
## height                          0.00        -0.01                0.00  -0.01
## weight                          0.00         0.00               -0.01   0.00
## Body_Mass_Index                 0.00         0.00               -0.01   0.00
## bmi_estimated                   0.00         0.00               -0.01   0.00
## bmi_scaled                      0.00         0.00               -0.01   0.00
## bmi_corrected                   0.00         0.00               -0.01   0.00
## waist_size                      0.00         0.00               -0.01   0.01
## blood_pressure                  0.01         0.00                0.00   0.00
## heart_rate                      0.00         0.00                0.00   0.00
## cholesterol                     0.00         0.00                0.01   0.01
## glucose                         0.00         0.00                0.00   0.00
## insulin                         0.00         0.00                0.00   0.00
## Hours_of_Sleep                  0.01         0.00                0.00   0.00
## work_hours                      0.00         0.00               -0.01   0.00
## physical_activity               0.01         0.00               -0.01   0.00
## daily_steps                     0.00         0.01                0.01   0.00
## Daily_Calories                  0.00        -0.01                0.00   0.00
## sugar_intake                    0.01         0.00                0.00   0.00
## water_intake                    0.01         0.01                0.00   0.00
## screen_time                     1.00         0.01                0.00   0.00
## stress_level                    0.01         1.00               -0.01   0.00
## mental_health_score             0.00        -0.01                1.00   0.00
## income                          0.00         0.00                0.00   1.00
## meals_per_day                   0.00         0.00                0.00   0.01
## electrolyte_level                 NA           NA                  NA     NA
## gene_marker_flag                  NA           NA                  NA     NA
## environmental_risk_score          NA           NA                  NA     NA
## daily_supplement_dosage         0.00        -0.01                0.00  -0.01
## weight_to_height_ratio          0.00         0.00               -0.01   0.00
## adjusted_bmi                    0.00         0.00               -0.01   0.00
##                          meals_per_day electrolyte_level gene_marker_flag
## survey_code                       0.00                NA               NA
## age                               0.00                NA               NA
## height                            0.00                NA               NA
## weight                            0.00                NA               NA
## Body_Mass_Index                   0.00                NA               NA
## bmi_estimated                     0.00                NA               NA
## bmi_scaled                        0.00                NA               NA
## bmi_corrected                     0.00                NA               NA
## waist_size                        0.00                NA               NA
## blood_pressure                    0.00                NA               NA
## heart_rate                        0.00                NA               NA
## cholesterol                       0.00                NA               NA
## glucose                           0.00                NA               NA
## insulin                           0.00                NA               NA
## Hours_of_Sleep                    0.00                NA               NA
## work_hours                       -0.01                NA               NA
## physical_activity                 0.00                NA               NA
## daily_steps                       0.00                NA               NA
## Daily_Calories                   -0.01                NA               NA
## sugar_intake                      0.00                NA               NA
## water_intake                      0.01                NA               NA
## screen_time                       0.00                NA               NA
## stress_level                      0.00                NA               NA
## mental_health_score               0.00                NA               NA
## income                            0.01                NA               NA
## meals_per_day                     1.00                NA               NA
## electrolyte_level                   NA                 1               NA
## gene_marker_flag                    NA                NA                1
## environmental_risk_score            NA                NA               NA
## daily_supplement_dosage           0.01                NA               NA
## weight_to_height_ratio            0.00                NA               NA
## adjusted_bmi                      0.00                NA               NA
##                          environmental_risk_score daily_supplement_dosage
## survey_code                                    NA                    0.01
## age                                            NA                    0.00
## height                                         NA                    0.00
## weight                                         NA                    0.00
## Body_Mass_Index                                NA                    0.00
## bmi_estimated                                  NA                    0.00
## bmi_scaled                                     NA                    0.00
## bmi_corrected                                  NA                    0.00
## waist_size                                     NA                   -0.01
## blood_pressure                                 NA                    0.01
## heart_rate                                     NA                    0.00
## cholesterol                                    NA                   -0.01
## glucose                                        NA                    0.00
## insulin                                        NA                    0.01
## Hours_of_Sleep                                 NA                    0.00
## work_hours                                     NA                    0.00
## physical_activity                              NA                   -0.01
## daily_steps                                    NA                    0.00
## Daily_Calories                                 NA                    0.00
## sugar_intake                                   NA                    0.01
## water_intake                                   NA                    0.00
## screen_time                                    NA                    0.00
## stress_level                                   NA                   -0.01
## mental_health_score                            NA                    0.00
## income                                         NA                   -0.01
## meals_per_day                                  NA                    0.01
## electrolyte_level                              NA                      NA
## gene_marker_flag                               NA                      NA
## environmental_risk_score                        1                      NA
## daily_supplement_dosage                        NA                    1.00
## weight_to_height_ratio                         NA                    0.00
## adjusted_bmi                                   NA                    0.00
##                          weight_to_height_ratio adjusted_bmi
## survey_code                                0.00         0.00
## age                                        0.01         0.01
## height                                    -0.27        -0.49
## weight                                     0.96         0.86
## Body_Mass_Index                            0.97         1.00
## bmi_estimated                              0.97         1.00
## bmi_scaled                                 0.97         1.00
## bmi_corrected                              0.97         1.00
## waist_size                                 0.01         0.00
## blood_pressure                             0.00         0.00
## heart_rate                                 0.00         0.00
## cholesterol                                0.00         0.00
## glucose                                    0.01         0.01
## insulin                                    0.00         0.00
## Hours_of_Sleep                             0.00         0.01
## work_hours                                 0.00         0.00
## physical_activity                          0.00         0.00
## daily_steps                                0.00         0.00
## Daily_Calories                             0.00         0.00
## sugar_intake                               0.01         0.01
## water_intake                               0.01         0.01
## screen_time                                0.00         0.00
## stress_level                               0.00         0.00
## mental_health_score                       -0.01        -0.01
## income                                     0.00         0.00
## meals_per_day                              0.00         0.00
## electrolyte_level                            NA           NA
## gene_marker_flag                             NA           NA
## environmental_risk_score                     NA           NA
## daily_supplement_dosage                    0.00         0.00
## weight_to_height_ratio                     1.00         0.97
## adjusted_bmi                               0.97         1.00
  1. Create training set
# Fix the RNG seed so the random split below is reproducible.
set.seed(123)

# Draw half of the rows of df_sorted (without replacement) as the training set.
training_set <- sample_frac(df_sorted, size = 0.50)

# Preview the first five sampled rows.
training_set %>% head(n = 5)

Chart Creation

  1. Scatter plot
# Scatter plot of bmi against weight, drawn with orange cross markers
# (point shape 4) on the minimal theme.
ggplot(data = df, mapping = aes(x = weight, y = bmi)) +
  geom_point(color = "orange", shape = 4) +
  ggtitle("Scatter Plot of Weight vs BMI") +
  xlab("Weight (Kg)") +
  ylab("BMI") +
  theme_minimal()

  1. Bar plot
# Tally the rows of df for every smoking_level / gender combination; the
# tally is stored in a column named "count" and the result is ungrouped.
smoking_gender_count <- df %>%
  count(smoking_level, gender, name = "count")

# Side-by-side (dodged) bars per smoking level, one bar per gender, with the
# count printed just above each bar.
ggplot(
  data = smoking_gender_count,
  mapping = aes(x = smoking_level, y = count, fill = gender)
) +
  geom_col(position = "dodge") +
  geom_text(
    mapping = aes(label = count),
    # Same dodge width as the bars so each label sits over its own bar.
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    size = 3.5
  ) +
  scale_fill_manual(values = c("steelblue", "orange")) +
  ggtitle("Count by Smoking Level and Gender") +
  xlab("Smoking Level") +
  ylab("Count") +
  theme_minimal()