# Inspect the structure of the raw data: dimensions, column types and the
# first few values of every column.
str(object = df)
## tibble [100,000 × 48] (S3: tbl_df/tbl/data.frame)
## $ survey_code : num [1:100000] 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num [1:100000] 56 69 46 32 60 25 78 38 56 75 ...
## $ gender : chr [1:100000] "Male" "Female" "Male" "Female" ...
## $ height : num [1:100000] 173 163 177 172 164 ...
## $ weight : num [1:100000] 56.9 97.8 80.7 63.1 40 ...
## $ bmi : num [1:100000] 18.9 36.7 25.7 21.3 14.9 ...
## $ bmi_estimated : num [1:100000] 18.9 36.7 25.7 21.3 14.9 ...
## $ bmi_scaled : num [1:100000] 56.7 110.1 77 64 44.8 ...
## $ bmi_corrected : num [1:100000] 19 36.5 25.6 21.2 14.8 ...
## $ waist_size : num [1:100000] 72.2 85.6 90.3 100.5 69 ...
## $ blood_pressure : num [1:100000] 118 118 123 148 151 ...
## $ heart_rate : num [1:100000] 60.7 66.5 76 68.8 92.3 ...
## $ cholesterol : num [1:100000] 215 116 138 203 200 ...
## $ glucose : num [1:100000] 103 116.9 89.2 128.4 94.8 ...
## $ insulin : num [1:100000] NA 10.1 NA 18.7 16 ...
## $ sleep_hours : num [1:100000] 6.48 8.43 5.7 5.19 7.91 ...
## $ sleep_quality : chr [1:100000] "Fair" "Good" "Poor" "Good" ...
## $ work_hours : num [1:100000] 7.67 9.52 5.83 9.49 7.28 ...
## $ physical_activity : num [1:100000] 0.357 0.568 3.764 0.889 2.902 ...
## $ daily_steps : num [1:100000] 13321 11911 2974 5322 9791 ...
## $ calorie_intake : num [1:100000] 2674 2650 1747 2034 2386 ...
## $ sugar_intake : num [1:100000] 44.5 74.7 19.7 82.6 46 ...
## $ alcohol_consumption : chr [1:100000] NA "Regularly" "Regularly" "Occasionally" ...
## $ smoking_level : chr [1:100000] "Non-smoker" "Light" "Heavy" "Heavy" ...
## $ water_intake : num [1:100000] 1.694 0.716 2.488 2.643 1.968 ...
## $ screen_time : num [1:100000] 5 5.93 4.37 4.12 3.18 ...
## $ stress_level : num [1:100000] 2 3 0 10 9 7 7 7 2 10 ...
## $ mental_health_score : num [1:100000] 8 9 1 4 7 6 1 2 9 9 ...
## $ mental_health_support : chr [1:100000] "No" "No" "No" "No" ...
## $ education_level : chr [1:100000] "PhD" "High School" "Master" "Master" ...
## $ job_type : chr [1:100000] "Tech" "Office" "Office" "Labor" ...
## $ occupation : chr [1:100000] "Farmer" "Engineer" "Teacher" "Teacher" ...
## $ income : num [1:100000] 6760 6241 3429 2619 3662 ...
## $ diet_type : chr [1:100000] "Vegan" "Vegan" "Vegan" "Vegetarian" ...
## $ exercise_type : chr [1:100000] "Strength" "Cardio" "Cardio" "Mixed" ...
## $ device_usage : chr [1:100000] "High" "Moderate" "High" "Low" ...
## $ healthcare_access : chr [1:100000] "Poor" "Moderate" "Good" "Moderate" ...
## $ insurance : chr [1:100000] "No" "No" "Yes" "No" ...
## $ sunlight_exposure : chr [1:100000] "High" "High" "High" "High" ...
## $ meals_per_day : num [1:100000] 5 5 4 1 1 4 2 3 2 1 ...
## $ caffeine_intake : chr [1:100000] "Moderate" "High" "Moderate" "None" ...
## $ family_history : chr [1:100000] "No" "Yes" "No" "No" ...
## $ pet_owner : chr [1:100000] "Yes" "No" "No" "Yes" ...
## $ electrolyte_level : num [1:100000] 0 0 0 0 0 0 0 0 0 0 ...
## $ gene_marker_flag : num [1:100000] 1 1 1 1 1 1 NA 1 NA 1 ...
## $ environmental_risk_score: num [1:100000] 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
## $ daily_supplement_dosage : num [1:100000] -2.276 6.239 5.424 8.389 0.333 ...
## $ target : chr [1:100000] "healthy" "healthy" "healthy" "healthy" ...
# List every column name of the raw data frame.
names(df)
## [1] "survey_code" "age"
## [3] "gender" "height"
## [5] "weight" "bmi"
## [7] "bmi_estimated" "bmi_scaled"
## [9] "bmi_corrected" "waist_size"
## [11] "blood_pressure" "heart_rate"
## [13] "cholesterol" "glucose"
## [15] "insulin" "sleep_hours"
## [17] "sleep_quality" "work_hours"
## [19] "physical_activity" "daily_steps"
## [21] "calorie_intake" "sugar_intake"
## [23] "alcohol_consumption" "smoking_level"
## [25] "water_intake" "screen_time"
## [27] "stress_level" "mental_health_score"
## [29] "mental_health_support" "education_level"
## [31] "job_type" "occupation"
## [33] "income" "diet_type"
## [35] "exercise_type" "device_usage"
## [37] "healthcare_access" "insurance"
## [39] "sunlight_exposure" "meals_per_day"
## [41] "caffeine_intake" "family_history"
## [43] "pet_owner" "electrolyte_level"
## [45] "gene_marker_flag" "environmental_risk_score"
## [47] "daily_supplement_dosage" "target"
# Preview the first 15 rows of the raw data.
head(df, n = 15)

# Complete-case filter: a row is dropped as soon as ANY of its columns is NA.
df_no_na <- na.omit(object = df)

# Keep only the first occurrence of each fully identical row.
df_cleaned <- df_no_na[!duplicated(x = df_no_na), , drop = FALSE]
# 5.1 Check row count before and after cleaning
# Report how many rows the raw data has and how many survive the cleaning.
cat(sprintf("Original rows: %d \n", nrow(df)))
## Original rows: 100000
cat(sprintf("After removing NA and duplicates: %d \n", nrow(df_cleaned)))
## After removing NA and duplicates: 43247
# Classify water intake values into category labels.
#
# Vectorised: `water` may be a numeric vector of any length (including zero),
# so the function can be called directly on a whole column as well as through
# sapply() on single values.
#
# Categories (lower bound inclusive, upper bound exclusive):
#   water < 1        -> "Very Low"
#   1 <= water < 2   -> "Low"
#   2 <= water < 3   -> "Moderate"
#   water >= 3       -> "High"
#
# Args:
#   water: numeric vector of water intake values.
#
# Returns:
#   A character vector the same length as `water`; missing inputs (NA/NaN)
#   yield NA_character_.
categorize_water_intake <- function(water) {
  labels <- c("Very Low", "Low", "Moderate", "High")
  # findInterval() gives 0 below the first break and k once `water` reaches
  # the k-th break, so adding 1 turns it into an index into `labels`.
  # NA propagates through findInterval() and the subsetting as NA.
  labels[findInterval(water, c(1, 2, 3)) + 1L]
}
# Label every row with its water intake category, one value at a time.
df_cleaned[["Water_Intake_Category"]] <- sapply(
  X = df_cleaned[["water_intake"]],
  FUN = categorize_water_intake
)

# Show the raw value next to its category for the first 10 rows.
head(df_cleaned[, c("water_intake", "Water_Intake_Category")], n = 10)
# Rows whose water intake category is "Very Low" or "Low".
low_water_df <- filter(
  df_cleaned,
  Water_Intake_Category %in% c("Very Low", "Low")
)
cat(sprintf("Low water intake count: %d \n", nrow(low_water_df)))
## Low water intake count: 21546
head(
  low_water_df[, c("age", "gender", "water_intake", "Water_Intake_Category")],
  n = 10
)
# Drop rows that are missing any of the four key fields.
df_cleaned <- filter(
  df_cleaned,
  !(is.na(smoking_level) | is.na(gender) | is.na(age) | is.na(bmi))
)

# Order by age, then BMI, both descending.
df_sorted <- arrange(df_cleaned, desc(age), desc(bmi))
head(df_sorted, n = 6)
# Give three columns more descriptive names (new_name = old_name).
df_sorted <- rename(
  df_sorted,
  Body_Mass_Index = bmi,
  Hours_of_Sleep = sleep_hours,
  Daily_Calories = calorie_intake
)

# Confirm the renamed columns.
names(df_sorted)
## [1] "survey_code" "age"
## [3] "gender" "height"
## [5] "weight" "Body_Mass_Index"
## [7] "bmi_estimated" "bmi_scaled"
## [9] "bmi_corrected" "waist_size"
## [11] "blood_pressure" "heart_rate"
## [13] "cholesterol" "glucose"
## [15] "insulin" "Hours_of_Sleep"
## [17] "sleep_quality" "work_hours"
## [19] "physical_activity" "daily_steps"
## [21] "Daily_Calories" "sugar_intake"
## [23] "alcohol_consumption" "smoking_level"
## [25] "water_intake" "screen_time"
## [27] "stress_level" "mental_health_score"
## [29] "mental_health_support" "education_level"
## [31] "job_type" "occupation"
## [33] "income" "diet_type"
## [35] "exercise_type" "device_usage"
## [37] "healthcare_access" "insurance"
## [39] "sunlight_exposure" "meals_per_day"
## [41] "caffeine_intake" "family_history"
## [43] "pet_owner" "electrolyte_level"
## [45] "gene_marker_flag" "environmental_risk_score"
## [47] "daily_supplement_dosage" "target"
## [49] "Water_Intake_Category"
# Add two derived columns: weight divided by height, and BMI scaled by 1.1.
df_sorted <- mutate(
  df_sorted,
  weight_to_height_ratio = weight / height,
  adjusted_bmi = 1.1 * Body_Mass_Index
)
head(df_sorted, n = 15)
# Dependent variable: the `target` column.
dv <- df_sorted[["target"]]

# Independent variables, split into thematic groups of columns.
demographic_iv <- select(
  df_sorted,
  age, gender, education_level, job_type, occupation, income
)
health_metrics_iv <- select(
  df_sorted,
  height, weight, Body_Mass_Index, blood_pressure, heart_rate,
  cholesterol, glucose, insulin, mental_health_score
)
lifestyle_iv <- select(
  df_sorted,
  Hours_of_Sleep, sleep_quality, work_hours, physical_activity,
  daily_steps, exercise_type, screen_time
)
dietary_iv <- select(
  df_sorted,
  Daily_Calories, sugar_intake, water_intake, diet_type,
  meals_per_day, caffeine_intake, daily_supplement_dosage
)
risk_iv <- select(
  df_sorted,
  alcohol_consumption, smoking_level, stress_level, family_history,
  gene_marker_flag, environmental_risk_score
)
access_support <- select(
  df_sorted,
  mental_health_support, healthcare_access, insurance,
  sunlight_exposure, pet_owner
)

# Preview the first 3 rows of each group.
head(demographic_iv, n = 3)
head(health_metrics_iv, n = 3)
head(lifestyle_iv, n = 3)
head(dietary_iv, n = 3)
head(risk_iv, n = 3)
head(access_support, n = 3)

# Put the dependent variable next to the dietary predictors.
dietary_with_target <- cbind(Health_Status = dv, dietary_iv)
head(dietary_with_target, n = 5)
# Keep only the numeric columns. vapply() always returns a logical vector
# (sapply() would return an empty list for a zero-column input, which cannot
# be used as a column index).
numeric_data <- df_sorted[vapply(df_sorted, is.numeric, logical(1))]

# Five-number summary plus mean for every numeric column.
summary(numeric_data)
## survey_code age height weight
## Min. : 2 Min. :18.00 Min. :140.0 Min. : 40.00
## 1st Qu.:25220 1st Qu.:33.00 1st Qu.:163.3 1st Qu.: 59.91
## Median :50222 Median :48.00 Median :170.0 Median : 69.97
## Mean :50110 Mean :48.52 Mean :170.0 Mean : 70.06
## 3rd Qu.:75078 3rd Qu.:64.00 3rd Qu.:176.8 3rd Qu.: 79.95
## Max. :99999 Max. :79.00 Max. :210.0 Max. :133.80
## Body_Mass_Index bmi_estimated bmi_scaled bmi_corrected
## Min. :10.13 Min. :10.13 Min. : 30.40 Min. :10.13
## 1st Qu.:20.27 1st Qu.:20.27 1st Qu.: 60.81 1st Qu.:20.27
## Median :24.19 Median :24.19 Median : 72.56 Median :24.17
## Mean :24.49 Mean :24.49 Mean : 73.47 Mean :24.49
## 3rd Qu.:28.22 3rd Qu.:28.22 3rd Qu.: 84.65 3rd Qu.:28.22
## Max. :59.23 Max. :59.23 Max. :177.70 Max. :59.14
## waist_size blood_pressure heart_rate cholesterol
## Min. : 34.09 Min. : 59.13 Min. : 36.89 Min. : 71.96
## 1st Qu.: 76.69 1st Qu.:109.86 1st Qu.: 68.22 1st Qu.:169.58
## Median : 84.91 Median :119.96 Median : 75.04 Median :189.81
## Mean : 84.90 Mean :120.02 Mean : 74.95 Mean :189.93
## 3rd Qu.: 93.03 3rd Qu.:130.11 3rd Qu.: 81.68 3rd Qu.:210.21
## Max. :133.15 Max. :180.72 Max. :112.79 Max. :314.93
## glucose insulin Hours_of_Sleep work_hours
## Min. : 21.91 Min. :-6.794 Min. : 3.000 Min. : 0.000
## 1st Qu.: 86.41 1st Qu.:11.651 1st Qu.: 5.981 1st Qu.: 6.663
## Median : 99.88 Median :15.014 Median : 7.009 Median : 8.009
## Mean : 99.97 Mean :15.009 Mean : 7.006 Mean : 8.004
## 3rd Qu.:113.51 3rd Qu.:18.406 3rd Qu.: 8.028 3rd Qu.: 9.348
## Max. :183.88 Max. :34.418 Max. :12.000 Max. :16.000
## physical_activity daily_steps Daily_Calories sugar_intake
## Min. : 0.000 Min. : 1000 Min. : 527.2 Min. :-25.13
## 1st Qu.: 1.632 1st Qu.: 5321 1st Qu.:1931.3 1st Qu.: 46.44
## Median : 2.977 Median : 7014 Median :2202.9 Median : 60.09
## Mean : 3.043 Mean : 7019 Mean :2201.1 Mean : 59.96
## 3rd Qu.: 4.335 3rd Qu.: 8719 3rd Qu.:2472.8 3rd Qu.: 73.30
## Max. :11.306 Max. :18065 Max. :3825.0 Max. :135.85
## water_intake screen_time stress_level mental_health_score
## Min. :0.500 Min. : 0.000 Min. : 0.000 Min. : 0
## 1st Qu.:1.526 1st Qu.: 3.957 1st Qu.: 2.000 1st Qu.: 2
## Median :2.003 Median : 5.994 Median : 5.000 Median : 5
## Mean :2.009 Mean : 6.012 Mean : 5.006 Mean : 5
## 3rd Qu.:2.482 3rd Qu.: 8.005 3rd Qu.: 8.000 3rd Qu.: 8
## Max. :5.000 Max. :16.000 Max. :10.000 Max. :10
## income meals_per_day electrolyte_level gene_marker_flag
## Min. : 500 Min. :1.000 Min. :0 Min. :1
## 1st Qu.: 2661 1st Qu.:2.000 1st Qu.:0 1st Qu.:1
## Median : 3998 Median :3.000 Median :0 Median :1
## Mean : 4039 Mean :3.004 Mean :0 Mean :1
## 3rd Qu.: 5357 3rd Qu.:4.000 3rd Qu.:0 3rd Qu.:1
## Max. :11865 Max. :5.000 Max. :0 Max. :1
## environmental_risk_score daily_supplement_dosage weight_to_height_ratio
## Min. :5.5 Min. :-9.99873 Min. :0.2013
## 1st Qu.:5.5 1st Qu.:-5.01646 1st Qu.:0.3503
## Median :5.5 Median :-0.02642 Median :0.4119
## Mean :5.5 Mean :-0.01843 Mean :0.4135
## 3rd Qu.:5.5 3rd Qu.: 4.97760 3rd Qu.:0.4729
## Max. :5.5 Max. : 9.99954 Max. :0.8364
## adjusted_bmi
## Min. :11.15
## 1st Qu.:22.30
## Median :26.61
## Mean :26.94
## 3rd Qu.:31.04
## Max. :65.16
# Most frequent value (statistical mode) of a vector.
#
# Ties are resolved in favour of the value that occurs first in `v`.
#
# Args:
#   v:     an atomic vector.
#   na.rm: if TRUE, missing values are removed before counting. The default
#          (FALSE) counts NA like any other value, so NA can be returned as
#          the mode when it is the most frequent entry.
#
# Returns:
#   A length-one vector of the same type as `v`. When there is nothing to
#   count (empty input, or only NAs with na.rm = TRUE) the result is an NA of
#   that type, so callers such as sapply() or data.frame() still receive one
#   value per input.
get_mode <- function(v, na.rm = FALSE) {
  if (na.rm) {
    v <- v[!is.na(v)]
  }
  if (length(v) == 0L) {
    # Indexing with NA yields a single NA that keeps the type of `v`.
    return(v[NA_integer_])
  }
  uniq_vals <- unique(v)
  # match() maps each element to its position in uniq_vals; tabulate() counts
  # those positions; which.max() picks the first position with the top count.
  uniq_vals[which.max(tabulate(match(v, uniq_vals)))]
}
# Per-column descriptive statistics for the numeric variables.
# vapply() enforces exactly one number per column, so a column that yields
# anything else fails here with a clear error instead of silently turning the
# result into a list.
modes <- vapply(numeric_data, get_mode, numeric(1))
ranges <- vapply(
  numeric_data,
  function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
  numeric(1)
)
means <- vapply(numeric_data, mean, numeric(1), na.rm = TRUE)
medians <- vapply(numeric_data, median, numeric(1), na.rm = TRUE)

# One row per variable, one column per statistic.
summary_table <- data.frame(
  Mean = means,
  Median = medians,
  Mode = modes,
  Range = ranges
)
round(summary_table, 2)
# Pearson correlation between age and BMI on complete observations.
with(
  df_sorted,
  cor(age, Body_Mass_Index, method = "pearson", use = "complete.obs")
)
## [1] 0.008330876
# Pearson correlation matrix of all numeric columns, rounded to 2 decimals.
# Columns with zero standard deviation have no defined correlation: they
# trigger the warning below and show up as NA rows/columns in the matrix.
round(cor(numeric_data, method = "pearson", use = "complete.obs"), digits = 2)
## Warning in cor(numeric_data, method = "pearson", use = "complete.obs"): the
## standard deviation is zero
## survey_code age height weight Body_Mass_Index
## survey_code 1.00 0.00 0.00 0.00 0.00
## age 0.00 1.00 -0.01 0.00 0.01
## height 0.00 -0.01 1.00 0.00 -0.49
## weight 0.00 0.00 0.00 1.00 0.86
## Body_Mass_Index 0.00 0.01 -0.49 0.86 1.00
## bmi_estimated 0.00 0.01 -0.49 0.86 1.00
## bmi_scaled 0.00 0.01 -0.49 0.86 1.00
## bmi_corrected 0.00 0.01 -0.49 0.86 1.00
## waist_size -0.01 0.00 0.00 0.01 0.00
## blood_pressure 0.01 0.00 0.01 0.00 0.00
## heart_rate 0.00 -0.01 0.00 0.00 0.00
## cholesterol 0.00 0.01 0.00 0.00 0.00
## glucose 0.00 0.00 -0.01 0.00 0.01
## insulin 0.00 0.00 0.00 0.00 0.00
## Hours_of_Sleep 0.00 0.00 -0.01 0.00 0.01
## work_hours 0.01 0.01 0.00 0.00 0.00
## physical_activity 0.00 -0.01 0.00 0.00 0.00
## daily_steps 0.00 0.00 0.00 0.00 0.00
## Daily_Calories -0.01 -0.01 0.00 0.00 0.00
## sugar_intake -0.01 0.00 -0.01 0.01 0.01
## water_intake 0.00 0.00 0.00 0.01 0.01
## screen_time 0.00 -0.01 0.00 0.00 0.00
## stress_level 0.00 0.00 -0.01 0.00 0.00
## mental_health_score -0.01 0.00 0.00 -0.01 -0.01
## income -0.01 0.00 -0.01 0.00 0.00
## meals_per_day 0.00 0.00 0.00 0.00 0.00
## electrolyte_level NA NA NA NA NA
## gene_marker_flag NA NA NA NA NA
## environmental_risk_score NA NA NA NA NA
## daily_supplement_dosage 0.01 0.00 0.00 0.00 0.00
## weight_to_height_ratio 0.00 0.01 -0.27 0.96 0.97
## adjusted_bmi 0.00 0.01 -0.49 0.86 1.00
## bmi_estimated bmi_scaled bmi_corrected waist_size
## survey_code 0.00 0.00 0.00 -0.01
## age 0.01 0.01 0.01 0.00
## height -0.49 -0.49 -0.49 0.00
## weight 0.86 0.86 0.86 0.01
## Body_Mass_Index 1.00 1.00 1.00 0.00
## bmi_estimated 1.00 1.00 1.00 0.00
## bmi_scaled 1.00 1.00 1.00 0.00
## bmi_corrected 1.00 1.00 1.00 0.00
## waist_size 0.00 0.00 0.00 1.00
## blood_pressure 0.00 0.00 0.00 0.01
## heart_rate 0.00 0.00 0.00 0.00
## cholesterol 0.00 0.00 0.00 0.00
## glucose 0.01 0.01 0.01 0.00
## insulin 0.00 0.00 0.00 -0.01
## Hours_of_Sleep 0.01 0.01 0.01 0.00
## work_hours 0.00 0.00 0.00 0.01
## physical_activity 0.00 0.00 0.00 0.00
## daily_steps 0.00 0.00 0.00 0.00
## Daily_Calories 0.00 0.00 0.00 0.00
## sugar_intake 0.01 0.01 0.01 0.00
## water_intake 0.01 0.01 0.01 0.00
## screen_time 0.00 0.00 0.00 0.00
## stress_level 0.00 0.00 0.00 0.00
## mental_health_score -0.01 -0.01 -0.01 -0.01
## income 0.00 0.00 0.00 0.01
## meals_per_day 0.00 0.00 0.00 0.00
## electrolyte_level NA NA NA NA
## gene_marker_flag NA NA NA NA
## environmental_risk_score NA NA NA NA
## daily_supplement_dosage 0.00 0.00 0.00 -0.01
## weight_to_height_ratio 0.97 0.97 0.97 0.01
## adjusted_bmi 1.00 1.00 1.00 0.00
## blood_pressure heart_rate cholesterol glucose insulin
## survey_code 0.01 0.00 0.00 0.00 0.00
## age 0.00 -0.01 0.01 0.00 0.00
## height 0.01 0.00 0.00 -0.01 0.00
## weight 0.00 0.00 0.00 0.00 0.00
## Body_Mass_Index 0.00 0.00 0.00 0.01 0.00
## bmi_estimated 0.00 0.00 0.00 0.01 0.00
## bmi_scaled 0.00 0.00 0.00 0.01 0.00
## bmi_corrected 0.00 0.00 0.00 0.01 0.00
## waist_size 0.01 0.00 0.00 0.00 -0.01
## blood_pressure 1.00 0.00 0.00 0.00 0.01
## heart_rate 0.00 1.00 0.00 0.00 0.00
## cholesterol 0.00 0.00 1.00 -0.01 0.00
## glucose 0.00 0.00 -0.01 1.00 0.00
## insulin 0.01 0.00 0.00 0.00 1.00
## Hours_of_Sleep 0.00 0.00 -0.01 -0.01 0.00
## work_hours 0.01 0.00 0.00 0.00 0.00
## physical_activity 0.00 0.00 -0.01 0.00 0.00
## daily_steps -0.01 -0.01 0.00 0.00 0.00
## Daily_Calories 0.00 0.01 0.00 -0.01 0.01
## sugar_intake 0.00 0.00 0.00 0.00 0.00
## water_intake 0.00 0.00 0.00 0.00 0.00
## screen_time 0.01 0.00 0.00 0.00 0.00
## stress_level 0.00 0.00 0.00 0.00 0.00
## mental_health_score 0.00 0.00 0.01 0.00 0.00
## income 0.00 0.00 0.01 0.00 0.00
## meals_per_day 0.00 0.00 0.00 0.00 0.00
## electrolyte_level NA NA NA NA NA
## gene_marker_flag NA NA NA NA NA
## environmental_risk_score NA NA NA NA NA
## daily_supplement_dosage 0.01 0.00 -0.01 0.00 0.01
## weight_to_height_ratio 0.00 0.00 0.00 0.01 0.00
## adjusted_bmi 0.00 0.00 0.00 0.01 0.00
## Hours_of_Sleep work_hours physical_activity
## survey_code 0.00 0.01 0.00
## age 0.00 0.01 -0.01
## height -0.01 0.00 0.00
## weight 0.00 0.00 0.00
## Body_Mass_Index 0.01 0.00 0.00
## bmi_estimated 0.01 0.00 0.00
## bmi_scaled 0.01 0.00 0.00
## bmi_corrected 0.01 0.00 0.00
## waist_size 0.00 0.01 0.00
## blood_pressure 0.00 0.01 0.00
## heart_rate 0.00 0.00 0.00
## cholesterol -0.01 0.00 -0.01
## glucose -0.01 0.00 0.00
## insulin 0.00 0.00 0.00
## Hours_of_Sleep 1.00 0.00 0.00
## work_hours 0.00 1.00 0.01
## physical_activity 0.00 0.01 1.00
## daily_steps 0.00 0.00 0.00
## Daily_Calories 0.00 0.00 0.00
## sugar_intake 0.00 0.01 0.00
## water_intake 0.00 0.00 0.00
## screen_time 0.01 0.00 0.01
## stress_level 0.00 0.00 0.00
## mental_health_score 0.00 -0.01 -0.01
## income 0.00 0.00 0.00
## meals_per_day 0.00 -0.01 0.00
## electrolyte_level NA NA NA
## gene_marker_flag NA NA NA
## environmental_risk_score NA NA NA
## daily_supplement_dosage 0.00 0.00 -0.01
## weight_to_height_ratio 0.00 0.00 0.00
## adjusted_bmi 0.01 0.00 0.00
## daily_steps Daily_Calories sugar_intake water_intake
## survey_code 0.00 -0.01 -0.01 0.00
## age 0.00 -0.01 0.00 0.00
## height 0.00 0.00 -0.01 0.00
## weight 0.00 0.00 0.01 0.01
## Body_Mass_Index 0.00 0.00 0.01 0.01
## bmi_estimated 0.00 0.00 0.01 0.01
## bmi_scaled 0.00 0.00 0.01 0.01
## bmi_corrected 0.00 0.00 0.01 0.01
## waist_size 0.00 0.00 0.00 0.00
## blood_pressure -0.01 0.00 0.00 0.00
## heart_rate -0.01 0.01 0.00 0.00
## cholesterol 0.00 0.00 0.00 0.00
## glucose 0.00 -0.01 0.00 0.00
## insulin 0.00 0.01 0.00 0.00
## Hours_of_Sleep 0.00 0.00 0.00 0.00
## work_hours 0.00 0.00 0.01 0.00
## physical_activity 0.00 0.00 0.00 0.00
## daily_steps 1.00 0.00 0.00 0.01
## Daily_Calories 0.00 1.00 0.00 -0.02
## sugar_intake 0.00 0.00 1.00 -0.01
## water_intake 0.01 -0.02 -0.01 1.00
## screen_time 0.00 0.00 0.01 0.01
## stress_level 0.01 -0.01 0.00 0.01
## mental_health_score 0.01 0.00 0.00 0.00
## income 0.00 0.00 0.00 0.00
## meals_per_day 0.00 -0.01 0.00 0.01
## electrolyte_level NA NA NA NA
## gene_marker_flag NA NA NA NA
## environmental_risk_score NA NA NA NA
## daily_supplement_dosage 0.00 0.00 0.01 0.00
## weight_to_height_ratio 0.00 0.00 0.01 0.01
## adjusted_bmi 0.00 0.00 0.01 0.01
## screen_time stress_level mental_health_score income
## survey_code 0.00 0.00 -0.01 -0.01
## age -0.01 0.00 0.00 0.00
## height 0.00 -0.01 0.00 -0.01
## weight 0.00 0.00 -0.01 0.00
## Body_Mass_Index 0.00 0.00 -0.01 0.00
## bmi_estimated 0.00 0.00 -0.01 0.00
## bmi_scaled 0.00 0.00 -0.01 0.00
## bmi_corrected 0.00 0.00 -0.01 0.00
## waist_size 0.00 0.00 -0.01 0.01
## blood_pressure 0.01 0.00 0.00 0.00
## heart_rate 0.00 0.00 0.00 0.00
## cholesterol 0.00 0.00 0.01 0.01
## glucose 0.00 0.00 0.00 0.00
## insulin 0.00 0.00 0.00 0.00
## Hours_of_Sleep 0.01 0.00 0.00 0.00
## work_hours 0.00 0.00 -0.01 0.00
## physical_activity 0.01 0.00 -0.01 0.00
## daily_steps 0.00 0.01 0.01 0.00
## Daily_Calories 0.00 -0.01 0.00 0.00
## sugar_intake 0.01 0.00 0.00 0.00
## water_intake 0.01 0.01 0.00 0.00
## screen_time 1.00 0.01 0.00 0.00
## stress_level 0.01 1.00 -0.01 0.00
## mental_health_score 0.00 -0.01 1.00 0.00
## income 0.00 0.00 0.00 1.00
## meals_per_day 0.00 0.00 0.00 0.01
## electrolyte_level NA NA NA NA
## gene_marker_flag NA NA NA NA
## environmental_risk_score NA NA NA NA
## daily_supplement_dosage 0.00 -0.01 0.00 -0.01
## weight_to_height_ratio 0.00 0.00 -0.01 0.00
## adjusted_bmi 0.00 0.00 -0.01 0.00
## meals_per_day electrolyte_level gene_marker_flag
## survey_code 0.00 NA NA
## age 0.00 NA NA
## height 0.00 NA NA
## weight 0.00 NA NA
## Body_Mass_Index 0.00 NA NA
## bmi_estimated 0.00 NA NA
## bmi_scaled 0.00 NA NA
## bmi_corrected 0.00 NA NA
## waist_size 0.00 NA NA
## blood_pressure 0.00 NA NA
## heart_rate 0.00 NA NA
## cholesterol 0.00 NA NA
## glucose 0.00 NA NA
## insulin 0.00 NA NA
## Hours_of_Sleep 0.00 NA NA
## work_hours -0.01 NA NA
## physical_activity 0.00 NA NA
## daily_steps 0.00 NA NA
## Daily_Calories -0.01 NA NA
## sugar_intake 0.00 NA NA
## water_intake 0.01 NA NA
## screen_time 0.00 NA NA
## stress_level 0.00 NA NA
## mental_health_score 0.00 NA NA
## income 0.01 NA NA
## meals_per_day 1.00 NA NA
## electrolyte_level NA 1 NA
## gene_marker_flag NA NA 1
## environmental_risk_score NA NA NA
## daily_supplement_dosage 0.01 NA NA
## weight_to_height_ratio 0.00 NA NA
## adjusted_bmi 0.00 NA NA
## environmental_risk_score daily_supplement_dosage
## survey_code NA 0.01
## age NA 0.00
## height NA 0.00
## weight NA 0.00
## Body_Mass_Index NA 0.00
## bmi_estimated NA 0.00
## bmi_scaled NA 0.00
## bmi_corrected NA 0.00
## waist_size NA -0.01
## blood_pressure NA 0.01
## heart_rate NA 0.00
## cholesterol NA -0.01
## glucose NA 0.00
## insulin NA 0.01
## Hours_of_Sleep NA 0.00
## work_hours NA 0.00
## physical_activity NA -0.01
## daily_steps NA 0.00
## Daily_Calories NA 0.00
## sugar_intake NA 0.01
## water_intake NA 0.00
## screen_time NA 0.00
## stress_level NA -0.01
## mental_health_score NA 0.00
## income NA -0.01
## meals_per_day NA 0.01
## electrolyte_level NA NA
## gene_marker_flag NA NA
## environmental_risk_score 1 NA
## daily_supplement_dosage NA 1.00
## weight_to_height_ratio NA 0.00
## adjusted_bmi NA 0.00
## weight_to_height_ratio adjusted_bmi
## survey_code 0.00 0.00
## age 0.01 0.01
## height -0.27 -0.49
## weight 0.96 0.86
## Body_Mass_Index 0.97 1.00
## bmi_estimated 0.97 1.00
## bmi_scaled 0.97 1.00
## bmi_corrected 0.97 1.00
## waist_size 0.01 0.00
## blood_pressure 0.00 0.00
## heart_rate 0.00 0.00
## cholesterol 0.00 0.00
## glucose 0.01 0.01
## insulin 0.00 0.00
## Hours_of_Sleep 0.00 0.01
## work_hours 0.00 0.00
## physical_activity 0.00 0.00
## daily_steps 0.00 0.00
## Daily_Calories 0.00 0.00
## sugar_intake 0.01 0.01
## water_intake 0.01 0.01
## screen_time 0.00 0.00
## stress_level 0.00 0.00
## mental_health_score -0.01 -0.01
## income 0.00 0.00
## meals_per_day 0.00 0.00
## electrolyte_level NA NA
## gene_marker_flag NA NA
## environmental_risk_score NA NA
## daily_supplement_dosage 0.00 0.00
## weight_to_height_ratio 1.00 0.97
## adjusted_bmi 0.97 1.00
# Fix the RNG seed so the random sample is reproducible.
set.seed(seed = 123)

# Draw a random 50% of the rows as the training set.
training_set <- sample_frac(df_sorted, size = 0.50)
head(training_set, n = 5)
# Scatter plot of weight against BMI, drawn from the raw `df`
# (not the cleaned/sorted data).
ggplot(data = df, mapping = aes(x = weight, y = bmi)) +
  geom_point(color = "orange", shape = 4) +
  labs(
    title = "Scatter Plot of Weight vs BMI",
    x = "Weight (Kg)",
    y = "BMI"
  ) +
  theme_minimal()
# Number of rows per smoking level / gender combination, taken from the raw
# `df` (not the cleaned data); the result is ungrouped.
smoking_gender_count <- summarise(
  group_by(df, smoking_level, gender),
  count = n(),
  .groups = 'drop'
)

# Side-by-side bars per gender within each smoking level, with the count
# printed just above each bar.
ggplot(
  data = smoking_gender_count,
  mapping = aes(x = smoking_level, y = count, fill = gender)
) +
  geom_col(position = position_dodge()) +
  geom_text(
    mapping = aes(label = count),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    size = 3.5
  ) +
  labs(
    title = "Count by Smoking Level and Gender",
    x = "Smoking Level",
    y = "Count"
  ) +
  scale_fill_manual(values = c("steelblue", "orange")) +
  theme_minimal()