library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Point the R session at the folder that holds the workbook.
# NOTE(review): this is a hard-coded, machine-specific absolute path, so the
# script only runs unchanged where this folder exists. Consider reading the
# file through file.path() instead of changing the working directory.
setwd("C:/Users/NIKA/Downloads")
# Load the Excel file; the name is resolved relative to the working directory
# set on the previous line.
health_lifestyle_classification <- read_excel("health_lifestyle_classification_Few.xlsx")
# 1. Print the structure of your dataset
# Show every column with its type and its first few values.
str(health_lifestyle_classification)
## tibble [569 × 48] (S3: tbl_df/tbl/data.frame)
## $ survey_code : num [1:569] 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num [1:569] 56 69 46 32 60 25 78 38 56 75 ...
## $ gender : chr [1:569] "Male" "Female" "Male" "Female" ...
## $ height : num [1:569] 173 163 177 172 164 ...
## $ weight : num [1:569] 56.9 97.8 80.7 63.1 40 ...
## $ bmi : num [1:569] 18.9 36.7 25.7 21.3 14.9 ...
## $ bmi_estimated : num [1:569] 18.9 36.7 25.7 21.3 14.9 ...
## $ bmi_scaled : num [1:569] 56.7 110.1 77 64 44.8 ...
## $ bmi_corrected : num [1:569] 19 36.5 25.6 21.2 14.8 ...
## $ waist_size : num [1:569] 72.2 85.6 90.3 100.5 69 ...
## $ blood_pressure : num [1:569] 118 118 123 148 151 ...
## $ heart_rate : num [1:569] 60.7 66.5 76 68.8 92.3 ...
## $ cholesterol : num [1:569] 215 116 138 203 200 ...
## $ glucose : num [1:569] 103 116.9 89.2 128.4 94.8 ...
## $ insulin : num [1:569] NA 10.1 NA 18.7 16 ...
## $ sleep_hours : num [1:569] 6.48 8.43 5.7 5.19 7.91 ...
## $ sleep_quality : chr [1:569] "Fair" "Good" "Poor" "Good" ...
## $ work_hours : num [1:569] 7.67 9.52 5.83 9.49 7.28 ...
## $ physical_activity : num [1:569] 0.357 0.568 3.764 0.889 2.902 ...
## $ daily_steps : num [1:569] 13321 11911 2974 5322 9791 ...
## $ calorie_intake : num [1:569] 2674 2650 1747 2034 2386 ...
## $ sugar_intake : num [1:569] 44.5 74.7 19.7 82.6 46 ...
## $ alcohol_consumption : chr [1:569] NA "Regularly" "Regularly" "Occasionally" ...
## $ smoking_level : chr [1:569] "Non-smoker" "Light" "Heavy" "Heavy" ...
## $ water_intake : num [1:569] 1.694 0.716 2.488 2.643 1.968 ...
## $ screen_time : num [1:569] 5 5.93 4.37 4.12 3.18 ...
## $ stress_level : num [1:569] 2 3 0 10 9 7 7 7 2 10 ...
## $ mental_health_score : num [1:569] 8 9 1 4 7 6 1 2 9 9 ...
## $ mental_health_support : chr [1:569] "No" "No" "No" "No" ...
## $ education_level : chr [1:569] "PhD" "High School" "Master" "Master" ...
## $ job_type : chr [1:569] "Tech" "Office" "Office" "Labor" ...
## $ occupation : chr [1:569] "Farmer" "Engineer" "Teacher" "Teacher" ...
## $ income : num [1:569] 6760 6241 3429 2619 3662 ...
## $ diet_type : chr [1:569] "Vegan" "Vegan" "Vegan" "Vegetarian" ...
## $ exercise_type : chr [1:569] "Strength" "Cardio" "Cardio" "Mixed" ...
## $ device_usage : chr [1:569] "High" "Moderate" "High" "Low" ...
## $ healthcare_access : chr [1:569] "Poor" "Moderate" "Good" "Moderate" ...
## $ insurance : chr [1:569] "No" "No" "Yes" "No" ...
## $ sunlight_exposure : chr [1:569] "High" "High" "High" "High" ...
## $ meals_per_day : num [1:569] 5 5 4 1 1 4 2 3 2 1 ...
## $ caffeine_intake : chr [1:569] "Moderate" "High" "Moderate" "None" ...
## $ family_history : chr [1:569] "No" "Yes" "No" "No" ...
## $ pet_owner : chr [1:569] "Yes" "No" "No" "Yes" ...
## $ electrolyte_level : num [1:569] 0 0 0 0 0 0 0 0 0 0 ...
## $ gene_marker_flag : num [1:569] 1 1 1 1 1 1 NA 1 NA 1 ...
## $ environmental_risk_score: num [1:569] 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
## $ daily_supplement_dosage : num [1:569] -2.276 6.239 5.424 8.389 0.333 ...
## $ target : chr [1:569] "healthy" "healthy" "healthy" "healthy" ...
# 2. List the variables in your dataset
# Column names of the imported table, in file order.
names(health_lifestyle_classification)
## [1] "survey_code" "age"
## [3] "gender" "height"
## [5] "weight" "bmi"
## [7] "bmi_estimated" "bmi_scaled"
## [9] "bmi_corrected" "waist_size"
## [11] "blood_pressure" "heart_rate"
## [13] "cholesterol" "glucose"
## [15] "insulin" "sleep_hours"
## [17] "sleep_quality" "work_hours"
## [19] "physical_activity" "daily_steps"
## [21] "calorie_intake" "sugar_intake"
## [23] "alcohol_consumption" "smoking_level"
## [25] "water_intake" "screen_time"
## [27] "stress_level" "mental_health_score"
## [29] "mental_health_support" "education_level"
## [31] "job_type" "occupation"
## [33] "income" "diet_type"
## [35] "exercise_type" "device_usage"
## [37] "healthcare_access" "insurance"
## [39] "sunlight_exposure" "meals_per_day"
## [41] "caffeine_intake" "family_history"
## [43] "pet_owner" "electrolyte_level"
## [45] "gene_marker_flag" "environmental_risk_score"
## [47] "daily_supplement_dosage" "target"
# 3. Print the top 15 rows of your dataset
# Preview the first 15 rows of the raw import.
head(health_lifestyle_classification, n = 15)
# 4. Remove missing values in your dataset
# Drop every row that has an NA in at least one column.
# NOTE(review): this is listwise deletion across all columns; the row counts
# printed further down show how much data it removes.
df_no_na <- health_lifestyle_classification %>%
  na.omit()
# 5. Identify and remove duplicated data in your dataset
# Keep the first occurrence of each fully identical row.
# NOTE(review): rows are compared across every column, survey_code included;
# if survey_code is a unique ID (TODO confirm) no row can ever be flagged.
df_cleaned <- unique(df_no_na)
# 5.1 Check the number of rows before and after cleaning
# Row count of the raw import.
cat(sprintf("Original rows: %d \n", nrow(health_lifestyle_classification)))
## Original rows: 569
# Row count once NA rows and duplicate rows are gone.
cat(sprintf("After removing NA and duplicates: %d \n", nrow(df_cleaned)))
## After removing NA and duplicates: 248
# Categorize water intake into four bands.
#
# Vectorized: accepts a single value or a whole numeric column, so it can be
# called directly on a data-frame column as well as through sapply()/vapply().
#
# @param water Numeric vector of water-intake values (may contain NA).
# @return Character vector the same length as `water`:
#   "Very Low" (< 1), "Low" (>= 1 and < 2), "Moderate" (>= 2 and < 3),
#   "High" (>= 3), and NA_character_ wherever `water` is NA.
categorize_water_intake <- function(water) {
  band_labels <- c("Very Low", "Low", "Moderate", "High")
  # findInterval() yields 0..3 for the half-open bands split at 1, 2 and 3
  # (NA stays NA); adding 1 turns that into an index into band_labels.
  band_labels[findInterval(water, c(1, 2, 3)) + 1L]
}
# Label every row of the cleaned data with its water-intake band.
# vapply() pins the result to one character value per row, so the new column
# is always a character vector (sapply() would hand back a list on zero rows).
# as.character() keeps a bare NA result compatible with that contract.
df_cleaned$Water_Intake_Category <- vapply(
  df_cleaned$water_intake,
  function(w) as.character(categorize_water_intake(w)),
  character(1)
)
# Check the first few rows
head(df_cleaned[, c("water_intake", "Water_Intake_Category")], 10)
# Keep only respondents whose category is one of the two lowest bands.
low_water_df <- dplyr::filter(
  df_cleaned,
  is.element(Water_Intake_Category, c("Very Low", "Low"))
)
# Report how many rows survived the filter.
cat(sprintf(
  "Number of individuals with low or very low water intake: %d \n",
  nrow(low_water_df)
))
## Number of individuals with low or very low water intake: 124
# Preview the key columns of the filtered rows.
head(
  low_water_df[, c("age", "gender", "water_intake", "Water_Intake_Category")],
  n = 10
)
# 8. Reorder multiple rows in descending order
# Oldest respondents first; ties on age are broken by highest BMI first.
df_sorted <- df_cleaned[with(df_cleaned, order(-age, -bmi)), ]
head(df_sorted, n = 15)
# 9. Rename some of the column names in your dataset
# Give three columns more descriptive names (new name = old name).
# any_of() silently skips an old name that is not present, so re-running this
# step after the rename is a no-op rather than an error.
df_sorted <- df_sorted %>%
  dplyr::rename(dplyr::any_of(c(
    Body_Mass_Index = "bmi",
    Hours_of_Sleep = "sleep_hours",
    Daily_Calories = "calorie_intake"
  )))
colnames(df_sorted)
## [1] "survey_code" "age"
## [3] "gender" "height"
## [5] "weight" "Body_Mass_Index"
## [7] "bmi_estimated" "bmi_scaled"
## [9] "bmi_corrected" "waist_size"
## [11] "blood_pressure" "heart_rate"
## [13] "cholesterol" "glucose"
## [15] "insulin" "Hours_of_Sleep"
## [17] "sleep_quality" "work_hours"
## [19] "physical_activity" "daily_steps"
## [21] "Daily_Calories" "sugar_intake"
## [23] "alcohol_consumption" "smoking_level"
## [25] "water_intake" "screen_time"
## [27] "stress_level" "mental_health_score"
## [29] "mental_health_support" "education_level"
## [31] "job_type" "occupation"
## [33] "income" "diet_type"
## [35] "exercise_type" "device_usage"
## [37] "healthcare_access" "insurance"
## [39] "sunlight_exposure" "meals_per_day"
## [41] "caffeine_intake" "family_history"
## [43] "pet_owner" "electrolyte_level"
## [45] "gene_marker_flag" "environmental_risk_score"
## [47] "daily_supplement_dosage" "target"
## [49] "Water_Intake_Category"
# 10. Add new variables to your data frame using a mathematical function
# Derived column: weight divided by height, row by row.
df_sorted$weight_to_height_ratio <- with(df_sorted, weight / height)
# Derived column: BMI scaled by a fixed factor of 1.1.
df_sorted$adjusted_bmi <- with(df_sorted, Body_Mass_Index * 1.1)
head(df_sorted, n = 15)
library(dplyr)
# Dependent variable: the `target` column pulled out as a plain vector.
dv <- df_sorted[["target"]]
head(dv)
## [1] "healthy" "healthy" "healthy" "healthy" "healthy" "diseased"
# 11.2 Identify independent variables according to group metrics
# Split the independent variables into thematic groups, one table per theme.
demographic_iv <- df_sorted %>%
  select(age, gender, education_level, job_type, occupation, income)
health_metrics_iv <- df_sorted %>%
  select(
    height, weight, Body_Mass_Index, blood_pressure, heart_rate,
    cholesterol, glucose, insulin, mental_health_score
  )
lifestyle_iv <- df_sorted %>%
  select(
    Hours_of_Sleep, sleep_quality, work_hours, physical_activity,
    daily_steps, exercise_type, screen_time
  )
dietary_iv <- df_sorted %>%
  select(
    Daily_Calories, sugar_intake, water_intake, diet_type, meals_per_day,
    caffeine_intake, daily_supplement_dosage
  )
risk_iv <- df_sorted %>%
  select(
    alcohol_consumption, smoking_level, stress_level, family_history,
    gene_marker_flag, environmental_risk_score
  )
access_support <- df_sorted %>%
  select(
    mental_health_support, healthcare_access, insurance,
    sunlight_exposure, pet_owner
  )
# Preview the first three rows of every group.
head(demographic_iv, 3)
head(health_metrics_iv, 3)
head(lifestyle_iv, 3)
head(dietary_iv, 3)
head(risk_iv, 3)
head(access_support, 3)
# Put the dependent variable next to the dietary predictors; dv and
# dietary_iv both come from df_sorted, so their rows line up.
dietary_with_target <- cbind(dv, dietary_iv)
# The first column is the one contributed by dv; give it a display name.
names(dietary_with_target)[1] <- "Health Status"
head(dietary_with_target, 5)
# 13.1 Select only numeric columns
library(ggplot2)
# Keep only the numeric columns. vapply() guarantees exactly one logical per
# column, so the column index is always a logical vector.
numeric_data <- df_sorted[vapply(df_sorted, is.numeric, logical(1))]
summary(numeric_data)
## survey_code age height weight
## Min. : 2.0 Min. :18.00 Min. :140.0 Min. : 40.00
## 1st Qu.:133.8 1st Qu.:32.00 1st Qu.:162.1 1st Qu.: 61.14
## Median :289.5 Median :48.00 Median :169.2 Median : 70.01
## Mean :285.7 Mean :47.97 Mean :168.6 Mean : 69.99
## 3rd Qu.:433.2 3rd Qu.:63.00 3rd Qu.:175.2 3rd Qu.: 79.36
## Max. :569.0 Max. :79.00 Max. :202.3 Max. :109.58
## Body_Mass_Index bmi_estimated bmi_scaled bmi_corrected
## Min. :11.66 Min. :11.66 Min. : 34.98 Min. :11.63
## 1st Qu.:20.88 1st Qu.:20.88 1st Qu.: 62.65 1st Qu.:20.76
## Median :24.22 Median :24.22 Median : 72.65 Median :24.33
## Mean :24.91 Mean :24.91 Mean : 74.73 Mean :24.90
## 3rd Qu.:28.46 3rd Qu.:28.46 3rd Qu.: 85.37 3rd Qu.:28.37
## Max. :39.52 Max. :39.52 Max. :118.55 Max. :39.56
## waist_size blood_pressure heart_rate cholesterol
## Min. : 58.35 Min. : 80.56 Min. : 49.69 Min. :115.6
## 1st Qu.: 76.64 1st Qu.:109.44 1st Qu.: 68.42 1st Qu.:168.3
## Median : 85.14 Median :118.66 Median : 74.45 Median :186.2
## Mean : 85.67 Mean :119.59 Mean : 74.71 Mean :188.0
## 3rd Qu.: 94.04 3rd Qu.:129.73 3rd Qu.: 80.49 3rd Qu.:207.4
## Max. :122.32 Max. :160.46 Max. :103.33 Max. :279.9
## glucose insulin Hours_of_Sleep work_hours
## Min. : 42.72 Min. : 1.756 Min. : 3.000 Min. : 2.865
## 1st Qu.: 85.73 1st Qu.:10.818 1st Qu.: 6.230 1st Qu.: 6.734
## Median : 98.52 Median :14.003 Median : 7.215 Median : 7.963
## Mean : 99.77 Mean :14.228 Mean : 7.160 Mean : 7.956
## 3rd Qu.:114.57 3rd Qu.:18.108 3rd Qu.: 8.180 3rd Qu.: 9.198
## Max. :155.76 Max. :28.696 Max. :10.699 Max. :12.872
## physical_activity daily_steps Daily_Calories sugar_intake
## Min. :0.000 Min. : 1000 Min. : 868.3 Min. : -5.60
## 1st Qu.:1.754 1st Qu.: 5252 1st Qu.:1947.2 1st Qu.: 49.51
## Median :2.998 Median : 6958 Median :2264.5 Median : 63.62
## Mean :3.049 Mean : 6840 Mean :2233.1 Mean : 62.22
## 3rd Qu.:4.251 3rd Qu.: 8364 3rd Qu.:2490.0 3rd Qu.: 74.89
## Max. :9.158 Max. :12640 Max. :3283.6 Max. :108.70
## water_intake screen_time stress_level mental_health_score
## Min. :0.500 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:1.509 1st Qu.: 4.101 1st Qu.: 2.000 1st Qu.: 2.000
## Median :1.998 Median : 6.106 Median : 5.000 Median : 5.000
## Mean :2.041 Mean : 6.086 Mean : 4.847 Mean : 5.012
## 3rd Qu.:2.559 3rd Qu.: 8.235 3rd Qu.: 7.000 3rd Qu.: 8.000
## Max. :4.450 Max. :14.367 Max. :10.000 Max. :10.000
## income meals_per_day electrolyte_level gene_marker_flag
## Min. : 500 Min. :1.000 Min. :0 Min. :1
## 1st Qu.: 2543 1st Qu.:2.000 1st Qu.:0 1st Qu.:1
## Median : 4113 Median :3.000 Median :0 Median :1
## Mean : 4079 Mean :2.887 Mean :0 Mean :1
## 3rd Qu.: 5359 3rd Qu.:4.000 3rd Qu.:0 3rd Qu.:1
## Max. :10370 Max. :5.000 Max. :0 Max. :1
## environmental_risk_score daily_supplement_dosage weight_to_height_ratio
## Min. :5.5 Min. :-9.7786 Min. :0.2160
## 1st Qu.:5.5 1st Qu.:-5.0411 1st Qu.:0.3596
## Median :5.5 Median :-0.2995 Median :0.4135
## Mean :5.5 Mean :-0.1421 Mean :0.4168
## 3rd Qu.:5.5 3rd Qu.: 4.7380 3rd Qu.:0.4724
## Max. :5.5 Max. : 9.9986 Max. :0.6188
## adjusted_bmi
## Min. :12.83
## 1st Qu.:22.97
## Median :26.64
## Mean :27.40
## 3rd Qu.:31.30
## Max. :43.47
# Most frequent value of a vector.
#
# @param v A vector (any atomic type).
# @return The value of `v` that occurs most often; when several values tie,
#   the one that appears first in `v` wins.
get_mode <- function(v) {
  candidates <- unique(v)
  # Map every element to the position of its value among the candidates,
  # then count how often each position occurs.
  positions <- match(v, candidates)
  frequency <- tabulate(positions)
  candidates[which.max(frequency)]
}
# Per-column descriptive statistics for the numeric columns.
# vapply(..., numeric(1)) enforces one number per column, so each result is a
# named numeric vector that data.frame() can line up row by row.
# NOTE(review): for continuous columns where every value is distinct,
# get_mode() simply returns the first value in row order.
modes <- vapply(numeric_data, get_mode, numeric(1))
ranges <- vapply(
  numeric_data,
  function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
  numeric(1)
)
# Mean and Median
means <- vapply(numeric_data, mean, numeric(1), na.rm = TRUE)
medians <- vapply(numeric_data, median, numeric(1), na.rm = TRUE)
# Combine all into one table (one row per column of numeric_data)
summary_table <- data.frame(
  Mean = means,
  Median = medians,
  Mode = modes,
  Range = ranges
)
print(summary_table)
## Mean Median Mode Range
## survey_code 285.7096774 289.5000000 274.0000000 5.670000e+02
## age 47.9717742 48.0000000 79.0000000 6.100000e+01
## height 168.6359171 169.1994565 140.0000000 6.228482e+01
## weight 69.9921295 70.0080202 40.0000000 6.957590e+01
## Body_Mass_Index 24.9109833 24.2170333 37.9811770 2.785584e+01
## bmi_estimated 24.9109833 24.2170333 37.9811770 2.785584e+01
## bmi_scaled 74.7329499 72.6510998 113.9435309 8.356752e+01
## bmi_corrected 24.9017327 24.3307096 37.6263938 2.792979e+01
## waist_size 85.6696251 85.1404412 85.1187065 6.396912e+01
## blood_pressure 119.5904507 118.6617580 134.6035728 7.990143e+01
## heart_rate 74.7144658 74.4484888 70.8469406 5.363990e+01
## cholesterol 187.9645858 186.2278623 248.8885628 1.643311e+02
## glucose 99.7704221 98.5198903 60.4987097 1.130321e+02
## insulin 14.2282506 14.0033370 21.5470081 2.693972e+01
## Hours_of_Sleep 7.1604312 7.2146879 6.6722405 7.698735e+00
## work_hours 7.9558864 7.9634520 5.3054320 1.000748e+01
## physical_activity 3.0494869 2.9979726 0.0000000 9.157612e+00
## daily_steps 6840.4199491 6957.5963378 1000.0000000 1.163989e+04
## Daily_Calories 2233.0872868 2264.4589133 2485.0604576 2.415253e+03
## sugar_intake 62.2178626 63.6153974 55.1667093 1.142973e+02
## water_intake 2.0406867 1.9981441 3.1524530 3.950480e+00
## screen_time 6.0857620 6.1060002 0.0000000 1.436693e+01
## stress_level 4.8467742 5.0000000 3.0000000 1.000000e+01
## mental_health_score 5.0120968 5.0000000 2.0000000 1.000000e+01
## income 4078.5207842 4113.2535230 500.0000000 9.869897e+03
## meals_per_day 2.8870968 3.0000000 2.0000000 4.000000e+00
## electrolyte_level 0.0000000 0.0000000 0.0000000 0.000000e+00
## gene_marker_flag 1.0000000 1.0000000 1.0000000 0.000000e+00
## environmental_risk_score 5.5000000 5.5000000 5.5000000 0.000000e+00
## daily_supplement_dosage -0.1420907 -0.2995412 -1.4286092 1.977718e+01
## weight_to_height_ratio 0.4167587 0.4135018 0.6001306 4.028484e-01
## adjusted_bmi 27.4020816 26.6387366 41.7792947 3.064142e+01
# Same table, rounded to two decimals for readability.
summary_table_rounded <- summary_table %>%
  round(digits = 2)
print(summary_table_rounded)
## Mean Median Mode Range
## survey_code 285.71 289.50 274.00 567.00
## age 47.97 48.00 79.00 61.00
## height 168.64 169.20 140.00 62.28
## weight 69.99 70.01 40.00 69.58
## Body_Mass_Index 24.91 24.22 37.98 27.86
## bmi_estimated 24.91 24.22 37.98 27.86
## bmi_scaled 74.73 72.65 113.94 83.57
## bmi_corrected 24.90 24.33 37.63 27.93
## waist_size 85.67 85.14 85.12 63.97
## blood_pressure 119.59 118.66 134.60 79.90
## heart_rate 74.71 74.45 70.85 53.64
## cholesterol 187.96 186.23 248.89 164.33
## glucose 99.77 98.52 60.50 113.03
## insulin 14.23 14.00 21.55 26.94
## Hours_of_Sleep 7.16 7.21 6.67 7.70
## work_hours 7.96 7.96 5.31 10.01
## physical_activity 3.05 3.00 0.00 9.16
## daily_steps 6840.42 6957.60 1000.00 11639.89
## Daily_Calories 2233.09 2264.46 2485.06 2415.25
## sugar_intake 62.22 63.62 55.17 114.30
## water_intake 2.04 2.00 3.15 3.95
## screen_time 6.09 6.11 0.00 14.37
## stress_level 4.85 5.00 3.00 10.00
## mental_health_score 5.01 5.00 2.00 10.00
## income 4078.52 4113.25 500.00 9869.90
## meals_per_day 2.89 3.00 2.00 4.00
## electrolyte_level 0.00 0.00 0.00 0.00
## gene_marker_flag 1.00 1.00 1.00 0.00
## environmental_risk_score 5.50 5.50 5.50 0.00
## daily_supplement_dosage -0.14 -0.30 -1.43 19.78
## weight_to_height_ratio 0.42 0.41 0.60 0.40
## adjusted_bmi 27.40 26.64 41.78 30.64
# 15. Find the correlation between any 2 variables using Pearson correlation
# Pearson correlation between age and BMI, using rows where both are present.
correlation_age_bmi <- with(
  df_sorted,
  cor(age, Body_Mass_Index, method = "pearson", use = "complete.obs")
)
round(correlation_age_bmi, digits = 4)
## [1] -0.0381
# Pearson correlation between every pair of numeric columns.
# The warning recorded below comes from columns with zero standard deviation:
# the summary table reports a Range of 0 for electrolyte_level,
# gene_marker_flag and environmental_risk_score, and their rows and columns
# are NA in the printed matrix.
# NOTE(review): consider dropping those constant columns before calling cor().
correlation_matrix <- cor(numeric_data, method = "pearson", use = "complete.obs")
## Warning in cor(numeric_data, method = "pearson", use = "complete.obs"): the
## standard deviation is zero
# Print the matrix rounded to two decimals.
round(correlation_matrix, 2)
## survey_code age height weight Body_Mass_Index
## survey_code 1.00 0.03 0.00 -0.10 -0.08
## age 0.03 1.00 0.02 -0.04 -0.04
## height 0.00 0.02 1.00 -0.03 -0.55
## weight -0.10 -0.04 -0.03 1.00 0.84
## Body_Mass_Index -0.08 -0.04 -0.55 0.84 1.00
## bmi_estimated -0.08 -0.04 -0.55 0.84 1.00
## bmi_scaled -0.08 -0.04 -0.55 0.84 1.00
## bmi_corrected -0.09 -0.04 -0.56 0.84 1.00
## waist_size 0.00 0.00 0.14 0.09 0.00
## blood_pressure -0.10 0.06 0.03 0.03 0.02
## heart_rate -0.01 -0.13 0.01 -0.08 -0.08
## cholesterol 0.03 -0.06 0.08 -0.05 -0.08
## glucose -0.10 -0.03 -0.06 0.03 0.06
## insulin 0.00 -0.02 0.04 0.09 0.04
## Hours_of_Sleep 0.11 0.06 0.03 -0.04 -0.04
## work_hours 0.01 -0.15 -0.06 -0.08 -0.03
## physical_activity 0.05 -0.02 0.10 -0.09 -0.12
## daily_steps 0.03 0.13 -0.11 -0.01 0.06
## Daily_Calories 0.05 0.12 0.00 -0.06 -0.05
## sugar_intake 0.04 -0.08 -0.03 -0.06 -0.03
## water_intake -0.05 0.07 -0.08 -0.01 0.04
## screen_time 0.04 0.05 0.07 0.02 -0.01
## stress_level -0.08 0.01 -0.02 0.05 0.04
## mental_health_score -0.07 -0.05 -0.03 0.05 0.07
## income -0.07 0.11 0.05 0.07 0.03
## meals_per_day 0.03 -0.04 -0.04 0.00 0.04
## electrolyte_level NA NA NA NA NA
## gene_marker_flag NA NA NA NA NA
## environmental_risk_score NA NA NA NA NA
## daily_supplement_dosage -0.08 -0.10 -0.09 -0.07 0.00
## weight_to_height_ratio -0.10 -0.04 -0.33 0.95 0.97
## adjusted_bmi -0.08 -0.04 -0.55 0.84 1.00
## bmi_estimated bmi_scaled bmi_corrected waist_size
## survey_code -0.08 -0.08 -0.09 0.00
## age -0.04 -0.04 -0.04 0.00
## height -0.55 -0.55 -0.56 0.14
## weight 0.84 0.84 0.84 0.09
## Body_Mass_Index 1.00 1.00 1.00 0.00
## bmi_estimated 1.00 1.00 1.00 0.00
## bmi_scaled 1.00 1.00 1.00 0.00
## bmi_corrected 1.00 1.00 1.00 0.00
## waist_size 0.00 0.00 0.00 1.00
## blood_pressure 0.02 0.02 0.01 0.11
## heart_rate -0.08 -0.08 -0.08 -0.06
## cholesterol -0.08 -0.08 -0.08 -0.01
## glucose 0.06 0.06 0.07 -0.01
## insulin 0.04 0.04 0.04 0.01
## Hours_of_Sleep -0.04 -0.04 -0.03 0.08
## work_hours -0.03 -0.03 -0.03 0.03
## physical_activity -0.12 -0.12 -0.12 0.00
## daily_steps 0.06 0.06 0.06 -0.02
## Daily_Calories -0.05 -0.05 -0.05 0.00
## sugar_intake -0.03 -0.03 -0.03 -0.14
## water_intake 0.04 0.04 0.04 0.07
## screen_time -0.01 -0.01 -0.01 0.04
## stress_level 0.04 0.04 0.04 -0.04
## mental_health_score 0.07 0.07 0.07 0.09
## income 0.03 0.03 0.03 0.04
## meals_per_day 0.04 0.04 0.04 -0.01
## electrolyte_level NA NA NA NA
## gene_marker_flag NA NA NA NA
## environmental_risk_score NA NA NA NA
## daily_supplement_dosage 0.00 0.00 0.01 -0.07
## weight_to_height_ratio 0.97 0.97 0.96 0.04
## adjusted_bmi 1.00 1.00 1.00 0.00
## blood_pressure heart_rate cholesterol glucose insulin
## survey_code -0.10 -0.01 0.03 -0.10 0.00
## age 0.06 -0.13 -0.06 -0.03 -0.02
## height 0.03 0.01 0.08 -0.06 0.04
## weight 0.03 -0.08 -0.05 0.03 0.09
## Body_Mass_Index 0.02 -0.08 -0.08 0.06 0.04
## bmi_estimated 0.02 -0.08 -0.08 0.06 0.04
## bmi_scaled 0.02 -0.08 -0.08 0.06 0.04
## bmi_corrected 0.01 -0.08 -0.08 0.07 0.04
## waist_size 0.11 -0.06 -0.01 -0.01 0.01
## blood_pressure 1.00 0.08 0.01 -0.04 0.05
## heart_rate 0.08 1.00 -0.07 -0.10 -0.10
## cholesterol 0.01 -0.07 1.00 -0.07 0.08
## glucose -0.04 -0.10 -0.07 1.00 -0.01
## insulin 0.05 -0.10 0.08 -0.01 1.00
## Hours_of_Sleep 0.06 0.09 -0.07 -0.05 -0.10
## work_hours -0.08 0.14 -0.02 -0.07 0.05
## physical_activity -0.11 -0.03 0.08 0.02 -0.04
## daily_steps -0.06 -0.04 0.03 0.06 -0.01
## Daily_Calories -0.21 0.07 0.01 -0.07 0.01
## sugar_intake -0.02 0.08 0.02 -0.04 0.06
## water_intake 0.11 0.03 -0.03 0.09 0.10
## screen_time 0.12 0.06 -0.18 -0.09 -0.13
## stress_level 0.04 -0.06 -0.04 0.08 0.13
## mental_health_score -0.12 0.05 0.01 -0.01 0.06
## income 0.06 -0.04 0.09 -0.04 -0.02
## meals_per_day -0.05 0.07 -0.03 -0.09 -0.07
## electrolyte_level NA NA NA NA NA
## gene_marker_flag NA NA NA NA NA
## environmental_risk_score NA NA NA NA NA
## daily_supplement_dosage -0.12 -0.03 0.06 0.02 0.05
## weight_to_height_ratio 0.02 -0.08 -0.07 0.05 0.07
## adjusted_bmi 0.02 -0.08 -0.08 0.06 0.04
## Hours_of_Sleep work_hours physical_activity
## survey_code 0.11 0.01 0.05
## age 0.06 -0.15 -0.02
## height 0.03 -0.06 0.10
## weight -0.04 -0.08 -0.09
## Body_Mass_Index -0.04 -0.03 -0.12
## bmi_estimated -0.04 -0.03 -0.12
## bmi_scaled -0.04 -0.03 -0.12
## bmi_corrected -0.03 -0.03 -0.12
## waist_size 0.08 0.03 0.00
## blood_pressure 0.06 -0.08 -0.11
## heart_rate 0.09 0.14 -0.03
## cholesterol -0.07 -0.02 0.08
## glucose -0.05 -0.07 0.02
## insulin -0.10 0.05 -0.04
## Hours_of_Sleep 1.00 0.00 -0.11
## work_hours 0.00 1.00 0.04
## physical_activity -0.11 0.04 1.00
## daily_steps 0.02 -0.03 0.03
## Daily_Calories -0.04 0.02 0.10
## sugar_intake -0.09 0.08 0.06
## water_intake -0.01 -0.06 0.05
## screen_time 0.07 -0.03 -0.04
## stress_level 0.00 -0.16 -0.02
## mental_health_score 0.06 0.11 -0.13
## income -0.01 -0.13 -0.05
## meals_per_day -0.10 0.11 0.06
## electrolyte_level NA NA NA
## gene_marker_flag NA NA NA
## environmental_risk_score NA NA NA
## daily_supplement_dosage -0.10 0.06 -0.11
## weight_to_height_ratio -0.04 -0.06 -0.11
## adjusted_bmi -0.04 -0.03 -0.12
## daily_steps Daily_Calories sugar_intake water_intake
## survey_code 0.03 0.05 0.04 -0.05
## age 0.13 0.12 -0.08 0.07
## height -0.11 0.00 -0.03 -0.08
## weight -0.01 -0.06 -0.06 -0.01
## Body_Mass_Index 0.06 -0.05 -0.03 0.04
## bmi_estimated 0.06 -0.05 -0.03 0.04
## bmi_scaled 0.06 -0.05 -0.03 0.04
## bmi_corrected 0.06 -0.05 -0.03 0.04
## waist_size -0.02 0.00 -0.14 0.07
## blood_pressure -0.06 -0.21 -0.02 0.11
## heart_rate -0.04 0.07 0.08 0.03
## cholesterol 0.03 0.01 0.02 -0.03
## glucose 0.06 -0.07 -0.04 0.09
## insulin -0.01 0.01 0.06 0.10
## Hours_of_Sleep 0.02 -0.04 -0.09 -0.01
## work_hours -0.03 0.02 0.08 -0.06
## physical_activity 0.03 0.10 0.06 0.05
## daily_steps 1.00 0.11 0.04 0.05
## Daily_Calories 0.11 1.00 -0.02 -0.07
## sugar_intake 0.04 -0.02 1.00 -0.04
## water_intake 0.05 -0.07 -0.04 1.00
## screen_time -0.03 -0.05 -0.05 -0.09
## stress_level -0.04 -0.06 0.11 0.05
## mental_health_score 0.06 -0.02 0.00 0.03
## income 0.00 -0.12 -0.05 -0.09
## meals_per_day 0.05 -0.02 0.12 -0.05
## electrolyte_level NA NA NA NA
## gene_marker_flag NA NA NA NA
## environmental_risk_score NA NA NA NA
## daily_supplement_dosage 0.06 -0.10 0.06 -0.06
## weight_to_height_ratio 0.03 -0.06 -0.05 0.02
## adjusted_bmi 0.06 -0.05 -0.03 0.04
## screen_time stress_level mental_health_score income
## survey_code 0.04 -0.08 -0.07 -0.07
## age 0.05 0.01 -0.05 0.11
## height 0.07 -0.02 -0.03 0.05
## weight 0.02 0.05 0.05 0.07
## Body_Mass_Index -0.01 0.04 0.07 0.03
## bmi_estimated -0.01 0.04 0.07 0.03
## bmi_scaled -0.01 0.04 0.07 0.03
## bmi_corrected -0.01 0.04 0.07 0.03
## waist_size 0.04 -0.04 0.09 0.04
## blood_pressure 0.12 0.04 -0.12 0.06
## heart_rate 0.06 -0.06 0.05 -0.04
## cholesterol -0.18 -0.04 0.01 0.09
## glucose -0.09 0.08 -0.01 -0.04
## insulin -0.13 0.13 0.06 -0.02
## Hours_of_Sleep 0.07 0.00 0.06 -0.01
## work_hours -0.03 -0.16 0.11 -0.13
## physical_activity -0.04 -0.02 -0.13 -0.05
## daily_steps -0.03 -0.04 0.06 0.00
## Daily_Calories -0.05 -0.06 -0.02 -0.12
## sugar_intake -0.05 0.11 0.00 -0.05
## water_intake -0.09 0.05 0.03 -0.09
## screen_time 1.00 -0.09 -0.08 -0.03
## stress_level -0.09 1.00 0.08 -0.09
## mental_health_score -0.08 0.08 1.00 -0.08
## income -0.03 -0.09 -0.08 1.00
## meals_per_day 0.06 -0.03 0.06 -0.07
## electrolyte_level NA NA NA NA
## gene_marker_flag NA NA NA NA
## environmental_risk_score NA NA NA NA
## daily_supplement_dosage 0.00 0.08 0.00 -0.13
## weight_to_height_ratio 0.00 0.05 0.07 0.05
## adjusted_bmi -0.01 0.04 0.07 0.03
## meals_per_day electrolyte_level gene_marker_flag
## survey_code 0.03 NA NA
## age -0.04 NA NA
## height -0.04 NA NA
## weight 0.00 NA NA
## Body_Mass_Index 0.04 NA NA
## bmi_estimated 0.04 NA NA
## bmi_scaled 0.04 NA NA
## bmi_corrected 0.04 NA NA
## waist_size -0.01 NA NA
## blood_pressure -0.05 NA NA
## heart_rate 0.07 NA NA
## cholesterol -0.03 NA NA
## glucose -0.09 NA NA
## insulin -0.07 NA NA
## Hours_of_Sleep -0.10 NA NA
## work_hours 0.11 NA NA
## physical_activity 0.06 NA NA
## daily_steps 0.05 NA NA
## Daily_Calories -0.02 NA NA
## sugar_intake 0.12 NA NA
## water_intake -0.05 NA NA
## screen_time 0.06 NA NA
## stress_level -0.03 NA NA
## mental_health_score 0.06 NA NA
## income -0.07 NA NA
## meals_per_day 1.00 NA NA
## electrolyte_level NA 1 NA
## gene_marker_flag NA NA 1
## environmental_risk_score NA NA NA
## daily_supplement_dosage 0.11 NA NA
## weight_to_height_ratio 0.02 NA NA
## adjusted_bmi 0.04 NA NA
## environmental_risk_score daily_supplement_dosage
## survey_code NA -0.08
## age NA -0.10
## height NA -0.09
## weight NA -0.07
## Body_Mass_Index NA 0.00
## bmi_estimated NA 0.00
## bmi_scaled NA 0.00
## bmi_corrected NA 0.01
## waist_size NA -0.07
## blood_pressure NA -0.12
## heart_rate NA -0.03
## cholesterol NA 0.06
## glucose NA 0.02
## insulin NA 0.05
## Hours_of_Sleep NA -0.10
## work_hours NA 0.06
## physical_activity NA -0.11
## daily_steps NA 0.06
## Daily_Calories NA -0.10
## sugar_intake NA 0.06
## water_intake NA -0.06
## screen_time NA 0.00
## stress_level NA 0.08
## mental_health_score NA 0.00
## income NA -0.13
## meals_per_day NA 0.11
## electrolyte_level NA NA
## gene_marker_flag NA NA
## environmental_risk_score 1 NA
## daily_supplement_dosage NA 1.00
## weight_to_height_ratio NA -0.03
## adjusted_bmi NA 0.00
## weight_to_height_ratio adjusted_bmi
## survey_code -0.10 -0.08
## age -0.04 -0.04
## height -0.33 -0.55
## weight 0.95 0.84
## Body_Mass_Index 0.97 1.00
## bmi_estimated 0.97 1.00
## bmi_scaled 0.97 1.00
## bmi_corrected 0.96 1.00
## waist_size 0.04 0.00
## blood_pressure 0.02 0.02
## heart_rate -0.08 -0.08
## cholesterol -0.07 -0.08
## glucose 0.05 0.06
## insulin 0.07 0.04
## Hours_of_Sleep -0.04 -0.04
## work_hours -0.06 -0.03
## physical_activity -0.11 -0.12
## daily_steps 0.03 0.06
## Daily_Calories -0.06 -0.05
## sugar_intake -0.05 -0.03
## water_intake 0.02 0.04
## screen_time 0.00 -0.01
## stress_level 0.05 0.04
## mental_health_score 0.07 0.07
## income 0.05 0.03
## meals_per_day 0.02 0.04
## electrolyte_level NA NA
## gene_marker_flag NA NA
## environmental_risk_score NA NA
## daily_supplement_dosage -0.03 0.00
## weight_to_height_ratio 1.00 0.97
## adjusted_bmi 0.97 1.00
# 16. Create a training set using random number generator engine
# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)
# Draw half of the rows, without replacement, as the training set.
training_set <- sample_frac(df_sorted, size = 0.50, replace = FALSE)
head(training_set, n = 5)
# 17. Plot a scatter plot for any 2 variables in your dataset
library(ggplot2)
# Weight against BMI for the raw (uncleaned) import, one cross per respondent.
health_lifestyle_classification %>%
  ggplot(mapping = aes(x = weight, y = bmi)) +
  geom_point(shape = 4, color = "orange") +
  labs(
    title = "Scatter Plot of Weight vs BMI",
    x = "Weight (Kg)",
    y = "BMI "
  ) +
  theme_minimal()
# 18. Plot a bar plot for any 2 variables in your dataset
library(dplyr)
library(ggplot2)
# Count respondents for every smoking_level x gender combination.
smoking_gender_count <- health_lifestyle_classification %>%
  group_by(smoking_level, gender) %>%
  summarise(count = n(), .groups = "drop")
# Grouped bar chart: one bar per gender within each smoking level, with the
# count printed just above each bar. Bars and labels share the same dodge
# width so every label sits over its own bar.
# The y axis is left to scale from the data: the dataset has 569 rows, so a
# fixed window of 16000-16850 would put every bar and label out of view.
ggplot(smoking_gender_count, aes(x = smoking_level, y = count, fill = gender)) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_text(
    aes(label = count),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    size = 3.5
  ) +
  labs(
    title = "Count by Smoking Level and Gender",
    x = "Smoking Level",
    y = "Count"
  ) +
  scale_fill_manual(values = c("steelblue", "orange")) +
  theme_minimal()