library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Set the working directory so the relative file name below resolves.
# NOTE(review): this is an absolute, machine-specific Windows path; the script
# will stop here on any machine where the folder does not exist. Change the
# path (or run from a project root) before executing elsewhere.
setwd("D:\\ISBA 2025\\Semi 2\\DataAnalytics")

# Load the Excel file from the working directory into a tibble
health_lifestyle_classification <- read_excel("health_lifestyle_classification_Few.xlsx")

Exploration

1.Print the structure of your dataset

str(health_lifestyle_classification)
## tibble [569 × 48] (S3: tbl_df/tbl/data.frame)
##  $ survey_code             : num [1:569] 1 2 3 4 5 6 7 8 9 10 ...
##  $ age                     : num [1:569] 56 69 46 32 60 25 78 38 56 75 ...
##  $ gender                  : chr [1:569] "Male" "Female" "Male" "Female" ...
##  $ height                  : num [1:569] 173 163 177 172 164 ...
##  $ weight                  : num [1:569] 56.9 97.8 80.7 63.1 40 ...
##  $ bmi                     : num [1:569] 18.9 36.7 25.7 21.3 14.9 ...
##  $ bmi_estimated           : num [1:569] 18.9 36.7 25.7 21.3 14.9 ...
##  $ bmi_scaled              : num [1:569] 56.7 110.1 77 64 44.8 ...
##  $ bmi_corrected           : num [1:569] 19 36.5 25.6 21.2 14.8 ...
##  $ waist_size              : num [1:569] 72.2 85.6 90.3 100.5 69 ...
##  $ blood_pressure          : num [1:569] 118 118 123 148 151 ...
##  $ heart_rate              : num [1:569] 60.7 66.5 76 68.8 92.3 ...
##  $ cholesterol             : num [1:569] 215 116 138 203 200 ...
##  $ glucose                 : num [1:569] 103 116.9 89.2 128.4 94.8 ...
##  $ insulin                 : num [1:569] NA 10.1 NA 18.7 16 ...
##  $ sleep_hours             : num [1:569] 6.48 8.43 5.7 5.19 7.91 ...
##  $ sleep_quality           : chr [1:569] "Fair" "Good" "Poor" "Good" ...
##  $ work_hours              : num [1:569] 7.67 9.52 5.83 9.49 7.28 ...
##  $ physical_activity       : num [1:569] 0.357 0.568 3.764 0.889 2.902 ...
##  $ daily_steps             : num [1:569] 13321 11911 2974 5322 9791 ...
##  $ calorie_intake          : num [1:569] 2674 2650 1747 2034 2386 ...
##  $ sugar_intake            : num [1:569] 44.5 74.7 19.7 82.6 46 ...
##  $ alcohol_consumption     : chr [1:569] NA "Regularly" "Regularly" "Occasionally" ...
##  $ smoking_level           : chr [1:569] "Non-smoker" "Light" "Heavy" "Heavy" ...
##  $ water_intake            : num [1:569] 1.694 0.716 2.488 2.643 1.968 ...
##  $ screen_time             : num [1:569] 5 5.93 4.37 4.12 3.18 ...
##  $ stress_level            : num [1:569] 2 3 0 10 9 7 7 7 2 10 ...
##  $ mental_health_score     : num [1:569] 8 9 1 4 7 6 1 2 9 9 ...
##  $ mental_health_support   : chr [1:569] "No" "No" "No" "No" ...
##  $ education_level         : chr [1:569] "PhD" "High School" "Master" "Master" ...
##  $ job_type                : chr [1:569] "Tech" "Office" "Office" "Labor" ...
##  $ occupation              : chr [1:569] "Farmer" "Engineer" "Teacher" "Teacher" ...
##  $ income                  : num [1:569] 6760 6241 3429 2619 3662 ...
##  $ diet_type               : chr [1:569] "Vegan" "Vegan" "Vegan" "Vegetarian" ...
##  $ exercise_type           : chr [1:569] "Strength" "Cardio" "Cardio" "Mixed" ...
##  $ device_usage            : chr [1:569] "High" "Moderate" "High" "Low" ...
##  $ healthcare_access       : chr [1:569] "Poor" "Moderate" "Good" "Moderate" ...
##  $ insurance               : chr [1:569] "No" "No" "Yes" "No" ...
##  $ sunlight_exposure       : chr [1:569] "High" "High" "High" "High" ...
##  $ meals_per_day           : num [1:569] 5 5 4 1 1 4 2 3 2 1 ...
##  $ caffeine_intake         : chr [1:569] "Moderate" "High" "Moderate" "None" ...
##  $ family_history          : chr [1:569] "No" "Yes" "No" "No" ...
##  $ pet_owner               : chr [1:569] "Yes" "No" "No" "Yes" ...
##  $ electrolyte_level       : num [1:569] 0 0 0 0 0 0 0 0 0 0 ...
##  $ gene_marker_flag        : num [1:569] 1 1 1 1 1 1 NA 1 NA 1 ...
##  $ environmental_risk_score: num [1:569] 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
##  $ daily_supplement_dosage : num [1:569] -2.276 6.239 5.424 8.389 0.333 ...
##  $ target                  : chr [1:569] "healthy" "healthy" "healthy" "healthy" ...

2.List the variables in your dataset

colnames(health_lifestyle_classification)
##  [1] "survey_code"              "age"                     
##  [3] "gender"                   "height"                  
##  [5] "weight"                   "bmi"                     
##  [7] "bmi_estimated"            "bmi_scaled"              
##  [9] "bmi_corrected"            "waist_size"              
## [11] "blood_pressure"           "heart_rate"              
## [13] "cholesterol"              "glucose"                 
## [15] "insulin"                  "sleep_hours"             
## [17] "sleep_quality"            "work_hours"              
## [19] "physical_activity"        "daily_steps"             
## [21] "calorie_intake"           "sugar_intake"            
## [23] "alcohol_consumption"      "smoking_level"           
## [25] "water_intake"             "screen_time"             
## [27] "stress_level"             "mental_health_score"     
## [29] "mental_health_support"    "education_level"         
## [31] "job_type"                 "occupation"              
## [33] "income"                   "diet_type"               
## [35] "exercise_type"            "device_usage"            
## [37] "healthcare_access"        "insurance"               
## [39] "sunlight_exposure"        "meals_per_day"           
## [41] "caffeine_intake"          "family_history"          
## [43] "pet_owner"                "electrolyte_level"       
## [45] "gene_marker_flag"         "environmental_risk_score"
## [47] "daily_supplement_dosage"  "target"

3.Print the top 15 rows of your dataset

head(health_lifestyle_classification,15)

Quality Check

4.Remove missing values in your dataset

# na.omit() works row-wise: a row is dropped as soon as ANY of its columns is
# NA, so a gap in a single variable removes the whole record.
df_no_na <- na.omit(health_lifestyle_classification)

5.Identify and remove duplicated data in your dataset

# Keep the first occurrence of each fully identical row and drop the repeats
df_cleaned <- unique(df_no_na)

5.1 Check the number of rows before and after cleaning

cat("Original rows:", nrow(health_lifestyle_classification), "\n")
## Original rows: 569
cat("After removing NA and duplicates:", nrow(df_cleaned), "\n")
## After removing NA and duplicates: 248

Data Wrangling

  6. Write a user-defined function using any of the variables from the dataset
# Categorize water intake into labelled bands.
#
# Vectorized: `water` may be a single number, a numeric vector of any length,
# or an empty vector; the result is a character vector of the same length.
#
# Bands (lower bound inclusive, upper bound exclusive):
#   water < 1        -> "Very Low"
#   1 <= water < 2   -> "Low"
#   2 <= water < 3   -> "Moderate"
#   water >= 3       -> "High"
# Missing values (NA) stay NA in the result.
categorize_water_intake <- function(water) {
  band_labels <- c("Very Low", "Low", "Moderate", "High")
  # findInterval() returns 0 below the first break, 1..3 for the later bands,
  # and NA for NA input, so adding 1 yields a direct index into the labels.
  band_labels[findInterval(water, c(1, 2, 3)) + 1L]
}

# Apply the function to create a new column in the cleaned dataset.
# vapply() pins the result to exactly one character value per row, so the new
# column is always a character vector (sapply() would silently return a list
# for an empty input). as.character() keeps a missing category as a character
# NA, which is the type vapply() expects.
df_cleaned$Water_Intake_Category <- vapply(
  df_cleaned$water_intake,
  function(w) as.character(categorize_water_intake(w)),
  character(1)
)

# Check the first few rows: raw intake next to its assigned category
head(df_cleaned[, c("water_intake", "Water_Intake_Category")], 10)
  7. Use data manipulation techniques and filter rows based on any logical criteria in the dataset
# Subset to the individuals categorized as "Very Low" or "Low" water intake
low_water_df <- dplyr::filter(
  df_cleaned,
  Water_Intake_Category %in% c("Very Low", "Low")
)

# Report how many individuals fall into that subset
cat(
  "Number of individuals with low or very low water intake:",
  nrow(low_water_df),
  "\n"
)
## Number of individuals with low or very low water intake: 124
# Display first few filtered entries
head(low_water_df[, c("age", "gender", "water_intake", "Water_Intake_Category")], 10)

Feature Engineering

8.Reorder multiple rows in descending order

# Oldest individuals first; within the same age, highest BMI first
df_sorted <- df_cleaned %>%
  dplyr::arrange(dplyr::desc(age), dplyr::desc(bmi))
head(df_sorted, 15)

9.Rename some of the column names in your dataset

# Give three columns more descriptive names (new_name = old_name)
df_sorted <- dplyr::rename(
  df_sorted,
  Body_Mass_Index = bmi,
  Hours_of_Sleep = sleep_hours,
  Daily_Calories = calorie_intake
)
colnames(df_sorted)
##  [1] "survey_code"              "age"                     
##  [3] "gender"                   "height"                  
##  [5] "weight"                   "Body_Mass_Index"         
##  [7] "bmi_estimated"            "bmi_scaled"              
##  [9] "bmi_corrected"            "waist_size"              
## [11] "blood_pressure"           "heart_rate"              
## [13] "cholesterol"              "glucose"                 
## [15] "insulin"                  "Hours_of_Sleep"          
## [17] "sleep_quality"            "work_hours"              
## [19] "physical_activity"        "daily_steps"             
## [21] "Daily_Calories"           "sugar_intake"            
## [23] "alcohol_consumption"      "smoking_level"           
## [25] "water_intake"             "screen_time"             
## [27] "stress_level"             "mental_health_score"     
## [29] "mental_health_support"    "education_level"         
## [31] "job_type"                 "occupation"              
## [33] "income"                   "diet_type"               
## [35] "exercise_type"            "device_usage"            
## [37] "healthcare_access"        "insurance"               
## [39] "sunlight_exposure"        "meals_per_day"           
## [41] "caffeine_intake"          "family_history"          
## [43] "pet_owner"                "electrolyte_level"       
## [45] "gene_marker_flag"         "environmental_risk_score"
## [47] "daily_supplement_dosage"  "target"                  
## [49] "Water_Intake_Category"

10.Add new variables to your data frame using a mathematical function

# Append two derived columns: weight divided by height, and BMI scaled by 1.1
df_sorted <- df_sorted %>%
  dplyr::mutate(
    weight_to_height_ratio = weight / height,
    adjusted_bmi = Body_Mass_Index * 1.1
  )
head(df_sorted, 15)

Insight Analyst

  11. Identify the dependent & independent variables

11.1 Identify dependent variable
library(dplyr)
# Dependent variable: the `target` column pulled out as a plain vector
dv <- df_sorted[["target"]]
head(dv)
## [1] "healthy"  "healthy"  "healthy"  "healthy"  "healthy"  "diseased"

11.2 Identify independent variables according to group metrics

# Split the predictors into thematic groups, one data frame per group.
demographic_iv <- dplyr::select(
  df_sorted,
  age, gender, education_level, job_type, occupation, income
)
health_metrics_iv <- dplyr::select(
  df_sorted,
  height, weight, Body_Mass_Index, blood_pressure, heart_rate,
  cholesterol, glucose, insulin, mental_health_score
)
lifestyle_iv <- dplyr::select(
  df_sorted,
  Hours_of_Sleep, sleep_quality, work_hours, physical_activity,
  daily_steps, exercise_type, screen_time
)
dietary_iv <- dplyr::select(
  df_sorted,
  Daily_Calories, sugar_intake, water_intake, diet_type,
  meals_per_day, caffeine_intake, daily_supplement_dosage
)
risk_iv <- dplyr::select(
  df_sorted,
  alcohol_consumption, smoking_level, stress_level, family_history,
  gene_marker_flag, environmental_risk_score
)
access_support <- dplyr::select(
  df_sorted,
  mental_health_support, healthcare_access, insurance,
  sunlight_exposure, pet_owner
)

# Preview the first three rows of every group
head(demographic_iv, 3)
head(health_metrics_iv, 3)
head(lifestyle_iv, 3)
head(dietary_iv, 3)
head(risk_iv, 3)
head(access_support, 3)
  12. Use reshaping techniques and create a new data frame by joining those variables
# Place the outcome vector in front of the dietary predictors;
# check.names = FALSE leaves the column names exactly as they are.
dietary_with_target <- data.frame(dv, dietary_iv, check.names = FALSE)
colnames(dietary_with_target)[1] <- "Health Status"
head(dietary_with_target, 5)

Statistical Analysis

  13. Print the summary statistics of your dataset

13.1 Select only numeric columns

library(ggplot2)
# Keep only the numeric columns. vapply() guarantees the column mask is a
# logical vector with one entry per column (sapply() would return a list for
# a data frame with no columns).
numeric_data <- df_sorted[vapply(df_sorted, is.numeric, logical(1))]
summary(numeric_data)
##   survey_code         age            height          weight      
##  Min.   :  2.0   Min.   :18.00   Min.   :140.0   Min.   : 40.00  
##  1st Qu.:133.8   1st Qu.:32.00   1st Qu.:162.1   1st Qu.: 61.14  
##  Median :289.5   Median :48.00   Median :169.2   Median : 70.01  
##  Mean   :285.7   Mean   :47.97   Mean   :168.6   Mean   : 69.99  
##  3rd Qu.:433.2   3rd Qu.:63.00   3rd Qu.:175.2   3rd Qu.: 79.36  
##  Max.   :569.0   Max.   :79.00   Max.   :202.3   Max.   :109.58  
##  Body_Mass_Index bmi_estimated     bmi_scaled     bmi_corrected  
##  Min.   :11.66   Min.   :11.66   Min.   : 34.98   Min.   :11.63  
##  1st Qu.:20.88   1st Qu.:20.88   1st Qu.: 62.65   1st Qu.:20.76  
##  Median :24.22   Median :24.22   Median : 72.65   Median :24.33  
##  Mean   :24.91   Mean   :24.91   Mean   : 74.73   Mean   :24.90  
##  3rd Qu.:28.46   3rd Qu.:28.46   3rd Qu.: 85.37   3rd Qu.:28.37  
##  Max.   :39.52   Max.   :39.52   Max.   :118.55   Max.   :39.56  
##    waist_size     blood_pressure     heart_rate      cholesterol   
##  Min.   : 58.35   Min.   : 80.56   Min.   : 49.69   Min.   :115.6  
##  1st Qu.: 76.64   1st Qu.:109.44   1st Qu.: 68.42   1st Qu.:168.3  
##  Median : 85.14   Median :118.66   Median : 74.45   Median :186.2  
##  Mean   : 85.67   Mean   :119.59   Mean   : 74.71   Mean   :188.0  
##  3rd Qu.: 94.04   3rd Qu.:129.73   3rd Qu.: 80.49   3rd Qu.:207.4  
##  Max.   :122.32   Max.   :160.46   Max.   :103.33   Max.   :279.9  
##     glucose          insulin       Hours_of_Sleep     work_hours    
##  Min.   : 42.72   Min.   : 1.756   Min.   : 3.000   Min.   : 2.865  
##  1st Qu.: 85.73   1st Qu.:10.818   1st Qu.: 6.230   1st Qu.: 6.734  
##  Median : 98.52   Median :14.003   Median : 7.215   Median : 7.963  
##  Mean   : 99.77   Mean   :14.228   Mean   : 7.160   Mean   : 7.956  
##  3rd Qu.:114.57   3rd Qu.:18.108   3rd Qu.: 8.180   3rd Qu.: 9.198  
##  Max.   :155.76   Max.   :28.696   Max.   :10.699   Max.   :12.872  
##  physical_activity  daily_steps    Daily_Calories    sugar_intake   
##  Min.   :0.000     Min.   : 1000   Min.   : 868.3   Min.   : -5.60  
##  1st Qu.:1.754     1st Qu.: 5252   1st Qu.:1947.2   1st Qu.: 49.51  
##  Median :2.998     Median : 6958   Median :2264.5   Median : 63.62  
##  Mean   :3.049     Mean   : 6840   Mean   :2233.1   Mean   : 62.22  
##  3rd Qu.:4.251     3rd Qu.: 8364   3rd Qu.:2490.0   3rd Qu.: 74.89  
##  Max.   :9.158     Max.   :12640   Max.   :3283.6   Max.   :108.70  
##   water_intake    screen_time      stress_level    mental_health_score
##  Min.   :0.500   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000     
##  1st Qu.:1.509   1st Qu.: 4.101   1st Qu.: 2.000   1st Qu.: 2.000     
##  Median :1.998   Median : 6.106   Median : 5.000   Median : 5.000     
##  Mean   :2.041   Mean   : 6.086   Mean   : 4.847   Mean   : 5.012     
##  3rd Qu.:2.559   3rd Qu.: 8.235   3rd Qu.: 7.000   3rd Qu.: 8.000     
##  Max.   :4.450   Max.   :14.367   Max.   :10.000   Max.   :10.000     
##      income      meals_per_day   electrolyte_level gene_marker_flag
##  Min.   :  500   Min.   :1.000   Min.   :0         Min.   :1       
##  1st Qu.: 2543   1st Qu.:2.000   1st Qu.:0         1st Qu.:1       
##  Median : 4113   Median :3.000   Median :0         Median :1       
##  Mean   : 4079   Mean   :2.887   Mean   :0         Mean   :1       
##  3rd Qu.: 5359   3rd Qu.:4.000   3rd Qu.:0         3rd Qu.:1       
##  Max.   :10370   Max.   :5.000   Max.   :0         Max.   :1       
##  environmental_risk_score daily_supplement_dosage weight_to_height_ratio
##  Min.   :5.5              Min.   :-9.7786         Min.   :0.2160        
##  1st Qu.:5.5              1st Qu.:-5.0411         1st Qu.:0.3596        
##  Median :5.5              Median :-0.2995         Median :0.4135        
##  Mean   :5.5              Mean   :-0.1421         Mean   :0.4168        
##  3rd Qu.:5.5              3rd Qu.: 4.7380         3rd Qu.:0.4724        
##  Max.   :5.5              Max.   : 9.9986         Max.   :0.6188        
##   adjusted_bmi  
##  Min.   :12.83  
##  1st Qu.:22.97  
##  Median :26.64  
##  Mean   :27.40  
##  3rd Qu.:31.30  
##  Max.   :43.47
  14. Use any of the numerical variables and perform: Mean, Median, Mode, Range
# Return the most frequent value of a vector.
# When several values share the highest count, the one that appears first in
# `v` is returned.
get_mode <- function(v) {
  distinct_vals <- unique(v)
  # Map every element to the position of its distinct value, then count how
  # often each position occurs.
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}

# Per-column descriptive statistics for every numeric variable.
# vapply(..., numeric(1)) is used instead of sapply() so each statistic is
# guaranteed to be exactly one number per column; a column that breaks that
# contract raises an error instead of silently changing the result's type.

# Mode: most frequent value per column (first one encountered on ties, so for
# a column whose values are all distinct this is simply its first value)
modes <- vapply(numeric_data, get_mode, numeric(1))

# Range: distance between the largest and the smallest value
ranges <- vapply(
  numeric_data,
  function(x) max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
  numeric(1)
)

# Mean and Median
means <- vapply(numeric_data, mean, numeric(1), na.rm = TRUE)
medians <- vapply(numeric_data, median, numeric(1), na.rm = TRUE)

# Combine all into one table (one row per variable, named after the column)
summary_table <- data.frame(
  Mean = means,
  Median = medians,
  Mode = modes,
  Range = ranges
)

print(summary_table)
##                                  Mean       Median         Mode        Range
## survey_code               285.7096774  289.5000000  274.0000000 5.670000e+02
## age                        47.9717742   48.0000000   79.0000000 6.100000e+01
## height                    168.6359171  169.1994565  140.0000000 6.228482e+01
## weight                     69.9921295   70.0080202   40.0000000 6.957590e+01
## Body_Mass_Index            24.9109833   24.2170333   37.9811770 2.785584e+01
## bmi_estimated              24.9109833   24.2170333   37.9811770 2.785584e+01
## bmi_scaled                 74.7329499   72.6510998  113.9435309 8.356752e+01
## bmi_corrected              24.9017327   24.3307096   37.6263938 2.792979e+01
## waist_size                 85.6696251   85.1404412   85.1187065 6.396912e+01
## blood_pressure            119.5904507  118.6617580  134.6035728 7.990143e+01
## heart_rate                 74.7144658   74.4484888   70.8469406 5.363990e+01
## cholesterol               187.9645858  186.2278623  248.8885628 1.643311e+02
## glucose                    99.7704221   98.5198903   60.4987097 1.130321e+02
## insulin                    14.2282506   14.0033370   21.5470081 2.693972e+01
## Hours_of_Sleep              7.1604312    7.2146879    6.6722405 7.698735e+00
## work_hours                  7.9558864    7.9634520    5.3054320 1.000748e+01
## physical_activity           3.0494869    2.9979726    0.0000000 9.157612e+00
## daily_steps              6840.4199491 6957.5963378 1000.0000000 1.163989e+04
## Daily_Calories           2233.0872868 2264.4589133 2485.0604576 2.415253e+03
## sugar_intake               62.2178626   63.6153974   55.1667093 1.142973e+02
## water_intake                2.0406867    1.9981441    3.1524530 3.950480e+00
## screen_time                 6.0857620    6.1060002    0.0000000 1.436693e+01
## stress_level                4.8467742    5.0000000    3.0000000 1.000000e+01
## mental_health_score         5.0120968    5.0000000    2.0000000 1.000000e+01
## income                   4078.5207842 4113.2535230  500.0000000 9.869897e+03
## meals_per_day               2.8870968    3.0000000    2.0000000 4.000000e+00
## electrolyte_level           0.0000000    0.0000000    0.0000000 0.000000e+00
## gene_marker_flag            1.0000000    1.0000000    1.0000000 0.000000e+00
## environmental_risk_score    5.5000000    5.5000000    5.5000000 0.000000e+00
## daily_supplement_dosage    -0.1420907   -0.2995412   -1.4286092 1.977718e+01
## weight_to_height_ratio      0.4167587    0.4135018    0.6001306 4.028484e-01
## adjusted_bmi               27.4020816   26.6387366   41.7792947 3.064142e+01
summary_table_rounded <- round(summary_table, 2)
print(summary_table_rounded)
##                             Mean  Median    Mode    Range
## survey_code               285.71  289.50  274.00   567.00
## age                        47.97   48.00   79.00    61.00
## height                    168.64  169.20  140.00    62.28
## weight                     69.99   70.01   40.00    69.58
## Body_Mass_Index            24.91   24.22   37.98    27.86
## bmi_estimated              24.91   24.22   37.98    27.86
## bmi_scaled                 74.73   72.65  113.94    83.57
## bmi_corrected              24.90   24.33   37.63    27.93
## waist_size                 85.67   85.14   85.12    63.97
## blood_pressure            119.59  118.66  134.60    79.90
## heart_rate                 74.71   74.45   70.85    53.64
## cholesterol               187.96  186.23  248.89   164.33
## glucose                    99.77   98.52   60.50   113.03
## insulin                    14.23   14.00   21.55    26.94
## Hours_of_Sleep              7.16    7.21    6.67     7.70
## work_hours                  7.96    7.96    5.31    10.01
## physical_activity           3.05    3.00    0.00     9.16
## daily_steps              6840.42 6957.60 1000.00 11639.89
## Daily_Calories           2233.09 2264.46 2485.06  2415.25
## sugar_intake               62.22   63.62   55.17   114.30
## water_intake                2.04    2.00    3.15     3.95
## screen_time                 6.09    6.11    0.00    14.37
## stress_level                4.85    5.00    3.00    10.00
## mental_health_score         5.01    5.00    2.00    10.00
## income                   4078.52 4113.25  500.00  9869.90
## meals_per_day               2.89    3.00    2.00     4.00
## electrolyte_level           0.00    0.00    0.00     0.00
## gene_marker_flag            1.00    1.00    1.00     0.00
## environmental_risk_score    5.50    5.50    5.50     0.00
## daily_supplement_dosage    -0.14   -0.30   -1.43    19.78
## weight_to_height_ratio      0.42    0.41    0.60     0.40
## adjusted_bmi               27.40   26.64   41.78    30.64

Data Prep for Modeling

15.Find the correlation between any 2 variables using Pearson correlation

# Pearson correlation between age and BMI, shown to four decimal places
correlation_age_bmi <- with(
  df_sorted,
  cor(age, Body_Mass_Index, method = "pearson", use = "complete.obs")
)
round(correlation_age_bmi, 4)
## [1] -0.0381
# Pairwise Pearson correlations across every numeric column.
# NOTE: electrolyte_level, gene_marker_flag and environmental_risk_score hold a
# single constant value in this data (min equals max in the summary), so their
# standard deviation is zero; cor() warns about it and returns NA for every
# correlation involving those columns.
correlation_matrix <- cor(numeric_data, method = "pearson", use = "complete.obs")
## Warning in cor(numeric_data, method = "pearson", use = "complete.obs"): the
## standard deviation is zero
round(correlation_matrix, 2)
##                          survey_code   age height weight Body_Mass_Index
## survey_code                     1.00  0.03   0.00  -0.10           -0.08
## age                             0.03  1.00   0.02  -0.04           -0.04
## height                          0.00  0.02   1.00  -0.03           -0.55
## weight                         -0.10 -0.04  -0.03   1.00            0.84
## Body_Mass_Index                -0.08 -0.04  -0.55   0.84            1.00
## bmi_estimated                  -0.08 -0.04  -0.55   0.84            1.00
## bmi_scaled                     -0.08 -0.04  -0.55   0.84            1.00
## bmi_corrected                  -0.09 -0.04  -0.56   0.84            1.00
## waist_size                      0.00  0.00   0.14   0.09            0.00
## blood_pressure                 -0.10  0.06   0.03   0.03            0.02
## heart_rate                     -0.01 -0.13   0.01  -0.08           -0.08
## cholesterol                     0.03 -0.06   0.08  -0.05           -0.08
## glucose                        -0.10 -0.03  -0.06   0.03            0.06
## insulin                         0.00 -0.02   0.04   0.09            0.04
## Hours_of_Sleep                  0.11  0.06   0.03  -0.04           -0.04
## work_hours                      0.01 -0.15  -0.06  -0.08           -0.03
## physical_activity               0.05 -0.02   0.10  -0.09           -0.12
## daily_steps                     0.03  0.13  -0.11  -0.01            0.06
## Daily_Calories                  0.05  0.12   0.00  -0.06           -0.05
## sugar_intake                    0.04 -0.08  -0.03  -0.06           -0.03
## water_intake                   -0.05  0.07  -0.08  -0.01            0.04
## screen_time                     0.04  0.05   0.07   0.02           -0.01
## stress_level                   -0.08  0.01  -0.02   0.05            0.04
## mental_health_score            -0.07 -0.05  -0.03   0.05            0.07
## income                         -0.07  0.11   0.05   0.07            0.03
## meals_per_day                   0.03 -0.04  -0.04   0.00            0.04
## electrolyte_level                 NA    NA     NA     NA              NA
## gene_marker_flag                  NA    NA     NA     NA              NA
## environmental_risk_score          NA    NA     NA     NA              NA
## daily_supplement_dosage        -0.08 -0.10  -0.09  -0.07            0.00
## weight_to_height_ratio         -0.10 -0.04  -0.33   0.95            0.97
## adjusted_bmi                   -0.08 -0.04  -0.55   0.84            1.00
##                          bmi_estimated bmi_scaled bmi_corrected waist_size
## survey_code                      -0.08      -0.08         -0.09       0.00
## age                              -0.04      -0.04         -0.04       0.00
## height                           -0.55      -0.55         -0.56       0.14
## weight                            0.84       0.84          0.84       0.09
## Body_Mass_Index                   1.00       1.00          1.00       0.00
## bmi_estimated                     1.00       1.00          1.00       0.00
## bmi_scaled                        1.00       1.00          1.00       0.00
## bmi_corrected                     1.00       1.00          1.00       0.00
## waist_size                        0.00       0.00          0.00       1.00
## blood_pressure                    0.02       0.02          0.01       0.11
## heart_rate                       -0.08      -0.08         -0.08      -0.06
## cholesterol                      -0.08      -0.08         -0.08      -0.01
## glucose                           0.06       0.06          0.07      -0.01
## insulin                           0.04       0.04          0.04       0.01
## Hours_of_Sleep                   -0.04      -0.04         -0.03       0.08
## work_hours                       -0.03      -0.03         -0.03       0.03
## physical_activity                -0.12      -0.12         -0.12       0.00
## daily_steps                       0.06       0.06          0.06      -0.02
## Daily_Calories                   -0.05      -0.05         -0.05       0.00
## sugar_intake                     -0.03      -0.03         -0.03      -0.14
## water_intake                      0.04       0.04          0.04       0.07
## screen_time                      -0.01      -0.01         -0.01       0.04
## stress_level                      0.04       0.04          0.04      -0.04
## mental_health_score               0.07       0.07          0.07       0.09
## income                            0.03       0.03          0.03       0.04
## meals_per_day                     0.04       0.04          0.04      -0.01
## electrolyte_level                   NA         NA            NA         NA
## gene_marker_flag                    NA         NA            NA         NA
## environmental_risk_score            NA         NA            NA         NA
## daily_supplement_dosage           0.00       0.00          0.01      -0.07
## weight_to_height_ratio            0.97       0.97          0.96       0.04
## adjusted_bmi                      1.00       1.00          1.00       0.00
##                          blood_pressure heart_rate cholesterol glucose insulin
## survey_code                       -0.10      -0.01        0.03   -0.10    0.00
## age                                0.06      -0.13       -0.06   -0.03   -0.02
## height                             0.03       0.01        0.08   -0.06    0.04
## weight                             0.03      -0.08       -0.05    0.03    0.09
## Body_Mass_Index                    0.02      -0.08       -0.08    0.06    0.04
## bmi_estimated                      0.02      -0.08       -0.08    0.06    0.04
## bmi_scaled                         0.02      -0.08       -0.08    0.06    0.04
## bmi_corrected                      0.01      -0.08       -0.08    0.07    0.04
## waist_size                         0.11      -0.06       -0.01   -0.01    0.01
## blood_pressure                     1.00       0.08        0.01   -0.04    0.05
## heart_rate                         0.08       1.00       -0.07   -0.10   -0.10
## cholesterol                        0.01      -0.07        1.00   -0.07    0.08
## glucose                           -0.04      -0.10       -0.07    1.00   -0.01
## insulin                            0.05      -0.10        0.08   -0.01    1.00
## Hours_of_Sleep                     0.06       0.09       -0.07   -0.05   -0.10
## work_hours                        -0.08       0.14       -0.02   -0.07    0.05
## physical_activity                 -0.11      -0.03        0.08    0.02   -0.04
## daily_steps                       -0.06      -0.04        0.03    0.06   -0.01
## Daily_Calories                    -0.21       0.07        0.01   -0.07    0.01
## sugar_intake                      -0.02       0.08        0.02   -0.04    0.06
## water_intake                       0.11       0.03       -0.03    0.09    0.10
## screen_time                        0.12       0.06       -0.18   -0.09   -0.13
## stress_level                       0.04      -0.06       -0.04    0.08    0.13
## mental_health_score               -0.12       0.05        0.01   -0.01    0.06
## income                             0.06      -0.04        0.09   -0.04   -0.02
## meals_per_day                     -0.05       0.07       -0.03   -0.09   -0.07
## electrolyte_level                    NA         NA          NA      NA      NA
## gene_marker_flag                     NA         NA          NA      NA      NA
## environmental_risk_score             NA         NA          NA      NA      NA
## daily_supplement_dosage           -0.12      -0.03        0.06    0.02    0.05
## weight_to_height_ratio             0.02      -0.08       -0.07    0.05    0.07
## adjusted_bmi                       0.02      -0.08       -0.08    0.06    0.04
##                          Hours_of_Sleep work_hours physical_activity
## survey_code                        0.11       0.01              0.05
## age                                0.06      -0.15             -0.02
## height                             0.03      -0.06              0.10
## weight                            -0.04      -0.08             -0.09
## Body_Mass_Index                   -0.04      -0.03             -0.12
## bmi_estimated                     -0.04      -0.03             -0.12
## bmi_scaled                        -0.04      -0.03             -0.12
## bmi_corrected                     -0.03      -0.03             -0.12
## waist_size                         0.08       0.03              0.00
## blood_pressure                     0.06      -0.08             -0.11
## heart_rate                         0.09       0.14             -0.03
## cholesterol                       -0.07      -0.02              0.08
## glucose                           -0.05      -0.07              0.02
## insulin                           -0.10       0.05             -0.04
## Hours_of_Sleep                     1.00       0.00             -0.11
## work_hours                         0.00       1.00              0.04
## physical_activity                 -0.11       0.04              1.00
## daily_steps                        0.02      -0.03              0.03
## Daily_Calories                    -0.04       0.02              0.10
## sugar_intake                      -0.09       0.08              0.06
## water_intake                      -0.01      -0.06              0.05
## screen_time                        0.07      -0.03             -0.04
## stress_level                       0.00      -0.16             -0.02
## mental_health_score                0.06       0.11             -0.13
## income                            -0.01      -0.13             -0.05
## meals_per_day                     -0.10       0.11              0.06
## electrolyte_level                    NA         NA                NA
## gene_marker_flag                     NA         NA                NA
## environmental_risk_score             NA         NA                NA
## daily_supplement_dosage           -0.10       0.06             -0.11
## weight_to_height_ratio            -0.04      -0.06             -0.11
## adjusted_bmi                      -0.04      -0.03             -0.12
##                          daily_steps Daily_Calories sugar_intake water_intake
## survey_code                     0.03           0.05         0.04        -0.05
## age                             0.13           0.12        -0.08         0.07
## height                         -0.11           0.00        -0.03        -0.08
## weight                         -0.01          -0.06        -0.06        -0.01
## Body_Mass_Index                 0.06          -0.05        -0.03         0.04
## bmi_estimated                   0.06          -0.05        -0.03         0.04
## bmi_scaled                      0.06          -0.05        -0.03         0.04
## bmi_corrected                   0.06          -0.05        -0.03         0.04
## waist_size                     -0.02           0.00        -0.14         0.07
## blood_pressure                 -0.06          -0.21        -0.02         0.11
## heart_rate                     -0.04           0.07         0.08         0.03
## cholesterol                     0.03           0.01         0.02        -0.03
## glucose                         0.06          -0.07        -0.04         0.09
## insulin                        -0.01           0.01         0.06         0.10
## Hours_of_Sleep                  0.02          -0.04        -0.09        -0.01
## work_hours                     -0.03           0.02         0.08        -0.06
## physical_activity               0.03           0.10         0.06         0.05
## daily_steps                     1.00           0.11         0.04         0.05
## Daily_Calories                  0.11           1.00        -0.02        -0.07
## sugar_intake                    0.04          -0.02         1.00        -0.04
## water_intake                    0.05          -0.07        -0.04         1.00
## screen_time                    -0.03          -0.05        -0.05        -0.09
## stress_level                   -0.04          -0.06         0.11         0.05
## mental_health_score             0.06          -0.02         0.00         0.03
## income                          0.00          -0.12        -0.05        -0.09
## meals_per_day                   0.05          -0.02         0.12        -0.05
## electrolyte_level                 NA             NA           NA           NA
## gene_marker_flag                  NA             NA           NA           NA
## environmental_risk_score          NA             NA           NA           NA
## daily_supplement_dosage         0.06          -0.10         0.06        -0.06
## weight_to_height_ratio          0.03          -0.06        -0.05         0.02
## adjusted_bmi                    0.06          -0.05        -0.03         0.04
##                          screen_time stress_level mental_health_score income
## survey_code                     0.04        -0.08               -0.07  -0.07
## age                             0.05         0.01               -0.05   0.11
## height                          0.07        -0.02               -0.03   0.05
## weight                          0.02         0.05                0.05   0.07
## Body_Mass_Index                -0.01         0.04                0.07   0.03
## bmi_estimated                  -0.01         0.04                0.07   0.03
## bmi_scaled                     -0.01         0.04                0.07   0.03
## bmi_corrected                  -0.01         0.04                0.07   0.03
## waist_size                      0.04        -0.04                0.09   0.04
## blood_pressure                  0.12         0.04               -0.12   0.06
## heart_rate                      0.06        -0.06                0.05  -0.04
## cholesterol                    -0.18        -0.04                0.01   0.09
## glucose                        -0.09         0.08               -0.01  -0.04
## insulin                        -0.13         0.13                0.06  -0.02
## Hours_of_Sleep                  0.07         0.00                0.06  -0.01
## work_hours                     -0.03        -0.16                0.11  -0.13
## physical_activity              -0.04        -0.02               -0.13  -0.05
## daily_steps                    -0.03        -0.04                0.06   0.00
## Daily_Calories                 -0.05        -0.06               -0.02  -0.12
## sugar_intake                   -0.05         0.11                0.00  -0.05
## water_intake                   -0.09         0.05                0.03  -0.09
## screen_time                     1.00        -0.09               -0.08  -0.03
## stress_level                   -0.09         1.00                0.08  -0.09
## mental_health_score            -0.08         0.08                1.00  -0.08
## income                         -0.03        -0.09               -0.08   1.00
## meals_per_day                   0.06        -0.03                0.06  -0.07
## electrolyte_level                 NA           NA                  NA     NA
## gene_marker_flag                  NA           NA                  NA     NA
## environmental_risk_score          NA           NA                  NA     NA
## daily_supplement_dosage         0.00         0.08                0.00  -0.13
## weight_to_height_ratio          0.00         0.05                0.07   0.05
## adjusted_bmi                   -0.01         0.04                0.07   0.03
##                          meals_per_day electrolyte_level gene_marker_flag
## survey_code                       0.03                NA               NA
## age                              -0.04                NA               NA
## height                           -0.04                NA               NA
## weight                            0.00                NA               NA
## Body_Mass_Index                   0.04                NA               NA
## bmi_estimated                     0.04                NA               NA
## bmi_scaled                        0.04                NA               NA
## bmi_corrected                     0.04                NA               NA
## waist_size                       -0.01                NA               NA
## blood_pressure                   -0.05                NA               NA
## heart_rate                        0.07                NA               NA
## cholesterol                      -0.03                NA               NA
## glucose                          -0.09                NA               NA
## insulin                          -0.07                NA               NA
## Hours_of_Sleep                   -0.10                NA               NA
## work_hours                        0.11                NA               NA
## physical_activity                 0.06                NA               NA
## daily_steps                       0.05                NA               NA
## Daily_Calories                   -0.02                NA               NA
## sugar_intake                      0.12                NA               NA
## water_intake                     -0.05                NA               NA
## screen_time                       0.06                NA               NA
## stress_level                     -0.03                NA               NA
## mental_health_score               0.06                NA               NA
## income                           -0.07                NA               NA
## meals_per_day                     1.00                NA               NA
## electrolyte_level                   NA                 1               NA
## gene_marker_flag                    NA                NA                1
## environmental_risk_score            NA                NA               NA
## daily_supplement_dosage           0.11                NA               NA
## weight_to_height_ratio            0.02                NA               NA
## adjusted_bmi                      0.04                NA               NA
##                          environmental_risk_score daily_supplement_dosage
## survey_code                                    NA                   -0.08
## age                                            NA                   -0.10
## height                                         NA                   -0.09
## weight                                         NA                   -0.07
## Body_Mass_Index                                NA                    0.00
## bmi_estimated                                  NA                    0.00
## bmi_scaled                                     NA                    0.00
## bmi_corrected                                  NA                    0.01
## waist_size                                     NA                   -0.07
## blood_pressure                                 NA                   -0.12
## heart_rate                                     NA                   -0.03
## cholesterol                                    NA                    0.06
## glucose                                        NA                    0.02
## insulin                                        NA                    0.05
## Hours_of_Sleep                                 NA                   -0.10
## work_hours                                     NA                    0.06
## physical_activity                              NA                   -0.11
## daily_steps                                    NA                    0.06
## Daily_Calories                                 NA                   -0.10
## sugar_intake                                   NA                    0.06
## water_intake                                   NA                   -0.06
## screen_time                                    NA                    0.00
## stress_level                                   NA                    0.08
## mental_health_score                            NA                    0.00
## income                                         NA                   -0.13
## meals_per_day                                  NA                    0.11
## electrolyte_level                              NA                      NA
## gene_marker_flag                               NA                      NA
## environmental_risk_score                        1                      NA
## daily_supplement_dosage                        NA                    1.00
## weight_to_height_ratio                         NA                   -0.03
## adjusted_bmi                                   NA                    0.00
##                          weight_to_height_ratio adjusted_bmi
## survey_code                               -0.10        -0.08
## age                                       -0.04        -0.04
## height                                    -0.33        -0.55
## weight                                     0.95         0.84
## Body_Mass_Index                            0.97         1.00
## bmi_estimated                              0.97         1.00
## bmi_scaled                                 0.97         1.00
## bmi_corrected                              0.96         1.00
## waist_size                                 0.04         0.00
## blood_pressure                             0.02         0.02
## heart_rate                                -0.08        -0.08
## cholesterol                               -0.07        -0.08
## glucose                                    0.05         0.06
## insulin                                    0.07         0.04
## Hours_of_Sleep                            -0.04        -0.04
## work_hours                                -0.06        -0.03
## physical_activity                         -0.11        -0.12
## daily_steps                                0.03         0.06
## Daily_Calories                            -0.06        -0.05
## sugar_intake                              -0.05        -0.03
## water_intake                               0.02         0.04
## screen_time                                0.00        -0.01
## stress_level                               0.05         0.04
## mental_health_score                        0.07         0.07
## income                                     0.05         0.03
## meals_per_day                              0.02         0.04
## electrolyte_level                            NA           NA
## gene_marker_flag                             NA           NA
## environmental_risk_score                     NA           NA
## daily_supplement_dosage                   -0.03         0.00
## weight_to_height_ratio                     1.00         0.97
## adjusted_bmi                               0.97         1.00

16.Create a training set using a random number generator engine

# Fix the RNG seed so the random split below is reproducible between runs.
set.seed(123)

# Draw half of the rows of df_sorted, without replacement, as the training set.
training_set <- sample_frac(df_sorted, size = 0.50, replace = FALSE)

# Preview the first five sampled rows.
head(training_set, n = 5)

Chart Creation

17.Plot a scatter plot for any 2 variables in your dataset

library(ggplot2)
# Scatter plot of body weight (x) against BMI (y), one orange cross per row.
ggplot(data = health_lifestyle_classification) +
  geom_point(
    mapping = aes(x = weight, y = bmi),
    colour = "orange",
    shape = 4 # shape 4 is the "x" cross marker
  ) +
  theme_minimal() +
  labs(
    title = "Scatter Plot of Weight vs BMI",
    x = "Weight (Kg)",
    y = "BMI "
  )

18.Plot a bar plot for any 2 variables in your dataset

library(dplyr)
library(ggplot2)
# Count respondents for every smoking level / gender combination.
# count() groups, tallies and ungroups in one step, so the result is an
# ungrouped tibble with the columns smoking_level, gender and count.
smoking_gender_count <- health_lifestyle_classification %>%
  count(smoking_level, gender, name = "count")

# Grouped (dodged) bar chart: one bar per gender within each smoking level,
# with the count printed just above each bar.
ggplot(smoking_gender_count, aes(x = smoking_level, y = count, fill = gender)) +
  # geom_col() takes the bar height directly from `count`.
  geom_col(position = position_dodge(width = 0.9)) +
  # Same dodge width as the bars so every label sits over its own bar.
  geom_text(aes(label = count),
            position = position_dodge(width = 0.9),
            vjust = -0.25, size = 3.5) +
  labs(title = "Count by Smoking Level and Gender",
       x = "Smoking Level",
       y = "Count") +
  scale_fill_manual(values = c("steelblue", "orange")) +
  # Let the y-axis follow the data instead of a fixed window such as
  # c(16000, 16850): the loaded sample has only 569 rows, so a window that
  # high would hide every bar. Bars start exactly at zero (no lower padding)
  # and 10% headroom is kept on top for the count labels.
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  theme_minimal()