library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the obesity dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file exists -- consider a project-relative path.
obesity <- read.csv(
  file = "C:/Users/saisr/Downloads/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition/obesity.csv"
)
# Show each column's type together with its first few values.
str(object = obesity)
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
# Preview the first six rows (the default row count, made explicit).
head(obesity, n = 6L)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
# Per-column overview: quartiles and mean for numeric columns, length and
# class for character columns.
summary(object = obesity)
## Gender Age Height Weight
## Length:2111 Min. :14.00 Min. :1.450 Min. : 39.00
## Class :character 1st Qu.:19.95 1st Qu.:1.630 1st Qu.: 65.47
## Mode :character Median :22.78 Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## family_history_with_overweight FAVC FCVC
## Length:2111 Length:2111 Min. :1.000
## Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Median :2.386
## Mean :2.419
## 3rd Qu.:3.000
## Max. :3.000
## NCP CAEC SMOKE CH2O
## Min. :1.000 Length:2111 Length:2111 Min. :1.000
## 1st Qu.:2.659 Class :character Class :character 1st Qu.:1.585
## Median :3.000 Mode :character Mode :character Median :2.000
## Mean :2.686 Mean :2.008
## 3rd Qu.:3.000 3rd Qu.:2.477
## Max. :4.000 Max. :3.000
## SCC FAF TUE CALC
## Length:2111 Min. :0.0000 Min. :0.0000 Length:2111
## Class :character 1st Qu.:0.1245 1st Qu.:0.0000 Class :character
## Mode :character Median :1.0000 Median :0.6253 Mode :character
## Mean :1.0103 Mean :0.6579
## 3rd Qu.:1.6667 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.0000
## MTRANS NObeyesdad
## Length:2111 Length:2111
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Summarizing two key numeric columns: Height and Weight.
# Compute descriptive statistics for one numeric column.
#
# data:   a data frame.
# column: unquoted name of a numeric column of `data`.
#
# Returns the result of summarise() with the columns Min, Max, Mean, Median,
# Q1 (25th percentile), Q3 (75th percentile) and SD -- a single row when
# `data` is ungrouped. Missing values are ignored in every statistic.
summarise_numeric <- function(data, column) {
  data %>%
    summarise(
      Min = min({{ column }}, na.rm = TRUE),
      Max = max({{ column }}, na.rm = TRUE),
      Mean = mean({{ column }}, na.rm = TRUE),
      Median = median({{ column }}, na.rm = TRUE),
      Q1 = quantile({{ column }}, 0.25, na.rm = TRUE),
      Q3 = quantile({{ column }}, 0.75, na.rm = TRUE),
      SD = sd({{ column }}, na.rm = TRUE)
    )
}

# Summary statistics for Height
height_summary <- summarise_numeric(obesity, Height)
# Summary statistics for Weight
weight_summary <- summarise_numeric(obesity, Weight)
height_summary
## Min Max Mean Median Q1 Q3 SD
## 1 1.45 1.98 1.701677 1.700499 1.63 1.768464 0.09330482
weight_summary
## Min Max Mean Median Q1 Q3 SD
## 1 39 173 86.58606 83 65.47334 107.4307 26.19117
We’ll examine the categorical columns: Gender, NObeyesdad, and MTRANS.
# Frequency tables for three categorical columns. count() returns one row per
# distinct value, with the number of matching rows in column `n`.
gender_counts <- count(obesity, Gender)
weight_category_counts <- count(obesity, NObeyesdad)
transportation_counts <- count(obesity, MTRANS)
gender_counts
## Gender n
## 1 Female 1043
## 2 Male 1068
weight_category_counts
## NObeyesdad n
## 1 Insufficient_Weight 272
## 2 Normal_Weight 287
## 3 Obesity_Type_I 351
## 4 Obesity_Type_II 297
## 5 Obesity_Type_III 324
## 6 Overweight_Level_I 290
## 7 Overweight_Level_II 290
transportation_counts
## MTRANS n
## 1 Automobile 457
## 2 Bike 7
## 3 Motorbike 11
## 4 Public_Transportation 1580
## 5 Walking 56
# Height against weight, one point per row, coloured by obesity category.
ggplot(
  data = obesity,
  mapping = aes(x = Height, y = Weight, color = NObeyesdad)
) +
  geom_point() +
  ggtitle("Height vs. Weight by Weight Category") +
  xlab("Height (m)") +
  ylab("Weight (kg)") +
  labs(color = "Weight Category") +
  theme_minimal()
# Exploring the relationship between age and weight.
# Both variables are continuous, so a scatter plot is used. A boxplot with a
# continuous x collapses all ages into a single box per gender and hides the
# age-weight relationship the title refers to.
age_weight_plot <- ggplot(obesity, aes(x = Age, y = Weight)) +
  # Semi-transparent points keep dense regions readable.
  geom_point(aes(color = Gender), alpha = 0.6) +
  labs(
    title = "Relationship Between Age and Weight",
    x = "Age",
    y = "Weight"
  ) +
  theme_minimal()
age_weight_plot
# Exploring the relationship between CALC and FAF.
# CALC is a categorical frequency ("no", "Sometimes", "Frequently", ...). In
# the UCI data dictionary for this dataset it records alcohol consumption, not
# caloric intake, so the title and axis label say so.
# The name `caloric_plot` is kept so that any later reference keeps working.
caloric_plot <- ggplot(obesity, aes(x = CALC, y = FAF)) +
  # Jitter horizontally only: many rows share a category and would overplot,
  # while the FAF values themselves must stay where they are.
  geom_jitter(aes(color = Gender), width = 0.2, height = 0, alpha = 0.5) +
  labs(
    title = "Alcohol Consumption vs Physical Activity",
    x = "Alcohol Consumption (CALC)",
    y = "Physical Activity (FAF)"
  ) +
  theme_minimal()
caloric_plot
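The dataset consists of the following columns: Gender, Age, Height, Weight, family_history_with_overweight, FAVC, FCVC, NCP, CAEC, SMOKE, CH2O, SCC, FAF, TUE, CALC, MTRANS, and NObeyesdad.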
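The goal of this analysis is to examine how average age and physical activity frequency (FAF) differ across obesity levels (NObeyesdad).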
# Mean age within each obesity level (missing ages ignored).
# Grouping with group_by() yields a tibble with one row per level.
age_by_obesity <- summarise(
  group_by(obesity, NObeyesdad),
  Average_Age = mean(Age, na.rm = TRUE)
)
age_by_obesity
## # A tibble: 7 × 2
## NObeyesdad Average_Age
## <chr> <dbl>
## 1 Insufficient_Weight 19.8
## 2 Normal_Weight 21.7
## 3 Obesity_Type_I 25.9
## 4 Obesity_Type_II 28.2
## 5 Obesity_Type_III 23.5
## 6 Overweight_Level_I 23.4
## 7 Overweight_Level_II 27.0
# Boxplot of Physical Activity Frequency by Obesity Level.
# NObeyesdad is an ordered scale, so its levels are arranged from lightest to
# heaviest for this plot only; the `obesity` data frame itself is not
# modified. The default alphabetical order would place the obesity types
# before the overweight levels and obscure any trend.
obesity_level_order <- c(
  "Insufficient_Weight",
  "Normal_Weight",
  "Overweight_Level_I",
  "Overweight_Level_II",
  "Obesity_Type_I",
  "Obesity_Type_II",
  "Obesity_Type_III"
)
obesity %>%
  mutate(NObeyesdad = factor(NObeyesdad, levels = obesity_level_order)) %>%
  ggplot(aes(x = NObeyesdad, y = FAF, fill = NObeyesdad)) +
  # The fill legend would only repeat the x-axis categories, so it is hidden.
  geom_boxplot(show.legend = FALSE) +
  labs(
    title = "Physical Activity Frequency by Obesity Level",
    x = "Obesity Level",
    y = "Physical Activity Frequency"
  ) +
  theme_minimal() +
  # Rotate the long category names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))