data preparation
library (readxl)
obesity <- read_excel ("C:/Users/User/Desktop/Dataset_Obesity.xlsx" )
str (obesity)
tibble [2,111 × 16] (S3: tbl_df/tbl/data.frame)
$ Gender : chr [1:2111] "Female" "Female" "Male" "Male" ...
$ Age : num [1:2111] 21 21 23 27 22 29 23 22 24 22 ...
$ Height : num [1:2111] 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
$ Weight : num [1:2111] 64 56 77 87 89.8 53 55 53 64 68 ...
$ family_history_with_overweight: chr [1:2111] "yes" "yes" "yes" "no" ...
$ FAVC : chr [1:2111] "no" "no" "no" "no" ...
$ FCVC : num [1:2111] 2 3 2 3 2 2 3 2 3 2 ...
$ NCP : num [1:2111] 3 3 3 3 1 3 3 3 3 3 ...
$ CAEC : chr [1:2111] "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
$ SMOKE : chr [1:2111] "no" "yes" "no" "no" ...
$ CH2O : num [1:2111] 2 3 2 2 2 2 2 2 2 2 ...
$ SCC : chr [1:2111] "no" "yes" "no" "no" ...
$ FAF : num [1:2111] 0 3 2 2 0 0 1 3 1 1 ...
$ CALC : chr [1:2111] "no" "Sometimes" "Frequently" "Frequently" ...
$ MTRANS : chr [1:2111] "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
$ NObeyesdad : chr [1:2111] "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
As we see, there are some categorical data that should be transformed into factors
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
obesity <- obesity %>%
mutate_if (is.character, as.factor)
str (obesity)
tibble [2,111 × 16] (S3: tbl_df/tbl/data.frame)
$ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
$ Age : num [1:2111] 21 21 23 27 22 29 23 22 24 22 ...
$ Height : num [1:2111] 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
$ Weight : num [1:2111] 64 56 77 87 89.8 53 55 53 64 68 ...
$ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
$ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
$ FCVC : num [1:2111] 2 3 2 3 2 2 3 2 3 2 ...
$ NCP : num [1:2111] 3 3 3 3 1 3 3 3 3 3 ...
$ CAEC : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
$ SMOKE : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
$ CH2O : num [1:2111] 2 3 2 2 2 2 2 2 2 2 ...
$ SCC : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
$ FAF : num [1:2111] 0 3 2 2 0 0 1 3 1 1 ...
$ CALC : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
$ MTRANS : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
$ NObeyesdad : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...
Checking for missing values
Gender Age
0 0
Height Weight
0 0
family_history_with_overweight FAVC
0 0
FCVC NCP
0 0
CAEC SMOKE
0 0
CH2O SCC
0 0
FAF CALC
0 0
MTRANS NObeyesdad
0 0
Converting height to centimeters
obesity$ Height <- obesity$ Height * 100
weight distribution by obesity type
library (ggplot2)
boxplot_weight_obesity <- ggplot (obesity, aes (x = NObeyesdad, y = Weight, fill = NObeyesdad)) +
geom_boxplot () +
labs (title = "Weight Distribution by Obesity Type" ,
x = "Obesity Type" ,
y = "Weight" ) +
theme_minimal () +
theme (plot.title = element_text (face = "bold" , size = 14 ),
axis.text.x = element_text (angle = 45 , hjust = 1 )) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red"
))
print (boxplot_weight_obesity)
Weight&height correlations
library (dplyr)
library (ggplot2)
obesity_cor <- obesity %>% select (c (NObeyesdad, Height, Weight))
custom_colors <- c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red"
)
ggplot (data = obesity_cor, mapping = aes (x = Height, y = Weight, col = NObeyesdad)) +
geom_point () +
geom_smooth (method = lm, color = "black" , se = FALSE , formula = y ~ x) +
scale_color_manual (values = custom_colors) +
labs (
title = 'Correlation of Height and Weight' ,
x = "Height (cm)" ,
y = "Weight (Kg)" ,
col = "Obesity Level"
)
print (cor (obesity_cor$ Height, obesity_cor$ Weight))
According to the data, the correlation between height and weight is weakly positive (0.46)
library (dplyr)
correlations_by_obesity <- obesity %>%
group_by (NObeyesdad) %>%
summarise (
correlation = cor (Height, Weight, use = "complete.obs" )
)
print (correlations_by_obesity)
# A tibble: 7 × 2
NObeyesdad correlation
<fct> <dbl>
1 Insufficient_Weight 0.918
2 Normal_Weight 0.830
3 Obesity_Type_I 0.958
4 Obesity_Type_II 0.908
5 Obesity_Type_III 0.928
6 Overweight_Level_I 0.977
7 Overweight_Level_II 0.960
Correlation coefficients
library (ggplot2)
ggplot (correlations_by_obesity, aes (x = NObeyesdad, y = correlation, fill = NObeyesdad)) +
geom_col () +
labs (title = "Correlation of Height and Weight by Obesity Level" ,
x = "Obesity Level" ,
y = "Correlation" ) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red"
)) +
theme_minimal () +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
Gender comparison
library (glue)
library (ggplot2)
obs_gender <- obesity %>%
select (c (Gender, NObeyesdad)) %>%
group_by (Gender, NObeyesdad) %>%
summarise (total = n (), .groups = "drop" ) %>%
mutate (label = glue ("Total : {total}" ))
plotog <- ggplot (data = obs_gender, aes (x = NObeyesdad, y = total, fill = NObeyesdad, text = label)) +
geom_col (position = "dodge" ) +
facet_wrap (vars (Gender)) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red"
)) +
labs (title = list (text = paste0 ('Obesity Type based on Gender' )),
x = "Obesity Type" ,
y = "Total"
)+
theme_minimal () +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
print (plotog)
other
library (ggplot2)
library (dplyr)
library (tidyr)
height_weight <- obesity %>%
select (c (Gender, Height, Weight)) %>%
pivot_longer (cols = c ("Height" , "Weight" ), names_to = "variable" )
plothw <- ggplot (data = height_weight, mapping = aes (x = Gender, y = value)) +
geom_boxplot (aes (fill = Gender), position = "dodge" ) +
facet_wrap (vars (variable)) +
labs (
title = 'Height and Weight Distribution Based on Gender' ,
x = "Gender" ,
y = "Height (cm) / Weight (Kg)"
) +
scale_fill_manual (values = c ("Female" = "lightpink" , "Male" = "lightblue" )) +
theme_minimal () +
theme (
panel.background = element_blank (),
plot.background = element_blank (),
axis.line = element_line (color = "black" ), #
axis.text.x = element_text (color = "black" ),
axis.text.y = element_text (color = "black" ),
plot.title = element_text (face = "bold" , color = "black" )
)
print (plothw)
The plots show that the median height of females in the sample is clearly lower than that of males, with some males exceeding 1.98 meters (outliers). As for weight, the difference is not as large
gender_obesity_table <- table (obesity$ Gender, obesity$ NObeyesdad)
chisq_test_result <- chisq.test (gender_obesity_table)
print (chisq_test_result)
Pearson's Chi-squared test
data: gender_obesity_table
X-squared = 657.75, df = 6, p-value < 2.2e-16
text
library (rcompanion)
cramerV (gender_obesity_table)
A value of 0.5582 indicates a moderate association between gender and obesity level, but it’s not an absolute determinant.
library (ggplot2)
library (dplyr)
library (glue)
gene_snack <- obesity %>%
select (c (family_history_with_overweight, FAVC, NObeyesdad)) %>%
group_by (family_history_with_overweight, FAVC, NObeyesdad) %>%
summarise (count = n (), .groups = "drop" ) %>%
mutate (label = glue ("Total: {count} \n Obesity Type: {NObeyesdad} \n Family History with Overweight: {family_history_with_overweight} \n Consumption of food between meals: {FAVC}" ))
plotc <- ggplot (gene_snack, mapping = aes (x = FAVC, y = count, fill = NObeyesdad, text = label)) +
geom_col (position = "dodge" ) +
facet_wrap (vars (family_history_with_overweight)) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red" )) +
scale_x_discrete (expand = expansion (mult = c (0.0001 , 0.0001 ))) +
labs (title = "Family History with Overweight" ,
x = "Frequent consumption of high caloric food" ,
y = NULL ,
fill = "Obesity Type" ) +
theme (legend.title = element_blank (),
axis.text.x = element_text (),
plot.title = element_text (face = "bold" ),
panel.background = element_rect (fill = "#ffffff" ),
axis.line.y = element_line (colour = "grey" ),
axis.line.x = element_blank (),
panel.grid = element_blank ())
print (plotc)
descriptive analysis
obesity %>%
group_by (family_history_with_overweight, NObeyesdad) %>%
summarise (count = n ()) %>%
mutate (percentage = (count / sum (count)) * 100 ) %>%
arrange (desc (percentage))
`summarise()` has grouped output by 'family_history_with_overweight'. You can
override using the `.groups` argument.
# A tibble: 13 × 4
# Groups: family_history_with_overweight [2]
family_history_with_overweight NObeyesdad count percentage
<fct> <fct> <int> <dbl>
1 no Insufficient_Weight 146 37.9
2 no Normal_Weight 132 34.3
3 no Overweight_Level_I 81 21.0
4 yes Obesity_Type_I 344 19.9
5 yes Obesity_Type_III 324 18.8
6 yes Obesity_Type_II 296 17.1
7 yes Overweight_Level_II 272 15.8
8 yes Overweight_Level_I 209 12.1
9 yes Normal_Weight 155 8.98
10 yes Insufficient_Weight 126 7.30
11 no Overweight_Level_II 18 4.68
12 no Obesity_Type_I 7 1.82
13 no Obesity_Type_II 1 0.260
Chi-Square Test of Independence
chisq_test <- table (obesity$ family_history_with_overweight, obesity$ NObeyesdad)
chisq.test (chisq_test)
Pearson's Chi-squared test
data: chisq_test
X-squared = 621.98, df = 6, p-value < 2.2e-16
visual
mosaicplot (chisq_test, main = "Association Between Family History and Obesity Levels" ,
col = c ("navyblue" , "green" , "purple" , "maroon1" , "yellow" , "orange" , "red" ),
las = 1 )
Calculating bmi
obesity <- obesity %>%
mutate (Height_m = Height / 100 ,
BMI = Weight / (Height_m^ 2 ))
obesity <- obesity %>%
mutate (BMI_category = case_when (
BMI < 18.5 ~ "Underweight" ,
BMI >= 18.5 & BMI <= 24.9 ~ "Normal weight" ,
BMI >= 25 & BMI <= 29.9 ~ "Overweight" ,
BMI >= 30 ~ "Obesity"
))
text
library (ggplot2)
bmi_colors <- c (
"Underweight" = "navyblue" ,
"Normal" = "green" ,
"Overweight" = "orange" ,
"Obese" = "red"
)
obesity <- obesity %>%
mutate (BMI_category = case_when (
BMI < 18.5 ~ "Underweight" ,
BMI >= 18.5 & BMI < 24.9 ~ "Normal" ,
BMI >= 25 & BMI < 29.9 ~ "Overweight" ,
BMI >= 30 ~ "Obese"
))
ggplot (obesity, aes (x = NObeyesdad, y = BMI, fill = BMI_category)) +
geom_boxplot () +
scale_fill_manual (values = bmi_colors) +
labs (title = "BMI Distribution by Obesity Type (Colored by BMI Category)" ,
x = "Obesity Type" ,
y = "BMI" ,
fill = "BMI Category" ) +
theme_minimal () +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
weight for obesity