data preparation
library (readxl)
obesity <- read_excel ("C:/Users/User/Desktop/Dataset_Obesity.xlsx" )
str (obesity)
tibble [2,111 × 16] (S3: tbl_df/tbl/data.frame)
$ Gender : chr [1:2111] "Female" "Female" "Male" "Male" ...
$ Age : num [1:2111] 21 21 23 27 22 29 23 22 24 22 ...
$ Height : num [1:2111] 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
$ Weight : num [1:2111] 64 56 77 87 89.8 53 55 53 64 68 ...
$ family_history_with_overweight: chr [1:2111] "yes" "yes" "yes" "no" ...
$ FAVC : chr [1:2111] "no" "no" "no" "no" ...
$ FCVC : num [1:2111] 2 3 2 3 2 2 3 2 3 2 ...
$ NCP : num [1:2111] 3 3 3 3 1 3 3 3 3 3 ...
$ CAEC : chr [1:2111] "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
$ SMOKE : chr [1:2111] "no" "yes" "no" "no" ...
$ CH2O : num [1:2111] 2 3 2 2 2 2 2 2 2 2 ...
$ SCC : chr [1:2111] "no" "yes" "no" "no" ...
$ FAF : num [1:2111] 0 3 2 2 0 0 1 3 1 1 ...
$ CALC : chr [1:2111] "no" "Sometimes" "Frequently" "Frequently" ...
$ MTRANS : chr [1:2111] "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
$ NObeyesdad : chr [1:2111] "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
As we see, there are some categorical data that should be transformed into factors
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
obesity <- obesity %>%
mutate_if (is.character, as.factor)
str (obesity)
tibble [2,111 × 16] (S3: tbl_df/tbl/data.frame)
$ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
$ Age : num [1:2111] 21 21 23 27 22 29 23 22 24 22 ...
$ Height : num [1:2111] 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
$ Weight : num [1:2111] 64 56 77 87 89.8 53 55 53 64 68 ...
$ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
$ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
$ FCVC : num [1:2111] 2 3 2 3 2 2 3 2 3 2 ...
$ NCP : num [1:2111] 3 3 3 3 1 3 3 3 3 3 ...
$ CAEC : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
$ SMOKE : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
$ CH2O : num [1:2111] 2 3 2 2 2 2 2 2 2 2 ...
$ SCC : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
$ FAF : num [1:2111] 0 3 2 2 0 0 1 3 1 1 ...
$ CALC : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
$ MTRANS : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
$ NObeyesdad : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...
Checking for missing values
Gender Age
0 0
Height Weight
0 0
family_history_with_overweight FAVC
0 0
FCVC NCP
0 0
CAEC SMOKE
0 0
CH2O SCC
0 0
FAF CALC
0 0
MTRANS NObeyesdad
0 0
Converting height to centimeters
obesity$ Height <- obesity$ Height * 100
weight distribution by obesity type
library (ggplot2)
boxplot_weight_obesity <- ggplot (obesity, aes (x = NObeyesdad, y = Weight, fill = NObeyesdad)) +
geom_boxplot () +
labs (title = "Weight Distribution by Obesity Type" ,
x = "Obesity Type" ,
y = "Weight" ) +
theme_minimal () +
theme (plot.title = element_text (face = "bold" , size = 14 ),
axis.text.x = element_text (angle = 45 , hjust = 1 )) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red"
))
print (boxplot_weight_obesity)
Weight&height correlations
library (dplyr)
library (ggplot2)
obesity_cor <- obesity %>% select (c (NObeyesdad, Height, Weight))
custom_colors <- c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red"
)
ggplot (data = obesity_cor, mapping = aes (x = Height, y = Weight, col = NObeyesdad)) +
geom_point () +
geom_smooth (method = lm, color = "black" , se = FALSE , formula = y ~ x) +
scale_color_manual (values = custom_colors) +
labs (
title = 'Correlation of Height and Weight' ,
x = "Height (cm)" ,
y = "Weight (Kg)" ,
col = "Obesity Level"
)
print (cor (obesity_cor$ Height, obesity_cor$ Weight))
According to the data, the correlation between height and weight is weakly positive (0.46)
library (dplyr)
correlations_by_obesity <- obesity %>%
group_by (NObeyesdad) %>%
summarise (
correlation = cor (Height, Weight, use = "complete.obs" )
)
print (correlations_by_obesity)
# A tibble: 7 × 2
NObeyesdad correlation
<fct> <dbl>
1 Insufficient_Weight 0.918
2 Normal_Weight 0.830
3 Obesity_Type_I 0.958
4 Obesity_Type_II 0.908
5 Obesity_Type_III 0.928
6 Overweight_Level_I 0.977
7 Overweight_Level_II 0.960
Correlation coefficients
library (ggplot2)
ggplot (correlations_by_obesity, aes (x = NObeyesdad, y = correlation, fill = NObeyesdad)) +
geom_col () +
labs (title = "Correlation of Height and Weight by Obesity Level" ,
x = "Obesity Level" ,
y = "Correlation" ) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red"
)) +
theme_minimal () +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
Gender comparison
library (glue)
library (ggplot2)
obs_gender <- obesity %>%
select (c (Gender, NObeyesdad)) %>%
group_by (Gender, NObeyesdad) %>%
summarise (total = n (), .groups = "drop" ) %>%
mutate (label = glue ("Total : {total}" ))
plotog <- ggplot (data = obs_gender, aes (x = NObeyesdad, y = total, fill = NObeyesdad, text = label)) +
geom_col (position = "dodge" ) +
facet_wrap (vars (Gender)) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red"
)) +
labs (title = list (text = paste0 ('Obesity Type based on Gender' )),
x = "Obesity Type" ,
y = "Total"
)+
theme_minimal () +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
print (plotog)
other
library (ggplot2)
library (dplyr)
library (tidyr)
height_weight <- obesity %>%
select (c (Gender, Height, Weight)) %>%
pivot_longer (cols = c ("Height" , "Weight" ), names_to = "variable" )
plothw <- ggplot (data = height_weight, mapping = aes (x = Gender, y = value)) +
geom_boxplot (aes (fill = Gender), position = "dodge" ) +
facet_wrap (vars (variable)) +
labs (
title = 'Height and Weight Distribution Based on Gender' ,
x = "Gender" ,
y = "Height (cm) / Weight (Kg)"
) +
scale_fill_manual (values = c ("Female" = "maroon1" , "Male" = "navyblue" )) +
theme_minimal () +
theme (
panel.background = element_blank (),
plot.background = element_blank (),
axis.line = element_line (color = "black" ), #
axis.text.x = element_text (color = "black" ),
axis.text.y = element_text (color = "black" ),
plot.title = element_text (face = "bold" , color = "black" )
)
print (plothw)
The plots show that the median height of females in the sample is clearly lower than that of males, as for weight, the difference is not as large
gender_obesity_table <- table (obesity$ Gender, obesity$ NObeyesdad)
chisq_test_result <- chisq.test (gender_obesity_table)
print (chisq_test_result)
Pearson's Chi-squared test
data: gender_obesity_table
X-squared = 657.75, df = 6, p-value < 2.2e-16
text
library (rcompanion)
cramerV (gender_obesity_table)
A value of 0.5582 indicates a moderate association between gender and obesity level, but it’s not an absolute determinant.
library (ggplot2)
library (dplyr)
library (glue)
gene_snack <- obesity %>%
select (c (family_history_with_overweight, FAVC, NObeyesdad)) %>%
group_by (family_history_with_overweight, FAVC, NObeyesdad) %>%
summarise (count = n (), .groups = "drop" ) %>%
mutate (label = glue ("Total: {count} \n Obesity Type: {NObeyesdad} \n Family History with Overweight: {family_history_with_overweight} \n Consumption of food between meals: {FAVC}" ))
plotc <- ggplot (gene_snack, mapping = aes (x = FAVC, y = count, fill = NObeyesdad, text = label)) +
geom_col (position = "dodge" ) +
facet_wrap (vars (family_history_with_overweight)) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red" )) +
scale_x_discrete (expand = expansion (mult = c (0.0001 , 0.0001 ))) +
labs (title = "Family History with Overweight" ,
x = "Frequent consumption of high caloric food" ,
y = NULL ,
fill = "Obesity Type" ) +
theme (legend.title = element_blank (),
axis.text.x = element_text (),
plot.title = element_text (face = "bold" ),
panel.background = element_rect (fill = "#ffffff" ),
axis.line.y = element_line (colour = "grey" ),
axis.line.x = element_blank (),
panel.grid = element_blank ())
print (plotc)
descriptive analysis
obesity %>%
group_by (family_history_with_overweight, NObeyesdad) %>%
summarise (count = n ()) %>%
mutate (percentage = (count / sum (count)) * 100 ) %>%
arrange (desc (percentage))
`summarise()` has grouped output by 'family_history_with_overweight'. You can
override using the `.groups` argument.
# A tibble: 13 × 4
# Groups: family_history_with_overweight [2]
family_history_with_overweight NObeyesdad count percentage
<fct> <fct> <int> <dbl>
1 no Insufficient_Weight 146 37.9
2 no Normal_Weight 132 34.3
3 no Overweight_Level_I 81 21.0
4 yes Obesity_Type_I 344 19.9
5 yes Obesity_Type_III 324 18.8
6 yes Obesity_Type_II 296 17.1
7 yes Overweight_Level_II 272 15.8
8 yes Overweight_Level_I 209 12.1
9 yes Normal_Weight 155 8.98
10 yes Insufficient_Weight 126 7.30
11 no Overweight_Level_II 18 4.68
12 no Obesity_Type_I 7 1.82
13 no Obesity_Type_II 1 0.260
Chi-Square Test of Independence
chisq_test <- table (obesity$ family_history_with_overweight, obesity$ NObeyesdad)
chisq.test (chisq_test)
Pearson's Chi-squared test
data: chisq_test
X-squared = 621.98, df = 6, p-value < 2.2e-16
visual
mosaicplot (chisq_test, main = "Association Between Family History and Obesity Levels" ,
col = c ("navyblue" , "green" , "purple" , "maroon1" , "yellow" , "orange" , "red" ),
las = 1 )
library (ggplot2)
ggplot (obesity, aes (x = FAVC, fill = NObeyesdad)) +
geom_bar (position = "fill" ) +
labs (title = "Obesity Type by High-Calorie Food Consumption" ,
x = "High-Calorie Food Consumption (FAVC)" ,
y = "Proportion" ,
fill = "Obesity Type" ) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red"
)) +
theme_minimal ()
chisq_test <- table (obesity$ FAVC, obesity$ NObeyesdad)
chisq_result <- chisq.test (chisq_test)
chisq_result
Pearson's Chi-squared test
data: chisq_test
X-squared = 233.34, df = 6, p-value < 2.2e-16
chisq_test <- table (obesity$ FCVC, obesity$ NObeyesdad)
chisq_result <- chisq.test (chisq_test)
Warning in chisq.test(chisq_test): Chi-squared approximation may be incorrect
Pearson's Chi-squared test
data: chisq_test
X-squared = 5945, df = 4854, p-value < 2.2e-16
library (ggplot2)
library (dplyr)
library (glue)
gene_snack <- obesity %>%
select (c (family_history_with_overweight, CAEC, NObeyesdad)) %>%
group_by (family_history_with_overweight, CAEC, NObeyesdad) %>%
summarise (count = n (), .groups = "drop" ) %>%
mutate (label = glue ("Total: {count} \n Obesity Type: {NObeyesdad} \n Family History with Overweight: {family_history_with_overweight} \n Consumption of food between meals: {CAEC}" ))
plotc <- ggplot (gene_snack, mapping = aes (x = CAEC, y = count, fill = NObeyesdad, text = label)) +
geom_col (position = "dodge" ) +
facet_wrap (vars (family_history_with_overweight)) +
scale_fill_manual (values = c (
"Insufficient_Weight" = "navyblue" ,
"Normal_Weight" = "green" ,
"Overweight_Level_I" = "purple" ,
"Overweight_Level_II" = "maroon1" ,
"Obesity_Type_I" = "yellow" ,
"Obesity_Type_II" = "orange" ,
"Obesity_Type_III" = "red" )) +
scale_x_discrete (expand = expansion (mult = c (0.0001 , 0.0001 ))) +
labs (title = "Family History with Overweight" ,
x = "Consumption of food between meals" ,
y = NULL ,
fill = "Obesity Type" ) +
theme (legend.title = element_blank (),
axis.text.x = element_text (),
plot.title = element_text (face = "bold" ),
panel.background = element_rect (fill = "#ffffff" ),
axis.line.y = element_line (colour = "grey" ),
axis.line.x = element_blank (),
panel.grid = element_blank ())
# Print the plot
print (plotc)
library (ggplot2)
library (dplyr)
# Create the plot
plot_gender_obesity_age <- ggplot (obesity, aes (x = Age, y = NObeyesdad, color = Gender)) +
geom_jitter (width = 0.3 , alpha = 0.6 ) + # Adds jitter for better visualization and reduces overlap
scale_color_manual (values = c ("Male" = "navyblue" , "Female" = "maroon1" )) + # Custom colors for gender
labs (
title = "Obesity Type by Age and Gender" ,
x = "Age" ,
y = "Obesity Type" ,
color = "Gender"
) +
theme_minimal () + # Clean background theme
theme (
plot.title = element_text (face = "bold" , size = 14 ),
axis.text.x = element_text (angle = 45 , hjust = 1 ),
panel.grid.minor = element_blank ()
)
# Print the plot
print (plot_gender_obesity_age)