Econometrics 1.2 group assignment: obesity determinants analysis

data preparation

library(readxl)
obesity <- read_excel("C:/Users/User/Desktop/Dataset_Obesity.xlsx")

str(obesity)
tibble [2,111 × 16] (S3: tbl_df/tbl/data.frame)
 $ Gender                        : chr [1:2111] "Female" "Female" "Male" "Male" ...
 $ Age                           : num [1:2111] 21 21 23 27 22 29 23 22 24 22 ...
 $ Height                        : num [1:2111] 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
 $ Weight                        : num [1:2111] 64 56 77 87 89.8 53 55 53 64 68 ...
 $ family_history_with_overweight: chr [1:2111] "yes" "yes" "yes" "no" ...
 $ FAVC                          : chr [1:2111] "no" "no" "no" "no" ...
 $ FCVC                          : num [1:2111] 2 3 2 3 2 2 3 2 3 2 ...
 $ NCP                           : num [1:2111] 3 3 3 3 1 3 3 3 3 3 ...
 $ CAEC                          : chr [1:2111] "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
 $ SMOKE                         : chr [1:2111] "no" "yes" "no" "no" ...
 $ CH2O                          : num [1:2111] 2 3 2 2 2 2 2 2 2 2 ...
 $ SCC                           : chr [1:2111] "no" "yes" "no" "no" ...
 $ FAF                           : num [1:2111] 0 3 2 2 0 0 1 3 1 1 ...
 $ CALC                          : chr [1:2111] "no" "Sometimes" "Frequently" "Frequently" ...
 $ MTRANS                        : chr [1:2111] "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
 $ NObeyesdad                    : chr [1:2111] "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...

As we see, there are some categorical data that should be transformed into factors

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
obesity <- obesity %>%
  mutate_if(is.character, as.factor)

str(obesity)
tibble [2,111 × 16] (S3: tbl_df/tbl/data.frame)
 $ Gender                        : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
 $ Age                           : num [1:2111] 21 21 23 27 22 29 23 22 24 22 ...
 $ Height                        : num [1:2111] 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
 $ Weight                        : num [1:2111] 64 56 77 87 89.8 53 55 53 64 68 ...
 $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
 $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
 $ FCVC                          : num [1:2111] 2 3 2 3 2 2 3 2 3 2 ...
 $ NCP                           : num [1:2111] 3 3 3 3 1 3 3 3 3 3 ...
 $ CAEC                          : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
 $ CH2O                          : num [1:2111] 2 3 2 2 2 2 2 2 2 2 ...
 $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
 $ FAF                           : num [1:2111] 0 3 2 2 0 0 1 3 1 1 ...
 $ CALC                          : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
 $ MTRANS                        : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
 $ NObeyesdad                    : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...

Checking for missing values

 colSums(is.na(obesity))
                        Gender                            Age 
                             0                              0 
                        Height                         Weight 
                             0                              0 
family_history_with_overweight                           FAVC 
                             0                              0 
                          FCVC                            NCP 
                             0                              0 
                          CAEC                          SMOKE 
                             0                              0 
                          CH2O                            SCC 
                             0                              0 
                           FAF                           CALC 
                             0                              0 
                        MTRANS                     NObeyesdad 
                             0                              0 

Converting height to centimeters

obesity$Height <- obesity$Height * 100

weight distribution by obesity type

library(ggplot2)

boxplot_weight_obesity <- ggplot(obesity, aes(x = NObeyesdad, y = Weight, fill = NObeyesdad)) +
  geom_boxplot() +
  labs(title = "Weight Distribution by Obesity Type",
       x = "Obesity Type",
       y = "Weight") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue",
    "Normal_Weight" = "green",
    "Overweight_Level_I" = "purple",
    "Overweight_Level_II" = "maroon1",
    "Obesity_Type_I" = "yellow",
    "Obesity_Type_II" = "orange",
    "Obesity_Type_III" = "red"
  ))

print(boxplot_weight_obesity)

Weight&height correlations

library(dplyr)
library(ggplot2)

obesity_cor <- obesity %>% select(c(NObeyesdad, Height, Weight))


custom_colors <- c(
  "Insufficient_Weight" = "navyblue", 
  "Normal_Weight" = "green", 
  "Overweight_Level_I" = "purple", 
  "Overweight_Level_II" = "maroon1", 
  "Obesity_Type_I" = "yellow", 
  "Obesity_Type_II" = "orange", 
  "Obesity_Type_III" = "red"
)

ggplot(data = obesity_cor, mapping = aes(x = Height, y = Weight, col = NObeyesdad)) +
  geom_point() +
  geom_smooth(method = lm, color = "black", se = FALSE, formula = y ~ x) +
  scale_color_manual(values = custom_colors) +
  labs(
    title = 'Correlation of Height and Weight',
    x = "Height (cm)",
    y = "Weight (Kg)",
    col = "Obesity Level"
  )

print(cor(obesity_cor$Height, obesity_cor$Weight))
[1] 0.4631361

According to the data, the correlation between height and weight is weakly positive (0.46)

library(dplyr)

correlations_by_obesity <- obesity %>%
  group_by(NObeyesdad) %>%
  summarise(
    correlation = cor(Height, Weight, use = "complete.obs")
  )

print(correlations_by_obesity)
# A tibble: 7 × 2
  NObeyesdad          correlation
  <fct>                     <dbl>
1 Insufficient_Weight       0.918
2 Normal_Weight             0.830
3 Obesity_Type_I            0.958
4 Obesity_Type_II           0.908
5 Obesity_Type_III          0.928
6 Overweight_Level_I        0.977
7 Overweight_Level_II       0.960

Correlation coefficients

library(ggplot2)

ggplot(correlations_by_obesity, aes(x = NObeyesdad, y = correlation, fill = NObeyesdad)) +
  geom_col() +
  labs(title = "Correlation of Height and Weight by Obesity Level",
       x = "Obesity Level",
       y = "Correlation") +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue",
    "Normal_Weight" = "green",
    "Overweight_Level_I" = "purple",
    "Overweight_Level_II" = "maroon1",
    "Obesity_Type_I" = "yellow",
    "Obesity_Type_II" = "orange",
    "Obesity_Type_III" = "red"
  )) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Gender comparison

library(glue)
library(ggplot2)

obs_gender <- obesity %>%
  select(c(Gender, NObeyesdad)) %>%
  group_by(Gender, NObeyesdad) %>%
  summarise(total = n(), .groups = "drop") %>%
  mutate(label = glue("Total : {total}"))

plotog <- ggplot(data = obs_gender, aes(x = NObeyesdad, y = total, fill = NObeyesdad, text = label)) +
  geom_col(position = "dodge") +
  facet_wrap(vars(Gender)) +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue",
    "Normal_Weight" = "green",
    "Overweight_Level_I" = "purple",
    "Overweight_Level_II" = "maroon1",
    "Obesity_Type_I" = "yellow",
    "Obesity_Type_II" = "orange",
    "Obesity_Type_III" = "red"
  )) +
  labs(title = list(text = paste0('Obesity Type based on Gender')),
       x = "Obesity Type",
       y = "Total"
  )+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(plotog)

other

library(ggplot2)
library(dplyr)
library(tidyr)

height_weight <- obesity %>%
  select(c(Gender, Height, Weight)) %>%
  pivot_longer(cols = c("Height", "Weight"), names_to = "variable")

plothw <- ggplot(data = height_weight, mapping = aes(x = Gender, y = value)) +
  geom_boxplot(aes(fill = Gender), position = "dodge") +
  facet_wrap(vars(variable)) +
  labs(
    title = 'Height and Weight Distribution Based on Gender',
    x = "Gender",
    y = "Height (cm) / Weight (Kg)"
  ) +
  scale_fill_manual(values = c("Female" = "lightpink", "Male" = "lightblue")) + 
  theme_minimal() + 
  theme(
    panel.background = element_blank(),  
    plot.background = element_blank(),   
    axis.line = element_line(color = "black"), #
    axis.text.x = element_text(color = "black"),
    axis.text.y = element_text(color = "black"),
    plot.title = element_text(face = "bold", color = "black")
  )

print(plothw)

The plots show that the median height of females in the sample is clearly lower than that of males, with some males exceeding 1.98 meters (outliers). As for weight, the difference is not as large

gender_obesity_table <- table(obesity$Gender, obesity$NObeyesdad)

chisq_test_result <- chisq.test(gender_obesity_table)

print(chisq_test_result)

    Pearson's Chi-squared test

data:  gender_obesity_table
X-squared = 657.75, df = 6, p-value < 2.2e-16

text

library(rcompanion)
cramerV(gender_obesity_table)
Cramer V 
  0.5582 

A value of 0.5582 indicates a moderate association between gender and obesity level, but it’s not an absolute determinant.

library(ggplot2)
library(dplyr)
library(glue)


gene_snack <- obesity %>%
  select(c(family_history_with_overweight, FAVC, NObeyesdad)) %>%
  group_by(family_history_with_overweight, FAVC, NObeyesdad) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(label = glue("Total: {count}\nObesity Type: {NObeyesdad}\nFamily History with Overweight: {family_history_with_overweight}\nConsumption of food between meals: {FAVC}"))


plotc <- ggplot(gene_snack, mapping = aes(x = FAVC, y = count, fill = NObeyesdad, text = label)) +
  geom_col(position = "dodge") +
  facet_wrap(vars(family_history_with_overweight)) +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue", 
    "Normal_Weight" = "green", 
    "Overweight_Level_I" = "purple", 
    "Overweight_Level_II" = "maroon1", 
    "Obesity_Type_I" = "yellow", 
    "Obesity_Type_II" = "orange", 
    "Obesity_Type_III" = "red")) +
  scale_x_discrete(expand = expansion(mult = c(0.0001, 0.0001))) +
  labs(title = "Family History with Overweight",
       x = "Frequent consumption of high caloric food",
       y = NULL,
       fill = "Obesity Type") +
  theme(legend.title = element_blank(),
        axis.text.x = element_text(),
        plot.title = element_text(face = "bold"),
        panel.background = element_rect(fill = "#ffffff"),
        axis.line.y = element_line(colour = "grey"),
        axis.line.x = element_blank(),
        panel.grid = element_blank())


print(plotc)

descriptive analysis

obesity %>% 
  group_by(family_history_with_overweight, NObeyesdad) %>% 
  summarise(count = n()) %>% 
  mutate(percentage = (count / sum(count)) * 100) %>% 
  arrange(desc(percentage))
`summarise()` has grouped output by 'family_history_with_overweight'. You can
override using the `.groups` argument.
# A tibble: 13 × 4
# Groups:   family_history_with_overweight [2]
   family_history_with_overweight NObeyesdad          count percentage
   <fct>                          <fct>               <int>      <dbl>
 1 no                             Insufficient_Weight   146     37.9  
 2 no                             Normal_Weight         132     34.3  
 3 no                             Overweight_Level_I     81     21.0  
 4 yes                            Obesity_Type_I        344     19.9  
 5 yes                            Obesity_Type_III      324     18.8  
 6 yes                            Obesity_Type_II       296     17.1  
 7 yes                            Overweight_Level_II   272     15.8  
 8 yes                            Overweight_Level_I    209     12.1  
 9 yes                            Normal_Weight         155      8.98 
10 yes                            Insufficient_Weight   126      7.30 
11 no                             Overweight_Level_II    18      4.68 
12 no                             Obesity_Type_I          7      1.82 
13 no                             Obesity_Type_II         1      0.260

Chi-Square Test of Independence

chisq_test <- table(obesity$family_history_with_overweight, obesity$NObeyesdad)
chisq.test(chisq_test)

    Pearson's Chi-squared test

data:  chisq_test
X-squared = 621.98, df = 6, p-value < 2.2e-16

visual

mosaicplot(chisq_test, main = "Association Between Family History and Obesity Levels",
           col = c("navyblue", "green", "purple", "maroon1", "yellow", "orange", "red"),
           las = 1)

Calculating bmi

obesity <- obesity %>%
  mutate(Height_m = Height / 100,
         BMI = Weight / (Height_m^2))  



obesity <- obesity %>%
  mutate(BMI_category = case_when(
    BMI < 18.5 ~ "Underweight",
    BMI >= 18.5 & BMI <= 24.9 ~ "Normal weight",
    BMI >= 25 & BMI <= 29.9 ~ "Overweight",
    BMI >= 30 ~ "Obesity"
  ))

text

library(ggplot2)


bmi_colors <- c(
  "Underweight" = "navyblue",
  "Normal" = "green",
  "Overweight" = "orange",
  "Obese" = "red"
)


obesity <- obesity %>%
  mutate(BMI_category = case_when(
    BMI < 18.5 ~ "Underweight",
    BMI >= 18.5 & BMI < 24.9 ~ "Normal",
    BMI >= 25 & BMI < 29.9 ~ "Overweight",
    BMI >= 30 ~ "Obese"
  ))

ggplot(obesity, aes(x = NObeyesdad, y = BMI, fill = BMI_category)) +
  geom_boxplot() +
  scale_fill_manual(values = bmi_colors) +
  labs(title = "BMI Distribution by Obesity Type (Colored by BMI Category)",
       x = "Obesity Type",
       y = "BMI",
       fill = "BMI Category") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

weight for obesity