Econometrics 1.2 group assignment: obesity determinants analysis

data preparation

library(readxl)
obesity <- read_excel("C:/Users/User/Desktop/Dataset_Obesity.xlsx")

str(obesity)

tibble [2,111 × 16] (S3: tbl_df/tbl/data.frame)
 $ Gender                        : chr [1:2111] "Female" "Female" "Male" "Male" ...
 $ Age                           : num [1:2111] 21 21 23 27 22 29 23 22 24 22 ...
 $ Height                        : num [1:2111] 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
 $ Weight                        : num [1:2111] 64 56 77 87 89.8 53 55 53 64 68 ...
 $ family_history_with_overweight: chr [1:2111] "yes" "yes" "yes" "no" ...
 $ FAVC                          : chr [1:2111] "no" "no" "no" "no" ...
 $ FCVC                          : num [1:2111] 2 3 2 3 2 2 3 2 3 2 ...
 $ NCP                           : num [1:2111] 3 3 3 3 1 3 3 3 3 3 ...
 $ CAEC                          : chr [1:2111] "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
 $ SMOKE                         : chr [1:2111] "no" "yes" "no" "no" ...
 $ CH2O                          : num [1:2111] 2 3 2 2 2 2 2 2 2 2 ...
 $ SCC                           : chr [1:2111] "no" "yes" "no" "no" ...
 $ FAF                           : num [1:2111] 0 3 2 2 0 0 1 3 1 1 ...
 $ CALC                          : chr [1:2111] "no" "Sometimes" "Frequently" "Frequently" ...
 $ MTRANS                        : chr [1:2111] "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
 $ NObeyesdad                    : chr [1:2111] "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...

As we see, there are some categorical data that should be transformed into factors

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

obesity <- obesity %>%
  mutate_if(is.character, as.factor)

str(obesity)

tibble [2,111 × 16] (S3: tbl_df/tbl/data.frame)
 $ Gender                        : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
 $ Age                           : num [1:2111] 21 21 23 27 22 29 23 22 24 22 ...
 $ Height                        : num [1:2111] 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
 $ Weight                        : num [1:2111] 64 56 77 87 89.8 53 55 53 64 68 ...
 $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
 $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
 $ FCVC                          : num [1:2111] 2 3 2 3 2 2 3 2 3 2 ...
 $ NCP                           : num [1:2111] 3 3 3 3 1 3 3 3 3 3 ...
 $ CAEC                          : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
 $ CH2O                          : num [1:2111] 2 3 2 2 2 2 2 2 2 2 ...
 $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
 $ FAF                           : num [1:2111] 0 3 2 2 0 0 1 3 1 1 ...
 $ CALC                          : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
 $ MTRANS                        : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
 $ NObeyesdad                    : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...

Checking for missing values

 colSums(is.na(obesity))

                        Gender                            Age 
                             0                              0 
                        Height                         Weight 
                             0                              0 
family_history_with_overweight                           FAVC 
                             0                              0 
                          FCVC                            NCP 
                             0                              0 
                          CAEC                          SMOKE 
                             0                              0 
                          CH2O                            SCC 
                             0                              0 
                           FAF                           CALC 
                             0                              0 
                        MTRANS                     NObeyesdad 
                             0                              0

Converting height to centimeters

obesity$Height <- obesity$Height * 100

weight distribution by obesity type

library(ggplot2)

boxplot_weight_obesity <- ggplot(obesity, aes(x = NObeyesdad, y = Weight, fill = NObeyesdad)) +
  geom_boxplot() +
  labs(title = "Weight Distribution by Obesity Type",
       x = "Obesity Type",
       y = "Weight") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue",
    "Normal_Weight" = "green",
    "Overweight_Level_I" = "purple",
    "Overweight_Level_II" = "maroon1",
    "Obesity_Type_I" = "yellow",
    "Obesity_Type_II" = "orange",
    "Obesity_Type_III" = "red"
  ))

print(boxplot_weight_obesity)

Weight&height correlations

library(dplyr)
library(ggplot2)

obesity_cor <- obesity %>% select(c(NObeyesdad, Height, Weight))


custom_colors <- c(
  "Insufficient_Weight" = "navyblue", 
  "Normal_Weight" = "green", 
  "Overweight_Level_I" = "purple", 
  "Overweight_Level_II" = "maroon1", 
  "Obesity_Type_I" = "yellow", 
  "Obesity_Type_II" = "orange", 
  "Obesity_Type_III" = "red"
)

ggplot(data = obesity_cor, mapping = aes(x = Height, y = Weight, col = NObeyesdad)) +
  geom_point() +
  geom_smooth(method = lm, color = "black", se = FALSE, formula = y ~ x) +
  scale_color_manual(values = custom_colors) +
  labs(
    title = 'Correlation of Height and Weight',
    x = "Height (cm)",
    y = "Weight (Kg)",
    col = "Obesity Level"
  )

print(cor(obesity_cor$Height, obesity_cor$Weight))

[1] 0.4631361

According to the data, the correlation between height and weight is weakly positive (0.46)

library(dplyr)

correlations_by_obesity <- obesity %>%
  group_by(NObeyesdad) %>%
  summarise(
    correlation = cor(Height, Weight, use = "complete.obs")
  )

print(correlations_by_obesity)

# A tibble: 7 × 2
  NObeyesdad          correlation
  <fct>                     <dbl>
1 Insufficient_Weight       0.918
2 Normal_Weight             0.830
3 Obesity_Type_I            0.958
4 Obesity_Type_II           0.908
5 Obesity_Type_III          0.928
6 Overweight_Level_I        0.977
7 Overweight_Level_II       0.960

Correlation coefficients

library(ggplot2)

ggplot(correlations_by_obesity, aes(x = NObeyesdad, y = correlation, fill = NObeyesdad)) +
  geom_col() +
  labs(title = "Correlation of Height and Weight by Obesity Level",
       x = "Obesity Level",
       y = "Correlation") +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue",
    "Normal_Weight" = "green",
    "Overweight_Level_I" = "purple",
    "Overweight_Level_II" = "maroon1",
    "Obesity_Type_I" = "yellow",
    "Obesity_Type_II" = "orange",
    "Obesity_Type_III" = "red"
  )) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Gender comparison

library(glue)
library(ggplot2)

obs_gender <- obesity %>%
  select(c(Gender, NObeyesdad)) %>%
  group_by(Gender, NObeyesdad) %>%
  summarise(total = n(), .groups = "drop") %>%
  mutate(label = glue("Total : {total}"))

plotog <- ggplot(data = obs_gender, aes(x = NObeyesdad, y = total, fill = NObeyesdad, text = label)) +
  geom_col(position = "dodge") +
  facet_wrap(vars(Gender)) +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue",
    "Normal_Weight" = "green",
    "Overweight_Level_I" = "purple",
    "Overweight_Level_II" = "maroon1",
    "Obesity_Type_I" = "yellow",
    "Obesity_Type_II" = "orange",
    "Obesity_Type_III" = "red"
  )) +
  labs(title = list(text = paste0('Obesity Type based on Gender')),
       x = "Obesity Type",
       y = "Total"
  )+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(plotog)

other

library(ggplot2)
library(dplyr)
library(tidyr)

height_weight <- obesity %>%
  select(c(Gender, Height, Weight)) %>%
  pivot_longer(cols = c("Height", "Weight"), names_to = "variable")

plothw <- ggplot(data = height_weight, mapping = aes(x = Gender, y = value)) +
  geom_boxplot(aes(fill = Gender), position = "dodge") +
  facet_wrap(vars(variable)) +
  labs(
    title = 'Height and Weight Distribution Based on Gender',
    x = "Gender",
    y = "Height (cm) / Weight (Kg)"
  ) +
  scale_fill_manual(values = c("Female" = "maroon1", "Male" = "navyblue")) + 
  theme_minimal() + 
  theme(
    panel.background = element_blank(),  
    plot.background = element_blank(),   
    axis.line = element_line(color = "black"), #
    axis.text.x = element_text(color = "black"),
    axis.text.y = element_text(color = "black"),
    plot.title = element_text(face = "bold", color = "black")
  )

print(plothw)

The plots show that the median height of females in the sample is clearly lower than that of males, as for weight, the difference is not as large

gender_obesity_table <- table(obesity$Gender, obesity$NObeyesdad)

chisq_test_result <- chisq.test(gender_obesity_table)

print(chisq_test_result)


    Pearson's Chi-squared test

data:  gender_obesity_table
X-squared = 657.75, df = 6, p-value < 2.2e-16

text

library(rcompanion)
cramerV(gender_obesity_table)

Cramer V 
  0.5582

A value of 0.5582 indicates a moderate association between gender and obesity level, but it’s not an absolute determinant.

library(ggplot2)
library(dplyr)
library(glue)


gene_snack <- obesity %>%
  select(c(family_history_with_overweight, FAVC, NObeyesdad)) %>%
  group_by(family_history_with_overweight, FAVC, NObeyesdad) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(label = glue("Total: {count}\nObesity Type: {NObeyesdad}\nFamily History with Overweight: {family_history_with_overweight}\nConsumption of food between meals: {FAVC}"))


plotc <- ggplot(gene_snack, mapping = aes(x = FAVC, y = count, fill = NObeyesdad, text = label)) +
  geom_col(position = "dodge") +
  facet_wrap(vars(family_history_with_overweight)) +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue", 
    "Normal_Weight" = "green", 
    "Overweight_Level_I" = "purple", 
    "Overweight_Level_II" = "maroon1", 
    "Obesity_Type_I" = "yellow", 
    "Obesity_Type_II" = "orange", 
    "Obesity_Type_III" = "red")) +
  scale_x_discrete(expand = expansion(mult = c(0.0001, 0.0001))) +
  labs(title = "Family History with Overweight",
       x = "Frequent consumption of high caloric food",
       y = NULL,
       fill = "Obesity Type") +
  theme(legend.title = element_blank(),
        axis.text.x = element_text(),
        plot.title = element_text(face = "bold"),
        panel.background = element_rect(fill = "#ffffff"),
        axis.line.y = element_line(colour = "grey"),
        axis.line.x = element_blank(),
        panel.grid = element_blank())


print(plotc)

descriptive analysis

obesity %>% 
  group_by(family_history_with_overweight, NObeyesdad) %>% 
  summarise(count = n()) %>% 
  mutate(percentage = (count / sum(count)) * 100) %>% 
  arrange(desc(percentage))

`summarise()` has grouped output by 'family_history_with_overweight'. You can
override using the `.groups` argument.

# A tibble: 13 × 4
# Groups:   family_history_with_overweight [2]
   family_history_with_overweight NObeyesdad          count percentage
   <fct>                          <fct>               <int>      <dbl>
 1 no                             Insufficient_Weight   146     37.9  
 2 no                             Normal_Weight         132     34.3  
 3 no                             Overweight_Level_I     81     21.0  
 4 yes                            Obesity_Type_I        344     19.9  
 5 yes                            Obesity_Type_III      324     18.8  
 6 yes                            Obesity_Type_II       296     17.1  
 7 yes                            Overweight_Level_II   272     15.8  
 8 yes                            Overweight_Level_I    209     12.1  
 9 yes                            Normal_Weight         155      8.98 
10 yes                            Insufficient_Weight   126      7.30 
11 no                             Overweight_Level_II    18      4.68 
12 no                             Obesity_Type_I          7      1.82 
13 no                             Obesity_Type_II         1      0.260

Chi-Square Test of Independence

chisq_test <- table(obesity$family_history_with_overweight, obesity$NObeyesdad)
chisq.test(chisq_test)


    Pearson's Chi-squared test

data:  chisq_test
X-squared = 621.98, df = 6, p-value < 2.2e-16

visual

mosaicplot(chisq_test, main = "Association Between Family History and Obesity Levels",
           col = c("navyblue", "green", "purple", "maroon1", "yellow", "orange", "red"),
           las = 1)

library(ggplot2)
ggplot(obesity, aes(x = FAVC, fill = NObeyesdad)) +
  geom_bar(position = "fill") +
  labs(title = "Obesity Type by High-Calorie Food Consumption",
       x = "High-Calorie Food Consumption (FAVC)",
       y = "Proportion",
       fill = "Obesity Type") +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue", 
    "Normal_Weight" = "green", 
    "Overweight_Level_I" = "purple", 
    "Overweight_Level_II" = "maroon1", 
    "Obesity_Type_I" = "yellow", 
    "Obesity_Type_II" = "orange", 
    "Obesity_Type_III" = "red"
  )) +
  theme_minimal()

chisq_test <- table(obesity$FAVC, obesity$NObeyesdad)


chisq_result <- chisq.test(chisq_test)
chisq_result


    Pearson's Chi-squared test

data:  chisq_test
X-squared = 233.34, df = 6, p-value < 2.2e-16

chisq_test <- table(obesity$FCVC, obesity$NObeyesdad)


chisq_result <- chisq.test(chisq_test)

Warning in chisq.test(chisq_test): Chi-squared approximation may be incorrect

chisq_result


    Pearson's Chi-squared test

data:  chisq_test
X-squared = 5945, df = 4854, p-value < 2.2e-16

library(ggplot2)
library(dplyr)
library(glue)


gene_snack <- obesity %>%
  select(c(family_history_with_overweight, CAEC, NObeyesdad)) %>%
  group_by(family_history_with_overweight, CAEC, NObeyesdad) %>%
  summarise(count = n(), .groups = "drop") %>%
  mutate(label = glue("Total: {count}\nObesity Type: {NObeyesdad}\nFamily History with Overweight: {family_history_with_overweight}\nConsumption of food between meals: {CAEC}"))


plotc <- ggplot(gene_snack, mapping = aes(x = CAEC, y = count, fill = NObeyesdad, text = label)) +
  geom_col(position = "dodge") +
  facet_wrap(vars(family_history_with_overweight)) +
  scale_fill_manual(values = c(
    "Insufficient_Weight" = "navyblue", 
    "Normal_Weight" = "green", 
    "Overweight_Level_I" = "purple", 
    "Overweight_Level_II" = "maroon1", 
    "Obesity_Type_I" = "yellow", 
    "Obesity_Type_II" = "orange", 
    "Obesity_Type_III" = "red")) +
  scale_x_discrete(expand = expansion(mult = c(0.0001, 0.0001))) +
  labs(title = "Family History with Overweight",
       x = "Consumption of food between meals",
       y = NULL,
       fill = "Obesity Type") +
  theme(legend.title = element_blank(),
        axis.text.x = element_text(),
        plot.title = element_text(face = "bold"),
        panel.background = element_rect(fill = "#ffffff"),
        axis.line.y = element_line(colour = "grey"),
        axis.line.x = element_blank(),
        panel.grid = element_blank())

# Print the plot
print(plotc)

library(ggplot2)
library(dplyr)

# Create the plot
plot_gender_obesity_age <- ggplot(obesity, aes(x = Age, y = NObeyesdad, color = Gender)) +
  geom_jitter(width = 0.3, alpha = 0.6) +  # Adds jitter for better visualization and reduces overlap
  scale_color_manual(values = c("Male" = "navyblue", "Female" = "maroon1")) +  # Custom colors for gender
  labs(
    title = "Obesity Type by Age and Gender",
    x = "Age",
    y = "Obesity Type",
    color = "Gender"
  ) +
  theme_minimal() +  # Clean background theme
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank()
  )

# Print the plot
print(plot_gender_obesity_age)