Exploring BMI Influences: Gender, Genetics, Food Habits and transportation choices

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(flextable)
library(boot)

df <- read.csv("~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv", header=TRUE)
df['BMI'] <- df['Weight']/df['Height']**2

EDA

df |> ggplot(mapping = aes(x = factor(NObeyesdad, levels = c('Insufficient_Weight','Normal_Weight', 'Overweight_Level_I','Overweight_Level_II','Obesity_Type_I','Obesity_Type_II','Obesity_Type_III' )), fill = NObeyesdad)) + geom_bar() + theme_classic() + 
  labs(title = 'Each Category counts', x = 'Obesity Levels', y = 'Count') + theme(axis.text.x = element_text(angle = 45, hjust = 1))

df |>  ggplot(mapping = aes(x = family_history_with_overweight, fill = (BMI >30))) + geom_bar( width = 0.3) + theme_classic() + labs(title = 'Individuals with Family history of overweight')

df |> 
  select(family_history_with_overweight, BMI, Gender) |> 
  mutate(is_obese = (BMI>30)) |> 
  group_by(Gender,family_history_with_overweight,is_obese) |>
  summarise(count = n()) -> gen_genetic_obese_count

## `summarise()` has grouped output by 'Gender', 'family_history_with_overweight'.
## You can override using the `.groups` argument.

gen_genetic_obese_count

## # A tibble: 8 × 4
## # Groups:   Gender, family_history_with_overweight [4]
##   Gender family_history_with_overweight is_obese count
##   <chr>  <chr>                          <lgl>    <int>
## 1 Female no                             FALSE      230
## 2 Female no                             TRUE         2
## 3 Female yes                            FALSE      331
## 4 Female yes                            TRUE       480
## 5 Male   no                             FALSE      147
## 6 Male   no                             TRUE         6
## 7 Male   yes                            FALSE      429
## 8 Male   yes                            TRUE       486

Individuals with family_history_with_overwieght are more compared to without family overweight history 1726 to 385. (81.7% have famliy-history)
Gender-wise , obesity count : Evenly balanced
gen_genetic_obese_count :

Individuals both male and female with no family-overweight_genetics tend be normal weight (or insufficent weight) or simply not Obese.
A little more analysis is required to understand if it is a contributing factor for high BMIs.

df |> 
  select(family_history_with_overweight, BMI, Gender) |> 
  mutate(is_obese = (BMI>30)) |> 
  group_by(is_obese, Gender) |>
  summarise(count = n(), meanBMI = mean(BMI)) -> gender_wise_obesity_counts

## `summarise()` has grouped output by 'is_obese'. You can override using the
## `.groups` argument.

gender_wise_obesity_counts  <- gender_wise_obesity_counts |> 
  mutate(Gend_obes  = paste( is_obese,Gender,  sep = ' obese : '))


ggplot(data = gender_wise_obesity_counts, 
       mapping = aes(x = Gend_obes,fill = is_obese)) + 
  geom_bar(width = 0.5) + 
  theme_classic() + 
  labs(
    title = 'Gender + Obesity Distributions', 
    x = 'Gender + Obesity', 
    y = 'Proportions Ratio'
  )

ggplot(data = gender_wise_obesity_counts, 
       mapping = aes(x = Gend_obes, y = meanBMI, fill = (is_obese))) + 
  geom_bar(stat = "identity", width = 0.3, colour = "black") + 
  theme_classic() + 
  labs(
    title = "Mean BMI Grouped by Gender + Obesity", 
    x = "Gender + Obesity", 
    y = "Mean BMI"
  ) + 
  scale_fill_manual(
    name = "is_obese", 
    values = c("FALSE" = "green", 
               "TRUE" = "yellow")
  )

gender_wise_obesity_counts

## # A tibble: 4 × 5
## # Groups:   is_obese [2]
##   is_obese Gender count meanBMI Gend_obes           
##   <lgl>    <chr>  <int>   <dbl> <chr>               
## 1 FALSE    Female   561    22.6 FALSE obese : Female
## 2 FALSE    Male     576    24.3 FALSE obese : Male  
## 3 TRUE     Female   482    38.8 TRUE obese : Female 
## 4 TRUE     Male     492    35.1 TRUE obese : Male

Conclusions

Gender are no real contribution in esitmating a person’s BMI from EDA
there appears to be some relation between family-history with Obesity, need further analysis

BMI ~ Gender

df |> t.test(BMI ~ Gender,data = _)

## 
##  Welch Two Sample t-test
## 
## data:  BMI by Gender
## t = 2.4282, df = 1823.5, p-value = 0.01527
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  0.1633912 1.5358581
## sample estimates:
## mean in group Female   mean in group Male 
##             30.13000             29.28038

Upon performing t-test, we get a p-value 0.015, which is less than 0.05 but the when power is defined we usually use delta as 1.
But the difference between the mean BMIs is 30.1-29.3 is around 0.8 which is less than 1,
So we cannot reject the null hypothesis, since the observed difference is less than Delta.

df |> 
  select(family_history_with_overweight, BMI, Gender) |> 
  mutate(is_obese = (BMI>30))  |> mutate(Gend_obes  = paste(is_obese, Gender,  sep = ' obese ')) -> new_df


new_df$Gend_obes <- factor(new_df$Gend_obes)

new_df |> 
  ggplot(mapping = aes(x = Gend_obes, fill = is_obese)) + 
  geom_histogram(stat = "count", width = 0.5) + 

  labs(
    title = "Number of indiviuals Grouped by Gender + Obesity", 
    x = "Gender + Obesity",
    y = "Count"
  ) + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  theme_classic()

## Warning in geom_histogram(stat = "count", width = 0.5): Ignoring unknown
## parameters: `binwidth`, `bins`, and `pad`

gen_genetic_obese_count

df |> 
  select(family_history_with_overweight, BMI) |> 
  mutate(is_obese = (BMI>30)) |> 
  group_by(family_history_with_overweight,is_obese) |>
  summarise(count = n()) -> genetic_obese_count

## `summarise()` has grouped output by 'family_history_with_overweight'. You can
## override using the `.groups` argument.

genetic_obese_count

## # A tibble: 4 × 3
## # Groups:   family_history_with_overweight [2]
##   family_history_with_overweight is_obese count
##   <chr>                          <lgl>    <int>
## 1 no                             FALSE      377
## 2 no                             TRUE         8
## 3 yes                            FALSE      760
## 4 yes                            TRUE       966

Chi-Squared test for Contingency table -> is_obese ~ family_history_with_overweight

Assumptions:

With the given count of people with obseity are 974 and not obese are 1137, similarly 385 individual have to no family_history_with_overweight and 1726 will have history.
Null Hypothesis: There is no relationship between Obesity and family_history_with_overweight
Alternative Hypothesis: There is a relation between Obesity and family_history_with_overweight

df |> 
  select(BMI, family_history_with_overweight) |> 
  mutate(is_obese = (BMI > 30)) |> 
  xtabs(~ is_obese + family_history_with_overweight, data = _) -> contingency_table

contingency_table |> 
  chisq.test() -> x2res 

contingency_table

##         family_history_with_overweight
## is_obese  no yes
##    FALSE 377 760
##    TRUE    8 966

x2res$expected

##         family_history_with_overweight
## is_obese       no      yes
##    FALSE 207.3638 929.6362
##    TRUE  177.6362 796.3638

x2res

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency_table
## X-squared = 365.69, df = 1, p-value < 2.2e-16

Since the p-value is very small, Null Hypothesis can be rejected with confidence
This means the observed contingency table cells’ values is NOT occured by chance

The important Assumptions :

The total number of individuals with Obesity and without Obesity are constant. Similarly, total number of individual without family overweight history and with history are constant

Conclusion: Alternative Hypothesis is True and there is a relationship between is_obese ~ family_history_with_overweight

t-test for comparing no history and with history considering BMI

Null Hypothesis: There is no relationship between family_history_with_overweight and BMI

pairwise.t.test(df$BMI, df$family_history_with_overweight, p.adjust.method = 'bonferroni', alternative = 'less')

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  df$BMI and df$family_history_with_overweight 
## 
##     no
## yes 1 
## 
## P value adjustment method: bonferroni

df |> select(BMI, family_history_with_overweight) |>  group_by(family_history_with_overweight) |> summarise(meanBMI = mean(BMI))

## # A tibble: 2 × 2
##   family_history_with_overweight meanBMI
##   <chr>                            <dbl>
## 1 no                                21.5
## 2 yes                               31.5

Conclusion * Null Hyothesis can be rejected

ANOVA between CAEC (Eating Food in between meals)

df |> 
  ggplot(mapping = aes(x = factor(CAEC, levels = c('no', 'Sometimes', 'Frequently', 'Always')), y = BMI)) + 
  geom_boxplot(aes(color = ifelse(CAEC == "Sometimes", "darkblue", "black")), size = 0.6) + 
  scale_color_identity() +  # Use the colors as defined
  labs(
    title = 'BMI vs Frequency of Eating Food in Between Meals - BoxPlot', 
    x = 'Frequency of Eating Food in Between Meals', 
    y = 'BMI'
  ) +
  geom_hline(yintercept = 30, colour = 'red') +
  annotate("text", x = 0.6, y = 31, label = "Obese", color = "red", size = 3, fontface = "bold") + 
  annotate("text", x = 0.6, y = 29, label = "Non-Obese", color = "red", size = 3, fontface = "bold") +
  theme_classic()

It is clear that individuals with no food in between meals are generally tend to be under 30 for BMI and people with ‘Sometimes’ category tends to high BMIs(clearly median is above BMI 30)

df |> select(BMI, CAEC) |>  group_by(CAEC) |> summarise(count = n(), meanBMI = mean(BMI))

## # A tibble: 4 × 3
##   CAEC       count meanBMI
##   <chr>      <int>   <dbl>
## 1 Always        53    24.3
## 2 Frequently   242    20.9
## 3 Sometimes   1765    31.2
## 4 no            51    25.4

Hypothesis test

Assumptions: * Normality of each distributions in CAEC variable(except ‘Sometimes’)

Null Hypothesis: All the distribuions/categories are independent

df_sub <- df |> filter(CAEC %in% c('no','Frequently','Always'))
anova_res <- aov(BMI~CAEC, data = df_sub)
summary(anova_res)

##              Df Sum Sq Mean Sq F value   Pr(>F)    
## CAEC          2   1174   587.2   30.39 7.09e-13 ***
## Residuals   343   6629    19.3                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value and f-value suggests that the our null hypothesis is false
there exist alteast on different group among CAEC categories

pairwise.t.test(df_sub$BMI, df_sub$CAEC, p.adjust.method = 'bonferroni')

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  df_sub$BMI and df_sub$CAEC 
## 
##            Always  Frequently
## Frequently 1.4e-06 -         
## no         0.61    2.9e-10   
## 
## P value adjustment method: bonferroni

‘Sometimes’ category is very different from all other categories
‘no’ and ‘Always’ pair has a p-value of ‘1’ .This implies they are truly independent.

Conclusion * Individuals in ‘Sometimes’ category are very different from other categories and from BoxPlot it is clear that more than 54% of this category are obese *

lets also check if the Sometimes category and Frequently category different from remaining groups

df_no_al <- df |> filter(CAEC %in% c('no','Always'))
df_some_freq <- df |>  filter(CAEC %in% c('Sometimes','Frequently'))

pairwise.t.test(df_some_freq$BMI, df_some_freq$CAEC)

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  df_some_freq$BMI and df_some_freq$CAEC 
## 
##           Frequently
## Sometimes <2e-16    
## 
## P value adjustment method: holm

df_some_freq |>  select(NObeyesdad, CAEC) |>  ftable() |> chisq.test('simulate.p.value' = TRUE, B = 9999) -> x2res

x2res

## 
##  Pearson's Chi-squared test with simulated p-value (based on 9999
##  replicates)
## 
## data:  ftable(select(df_some_freq, NObeyesdad, CAEC))
## X-squared = 527.28, df = NA, p-value = 1e-04

Performing Chi-Squared Test on BMI ~ Means of Transportation

A common rule-of-thumb is that the approximation is adequate as long as all e_i,_j>1 and at most 20% of the cells have e_i,_j < 5 (Cochran, 1954).
If this doesnt satisify, there are two options

Combine some of the categorical variables, to perform X^2 test
Simulate sampling from several populations

Null Hypothesis: There is no relationship between BMI and MTRANS ,ie. an individual’s means of transportation has no influence on obesity of that individual Alternative Hypothesis: There is some relationship between BMI and MTRANS.

df |> mutate(is_obese = (BMI>30)) |>  select(is_obese, MTRANS) |>  ftable()

##          MTRANS Automobile Bike Motorbike Public_Transportation Walking
## is_obese                                                               
## FALSE                  250    6         8                   820      53
## TRUE                   207    1         3                   760       3

df |> mutate(is_obese = (BMI>30)) |>  select(is_obese, MTRANS) |>  ftable() |> chisq.test('simulate.p.value' = TRUE, B = 9999) -> x2res
x2res$expected

##          [,1]     [,2]    [,3]     [,4]     [,5]
## [1,] 246.1435 3.770251 5.92468 850.9995 30.16201
## [2,] 210.8565 3.229749 5.07532 729.0005 25.83799

x2res

## 
##  Pearson's Chi-squared test with simulated p-value (based on 9999
##  replicates)
## 
## data:  ftable(select(mutate(df, is_obese = (BMI > 30)), is_obese, MTRANS))
## X-squared = 44.491, df = NA, p-value = 1e-04

Since more the 20 percent of the counts are less than 5. So performing simulated chi-squared test _ Considering alpha as 0.01, obtained p-values is less than 0.01. Thus Null hypothesis can be rejected.

Conclusion: - The observed contingency table is very rare to occure by chance, thus there is some relationship between MTRANS and BMI

Final Conclusions:

Gender has no real impact on BMI of person.
However, Genetics has relationship with BMI. From count bar plot, it is clear that individual with no family history of overweight are less likely to be obese.
Irregular food habits, ‘Sometimes’ and ‘Frequently’ category is significantly different from ‘no’ and ‘Always’ category.
But, ‘Sometimes’ is also different from ‘Frequently’ category where ‘Sometimes’ category individuals tends to have higher BMI values and ‘Always’ category individuals tends to have lower BMIs comparatively.
And finally, Means of transportation and BMI has some relation and more specific analysis needs to be done for more useful usable insights.