library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(flextable)
library(boot)

df <- read.csv("~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv", header=TRUE)
df['BMI'] <- df['Weight']/df['Height']**2

EDA

df |> ggplot(mapping = aes(x = factor(NObeyesdad, levels = c('Insufficient_Weight','Normal_Weight', 'Overweight_Level_I','Overweight_Level_II','Obesity_Type_I','Obesity_Type_II','Obesity_Type_III' )), fill = NObeyesdad)) + geom_bar() + theme_classic() + 
  labs(title = 'Each Category counts', x = 'Obesity Levels', y = 'Count') + theme(axis.text.x = element_text(angle = 45, hjust = 1))

df |>  ggplot(mapping = aes(x = family_history_with_overweight, fill = (BMI >30))) + geom_bar( width = 0.3) + theme_classic() + labs(title = 'Individuals with Family history of overweight')

df |> 
  select(family_history_with_overweight, BMI, Gender) |> 
  mutate(is_obese = (BMI>30)) |> 
  group_by(Gender,family_history_with_overweight,is_obese) |>
  summarise(count = n()) -> gen_genetic_obese_count
## `summarise()` has grouped output by 'Gender', 'family_history_with_overweight'.
## You can override using the `.groups` argument.
gen_genetic_obese_count
## # A tibble: 8 × 4
## # Groups:   Gender, family_history_with_overweight [4]
##   Gender family_history_with_overweight is_obese count
##   <chr>  <chr>                          <lgl>    <int>
## 1 Female no                             FALSE      230
## 2 Female no                             TRUE         2
## 3 Female yes                            FALSE      331
## 4 Female yes                            TRUE       480
## 5 Male   no                             FALSE      147
## 6 Male   no                             TRUE         6
## 7 Male   yes                            FALSE      429
## 8 Male   yes                            TRUE       486
  1. Individuals both male and female with no family-overweight_genetics tend be normal weight (or insufficent weight) or simply not Obese.
  2. A little more analysis is required to understand if it is a contributing factor for high BMIs.
df |> 
  select(family_history_with_overweight, BMI, Gender) |> 
  mutate(is_obese = (BMI>30)) |> 
  group_by(is_obese, Gender) |>
  summarise(count = n(), meanBMI = mean(BMI)) -> gender_wise_obesity_counts
## `summarise()` has grouped output by 'is_obese'. You can override using the
## `.groups` argument.
gender_wise_obesity_counts  <- gender_wise_obesity_counts |> 
  mutate(Gend_obes  = paste( is_obese,Gender,  sep = ' obese : '))


ggplot(data = gender_wise_obesity_counts, 
       mapping = aes(x = Gend_obes,fill = is_obese)) + 
  geom_bar(width = 0.5) + 
  theme_classic() + 
  labs(
    title = 'Gender + Obesity Distributions', 
    x = 'Gender + Obesity', 
    y = 'Proportions Ratio'
  )

ggplot(data = gender_wise_obesity_counts, 
       mapping = aes(x = Gend_obes, y = meanBMI, fill = (is_obese))) + 
  geom_bar(stat = "identity", width = 0.3, colour = "black") + 
  theme_classic() + 
  labs(
    title = "Mean BMI Grouped by Gender + Obesity", 
    x = "Gender + Obesity", 
    y = "Mean BMI"
  ) + 
  scale_fill_manual(
    name = "is_obese", 
    values = c("FALSE" = "green", 
               "TRUE" = "yellow")
  )

gender_wise_obesity_counts
## # A tibble: 4 × 5
## # Groups:   is_obese [2]
##   is_obese Gender count meanBMI Gend_obes           
##   <lgl>    <chr>  <int>   <dbl> <chr>               
## 1 FALSE    Female   561    22.6 FALSE obese : Female
## 2 FALSE    Male     576    24.3 FALSE obese : Male  
## 3 TRUE     Female   482    38.8 TRUE obese : Female 
## 4 TRUE     Male     492    35.1 TRUE obese : Male

Conclusions

BMI ~ Gender

df |> t.test(BMI ~ Gender,data = _)
## 
##  Welch Two Sample t-test
## 
## data:  BMI by Gender
## t = 2.4282, df = 1823.5, p-value = 0.01527
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  0.1633912 1.5358581
## sample estimates:
## mean in group Female   mean in group Male 
##             30.13000             29.28038
df |> 
  select(family_history_with_overweight, BMI, Gender) |> 
  mutate(is_obese = (BMI>30))  |> mutate(Gend_obes  = paste(is_obese, Gender,  sep = ' obese ')) -> new_df


new_df$Gend_obes <- factor(new_df$Gend_obes)

new_df |> 
  ggplot(mapping = aes(x = Gend_obes, fill = is_obese)) + 
  geom_histogram(stat = "count", width = 0.5) + 

  labs(
    title = "Number of indiviuals Grouped by Gender + Obesity", 
    x = "Gender + Obesity",
    y = "Count"
  ) + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  theme_classic()
## Warning in geom_histogram(stat = "count", width = 0.5): Ignoring unknown
## parameters: `binwidth`, `bins`, and `pad`

gen_genetic_obese_count

df |> 
  select(family_history_with_overweight, BMI) |> 
  mutate(is_obese = (BMI>30)) |> 
  group_by(family_history_with_overweight,is_obese) |>
  summarise(count = n()) -> genetic_obese_count
## `summarise()` has grouped output by 'family_history_with_overweight'. You can
## override using the `.groups` argument.
genetic_obese_count
## # A tibble: 4 × 3
## # Groups:   family_history_with_overweight [2]
##   family_history_with_overweight is_obese count
##   <chr>                          <lgl>    <int>
## 1 no                             FALSE      377
## 2 no                             TRUE         8
## 3 yes                            FALSE      760
## 4 yes                            TRUE       966

Chi-Squared test for Contingency table -> is_obese ~ family_history_with_overweight

Assumptions:

df |> 
  select(BMI, family_history_with_overweight) |> 
  mutate(is_obese = (BMI > 30)) |> 
  xtabs(~ is_obese + family_history_with_overweight, data = _) -> contingency_table

contingency_table |> 
  chisq.test() -> x2res 

contingency_table
##         family_history_with_overweight
## is_obese  no yes
##    FALSE 377 760
##    TRUE    8 966
x2res$expected
##         family_history_with_overweight
## is_obese       no      yes
##    FALSE 207.3638 929.6362
##    TRUE  177.6362 796.3638
x2res
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency_table
## X-squared = 365.69, df = 1, p-value < 2.2e-16

The important Assumptions :

  1. The total number of individuals with Obesity and without Obesity are constant. Similarly, total number of individual without family overweight history and with history are constant

Conclusion: Alternative Hypothesis is True and there is a relationship between is_obese ~ family_history_with_overweight

t-test for comparing no history and with history considering BMI

Null Hypothesis: There is no relationship between family_history_with_overweight and BMI

pairwise.t.test(df$BMI, df$family_history_with_overweight, p.adjust.method = 'bonferroni', alternative = 'less')
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  df$BMI and df$family_history_with_overweight 
## 
##     no
## yes 1 
## 
## P value adjustment method: bonferroni
df |> select(BMI, family_history_with_overweight) |>  group_by(family_history_with_overweight) |> summarise(meanBMI = mean(BMI))
## # A tibble: 2 × 2
##   family_history_with_overweight meanBMI
##   <chr>                            <dbl>
## 1 no                                21.5
## 2 yes                               31.5

Conclusion * Null Hyothesis can be rejected

ANOVA between CAEC (Eating Food in between meals)

df |> 
  ggplot(mapping = aes(x = factor(CAEC, levels = c('no', 'Sometimes', 'Frequently', 'Always')), y = BMI)) + 
  geom_boxplot(aes(color = ifelse(CAEC == "Sometimes", "darkblue", "black")), size = 0.6) + 
  scale_color_identity() +  # Use the colors as defined
  labs(
    title = 'BMI vs Frequency of Eating Food in Between Meals - BoxPlot', 
    x = 'Frequency of Eating Food in Between Meals', 
    y = 'BMI'
  ) +
  geom_hline(yintercept = 30, colour = 'red') +
  annotate("text", x = 0.6, y = 31, label = "Obese", color = "red", size = 3, fontface = "bold") + 
  annotate("text", x = 0.6, y = 29, label = "Non-Obese", color = "red", size = 3, fontface = "bold") +
  theme_classic()

df |> select(BMI, CAEC) |>  group_by(CAEC) |> summarise(count = n(), meanBMI = mean(BMI))
## # A tibble: 4 × 3
##   CAEC       count meanBMI
##   <chr>      <int>   <dbl>
## 1 Always        53    24.3
## 2 Frequently   242    20.9
## 3 Sometimes   1765    31.2
## 4 no            51    25.4

Hypothesis test

Assumptions: * Normality of each distributions in CAEC variable(except ‘Sometimes’)

  • Null Hypothesis: All the distribuions/categories are independent
df_sub <- df |> filter(CAEC %in% c('no','Frequently','Always'))
anova_res <- aov(BMI~CAEC, data = df_sub)
summary(anova_res)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## CAEC          2   1174   587.2   30.39 7.09e-13 ***
## Residuals   343   6629    19.3                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • The p-value and f-value suggests that the our null hypothesis is false
  • there exist alteast on different group among CAEC categories
pairwise.t.test(df_sub$BMI, df_sub$CAEC, p.adjust.method = 'bonferroni')
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  df_sub$BMI and df_sub$CAEC 
## 
##            Always  Frequently
## Frequently 1.4e-06 -         
## no         0.61    2.9e-10   
## 
## P value adjustment method: bonferroni
  • ‘Sometimes’ category is very different from all other categories
  • ‘no’ and ‘Always’ pair has a p-value of ‘1’ .This implies they are truly independent.

Conclusion * Individuals in ‘Sometimes’ category are very different from other categories and from BoxPlot it is clear that more than 54% of this category are obese *

lets also check if the Sometimes category and Frequently category different from remaining groups

df_no_al <- df |> filter(CAEC %in% c('no','Always'))
df_some_freq <- df |>  filter(CAEC %in% c('Sometimes','Frequently'))

pairwise.t.test(df_some_freq$BMI, df_some_freq$CAEC)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  df_some_freq$BMI and df_some_freq$CAEC 
## 
##           Frequently
## Sometimes <2e-16    
## 
## P value adjustment method: holm
df_some_freq |>  select(NObeyesdad, CAEC) |>  ftable() |> chisq.test('simulate.p.value' = TRUE, B = 9999) -> x2res

x2res
## 
##  Pearson's Chi-squared test with simulated p-value (based on 9999
##  replicates)
## 
## data:  ftable(select(df_some_freq, NObeyesdad, CAEC))
## X-squared = 527.28, df = NA, p-value = 1e-04

Performing Chi-Squared Test on BMI ~ Means of Transportation

  1. Combine some of the categorical variables, to perform X^2 test
  2. Simulate sampling from several populations

Null Hypothesis: There is no relationship between BMI and MTRANS ,ie. an individual’s means of transportation has no influence on obesity of that individual Alternative Hypothesis: There is some relationship between BMI and MTRANS.

df |> mutate(is_obese = (BMI>30)) |>  select(is_obese, MTRANS) |>  ftable()
##          MTRANS Automobile Bike Motorbike Public_Transportation Walking
## is_obese                                                               
## FALSE                  250    6         8                   820      53
## TRUE                   207    1         3                   760       3
df |> mutate(is_obese = (BMI>30)) |>  select(is_obese, MTRANS) |>  ftable() |> chisq.test('simulate.p.value' = TRUE, B = 9999) -> x2res
x2res$expected
##          [,1]     [,2]    [,3]     [,4]     [,5]
## [1,] 246.1435 3.770251 5.92468 850.9995 30.16201
## [2,] 210.8565 3.229749 5.07532 729.0005 25.83799
x2res
## 
##  Pearson's Chi-squared test with simulated p-value (based on 9999
##  replicates)
## 
## data:  ftable(select(mutate(df, is_obese = (BMI > 30)), is_obese, MTRANS))
## X-squared = 44.491, df = NA, p-value = 1e-04

Conclusion: - The observed contingency table is very rare to occure by chance, thus there is some relationship between MTRANS and BMI

Final Conclusions:

  • Gender has no real impact on BMI of person.

  • However, Genetics has relationship with BMI. From count bar plot, it is clear that individual with no family history of overweight are less likely to be obese.

  • Irregular food habits, ‘Sometimes’ and ‘Frequently’ category is significantly different from ‘no’ and ‘Always’ category.

  • But, ‘Sometimes’ is also different from ‘Frequently’ category where ‘Sometimes’ category individuals tends to have higher BMI values and ‘Always’ category individuals tends to have lower BMIs comparatively.

  • And finally, Means of transportation and BMI has some relation and more specific analysis needs to be done for more useful usable insights.