library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(flextable)
library(boot)
df <- read.csv("~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv", header=TRUE)
df['BMI'] <- df['Weight']/df['Height']**2
df |> ggplot(mapping = aes(x = factor(NObeyesdad, levels = c('Insufficient_Weight','Normal_Weight', 'Overweight_Level_I','Overweight_Level_II','Obesity_Type_I','Obesity_Type_II','Obesity_Type_III' )), fill = NObeyesdad)) + geom_bar() + theme_classic() +
labs(title = 'Each Category counts', x = 'Obesity Levels', y = 'Count') + theme(axis.text.x = element_text(angle = 45, hjust = 1))
df |> ggplot(mapping = aes(x = family_history_with_overweight, fill = (BMI >30))) + geom_bar( width = 0.3) + theme_classic() + labs(title = 'Individuals with Family history of overweight')
df |>
select(family_history_with_overweight, BMI, Gender) |>
mutate(is_obese = (BMI>30)) |>
group_by(Gender,family_history_with_overweight,is_obese) |>
summarise(count = n()) -> gen_genetic_obese_count
## `summarise()` has grouped output by 'Gender', 'family_history_with_overweight'.
## You can override using the `.groups` argument.
gen_genetic_obese_count
## # A tibble: 8 × 4
## # Groups: Gender, family_history_with_overweight [4]
## Gender family_history_with_overweight is_obese count
## <chr> <chr> <lgl> <int>
## 1 Female no FALSE 230
## 2 Female no TRUE 2
## 3 Female yes FALSE 331
## 4 Female yes TRUE 480
## 5 Male no FALSE 147
## 6 Male no TRUE 6
## 7 Male yes FALSE 429
## 8 Male yes TRUE 486
Individuals with family_history_with_overwieght are more compared to without family overweight history 1726 to 385. (81.7% have famliy-history)
Gender-wise , obesity count : Evenly balanced
gen_genetic_obese_count :
df |>
select(family_history_with_overweight, BMI, Gender) |>
mutate(is_obese = (BMI>30)) |>
group_by(is_obese, Gender) |>
summarise(count = n(), meanBMI = mean(BMI)) -> gender_wise_obesity_counts
## `summarise()` has grouped output by 'is_obese'. You can override using the
## `.groups` argument.
gender_wise_obesity_counts <- gender_wise_obesity_counts |>
mutate(Gend_obes = paste( is_obese,Gender, sep = ' obese : '))
ggplot(data = gender_wise_obesity_counts,
mapping = aes(x = Gend_obes,fill = is_obese)) +
geom_bar(width = 0.5) +
theme_classic() +
labs(
title = 'Gender + Obesity Distributions',
x = 'Gender + Obesity',
y = 'Proportions Ratio'
)
ggplot(data = gender_wise_obesity_counts,
mapping = aes(x = Gend_obes, y = meanBMI, fill = (is_obese))) +
geom_bar(stat = "identity", width = 0.3, colour = "black") +
theme_classic() +
labs(
title = "Mean BMI Grouped by Gender + Obesity",
x = "Gender + Obesity",
y = "Mean BMI"
) +
scale_fill_manual(
name = "is_obese",
values = c("FALSE" = "green",
"TRUE" = "yellow")
)
gender_wise_obesity_counts
## # A tibble: 4 × 5
## # Groups: is_obese [2]
## is_obese Gender count meanBMI Gend_obes
## <lgl> <chr> <int> <dbl> <chr>
## 1 FALSE Female 561 22.6 FALSE obese : Female
## 2 FALSE Male 576 24.3 FALSE obese : Male
## 3 TRUE Female 482 38.8 TRUE obese : Female
## 4 TRUE Male 492 35.1 TRUE obese : Male
Conclusions
df |> t.test(BMI ~ Gender,data = _)
##
## Welch Two Sample t-test
##
## data: BMI by Gender
## t = 2.4282, df = 1823.5, p-value = 0.01527
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## 0.1633912 1.5358581
## sample estimates:
## mean in group Female mean in group Male
## 30.13000 29.28038
df |>
select(family_history_with_overweight, BMI, Gender) |>
mutate(is_obese = (BMI>30)) |> mutate(Gend_obes = paste(is_obese, Gender, sep = ' obese ')) -> new_df
new_df$Gend_obes <- factor(new_df$Gend_obes)
new_df |>
ggplot(mapping = aes(x = Gend_obes, fill = is_obese)) +
geom_histogram(stat = "count", width = 0.5) +
labs(
title = "Number of indiviuals Grouped by Gender + Obesity",
x = "Gender + Obesity",
y = "Count"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_classic()
## Warning in geom_histogram(stat = "count", width = 0.5): Ignoring unknown
## parameters: `binwidth`, `bins`, and `pad`
df |>
select(family_history_with_overweight, BMI) |>
mutate(is_obese = (BMI>30)) |>
group_by(family_history_with_overweight,is_obese) |>
summarise(count = n()) -> genetic_obese_count
## `summarise()` has grouped output by 'family_history_with_overweight'. You can
## override using the `.groups` argument.
genetic_obese_count
## # A tibble: 4 × 3
## # Groups: family_history_with_overweight [2]
## family_history_with_overweight is_obese count
## <chr> <lgl> <int>
## 1 no FALSE 377
## 2 no TRUE 8
## 3 yes FALSE 760
## 4 yes TRUE 966
Assumptions:
With the given count of people with obseity are 974 and not obese are 1137, similarly 385 individual have to no family_history_with_overweight and 1726 will have history.
Null Hypothesis: There is no relationship between Obesity and family_history_with_overweight
Alternative Hypothesis: There is a relation between Obesity and family_history_with_overweight
df |>
select(BMI, family_history_with_overweight) |>
mutate(is_obese = (BMI > 30)) |>
xtabs(~ is_obese + family_history_with_overweight, data = _) -> contingency_table
contingency_table |>
chisq.test() -> x2res
contingency_table
## family_history_with_overweight
## is_obese no yes
## FALSE 377 760
## TRUE 8 966
x2res$expected
## family_history_with_overweight
## is_obese no yes
## FALSE 207.3638 929.6362
## TRUE 177.6362 796.3638
x2res
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: contingency_table
## X-squared = 365.69, df = 1, p-value < 2.2e-16
The important Assumptions :
Conclusion: Alternative Hypothesis is True and there is a relationship between is_obese ~ family_history_with_overweight
Null Hypothesis: There is no relationship between family_history_with_overweight and BMI
pairwise.t.test(df$BMI, df$family_history_with_overweight, p.adjust.method = 'bonferroni', alternative = 'less')
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df$BMI and df$family_history_with_overweight
##
## no
## yes 1
##
## P value adjustment method: bonferroni
df |> select(BMI, family_history_with_overweight) |> group_by(family_history_with_overweight) |> summarise(meanBMI = mean(BMI))
## # A tibble: 2 × 2
## family_history_with_overweight meanBMI
## <chr> <dbl>
## 1 no 21.5
## 2 yes 31.5
Conclusion * Null Hyothesis can be rejected
df |>
ggplot(mapping = aes(x = factor(CAEC, levels = c('no', 'Sometimes', 'Frequently', 'Always')), y = BMI)) +
geom_boxplot(aes(color = ifelse(CAEC == "Sometimes", "darkblue", "black")), size = 0.6) +
scale_color_identity() + # Use the colors as defined
labs(
title = 'BMI vs Frequency of Eating Food in Between Meals - BoxPlot',
x = 'Frequency of Eating Food in Between Meals',
y = 'BMI'
) +
geom_hline(yintercept = 30, colour = 'red') +
annotate("text", x = 0.6, y = 31, label = "Obese", color = "red", size = 3, fontface = "bold") +
annotate("text", x = 0.6, y = 29, label = "Non-Obese", color = "red", size = 3, fontface = "bold") +
theme_classic()
df |> select(BMI, CAEC) |> group_by(CAEC) |> summarise(count = n(), meanBMI = mean(BMI))
## # A tibble: 4 × 3
## CAEC count meanBMI
## <chr> <int> <dbl>
## 1 Always 53 24.3
## 2 Frequently 242 20.9
## 3 Sometimes 1765 31.2
## 4 no 51 25.4
Assumptions: * Normality of each distributions in CAEC variable(except ‘Sometimes’)
df_sub <- df |> filter(CAEC %in% c('no','Frequently','Always'))
anova_res <- aov(BMI~CAEC, data = df_sub)
summary(anova_res)
## Df Sum Sq Mean Sq F value Pr(>F)
## CAEC 2 1174 587.2 30.39 7.09e-13 ***
## Residuals 343 6629 19.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(df_sub$BMI, df_sub$CAEC, p.adjust.method = 'bonferroni')
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df_sub$BMI and df_sub$CAEC
##
## Always Frequently
## Frequently 1.4e-06 -
## no 0.61 2.9e-10
##
## P value adjustment method: bonferroni
Conclusion * Individuals in ‘Sometimes’ category are very different from other categories and from BoxPlot it is clear that more than 54% of this category are obese *
df_no_al <- df |> filter(CAEC %in% c('no','Always'))
df_some_freq <- df |> filter(CAEC %in% c('Sometimes','Frequently'))
pairwise.t.test(df_some_freq$BMI, df_some_freq$CAEC)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: df_some_freq$BMI and df_some_freq$CAEC
##
## Frequently
## Sometimes <2e-16
##
## P value adjustment method: holm
df_some_freq |> select(NObeyesdad, CAEC) |> ftable() |> chisq.test('simulate.p.value' = TRUE, B = 9999) -> x2res
x2res
##
## Pearson's Chi-squared test with simulated p-value (based on 9999
## replicates)
##
## data: ftable(select(df_some_freq, NObeyesdad, CAEC))
## X-squared = 527.28, df = NA, p-value = 1e-04
A common rule-of-thumb is that the approximation is adequate as long as all e_i,_j>1 and at most 20% of the cells have e_i,_j < 5 (Cochran, 1954).
If this doesnt satisify, there are two options
Null Hypothesis: There is no relationship between BMI and MTRANS ,ie. an individual’s means of transportation has no influence on obesity of that individual Alternative Hypothesis: There is some relationship between BMI and MTRANS.
df |> mutate(is_obese = (BMI>30)) |> select(is_obese, MTRANS) |> ftable()
## MTRANS Automobile Bike Motorbike Public_Transportation Walking
## is_obese
## FALSE 250 6 8 820 53
## TRUE 207 1 3 760 3
df |> mutate(is_obese = (BMI>30)) |> select(is_obese, MTRANS) |> ftable() |> chisq.test('simulate.p.value' = TRUE, B = 9999) -> x2res
x2res$expected
## [,1] [,2] [,3] [,4] [,5]
## [1,] 246.1435 3.770251 5.92468 850.9995 30.16201
## [2,] 210.8565 3.229749 5.07532 729.0005 25.83799
x2res
##
## Pearson's Chi-squared test with simulated p-value (based on 9999
## replicates)
##
## data: ftable(select(mutate(df, is_obese = (BMI > 30)), is_obese, MTRANS))
## X-squared = 44.491, df = NA, p-value = 1e-04
Conclusion: - The observed contingency table is very rare to occure by chance, thus there is some relationship between MTRANS and BMI
Gender has no real impact on BMI of person.
However, Genetics has relationship with BMI. From count bar plot, it is clear that individual with no family history of overweight are less likely to be obese.
Irregular food habits, ‘Sometimes’ and ‘Frequently’ category is significantly different from ‘no’ and ‘Always’ category.
But, ‘Sometimes’ is also different from ‘Frequently’ category where ‘Sometimes’ category individuals tends to have higher BMI values and ‘Always’ category individuals tends to have lower BMIs comparatively.
And finally, Means of transportation and BMI has some relation and more specific analysis needs to be done for more useful usable insights.