library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw obesity data set; the path is relative to the working directory.
df <- read.csv(file = "ObesityDataSet_raw_and_data_sinthetic.csv")

Data Preprocessing

changing data types

# Preview the first rows to eyeball each column's contents.
head(df)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad
## 1       Normal_Weight
## 2       Normal_Weight
## 3       Normal_Weight
## 4  Overweight_Level_I
## 5 Overweight_Level_II
## 6       Normal_Weight
# Column types as read from the CSV, before any conversion.
str(df)
## 'data.frame':    2111 obs. of  17 variables:
##  $ Gender                        : chr  "Female" "Female" "Male" "Male" ...
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ family_history_with_overweight: chr  "yes" "yes" "yes" "no" ...
##  $ FAVC                          : chr  "no" "no" "no" "no" ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : chr  "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
##  $ SMOKE                         : chr  "no" "yes" "no" "no" ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : chr  "no" "yes" "no" "no" ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : chr  "no" "Sometimes" "Frequently" "Frequently" ...
##  $ MTRANS                        : chr  "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
##  $ NObeyesdad                    : chr  "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...

These variables will possibly be changed from continuous numerics to ordered factors after rounding.

# Histograms of the five candidate variables on one 2 x 3 panel grid.
par(mfrow = c(2, 3))
walk(
  c("FCVC", "NCP", "CH2O", "FAF", "TUE"),
  \(column) {
    # Same title/axis text that hist() derives from a `df$<column>` argument.
    label <- paste0("df$", column)
    hist(df[[column]], main = paste("Histogram of", label), xlab = label)
  }
)

# For each candidate variable, flag the values that are already whole numbers
# (value equals its floor) and report the percentage that are / are not.
# The flag vector `is_integer_like` is overwritten for every variable.

# FCVC
is_integer_like = df$FCVC == floor(df$FCVC)

sum(is_integer_like)/nrow(df)*100
## [1] 60.87162
sum(!is_integer_like)/nrow(df)*100
## [1] 39.12838
# NCP
is_integer_like = df$NCP == floor(df$NCP)

sum(is_integer_like)/nrow(df)*100
## [1] 69.68261
sum(!is_integer_like)/nrow(df)*100
## [1] 30.31739
# CH2O
is_integer_like = df$CH2O == floor(df$CH2O)

sum(is_integer_like)/nrow(df)*100
## [1] 38.89152
sum(!is_integer_like)/nrow(df)*100
## [1] 61.10848
# FAF
is_integer_like = df$FAF == floor(df$FAF)

sum(is_integer_like)/nrow(df)*100
## [1] 42.77594
sum(!is_integer_like)/nrow(df)*100
## [1] 57.22406
# TUE
is_integer_like = df$TUE == floor(df$TUE)

sum(is_integer_like)/nrow(df)*100
## [1] 45.38134
sum(!is_integer_like)/nrow(df)*100
## [1] 54.61866

FCVC and NCP will be changed. The other 3 will stay as numerics.

# Redraw the panel with FCVC and NCP rounded to whole numbers;
# CH2O, FAF and TUE are plotted unchanged for comparison.
par(mfrow = c(2,3))
hist(as.integer(round(df$FCVC)))
hist(as.integer(round(df$NCP)))
hist(df$CH2O)
hist(df$FAF)
hist(df$TUE)

# categorical variables
# Convert every text column to a factor in a single pass.
#   FAVC       - yes/no to eating high calorie food frequently
#   CAEC       - eating between meals
#   SMOKE      - smoker or not
#   SCC        - monitors calories or not
#   CALC       - how often alcohol is consumed
#   MTRANS     - main transportation
#   NObeyesdad - obesity level
df <- df |>
  mutate(across(
    c(
      Gender, family_history_with_overweight, FAVC, CAEC, SMOKE,
      SCC, CALC, MTRANS, NObeyesdad
    ),
    as.factor
  ))
levels(df$Gender)
## [1] "Female" "Male"
levels(df$family_history_with_overweight)
## [1] "no"  "yes"
levels(df$FAVC)
## [1] "no"  "yes"
levels(df$CAEC)
## [1] "Always"     "Frequently" "no"         "Sometimes"
levels(df$SMOKE)
## [1] "no"  "yes"
levels(df$SCC)
## [1] "no"  "yes"
levels(df$CALC)
## [1] "Always"     "Frequently" "no"         "Sometimes"
levels(df$MTRANS)
## [1] "Automobile"            "Bike"                  "Motorbike"            
## [4] "Public_Transportation" "Walking"
levels(df$NObeyesdad)
## [1] "Insufficient_Weight" "Normal_Weight"       "Obesity_Type_I"     
## [4] "Obesity_Type_II"     "Obesity_Type_III"    "Overweight_Level_I" 
## [7] "Overweight_Level_II"
# Shared frequency scale, lowest to highest, for the two "how often" columns.
ordered_levels <- c("no", "Sometimes", "Frequently", "Always")

# Rebuild CAEC and CALC as ordered factors on that scale
# (the default factor levels are alphabetical).
df$CAEC <- factor(df$CAEC, levels = ordered_levels, ordered = TRUE)
df$CALC <- factor(df$CALC, levels = ordered_levels, ordered = TRUE)

ordered variables

# numerical variables
# FCVC (vegetables in meals) and NCP (main meals per day) are rounded to the
# nearest whole number and stored as ordered factors in one step, instead of
# going through an intermediate unordered factor.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
df$FCVC <- factor(round(df$FCVC), ordered = TRUE) # levels 1 < 2 < 3
df$NCP <- factor(round(df$NCP), ordered = TRUE) # levels 1 < 2 < 3 < 4

levels(df$FCVC)
## [1] "1" "2" "3"
levels(df$NCP)
## [1] "1" "2" "3" "4"

making NObeyesdad an ordered factor

# Re-level the outcome from lightest to heaviest class and mark it as ordered;
# the default alphabetical order would place the obesity types before the
# overweight levels.
df$NObeyesdad <- ordered(
  df$NObeyesdad,
  levels = c(
    "Insufficient_Weight",
    "Normal_Weight",
    "Overweight_Level_I",
    "Overweight_Level_II",
    "Obesity_Type_I",
    "Obesity_Type_II",
    "Obesity_Type_III"
  )
)
levels(df$NObeyesdad)
## [1] "Insufficient_Weight" "Normal_Weight"       "Overweight_Level_I" 
## [4] "Overweight_Level_II" "Obesity_Type_I"      "Obesity_Type_II"    
## [7] "Obesity_Type_III"
# Frequency bar charts of the four ordered factors on a 2 x 2 grid.
par(mfrow = c(2, 2))
walk(
  c("CAEC", "CALC", "FCVC", "NCP"),
  \(column) barplot(table(df[[column]]))
)

Adding BMI to each observation

# Body mass index = Weight / Height^2, appended as a new last column.
# NOTE(review): assumes Weight is in kg and Height in metres - confirm against the data dictionary.
df <- mutate(df, BMI = Weight / Height^2)
str(df)
## 'data.frame':    2111 obs. of  18 variables:
##  $ Gender                        : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
##  $ FCVC                          : Ord.factor w/ 3 levels "1"<"2"<"3": 2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : Ord.factor w/ 4 levels "no"<"Sometimes"<..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : Ord.factor w/ 4 levels "no"<"Sometimes"<..: 1 2 3 3 2 2 2 2 3 1 ...
##  $ MTRANS                        : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
##  $ NObeyesdad                    : Ord.factor w/ 7 levels "Insufficient_Weight"<..: 2 2 2 3 4 2 2 2 2 2 ...
##  $ BMI                           : num  24.4 24.2 23.8 26.9 28.3 ...

splitting into training and testing

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Reproducible 70/30 split, partitioned on the outcome (NObeyesdad).
set.seed(25)
train_index <- createDataPartition(y = df$NObeyesdad, p = 0.7, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

# Predictors (every column except the outcome) and outcome, per partition.
x_train <- train_data[, names(train_data) != "NObeyesdad"]
y_train <- train_data[["NObeyesdad"]]
x_test <- test_data[, names(test_data) != "NObeyesdad"]
y_test <- test_data[["NObeyesdad"]]

Answering proposed questions

How does obesity vary across different genders and age groups?

# Age spread within each obesity level.
train_data |>
  ggplot(mapping = aes(x = NObeyesdad, y = Age)) +
  geom_boxplot(color = "black", fill = "lightblue") +
  labs(
    title = "Distribution of Age by Obesity Level",
    x = "Obesity Level",
    y = "Age"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Omnibus rank test: does Age differ across the obesity levels?
kruskal.test(Age ~ NObeyesdad, data = train_data)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Age by NObeyesdad
## Kruskal-Wallis chi-squared = 379.92, df = 6, p-value < 2.2e-16

At least one obesity level (NObeyesdad) has an Age distribution that differs significantly from the others.

# Side-by-side counts of each gender within every obesity level.
train_data |>
  ggplot(mapping = aes(x = NObeyesdad, fill = Gender)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Obesity Levels by Gender",
    x = "Obesity Level",
    y = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Gender x obesity-level contingency table, then a test of independence.
table_gender_obesity <- table(train_data$Gender, train_data$NObeyesdad)

chisq.test(table_gender_obesity)
## 
##  Pearson's Chi-squared test
## 
## data:  table_gender_obesity
## X-squared = 469.7, df = 6, p-value < 2.2e-16

There is a significant association between Gender and NObeyesdad.

Gender and BMI exploration

# BMI distribution for each gender.
train_data |>
  ggplot(mapping = aes(x = Gender, y = BMI, fill = Gender)) +
  geom_boxplot() +
  labs(
    title = "BMI Distribution by Gender",
    x = "Gender",
    y = "BMI"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Mean BMI per gender.
aggregate(BMI ~ Gender, data = train_data, FUN = mean)
##   Gender      BMI
## 1 Female 30.01209
## 2   Male 29.30834
# normality check
# Per-gender BMI histograms (1-unit bins) to judge the shape of each group.
train_data |>
  ggplot(mapping = aes(x = BMI)) +
  geom_histogram(color = "black", fill = "skyblue", binwidth = 1) +
  facet_wrap(~ Gender) +
  labs(title = "Histogram of BMI by Gender")

# Rank-based two-sample comparison of BMI between the genders.
wilcox.test(BMI ~ Gender, data = train_data)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  BMI by Gender
## W = 275837, p-value = 0.7653
## alternative hypothesis: true location shift is not equal to 0

Is there correlation between age and BMI?

See boxplot for distribution of age across obesity levels above.

# Age vs BMI scatter with a fitted straight line.
train_data |>
  ggplot(mapping = aes(x = Age, y = BMI)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Pearson correlation (cor.test default) between Age and BMI, with its 95% CI.
cor.test(train_data$Age, train_data$BMI)
## 
##  Pearson's product-moment correlation
## 
## data:  train_data$Age and train_data$BMI
## t = 9.9576, df = 1477, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2024318 0.2979776
## sample estimates:
##       cor 
## 0.2508155

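There is a slight positive correlation between Age and BMI: in this sample, older respondents tend to have somewhat higher BMIs. Because the data are cross-sectional, this is an association and does not show that BMI increases as an individual ages.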

# Individual ages per obesity level, jittered horizontally to reduce overplotting.
train_data |>
  ggplot(mapping = aes(x = NObeyesdad, y = Age)) +
  geom_jitter(color = "steelblue", alpha = 0.5, width = 0.2) +
  labs(title = "Jittered Plot: Age Across Obesity Levels") +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

Does the frequency of vegetable consumption (FCVC) differ across obesity levels?

# Counts of each vegetable-intake level (FCVC) within every obesity level.
ggplot(train_data, aes(x = NObeyesdad, fill = FCVC)) +
  geom_bar(position = "dodge") +
  labs(title = "Bar Plot of Obesity Levels and Vegetable Intake") +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Rank test of FCVC across obesity levels. The `$` form is kept because the
# printed "data:" line echoes the formula text.
kruskal.test(train_data$FCVC ~ train_data$NObeyesdad)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  train_data$FCVC by train_data$NObeyesdad
## Kruskal-Wallis chi-squared = 350.37, df = 6, p-value < 2.2e-16

So, do people with different obesity levels tend to report different frequencies of vegetable consumption? Yes, they do. The significant p-value of the Kruskal-Wallis rank sum test indicates that at least one obesity group differs from another in its vegetable consumption frequency.

FCVC would be a possible predictor for NObeyesdad

Do people who frequently eat between meals (CAEC) show higher obesity levels?

# Counts of each snacking-frequency level (CAEC) within every obesity level.
ggplot(train_data, aes(x = NObeyesdad, fill = CAEC)) +
  geom_bar(position = "dodge") +
  labs(title = "Bar Plot of Obesity Levels and Frequency of Eating Between Meals") +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Rank test of CAEC across obesity levels (formula text is echoed in the output).
kruskal.test(train_data$CAEC ~ train_data$NObeyesdad)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  train_data$CAEC by train_data$NObeyesdad
## Kruskal-Wallis chi-squared = 321.34, df = 6, p-value < 2.2e-16
# BMI distribution for each snacking-frequency level.
train_data |>
  ggplot(mapping = aes(x = CAEC, y = BMI)) +
  geom_boxplot(color = "black", fill = "lightblue") +
  labs(
    title = "Distribution of BMI by Snacking Frequency",
    x = "How Frequently Do You Eat Between Meals",
    y = "BMI"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Omnibus rank test of BMI across the CAEC groups (formula text is echoed in the output).
kruskal.test(train_data$BMI ~ train_data$CAEC)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  train_data$BMI by train_data$CAEC
## Kruskal-Wallis chi-squared = 292.3, df = 3, p-value < 2.2e-16

There is at least one group of CAEC that differs from another. To find out which one, Dunn’s test will be run.

# Temporary unordered copy of CAEC used as the grouping variable for the
# pairwise Dunn comparisons (Bonferroni-adjusted p-values). The helper column
# is dropped again further down.
train_data[["CAEC_unordered"]] <- factor(train_data[["CAEC"]], ordered = FALSE)
FSA::dunnTest(BMI ~ CAEC_unordered, data = train_data, method = "bonferroni")
## Dunn (1964) Kruskal-Wallis multiple comparison
##   p-values adjusted with the Bonferroni method.
##               Comparison          Z      P.unadj        P.adj
## 1    Always - Frequently   2.891295 3.836578e-03 2.301947e-02
## 2            Always - no  -0.259862 7.949702e-01 1.000000e+00
## 3        Frequently - no  -3.010631 2.607056e-03 1.564233e-02
## 4     Always - Sometimes  -5.519712 3.395551e-08 2.037331e-07
## 5 Frequently - Sometimes -16.064861 4.498735e-58 2.699241e-57
## 6         no - Sometimes  -4.717087 2.392459e-06 1.435475e-05

There is a difference between the following groups at a 0.01 level:

Always - Sometimes

Frequently - Sometimes

no - Sometimes

To address the question if people who eat frequently between meals show higher levels of obesity, the answer is no. Frequent snackers have similar BMIs to the “no” and “always” groups of CAEC. The group with the highest median BMI is actually the people that reported “sometimes”.

# Remove the helper column so it cannot leak into later models as a predictor.
train_data <- select(train_data, -CAEC_unordered)

Is there a relationship between the number of daily meals (NCP) and obesity level?

# Within-level proportions of each daily-meal count (NCP) per obesity level.
ggplot(train_data, aes(x = NObeyesdad, fill = NCP)) +
  geom_bar(position = "fill") +
  labs(
    title = "Bar Plot of Obesity Levels and Number of Daily Meals",
    y = "Proportion"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Test of independence between NCP and obesity level.
chisq.test(table(train_data$NCP, train_data$NObeyesdad))
## 
##  Pearson's Chi-squared test
## 
## data:  table(train_data$NCP, train_data$NObeyesdad)
## X-squared = 312.12, df = 18, p-value < 2.2e-16

There is a statistically significant association between the number of daily meals (NCP) and obesity level (NObeyesdad). They are not independent of one another.

# Long-format cell counts of NCP x obesity level; the columns are named
# directly when the table is converted to a data frame.
tbl <- as.data.frame(
  table(NCP = train_data$NCP, NObeyesdad = train_data$NObeyesdad),
  responseName = "Count"
)

# Heatmap: darker red = more respondents in that NCP / obesity-level cell.
tbl |>
  ggplot(mapping = aes(x = NCP, y = NObeyesdad, fill = Count)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(
    title = "Heatmap of Meal Frequency vs Obesity Level",
    x = "Number of Meals (NCP)",
    y = "Obesity Level"
  ) +
  theme_minimal()

Is there a significant difference in BMI between smokers and non-smokers?

# BMI distribution for smokers vs non-smokers.
train_data |>
  ggplot(mapping = aes(x = SMOKE, y = BMI)) +
  geom_boxplot(color = "black", fill = "lightblue") +
  labs(
    title = "Distribution of BMI by Smoking",
    x = "Smoker",
    y = "BMI"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Two-sample t-test of mean BMI by smoking status (formula text is echoed in the output).
t.test(train_data$BMI ~ train_data$SMOKE)
## 
##  Welch Two Sample t-test
## 
## data:  train_data$BMI by train_data$SMOKE
## t = -0.0048165, df = 36.681, p-value = 0.9962
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  -2.228240  2.217675
## sample estimates:
##  mean in group no mean in group yes 
##          29.65557          29.66085

BMI on average is not different between smokers and non-smokers.

How do water and alcohol intake individually and combined affect obesity level?

# BMI distribution for each alcohol-consumption frequency.
train_data |>
  ggplot(mapping = aes(x = CALC, y = BMI)) +
  geom_boxplot(color = "black", fill = "lightblue") +
  labs(
    title = "Distribution of BMI by Alcohol Consumption Frequency",
    x = "CALC",
    y = "BMI"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Water consumption against BMI, one semi-transparent point per respondent.
train_data |>
  ggplot(mapping = aes(x = CH2O, y = BMI)) +
  geom_point(color = "blue", alpha = 0.4) +
  labs(
    title = "Scatter Plot of Water Consumption vs BMI",
    x = "Water Consumption (CH2O)",
    y = "BMI"
  ) +
  theme_minimal()

# Spearman rank correlation between water consumption and BMI.
cor.test(train_data$CH2O, train_data$BMI, method = "spearman")
## Warning in cor.test.default(train_data$CH2O, train_data$BMI, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  train_data$CH2O and train_data$BMI
## S = 445797058, p-value = 1.984e-11
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1732311

Higher water consumption is slightly associated with a higher BMI.

# One loess curve of BMI against water consumption per alcohol-frequency level,
# drawn without confidence bands.
train_data |>
  ggplot(mapping = aes(x = CH2O, y = BMI, color = CALC)) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    title = "Smoothed Interaction Plot: BMI vs CH2O by CALC",
    x = "Water Consumption (CH2O)",
    y = "BMI",
    color = "Alcohol Consumption"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# ANOVA of BMI on water consumption, alcohol frequency and their interaction.
# The term labels printed by summary() are taken from the formula text as written.
check = aov(train_data$BMI ~ train_data$CH2O * train_data$CALC)
summary(check)
##                                   Df Sum Sq Mean Sq F value   Pr(>F)    
## train_data$CH2O                    1   2272  2272.5   39.38 4.57e-10 ***
## train_data$CALC                    3   5184  1728.1   29.95  < 2e-16 ***
## train_data$CH2O:train_data$CALC    2   2377  1188.7   20.60 1.50e-09 ***
## Residuals                       1472  84935    57.7                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual diagnostics for the ANOVA fit.
plot(check, which = 1) # Homogeneity

qqnorm(resid(check)) # Normality
qqline(resid(check), col = "red")

Those who consume alcohol tend to have higher BMIs if they also consume more water daily, whereas those who do not consume alcohol have lower BMIs if they also consume the same higher amounts of water.

Are individuals with a family history of being overweight more likely to be obese?

# Within-level proportions of family history (yes/no) for each obesity level.
ggplot(train_data, aes(x = NObeyesdad, fill = family_history_with_overweight)) +
  geom_bar(position = "fill") +
  labs(title = "Distribution of Family History of Overweight by Obesity Level") +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Test of independence between family history and obesity level.
chisq.test(table(train_data$family_history_with_overweight, train_data$NObeyesdad))
## 
##  Pearson's Chi-squared test
## 
## data:  table(train_data$family_history_with_overweight, train_data$NObeyesdad)
## X-squared = 442.06, df = 6, p-value < 2.2e-16
# Standardized residuals per cell of the same test: a positive value means more
# observations than expected under independence, a negative value means fewer.
chisq.test(table(train_data$family_history_with_overweight, train_data$NObeyesdad))$stdres
##      
##       Insufficient_Weight Normal_Weight Overweight_Level_I Overweight_Level_II
##   no            14.014445     10.596668           3.998519           -5.038596
##   yes          -14.014445    -10.596668          -3.998519            5.038596
##      
##       Obesity_Type_I Obesity_Type_II Obesity_Type_III
##   no       -7.336387       -7.107627        -7.685979
##   yes       7.336387        7.107627         7.685979

Family history of being overweight is not independent of obesity level.

More people than expected are in Overweight Level II, Obesity Type I, II, and III, when they have a family history of being overweight.

More people than expected are in the Insufficient Weight, Normal Weight, and Overweight Level I categories when they do not have a family history of being overweight (positive standardized residuals in the "no" row).

Does physical activity affect BMI or obesity level?

# Physical activity against BMI with a fitted straight line and its confidence band.
train_data |>
  ggplot(mapping = aes(x = FAF, y = BMI)) +
  geom_point(color = "steelblue", alpha = 0.5) +
  geom_smooth(color = "darkred", se = TRUE, method = "lm") +
  labs(
    title = "Relationship Between Physical Activity and BMI",
    x = "Frequency of Physical Activity (FAF)",
    y = "BMI"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Spearman rank correlation between BMI and physical activity.
cor.test(train_data$BMI, train_data$FAF, method = "spearman")
## Warning in cor.test.default(train_data$BMI, train_data$FAF, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  train_data$BMI and train_data$FAF
## S = 631293128, p-value = 3.813e-11
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1707873

As physical activity increases, BMI decreases, but the correlation is weak.

# Physical-activity distribution per obesity level; the red dot marks each group's mean.
train_data |>
  ggplot(mapping = aes(x = NObeyesdad, y = FAF)) +
  geom_boxplot(color = "black", fill = "lightblue") +
  stat_summary(geom = "point", fun = mean, color = "red", size = 2, shape = 20) +
  labs(
    title = "Distribution of Physical Activity by Obesity Level",
    x = "Obesity Level",
    y = "Frequency of Physical Activity"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Omnibus rank test: does FAF differ across the obesity levels?
kruskal.test(FAF ~ NObeyesdad, data = train_data)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  FAF by NObeyesdad
## Kruskal-Wallis chi-squared = 65.947, df = 6, p-value = 2.764e-12

Physical activity differs across obesity levels.

Predictive Models for NObeyesdad

Finding the best model

# Number of training observations in each obesity class.
table(train_data$NObeyesdad)
## 
## Insufficient_Weight       Normal_Weight  Overweight_Level_I Overweight_Level_II 
##                 191                 201                 203                 203 
##      Obesity_Type_I     Obesity_Type_II    Obesity_Type_III 
##                 246                 208                 227
# Same counts expressed as shares of the training set.
proportions(table(train_data$NObeyesdad))
## 
## Insufficient_Weight       Normal_Weight  Overweight_Level_I Overweight_Level_II 
##           0.1291413           0.1359026           0.1372549           0.1372549 
##      Obesity_Type_I     Obesity_Type_II    Obesity_Type_III 
##           0.1663286           0.1406356           0.1534821

The classes for NObeyesdad are fairly balanced.

Without BMI, Height, or Weight

Multinomial Logistic Regression

Collinearity check

# Pairwise correlations (cor() default method) among BMI, Height and Weight.
cor(train_data$BMI, train_data$Height)
## [1] 0.1314545
cor(train_data$BMI, train_data$Weight)
## [1] 0.933768
cor(train_data$Weight, train_data$Height)
## [1] 0.4654195
library(nnet)
# Multinomial logistic regression for NObeyesdad, excluding BMI, Height and Weight.
set.seed(25)

# 5-fold cross-validation on the training set.
train_control <- trainControl(method = "cv", number = 5)

# Predictors are centered and scaled before fitting. `trace = FALSE` is passed
# on to the underlying fitting function to silence its iteration log; it is
# spelled out because `F` is an ordinary variable that can be reassigned.
multi_fit <- train(
  NObeyesdad ~ . - BMI - Height - Weight,
  data = train_data,
  method = "multinom",
  trControl = train_control,
  preProcess = c("center", "scale"),
  trace = FALSE
)

# Evaluate on the held-out test set.
multi_preds <- predict(multi_fit, test_data)
confusionMatrix(multi_preds, y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  61            26                  6
##   Normal_Weight                         7            36                 12
##   Overweight_Level_I                    5            10                 38
##   Overweight_Level_II                   1             3                  5
##   Obesity_Type_I                        3             8                 14
##   Obesity_Type_II                       4             2                 11
##   Obesity_Type_III                      0             1                  1
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   6              4               0
##   Normal_Weight                         3              2               1
##   Overweight_Level_I                    3             11               1
##   Overweight_Level_II                  20              3               5
##   Obesity_Type_I                       31             70              12
##   Obesity_Type_II                      22             14              70
##   Obesity_Type_III                      2              1               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                1
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     1
##   Obesity_Type_II                    0
##   Obesity_Type_III                  95
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6171          
##                  95% CI : (0.5779, 0.6552)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.552           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                             0.75309              0.41860
## Specificity                             0.92196              0.95421
## Pos Pred Value                          0.58654              0.59016
## Neg Pred Value                          0.96212              0.91243
## Prevalence                              0.12816              0.13608
## Detection Rate                          0.09652              0.05696
## Detection Prevalence                    0.16456              0.09652
## Balanced Accuracy                       0.83752              0.68641
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                            0.43678                    0.22989
## Specificity                            0.94495                    0.96881
## Pos Pred Value                         0.55882                    0.54054
## Neg Pred Value                         0.91312                    0.88739
## Prevalence                             0.13766                    0.13766
## Detection Rate                         0.06013                    0.03165
## Detection Prevalence                   0.10759                    0.05854
## Balanced Accuracy                      0.69087                    0.59935
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.6667                 0.7865
## Specificity                         0.8691                 0.9024
## Pos Pred Value                      0.5036                 0.5691
## Neg Pred Value                      0.9290                 0.9627
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1108                 0.1108
## Detection Prevalence                0.2199                 0.1946
## Balanced Accuracy                   0.7679                 0.8445
##                      Class: Obesity_Type_III
## Sensitivity                           0.9794
## Specificity                           0.9907
## Pos Pred Value                        0.9500
## Neg Pred Value                        0.9962
## Prevalence                            0.1535
## Detection Rate                        0.1503
## Detection Prevalence                  0.1582
## Balanced Accuracy                     0.9850

Random Forest

# Random forest for NObeyesdad with the same predictor set (no BMI, Height or Weight).
set.seed(25)

# 5-fold cross-validation on the training set.
train_control <- trainControl(number = 5, method = "cv")
rf_fit <- train(
  NObeyesdad ~ . - BMI - Height - Weight,
  data = train_data,
  method = "rf",
  trControl = train_control
)

# Evaluate on the held-out test set.
rf_preds <- predict(rf_fit, newdata = test_data)
confusionMatrix(data = rf_preds, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  72            11                  3
##   Normal_Weight                         4            59                  4
##   Overweight_Level_I                    4             6                 67
##   Overweight_Level_II                   0             6                  4
##   Obesity_Type_I                        1             3                  6
##   Obesity_Type_II                       0             1                  3
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   1              2               0
##   Normal_Weight                         7              5               1
##   Overweight_Level_I                    3              4               0
##   Overweight_Level_II                  61              7               1
##   Obesity_Type_I                        8             84               3
##   Obesity_Type_II                       7              3              84
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      1
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    0
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8275          
##                  95% CI : (0.7958, 0.8562)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7986          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.8889              0.68605
## Specificity                              0.9691              0.95971
## Pos Pred Value                           0.8090              0.72840
## Neg Pred Value                           0.9834              0.95100
## Prevalence                               0.1282              0.13608
## Detection Rate                           0.1139              0.09335
## Detection Prevalence                     0.1408              0.12816
## Balanced Accuracy                        0.9290              0.82288
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.7701                    0.70115
## Specificity                             0.9688                    0.96697
## Pos Pred Value                          0.7976                    0.77215
## Neg Pred Value                          0.9635                    0.95298
## Prevalence                              0.1377                    0.13766
## Detection Rate                          0.1060                    0.09652
## Detection Prevalence                    0.1329                    0.12500
## Balanced Accuracy                       0.8695                    0.83406
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.8000                 0.9438
## Specificity                         0.9602                 0.9742
## Pos Pred Value                      0.8000                 0.8571
## Neg Pred Value                      0.9602                 0.9906
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1329                 0.1329
## Detection Prevalence                0.1661                 0.1551
## Balanced Accuracy                   0.8801                 0.9590
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1519
## Balanced Accuracy                     0.9948
# Rank the predictors of the random forest fit by importance
rf_fit |> varImp()
## rf variable importance
## 
##   only 20 most important variables shown (out of 24)
## 
##                                   Overall
## Age                               100.000
## FAF                                50.797
## TUE                                45.140
## CH2O                               44.619
## GenderMale                         35.645
## family_history_with_overweightyes  30.809
## FCVC.L                             25.583
## CALC.C                             19.994
## CAEC.C                             18.743
## MTRANSPublic_Transportation        18.486
## NCP.L                              16.994
## CAEC.L                             14.629
## FAVCyes                            14.544
## NCP.C                              13.591
## FCVC.Q                             10.685
## CALC.Q                             10.648
## CALC.L                             10.245
## NCP.Q                               4.754
## SCCyes                              3.792
## CAEC.Q                              2.249
# Plot the 20 highest-ranked predictors of the random forest fit
rf_fit |>
  varImp() |>
  plot(top = 20)

SVM

# Radial-kernel SVM; the formula drops BMI, Height and Weight from the
# predictors. Seed is fixed so the 5-fold resampling is repeatable.
set.seed(25)

train_control = trainControl(method = "cv", number = 5)
svm_fit = train(
  NObeyesdad ~ . - BMI - Height - Weight,
  data = train_data,
  method = "svmRadial",
  trControl = train_control
)

# Predict on test_data and tabulate against the reference labels in y_test
svm_preds = predict(svm_fit, newdata = test_data)
confusionMatrix(data = svm_preds, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  71            13                  1
##   Normal_Weight                         4            54                  7
##   Overweight_Level_I                    6             9                 57
##   Overweight_Level_II                   0             4                  3
##   Obesity_Type_I                        0             5                 14
##   Obesity_Type_II                       0             1                  5
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   3              3               0
##   Normal_Weight                         9              6               3
##   Overweight_Level_I                    7              9               2
##   Overweight_Level_II                  45              6               3
##   Obesity_Type_I                       16             65               0
##   Obesity_Type_II                       7             15              81
##   Obesity_Type_III                      0              1               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      1
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  95
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7405          
##                  95% CI : (0.7045, 0.7743)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.697           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.8765              0.62791
## Specificity                              0.9637              0.94505
## Pos Pred Value                           0.7802              0.64286
## Neg Pred Value                           0.9815              0.94161
## Prevalence                               0.1282              0.13608
## Detection Rate                           0.1123              0.08544
## Detection Prevalence                     0.1440              0.13291
## Balanced Accuracy                        0.9201              0.78648
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                            0.65517                    0.51724
## Specificity                            0.93945                    0.97064
## Pos Pred Value                         0.63333                    0.73770
## Neg Pred Value                         0.94465                    0.92644
## Prevalence                             0.13766                    0.13766
## Detection Rate                         0.09019                    0.07120
## Detection Prevalence                   0.14241                    0.09652
## Balanced Accuracy                      0.79731                    0.74394
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.6190                 0.9101
## Specificity                         0.9336                 0.9466
## Pos Pred Value                      0.6500                 0.7364
## Neg Pred Value                      0.9248                 0.9847
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1028                 0.1282
## Detection Prevalence                0.1582                 0.1741
## Balanced Accuracy                   0.7763                 0.9284
##                      Class: Obesity_Type_III
## Sensitivity                           0.9794
## Specificity                           0.9981
## Pos Pred Value                        0.9896
## Neg Pred Value                        0.9963
## Prevalence                            0.1535
## Detection Rate                        0.1503
## Detection Prevalence                  0.1519
## Balanced Accuracy                     0.9888

XGBoost

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# Boosted trees via caret's "xgbTree" method; the formula drops BMI, Height
# and Weight from the predictors. Seed is fixed so the 5-fold resampling is
# repeatable.
set.seed(25)

train_control = trainControl(method = "cv", number = 5)
xgb_fit = train(
  NObeyesdad ~ . - BMI - Height - Weight,
  data = train_data,
  method = "xgbTree",
  trControl = train_control
)
# Predict on test_data and tabulate against the reference labels in y_test
xgb_preds = predict(xgb_fit, newdata = test_data)
confusionMatrix(data = xgb_preds, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  72             8                  1
##   Normal_Weight                         3            59                  9
##   Overweight_Level_I                    3            10                 60
##   Overweight_Level_II                   2             4                  3
##   Obesity_Type_I                        1             4                 10
##   Obesity_Type_II                       0             1                  3
##   Obesity_Type_III                      0             0                  1
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   1              3               0
##   Normal_Weight                         9              5               2
##   Overweight_Level_I                    4              3               1
##   Overweight_Level_II                  56              7               2
##   Obesity_Type_I                        9             83               0
##   Obesity_Type_II                       7              3              84
##   Obesity_Type_III                      1              1               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      1
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    0
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                         
##                Accuracy : 0.807         
##                  95% CI : (0.774, 0.837)
##     No Information Rate : 0.1661        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.7745        
##                                         
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.8889              0.68605
## Specificity                              0.9764              0.94689
## Pos Pred Value                           0.8471              0.67045
## Neg Pred Value                           0.9835              0.95037
## Prevalence                               0.1282              0.13608
## Detection Rate                           0.1139              0.09335
## Detection Prevalence                     0.1345              0.13924
## Balanced Accuracy                        0.9326              0.81647
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                            0.68966                    0.64368
## Specificity                            0.96147                    0.96697
## Pos Pred Value                         0.74074                    0.75676
## Neg Pred Value                         0.95100                    0.94444
## Prevalence                             0.13766                    0.13766
## Detection Rate                         0.09494                    0.08861
## Detection Prevalence                   0.12816                    0.11709
## Balanced Accuracy                      0.82556                    0.80533
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.7905                 0.9438
## Specificity                         0.9545                 0.9742
## Pos Pred Value                      0.7757                 0.8571
## Neg Pred Value                      0.9581                 0.9906
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1313                 0.1329
## Detection Prevalence                0.1693                 0.1551
## Balanced Accuracy                   0.8725                 0.9590
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           0.9944
## Pos Pred Value                        0.9697
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1566
## Balanced Accuracy                     0.9920

Without Weight (With BMI and Height)

Multinomial Logistic Regression

Collinearity check

# Pairwise correlations among BMI, Height and Weight in the training data
cor(train_data[["BMI"]], train_data[["Height"]])
## [1] 0.1314545
cor(train_data[["BMI"]], train_data[["Weight"]])
## [1] 0.933768
cor(train_data[["Weight"]], train_data[["Height"]])
## [1] 0.4654195
# Multinomial logistic regression on all predictors except Weight (BMI and
# Height are kept), tuned with 5-fold cross-validation. Seed is fixed so the
# resampling is repeatable.
set.seed(25)

train_control = trainControl(method = "cv", number = 5)

# Predictors are centered and scaled before fitting. `trace` is forwarded to
# the underlying fitter to silence its iteration log; spelled `FALSE` because
# the alias `F` is an ordinary variable that can be reassigned.
multi_fit2 = train(
  NObeyesdad ~ . - Weight,
  data = train_data,
  trControl = train_control,
  method = "multinom",
  preProcess = c("center", "scale"),
  trace = FALSE
)
# Predict on test_data and tabulate against the reference labels in y_test
multi_preds2 = predict(multi_fit2, test_data)
confusionMatrix(multi_preds2, y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  80             2                  0
##   Normal_Weight                         1            78                  0
##   Overweight_Level_I                    0             6                 82
##   Overweight_Level_II                   0             0                  5
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    4              0               0
##   Overweight_Level_II                  81              3               0
##   Obesity_Type_I                        2            100               0
##   Obesity_Type_II                       0              1              87
##   Obesity_Type_III                      0              1               2
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9557          
##                  95% CI : (0.9366, 0.9704)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9483          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9877               0.9070
## Specificity                              0.9964               0.9982
## Pos Pred Value                           0.9756               0.9873
## Neg Pred Value                           0.9982               0.9855
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1266               0.1234
## Detection Prevalence                     0.1297               0.1250
## Balanced Accuracy                        0.9920               0.9526
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9425                     0.9310
## Specificity                             0.9817                     0.9853
## Pos Pred Value                          0.8913                     0.9101
## Neg Pred Value                          0.9907                     0.9890
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1297                     0.1282
## Detection Prevalence                    0.1456                     0.1408
## Balanced Accuracy                       0.9621                     0.9582
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9524                 0.9775
## Specificity                         0.9962                 0.9963
## Pos Pred Value                      0.9804                 0.9775
## Neg Pred Value                      0.9906                 0.9963
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1582                 0.1377
## Detection Prevalence                0.1614                 0.1408
## Balanced Accuracy                   0.9743                 0.9869
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           0.9944
## Pos Pred Value                        0.9697
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1566
## Balanced Accuracy                     0.9920

Random Forest

# Random forest on all predictors except Weight (BMI and Height are kept).
# Seed is fixed so the 5-fold resampling is repeatable.
set.seed(25)

train_control = trainControl(method = "cv", number = 5)
rf_fit2 = train(
  NObeyesdad ~ . - Weight,
  data = train_data,
  method = "rf",
  trControl = train_control
)

# Predict on test_data and tabulate against the reference labels in y_test
rf_preds2 = predict(rf_fit2, newdata = test_data)
confusionMatrix(data = rf_preds2, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  80             0                  0
##   Normal_Weight                         1            86                  1
##   Overweight_Level_I                    0             0                 85
##   Overweight_Level_II                   0             0                  1
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    1              0               0
##   Overweight_Level_II                  86              2               0
##   Obesity_Type_I                        0            102               0
##   Obesity_Type_II                       0              1              89
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9873          
##                  95% CI : (0.9752, 0.9945)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9852          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9877               1.0000
## Specificity                              1.0000               0.9963
## Pos Pred Value                           1.0000               0.9773
## Neg Pred Value                           0.9982               1.0000
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1266               0.1361
## Detection Prevalence                     0.1266               0.1392
## Balanced Accuracy                        0.9938               0.9982
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9770                     0.9885
## Specificity                             0.9982                     0.9945
## Pos Pred Value                          0.9884                     0.9663
## Neg Pred Value                          0.9963                     0.9982
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1345                     0.1361
## Detection Prevalence                    0.1361                     0.1408
## Balanced Accuracy                       0.9876                     0.9915
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9714                 1.0000
## Specificity                         1.0000                 0.9963
## Pos Pred Value                      1.0000                 0.9780
## Neg Pred Value                      0.9943                 1.0000
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1614                 0.1408
## Detection Prevalence                0.1614                 0.1440
## Balanced Accuracy                   0.9857                 0.9982
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1519
## Balanced Accuracy                     0.9948
# Rank the predictors of the second random forest fit by importance
rf_fit2 |> varImp()
## rf variable importance
## 
##   only 20 most important variables shown (out of 26)
## 
##                                    Overall
## BMI                               100.0000
## GenderMale                         10.1440
## Age                                 5.5687
## Height                              2.9095
## family_history_with_overweightyes   2.2686
## FCVC.L                              2.1593
## CAEC.L                              1.6554
## CH2O                                1.3479
## CAEC.C                              1.2971
## TUE                                 1.2360
## FAF                                 1.2280
## CALC.C                              0.9033
## FAVCyes                             0.8686
## FCVC.Q                              0.7155
## NCP.C                               0.6948
## NCP.L                               0.6260
## CALC.Q                              0.5470
## MTRANSPublic_Transportation         0.4761
## CALC.L                              0.4218
## SCCyes                              0.2051
# Plot the 20 highest-ranked predictors of the second random forest fit
rf_fit2 |>
  varImp() |>
  plot(top = 20)

SVM

# Radial-kernel SVM on all predictors except Weight (BMI and Height are
# kept). Seed is fixed so the 5-fold resampling is repeatable.
set.seed(25)

train_control = trainControl(method = "cv", number = 5)
svm_fit2 = train(
  NObeyesdad ~ . - Weight,
  data = train_data,
  method = "svmRadial",
  trControl = train_control
)

# Predict on test_data and tabulate against the reference labels in y_test
svm_preds2 = predict(svm_fit2, newdata = test_data)
confusionMatrix(data = svm_preds2, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  78             8                  0
##   Normal_Weight                         3            65                  8
##   Overweight_Level_I                    0             9                 70
##   Overweight_Level_II                   0             4                  5
##   Obesity_Type_I                        0             0                  4
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         9              5               1
##   Overweight_Level_I                    7              1               1
##   Overweight_Level_II                  68              2               0
##   Obesity_Type_I                        3             91               1
##   Obesity_Type_II                       0              6              86
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    2
##   Obesity_Type_III                  95
## 
## Overall Statistics
##                                           
##                Accuracy : 0.875           
##                  95% CI : (0.8467, 0.8998)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8541          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9630               0.7558
## Specificity                              0.9855               0.9524
## Pos Pred Value                           0.9070               0.7143
## Neg Pred Value                           0.9945               0.9612
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1234               0.1028
## Detection Prevalence                     0.1361               0.1440
## Balanced Accuracy                        0.9742               0.8541
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.8046                     0.7816
## Specificity                             0.9670                     0.9798
## Pos Pred Value                          0.7955                     0.8608
## Neg Pred Value                          0.9688                     0.9656
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1108                     0.1076
## Detection Prevalence                    0.1392                     0.1250
## Balanced Accuracy                       0.8858                     0.8807
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.8667                 0.9663
## Specificity                         0.9848                 0.9853
## Pos Pred Value                      0.9192                 0.9149
## Neg Pred Value                      0.9737                 0.9944
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1440                 0.1361
## Detection Prevalence                0.1566                 0.1487
## Balanced Accuracy                   0.9257                 0.9758
##                      Class: Obesity_Type_III
## Sensitivity                           0.9794
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9963
## Prevalence                            0.1535
## Detection Rate                        0.1503
## Detection Prevalence                  0.1503
## Balanced Accuracy                     0.9897

XGBoost

# Boosted trees via caret's "xgbTree" method on all predictors except Weight
# (BMI and Height are kept). Seed is fixed so the 5-fold resampling is
# repeatable.
set.seed(25)

train_control = trainControl(method = "cv", number = 5)
xgb_fit2 = train(
  NObeyesdad ~ . - Weight,
  data = train_data,
  method = "xgbTree",
  trControl = train_control
)
# Predict on test_data and tabulate against the reference labels in y_test
xgb_preds2 = predict(xgb_fit2, newdata = test_data)
confusionMatrix(data = xgb_preds2, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  81             0                  0
##   Normal_Weight                         0            86                  1
##   Overweight_Level_I                    0             0                 86
##   Overweight_Level_II                   0             0                  0
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    0              0               0
##   Overweight_Level_II                  87              2               0
##   Obesity_Type_I                        0            102               0
##   Obesity_Type_II                       0              1              89
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9921          
##                  95% CI : (0.9816, 0.9974)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9908          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              1.0000               1.0000
## Specificity                              1.0000               0.9982
## Pos Pred Value                           1.0000               0.9885
## Neg Pred Value                           1.0000               1.0000
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1282               0.1361
## Detection Prevalence                     0.1282               0.1377
## Balanced Accuracy                        1.0000               0.9991
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9885                     1.0000
## Specificity                             1.0000                     0.9963
## Pos Pred Value                          1.0000                     0.9775
## Neg Pred Value                          0.9982                     1.0000
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1361                     0.1377
## Detection Prevalence                    0.1361                     0.1408
## Balanced Accuracy                       0.9943                     0.9982
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9714                 1.0000
## Specificity                         1.0000                 0.9963
## Pos Pred Value                      1.0000                 0.9780
## Neg Pred Value                      0.9943                 1.0000
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1614                 0.1408
## Detection Prevalence                0.1614                 0.1440
## Balanced Accuracy                   0.9857                 0.9982
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1519
## Balanced Accuracy                     0.9948

Without BMI (With Height and Weight)

Multinomial Logistic Regression

Collinearity check

# Pairwise Pearson correlations between BMI, Height and Weight in train_data.
with(train_data, cor(BMI, Height))
## [1] 0.1314545
with(train_data, cor(BMI, Weight))
## [1] 0.933768
with(train_data, cor(Weight, Height))
## [1] 0.4654195
# Multinomial logistic regression on every predictor except BMI.
# Seed is fixed so the cross-validation fold assignment is reproducible.
set.seed(25)

# 5-fold cross-validation for resampling.
train_control <- trainControl(method = "cv", number = 5)

# Predictors are centered and scaled before fitting.
# `trace = FALSE` (not `F`, which is an ordinary reassignable variable) is
# forwarded to the underlying model to silence its iteration log.
multi_fit3 <- train(
  NObeyesdad ~ . - BMI,
  data = train_data,
  method = "multinom",
  trControl = train_control,
  preProcess = c("center", "scale"),
  trace = FALSE
)

# Predict on test_data and tabulate the predictions against y_test.
multi_preds3 <- predict(multi_fit3, newdata = test_data)
confusionMatrix(data = multi_preds3, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  80             2                  0
##   Normal_Weight                         1            77                  0
##   Overweight_Level_I                    0             7                 81
##   Overweight_Level_II                   0             0                  6
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    2              0               0
##   Overweight_Level_II                  84              2               0
##   Obesity_Type_I                        1             99               2
##   Obesity_Type_II                       0              1              87
##   Obesity_Type_III                      0              3               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9557          
##                  95% CI : (0.9366, 0.9704)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9483          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9877               0.8953
## Specificity                              0.9964               0.9982
## Pos Pred Value                           0.9756               0.9872
## Neg Pred Value                           0.9982               0.9838
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1266               0.1218
## Detection Prevalence                     0.1297               0.1234
## Balanced Accuracy                        0.9920               0.9468
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9310                     0.9655
## Specificity                             0.9835                     0.9853
## Pos Pred Value                          0.9000                     0.9130
## Neg Pred Value                          0.9889                     0.9944
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1282                     0.1329
## Detection Prevalence                    0.1424                     0.1456
## Balanced Accuracy                       0.9573                     0.9754
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9429                 0.9775
## Specificity                         0.9943                 0.9963
## Pos Pred Value                      0.9706                 0.9775
## Neg Pred Value                      0.9887                 0.9963
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1566                 0.1377
## Detection Prevalence                0.1614                 0.1408
## Balanced Accuracy                   0.9686                 0.9869
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           0.9944
## Pos Pred Value                        0.9697
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1566
## Balanced Accuracy                     0.9920

Random Forest

# Random forest on every predictor except BMI, resampled with 5-fold CV.
set.seed(25)

train_control <- trainControl(method = "cv", number = 5)
rf_fit3 <- train(
  NObeyesdad ~ . - BMI,
  data = train_data,
  method = "rf",
  trControl = train_control
)

# Predict on test_data and tabulate the predictions against y_test.
rf_preds3 <- predict(rf_fit3, newdata = test_data)
confusionMatrix(data = rf_preds3, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  81             3                  0
##   Normal_Weight                         0            79                  4
##   Overweight_Level_I                    0             4                 81
##   Overweight_Level_II                   0             0                  2
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    0              1               0
##   Overweight_Level_II                  83              4               0
##   Obesity_Type_I                        4             99               4
##   Obesity_Type_II                       0              1              85
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     1
##   Obesity_Type_II                    0
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9557          
##                  95% CI : (0.9366, 0.9704)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9482          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              1.0000               0.9186
## Specificity                              0.9946               0.9927
## Pos Pred Value                           0.9643               0.9518
## Neg Pred Value                           1.0000               0.9872
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1282               0.1250
## Detection Prevalence                     0.1329               0.1313
## Balanced Accuracy                        0.9973               0.9556
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9310                     0.9540
## Specificity                             0.9908                     0.9890
## Pos Pred Value                          0.9419                     0.9326
## Neg Pred Value                          0.9890                     0.9926
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1282                     0.1313
## Detection Prevalence                    0.1361                     0.1408
## Balanced Accuracy                       0.9609                     0.9715
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9429                 0.9551
## Specificity                         0.9829                 0.9982
## Pos Pred Value                      0.9167                 0.9884
## Neg Pred Value                      0.9885                 0.9927
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1566                 0.1345
## Detection Prevalence                0.1709                 0.1361
## Balanced Accuracy                   0.9629                 0.9766
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1519
## Balanced Accuracy                     0.9948
# Print the variable importance table for rf_fit3.
rf_fit3 |> varImp()
## rf variable importance
## 
##   only 20 most important variables shown (out of 26)
## 
##                                    Overall
## Weight                            100.0000
## Height                             49.3916
## GenderMale                         34.7188
## Age                                10.3770
## FAVCyes                             5.0514
## FAF                                 3.0674
## CH2O                                2.4131
## CAEC.L                              2.2343
## TUE                                 1.6528
## CAEC.C                              1.4031
## NCP.L                               0.8449
## FCVC.Q                              0.7469
## CALC.Q                              0.7064
## CALC.L                              0.6974
## NCP.C                               0.6606
## family_history_with_overweightyes   0.6596
## SMOKEyes                            0.5660
## FCVC.L                              0.5651
## MTRANSPublic_Transportation         0.4474
## CALC.C                              0.4210
# Plot the 20 highest-ranked variables from rf_fit3.
rf_fit3 |> varImp() |> plot(top = 20)

SVM

# Radial-kernel SVM on every predictor except BMI, resampled with 5-fold CV.
set.seed(25)

train_control <- trainControl(method = "cv", number = 5)
svm_fit3 <- train(
  NObeyesdad ~ . - BMI,
  data = train_data,
  method = "svmRadial",
  trControl = train_control
)

# Predict on test_data and tabulate the predictions against y_test.
svm_preds3 <- predict(svm_fit3, newdata = test_data)
confusionMatrix(data = svm_preds3, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  77            10                  0
##   Normal_Weight                         4            60                  8
##   Overweight_Level_I                    0             9                 67
##   Overweight_Level_II                   0             5                  5
##   Obesity_Type_I                        0             2                  7
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   1              1               0
##   Normal_Weight                         7              5               1
##   Overweight_Level_I                    9              0               1
##   Overweight_Level_II                  65              2               0
##   Obesity_Type_I                        5             91               1
##   Obesity_Type_II                       0              6              86
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      1
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  95
## 
## Overall Statistics
##                                           
##                Accuracy : 0.856           
##                  95% CI : (0.8262, 0.8825)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8318          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9506              0.69767
## Specificity                              0.9782              0.95238
## Pos Pred Value                           0.8652              0.69767
## Neg Pred Value                           0.9926              0.95238
## Prevalence                               0.1282              0.13608
## Detection Rate                           0.1218              0.09494
## Detection Prevalence                     0.1408              0.13608
## Balanced Accuracy                        0.9644              0.82503
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.7701                     0.7471
## Specificity                             0.9651                     0.9780
## Pos Pred Value                          0.7791                     0.8442
## Neg Pred Value                          0.9634                     0.9604
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1060                     0.1028
## Detection Prevalence                    0.1361                     0.1218
## Balanced Accuracy                       0.8676                     0.8626
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.8667                 0.9663
## Specificity                         0.9715                 0.9871
## Pos Pred Value                      0.8585                 0.9247
## Neg Pred Value                      0.9734                 0.9944
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1440                 0.1361
## Detection Prevalence                0.1677                 0.1472
## Balanced Accuracy                   0.9191                 0.9767
##                      Class: Obesity_Type_III
## Sensitivity                           0.9794
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9963
## Prevalence                            0.1535
## Detection Rate                        0.1503
## Detection Prevalence                  0.1503
## Balanced Accuracy                     0.9897

XGBoost

# Gradient-boosted trees (xgbTree) on every predictor except BMI,
# resampled with 5-fold CV.
set.seed(25)

train_control <- trainControl(method = "cv", number = 5)
xgb_fit3 <- train(
  NObeyesdad ~ . - BMI,
  data = train_data,
  method = "xgbTree",
  trControl = train_control
)

# Predict on test_data and tabulate the predictions against y_test.
xgb_preds3 <- predict(xgb_fit3, newdata = test_data)
confusionMatrix(data = xgb_preds3, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  81             0                  0
##   Normal_Weight                         0            82                  4
##   Overweight_Level_I                    0             4                 82
##   Overweight_Level_II                   0             0                  1
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    0              1               0
##   Overweight_Level_II                  86              0               0
##   Obesity_Type_I                        0            103               1
##   Obesity_Type_II                       1              1              88
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9778          
##                  95% CI : (0.9631, 0.9878)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9741          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              1.0000               0.9535
## Specificity                              1.0000               0.9927
## Pos Pred Value                           1.0000               0.9535
## Neg Pred Value                           1.0000               0.9927
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1282               0.1297
## Detection Prevalence                     0.1282               0.1361
## Balanced Accuracy                        1.0000               0.9731
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9425                     0.9885
## Specificity                             0.9908                     0.9982
## Pos Pred Value                          0.9425                     0.9885
## Neg Pred Value                          0.9908                     0.9982
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1297                     0.1361
## Detection Prevalence                    0.1377                     0.1377
## Balanced Accuracy                       0.9667                     0.9933
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9810                 0.9888
## Specificity                         0.9981                 0.9945
## Pos Pred Value                      0.9904                 0.9670
## Neg Pred Value                      0.9962                 0.9982
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1630                 0.1392
## Detection Prevalence                0.1646                 0.1440
## Balanced Accuracy                   0.9895                 0.9916
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1519
## Balanced Accuracy                     0.9948

Without Height or Weight (With BMI)

Multinomial Logistic Regression

Collinearity check

# Pairwise Pearson correlations between BMI, Height and Weight in train_data.
with(train_data, cor(BMI, Height))
## [1] 0.1314545
with(train_data, cor(BMI, Weight))
## [1] 0.933768
with(train_data, cor(Weight, Height))
## [1] 0.4654195
# Multinomial logistic regression on every predictor except Weight and Height
# (BMI is kept). Seed is fixed so the cross-validation fold assignment is
# reproducible.
set.seed(25)

# 5-fold cross-validation for resampling.
train_control <- trainControl(method = "cv", number = 5)

# Predictors are centered and scaled before fitting.
# `trace = FALSE` (not `F`, which is an ordinary reassignable variable) is
# forwarded to the underlying model to silence its iteration log.
multi_fit4 <- train(
  NObeyesdad ~ . - Weight - Height,
  data = train_data,
  method = "multinom",
  trControl = train_control,
  preProcess = c("center", "scale"),
  trace = FALSE
)

# Predict on test_data and tabulate the predictions against y_test.
multi_preds4 <- predict(multi_fit4, newdata = test_data)
confusionMatrix(data = multi_preds4, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  80             3                  0
##   Normal_Weight                         1            76                  1
##   Overweight_Level_I                    0             7                 80
##   Overweight_Level_II                   0             0                  6
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    3              0               0
##   Overweight_Level_II                  83              4               0
##   Obesity_Type_I                        1             99               0
##   Obesity_Type_II                       0              1              88
##   Obesity_Type_III                      0              1               1
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9525          
##                  95% CI : (0.9329, 0.9677)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9446          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9877               0.8837
## Specificity                              0.9946               0.9963
## Pos Pred Value                           0.9639               0.9744
## Neg Pred Value                           0.9982               0.9819
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1266               0.1203
## Detection Prevalence                     0.1313               0.1234
## Balanced Accuracy                        0.9911               0.9400
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9195                     0.9540
## Specificity                             0.9817                     0.9817
## Pos Pred Value                          0.8889                     0.8925
## Neg Pred Value                          0.9871                     0.9926
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1266                     0.1313
## Detection Prevalence                    0.1424                     0.1472
## Balanced Accuracy                       0.9506                     0.9678
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9429                 0.9888
## Specificity                         0.9981                 0.9963
## Pos Pred Value                      0.9900                 0.9778
## Neg Pred Value                      0.9887                 0.9982
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1566                 0.1392
## Detection Prevalence                0.1582                 0.1424
## Balanced Accuracy                   0.9705                 0.9925
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           0.9963
## Pos Pred Value                        0.9796
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1551
## Balanced Accuracy                     0.9930

Random Forest

# Random forest on every predictor except Weight and Height (BMI is kept),
# resampled with 5-fold CV.
set.seed(25)

train_control <- trainControl(method = "cv", number = 5)
rf_fit4 <- train(
  NObeyesdad ~ . - Weight - Height,
  data = train_data,
  method = "rf",
  trControl = train_control
)

# Predict on test_data and tabulate the predictions against y_test.
rf_preds4 <- predict(rf_fit4, newdata = test_data)
confusionMatrix(data = rf_preds4, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  80             0                  0
##   Normal_Weight                         1            86                  2
##   Overweight_Level_I                    0             0                 84
##   Overweight_Level_II                   0             0                  1
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    1              0               0
##   Overweight_Level_II                  86              2               0
##   Obesity_Type_I                        0            101               0
##   Obesity_Type_II                       0              2              89
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9842          
##                  95% CI : (0.9711, 0.9924)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9815          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9877               1.0000
## Specificity                              1.0000               0.9945
## Pos Pred Value                           1.0000               0.9663
## Neg Pred Value                           0.9982               1.0000
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1266               0.1361
## Detection Prevalence                     0.1266               0.1408
## Balanced Accuracy                        0.9938               0.9973
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9655                     0.9885
## Specificity                             0.9982                     0.9945
## Pos Pred Value                          0.9882                     0.9663
## Neg Pred Value                          0.9945                     0.9982
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1329                     0.1361
## Detection Prevalence                    0.1345                     0.1408
## Balanced Accuracy                       0.9818                     0.9915
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9619                 1.0000
## Specificity                         1.0000                 0.9945
## Pos Pred Value                      1.0000                 0.9674
## Neg Pred Value                      0.9925                 1.0000
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1598                 0.1408
## Detection Prevalence                0.1598                 0.1456
## Balanced Accuracy                   0.9810                 0.9972
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1519
## Balanced Accuracy                     0.9948
# Print the variable importance table for rf_fit4.
rf_fit4 |> varImp()
## rf variable importance
## 
##   only 20 most important variables shown (out of 25)
## 
##                                    Overall
## BMI                               100.0000
## GenderMale                         10.0820
## Age                                 6.7214
## family_history_with_overweightyes   2.6714
## FCVC.L                              2.4657
## FAF                                 1.9038
## CH2O                                1.8220
## CAEC.L                              1.8212
## TUE                                 1.6335
## CAEC.C                              1.4986
## CALC.C                              1.0058
## FAVCyes                             0.9099
## FCVC.Q                              0.8375
## NCP.C                               0.8042
## NCP.L                               0.7564
## MTRANSPublic_Transportation         0.6220
## CALC.Q                              0.6153
## CALC.L                              0.5338
## SCCyes                              0.3673
## NCP.Q                               0.2030
# Plot the 20 most important predictors of `rf_fit4`.
rf_fit4 |>
  varImp() |>
  plot(top = 20)

SVM

# Fix the RNG seed so the cross-validation resampling is reproducible.
set.seed(25)

# 5-fold cross-validation as the resampling scheme for caret::train().
train_control <- trainControl(method = "cv", number = 5)

# Radial-kernel SVM on every predictor except Weight and Height.
svm_fit4 <- train(
  NObeyesdad ~ . - Weight - Height,
  data = train_data,
  method = "svmRadial",
  trControl = train_control
)

# Predict on the test data and tabulate predictions against `y_test`.
svm_preds4 <- predict(svm_fit4, newdata = test_data)
confusionMatrix(data = svm_preds4, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  79            10                  0
##   Normal_Weight                         2            63                  8
##   Overweight_Level_I                    0             9                 73
##   Overweight_Level_II                   0             4                  4
##   Obesity_Type_I                        0             0                  2
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         9              5               2
##   Overweight_Level_I                    6              1               0
##   Overweight_Level_II                  68              1               0
##   Obesity_Type_I                        4             89               1
##   Obesity_Type_II                       0              9              86
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    2
##   Obesity_Type_III                  95
## 
## Overall Statistics
##                                           
##                Accuracy : 0.875           
##                  95% CI : (0.8467, 0.8998)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8541          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9753              0.73256
## Specificity                              0.9819              0.95238
## Pos Pred Value                           0.8876              0.70787
## Neg Pred Value                           0.9963              0.95764
## Prevalence                               0.1282              0.13608
## Detection Rate                           0.1250              0.09968
## Detection Prevalence                     0.1408              0.14082
## Balanced Accuracy                        0.9786              0.84247
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.8391                     0.7816
## Specificity                             0.9706                     0.9835
## Pos Pred Value                          0.8202                     0.8831
## Neg Pred Value                          0.9742                     0.9658
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1155                     0.1076
## Detection Prevalence                    0.1408                     0.1218
## Balanced Accuracy                       0.9049                     0.8825
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.8476                 0.9663
## Specificity                         0.9867                 0.9797
## Pos Pred Value                      0.9271                 0.8866
## Neg Pred Value                      0.9701                 0.9944
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1408                 0.1361
## Detection Prevalence                0.1519                 0.1535
## Balanced Accuracy                   0.9172                 0.9730
##                      Class: Obesity_Type_III
## Sensitivity                           0.9794
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9963
## Prevalence                            0.1535
## Detection Rate                        0.1503
## Detection Prevalence                  0.1503
## Balanced Accuracy                     0.9897

XGBoost

# Same seed as the other models so the resampling is comparable.
set.seed(25)

# 5-fold cross-validation as the resampling scheme for caret::train().
train_control <- trainControl(method = "cv", number = 5)

# Gradient-boosted trees on every predictor except Weight and Height.
xgb_fit4 <- train(
  NObeyesdad ~ . - Weight - Height,
  data = train_data,
  method = "xgbTree",
  trControl = train_control
)

# Predict on the test data and tabulate predictions against `y_test`.
xgb_preds4 <- xgb_fit4 |> predict(test_data)
confusionMatrix(data = xgb_preds4, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  81             0                  0
##   Normal_Weight                         0            86                  1
##   Overweight_Level_I                    0             0                 86
##   Overweight_Level_II                   0             0                  0
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    0              0               0
##   Overweight_Level_II                  87              2               0
##   Obesity_Type_I                        0            102               2
##   Obesity_Type_II                       0              1              87
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    0
##   Obesity_Type_III                  97
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9905          
##                  95% CI : (0.9795, 0.9965)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9889          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              1.0000               1.0000
## Specificity                              1.0000               0.9982
## Pos Pred Value                           1.0000               0.9885
## Neg Pred Value                           1.0000               1.0000
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1282               0.1361
## Detection Prevalence                     0.1282               0.1377
## Balanced Accuracy                        1.0000               0.9991
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9885                     1.0000
## Specificity                             1.0000                     0.9963
## Pos Pred Value                          1.0000                     0.9775
## Neg Pred Value                          0.9982                     1.0000
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1361                     0.1377
## Detection Prevalence                    0.1361                     0.1408
## Balanced Accuracy                       0.9943                     0.9982
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9714                 0.9775
## Specificity                         0.9962                 0.9982
## Pos Pred Value                      0.9808                 0.9886
## Neg Pred Value                      0.9943                 0.9963
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1614                 0.1377
## Detection Prevalence                0.1646                 0.1392
## Balanced Accuracy                   0.9838                 0.9878
##                      Class: Obesity_Type_III
## Sensitivity                           1.0000
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        1.0000
## Prevalence                            0.1535
## Detection Rate                        0.1535
## Detection Prevalence                  0.1535
## Balanced Accuracy                     1.0000

Comparing Metrics of Models

library(knitr)
# Pairwise correlations (cor() default method) among BMI, Height and Weight
# in the training data.
with(train_data, cor(BMI, Height))
## [1] 0.1314545
with(train_data, cor(BMI, Weight))
## [1] 0.933768
with(train_data, cor(Weight, Height))
## [1] 0.4654195
## [1] 0.4654195

A reminder that BMI and Weight are highly correlated, and that Weight and Height are moderately correlated. BMI was computed from Height and Weight and appended to the data, so Weight should be dropped to reduce multicollinearity and redundancy.

# Metrics for the models trained without BMI, Height, or Weight.
results_table |>
  kable(caption = "Performance Metrics Without `BMI`, `Height`, or `Weight`")
Performance Metrics Without BMI, Height, or Weight
Model Accuracy Sensitivity Specificity Pos_Pred_Value
Accuracy…1 Multinomial 0.6170886 0.6101318 0.9360075 0.6141099
Accuracy…2 RandomForest 0.8275316 0.8256730 0.9712861 0.8234711
Accuracy…3 SVM 0.7405063 0.7407722 0.9567370 0.7385803
Accuracy…4 XGBoost 0.8069620 0.8046080 0.9678289 0.8025074
# Metrics for the models trained with BMI and Height but without Weight.
results_table2 |>
  kable(caption = "Performance Metrics With `BMI` and `Height` - Without `Weight`")
Performance Metrics With BMI and Height - Without Weight
Model Accuracy Sensitivity Specificity Pos_Pred_Value
Accuracy…1 Multinomial 0.9556962 0.9553991 0.9926322 0.9559979
Accuracy…2 RandomForest 0.9873418 0.9877558 0.9979020 0.9871370
Accuracy…3 SVM 0.8750000 0.8739034 0.9792482 0.8730803
Accuracy…4 XGBoost 0.9920886 0.9928036 0.9986879 0.9920080
# Metrics for the models trained with Height and Weight but without BMI.
results_table3 |>
  kable(caption = "Performance Metrics With `Height` and `Weight` - Without `BMI`")
Performance Metrics With Height and Weight - Without BMI
Model Accuracy Sensitivity Specificity Pos_Pred_Value
Accuracy…1 Multinomial 0.9556962 0.9556615 0.9926233 0.9562351
Accuracy…2 RandomForest 0.9556962 0.9558952 0.9925895 0.9565109
Accuracy…3 SVM 0.8560127 0.8539819 0.9760525 0.8527558
Accuracy…4 XGBoost 0.9778481 0.9777043 0.9963203 0.9774201
# Metrics for the models trained with BMI but without Weight or Height.
# `BMI` is back-ticked in the caption so it is formatted like the column
# names in the other three table captions.
kable(results_table4, caption = "Performance Metrics With `BMI` - Without `Weight` or `Height`")
Performance Metrics With BMI - Without Weight or Height
Model Accuracy Sensitivity Specificity Pos_Pred_Value
Accuracy…1 Multinomial 0.9525316 0.9523215 0.9921251 0.9524209
Accuracy…2 RandomForest 0.9841772 0.9847533 0.9973773 0.9840301
Accuracy…3 SVM 0.8750000 0.8745499 0.9792600 0.8732184
Accuracy…4 XGBoost 0.9905063 0.9910661 0.9984089 0.9907771

The XGBoost model has the best results.

Choosing not to tune the SVM because the Random Forest and XGBoost have great predictive power for the data.

Further Model Simplification

# Print the variable-importance ranking of the fitted XGBoost model `xgb_fit2`.
xgb_fit2 |>
  varImp()
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 26)
## 
##                                    Overall
## BMI                               100.0000
## GenderMale                         11.3174
## Age                                 4.4123
## CAEC.C                              2.9276
## CH2O                                2.7932
## Height                              2.5677
## TUE                                 2.1718
## family_history_with_overweightyes   1.8793
## FAF                                 1.8149
## FCVC.L                              1.7573
## CAEC.L                              1.6371
## NCP.L                               0.9623
## NCP.C                               0.7320
## FAVCyes                             0.7027
## MTRANSPublic_Transportation         0.5986
## CALC.Q                              0.4823
## CALC.L                              0.4071
## SCCyes                              0.2357
## CALC.C                              0.2179
## FCVC.Q                              0.2077

Building a model based on top 5 important predictors

# Same seed as the other models so the resampling is comparable.
set.seed(25)

# 5-fold cross-validation as the resampling scheme for caret::train().
train_control <- trainControl(method = "cv", number = 5)

# Reduced XGBoost model: only the variables behind the five highest-ranked
# terms of varImp(xgb_fit2) (BMI, Gender, Age, CAEC, CH2O).
xgb_fit0 <- train(
  NObeyesdad ~ BMI + Gender + Age + CAEC + CH2O,
  data = train_data,
  method = "xgbTree",
  trControl = train_control
)

# Predict on the test data and tabulate predictions against `y_test`.
xgb_preds0 <- predict(xgb_fit0, newdata = test_data)
confusionMatrix(data = xgb_preds0, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  80             1                  0
##   Normal_Weight                         1            85                  2
##   Overweight_Level_I                    0             0                 83
##   Overweight_Level_II                   0             0                  2
##   Obesity_Type_I                        0             0                  0
##   Obesity_Type_II                       0             0                  0
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   0              0               0
##   Normal_Weight                         0              0               0
##   Overweight_Level_I                    2              0               0
##   Overweight_Level_II                  84              3               0
##   Obesity_Type_I                        1            100               1
##   Obesity_Type_II                       0              2              88
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      0
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    1
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9747          
##                  95% CI : (0.9592, 0.9855)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9704          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.9877               0.9884
## Specificity                              0.9982               0.9945
## Pos Pred Value                           0.9877               0.9659
## Neg Pred Value                           0.9982               0.9982
## Prevalence                               0.1282               0.1361
## Detection Rate                           0.1266               0.1345
## Detection Prevalence                     0.1282               0.1392
## Balanced Accuracy                        0.9929               0.9914
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.9540                     0.9655
## Specificity                             0.9963                     0.9908
## Pos Pred Value                          0.9765                     0.9438
## Neg Pred Value                          0.9927                     0.9945
## Prevalence                              0.1377                     0.1377
## Detection Rate                          0.1313                     0.1329
## Detection Prevalence                    0.1345                     0.1408
## Balanced Accuracy                       0.9752                     0.9782
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.9524                 0.9888
## Specificity                         0.9962                 0.9945
## Pos Pred Value                      0.9804                 0.9670
## Neg Pred Value                      0.9906                 0.9982
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1582                 0.1392
## Detection Prevalence                0.1614                 0.1440
## Balanced Accuracy                   0.9743                 0.9916
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1519
## Balanced Accuracy                     0.9948
# Confusion matrices for the full (xgb_fit2) and reduced (xgb_fit0) models
cm_xgb2 <- confusionMatrix(data = xgb_preds2, reference = y_test)
cm_xgb0 <- confusionMatrix(data = xgb_preds0, reference = y_test)

# Overall accuracy of each model
acc_xgb2 <- cm_xgb2$overall["Accuracy"]
acc_xgb0 <- cm_xgb0$overall["Accuracy"]

# Per-class sensitivity, specificity and positive predictive value
cm_xgb2_metrics <- cm_xgb2$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]
cm_xgb0_metrics <- cm_xgb0$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]

# One row per model: accuracy plus the unweighted mean of each per-class
# metric. The helper lives inside local() so it adds no new global names.
metrics_comparison <- local({
  per_model <- list(cm_xgb2_metrics, cm_xgb0_metrics)
  class_mean <- function(metric) {
    vapply(per_model, \(m) mean(m[, metric], na.rm = TRUE), numeric(1))
  }
  data.frame(
    Model = c(
      "xgb_fit2 (All Predictors with BMI and Height)",
      "xgb_fit0 (Top 5 Predictors from xgb_fit2)"
    ),
    Accuracy = c(acc_xgb2, acc_xgb0),
    Sensitivity = class_mean("Sensitivity"),
    Specificity = class_mean("Specificity"),
    PPV = class_mean("Pos Pred Value")
  )
})

# Render the comparison, rounded to four decimals
knitr::kable(
  metrics_comparison,
  digits = 4,
  caption = "Comparison of Full vs Reduced XGBoost Models"
)
Comparison of Full vs Reduced XGBoost Models
Model Accuracy Sensitivity Specificity PPV
xgb_fit2 (All Predictors with BMI and Height) 0.9921 0.9928 0.9987 0.9920
xgb_fit0 (Top 5 Predictors from xgb_fit2) 0.9747 0.9752 0.9958 0.9745

The drop-off in metrics is relatively small, while the number of predictors dropped from 17 to 5.

# Confusion matrices for the three XGBoost variants
cm_xgb2 <- confusionMatrix(data = xgb_preds2, reference = y_test)
cm_xgb0 <- confusionMatrix(data = xgb_preds0, reference = y_test)
# Model excluding Height, Weight, BMI
cm_xgb <- confusionMatrix(data = xgb_preds, reference = y_test)

# Overall accuracy of each model
acc_xgb2 <- cm_xgb2$overall["Accuracy"]
acc_xgb0 <- cm_xgb0$overall["Accuracy"]
acc_xgb <- cm_xgb$overall["Accuracy"]

# Per-class sensitivity, specificity and positive predictive value
cm_xgb2_metrics <- cm_xgb2$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]
cm_xgb0_metrics <- cm_xgb0$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]
cm_xgb_metrics <- cm_xgb$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]

# One row per model: accuracy plus the unweighted mean of each per-class
# metric. The helper lives inside local() so it adds no new global names.
metrics_comparison <- local({
  per_model <- list(cm_xgb2_metrics, cm_xgb0_metrics, cm_xgb_metrics)
  class_mean <- function(metric) {
    vapply(per_model, \(m) mean(m[, metric], na.rm = TRUE), numeric(1))
  }
  data.frame(
    Model = c(
      "xgb_fit2 (All Predictors with BMI and Height)",
      "xgb_fit0 (Only Top 5 Predictors from xgb_fit2)",
      "xgb_fit (All Predictors Without Height, Weight, and BMI)"
    ),
    Accuracy = c(acc_xgb2, acc_xgb0, acc_xgb),
    Sensitivity = class_mean("Sensitivity"),
    Specificity = class_mean("Specificity"),
    PPV = class_mean("Pos Pred Value")
  )
})

# Render the comparison, rounded to four decimals
knitr::kable(
  metrics_comparison,
  digits = 4,
  caption = "Comparison of XGBoost Models with Varying Predictors"
)
Comparison of XGBoost Models with Varying Predictors
Model Accuracy Sensitivity Specificity PPV
xgb_fit2 (All Predictors with BMI and Height) 0.9921 0.9928 0.9987 0.9920
xgb_fit0 (Only Top 5 Predictors from xgb_fit2) 0.9747 0.9752 0.9958 0.9745
xgb_fit (All Predictors Without Height, Weight, and BMI) 0.8070 0.8046 0.9678 0.8025

When dropping the most obvious predictors, the predictive model still has an 80% accuracy.

Comparison of Full vs Reduced XGBoost Models (with F1 Score)
Model Accuracy Sensitivity Specificity PPV F1_Score
xgb_fit2 (All Predictors with BMI and Height) 0.9921 0.9928 0.9987 0.9920 0.9923
xgb_fit0 (Only Top 5 Predictors from xgb_fit2) 0.9747 0.9752 0.9958 0.9745 0.9747
xgb_fit (All Predictors Without Height, Weight, and BMI) 0.8070 0.8046 0.9678 0.8025 0.8024

F1 score was found and is in line with what the best version of the predictive model is. Using all predictors (except Weight due to redundancy with BMI) has the best predictive power, but the trade off for a much simpler model may be worth it because only including the top 5 predictors from the full model still trains a great model with high predictive power.

Lastly, even though BMI was calculated with both Height and Weight, the model that includes BMI excluding Height and Weight does not perform as well as when Height is added in, leading to the conclusion that Height does help capture additional variability in the data.

Exploring Importance of Less Obvious Predictors

While BMI, Height, and Weight emerged as the most obvious predictive features for obesity classification due to the definition of obesity, I further explored model performance under constrained scenarios — specifically, omitting dominant predictors — to assess the value of behavioral and lifestyle variables independently. This allowed evaluation of secondary features’ standalone contribution and model robustness in data-limited settings.

Among the models trained without Height, Weight, and BMI, the random forest had the highest accuracy, so it will be used here.

rf_fit was trained earlier without Height, Weight, and BMI to assess the predictive power of the less obvious predictors.

# Test-set confusion matrix for the random forest's predictions `rf_preds`.
confusionMatrix(data = rf_preds, reference = y_test)
## Confusion Matrix and Statistics
## 
##                      Reference
## Prediction            Insufficient_Weight Normal_Weight Overweight_Level_I
##   Insufficient_Weight                  72            11                  3
##   Normal_Weight                         4            59                  4
##   Overweight_Level_I                    4             6                 67
##   Overweight_Level_II                   0             6                  4
##   Obesity_Type_I                        1             3                  6
##   Obesity_Type_II                       0             1                  3
##   Obesity_Type_III                      0             0                  0
##                      Reference
## Prediction            Overweight_Level_II Obesity_Type_I Obesity_Type_II
##   Insufficient_Weight                   1              2               0
##   Normal_Weight                         7              5               1
##   Overweight_Level_I                    3              4               0
##   Overweight_Level_II                  61              7               1
##   Obesity_Type_I                        8             84               3
##   Obesity_Type_II                       7              3              84
##   Obesity_Type_III                      0              0               0
##                      Reference
## Prediction            Obesity_Type_III
##   Insufficient_Weight                0
##   Normal_Weight                      1
##   Overweight_Level_I                 0
##   Overweight_Level_II                0
##   Obesity_Type_I                     0
##   Obesity_Type_II                    0
##   Obesity_Type_III                  96
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8275          
##                  95% CI : (0.7958, 0.8562)
##     No Information Rate : 0.1661          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7986          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity                              0.8889              0.68605
## Specificity                              0.9691              0.95971
## Pos Pred Value                           0.8090              0.72840
## Neg Pred Value                           0.9834              0.95100
## Prevalence                               0.1282              0.13608
## Detection Rate                           0.1139              0.09335
## Detection Prevalence                     0.1408              0.12816
## Balanced Accuracy                        0.9290              0.82288
##                      Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity                             0.7701                    0.70115
## Specificity                             0.9688                    0.96697
## Pos Pred Value                          0.7976                    0.77215
## Neg Pred Value                          0.9635                    0.95298
## Prevalence                              0.1377                    0.13766
## Detection Rate                          0.1060                    0.09652
## Detection Prevalence                    0.1329                    0.12500
## Balanced Accuracy                       0.8695                    0.83406
##                      Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity                         0.8000                 0.9438
## Specificity                         0.9602                 0.9742
## Pos Pred Value                      0.8000                 0.8571
## Neg Pred Value                      0.9602                 0.9906
## Prevalence                          0.1661                 0.1408
## Detection Rate                      0.1329                 0.1329
## Detection Prevalence                0.1661                 0.1551
## Balanced Accuracy                   0.8801                 0.9590
##                      Class: Obesity_Type_III
## Sensitivity                           0.9897
## Specificity                           1.0000
## Pos Pred Value                        1.0000
## Neg Pred Value                        0.9981
## Prevalence                            0.1535
## Detection Rate                        0.1519
## Detection Prevalence                  0.1519
## Balanced Accuracy                     0.9948
# Print the variable-importance ranking of the fitted random forest `rf_fit`.
rf_fit |>
  varImp()
## rf variable importance
## 
##   only 20 most important variables shown (out of 24)
## 
##                                   Overall
## Age                               100.000
## FAF                                50.797
## TUE                                45.140
## CH2O                               44.619
## GenderMale                         35.645
## family_history_with_overweightyes  30.809
## FCVC.L                             25.583
## CALC.C                             19.994
## CAEC.C                             18.743
## MTRANSPublic_Transportation        18.486
## NCP.L                              16.994
## CAEC.L                             14.629
## FAVCyes                            14.544
## NCP.C                              13.591
## FCVC.Q                             10.685
## CALC.Q                             10.648
## CALC.L                             10.245
## NCP.Q                               4.754
## SCCyes                              3.792
## CAEC.Q                              2.249
# Plot the variable-importance ranking of `rf_fit`.
rf_fit |>
  varImp() |>
  plot()

Age, physical activity frequency, time spent using technological devices, daily water intake, and gender look to be the top 5 most important according to the model that excludes obvious predictors.

Age, FAF, CH2O, and Gender have all been explored earlier in the analysis, so TUE (time spent using technological devices) will be looked into.

Does time spent on tech devices have an effect on BMI or obesity level?

# Scatter plot of BMI against TUE with a fitted linear trend and its
# confidence band. TUE is the time spent using technological devices, so the
# x-axis is labelled as time (matching the plot title) rather than frequency.
ggplot(train_data, aes(x = TUE, y = BMI)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "darkred") +
  labs(
    title = "Relationship Between Time Using Technology and BMI",
    x = "Time Using Technological Devices (TUE)",
    y = "BMI"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Spearman rank correlation test between BMI and TUE on the training data.
# The arguments are kept in `train_data$...` form because the printed "data:"
# line and the ties warning echo these expressions verbatim.
cor.test(train_data$BMI, train_data$TUE, method = "spearman")  
## Warning in cor.test.default(train_data$BMI, train_data$TUE, method =
## "spearman"): Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  train_data$BMI and train_data$TUE
## S = 582958461, p-value = 0.001789
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.08114648

As technology use increases, BMI decreases, but the correlation is weak.

# Box plot of TUE for each obesity level, with the group mean overlaid as a
# red point. TUE is the time spent using technological devices, so the
# y-axis is labelled as time rather than frequency.
ggplot(train_data, aes(x = NObeyesdad, y = TUE)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 2, color = "red") +
  labs(
    title = "Distribution of Tech Use by Obesity Level",
    x = "Obesity Level",
    y = "Time Using Technological Devices (TUE)"
  ) +
  theme_minimal() +
  # Tilt the class names so the seven long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Kruskal-Wallis rank sum test of TUE across the NObeyesdad groups in the
# training data (the printed "data:" line is derived from this formula).
kruskal.test(TUE ~ NObeyesdad, data = train_data)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  TUE by NObeyesdad
## Kruskal-Wallis chi-squared = 39.055, df = 6, p-value = 6.982e-07

Technology use differs across obesity levels.

Less Obvious Predictors Takeaways

Age

As age increases in adulthood, average BMI increases as well.

FAF

More frequent physical activity is associated with a lower BMI on average.

TUE

BMI has a slight negative correlation with technology use.

CH2O

Those who consume alcohol tend to have higher BMIs if they also consume more water daily, whereas those who do not consume alcohol have lower BMIs if they also consume the same higher amounts of water.

Gender

A chi-square test between gender and obesity level was significant; therefore, the gender distribution differs for at least one obesity level.

There is no significant difference in BMI between genders.

While BMI does not significantly differ by gender, a chi-square test reveals that gender and obesity level are associated, suggesting that gender plays a role in how individuals are categorized into obesity levels. Specific categories (e.g., Level 2 and 3) show heavy gender skew. This could reflect differences in obesity labeling, lifestyle patterns, or other behavioral variables that interact with gender.

Conclusion

In the initial models, predictors included Height, Weight, and/or BMI, all of which are directly involved in how obesity is medically defined. This likely inflated predictive performance, as the model essentially had access to a derived version of the target label. The resulting test accuracy of >95% suggests potential data leakage.

To better evaluate generalization, a model was trained excluding Height, Weight, and BMI, and achieved a test accuracy of 82.75%. While lower, this result is more reflective of the model’s ability to generalize from behavioral and lifestyle patterns, without relying on circular logic.

The model without Height, Weight, and BMI would be better suited to settings where BMI is not available, or to identifying risk before BMI reaches clinical obesity. It strikes a good balance between performance and model validity.