library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df = read.csv('ObesityDataSet_raw_and_data_sinthetic.csv')
Changing data types
df |>
head()
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
df |>
str()
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
FCVC, NCP, CH2O, FAF, and TUE may be converted from continuous numerics to ordered factors after rounding. The histograms and checks below show what share of each variable is already integer-valued.
par(mfrow = c(2,3))
hist(df$FCVC)
hist(df$NCP)
hist(df$CH2O)
hist(df$FAF)
hist(df$TUE)
is_integer_like = df$FCVC == floor(df$FCVC)
sum(is_integer_like)/nrow(df)*100
## [1] 60.87162
sum(!is_integer_like)/nrow(df)*100
## [1] 39.12838
is_integer_like = df$NCP == floor(df$NCP)
sum(is_integer_like)/nrow(df)*100
## [1] 69.68261
sum(!is_integer_like)/nrow(df)*100
## [1] 30.31739
is_integer_like = df$CH2O == floor(df$CH2O)
sum(is_integer_like)/nrow(df)*100
## [1] 38.89152
sum(!is_integer_like)/nrow(df)*100
## [1] 61.10848
is_integer_like = df$FAF == floor(df$FAF)
sum(is_integer_like)/nrow(df)*100
## [1] 42.77594
sum(!is_integer_like)/nrow(df)*100
## [1] 57.22406
is_integer_like = df$TUE == floor(df$TUE)
sum(is_integer_like)/nrow(df)*100
## [1] 45.38134
sum(!is_integer_like)/nrow(df)*100
## [1] 54.61866
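For reference, the same integer-like shares can be computed in one pass; a compact sketch equivalent to the per-variable checks above:

# proportion of integer-valued entries for each candidate variable (in percent)
round_candidates = c("FCVC", "NCP", "CH2O", "FAF", "TUE")
sapply(df[round_candidates], function(col) mean(col == floor(col)) * 100)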
FCVC and NCP are mostly integer-valued, so they will be converted to ordered factors. The other three (CH2O, FAF, TUE) will stay numeric.
par(mfrow = c(2,3))
hist(as.integer(round(df$FCVC)))
hist(as.integer(round(df$NCP)))
hist(df$CH2O)
hist(df$FAF)
hist(df$TUE)
# categorical variables
df$Gender = as.factor(df$Gender)
df$family_history_with_overweight = as.factor(df$family_history_with_overweight)
df$FAVC = as.factor(df$FAVC) # yes/no to eat high calorie food frequently
df$CAEC = as.factor(df$CAEC) # eat between meals?
df$SMOKE = as.factor(df$SMOKE) # do you smoke?
df$SCC = as.factor(df$SCC) # do you monitor your calories?
df$CALC = as.factor(df$CALC) # how often do you drink alc?
df$MTRANS = as.factor(df$MTRANS) # main transportation
df$NObeyesdad = as.factor(df$NObeyesdad) # obesity level
levels(df$Gender)
## [1] "Female" "Male"
levels(df$family_history_with_overweight)
## [1] "no" "yes"
levels(df$FAVC)
## [1] "no" "yes"
levels(df$CAEC)
## [1] "Always" "Frequently" "no" "Sometimes"
levels(df$SMOKE)
## [1] "no" "yes"
levels(df$SCC)
## [1] "no" "yes"
levels(df$CALC)
## [1] "Always" "Frequently" "no" "Sometimes"
levels(df$MTRANS)
## [1] "Automobile" "Bike" "Motorbike"
## [4] "Public_Transportation" "Walking"
levels(df$NObeyesdad)
## [1] "Insufficient_Weight" "Normal_Weight" "Obesity_Type_I"
## [4] "Obesity_Type_II" "Obesity_Type_III" "Overweight_Level_I"
## [7] "Overweight_Level_II"
ordered_levels = c("no", "Sometimes", "Frequently", "Always")
df[c("CAEC", "CALC")] = lapply(df[c("CAEC", "CALC")], function(col) {
factor(col, levels = ordered_levels, ordered = TRUE)
})
Ordered variables
# numerical variables
df$FCVC = as.factor(round(df$FCVC)) # do you usually eat veggies in meals? will treat as ordered factor with levels 1, 2, and 3
df$NCP = as.factor(round(df$NCP)) # how many main meals do you have daily?
df$FCVC = factor(df$FCVC, ordered = T)
df$NCP = factor(df$NCP, ordered = T)
levels(df$FCVC)
## [1] "1" "2" "3"
levels(df$NCP)
## [1] "1" "2" "3" "4"
Making NObeyesdad an ordered factor
df$NObeyesdad = factor(
df$NObeyesdad,
levels = c(
"Insufficient_Weight",
"Normal_Weight",
"Overweight_Level_I",
"Overweight_Level_II",
"Obesity_Type_I",
"Obesity_Type_II",
"Obesity_Type_III"
),
ordered = TRUE
)
levels(df$NObeyesdad)
## [1] "Insufficient_Weight" "Normal_Weight" "Overweight_Level_I"
## [4] "Overweight_Level_II" "Obesity_Type_I" "Obesity_Type_II"
## [7] "Obesity_Type_III"
par(mfrow = c(2,2))
barplot(table(df$CAEC))
barplot(table(df$CALC))
barplot(table(df$FCVC))
barplot(table(df$NCP))
Adding BMI to each observation
df = df |>
mutate(BMI = Weight/(Height**2))
str(df)
## 'data.frame': 2111 obs. of 18 variables:
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
## $ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
## $ FCVC : Ord.factor w/ 3 levels "1"<"2"<"3": 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : Ord.factor w/ 4 levels "1"<"2"<"3"<"4": 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : Ord.factor w/ 4 levels "no"<"Sometimes"<..: 2 2 2 2 2 2 2 2 2 2 ...
## $ SMOKE : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : Ord.factor w/ 4 levels "no"<"Sometimes"<..: 1 2 3 3 2 2 2 2 3 1 ...
## $ MTRANS : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
## $ NObeyesdad : Ord.factor w/ 7 levels "Insufficient_Weight"<..: 2 2 2 3 4 2 2 2 2 2 ...
## $ BMI : num 24.4 24.2 23.8 26.9 28.3 ...
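As a quick sanity check on the derived column (a sketch, not part of the original pipeline), BMI can be binned at the standard WHO cut-points and cross-tabulated against the reported obesity labels:

# WHO-style BMI bands: <18.5 underweight, 18.5-25 normal, 25-30 overweight, >=30 obese
bmi_band = cut(df$BMI, breaks = c(0, 18.5, 25, 30, Inf),
               labels = c("Underweight", "Normal", "Overweight", "Obese"),
               right = FALSE)
table(bmi_band, df$NObeyesdad)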
Splitting into training and testing sets
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(25)
train_index = createDataPartition(df$NObeyesdad, p = 0.7, list = FALSE)
train_data = df[train_index, ]
test_data = df[-train_index, ]
x_train = train_data[, setdiff(names(train_data), "NObeyesdad")]
y_train = train_data$NObeyesdad
x_test = test_data[, setdiff(names(test_data), "NObeyesdad")]
y_test = test_data$NObeyesdad
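createDataPartition() stratifies on the outcome, so the class proportions should be nearly identical in both splits; a quick check (a sketch):

# per-class proportions of NObeyesdad in the training and test splits
round(rbind(train = prop.table(table(y_train)),
            test  = prop.table(table(y_test))), 3)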
How does obesity vary across different genders and age groups?
ggplot(train_data, aes(x = NObeyesdad, y = Age)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Distribution of Age by Obesity Level",
x = "Obesity Level",
y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
kruskal.test(Age ~ NObeyesdad, data = train_data)
##
## Kruskal-Wallis rank sum test
##
## data: Age by NObeyesdad
## Kruskal-Wallis chi-squared = 379.92, df = 6, p-value < 2.2e-16
The significant Kruskal-Wallis result indicates that the distribution of Age differs across at least one pair of NObeyesdad groups.
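To see which specific obesity levels differ in Age, a post-hoc Dunn test could be run, mirroring the approach used later for CAEC (a sketch; the FSA package is assumed to be available):

# use an unordered copy of the grouping factor, as done for CAEC below
train_data$NObeyesdad_unordered = factor(train_data$NObeyesdad, ordered = FALSE)
FSA::dunnTest(Age ~ NObeyesdad_unordered, data = train_data, method = "bonferroni")
train_data = train_data |> select(-NObeyesdad_unordered)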
ggplot(train_data, aes(x = NObeyesdad, fill = Gender)) +
geom_bar(position = "dodge") +
labs(title = "Obesity Levels by Gender",
x = "Obesity Level",
y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
table_gender_obesity = table(train_data$Gender, train_data$NObeyesdad)
chisq.test(table_gender_obesity)
##
## Pearson's Chi-squared test
##
## data: table_gender_obesity
## X-squared = 469.7, df = 6, p-value < 2.2e-16
There is a significant association between Gender and
NObeyesdad.
Gender and BMI exploration
ggplot(train_data, aes(x = Gender, y = BMI, fill = Gender)) +
geom_boxplot() +
labs(title = "BMI Distribution by Gender", x = "Gender", y = "BMI") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
aggregate(BMI ~ Gender, data = train_data, mean)
## Gender BMI
## 1 Female 30.01209
## 2 Male 29.30834
# normality check
ggplot(train_data, aes(x = BMI)) +
geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
facet_wrap(~ Gender) +
labs(title = "Histogram of BMI by Gender")
wilcox.test(BMI ~ Gender, data = train_data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: BMI by Gender
## W = 275837, p-value = 0.7653
## alternative hypothesis: true location shift is not equal to 0
Is there correlation between age and BMI?
See boxplot for distribution of age across obesity levels above.
ggplot(train_data, aes(x = Age, y = BMI)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
cor.test(train_data$Age, train_data$BMI)
##
## Pearson's product-moment correlation
##
## data: train_data$Age and train_data$BMI
## t = 9.9576, df = 1477, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2024318 0.2979776
## sample estimates:
## cor
## 0.2508155
There is a weak positive correlation between Age and BMI (r ≈ 0.25): older respondents in this sample tend to have slightly higher BMIs, though the relationship is far from deterministic.
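If the Pearson assumptions are in doubt, a rank-based correlation offers a simple cross-check (a sketch):

# Spearman correlation is robust to skew and outliers in Age or BMI
cor.test(train_data$Age, train_data$BMI, method = "spearman")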
ggplot(train_data, aes(x = NObeyesdad, y = Age)) +
geom_jitter(width = 0.2, alpha = 0.5, color = "steelblue") +
labs(title = "Jittered Plot: Age Across Obesity Levels")+
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Does the frequency of vegetable consumption (FCVC) differ across obesity levels?
train_data |>
ggplot(aes(x = NObeyesdad, fill = FCVC))+
geom_bar(position = "dodge")+
labs(title = "Bar Plot of Obesity Levels and Vegetable Intake") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
kruskal.test(train_data$FCVC ~ train_data$NObeyesdad)
##
## Kruskal-Wallis rank sum test
##
## data: train_data$FCVC by train_data$NObeyesdad
## Kruskal-Wallis chi-squared = 350.37, df = 6, p-value < 2.2e-16
So, do people with different obesity levels tend to report different frequencies of vegetable consumption? Yes. The significant Kruskal-Wallis p-value indicates that at least one obesity group differs from another in its reported vegetable consumption frequency. FCVC is therefore a plausible predictor of NObeyesdad.
Do people who frequently eat between meals (CAEC) show higher obesity levels?
train_data |>
ggplot(aes(x = NObeyesdad, fill = CAEC))+
geom_bar(position = "dodge")+
labs(title = "Bar Plot of Obesity Levels and Frequency of Eating Between Meals") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
kruskal.test(train_data$CAEC ~ train_data$NObeyesdad)
##
## Kruskal-Wallis rank sum test
##
## data: train_data$CAEC by train_data$NObeyesdad
## Kruskal-Wallis chi-squared = 321.34, df = 6, p-value < 2.2e-16
ggplot(train_data, aes(x = CAEC, y = BMI)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Distribution of BMI by Snacking Frequency",
x = "How Frequently Do You Eat Between Meals",
y = "BMI") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
kruskal.test(train_data$BMI ~ train_data$CAEC)
##
## Kruskal-Wallis rank sum test
##
## data: train_data$BMI by train_data$CAEC
## Kruskal-Wallis chi-squared = 292.3, df = 3, p-value < 2.2e-16
There is at least one group of CAEC that differs from
another. To find out which one, Dunn’s test will be run.
train_data$CAEC_unordered = factor(train_data$CAEC, ordered = FALSE)
FSA::dunnTest(BMI ~ CAEC_unordered, data = train_data, method = "bonferroni")
## Dunn (1964) Kruskal-Wallis multiple comparison
## p-values adjusted with the Bonferroni method.
## Comparison Z P.unadj P.adj
## 1 Always - Frequently 2.891295 3.836578e-03 2.301947e-02
## 2 Always - no -0.259862 7.949702e-01 1.000000e+00
## 3 Frequently - no -3.010631 2.607056e-03 1.564233e-02
## 4 Always - Sometimes -5.519712 3.395551e-08 2.037331e-07
## 5 Frequently - Sometimes -16.064861 4.498735e-58 2.699241e-57
## 6 no - Sometimes -4.717087 2.392459e-06 1.435475e-05
At the 0.01 level, the following pairs differ: Always vs. Sometimes, Frequently vs. Sometimes, and no vs. Sometimes. So, do people who frequently eat between meals show higher obesity levels? No. Frequent snackers have BMIs similar to the “no” and “always” groups of CAEC; the group with the highest median BMI is actually the people who reported “sometimes”.
train_data = train_data |>
select(-CAEC_unordered)
Is there a relationship between the number of daily meals (NCP) and obesity level?
train_data |>
ggplot(aes(x = NObeyesdad, fill = NCP))+
geom_bar(position = "fill")+
labs(title = "Bar Plot of Obesity Levels and Number of Daily Meals",
y = "Proportion") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
chisq.test(table(train_data$NCP, train_data$NObeyesdad))
##
## Pearson's Chi-squared test
##
## data: table(train_data$NCP, train_data$NObeyesdad)
## X-squared = 312.12, df = 18, p-value < 2.2e-16
There is a statistically significant association between the number of daily meals (NCP) and obesity level (NObeyesdad); they are not independent of one another.
tbl = as.data.frame(table(train_data$NCP, train_data$NObeyesdad))
colnames(tbl) = c("NCP", "NObeyesdad", "Count")
ggplot(tbl, aes(x = NCP, y = NObeyesdad, fill = Count)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "red") +
labs(title = "Heatmap of Meal Frequency vs Obesity Level",
x = "Number of Meals (NCP)", y = "Obesity Level") +
theme_minimal()
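To see which combinations drive the association, the standardized residuals of the chi-squared test can be examined, as is done below for family history (a sketch):

# cells with |standardized residual| greater than about 2 deviate from independence
chisq.test(table(train_data$NCP, train_data$NObeyesdad))$stdres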
Is there a significant difference in BMI between smokers and non-smokers?
ggplot(train_data, aes(x = SMOKE, y = BMI)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Distribution of BMI by Smoking",
x = "Smoker",
y = "BMI") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
t.test(train_data$BMI ~ train_data$SMOKE)
##
## Welch Two Sample t-test
##
## data: train_data$BMI by train_data$SMOKE
## t = -0.0048165, df = 36.681, p-value = 0.9962
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## -2.228240 2.217675
## sample estimates:
## mean in group no mean in group yes
## 29.65557 29.66085
On average, BMI does not differ between smokers and non-smokers (p ≈ 0.996).
How do water and alcohol intake individually and combined affect obesity level?
ggplot(train_data, aes(x = CALC, y = BMI)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Distribution of BMI by Alcohol Consumption Frequency",
x = "CALC",
y = "BMI") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(train_data, aes(x = CH2O, y = BMI)) +
geom_point(alpha = 0.4, color = "blue") +
labs(title = "Scatter Plot of Water Consumption vs BMI",
x = "Water Consumption (CH2O)",
y = "BMI") +
theme_minimal()
cor.test(train_data$CH2O, train_data$BMI, method = "spearman")
## Warning in cor.test.default(train_data$CH2O, train_data$BMI, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: train_data$CH2O and train_data$BMI
## S = 445797058, p-value = 1.984e-11
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1732311
Higher water consumption is slightly associated with a higher BMI.
ggplot(train_data, aes(x = CH2O, y = BMI, color = CALC)) +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Smoothed Interaction Plot: BMI vs CH2O by CALC",
x = "Water Consumption (CH2O)",
y = "BMI",
color = "Alcohol Consumption") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
check = aov(train_data$BMI ~ train_data$CH2O * train_data$CALC)
summary(check)
## Df Sum Sq Mean Sq F value Pr(>F)
## train_data$CH2O 1 2272 2272.5 39.38 4.57e-10 ***
## train_data$CALC 3 5184 1728.1 29.95 < 2e-16 ***
## train_data$CH2O:train_data$CALC 2 2377 1188.7 20.60 1.50e-09 ***
## Residuals 1472 84935 57.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(check, which = 1) # Homogeneity
qqnorm(residuals(check)) # Normality
qqline(residuals(check), col = "red")
Those who consume alcohol tend to have higher BMIs when they also drink more water daily, whereas among those who do not consume alcohol, higher water intake is associated with lower BMIs.
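The claimed interaction can also be probed directly by computing the CH2O-BMI rank correlation within each CALC group (a sketch; small groups will give noisy estimates):

# sign and strength of the water-BMI association within each alcohol-consumption group
train_data |>
  group_by(CALC) |>
  summarise(n = n(),
            rho = cor(CH2O, BMI, method = "spearman"))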
Are individuals with a family history of being overweight more likely to be obese?
train_data |>
ggplot(aes(x = NObeyesdad, fill = family_history_with_overweight))+
geom_bar(position = "fill")+
labs(title = "Distribution of Family History of Overweight by Obesity Level") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
chisq.test(table(train_data$family_history_with_overweight, train_data$NObeyesdad))
##
## Pearson's Chi-squared test
##
## data: table(train_data$family_history_with_overweight, train_data$NObeyesdad)
## X-squared = 442.06, df = 6, p-value < 2.2e-16
chisq.test(table(train_data$family_history_with_overweight, train_data$NObeyesdad))$stdres
##
## Insufficient_Weight Normal_Weight Overweight_Level_I Overweight_Level_II
## no 14.014445 10.596668 3.998519 -5.038596
## yes -14.014445 -10.596668 -3.998519 5.038596
##
## Obesity_Type_I Obesity_Type_II Obesity_Type_III
## no -7.336387 -7.107627 -7.685979
## yes 7.336387 7.107627 7.685979
Family history of being overweight is not independent of obesity level.
More people than expected fall in Overweight Level II and Obesity Types I, II, and III when they have a family history of being overweight, while more people than expected without a family history fall in the Insufficient Weight, Normal Weight, and Overweight Level I categories.
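The direction of the association is also visible in the row-wise proportions (a sketch):

# share of each obesity level within each family-history group
round(prop.table(table(train_data$family_history_with_overweight,
                       train_data$NObeyesdad), margin = 1), 2)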
Does physical activity affect BMI or obesity level?
ggplot(train_data, aes(x = FAF, y = BMI)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred") +
labs(
title = "Relationship Between Physical Activity and BMI",
x = "Frequency of Physical Activity (FAF)",
y = "BMI"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
cor.test(train_data$BMI, train_data$FAF, method = "spearman")
## Warning in cor.test.default(train_data$BMI, train_data$FAF, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: train_data$BMI and train_data$FAF
## S = 631293128, p-value = 3.813e-11
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1707873
As physical activity increases, BMI decreases, but the correlation is weak.
ggplot(train_data, aes(x = NObeyesdad, y = FAF)) +
geom_boxplot(fill = "lightblue", color = "black") +
stat_summary(fun = mean, geom = "point", shape = 20, size = 2, color = "red") +
labs(title = "Distribution of Physical Activity by Obesity Level",
x = "Obesity Level",
y = "Frequency of Physical Activity") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
kruskal.test(FAF ~ NObeyesdad, data = train_data)
##
## Kruskal-Wallis rank sum test
##
## data: FAF by NObeyesdad
## Kruskal-Wallis chi-squared = 65.947, df = 6, p-value = 2.764e-12
Physical activity differs across obesity levels.
NObeyesdad class balance
table(train_data$NObeyesdad)
##
## Insufficient_Weight Normal_Weight Overweight_Level_I Overweight_Level_II
## 191 201 203 203
## Obesity_Type_I Obesity_Type_II Obesity_Type_III
## 246 208 227
prop.table(table(train_data$NObeyesdad))
##
## Insufficient_Weight Normal_Weight Overweight_Level_I Overweight_Level_II
## 0.1291413 0.1359026 0.1372549 0.1372549
## Obesity_Type_I Obesity_Type_II Obesity_Type_III
## 0.1663286 0.1406356 0.1534821
The classes for NObeyesdad are fairly balanced.
The models below are first fit without BMI, Height, or Weight, using only the lifestyle and demographic variables as predictors.
Multinomial Logistic Regression
Collinearity check
cor(train_data$BMI, train_data$Height)
## [1] 0.1314545
cor(train_data$BMI, train_data$Weight)
## [1] 0.933768
cor(train_data$Weight, train_data$Height)
## [1] 0.4654195
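Weight and BMI are, as expected, very strongly correlated (r ≈ 0.93), which is why the models are fit with different subsets of BMI, Height, and Weight excluded. Pairwise checks cover only three variables; a fuller picture is the correlation matrix of all numeric predictors (a sketch):

# correlation matrix of the numeric columns (Age, Height, Weight, CH2O, FAF, TUE, BMI)
num_cols = names(train_data)[sapply(train_data, is.numeric)]
round(cor(train_data[num_cols]), 2)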
library(nnet)
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
multi_fit = train(NObeyesdad ~ . - BMI - Height - Weight,
                  data = train_data,
                  trControl = train_control,
                  method = "multinom",
                  preProcess = c("center", "scale"),
                  trace = FALSE)
multi_preds = predict(multi_fit, test_data)
confusionMatrix(multi_preds, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 61 26 6
## Normal_Weight 7 36 12
## Overweight_Level_I 5 10 38
## Overweight_Level_II 1 3 5
## Obesity_Type_I 3 8 14
## Obesity_Type_II 4 2 11
## Obesity_Type_III 0 1 1
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 6 4 0
## Normal_Weight 3 2 1
## Overweight_Level_I 3 11 1
## Overweight_Level_II 20 3 5
## Obesity_Type_I 31 70 12
## Obesity_Type_II 22 14 70
## Obesity_Type_III 2 1 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 1
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 1
## Obesity_Type_II 0
## Obesity_Type_III 95
##
## Overall Statistics
##
## Accuracy : 0.6171
## 95% CI : (0.5779, 0.6552)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.552
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.75309 0.41860
## Specificity 0.92196 0.95421
## Pos Pred Value 0.58654 0.59016
## Neg Pred Value 0.96212 0.91243
## Prevalence 0.12816 0.13608
## Detection Rate 0.09652 0.05696
## Detection Prevalence 0.16456 0.09652
## Balanced Accuracy 0.83752 0.68641
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.43678 0.22989
## Specificity 0.94495 0.96881
## Pos Pred Value 0.55882 0.54054
## Neg Pred Value 0.91312 0.88739
## Prevalence 0.13766 0.13766
## Detection Rate 0.06013 0.03165
## Detection Prevalence 0.10759 0.05854
## Balanced Accuracy 0.69087 0.59935
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.6667 0.7865
## Specificity 0.8691 0.9024
## Pos Pred Value 0.5036 0.5691
## Neg Pred Value 0.9290 0.9627
## Prevalence 0.1661 0.1408
## Detection Rate 0.1108 0.1108
## Detection Prevalence 0.2199 0.1946
## Balanced Accuracy 0.7679 0.8445
## Class: Obesity_Type_III
## Sensitivity 0.9794
## Specificity 0.9907
## Pos Pred Value 0.9500
## Neg Pred Value 0.9962
## Prevalence 0.1535
## Detection Rate 0.1503
## Detection Prevalence 0.1582
## Balanced Accuracy 0.9850
Random Forest
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
rf_fit = train(NObeyesdad ~ . -BMI -Height -Weight, data = train_data, trControl = train_control, method = "rf")
rf_preds = predict(rf_fit, test_data)
confusionMatrix(rf_preds, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 72 11 3
## Normal_Weight 4 59 4
## Overweight_Level_I 4 6 67
## Overweight_Level_II 0 6 4
## Obesity_Type_I 1 3 6
## Obesity_Type_II 0 1 3
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 1 2 0
## Normal_Weight 7 5 1
## Overweight_Level_I 3 4 0
## Overweight_Level_II 61 7 1
## Obesity_Type_I 8 84 3
## Obesity_Type_II 7 3 84
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 1
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 0
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.8275
## 95% CI : (0.7958, 0.8562)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7986
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.8889 0.68605
## Specificity 0.9691 0.95971
## Pos Pred Value 0.8090 0.72840
## Neg Pred Value 0.9834 0.95100
## Prevalence 0.1282 0.13608
## Detection Rate 0.1139 0.09335
## Detection Prevalence 0.1408 0.12816
## Balanced Accuracy 0.9290 0.82288
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.7701 0.70115
## Specificity 0.9688 0.96697
## Pos Pred Value 0.7976 0.77215
## Neg Pred Value 0.9635 0.95298
## Prevalence 0.1377 0.13766
## Detection Rate 0.1060 0.09652
## Detection Prevalence 0.1329 0.12500
## Balanced Accuracy 0.8695 0.83406
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.8000 0.9438
## Specificity 0.9602 0.9742
## Pos Pred Value 0.8000 0.8571
## Neg Pred Value 0.9602 0.9906
## Prevalence 0.1661 0.1408
## Detection Rate 0.1329 0.1329
## Detection Prevalence 0.1661 0.1551
## Balanced Accuracy 0.8801 0.9590
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1519
## Balanced Accuracy 0.9948
varImp(rf_fit)
## rf variable importance
##
## only 20 most important variables shown (out of 24)
##
## Overall
## Age 100.000
## FAF 50.797
## TUE 45.140
## CH2O 44.619
## GenderMale 35.645
## family_history_with_overweightyes 30.809
## FCVC.L 25.583
## CALC.C 19.994
## CAEC.C 18.743
## MTRANSPublic_Transportation 18.486
## NCP.L 16.994
## CAEC.L 14.629
## FAVCyes 14.544
## NCP.C 13.591
## FCVC.Q 10.685
## CALC.Q 10.648
## CALC.L 10.245
## NCP.Q 4.754
## SCCyes 3.792
## CAEC.Q 2.249
plot(varImp(rf_fit), top = 20)
SVM
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
svm_fit = train(NObeyesdad ~ . -BMI -Height -Weight, data = train_data, trControl = train_control, method = "svmRadial")
svm_preds = predict(svm_fit, test_data)
confusionMatrix(svm_preds, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 71 13 1
## Normal_Weight 4 54 7
## Overweight_Level_I 6 9 57
## Overweight_Level_II 0 4 3
## Obesity_Type_I 0 5 14
## Obesity_Type_II 0 1 5
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 3 3 0
## Normal_Weight 9 6 3
## Overweight_Level_I 7 9 2
## Overweight_Level_II 45 6 3
## Obesity_Type_I 16 65 0
## Obesity_Type_II 7 15 81
## Obesity_Type_III 0 1 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 1
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 95
##
## Overall Statistics
##
## Accuracy : 0.7405
## 95% CI : (0.7045, 0.7743)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.697
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.8765 0.62791
## Specificity 0.9637 0.94505
## Pos Pred Value 0.7802 0.64286
## Neg Pred Value 0.9815 0.94161
## Prevalence 0.1282 0.13608
## Detection Rate 0.1123 0.08544
## Detection Prevalence 0.1440 0.13291
## Balanced Accuracy 0.9201 0.78648
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.65517 0.51724
## Specificity 0.93945 0.97064
## Pos Pred Value 0.63333 0.73770
## Neg Pred Value 0.94465 0.92644
## Prevalence 0.13766 0.13766
## Detection Rate 0.09019 0.07120
## Detection Prevalence 0.14241 0.09652
## Balanced Accuracy 0.79731 0.74394
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.6190 0.9101
## Specificity 0.9336 0.9466
## Pos Pred Value 0.6500 0.7364
## Neg Pred Value 0.9248 0.9847
## Prevalence 0.1661 0.1408
## Detection Rate 0.1028 0.1282
## Detection Prevalence 0.1582 0.1741
## Balanced Accuracy 0.7763 0.9284
## Class: Obesity_Type_III
## Sensitivity 0.9794
## Specificity 0.9981
## Pos Pred Value 0.9896
## Neg Pred Value 0.9963
## Prevalence 0.1535
## Detection Rate 0.1503
## Detection Prevalence 0.1519
## Balanced Accuracy 0.9888
XGBoost
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
xgb_fit = train(NObeyesdad ~ . -BMI -Height -Weight, data = train_data, trControl = train_control, method = "xgbTree")
xgb_preds = predict(xgb_fit, test_data)
confusionMatrix(xgb_preds, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 72 8 1
## Normal_Weight 3 59 9
## Overweight_Level_I 3 10 60
## Overweight_Level_II 2 4 3
## Obesity_Type_I 1 4 10
## Obesity_Type_II 0 1 3
## Obesity_Type_III 0 0 1
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 1 3 0
## Normal_Weight 9 5 2
## Overweight_Level_I 4 3 1
## Overweight_Level_II 56 7 2
## Obesity_Type_I 9 83 0
## Obesity_Type_II 7 3 84
## Obesity_Type_III 1 1 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 1
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 0
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.807
## 95% CI : (0.774, 0.837)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7745
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.8889 0.68605
## Specificity 0.9764 0.94689
## Pos Pred Value 0.8471 0.67045
## Neg Pred Value 0.9835 0.95037
## Prevalence 0.1282 0.13608
## Detection Rate 0.1139 0.09335
## Detection Prevalence 0.1345 0.13924
## Balanced Accuracy 0.9326 0.81647
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.68966 0.64368
## Specificity 0.96147 0.96697
## Pos Pred Value 0.74074 0.75676
## Neg Pred Value 0.95100 0.94444
## Prevalence 0.13766 0.13766
## Detection Rate 0.09494 0.08861
## Detection Prevalence 0.12816 0.11709
## Balanced Accuracy 0.82556 0.80533
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.7905 0.9438
## Specificity 0.9545 0.9742
## Pos Pred Value 0.7757 0.8571
## Neg Pred Value 0.9581 0.9906
## Prevalence 0.1661 0.1408
## Detection Rate 0.1313 0.1329
## Detection Prevalence 0.1693 0.1551
## Balanced Accuracy 0.8725 0.9590
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 0.9944
## Pos Pred Value 0.9697
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1566
## Balanced Accuracy 0.9920
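Before refitting with different predictor sets, the four test-set accuracies above can be collected side by side (a sketch reusing the prediction objects already created):

# overall test accuracy of each model fit without BMI, Height, and Weight
tibble(
  model = c("Multinomial", "Random forest", "SVM (radial)", "XGBoost"),
  accuracy = c(confusionMatrix(multi_preds, y_test)$overall["Accuracy"],
               confusionMatrix(rf_preds, y_test)$overall["Accuracy"],
               confusionMatrix(svm_preds, y_test)$overall["Accuracy"],
               confusionMatrix(xgb_preds, y_test)$overall["Accuracy"])
)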
The four models are now refit excluding only Weight, keeping BMI and Height as predictors.
Multinomial Logistic Regression
Collinearity check
cor(train_data$BMI, train_data$Height)
## [1] 0.1314545
cor(train_data$BMI, train_data$Weight)
## [1] 0.933768
cor(train_data$Weight, train_data$Height)
## [1] 0.4654195
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
multi_fit2 = train(NObeyesdad ~ . - Weight,
                   data = train_data,
                   trControl = train_control,
                   method = "multinom",
                   preProcess = c("center", "scale"),
                   trace = FALSE)
multi_preds2 = predict(multi_fit2, test_data)
confusionMatrix(multi_preds2, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 80 2 0
## Normal_Weight 1 78 0
## Overweight_Level_I 0 6 82
## Overweight_Level_II 0 0 5
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 4 0 0
## Overweight_Level_II 81 3 0
## Obesity_Type_I 2 100 0
## Obesity_Type_II 0 1 87
## Obesity_Type_III 0 1 2
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.9557
## 95% CI : (0.9366, 0.9704)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9483
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9877 0.9070
## Specificity 0.9964 0.9982
## Pos Pred Value 0.9756 0.9873
## Neg Pred Value 0.9982 0.9855
## Prevalence 0.1282 0.1361
## Detection Rate 0.1266 0.1234
## Detection Prevalence 0.1297 0.1250
## Balanced Accuracy 0.9920 0.9526
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9425 0.9310
## Specificity 0.9817 0.9853
## Pos Pred Value 0.8913 0.9101
## Neg Pred Value 0.9907 0.9890
## Prevalence 0.1377 0.1377
## Detection Rate 0.1297 0.1282
## Detection Prevalence 0.1456 0.1408
## Balanced Accuracy 0.9621 0.9582
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9524 0.9775
## Specificity 0.9962 0.9963
## Pos Pred Value 0.9804 0.9775
## Neg Pred Value 0.9906 0.9963
## Prevalence 0.1661 0.1408
## Detection Rate 0.1582 0.1377
## Detection Prevalence 0.1614 0.1408
## Balanced Accuracy 0.9743 0.9869
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 0.9944
## Pos Pred Value 0.9697
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1566
## Balanced Accuracy 0.9920
Random Forest
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
rf_fit2 = train(NObeyesdad ~ . -Weight, data = train_data, trControl = train_control, method = "rf")
rf_preds2 = predict(rf_fit2, test_data)
confusionMatrix(rf_preds2, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 80 0 0
## Normal_Weight 1 86 1
## Overweight_Level_I 0 0 85
## Overweight_Level_II 0 0 1
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 1 0 0
## Overweight_Level_II 86 2 0
## Obesity_Type_I 0 102 0
## Obesity_Type_II 0 1 89
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.9873
## 95% CI : (0.9752, 0.9945)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9852
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9877 1.0000
## Specificity 1.0000 0.9963
## Pos Pred Value 1.0000 0.9773
## Neg Pred Value 0.9982 1.0000
## Prevalence 0.1282 0.1361
## Detection Rate 0.1266 0.1361
## Detection Prevalence 0.1266 0.1392
## Balanced Accuracy 0.9938 0.9982
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9770 0.9885
## Specificity 0.9982 0.9945
## Pos Pred Value 0.9884 0.9663
## Neg Pred Value 0.9963 0.9982
## Prevalence 0.1377 0.1377
## Detection Rate 0.1345 0.1361
## Detection Prevalence 0.1361 0.1408
## Balanced Accuracy 0.9876 0.9915
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9714 1.0000
## Specificity 1.0000 0.9963
## Pos Pred Value 1.0000 0.9780
## Neg Pred Value 0.9943 1.0000
## Prevalence 0.1661 0.1408
## Detection Rate 0.1614 0.1408
## Detection Prevalence 0.1614 0.1440
## Balanced Accuracy 0.9857 0.9982
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1519
## Balanced Accuracy 0.9948
varImp(rf_fit2)
## rf variable importance
##
## only 20 most important variables shown (out of 26)
##
## Overall
## BMI 100.0000
## GenderMale 10.1440
## Age 5.5687
## Height 2.9095
## family_history_with_overweightyes 2.2686
## FCVC.L 2.1593
## CAEC.L 1.6554
## CH2O 1.3479
## CAEC.C 1.2971
## TUE 1.2360
## FAF 1.2280
## CALC.C 0.9033
## FAVCyes 0.8686
## FCVC.Q 0.7155
## NCP.C 0.6948
## NCP.L 0.6260
## CALC.Q 0.5470
## MTRANSPublic_Transportation 0.4761
## CALC.L 0.4218
## SCCyes 0.2051
plot(varImp(rf_fit2), top = 20)
SVM
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
svm_fit2 = train(NObeyesdad ~ . -Weight, data = train_data, trControl = train_control, method = "svmRadial")
svm_preds2 = predict(svm_fit2, test_data)
confusionMatrix(svm_preds2, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 78 8 0
## Normal_Weight 3 65 8
## Overweight_Level_I 0 9 70
## Overweight_Level_II 0 4 5
## Obesity_Type_I 0 0 4
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 9 5 1
## Overweight_Level_I 7 1 1
## Overweight_Level_II 68 2 0
## Obesity_Type_I 3 91 1
## Obesity_Type_II 0 6 86
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 2
## Obesity_Type_III 95
##
## Overall Statistics
##
## Accuracy : 0.875
## 95% CI : (0.8467, 0.8998)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8541
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9630 0.7558
## Specificity 0.9855 0.9524
## Pos Pred Value 0.9070 0.7143
## Neg Pred Value 0.9945 0.9612
## Prevalence 0.1282 0.1361
## Detection Rate 0.1234 0.1028
## Detection Prevalence 0.1361 0.1440
## Balanced Accuracy 0.9742 0.8541
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.8046 0.7816
## Specificity 0.9670 0.9798
## Pos Pred Value 0.7955 0.8608
## Neg Pred Value 0.9688 0.9656
## Prevalence 0.1377 0.1377
## Detection Rate 0.1108 0.1076
## Detection Prevalence 0.1392 0.1250
## Balanced Accuracy 0.8858 0.8807
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.8667 0.9663
## Specificity 0.9848 0.9853
## Pos Pred Value 0.9192 0.9149
## Neg Pred Value 0.9737 0.9944
## Prevalence 0.1661 0.1408
## Detection Rate 0.1440 0.1361
## Detection Prevalence 0.1566 0.1487
## Balanced Accuracy 0.9257 0.9758
## Class: Obesity_Type_III
## Sensitivity 0.9794
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9963
## Prevalence 0.1535
## Detection Rate 0.1503
## Detection Prevalence 0.1503
## Balanced Accuracy 0.9897
XGBoost
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
xgb_fit2 = train(NObeyesdad ~ . -Weight, data = train_data, trControl = train_control, method = "xgbTree")
xgb_preds2 = predict(xgb_fit2, test_data)
confusionMatrix(xgb_preds2, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 81 0 0
## Normal_Weight 0 86 1
## Overweight_Level_I 0 0 86
## Overweight_Level_II 0 0 0
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 0 0 0
## Overweight_Level_II 87 2 0
## Obesity_Type_I 0 102 0
## Obesity_Type_II 0 1 89
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.9921
## 95% CI : (0.9816, 0.9974)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9908
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 1.0000 1.0000
## Specificity 1.0000 0.9982
## Pos Pred Value 1.0000 0.9885
## Neg Pred Value 1.0000 1.0000
## Prevalence 0.1282 0.1361
## Detection Rate 0.1282 0.1361
## Detection Prevalence 0.1282 0.1377
## Balanced Accuracy 1.0000 0.9991
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9885 1.0000
## Specificity 1.0000 0.9963
## Pos Pred Value 1.0000 0.9775
## Neg Pred Value 0.9982 1.0000
## Prevalence 0.1377 0.1377
## Detection Rate 0.1361 0.1377
## Detection Prevalence 0.1361 0.1408
## Balanced Accuracy 0.9943 0.9982
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9714 1.0000
## Specificity 1.0000 0.9963
## Pos Pred Value 1.0000 0.9780
## Neg Pred Value 0.9943 1.0000
## Prevalence 0.1661 0.1408
## Detection Rate 0.1614 0.1408
## Detection Prevalence 0.1614 0.1440
## Balanced Accuracy 0.9857 0.9982
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1519
## Balanced Accuracy 0.9948
Next, the models are refit excluding only BMI, keeping Height and Weight.
Multinomial Logistic Regression
Collinearity check
cor(train_data$BMI, train_data$Height)
## [1] 0.1314545
cor(train_data$BMI, train_data$Weight)
## [1] 0.933768
cor(train_data$Weight, train_data$Height)
## [1] 0.4654195
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
multi_fit3 = train(NObeyesdad ~ . - BMI,
                   data = train_data,
                   trControl = train_control,
                   method = "multinom",
                   preProcess = c("center", "scale"),
                   trace = FALSE)
multi_preds3 = predict(multi_fit3, test_data)
confusionMatrix(multi_preds3, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 80 2 0
## Normal_Weight 1 77 0
## Overweight_Level_I 0 7 81
## Overweight_Level_II 0 0 6
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 2 0 0
## Overweight_Level_II 84 2 0
## Obesity_Type_I 1 99 2
## Obesity_Type_II 0 1 87
## Obesity_Type_III 0 3 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.9557
## 95% CI : (0.9366, 0.9704)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9483
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9877 0.8953
## Specificity 0.9964 0.9982
## Pos Pred Value 0.9756 0.9872
## Neg Pred Value 0.9982 0.9838
## Prevalence 0.1282 0.1361
## Detection Rate 0.1266 0.1218
## Detection Prevalence 0.1297 0.1234
## Balanced Accuracy 0.9920 0.9468
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9310 0.9655
## Specificity 0.9835 0.9853
## Pos Pred Value 0.9000 0.9130
## Neg Pred Value 0.9889 0.9944
## Prevalence 0.1377 0.1377
## Detection Rate 0.1282 0.1329
## Detection Prevalence 0.1424 0.1456
## Balanced Accuracy 0.9573 0.9754
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9429 0.9775
## Specificity 0.9943 0.9963
## Pos Pred Value 0.9706 0.9775
## Neg Pred Value 0.9887 0.9963
## Prevalence 0.1661 0.1408
## Detection Rate 0.1566 0.1377
## Detection Prevalence 0.1614 0.1408
## Balanced Accuracy 0.9686 0.9869
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 0.9944
## Pos Pred Value 0.9697
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1566
## Balanced Accuracy 0.9920
Random Forest
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
rf_fit3 = train(NObeyesdad ~ . -BMI, data = train_data, trControl = train_control, method = "rf")
rf_preds3 = predict(rf_fit3, test_data)
confusionMatrix(rf_preds3, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 81 3 0
## Normal_Weight 0 79 4
## Overweight_Level_I 0 4 81
## Overweight_Level_II 0 0 2
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 0 1 0
## Overweight_Level_II 83 4 0
## Obesity_Type_I 4 99 4
## Obesity_Type_II 0 1 85
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 1
## Obesity_Type_II 0
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.9557
## 95% CI : (0.9366, 0.9704)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9482
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 1.0000 0.9186
## Specificity 0.9946 0.9927
## Pos Pred Value 0.9643 0.9518
## Neg Pred Value 1.0000 0.9872
## Prevalence 0.1282 0.1361
## Detection Rate 0.1282 0.1250
## Detection Prevalence 0.1329 0.1313
## Balanced Accuracy 0.9973 0.9556
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9310 0.9540
## Specificity 0.9908 0.9890
## Pos Pred Value 0.9419 0.9326
## Neg Pred Value 0.9890 0.9926
## Prevalence 0.1377 0.1377
## Detection Rate 0.1282 0.1313
## Detection Prevalence 0.1361 0.1408
## Balanced Accuracy 0.9609 0.9715
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9429 0.9551
## Specificity 0.9829 0.9982
## Pos Pred Value 0.9167 0.9884
## Neg Pred Value 0.9885 0.9927
## Prevalence 0.1661 0.1408
## Detection Rate 0.1566 0.1345
## Detection Prevalence 0.1709 0.1361
## Balanced Accuracy 0.9629 0.9766
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1519
## Balanced Accuracy 0.9948
varImp(rf_fit3)
## rf variable importance
##
## only 20 most important variables shown (out of 26)
##
## Overall
## Weight 100.0000
## Height 49.3916
## GenderMale 34.7188
## Age 10.3770
## FAVCyes 5.0514
## FAF 3.0674
## CH2O 2.4131
## CAEC.L 2.2343
## TUE 1.6528
## CAEC.C 1.4031
## NCP.L 0.8449
## FCVC.Q 0.7469
## CALC.Q 0.7064
## CALC.L 0.6974
## NCP.C 0.6606
## family_history_with_overweightyes 0.6596
## SMOKEyes 0.5660
## FCVC.L 0.5651
## MTRANSPublic_Transportation 0.4474
## CALC.C 0.4210
plot(varImp(rf_fit3), top = 20)
SVM
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
svm_fit3 = train(NObeyesdad ~ . -BMI, data = train_data, trControl = train_control, method = "svmRadial")
svm_preds3 = predict(svm_fit3, test_data)
confusionMatrix(svm_preds3, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 77 10 0
## Normal_Weight 4 60 8
## Overweight_Level_I 0 9 67
## Overweight_Level_II 0 5 5
## Obesity_Type_I 0 2 7
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 1 1 0
## Normal_Weight 7 5 1
## Overweight_Level_I 9 0 1
## Overweight_Level_II 65 2 0
## Obesity_Type_I 5 91 1
## Obesity_Type_II 0 6 86
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 1
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 95
##
## Overall Statistics
##
## Accuracy : 0.856
## 95% CI : (0.8262, 0.8825)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8318
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9506 0.69767
## Specificity 0.9782 0.95238
## Pos Pred Value 0.8652 0.69767
## Neg Pred Value 0.9926 0.95238
## Prevalence 0.1282 0.13608
## Detection Rate 0.1218 0.09494
## Detection Prevalence 0.1408 0.13608
## Balanced Accuracy 0.9644 0.82503
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.7701 0.7471
## Specificity 0.9651 0.9780
## Pos Pred Value 0.7791 0.8442
## Neg Pred Value 0.9634 0.9604
## Prevalence 0.1377 0.1377
## Detection Rate 0.1060 0.1028
## Detection Prevalence 0.1361 0.1218
## Balanced Accuracy 0.8676 0.8626
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.8667 0.9663
## Specificity 0.9715 0.9871
## Pos Pred Value 0.8585 0.9247
## Neg Pred Value 0.9734 0.9944
## Prevalence 0.1661 0.1408
## Detection Rate 0.1440 0.1361
## Detection Prevalence 0.1677 0.1472
## Balanced Accuracy 0.9191 0.9767
## Class: Obesity_Type_III
## Sensitivity 0.9794
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9963
## Prevalence 0.1535
## Detection Rate 0.1503
## Detection Prevalence 0.1503
## Balanced Accuracy 0.9897
XGBoost
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
xgb_fit3 = train(NObeyesdad ~ . -BMI, data = train_data, trControl = train_control, method = "xgbTree")
xgb_preds3 = predict(xgb_fit3, test_data)
confusionMatrix(xgb_preds3, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 81 0 0
## Normal_Weight 0 82 4
## Overweight_Level_I 0 4 82
## Overweight_Level_II 0 0 1
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 0 1 0
## Overweight_Level_II 86 0 0
## Obesity_Type_I 0 103 1
## Obesity_Type_II 1 1 88
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.9778
## 95% CI : (0.9631, 0.9878)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9741
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 1.0000 0.9535
## Specificity 1.0000 0.9927
## Pos Pred Value 1.0000 0.9535
## Neg Pred Value 1.0000 0.9927
## Prevalence 0.1282 0.1361
## Detection Rate 0.1282 0.1297
## Detection Prevalence 0.1282 0.1361
## Balanced Accuracy 1.0000 0.9731
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9425 0.9885
## Specificity 0.9908 0.9982
## Pos Pred Value 0.9425 0.9885
## Neg Pred Value 0.9908 0.9982
## Prevalence 0.1377 0.1377
## Detection Rate 0.1297 0.1361
## Detection Prevalence 0.1377 0.1377
## Balanced Accuracy 0.9667 0.9933
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9810 0.9888
## Specificity 0.9981 0.9945
## Pos Pred Value 0.9904 0.9670
## Neg Pred Value 0.9962 0.9982
## Prevalence 0.1661 0.1408
## Detection Rate 0.1630 0.1392
## Detection Prevalence 0.1646 0.1440
## Balanced Accuracy 0.9895 0.9916
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1519
## Balanced Accuracy 0.9948
Finally, the models are refit excluding Weight and Height, keeping BMI.
Multinomial Logistic Regression
Collinearity check
cor(train_data$BMI, train_data$Height)
## [1] 0.1314545
cor(train_data$BMI, train_data$Weight)
## [1] 0.933768
cor(train_data$Weight, train_data$Height)
## [1] 0.4654195
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
multi_fit4 = train(NObeyesdad ~ . - Weight - Height,
                   data = train_data,
                   trControl = train_control,
                   method = "multinom",
                   preProcess = c("center", "scale"),
                   trace = FALSE)
multi_preds4 = predict(multi_fit4, test_data)
confusionMatrix(multi_preds4, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 80 3 0
## Normal_Weight 1 76 1
## Overweight_Level_I 0 7 80
## Overweight_Level_II 0 0 6
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 3 0 0
## Overweight_Level_II 83 4 0
## Obesity_Type_I 1 99 0
## Obesity_Type_II 0 1 88
## Obesity_Type_III 0 1 1
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.9525
## 95% CI : (0.9329, 0.9677)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9446
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9877 0.8837
## Specificity 0.9946 0.9963
## Pos Pred Value 0.9639 0.9744
## Neg Pred Value 0.9982 0.9819
## Prevalence 0.1282 0.1361
## Detection Rate 0.1266 0.1203
## Detection Prevalence 0.1313 0.1234
## Balanced Accuracy 0.9911 0.9400
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9195 0.9540
## Specificity 0.9817 0.9817
## Pos Pred Value 0.8889 0.8925
## Neg Pred Value 0.9871 0.9926
## Prevalence 0.1377 0.1377
## Detection Rate 0.1266 0.1313
## Detection Prevalence 0.1424 0.1472
## Balanced Accuracy 0.9506 0.9678
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9429 0.9888
## Specificity 0.9981 0.9963
## Pos Pred Value 0.9900 0.9778
## Neg Pred Value 0.9887 0.9982
## Prevalence 0.1661 0.1408
## Detection Rate 0.1566 0.1392
## Detection Prevalence 0.1582 0.1424
## Balanced Accuracy 0.9705 0.9925
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 0.9963
## Pos Pred Value 0.9796
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1551
## Balanced Accuracy 0.9930
Random Forest
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
rf_fit4 = train(NObeyesdad ~ . -Weight -Height, data = train_data, trControl = train_control, method = "rf")
rf_preds4 = predict(rf_fit4, test_data)
confusionMatrix(rf_preds4, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 80 0 0
## Normal_Weight 1 86 2
## Overweight_Level_I 0 0 84
## Overweight_Level_II 0 0 1
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 1 0 0
## Overweight_Level_II 86 2 0
## Obesity_Type_I 0 101 0
## Obesity_Type_II 0 2 89
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.9842
## 95% CI : (0.9711, 0.9924)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9815
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9877 1.0000
## Specificity 1.0000 0.9945
## Pos Pred Value 1.0000 0.9663
## Neg Pred Value 0.9982 1.0000
## Prevalence 0.1282 0.1361
## Detection Rate 0.1266 0.1361
## Detection Prevalence 0.1266 0.1408
## Balanced Accuracy 0.9938 0.9973
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9655 0.9885
## Specificity 0.9982 0.9945
## Pos Pred Value 0.9882 0.9663
## Neg Pred Value 0.9945 0.9982
## Prevalence 0.1377 0.1377
## Detection Rate 0.1329 0.1361
## Detection Prevalence 0.1345 0.1408
## Balanced Accuracy 0.9818 0.9915
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9619 1.0000
## Specificity 1.0000 0.9945
## Pos Pred Value 1.0000 0.9674
## Neg Pred Value 0.9925 1.0000
## Prevalence 0.1661 0.1408
## Detection Rate 0.1598 0.1408
## Detection Prevalence 0.1598 0.1456
## Balanced Accuracy 0.9810 0.9972
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1519
## Balanced Accuracy 0.9948
varImp(rf_fit4)
## rf variable importance
##
## only 20 most important variables shown (out of 25)
##
## Overall
## BMI 100.0000
## GenderMale 10.0820
## Age 6.7214
## family_history_with_overweightyes 2.6714
## FCVC.L 2.4657
## FAF 1.9038
## CH2O 1.8220
## CAEC.L 1.8212
## TUE 1.6335
## CAEC.C 1.4986
## CALC.C 1.0058
## FAVCyes 0.9099
## FCVC.Q 0.8375
## NCP.C 0.8042
## NCP.L 0.7564
## MTRANSPublic_Transportation 0.6220
## CALC.Q 0.6153
## CALC.L 0.5338
## SCCyes 0.3673
## NCP.Q 0.2030
plot(varImp(rf_fit4), top = 20)
SVM
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
svm_fit4 = train(NObeyesdad ~ . -Weight -Height, data = train_data, trControl = train_control, method = "svmRadial")
svm_preds4 = predict(svm_fit4, test_data)
confusionMatrix(svm_preds4, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 79 10 0
## Normal_Weight 2 63 8
## Overweight_Level_I 0 9 73
## Overweight_Level_II 0 4 4
## Obesity_Type_I 0 0 2
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 9 5 2
## Overweight_Level_I 6 1 0
## Overweight_Level_II 68 1 0
## Obesity_Type_I 4 89 1
## Obesity_Type_II 0 9 86
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 2
## Obesity_Type_III 95
##
## Overall Statistics
##
## Accuracy : 0.875
## 95% CI : (0.8467, 0.8998)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8541
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9753 0.73256
## Specificity 0.9819 0.95238
## Pos Pred Value 0.8876 0.70787
## Neg Pred Value 0.9963 0.95764
## Prevalence 0.1282 0.13608
## Detection Rate 0.1250 0.09968
## Detection Prevalence 0.1408 0.14082
## Balanced Accuracy 0.9786 0.84247
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.8391 0.7816
## Specificity 0.9706 0.9835
## Pos Pred Value 0.8202 0.8831
## Neg Pred Value 0.9742 0.9658
## Prevalence 0.1377 0.1377
## Detection Rate 0.1155 0.1076
## Detection Prevalence 0.1408 0.1218
## Balanced Accuracy 0.9049 0.8825
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.8476 0.9663
## Specificity 0.9867 0.9797
## Pos Pred Value 0.9271 0.8866
## Neg Pred Value 0.9701 0.9944
## Prevalence 0.1661 0.1408
## Detection Rate 0.1408 0.1361
## Detection Prevalence 0.1519 0.1535
## Balanced Accuracy 0.9172 0.9730
## Class: Obesity_Type_III
## Sensitivity 0.9794
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9963
## Prevalence 0.1535
## Detection Rate 0.1503
## Detection Prevalence 0.1503
## Balanced Accuracy 0.9897
XGBoost
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
xgb_fit4 = train(NObeyesdad ~ . -Weight -Height, data = train_data, trControl = train_control, method = "xgbTree")
xgb_preds4 = predict(xgb_fit4, test_data)
confusionMatrix(xgb_preds4, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 81 0 0
## Normal_Weight 0 86 1
## Overweight_Level_I 0 0 86
## Overweight_Level_II 0 0 0
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 0 0 0
## Overweight_Level_II 87 2 0
## Obesity_Type_I 0 102 2
## Obesity_Type_II 0 1 87
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 0
## Obesity_Type_III 97
##
## Overall Statistics
##
## Accuracy : 0.9905
## 95% CI : (0.9795, 0.9965)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9889
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 1.0000 1.0000
## Specificity 1.0000 0.9982
## Pos Pred Value 1.0000 0.9885
## Neg Pred Value 1.0000 1.0000
## Prevalence 0.1282 0.1361
## Detection Rate 0.1282 0.1361
## Detection Prevalence 0.1282 0.1377
## Balanced Accuracy 1.0000 0.9991
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9885 1.0000
## Specificity 1.0000 0.9963
## Pos Pred Value 1.0000 0.9775
## Neg Pred Value 0.9982 1.0000
## Prevalence 0.1377 0.1377
## Detection Rate 0.1361 0.1377
## Detection Prevalence 0.1361 0.1408
## Balanced Accuracy 0.9943 0.9982
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9714 0.9775
## Specificity 0.9962 0.9982
## Pos Pred Value 0.9808 0.9886
## Neg Pred Value 0.9943 0.9963
## Prevalence 0.1661 0.1408
## Detection Rate 0.1614 0.1377
## Detection Prevalence 0.1646 0.1392
## Balanced Accuracy 0.9838 0.9878
## Class: Obesity_Type_III
## Sensitivity 1.0000
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 1.0000
## Prevalence 0.1535
## Detection Rate 0.1535
## Detection Prevalence 0.1535
## Balanced Accuracy 1.0000
library(knitr)
cor(train_data$BMI, train_data$Height)
## [1] 0.1314545
cor(train_data$BMI, train_data$Weight)
## [1] 0.933768
cor(train_data$Weight, train_data$Height)
## [1] 0.4654195
A reminder that BMI and Weight are highly correlated, and that Weight and Height are moderately correlated. BMI was appended to the data and computed from Height and Weight, so Weight should be dropped to reduce multicollinearity and redundancy.
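For reference, a minimal sketch of how the appended BMI column could be reproduced from the existing columns, assuming the standard weight / height^2 formula was used (Weight in kg, Height in m):
# Sketch: recompute BMI from Weight and Height, assuming the standard formula
bmi_check = train_data$Weight / train_data$Height^2
# A correlation near 1 with the appended BMI column would confirm the
# redundancy that motivates dropping Weight
cor(bmi_check, train_data$BMI)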
kable(results_table, caption = "Performance Metrics Without `BMI`, `Height`, or `Weight`")
| | Model | Accuracy | Sensitivity | Specificity | Pos_Pred_Value |
|---|---|---|---|---|---|
| Accuracy…1 | Multinomial | 0.6170886 | 0.6101318 | 0.9360075 | 0.6141099 |
| Accuracy…2 | RandomForest | 0.8275316 | 0.8256730 | 0.9712861 | 0.8234711 |
| Accuracy…3 | SVM | 0.7405063 | 0.7407722 | 0.9567370 | 0.7385803 |
| Accuracy…4 | XGBoost | 0.8069620 | 0.8046080 | 0.9678289 | 0.8025074 |
kable(results_table2, caption = "Performance Metrics With `BMI` and `Height` - Without `Weight`")
| | Model | Accuracy | Sensitivity | Specificity | Pos_Pred_Value |
|---|---|---|---|---|---|
| Accuracy…1 | Multinomial | 0.9556962 | 0.9553991 | 0.9926322 | 0.9559979 |
| Accuracy…2 | RandomForest | 0.9873418 | 0.9877558 | 0.9979020 | 0.9871370 |
| Accuracy…3 | SVM | 0.8750000 | 0.8739034 | 0.9792482 | 0.8730803 |
| Accuracy…4 | XGBoost | 0.9920886 | 0.9928036 | 0.9986879 | 0.9920080 |
kable(results_table3, caption = "Performance Metrics With `Height` and `Weight` - Without `BMI`")
| | Model | Accuracy | Sensitivity | Specificity | Pos_Pred_Value |
|---|---|---|---|---|---|
| Accuracy…1 | Multinomial | 0.9556962 | 0.9556615 | 0.9926233 | 0.9562351 |
| Accuracy…2 | RandomForest | 0.9556962 | 0.9558952 | 0.9925895 | 0.9565109 |
| Accuracy…3 | SVM | 0.8560127 | 0.8539819 | 0.9760525 | 0.8527558 |
| Accuracy…4 | XGBoost | 0.9778481 | 0.9777043 | 0.9963203 | 0.9774201 |
kable(results_table4, caption = "Performance Metrics With BMI - Without `Weight` or `Height`")
| | Model | Accuracy | Sensitivity | Specificity | Pos_Pred_Value |
|---|---|---|---|---|---|
| Accuracy…1 | Multinomial | 0.9525316 | 0.9523215 | 0.9921251 | 0.9524209 |
| Accuracy…2 | RandomForest | 0.9841772 | 0.9847533 | 0.9973773 | 0.9840301 |
| Accuracy…3 | SVM | 0.8750000 | 0.8745499 | 0.9792600 | 0.8732184 |
| Accuracy…4 | XGBoost | 0.9905063 | 0.9910661 | 0.9984089 | 0.9907771 |
The XGBoost model has the best results.
The SVM was not tuned further because the Random Forest and XGBoost models already provide strong predictive power for this data.
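For completeness, a hedged sketch of what tuning the radial SVM could look like in caret if it were pursued; the sigma and C values below are purely illustrative and were not run in this analysis:
# Hypothetical tuning grid for svmRadial (sigma and C values are illustrative)
svm_grid = expand.grid(sigma = c(0.01, 0.05, 0.1), C = c(1, 5, 10))
svm_fit_tuned = train(NObeyesdad ~ . -Weight -Height, data = train_data,
                      trControl = train_control, method = "svmRadial",
                      tuneGrid = svm_grid)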
varImp(xgb_fit2)
## xgbTree variable importance
##
## only 20 most important variables shown (out of 26)
##
## Overall
## BMI 100.0000
## GenderMale 11.3174
## Age 4.4123
## CAEC.C 2.9276
## CH2O 2.7932
## Height 2.5677
## TUE 2.1718
## family_history_with_overweightyes 1.8793
## FAF 1.8149
## FCVC.L 1.7573
## CAEC.L 1.6371
## NCP.L 0.9623
## NCP.C 0.7320
## FAVCyes 0.7027
## MTRANSPublic_Transportation 0.5986
## CALC.Q 0.4823
## CALC.L 0.4071
## SCCyes 0.2357
## CALC.C 0.2179
## FCVC.Q 0.2077
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
xgb_fit0 = train(NObeyesdad ~ BMI + Gender + Age + CAEC + CH2O, data = train_data, trControl = train_control, method = "xgbTree")
xgb_preds0 = predict(xgb_fit0, test_data)
confusionMatrix(xgb_preds0, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 80 1 0
## Normal_Weight 1 85 2
## Overweight_Level_I 0 0 83
## Overweight_Level_II 0 0 2
## Obesity_Type_I 0 0 0
## Obesity_Type_II 0 0 0
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 0 0 0
## Normal_Weight 0 0 0
## Overweight_Level_I 2 0 0
## Overweight_Level_II 84 3 0
## Obesity_Type_I 1 100 1
## Obesity_Type_II 0 2 88
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 0
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 1
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.9747
## 95% CI : (0.9592, 0.9855)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9704
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.9877 0.9884
## Specificity 0.9982 0.9945
## Pos Pred Value 0.9877 0.9659
## Neg Pred Value 0.9982 0.9982
## Prevalence 0.1282 0.1361
## Detection Rate 0.1266 0.1345
## Detection Prevalence 0.1282 0.1392
## Balanced Accuracy 0.9929 0.9914
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.9540 0.9655
## Specificity 0.9963 0.9908
## Pos Pred Value 0.9765 0.9438
## Neg Pred Value 0.9927 0.9945
## Prevalence 0.1377 0.1377
## Detection Rate 0.1313 0.1329
## Detection Prevalence 0.1345 0.1408
## Balanced Accuracy 0.9752 0.9782
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.9524 0.9888
## Specificity 0.9962 0.9945
## Pos Pred Value 0.9804 0.9670
## Neg Pred Value 0.9906 0.9982
## Prevalence 0.1661 0.1408
## Detection Rate 0.1582 0.1392
## Detection Prevalence 0.1614 0.1440
## Balanced Accuracy 0.9743 0.9916
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1519
## Balanced Accuracy 0.9948
# Confusion matrices
cm_xgb2 = confusionMatrix(xgb_preds2, y_test)
cm_xgb0 = confusionMatrix(xgb_preds0, y_test)
# Extract accuracy
acc_xgb2 = cm_xgb2$overall["Accuracy"]
acc_xgb0 = cm_xgb0$overall["Accuracy"]
# Extract class-wise metrics
cm_xgb2_metrics = cm_xgb2$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]
cm_xgb0_metrics = cm_xgb0$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]
# Average class-wise metrics
metrics_comparison = data.frame(
Model = c("xgb_fit2 (All Predictors with BMI and Height)", "xgb_fit0 (Top 5 Predictors from xgb_fit2)"),
Accuracy = c(acc_xgb2, acc_xgb0),
Sensitivity = c(mean(cm_xgb2_metrics[, "Sensitivity"], na.rm = TRUE),
mean(cm_xgb0_metrics[, "Sensitivity"], na.rm = TRUE)),
Specificity = c(mean(cm_xgb2_metrics[, "Specificity"], na.rm = TRUE),
mean(cm_xgb0_metrics[, "Specificity"], na.rm = TRUE)),
PPV = c(mean(cm_xgb2_metrics[, "Pos Pred Value"], na.rm = TRUE),
mean(cm_xgb0_metrics[, "Pos Pred Value"], na.rm = TRUE))
)
# Display table
knitr::kable(metrics_comparison, digits = 4, caption = "Comparison of Full vs Reduced XGBoost Models")
| Model | Accuracy | Sensitivity | Specificity | PPV |
|---|---|---|---|---|
| xgb_fit2 (All Predictors with BMI and Height) | 0.9921 | 0.9928 | 0.9987 | 0.9920 |
| xgb_fit0 (Top 5 Predictors from xgb_fit2) | 0.9747 | 0.9752 | 0.9958 | 0.9745 |
The drop-off in metrics is relatively small, while the number of predictors fell from 17 to 5.
# Confusion matrices
cm_xgb2 = confusionMatrix(xgb_preds2, y_test)
cm_xgb0 = confusionMatrix(xgb_preds0, y_test)
cm_xgb = confusionMatrix(xgb_preds, y_test) # For model excluding Height, Weight, BMI
# Extract accuracy
acc_xgb2 = cm_xgb2$overall["Accuracy"]
acc_xgb0 = cm_xgb0$overall["Accuracy"]
acc_xgb = cm_xgb$overall["Accuracy"]
# Extract class-wise metrics
cm_xgb2_metrics = cm_xgb2$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]
cm_xgb0_metrics = cm_xgb0$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]
cm_xgb_metrics = cm_xgb$byClass[, c("Sensitivity", "Specificity", "Pos Pred Value")]
# Average class-wise metrics
metrics_comparison = data.frame(
Model = c(
"xgb_fit2 (All Predictors with BMI and Height)",
"xgb_fit0 (Only Top 5 Predictors from xgb_fit2)",
"xgb_fit (All Predictors Without Height, Weight, and BMI)"
),
Accuracy = c(acc_xgb2, acc_xgb0, acc_xgb),
Sensitivity = c(
mean(cm_xgb2_metrics[, "Sensitivity"], na.rm = TRUE),
mean(cm_xgb0_metrics[, "Sensitivity"], na.rm = TRUE),
mean(cm_xgb_metrics[, "Sensitivity"], na.rm = TRUE)
),
Specificity = c(
mean(cm_xgb2_metrics[, "Specificity"], na.rm = TRUE),
mean(cm_xgb0_metrics[, "Specificity"], na.rm = TRUE),
mean(cm_xgb_metrics[, "Specificity"], na.rm = TRUE)
),
PPV = c(
mean(cm_xgb2_metrics[, "Pos Pred Value"], na.rm = TRUE),
mean(cm_xgb0_metrics[, "Pos Pred Value"], na.rm = TRUE),
mean(cm_xgb_metrics[, "Pos Pred Value"], na.rm = TRUE)
)
)
# Display table
knitr::kable(metrics_comparison, digits = 4, caption = "Comparison of XGBoost Models with Varying Predictors")
| Model | Accuracy | Sensitivity | Specificity | PPV |
|---|---|---|---|---|
| xgb_fit2 (All Predictors with BMI and Height) | 0.9921 | 0.9928 | 0.9987 | 0.9920 |
| xgb_fit0 (Only Top 5 Predictors from xgb_fit2) | 0.9747 | 0.9752 | 0.9958 | 0.9745 |
| xgb_fit (All Predictors Without Height, Weight, and BMI) | 0.8070 | 0.8046 | 0.9678 | 0.8025 |
Even after dropping the most obvious predictors, the model still achieves roughly 80% accuracy.
| Model | Accuracy | Sensitivity | Specificity | PPV | F1_Score |
|---|---|---|---|---|---|
| xgb_fit2 (All Predictors with BMI and Height) | 0.9921 | 0.9928 | 0.9987 | 0.9920 | 0.9923 |
| xgb_fit0 (Only Top 5 Predictors from xgb_fit2) | 0.9747 | 0.9752 | 0.9958 | 0.9745 | 0.9747 |
| xgb_fit (All Predictors Without Height, Weight, and BMI) | 0.8070 | 0.8046 | 0.9678 | 0.8025 | 0.8024 |
The macro-averaged F1 score agrees with the accuracy-based ranking of the models. Using all predictors (except Weight, which is redundant with BMI) gives the best predictive power, but the trade-off for a much simpler model may be worth it: keeping only the top 5 predictors from the full model still yields high predictive power.
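The F1_Score column above was not computed in a chunk shown here; a minimal sketch of how the macro-averaged F1 can be obtained, assuming the per-class F1 values that confusionMatrix() returns in byClass:
# Sketch: macro-averaged F1 from the class-wise statistics of each confusion matrix
f1_xgb2 = mean(cm_xgb2$byClass[, "F1"], na.rm = TRUE)
f1_xgb0 = mean(cm_xgb0$byClass[, "F1"], na.rm = TRUE)
f1_xgb  = mean(cm_xgb$byClass[, "F1"], na.rm = TRUE)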
Lastly, even though BMI was calculated from both Height and Weight, the model that includes BMI but excludes Height and Weight does not perform as well as the one with Height added back in, which suggests that Height captures additional variability in the data.
While BMI, Height, and Weight
emerged as the most obvious predictive features for obesity
classification due to the definition of obesity, I further explored
model performance under constrained scenarios — specifically, omitting
dominant predictors — to assess the value of behavioral and lifestyle
variables independently. This allowed evaluation of secondary features’
standalone contribution and model robustness in data-limited
settings.
The random forest using these same predictors had the best accuracy, so it is examined here.
rf_fit was trained earlier without Height, Weight, and BMI to assess the predictive power of the less obvious predictors.
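The original training call for rf_fit is not repeated here; a hedged sketch of how it was presumably fit, mirroring the other caret calls in this report:
# Assumed earlier training call for rf_fit: all predictors except the
# anthropometric ones, with the same 5-fold CV used elsewhere
set.seed(25)
train_control = trainControl(method = "cv", number = 5)
rf_fit = train(NObeyesdad ~ . -Weight -Height -BMI, data = train_data,
               trControl = train_control, method = "rf")
rf_preds = predict(rf_fit, test_data)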
confusionMatrix(rf_preds, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insufficient_Weight Normal_Weight Overweight_Level_I
## Insufficient_Weight 72 11 3
## Normal_Weight 4 59 4
## Overweight_Level_I 4 6 67
## Overweight_Level_II 0 6 4
## Obesity_Type_I 1 3 6
## Obesity_Type_II 0 1 3
## Obesity_Type_III 0 0 0
## Reference
## Prediction Overweight_Level_II Obesity_Type_I Obesity_Type_II
## Insufficient_Weight 1 2 0
## Normal_Weight 7 5 1
## Overweight_Level_I 3 4 0
## Overweight_Level_II 61 7 1
## Obesity_Type_I 8 84 3
## Obesity_Type_II 7 3 84
## Obesity_Type_III 0 0 0
## Reference
## Prediction Obesity_Type_III
## Insufficient_Weight 0
## Normal_Weight 1
## Overweight_Level_I 0
## Overweight_Level_II 0
## Obesity_Type_I 0
## Obesity_Type_II 0
## Obesity_Type_III 96
##
## Overall Statistics
##
## Accuracy : 0.8275
## 95% CI : (0.7958, 0.8562)
## No Information Rate : 0.1661
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7986
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Insufficient_Weight Class: Normal_Weight
## Sensitivity 0.8889 0.68605
## Specificity 0.9691 0.95971
## Pos Pred Value 0.8090 0.72840
## Neg Pred Value 0.9834 0.95100
## Prevalence 0.1282 0.13608
## Detection Rate 0.1139 0.09335
## Detection Prevalence 0.1408 0.12816
## Balanced Accuracy 0.9290 0.82288
## Class: Overweight_Level_I Class: Overweight_Level_II
## Sensitivity 0.7701 0.70115
## Specificity 0.9688 0.96697
## Pos Pred Value 0.7976 0.77215
## Neg Pred Value 0.9635 0.95298
## Prevalence 0.1377 0.13766
## Detection Rate 0.1060 0.09652
## Detection Prevalence 0.1329 0.12500
## Balanced Accuracy 0.8695 0.83406
## Class: Obesity_Type_I Class: Obesity_Type_II
## Sensitivity 0.8000 0.9438
## Specificity 0.9602 0.9742
## Pos Pred Value 0.8000 0.8571
## Neg Pred Value 0.9602 0.9906
## Prevalence 0.1661 0.1408
## Detection Rate 0.1329 0.1329
## Detection Prevalence 0.1661 0.1551
## Balanced Accuracy 0.8801 0.9590
## Class: Obesity_Type_III
## Sensitivity 0.9897
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 0.9981
## Prevalence 0.1535
## Detection Rate 0.1519
## Detection Prevalence 0.1519
## Balanced Accuracy 0.9948
varImp(rf_fit)
## rf variable importance
##
## only 20 most important variables shown (out of 24)
##
## Overall
## Age 100.000
## FAF 50.797
## TUE 45.140
## CH2O 44.619
## GenderMale 35.645
## family_history_with_overweightyes 30.809
## FCVC.L 25.583
## CALC.C 19.994
## CAEC.C 18.743
## MTRANSPublic_Transportation 18.486
## NCP.L 16.994
## CAEC.L 14.629
## FAVCyes 14.544
## NCP.C 13.591
## FCVC.Q 10.685
## CALC.Q 10.648
## CALC.L 10.245
## NCP.Q 4.754
## SCCyes 3.792
## CAEC.Q 2.249
plot(varImp(rf_fit))
Age, physical activity frequency (FAF), time spent using technological devices (TUE), daily water intake (CH2O), and gender appear to be the five most important predictors in the model that excludes the obvious anthropometric predictors.
Age, FAF, CH2O, and Gender have all been explored earlier in the analysis, so TUE, time spent using technological devices, is examined next.
Does time spent on tech devices have an effect on BMI or obesity level?
ggplot(train_data, aes(x = TUE, y = BMI)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred") +
labs(
title = "Relationship Between Time Using Technology and BMI",
x = "Frequency of Tech Use (TUE)",
y = "BMI"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
cor.test(train_data$BMI, train_data$TUE, method = "spearman")
## Warning in cor.test.default(train_data$BMI, train_data$TUE, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: train_data$BMI and train_data$TUE
## S = 582958461, p-value = 0.001789
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.08114648
BMI tends to decrease slightly as technology use increases, but the correlation is weak (rho ≈ -0.08).
ggplot(train_data, aes(x = NObeyesdad, y = TUE)) +
geom_boxplot(fill = "lightblue", color = "black") +
stat_summary(fun = mean, geom = "point", shape = 20, size = 2, color = "red") +
labs(title = "Distribution of Tech Use by Obesity Level",
x = "Obesity Level",
y = "Frequency of Tech Use") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
kruskal.test(TUE ~ NObeyesdad, data = train_data)
##
## Kruskal-Wallis rank sum test
##
## data: TUE by NObeyesdad
## Kruskal-Wallis chi-squared = 39.055, df = 6, p-value = 6.982e-07
The Kruskal-Wallis test indicates that technology use differs significantly across obesity levels (p < 0.001).
Age
As age increases in adulthood, average BMI increases as well.
FAF
More frequent physical activity is associated with a lower BMI on average.
TUE
BMI has a slight negative correlation with technology use.
CH2O
Among those who consume alcohol, higher daily water intake is associated with higher BMI, whereas among those who do not consume alcohol, the same higher water intake is associated with lower BMI.
Gender
The chi-square test between gender and obesity level was significant, so the gender distribution differs between at least two obesity levels.
BMI itself does not differ significantly by gender.
While BMI does not significantly differ by gender, a chi-square test reveals that gender and obesity level are associated, suggesting that gender plays a role in how individuals are categorized into obesity levels. Specific categories (e.g., Level 2 and 3) show heavy gender skew. This could reflect differences in obesity labeling, lifestyle patterns, or other behavioral variables that interact with gender.
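The association referenced above comes from a chi-square test run earlier in the analysis; a minimal sketch of that test on the training data would be:
# Sketch: chi-square test of independence between Gender and obesity level
chisq.test(table(train_data$Gender, train_data$NObeyesdad))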
In the initial models, predictors included Height,
Weight, and/or BMI, all of which are directly
involved in how obesity is medically defined. This likely inflated
predictive performance, as the model essentially had access to a derived
version of the target label. The resulting test accuracy of >95%
suggests potential data leakage.
To better evaluate generalization, a model was trained excluding
Height, Weight, and BMI, and
achieved a test accuracy of 82.75%. While lower, this result is more
reflective of the model’s ability to generalize from behavioral and
lifestyle patterns, without relying on circular logic.
The model without Height, Weight, and BMI would be better suited to settings where BMI is not available, or for identifying risk before BMI reaches clinically obese levels. It offers a good balance between performance and model validity.