# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the obesity dataset from disk and preview its columns.
# NOTE: the path below is absolute and specific to one machine.
obesity_csv <- "C://Users//saisr//Downloads//statistics using R//obesity.csv"
obesity <- read.csv(file = obesity_csv)
glimpse(obesity)
## Rows: 2,111
## Columns: 17
## $ Gender <chr> "Female", "Female", "Male", "Male", "Ma…
## $ Age <dbl> 21, 21, 23, 27, 22, 29, 23, 22, 24, 22,…
## $ Height <dbl> 1.62, 1.52, 1.80, 1.80, 1.78, 1.62, 1.5…
## $ Weight <dbl> 64.0, 56.0, 77.0, 87.0, 89.8, 53.0, 55.…
## $ family_history_with_overweight <chr> "yes", "yes", "yes", "no", "no", "no", …
## $ FAVC <chr> "no", "no", "no", "no", "no", "yes", "y…
## $ FCVC <dbl> 2, 3, 2, 3, 2, 2, 3, 2, 3, 2, 3, 2, 3, …
## $ NCP <dbl> 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ CAEC <chr> "Sometimes", "Sometimes", "Sometimes", …
## $ SMOKE <chr> "no", "yes", "no", "no", "no", "no", "n…
## $ CH2O <dbl> 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
## $ SCC <chr> "no", "yes", "no", "no", "no", "no", "n…
## $ FAF <dbl> 0, 3, 2, 2, 0, 0, 1, 3, 1, 1, 2, 2, 2, …
## $ TUE <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 0, …
## $ CALC <chr> "no", "Sometimes", "Frequently", "Frequ…
## $ MTRANS <chr> "Public_Transportation", "Public_Transp…
## $ NObeyesdad <chr> "Normal_Weight", "Normal_Weight", "Norm…
Converting NObeyesdad into a binary outcome (Obese = 1 for any overweight or obesity category, 0 otherwise).
# Derive the binary response `Obese` from `NObeyesdad`:
# 1 for any of the overweight/obesity categories listed here, 0 otherwise.
overweight_or_obese <- c(
  "Overweight_Level_I",
  "Overweight_Level_II",
  "Obesity_Type_I",
  "Obesity_Type_II",
  "Obesity_Type_III"
)
obesity <- mutate(
  obesity,
  Obese = ifelse(NObeyesdad %in% overweight_or_obese, 1, 0)
)

# Proportion of rows falling in each class of the binary response
obese_counts <- table(obesity$Obese)
obese_counts / nrow(obesity)
##
## 0 1
## 0.2648034 0.7351966
The four explanatory variables are:
family_history_with_overweight (family history of overweight), FAVC (high-calorie food consumption frequency), CAEC (eating between meals frequency), and FAF (physical activity frequency).
# Recode the categorical predictors as numbers:
#   family_history_with_overweight, FAVC: "yes" -> 1, anything else -> 0
#   CAEC: "Sometimes" -> 1, "Frequently" -> 2, "Always" -> 3, otherwise 0
caec_levels <- c("Sometimes", "Frequently", "Always")
obesity <- mutate(
  obesity,
  family_history_with_overweight =
    ifelse(family_history_with_overweight == "yes", 1, 0),
  FAVC = ifelse(FAVC == "yes", 1, 0),
  # match() yields the 1-3 position within caec_levels; values that do not
  # match any level come back as NA and are mapped to 0
  CAEC = coalesce(as.numeric(match(CAEC, caec_levels)), 0)
)

# Drop every row that still contains a missing value
obesity <- na.omit(obesity)
# Keep the four predictor columns and the binary response as standalone objects
predictor_names <- c("family_history_with_overweight", "FAVC", "CAEC", "FAF")
X <- select(obesity, all_of(predictor_names))
y <- obesity[["Obese"]]

# Logistic regression of the response on the four predictors.
# The response `y` is the vector defined above; the predictors are looked up
# in `obesity`.
model <- glm(
  y ~ family_history_with_overweight + FAVC + CAEC + FAF,
  family = "binomial",
  data = obesity
)

# Print the coefficient table and deviance statistics
summary(model)
##
## Call:
## glm(formula = y ~ family_history_with_overweight + FAVC + CAEC +
## FAF, family = "binomial", data = obesity)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.40274 0.27361 5.127 2.95e-07 ***
## family_history_with_overweight 2.68371 0.15486 17.329 < 2e-16 ***
## FAVC 0.67721 0.18990 3.566 0.000362 ***
## CAEC -2.08899 0.14331 -14.577 < 2e-16 ***
## FAF -0.53841 0.07464 -7.214 5.44e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2440.4 on 2110 degrees of freedom
## Residual deviance: 1626.7 on 2106 degrees of freedom
## AIC: 1636.7
##
## Number of Fisher Scoring iterations: 5
**family_history_with_overweight:** The change in the log-odds of being obese when there is a family history of overweight. **FAVC:** The log-odds increase in obesity associated with high-calorie food consumption. **CAEC:** The impact of the frequency of eating between meals on the odds of being obese. **FAF:** The influence of physical activity on the odds of obesity.
# Refit the logistic regression, this time taking the response column `Obese`
# directly from the data frame
model <- glm(
  Obese ~ family_history_with_overweight + FAVC + CAEC + FAF,
  family = "binomial",
  data = obesity
)

# Pull the family-history row out of the coefficient table once, then read
# the point estimate and its standard error from that row
family_history_row <- coef(summary(model))["family_history_with_overweight", ]
coef_estimate <- family_history_row[["Estimate"]]
std_error <- family_history_row[["Std. Error"]]

# 95% Wald interval: estimate +/- z * standard error
z_value <- 1.96 # Z-score for 95% confidence
half_width <- z_value * std_error
lower_bound <- coef_estimate - half_width
upper_bound <- coef_estimate + half_width
c(lower_bound, upper_bound)
## [1] 2.380174 2.987244