week 10

# Load necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Read the obesity data set from its CSV file.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run on a machine where this exact location exists.
obesity_csv <- "C://Users//saisr//Downloads//statistics using R//obesity.csv"
obesity <- read.csv(file = obesity_csv)

# Print a compact overview: one line per column with its type and first values
glimpse(obesity)

## Rows: 2,111
## Columns: 17
## $ Gender                         <chr> "Female", "Female", "Male", "Male", "Ma…
## $ Age                            <dbl> 21, 21, 23, 27, 22, 29, 23, 22, 24, 22,…
## $ Height                         <dbl> 1.62, 1.52, 1.80, 1.80, 1.78, 1.62, 1.5…
## $ Weight                         <dbl> 64.0, 56.0, 77.0, 87.0, 89.8, 53.0, 55.…
## $ family_history_with_overweight <chr> "yes", "yes", "yes", "no", "no", "no", …
## $ FAVC                           <chr> "no", "no", "no", "no", "no", "yes", "y…
## $ FCVC                           <dbl> 2, 3, 2, 3, 2, 2, 3, 2, 3, 2, 3, 2, 3, …
## $ NCP                            <dbl> 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ CAEC                           <chr> "Sometimes", "Sometimes", "Sometimes", …
## $ SMOKE                          <chr> "no", "yes", "no", "no", "no", "no", "n…
## $ CH2O                           <dbl> 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
## $ SCC                            <chr> "no", "yes", "no", "no", "no", "no", "n…
## $ FAF                            <dbl> 0, 3, 2, 2, 0, 0, 1, 3, 1, 1, 2, 2, 2, …
## $ TUE                            <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 0, …
## $ CALC                           <chr> "no", "Sometimes", "Frequently", "Frequ…
## $ MTRANS                         <chr> "Public_Transportation", "Public_Transp…
## $ NObeyesdad                     <chr> "Normal_Weight", "Normal_Weight", "Norm…

Selecting and Creating the Binary Variable

Convert `NObeyesdad` into a binary outcome, `Obese`: 1 for the overweight and obesity categories, 0 for all other categories.

# Derive a 0/1 outcome column `Obese` from the multi-level `NObeyesdad` label.
# Note: the positive class (1) covers both Overweight levels as well as the
# three Obesity types; every other label is coded 0.
above_normal_labels <- c(
  "Overweight_Level_I",
  "Overweight_Level_II",
  "Obesity_Type_I",
  "Obesity_Type_II",
  "Obesity_Type_III"
)

obesity <- obesity %>%
  mutate(Obese = if_else(NObeyesdad %in% above_normal_labels, 1, 0))

# Share of observations in each outcome class
prop.table(table(obesity$Obese))

## 
##         0         1 
## 0.2648034 0.7351966

Logistic Regression Model

The four explanatory variables are:

- family_history_with_overweight (family history of overweight: yes/no)
- FAVC (high-calorie food consumption frequency)
- CAEC (eating between meals frequency)
- FAF (physical activity frequency)

# Recode the categorical predictors as numbers for the logistic regression.
#
# Eating-between-meals frequency (CAEC) is mapped to an ordinal 0-3 score via
# an explicit lookup table. A label that is not in the table (or a missing
# value) becomes NA instead of being silently scored as "no", so it is removed
# by na.omit() below rather than distorting the lowest category.
# NOTE(review): assumes the four CAEC labels are exactly "no", "Sometimes",
# "Frequently" and "Always" -- confirm with table(obesity$CAEC).
caec_scores <- c(no = 0, Sometimes = 1, Frequently = 2, Always = 3)

obesity <- obesity %>%
  mutate(
    # yes/no answers become 1/0 indicators (NA stays NA)
    family_history_with_overweight =
      as.numeric(family_history_with_overweight == "yes"),
    FAVC = as.numeric(FAVC == "yes"),
    # as.character() guards against factor columns being used as positions
    CAEC = unname(caec_scores[as.character(CAEC)])
  )

# Drop every row that has a missing value in any column
obesity <- na.omit(obesity)

# Define predictor variables and response
X <- obesity %>% select(family_history_with_overweight, FAVC, CAEC, FAF)
y <- obesity$Obese

# Fit a logistic regression of the binary outcome on the four predictors.
# The response is referenced as the `Obese` column of `obesity` rather than a
# free-standing vector, so every variable in the formula is resolved from
# `data` and stays row-aligned with the predictors.
model <- glm(Obese ~ family_history_with_overweight + FAVC + CAEC + FAF,
             data = obesity, family = "binomial")

# Display coefficient estimates, standard errors and deviance statistics
summary(model)

## 
## Call:
## glm(formula = y ~ family_history_with_overweight + FAVC + CAEC + 
##     FAF, family = "binomial", data = obesity)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     1.40274    0.27361   5.127 2.95e-07 ***
## family_history_with_overweight  2.68371    0.15486  17.329  < 2e-16 ***
## FAVC                            0.67721    0.18990   3.566 0.000362 ***
## CAEC                           -2.08899    0.14331 -14.577  < 2e-16 ***
## FAF                            -0.53841    0.07464  -7.214 5.44e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2440.4  on 2110  degrees of freedom
## Residual deviance: 1626.7  on 2106  degrees of freedom
## AIC: 1636.7
## 
## Number of Fisher Scoring iterations: 5

Interpretation of Coefficients

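Each coefficient represents the change in the log-odds of being obese associated with a one-unit change in the predictor, holding the other variables constant; exponentiating a coefficient gives the corresponding odds ratio. (In a logistic model a coefficient does not correspond to a constant change in probability.)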

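- **family_history_with_overweight** (2.68): having a family history of overweight raises the log-odds of being obese by 2.68, i.e. multiplies the odds by about exp(2.68) ≈ 14.6.
- **FAVC** (0.68): high-calorie food consumption raises the log-odds of being obese by 0.68 (odds multiplied by about 1.97).
- **CAEC** (-2.09): each one-step increase in the frequency of eating between meals (coded 0–3) lowers the log-odds of being obese by 2.09 (odds multiplied by about 0.12).
- **FAF** (-0.54): each one-unit increase in physical activity frequency lowers the log-odds of being obese by 0.54 (odds multiplied by about 0.58).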

Confidence Interval for family_history_with_overweight

# Refit the logistic regression with the outcome taken from the data frame
model <- glm(
  Obese ~ family_history_with_overweight + FAVC + CAEC + FAF,
  data = obesity,
  family = "binomial"
)

# Pull the coefficient-table row for family_history_with_overweight once,
# then read the estimate and its standard error from that row
family_history_row <- coef(summary(model))["family_history_with_overweight", ]
coef_estimate <- family_history_row[["Estimate"]]
std_error <- family_history_row[["Std. Error"]]

# Wald 95% confidence interval on the log-odds scale:
# estimate +/- z * standard error, with z = 1.96 (rounded normal quantile)
z_value <- 1.96
margin_of_error <- z_value * std_error
lower_bound <- coef_estimate - margin_of_error
upper_bound <- coef_estimate + margin_of_error
c(lower_bound, upper_bound)

## [1] 2.380174 2.987244

Interpretation:

This interval means that we are 95% confident that the effect of having a family history of overweight on the log-odds of obesity lies between 2.38 and 2.99.
Individuals with a family history of being overweight are more likely to be obese.
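In practical terms, this interval implies that having a family history of overweight increases the log-odds of obesity by 2.38 to 2.99, holding other factors constant. Exponentiating the bounds puts this on the odds scale: the odds of obesity are roughly 10.8 to 19.8 times higher for individuals with a family history of overweight.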
This positive relationship aligns with expectations, as family history is often a strong predictor of obesity due to genetic and environmental factors.