library(boot)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read.csv("~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv", header=TRUE)
df['BMI'] <- df['Weight']/(df['Height']**2)
df['is_obese'] <- ifelse( df$BMI >30 ,1,0)
model <- glm(is_obese ~ FAF + NCP + CH2O, data = df, family = binomial(link = 'logit'))
model$coefficients
## (Intercept) FAF NCP CH2O
## -0.9641686 -0.4167630 0.1528797 0.4048754
sigmoid <- function(x, model) {
intercept <- model$coefficients[1]
coef_FA <- model$coefficients['FAF']
coef_NCP <- model$coefficients['NCP']
coef_CH2O <- model$coefficients['CH2O']
log_odds <- intercept + coef_FA * mean(df$FAF) + coef_NCP * mean(df$NCP) + coef_CH2O * x
probability <- 1 / (1 + exp(-log_odds))
return(probability)
}
df |>
ggplot(mapping = aes(x = CH2O, y = is_obese)) +
geom_jitter(width = 0, height = 0.1, size = 1) +
geom_function(fun = function(x) sigmoid(x, model), color = 'blue', linewidth = 1) +
scale_y_continuous(breaks = c(0, 0.5, 1)) +
theme_classic()
CH2O_decision_threshold <- -model$coefficients["(Intercept)"]/model$coefficients["CH2O"]
CH2O_decision_threshold
## (Intercept)
## 2.381396
summary(model)
##
## Call:
## glm(formula = is_obese ~ FAF + NCP + CH2O, family = binomial(link = "logit"),
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.96417 0.21277 -4.531 5.86e-06 ***
## FAF -0.41676 0.05498 -7.580 3.46e-14 ***
## NCP 0.15288 0.05816 2.628 0.00858 **
## CH2O 0.40488 0.07449 5.435 5.48e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2913.9 on 2110 degrees of freedom
## Residual deviance: 2832.4 on 2107 degrees of freedom
## AIC: 2840.4
##
## Number of Fisher Scoring iterations: 4
Intercept = -0.96417 .This implies, a negative probability(log-odds) of -0.96417 when all the variables are zero. which is practically impossible. Since CH2O = 1 if a person drinks 0-1 liter water a day, thus the minimum value for CH2O is 1.
Coefficient of FAF = -0.416, This negative coefficient indicates that for every unit increase of FAF, the odds-ratio of obesity is changed by a factor of \(e^{-0.416}\) ( = 0.66)i.e., 34% decrease in odds-ratio of being obese.
Similarly, Coefficient of NCP = 0.1528, for every unit increase in NCP, odds-ratio increases by 16% (factor of \(e^{0.15288}\) = 1.16)
And for every unit increase in CH2O, odds-ratio of obesity increases by roughly 50% (factor of \(e^{0.40488}\) = 1.5)
tidy(model, conf.int = TRUE)
## # A tibble: 4 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.964 0.213 -4.53 5.86e- 6 -1.38 -0.549
## 2 FAF -0.417 0.0550 -7.58 3.46e-14 -0.525 -0.310
## 3 NCP 0.153 0.0582 2.63 8.58e- 3 0.0393 0.267
## 4 CH2O 0.405 0.0745 5.44 5.48e- 8 0.259 0.552
tidy(model, conf.int = TRUE) |>
ggplot(mapping = aes(x = estimate, y = term))+
geom_point()+
geom_vline(xintercept = 0, linetype = 'dotted', color = 'gray')+
geom_errorbarh(mapping = aes(xmin = conf.low,
xmax = conf.high,
height = 0.5)) +
labs(title = 'Model Coefficient C.I.')