library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the dataset
obesity<- read.csv("C:\\Users\\saisr\\Downloads\\statistics using R\\estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition\\obesity.csv")
# View the first few rows of the dataset
head(obesity)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
# Convert height from cm to meters
obesity<- obesity %>%
mutate(Height_m = Height / 100)
# Calculate BMI
obesity <- obesity %>%
mutate(BMI = Weight / (Height_m^2))
# View the first few rows with the new BMI column
head(obesity)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad Height_m BMI
## 1 Normal_Weight 0.0162 243865.3
## 2 Normal_Weight 0.0152 242382.3
## 3 Normal_Weight 0.0180 237654.3
## 4 Overweight_Level_I 0.0180 268518.5
## 5 Overweight_Level_II 0.0178 283423.8
## 6 Normal_Weight 0.0162 201950.9
# Simple linear regression model
lm_model <- lm(BMI ~ Age, data = obesity)
# Summary of the simple model
summary(lm_model)
##
## Call:
## lm(formula = BMI ~ Age, data = obesity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -181485 -58100 -12853 54043 221154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 222060.7 6698.2 33.15 <2e-16 ***
## Age 3082.4 266.6 11.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77710 on 2109 degrees of freedom
## Multiple R-squared: 0.05962, Adjusted R-squared: 0.05917
## F-statistic: 133.7 on 1 and 2109 DF, p-value: < 2.2e-16
here the coefficient of Age shows the average change in BMI per unit increase in Age. However, a single predictor may be insufficient for accurately predicting BMI. Therefore, we will explore adding more variables to improve the model.
For this model, we consider adding: Height and Weight: These are essential for BMI calculations and are highly predictive. Gender: Adding this binary variable could reveal BMI differences based on gender.
# Expanded linear regression model with additional variables
obesity$Age_centered <- scale(obesity$Age, center = TRUE, scale = FALSE)
lm_model_expanded <- lm(BMI ~ Age_centered + Height + Weight + Age_centered * Gender, data = obesity)
# Summary of the expanded model
summary(lm_model_expanded)
##
## Call:
## lm(formula = BMI ~ Age_centered + Height + Weight + Age_centered *
## Gender, data = obesity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43525 -3856 510 4673 25955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.630e+05 4.189e+03 134.411 <2e-16 ***
## Age_centered 4.651e+02 3.886e+01 11.969 <2e-16 ***
## Height -3.300e+05 2.714e+03 -121.594 <2e-16 ***
## Weight 3.410e+03 7.881e+00 432.680 <2e-16 ***
## GenderMale 6.420e+02 4.505e+02 1.425 0.154
## Age_centered:GenderMale -7.308e+02 5.515e+01 -13.252 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7924 on 2105 degrees of freedom
## Multiple R-squared: 0.9902, Adjusted R-squared: 0.9902
## F-statistic: 4.272e+04 on 5 and 2105 DF, p-value: < 2.2e-16
Age: As in the simple model, this reflects the relationship between Age and BMI.
Height and Weight: Since BMI is derived from weight and height, these should directly relate to BMI. If significant, these variables are essential for predictive accuracy.
(Age * Gender): This term helps us understand if the impact of age on BMI changes based on gender.
Height and Weight: Related to BMI calculations and are standard predictors in obesity research.
Gender: captures gender-specific trends in BMI that Age alone might not reveal.
Interaction (Age * Gender): Allows us to explore if gender modifies the effect of age on BMI, adding depth to the analysis.
# Check for multicollinearity
vif(lm_model_expanded)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## Age_centered Height Weight Gender
## 2.043523 2.154532 1.431982 1.705823
## Age_centered:Gender
## 2.036485
# Compare Adjusted R-squared, AIC, and BIC of both models
cat("Simple Model Adjusted R-squared: ", summary(lm_model)$adj.r.squared, "\n")
## Simple Model Adjusted R-squared: 0.05916974
cat("Expanded Model Adjusted R-squared: ", summary(lm_model_expanded)$adj.r.squared, "\n")
## Expanded Model Adjusted R-squared: 0.9902175
cat("Simple Model AIC: ", AIC(lm_model), "\n")
## Simple Model AIC: 53537.44
cat("Expanded Model AIC: ", AIC(lm_model_expanded), "\n")
## Expanded Model AIC: 43902.25
cat("Simple Model BIC: ", BIC(lm_model), "\n")
## Simple Model BIC: 53554.4
cat("Expanded Model BIC: ", BIC(lm_model_expanded), "\n")
## Expanded Model BIC: 43941.83
1. Residuals vs. Fitted Values
# Residuals vs. Fitted values
plot(lm_model_expanded$fitted.values, residuals(lm_model_expanded),
main = "Residuals vs Fitted",
xlab = "Fitted values",
ylab = "Residuals",
pch = 20, col = "blue")
abline(h = 0, col = "red")
2. Residuals vs. X Values Plot
# Residuals vs Weight
plot(obesity$Weight, residuals(lm_model_expanded),
main = "Residuals vs Weight",
xlab = "Weight",
ylab = "Residuals",
pch = 20, col = "pink")
abline(h = 0, col = "yellow")
plot(obesity$Height, residuals(lm_model_expanded),
main = "Residuals vs Height",
xlab = "Height",
ylab = "Residuals",
pch = 20, col = "pink")
abline(h = 0, col = "yellow")
3. Residual Histogram
# Histogram of residuals
hist(residuals(lm_model_expanded),
main = "Histogram of Residuals",
xlab = "Residuals",
col = "lightblue",
border = "white")
4. Q-Q Plot
# Q-Q plot
qqnorm(residuals(lm_model_expanded), main = "Normal Q-Q Plot")
qqline(residuals(lm_model_expanded), col = "red")
5. Cook’s D by Observation
# Cook's distance plot
plot(lm_model_expanded, which = 4)