library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car) 
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Load the dataset
obesity<- read.csv("C:\\Users\\saisr\\Downloads\\statistics using R\\estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition\\obesity.csv")
# View the first few rows of the dataset
head(obesity)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad
## 1       Normal_Weight
## 2       Normal_Weight
## 3       Normal_Weight
## 4  Overweight_Level_I
## 5 Overweight_Level_II
## 6       Normal_Weight

deriving BMI

# Convert height from cm to meters
obesity<- obesity %>%
  mutate(Height_m = Height / 100)

# Calculate BMI
obesity <- obesity %>%
  mutate(BMI = Weight / (Height_m^2))

# View the first few rows with the new BMI column
head(obesity)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad Height_m      BMI
## 1       Normal_Weight   0.0162 243865.3
## 2       Normal_Weight   0.0152 242382.3
## 3       Normal_Weight   0.0180 237654.3
## 4  Overweight_Level_I   0.0180 268518.5
## 5 Overweight_Level_II   0.0178 283423.8
## 6       Normal_Weight   0.0162 201950.9

Review of Initial Linear Regression Model

# Simple linear regression model
lm_model <- lm(BMI ~ Age, data = obesity)

# Summary of the simple model
summary(lm_model)
## 
## Call:
## lm(formula = BMI ~ Age, data = obesity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -181485  -58100  -12853   54043  221154 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 222060.7     6698.2   33.15   <2e-16 ***
## Age           3082.4      266.6   11.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 77710 on 2109 degrees of freedom
## Multiple R-squared:  0.05962,    Adjusted R-squared:  0.05917 
## F-statistic: 133.7 on 1 and 2109 DF,  p-value: < 2.2e-16

Interpretation of the Simple Model

here the coefficient of Age shows the average change in BMI per unit increase in Age. However, a single predictor may be insufficient for accurately predicting BMI. Therefore, we will explore adding more variables to improve the model.

Adding variables

For this model, we consider adding: Height and Weight: These are essential for BMI calculations and are highly predictive. Gender: Adding this binary variable could reveal BMI differences based on gender.

Building the expanded Model

# Expanded linear regression model with additional variables
obesity$Age_centered <- scale(obesity$Age, center = TRUE, scale = FALSE)
lm_model_expanded <- lm(BMI ~ Age_centered + Height + Weight + Age_centered * Gender, data = obesity)

# Summary of the expanded model
summary(lm_model_expanded)
## 
## Call:
## lm(formula = BMI ~ Age_centered + Height + Weight + Age_centered * 
##     Gender, data = obesity)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -43525  -3856    510   4673  25955 
## 
## Coefficients:
##                           Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)              5.630e+05  4.189e+03  134.411   <2e-16 ***
## Age_centered             4.651e+02  3.886e+01   11.969   <2e-16 ***
## Height                  -3.300e+05  2.714e+03 -121.594   <2e-16 ***
## Weight                   3.410e+03  7.881e+00  432.680   <2e-16 ***
## GenderMale               6.420e+02  4.505e+02    1.425    0.154    
## Age_centered:GenderMale -7.308e+02  5.515e+01  -13.252   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7924 on 2105 degrees of freedom
## Multiple R-squared:  0.9902, Adjusted R-squared:  0.9902 
## F-statistic: 4.272e+04 on 5 and 2105 DF,  p-value: < 2.2e-16

Interpretation of the Expanded Model

Why Each Variable was Included

Multicollinearity Check

# Check for multicollinearity
vif(lm_model_expanded)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##        Age_centered              Height              Weight              Gender 
##            2.043523            2.154532            1.431982            1.705823 
## Age_centered:Gender 
##            2.036485

Insights

  • here we got a low VIF for all the interaction terms that is below 5 so it will be not collinear and suitable for inclusion.

Model Comparison and Fit

# Compare Adjusted R-squared, AIC, and BIC of both models
cat("Simple Model Adjusted R-squared: ", summary(lm_model)$adj.r.squared, "\n")
## Simple Model Adjusted R-squared:  0.05916974
cat("Expanded Model Adjusted R-squared: ", summary(lm_model_expanded)$adj.r.squared, "\n")
## Expanded Model Adjusted R-squared:  0.9902175
cat("Simple Model AIC: ", AIC(lm_model), "\n")
## Simple Model AIC:  53537.44
cat("Expanded Model AIC: ", AIC(lm_model_expanded), "\n")
## Expanded Model AIC:  43902.25
cat("Simple Model BIC: ", BIC(lm_model), "\n")
## Simple Model BIC:  53554.4
cat("Expanded Model BIC: ", BIC(lm_model_expanded), "\n")
## Expanded Model BIC:  43941.83

Insights

  • The expanded model shows certain improvement over the simple model across all metrics.
  • The high Adjusted R-squared indicates most of the variance, while the lower AIC and BIC suggest it does so efficiently, with less risk of overfitting.

Evaluating the models

1. Residuals vs. Fitted Values

# Residuals vs. Fitted values
plot(lm_model_expanded$fitted.values, residuals(lm_model_expanded), 
     main = "Residuals vs Fitted",
     xlab = "Fitted values",
     ylab = "Residuals",
     pch = 20, col = "blue")
abline(h = 0, col = "red")

Insights

  • Indications of Issues: - Look for patterns (e.g., a funnel shape, curves). This suggests non-linearity. - Random scatter around the horizontal line indicates the assumption of linearity and constant variance is met.
  • Severity: - If there is a clear pattern, this could indicate a moderate to high issue that may require transformation of variables.
  • Confidence Level: - High confidence if random scatter, low confidence if a pattern exists.

2. Residuals vs. X Values Plot

# Residuals vs Weight
plot(obesity$Weight, residuals(lm_model_expanded), 
     main = "Residuals vs Weight",
     xlab = "Weight",
     ylab = "Residuals",
     pch = 20, col = "pink")
abline(h = 0, col = "yellow")

plot(obesity$Height, residuals(lm_model_expanded), 
     main = "Residuals vs Height",
     xlab = "Height",
     ylab = "Residuals",
     pch = 20, col = "pink")
abline(h = 0, col = "yellow")

Insights

  • Indications of Issues: - Look for patterns indicating non-linearity. For example, if residuals fan out or show a curve, the relationship may not be linear.
  • Severity: - A pattern suggests moderate to high issues that may require variable transformation or additional polynomial terms.
  • Confidence Level: - High confidence if points are randomly scattered; low confidence if patterns appear.

3. Residual Histogram

# Histogram of residuals
hist(residuals(lm_model_expanded), 
     main = "Histogram of Residuals", 
     xlab = "Residuals", 
     col = "lightblue", 
     border = "white")

Insights

  • Indications of Issues: - A bell-shaped histogram supports the normality assumption. - Skewness or multiple peaks indicate potential violations of normality.
  • Severity: - Moderate issues if residuals are slightly skewed; severe if there are significant deviations from normality.
  • Confidence Level: - High confidence with a normal shape; low confidence with significant skewness.

4. Q-Q Plot

# Q-Q plot
qqnorm(residuals(lm_model_expanded), main = "Normal Q-Q Plot")
qqline(residuals(lm_model_expanded), col = "red")

Insights

  • Indications of Issues:
    • Points falling along the 45-degree line indicate normality.
    • Deviations, especially in the tails, suggest non-normality.
  • Severity:
    • Minor deviations indicate low severity; significant deviations suggest serious concerns about normality.
  • Confidence Level:
    • High confidence if points closely follow the line; low confidence if there are large deviations.

5. Cook’s D by Observation

# Cook's distance plot
plot(lm_model_expanded, which = 4)

Insights

  • Indications of Issues: - Points outside the reference lines indicate influential points that may disproportionately affect the model.
  • Severity: - The more points that lie outside the threshold, the greater the concern regarding the stability of the model.
  • Confidence Level: - High confidence if no points are influential; moderate to low confidence if influential points are present.