# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the obesity dataset from CSV into a data frame.
# NOTE: the path below is absolute and machine-specific; adjust it when
# running this script on another computer.
obesity <- read.csv(
  file = "C://Users//saisr//Downloads//statistics using R//obesity.csv",
  header = TRUE
)

# Preview the first six rows
head(obesity, n = 6L)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad
## 1       Normal_Weight
## 2       Normal_Weight
## 3       Normal_Weight
## 4  Overweight_Level_I
## 5 Overweight_Level_II
## 6       Normal_Weight
# Count the NA cells across every column of the data frame
obesity %>%
  is.na() %>%
  sum()
## [1] 0

Converting Categorical Variables to Factors

# Encode the two categorical predictors used in the regression as factors
obesity$Gender <- as.factor(obesity$Gender)
obesity$family_history_with_overweight <-
  as.factor(obesity$family_history_with_overweight)

# Drop any row that contains an NA so later steps work on complete cases
obesity_clean <- na.omit(obesity)

# Keep only the numeric columns for the correlation analysis.
# select(where(...)) replaces the superseded scoped verb select_if().
numeric_data <- obesity_clean %>% select(where(is.numeric))

# Pairwise correlations between the numeric columns (Pearson, the cor()
# default). The matrix is stored in cor_matrix but not printed here.
cor_matrix <- cor(numeric_data)

# Check the structure of the dataset
str(obesity_clean)
## 'data.frame':    2111 obs. of  17 variables:
##  $ Gender                        : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ FAVC                          : chr  "no" "no" "no" "no" ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : chr  "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
##  $ SMOKE                         : chr  "no" "yes" "no" "no" ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : chr  "no" "yes" "no" "no" ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : chr  "no" "Sometimes" "Frequently" "Frequently" ...
##  $ MTRANS                        : chr  "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
##  $ NObeyesdad                    : chr  "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...

Linear Model

# Fit a linear model of Height on Age, Gender and family history of
# overweight, using the NA-free data
model <- lm(
  formula = Height ~ Age + Gender + family_history_with_overweight,
  data = obesity_clean
)

# Print coefficient estimates, standard errors and overall fit statistics
summary(model)
## 
## Call:
## lm(formula = Height ~ Age + Gender + family_history_with_overweight, 
##     data = obesity_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.17580 -0.05189 -0.00244  0.05001  0.21488 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        1.6390004  0.0065367 250.738  < 2e-16 ***
## Age                               -0.0014332  0.0002481  -5.777 8.74e-09 ***
## GenderMale                         0.1123323  0.0030973  36.268  < 2e-16 ***
## family_history_with_overweightyes  0.0497674  0.0040930  12.159  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07075 on 2107 degrees of freedom
## Multiple R-squared:  0.4259, Adjusted R-squared:  0.4251 
## F-statistic: 521.1 on 3 and 2107 DF,  p-value: < 2.2e-16

Insights

Diagnosing the Model

# Diagnostic plot 1: residuals against fitted values
plot(x = model, which = 1L)

Issues with the Residuals vs Fitted plot

Non-constant Variance of Residuals: The Residuals vs Fitted plot shows a funnel shape, indicating that the variance of the residuals increases with the fitted values. This violates the assumption of homoscedasticity, which is a key assumption of linear regression.


# Diagnostic plot 2: normal Q-Q plot of the residuals
plot(x = model, which = 2L)

Issues with Q-Q plot

Non-normality of Residuals: The Q-Q plot shows a clear deviation from the straight line, indicating that the residuals are not normally distributed. This violates one of the key assumptions of linear regression.


# Diagnostic plot 4: Cook's distance per observation (influence)
plot(x = model, which = 4L)

Issues with Cook’s distance

Influential Points: The Cook’s Distance plot reveals several points whose Cook’s distances stand out from the rest, exceeding a rule-of-thumb cutoff (for example, 4/n, where n is the number of observations). These points have a significant impact on the regression model and may be distorting the results.

Interpret Coefficients

# Extract the fitted slope for Age (returned as a named numeric)
coefficients(model)["Age"]
##          Age 
## -0.001433217

Insights

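  • The Age coefficient is about -0.0014, i.e. each additional year of age is associated with a height roughly 0.14 cm lower (assuming Height is recorded in metres), holding gender and family history constant. The negative sign of the coefficient suggests a small but statistically significant (p ≈ 8.7e-09) decrease in height with age, which could reflect a general trend such as slight height reduction with aging; because the data are cross-sectional, this is an association between people of different ages rather than a measured change within individuals.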
  • As people age, especially after reaching adulthood, they might experience a decrease in height due to natural physiological changes like compression of spinal discs and changes in posture.