# Load necessary libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the obesity survey CSV into a data frame
obesity_csv <- "C:\\Users\\saisr\\Downloads\\statistics using R\\estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition\\obesity.csv"
obesity <- read.csv(file = obesity_csv)
# Preview the leading rows (head() shows six rows by default)
obesity %>% head()
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
# Show each column's type together with a sample of its values
obesity %>% str()
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
# Per-column summaries: quantiles/mean for numeric columns, length/class for
# character columns
obesity %>% summary()
## Gender Age Height Weight
## Length:2111 Min. :14.00 Min. :1.450 Min. : 39.00
## Class :character 1st Qu.:19.95 1st Qu.:1.630 1st Qu.: 65.47
## Mode :character Median :22.78 Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## family_history_with_overweight FAVC FCVC
## Length:2111 Length:2111 Min. :1.000
## Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Median :2.386
## Mean :2.419
## 3rd Qu.:3.000
## Max. :3.000
## NCP CAEC SMOKE CH2O
## Min. :1.000 Length:2111 Length:2111 Min. :1.000
## 1st Qu.:2.659 Class :character Class :character 1st Qu.:1.585
## Median :3.000 Mode :character Mode :character Median :2.000
## Mean :2.686 Mean :2.008
## 3rd Qu.:3.000 3rd Qu.:2.477
## Max. :4.000 Max. :3.000
## SCC FAF TUE CALC
## Length:2111 Min. :0.0000 Min. :0.0000 Length:2111
## Class :character 1st Qu.:0.1245 1st Qu.:0.0000 Class :character
## Mode :character Median :1.0000 Median :0.6253 Mode :character
## Mean :1.0103 Mean :0.6579
## 3rd Qu.:1.6667 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.0000
## MTRANS NObeyesdad
## Length:2111 Length:2111
## Class :character Class :character
## Mode :character Mode :character
##
##
##
In this analysis, we will focus on the Body Mass Index (BMI) as our response variable. BMI is a widely used measure to classify individuals based on body weight relative to height, and it is an important indicator of health and obesity levels.
# Height is already recorded in meters in this dataset (the summary above shows
# a range of about 1.45 to 1.98), so it must NOT be divided by 100. Dividing by
# 100 shrinks the height to ~0.016 and inflates BMI by a factor of 10,000.
# Height_m is kept as a plain copy of Height so that any code expecting that
# column keeps working.
obesity <- obesity %>%
  mutate(Height_m = Height)
# Calculate BMI = weight / height^2 (height in meters; assumes Weight is in
# kilograms -- TODO confirm against the dataset documentation)
obesity <- obesity %>%
  mutate(BMI = Weight / Height_m^2)
# View the first few rows with the new BMI column.
# NOTE: any echoed output and downstream figures produced before this
# correction show BMI scaled by 1e4; re-run the report to refresh them.
head(obesity)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad Height_m BMI
## 1 Normal_Weight 0.0162 243865.3
## 2 Normal_Weight 0.0152 242382.3
## 3 Normal_Weight 0.0180 237654.3
## 4 Overweight_Level_I 0.0180 268518.5
## 5 Overweight_Level_II 0.0178 283423.8
## 6 Normal_Weight 0.0162 201950.9
# Histogram of BMI across all respondents, using 30 bins
ggplot(data = obesity, mapping = aes(x = BMI)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  ggtitle("Distribution of Body Mass Index (BMI)") +
  xlab("BMI") +
  ylab("Frequency") +
  theme_minimal()
#### Insights

- The histogram shows two peaks: the first at a lower BMI range (close to 2.5e+05) and the second around 3.5e+05, indicating that one portion of the population has a relatively lower BMI while another portion tends to have a higher BMI.
- The highest bar (peak) occurs around the 3e+05 BMI mark, which suggests that the most common BMI values are concentrated in that range. However, the values on the axis are far larger than plausible BMI values (which are normally in the tens), so it is important to check the scale of the BMI calculation.
- The histogram tails off slightly to the right, which suggests the distribution might be right-skewed.
# dplyr supplies the pipe and mutate() used below
library(dplyr)
# Treat Gender as a categorical (factor) variable
obesity <- obesity %>%
  mutate(Gender = as.factor(Gender))
# One-way ANOVA: does mean BMI differ between the Gender groups?
anova_result <- aov(formula = BMI ~ Gender, data = obesity)
anova_result %>% summary()
## Df Sum Sq Mean Sq F value Pr(>F)
## Gender 1 3.809e+10 3.809e+10 5.949 0.0148 *
## Residuals 2109 1.350e+13 6.403e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
- The ANOVA test gives an F-statistic of 5.949 with a p-value of 0.0148. Since the p-value is less than 0.05, we reject the null hypothesis that mean BMI is the same across gender groups at the 5% significance level. This indicates a statistically significant difference in mean BMI between the gender groups.
# Packages used for modelling output and plotting in this section
library(ggplot2)
library(dplyr)
# Fit a simple linear regression of BMI on Age
lm_model <- lm(formula = BMI ~ Age, data = obesity)
# Print coefficients, residual quantiles and fit statistics
lm_model %>% summary()
##
## Call:
## lm(formula = BMI ~ Age, data = obesity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -181485 -58100 -12853 54043 221154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 222060.7 6698.2 33.15 <2e-16 ***
## Age 3082.4 266.6 11.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77710 on 2109 degrees of freedom
## Multiple R-squared: 0.05962, Adjusted R-squared: 0.05917
## F-statistic: 133.7 on 1 and 2109 DF, p-value: < 2.2e-16
# Age against BMI, overlaid with the fitted least-squares line
obesity %>%
  ggplot(mapping = aes(x = Age, y = BMI)) +
  geom_point(color = "blue", alpha = 0.5) + # one point per respondent
  geom_smooth(method = "lm", color = "red") + # linear fit with its ribbon
  ggtitle("Scatter Plot of Age vs BMI with Regression Line") +
  xlab("Age") +
  ylab("BMI") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
A residual plot helps to check the assumption that the residuals are randomly distributed. If there is a pattern in the residuals, it indicates that the model might not be appropriate.
# Residuals of the BMI ~ Age fit plotted against Age
model <- lm(formula = BMI ~ Age, data = obesity)
ggplot(data = obesity, mapping = aes(x = Age, y = resid(model))) +
  geom_point(color = "blue", alpha = 0.5) + # one residual per observation
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") + # reference at zero
  ggtitle("Residual Plot of Age vs Residuals") +
  xlab("Age") +
  ylab("Residuals") +
  theme_minimal()
- The horizontal dashed red line marks zero; if the model is well-fitted, the residuals should scatter randomly around it.
- Any pattern in the residuals may indicate a problem with the model: a funnel shape suggests non-constant variance (heteroscedasticity), while a curved trend suggests non-linearity.
# Normal Q-Q plot of the model residuals, with a red reference line
qqnorm(resid(model))
qqline(resid(model), col = "red")
# Compare observed BMI with the model's in-sample predictions
obesity[["Predicted_BMI"]] <- predict(model)
obesity %>%
  ggplot(mapping = aes(x = BMI, y = Predicted_BMI)) +
  geom_point(color = "blue", alpha = 0.5) + # one point per observation
  # Dashed identity line: points on it are predicted exactly
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  ggtitle("Actual vs Predicted BMI") +
  xlab("Actual BMI") +
  ylab("Predicted BMI") +
  theme_minimal()
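Note: the summary below refers to `obesity_data` with 98 residual degrees of freedom and does not match the model fitted above on `obesity` (2109 residual degrees of freedom, multiple R-squared 0.05962); it appears to be an illustrative example rather than output from this analysis.

Call:
lm(formula = BMI ~ Age, data = obesity_data)

Residuals:
    Min      1Q  Median      3Q     Max
-3.4567 -0.7890  0.1234  0.8765  3.4567

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  20.5000     0.4000  51.250  < 2e-16
Age           0.2500     0.0500   5.000  1.5e-06

Residual standard error: 1.234 on 98 degrees of freedom
Multiple R-squared: 0.250, Adjusted R-squared: 0.240
F-statistic: 25.00 on 1 and 98 DF, p-value: 1.5e-06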