raw_data <- read.csv("https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/ObesityDataSet_raw_and_data_sinthetic.csv")
head(raw_data)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
str(raw_data)
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: chr "yes" "yes" "yes" "no" ...
## $ FAVC : chr "no" "no" "no" "no" ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : chr "Sometimes" "Sometimes" "Sometimes" "Sometimes" ...
## $ SMOKE : chr "no" "yes" "no" "no" ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : chr "no" "yes" "no" "no" ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : chr "no" "Sometimes" "Frequently" "Frequently" ...
## $ MTRANS : chr "Public_Transportation" "Public_Transportation" "Public_Transportation" "Walking" ...
## $ NObeyesdad : chr "Normal_Weight" "Normal_Weight" "Normal_Weight" "Overweight_Level_I" ...
names(raw_data)[names(raw_data) == "Height"] <- "Height_m"
names(raw_data)[names(raw_data) == "Weight"] <- "Weight_kg"
names(raw_data)[names(raw_data) == "family_history_with_overweight"] <- "overweight_history"
names(raw_data)[names(raw_data) == "FAVC"] <- "eat_high_calories"
names(raw_data)[names(raw_data) == "FCVC"] <- "eat_vegetables"
names(raw_data)[names(raw_data) == "NCP"] <- "number_daily_meals"
names(raw_data)[names(raw_data) == "CAEC"] <- "eat_between_meals"
names(raw_data)[names(raw_data) == "SMOKE"] <- "smoke"
names(raw_data)[names(raw_data) == "CH2O"] <- "water"
names(raw_data)[names(raw_data) == "SCC"] <- "monitor_calories"
names(raw_data)[names(raw_data) == "FAF"] <- "physical_activity"
names(raw_data)[names(raw_data) == "TUE"] <- "time_technology"
names(raw_data)[names(raw_data) == "CALC"] <- "frequency_alcohol"
names(raw_data)[names(raw_data) == "MTRANS"] <- "mode_transport"
names(raw_data)[names(raw_data) == "NObeyesdad"] <- "obesity_level"
anyNA(raw_data)
## [1] FALSE
dim(raw_data)
## [1] 2111 17
summary(raw_data)
## Gender Age Height_m Weight_kg
## Length:2111 Min. :14.00 Min. :1.450 Min. : 39.00
## Class :character 1st Qu.:19.95 1st Qu.:1.630 1st Qu.: 65.47
## Mode :character Median :22.78 Median :1.700 Median : 83.00
## Mean :24.31 Mean :1.702 Mean : 86.59
## 3rd Qu.:26.00 3rd Qu.:1.768 3rd Qu.:107.43
## Max. :61.00 Max. :1.980 Max. :173.00
## overweight_history eat_high_calories eat_vegetables number_daily_meals
## Length:2111 Length:2111 Min. :1.000 Min. :1.000
## Class :character Class :character 1st Qu.:2.000 1st Qu.:2.659
## Mode :character Mode :character Median :2.386 Median :3.000
## Mean :2.419 Mean :2.686
## 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.000
## eat_between_meals smoke water monitor_calories
## Length:2111 Length:2111 Min. :1.000 Length:2111
## Class :character Class :character 1st Qu.:1.585 Class :character
## Mode :character Mode :character Median :2.000 Mode :character
## Mean :2.008
## 3rd Qu.:2.477
## Max. :3.000
## physical_activity time_technology frequency_alcohol mode_transport
## Min. :0.0000 Min. :0.0000 Length:2111 Length:2111
## 1st Qu.:0.1245 1st Qu.:0.0000 Class :character Class :character
## Median :1.0000 Median :0.6253 Mode :character Mode :character
## Mean :1.0103 Mean :0.6579
## 3rd Qu.:1.6667 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.0000
## obesity_level
## Length:2111
## Class :character
## Mode :character
##
##
##
# The study questions (https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition) has all the valid values for all fields.
### Binary yes/no columns
# overweight_history
raw_data$overweight_history[raw_data$overweight_history == "yes"] <- 2
raw_data$overweight_history[raw_data$overweight_history == "no"] <- 1
raw_data$overweight_history <- as.integer(raw_data$overweight_history)
# eat_high_calories
raw_data$eat_high_calories[raw_data$eat_high_calories == "yes"] <- 2
raw_data$eat_high_calories[raw_data$eat_high_calories == "no"] <- 1
raw_data$eat_high_calories <- as.integer(raw_data$eat_high_calories)
# monitor_calories
raw_data$monitor_calories[raw_data$monitor_calories == "yes"] <- 2
raw_data$monitor_calories[raw_data$monitor_calories == "no"] <- 1
raw_data$monitor_calories <- as.integer(raw_data$monitor_calories)
# frequency_alcohol
raw_data$frequency_alcohol[raw_data$frequency_alcohol == "no"] <- 1
raw_data$frequency_alcohol[raw_data$frequency_alcohol == "Sometimes"] <- 2
raw_data$frequency_alcohol[raw_data$frequency_alcohol == "Frequently"] <- 3
raw_data$frequency_alcohol[raw_data$frequency_alcohol == "Always"] <- 4
raw_data$frequency_alcohol <- as.integer(raw_data$frequency_alcohol)
# smoke
raw_data$smoke[raw_data$smoke == "yes"] <- 2
raw_data$smoke[raw_data$smoke == "no"] <- 1
raw_data$smoke <- as.integer(raw_data$smoke)
# eat_between_meals
raw_data$eat_between_meals[raw_data$eat_between_meals == "no"] <- 1
raw_data$eat_between_meals[raw_data$eat_between_meals == "Sometimes"] <- 2
raw_data$eat_between_meals[raw_data$eat_between_meals == "Frequently"] <- 3
raw_data$eat_between_meals[raw_data$eat_between_meals == "Always"] <- 4
raw_data$eat_between_meals <- as.integer(raw_data$eat_between_meals)
# Round numeric values to nearest integer
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Age
raw_data <- raw_data %>% mutate(across(c('Age'), round, 0))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c("Age"), round, 0)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
# eat_vegetables; never/sometimes/always -> 1,2,3
raw_data <- raw_data %>% mutate(across(c('eat_vegetables'), round, 0))
# number_daily_meals; 'between 1 and 2/between 2 and 3/ 3/ more than 3 -> 1,2,3,4
raw_data <- raw_data %>% mutate(across(c('number_daily_meals'), round, 0))
# water; 'less than liter / between 1 and 2L / more than 2L' -> 1,2,3
raw_data <- raw_data %>% mutate(across(c('water'), round, 0))
# physical_activity; 'I don't have / 1 or 2 days/ 3 or 4 days / More than 4 days' -> 1,2,3,4
raw_data <- raw_data %>% mutate(across(c('physical_activity'), round, 0))
# time_technology;'0-2 hours / 3-5 hours / more than 5 hours' -> 1,2,3
raw_data <- raw_data %>% mutate(across(c('time_technology'), round, 0))
# Quadratic variable- Height_m
Height_m_2 <- raw_data$Height_m^2
# Dichotomous vs. Quantitative interaction; If weight is influenced by number_daily_meals and physical_activity
meals_activity <- raw_data$number_daily_meals * raw_data$physical_activity
#Fitting the multiple regression model with just numeric fields, and the above
model.lm <- lm(Weight_kg ~ Height_m + overweight_history + eat_high_calories + eat_vegetables + number_daily_meals + eat_between_meals + smoke + water + monitor_calories + physical_activity + time_technology + frequency_alcohol + Height_m_2 + meals_activity, data = raw_data)
# Summary of the model
summary(model.lm)
##
## Call:
## lm(formula = Weight_kg ~ Height_m + overweight_history + eat_high_calories +
## eat_vegetables + number_daily_meals + eat_between_meals +
## smoke + water + monitor_calories + physical_activity + time_technology +
## frequency_alcohol + Height_m_2 + meals_activity, data = raw_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.794 -12.588 1.994 12.023 70.599
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -235.7818 116.8562 -2.018 0.043749 *
## Height_m 205.0826 137.3263 1.493 0.135484
## overweight_history 22.2170 1.1436 19.427 < 2e-16 ***
## eat_high_calories 6.5863 1.3412 4.911 9.77e-07 ***
## eat_vegetables 8.9115 0.6996 12.738 < 2e-16 ***
## number_daily_meals 1.8071 0.7757 2.330 0.019914 *
## eat_between_meals -10.4875 0.8982 -11.676 < 2e-16 ***
## smoke 1.7629 2.8602 0.616 0.537720
## water 1.1480 0.6110 1.879 0.060390 .
## monitor_calories -7.5684 2.0193 -3.748 0.000183 ***
## physical_activity 0.6882 1.6210 0.425 0.671217
## time_technology -1.4695 0.6074 -2.419 0.015643 *
## frequency_alcohol 6.4260 0.8196 7.841 7.07e-15 ***
## Height_m_2 -29.6885 40.2500 -0.738 0.460840
## meals_activity -1.5090 0.5739 -2.629 0.008614 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.48 on 2096 degrees of freedom
## Multiple R-squared: 0.5056, Adjusted R-squared: 0.5023
## F-statistic: 153.1 on 14 and 2096 DF, p-value: < 2.2e-16
model.lm <- update(model.lm, .~. -physical_activity) # 0.671217
model.lm <- update(model.lm, .~. -smoke) # 0.541525
model.lm <- update(model.lm, .~. -Height_m_2) # 0.460844
summary(model.lm)
##
## Call:
## lm(formula = Weight_kg ~ Height_m + overweight_history + eat_high_calories +
## eat_vegetables + number_daily_meals + eat_between_meals +
## water + monitor_calories + time_technology + frequency_alcohol +
## meals_activity, data = raw_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.272 -12.697 1.928 11.970 69.939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -148.2266 8.8919 -16.670 < 2e-16 ***
## Height_m 104.1872 4.9787 20.927 < 2e-16 ***
## overweight_history 22.3404 1.1361 19.663 < 2e-16 ***
## eat_high_calories 6.5926 1.3353 4.937 8.56e-07 ***
## eat_vegetables 8.9424 0.6982 12.808 < 2e-16 ***
## number_daily_meals 1.6188 0.5550 2.917 0.003573 **
## eat_between_meals -10.5225 0.8944 -11.765 < 2e-16 ***
## water 1.1796 0.6071 1.943 0.052151 .
## monitor_calories -7.6347 2.0108 -3.797 0.000151 ***
## time_technology -1.4448 0.6034 -2.395 0.016726 *
## frequency_alcohol 6.4362 0.8095 7.950 3.01e-15 ***
## meals_activity -1.2795 0.1726 -7.414 1.76e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.47 on 2099 degrees of freedom
## Multiple R-squared: 0.5053, Adjusted R-squared: 0.5027
## F-statistic: 194.9 on 11 and 2099 DF, p-value: < 2.2e-16
# Residuals scatterplot
plot(model.lm$fitted.values, model.lm$residuals, xlab="Fitted Values", ylab="Residuals")
abline(h=0)
# Residuals Histogram
hist(model.lm$residuals)
# QQ plot
qqnorm(model.lm$residuals)
qqline(model.lm$residuals)
# Residual analysis
par(mfrow=c(2,2))
plot(model.lm)
Overall, this model explains about \(50\%\) of the variance within the data if we model a persons weight with the above criteria. However, the residuals themselves actually demonstrate heteroskedasticity, meaning the standard deviation of our predicted variable (weight_kg) are non-constant with regards to the other variables. However, the residuals on the Q-Q plot do appear to follow a somewhat normal distribution, as also evidenced by the histogram, and are centered around a mean value of \(1.928\), with the 1st and 3rd Quartiles being nearly equidistant (\(-12.697\) and \(11.970\) respectively).
For a good model, we’d want the standard error to be at least 5-10 times smaller than its corresponding coefficient (Linear Regression Using R, Pg. 21), and we have a residual standard error of \(18.47\) on 2099 degrees of freedom (the data itself had 2111 observations). The p-values of the remaining variables are very small, thus we can see there’s strong evidence of a somewhat linear relationship between a persons weight modeled with the above fields. The overall p-value for the model is \(< 2.2e-16\).
This model is definitely a better fit than a single linear regression, but the it seems tricky to predict new data given the model is only at 50% accuracy. There is further enhancement to be made for this model.