library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(purrr)
library(boot)
library(broom)
library(lindia)
df <- read.csv("~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv", header=TRUE)
df['BMI'] <- df['Weight']/(df['Height']**2)
df |> ggplot(mapping = aes(x = CH2O, y = BMI, color = family_history_with_overweight)) + geom_point(size = 0.5)+
geom_smooth(method = 'lm', se = FALSE, color = 'Orange') + geom_hline( yintercept = mean(df$BMI), linetype = 'dashed') + theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
linear_regress <- lm(df$BMI~df$CH2O)
linear_regress$coefficients
## (Intercept) df$CH2O
## 25.915648 1.884706
CH2O_family <- df|> group_by(df$family_history_with_overweight) |> summarise(count = n(), mean = mean(BMI))
df['genetic'] = ifelse(
df$family_history_with_overweight %in% c('yes'),1,0
)
CH2O_family
## # A tibble: 2 × 3
## `df$family_history_with_overweight` count mean
## <chr> <int> <dbl>
## 1 no 385 21.5
## 2 yes 1726 31.5
df |> ggplot()+
facet_wrap(vars(genetic), labeller = label_both) +
geom_point(mapping = aes(CH2O,BMI)) +
geom_hline(data = CH2O_family,
mapping = aes(yintercept = mean), color = 'Orange')+
geom_smooth(mapping = aes(y = BMI ,x = CH2O) ,method = 'lm', se = FALSE, color = 'darkblue')
## `geom_smooth()` using formula = 'y ~ x'
lr <- lm(BMI ~ CH2O + genetic + CH2O:genetic , df)
summary(lr)
##
## Call:
## lm(formula = BMI ~ CH2O + genetic + CH2O:genetic, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.0379 -4.6836 -0.1526 4.9415 19.0092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.8772 1.0394 18.161 < 2e-16 ***
## CH2O 1.4440 0.5375 2.687 0.00728 **
## genetic 10.9231 1.2032 9.078 < 2e-16 ***
## CH2O:genetic -0.6009 0.6079 -0.989 0.32299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.991 on 2107 degrees of freedom
## Multiple R-squared: 0.2396, Adjusted R-squared: 0.2385
## F-statistic: 221.3 on 3 and 2107 DF, p-value: < 2.2e-16
checking_lr <- lm(CH2O ~ TUE , df)
checking_lr$coefficients
## (Intercept) TUE
## 2.00008777 0.01204445
Only this variable (TUE) has no linearity with CH2O. Thus upholding the assumption of ‘Independent variables should not be linearly correlated’
Visually
df |> ggplot(mapping = aes(x = CH2O, y = TUE)) + geom_point() + geom_smooth(method ='lm',se = FALSE, color = 'Orange' )
## `geom_smooth()` using formula = 'y ~ x'
final_lr <- lm(BMI ~ CH2O + genetic + TUE + CH2O:genetic, df)
summary(final_lr)
##
## Call:
## lm(formula = BMI ~ CH2O + genetic + TUE + CH2O:genetic, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.1274 -4.6279 -0.1496 4.7307 19.0059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.9592 1.0475 19.054 < 2e-16 ***
## CH2O 1.3533 0.5335 2.537 0.0113 *
## genetic 10.7260 1.1942 8.982 < 2e-16 ***
## TUE -1.4600 0.2482 -5.882 4.7e-09 ***
## CH2O:genetic -0.4687 0.6035 -0.777 0.4374
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.936 on 2106 degrees of freedom
## Multiple R-squared: 0.2519, Adjusted R-squared: 0.2504
## F-statistic: 177.2 on 4 and 2106 DF, p-value: < 2.2e-16
final_lr
model has CH2O, genetic and TUE as
independent explanatory variables and an interaction term = CH2O:genetic
to predict BMI response variable.The following assumptions must be met.
gg_resfitted(final_lr) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#plots <- gg_resX(final_lr, plot.all = FALSE)
#plots$genetic + geom_smooth(method = 'loess',se = FALSE)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:boot':
##
## logit
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
residualPlots(final_lr)
## Test stat Pr(>|Test stat|)
## CH2O 1.6343 0.1024
## genetic 1.0323 0.3021
## TUE -9.8533 <2e-16 ***
## Tukey test -0.1074 0.9145
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gg_reshist(final_lr)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histograms doesn’t have an ideal normal distribution, the right tail is uniformly decreasing and relatively longer than left tail. And the left tail has a steep decline too.
The mean of the distribution is very small close to 0, while median is -0.1496.
print(median(residuals(final_lr)))
## [1] -0.1496425
mean(residuals(final_lr))
## [1] 2.49077e-16
gg_qqplot(final_lr)
gg_cooksd(final_lr, threshold = 'matlab')
Cook’s Distance plot shows a lot of Influencial points beyond threshold. Especially, 258 has a lot of influence on the linear regression model
These influencial points can have large impact on the errors as well.
From this plot, it is clear that our data has a lot of outliers and influencial points for this model.