library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(purrr)
library(boot)
library(broom)
library(lindia)
df <- read.csv("~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv", header=TRUE)

df$BMI <- df$Weight / (df$Height^2)

Refer to the simple linear regression model you built last week. Include 1-3 more variables into your regression model.

df |>
  ggplot(mapping = aes(x = CH2O, y = BMI, color = family_history_with_overweight)) +
  geom_point(size = 0.5) +
  geom_smooth(method = 'lm', se = FALSE, color = 'Orange') +
  geom_hline(yintercept = mean(df$BMI), linetype = 'dashed') +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

linear_regress <- lm(BMI ~ CH2O, data = df)
linear_regress$coefficients
## (Intercept)        CH2O 
##   25.915648    1.884706
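
To make the fitted line concrete, it can be evaluated at a few water-intake levels with predict(); the CH2O values below are illustrative inputs, not part of the original analysis.

new_CH2O <- data.frame(CH2O = c(1, 2, 3))     # hypothetical water-intake levels (litres)
predict(linear_regress, newdata = new_CH2O)   # predicted BMI at each level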

Try out either an interaction term or a binary term to start.

# Encode family history as a 0/1 indicator for use in the regression
df$genetic <- ifelse(df$family_history_with_overweight == 'yes', 1, 0)

# Group BMI means by the indicator so each facet below gets its matching mean line
CH2O_family <- df |>
  group_by(genetic) |>
  summarise(count = n(), mean = mean(BMI))

CH2O_family
## # A tibble: 2 × 3
##   genetic count  mean
##     <dbl> <int> <dbl>
## 1       0   385  21.5
## 2       1  1726  31.5
df |>
  ggplot() +
  facet_wrap(vars(genetic), labeller = label_both) +
  geom_point(mapping = aes(CH2O, BMI)) +
  geom_hline(data = CH2O_family,
             mapping = aes(yintercept = mean), color = 'Orange') +
  geom_smooth(mapping = aes(x = CH2O, y = BMI), method = 'lm', se = FALSE, color = 'darkblue')
## `geom_smooth()` using formula = 'y ~ x'

lr <- lm(BMI ~ CH2O + genetic + CH2O:genetic, data = df)
summary(lr)
## 
## Call:
## lm(formula = BMI ~ CH2O + genetic + CH2O:genetic, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.0379  -4.6836  -0.1526   4.9415  19.0092 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   18.8772     1.0394  18.161  < 2e-16 ***
## CH2O           1.4440     0.5375   2.687  0.00728 ** 
## genetic       10.9231     1.2032   9.078  < 2e-16 ***
## CH2O:genetic  -0.6009     0.6079  -0.989  0.32299    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.991 on 2107 degrees of freedom
## Multiple R-squared:  0.2396, Adjusted R-squared:  0.2385 
## F-statistic: 221.3 on 3 and 2107 DF,  p-value: < 2.2e-16
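
The CH2O:genetic interaction is not significant (p ≈ 0.32), so one way to decide whether to keep it is a nested-model F-test; a minimal sketch, where lr_reduced is a new helper name:

# Fit the model without the interaction and compare it to lr with an F-test
lr_reduced <- lm(BMI ~ CH2O + genetic, data = df)
anova(lr_reduced, lr)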

Consider adding other integer or continuous variables.

For each new variable you try, explain why you should include it, or not. E.g., are there any issues with multicollinearity?

To check for multicollinearity between the two candidate predictors, regress CH2O on TUE:

checking_lr <- lm(CH2O ~ TUE, data = df)
checking_lr$coefficients
## (Intercept)         TUE 
##  2.00008777  0.01204445
df |> ggplot(mapping = aes(x = TUE, y = CH2O)) + geom_point() + geom_smooth(method = 'lm', se = FALSE, color = 'Orange')
## `geom_smooth()` using formula = 'y ~ x'

The slope is essentially zero (about 0.012), so CH2O and TUE are close to uncorrelated and both can enter the model without a multicollinearity concern.
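
A more direct check is the pairwise correlation, or variance inflation factors; a sketch assuming the car package is installed (it is loaded later in this document). vif() is applied to the additive model because VIFs are awkward to interpret with interaction terms:

# Pearson correlation between the two continuous predictors
cor(df$CH2O, df$TUE)

# VIFs near 1 indicate no multicollinearity among the predictors
car::vif(lm(BMI ~ CH2O + genetic + TUE, data = df))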

final_lr <- lm(BMI ~ CH2O + genetic + TUE + CH2O:genetic, data = df)
summary(final_lr)
## 
## Call:
## lm(formula = BMI ~ CH2O + genetic + TUE + CH2O:genetic, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1274  -4.6279  -0.1496   4.7307  19.0059 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   19.9592     1.0475  19.054  < 2e-16 ***
## CH2O           1.3533     0.5335   2.537   0.0113 *  
## genetic       10.7260     1.1942   8.982  < 2e-16 ***
## TUE           -1.4600     0.2482  -5.882  4.7e-09 ***
## CH2O:genetic  -0.4687     0.6035  -0.777   0.4374    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.936 on 2106 degrees of freedom
## Multiple R-squared:  0.2519, Adjusted R-squared:  0.2504 
## F-statistic: 177.2 on 4 and 2106 DF,  p-value: < 2.2e-16
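
Since broom is already loaded, the same summary is available as tidy data frames, which is convenient for reporting confidence intervals; a minimal sketch:

# Coefficient table with 95% confidence intervals
tidy(final_lr, conf.int = TRUE)

# One-row model-level summary (R-squared, sigma, F-statistic, ...)
glance(final_lr)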

Your model for this data dive should have 2-4 terms.

Evaluate this model.

The following assumptions must be met.

  1. VARIABLE x IS LINEARLY CORRELATED WITH RESPONSE y. —> True
  2. ERRORS HAVE CONSTANT VARIANCE ACROSS ALL PREDICTIONS —> Analyzed below with the residuals vs. fitted values plot
  3. OBSERVATIONS ARE INDEPENDENT AND UNCORRELATED —> The data are individuals' lifestyle records, so the observations are independent.
  4. INDEPENDENT VARIABLES CANNOT BE LINEARLY CORRELATED —> True. Concluded above from the CH2O ~ TUE check.
  5. ERRORS ARE NORMALLY DISTRIBUTED OVER THE PREDICTION LINE —> Analyzed below

At the very least, use the 5 diagnostic plots discussed in class to identify any issues with your model.

Residuals vs fitted values

gg_resfitted(final_lr) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

  • Assumption 2 is clearly not satisfied: the errors have very different variance across the fitted values.
  • There are two clusters of points, left and right; the right cluster has higher variance.
  • The trend of the residual mean line also sits more in the negative plane than in the positive plane.
  • Therefore, assumption 2 fails. (A formal score test of this is sketched after the residualPlots() discussion below.)

Residuals vs X values

#plots <- gg_resX(final_lr, plot.all = FALSE)
#plots$genetic + geom_smooth(method = 'loess',se = FALSE)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:boot':
## 
##     logit
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
residualPlots(final_lr)

##            Test stat Pr(>|Test stat|)    
## CH2O          1.6343           0.1024    
## genetic       1.0323           0.3021    
## TUE          -9.8533           <2e-16 ***
## Tukey test   -0.1074           0.9145    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • Of the four subplots, three are residuals vs. variable plots and one is the residuals vs. fitted plot (same as above).
  • In the first subplot, the blue line is very straight and nearly overlaps y = 0, indicating that the relationship between CH2O and BMI is linear.
  • In the residuals vs. TUE subplot, however, the blue line curves like a downward parabola, and the curvature test for TUE is highly significant (p < 2e-16), so the linearity assumption is questionable for TUE; a quadratic term might help.
  • The variance of the errors is still not entirely clear from these plots; the score test below addresses it directly.
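
car (loaded above) also provides a score test for non-constant error variance, which puts a number on the suspected heteroskedasticity; the null hypothesis is constant variance, so a small p-value would confirm that assumption 2 fails. A minimal sketch:

# Score test for non-constant error variance (H0: constant variance)
ncvTest(final_lr)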

Residual Histogram

gg_reshist(final_lr)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  • The histogram does not show an ideal normal distribution: the right tail decreases steadily and is somewhat longer than the left tail, which drops off steeply.

  • The mean of the residuals is essentially 0 (as it must be for OLS), while the median is -0.1496.

median(residuals(final_lr))
## [1] -0.1496425
mean(residuals(final_lr))
## [1] 2.49077e-16
  • This roughly-normal distribution of residuals suggests the model could be improved; a QQ-plot gives further intuition.

QQ-Plot

gg_qqplot(final_lr)

  • This plot further helps diagnose the quantiles at which the residual distribution deviates from an ideal normal distribution.
  • At the lower and upper quantiles, the model's standardized residuals deviate from the ideal normal line.
  • Although this observation does not by itself tell us mathematically how to improve the model, it does indicate where the model misbehaves.
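
A formal test can complement the QQ-plot; a minimal sketch using the Shapiro-Wilk test (applicable here since n = 2111 is under the test's 5000-observation limit, though at this sample size even small departures from normality will reject the null):

# Shapiro-Wilk test of the residuals (H0: residuals are normally distributed)
shapiro.test(residuals(final_lr))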

Cook’s Distance by Observation

gg_cooksd(final_lr, threshold = 'matlab')

  • The Cook’s distance plot shows many influential points beyond the threshold; observation 258 in particular has a large influence on the fitted model.

  • These influential points can also have a large impact on the errors.

  • From this plot, it is clear that the data contain many outliers and influential points for this model.
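
The influential rows can also be extracted programmatically; a minimal sketch using base R's cooks.distance() (the 4/n cutoff is one common rule of thumb and not necessarily the same threshold lindia's 'matlab' option draws):

# Rank observations by Cook's distance and flag those above 4/n
cooksd <- cooks.distance(final_lr)
head(sort(cooksd, decreasing = TRUE), 5)   # the most influential observations
sum(cooksd > 4 / nrow(df))                 # how many exceed the rule-of-thumb cutoff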

For each plot, point out any indications of issues with the model. Otherwise, explain how the plot supports the claim that an assumption is met.

  1. VARIABLE x IS LINEARLY CORRELATED WITH RESPONSE y. —> TRUE for CH2O; questionable for TUE (significant curvature test)
  2. ERRORS HAVE CONSTANT VARIANCE ACROSS ALL PREDICTIONS —> close to FALSE
  3. OBSERVATIONS ARE INDEPENDENT AND UNCORRELATED —> TRUE
  4. INDEPENDENT VARIABLES CANNOT BE LINEARLY CORRELATED —> TRUE
  5. ERRORS ARE NORMALLY DISTRIBUTED OVER THE PREDICTION LINE —> close to TRUE

Try to measure the severity of any issues as well as the level of confidence you have in an assumption being met.

Further Questions

  • How should we handle the shortcoming noted above, namely that ERRORS HAVE CONSTANT VARIANCE ACROSS ALL PREDICTIONS is not wholly satisfied? One possible remedy is sketched below.
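
One standard remedy is to keep the OLS fit but report heteroskedasticity-consistent (sandwich) standard errors; a minimal sketch, assuming the sandwich and lmtest packages are installed. Re-expressing the response (e.g., modeling log(BMI)) or weighted least squares are other options worth exploring.

library(sandwich)
library(lmtest)

# Re-test the coefficients using HC3 robust standard errors
coeftest(final_lr, vcov = vcovHC(final_lr, type = "HC3"))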