Fixing the data

bodyfat=var_names <- c('case', 'brozek_bdyfat', 'siri_bdyfat', 'density', 'age', 
               'weight', 'height', 'adiposity', 'ff_weight', 'neck_circ',
               'chest_circ', 'ab_circ', 'hip_circ', 'thigh_circ', 'knee_circ',
               'ank_circ', 'bicep_circ', 'forearm_circ', 'wrist_circ')
bodyfat <- read.delim('fat.dat.txt', sep = '', header = FALSE, col.names = var_names)
bodyfat[42, 7] <- 69.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
bodyfat[c(48, 76, 96, 42, 182), 
        c("density", "brozek", "siri", "height_in")]
## Error in `[.data.frame`(bodyfat, c(48, 76, 96, 42, 182), c("density", : undefined columns selected
bodyfat$density[c(48, 76, 96)] <- c(1.0865, 1.0566, 1.0591)
bodyfat <- bodyfat %>% 
  mutate(siri_C = round(495/density - 450, 1),
         brozek_C = round(457/density - 414.2, 1),
         bmi_C = round((weight*0.453592) /
                         (height*2.54/100)^2, 1))
bodyfat[c(48, 76, 96, 42, 182), c("density", "brozek", 
                                  "brozek_C", "siri_C", 
                                  "siri")]
## Error in `[.data.frame`(bodyfat, c(48, 76, 96, 42, 182), c("density", : undefined columns selected
bodyfat= bodyfat[-182, ]

##Regressions We tested the Brozek body fat precentage as a response variable against all variables in the data

mod= lm(brozek_bdyfat ~ density + age + weight + height + adiposity+ 
          ff_weight+ neck_circ + chest_circ+ ab_circ+ hip_circ + thigh_circ + knee_circ+
          ank_circ + bicep_circ + forearm_circ + wrist_circ, data=bodyfat)

summary(mod)
## 
## Call:
## lm(formula = brozek_bdyfat ~ density + age + weight + height + 
##     adiposity + ff_weight + neck_circ + chest_circ + ab_circ + 
##     hip_circ + thigh_circ + knee_circ + ank_circ + bicep_circ + 
##     forearm_circ + wrist_circ, data = bodyfat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59205 -0.07011 -0.02040  0.04971  0.87370 
## 
## Coefficients:
##                Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   4.055e+02  3.832e+00  105.825  < 2e-16 ***
## density      -3.756e+02  2.829e+00 -132.740  < 2e-16 ***
## age          -1.671e-03  1.079e-03   -1.549  0.12285    
## weight        1.147e-02  3.893e-03    2.946  0.00354 ** 
## height        1.292e-01  1.944e-02    6.645 2.10e-10 ***
## adiposity     1.689e-01  2.683e-02    6.294 1.51e-09 ***
## ff_weight    -4.916e-02  3.825e-03  -12.854  < 2e-16 ***
## neck_circ     1.508e-02  7.908e-03    1.907  0.05778 .  
## chest_circ    1.073e-03  3.603e-03    0.298  0.76617    
## ab_circ       1.493e-03  3.673e-03    0.406  0.68479    
## hip_circ      6.290e-03  4.909e-03    1.281  0.20140    
## thigh_circ   -2.674e-03  5.077e-03   -0.527  0.59895    
## knee_circ    -1.845e-02  8.238e-03   -2.239  0.02608 *  
## ank_circ      8.640e-03  7.479e-03    1.155  0.24916    
## bicep_circ   -4.103e-03  5.799e-03   -0.707  0.47996    
## forearm_circ  1.469e-02  6.776e-03    2.168  0.03115 *  
## wrist_circ    2.424e-02  1.817e-02    1.334  0.18336    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1428 on 234 degrees of freedom
## Multiple R-squared:  0.9997, Adjusted R-squared:  0.9997 
## F-statistic: 4.509e+04 on 16 and 234 DF,  p-value: < 2.2e-16

Using the significant variables from the previous model, we found our best model.

mod2= lm(brozek_bdyfat ~ density +  weight + height + adiposity+ 
          ff_weight+ knee_circ+ forearm_circ, data=bodyfat)
summary (mod2)
## 
## Call:
## lm(formula = brozek_bdyfat ~ density + weight + height + adiposity + 
##     ff_weight + knee_circ + forearm_circ, data = bodyfat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58744 -0.06609 -0.02441  0.05095  0.92271 
## 
## Coefficients:
##                Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   4.063e+02  3.413e+00  119.036  < 2e-16 ***
## density      -3.756e+02  2.557e+00 -146.894  < 2e-16 ***
## weight        1.293e-02  3.820e-03    3.384 0.000831 ***
## height        1.328e-01  1.872e-02    7.096 1.40e-11 ***
## adiposity     1.795e-01  2.525e-02    7.108 1.31e-11 ***
## ff_weight    -4.822e-02  3.543e-03  -13.609  < 2e-16 ***
## knee_circ    -1.832e-02  7.434e-03   -2.465 0.014412 *  
## forearm_circ  1.653e-02  6.015e-03    2.749 0.006435 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1433 on 243 degrees of freedom
## Multiple R-squared:  0.9997, Adjusted R-squared:  0.9997 
## F-statistic: 1.023e+05 on 7 and 243 DF,  p-value: < 2.2e-16

This residuals plot shows that this is a strong model.

par(mfrow=c(2,2))
plot(mod2)

Scatterplots

This figure shows a tight, negative correlation between density and Brozek body fat, meaning the more compact an individual’s body is, the less body fat they have.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
(densityplot= ggplot(data = bodyfat, aes(x = density, y = brozek_bdyfat, 
                                color = age)) +
  geom_point(size=1.75) + 
  theme_bw())

This scatterplot depicts weight in pounds against the brozek body fat percentage. There is a positive correlation between the variables, but the relationship is relatively weak.

(weightplot= ggplot(data = bodyfat, aes(x = weight, y = brozek_bdyfat, 
                                        color = age)) +
    geom_point(size = 1.75) + 
    geom_smooth(method = "lm", se = FALSE, color = 'red') +
    theme_bw())
## `geom_smooth()` using formula 'y ~ x'

This figure demonstrates the relationship between height in inches and the Brozek body fat percentage. There is no clear correlation between the two variables.

(heightplot= ggplot(data = bodyfat, aes(x = height, y = brozek_bdyfat, 
                                        color = age)) +
    geom_point(size=1.75) +   geom_smooth(method = "lm", se = FALSE, color = 'red') +
    theme_bw())
## `geom_smooth()` using formula 'y ~ x'

There is a positive correlation between the adiposity index, which measures weight divided by height squared, and Brozek body fat percentage.

(adiposityplot= ggplot(data = bodyfat, aes(x = adiposity, y = brozek_bdyfat, 
                                        color = age)) +
    geom_point(size=1.75) +   geom_smooth(method = "lm", se = FALSE, color = 'red') +
    theme_bw())
## `geom_smooth()` using formula 'y ~ x'

This figure shows that there is no relationship between fat free weight in pounds and the Brozek body fat percentage. This makes sense because fat free weight does not account for body fat whereas the Brozek body fat percentage expressly calculates an individual’s body fat.

ff_plot <- ggplot(data = bodyfat, aes(x = ff_weight, y = brozek_bdyfat, 
                                      color = age)) +
  geom_point(size = 1.75) + 
  geom_smooth(method = "lm", se = FALSE, color = 'red') +
  theme_bw()
ff_plot
## `geom_smooth()` using formula 'y ~ x'

The is scatterplot shows the positive correlation between knee circumference in centimeters and the Brozek body fat percentage.

(kneeplot= ggplot(data = bodyfat, aes(x = knee_circ, y = brozek_bdyfat, 
                                        color = age)) +
    geom_point(size=1.75) + geom_smooth(method = "lm", se = FALSE, color = 'red')+
    theme_bw())
## `geom_smooth()` using formula 'y ~ x'

This figure depicts the positive correlation between forearm circumference and Brozek body fat percentage. This correlation is relatively weak though and has several outliers.

(forearmplot= ggplot(data = bodyfat, aes(x = forearm_circ, y = brozek_bdyfat, 
                                        color = age)) + geom_smooth(method = "lm", se = FALSE, color = 'red')+
    geom_point(size=1.75) + 
    theme_bw())
## `geom_smooth()` using formula 'y ~ x'

Outlier Tests

library(car)
## Warning: package 'car' was built under R version 4.1.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.2
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
qqPlot(mod2, simulate=TRUE, labels=row.names(bodyfat),
       id=list(method="identify"), main="Q-Q Plot")

outlierTest(mod2)
##      rstudent unadjusted p-value Bonferroni p
## 33   7.227187         6.4152e-12   1.6102e-09
## 163  4.332917         2.1581e-05   5.4168e-03
## 98  -4.328348         2.2000e-05   5.5221e-03
hat.plot <- function(mod2, bodyfat) {
  p <- length(coefficients(mod2))
  n <- length(fitted(mod2))
  plot(hatvalues(mod2), main="Index Plot of Hat Values")
  abline(h=c(2,3)*p/n, col="red", lty=2)
  text(hatvalues(mod2), labels=rownames(bodyfat), cex=0.9, font=2)
  identify(1:n, hatvalues(mod2), names(hatvalues(mod2)))
}
hat.plot(mod2, bodyfat)

## integer(0)
influencePlot(mod2, id="noteworthy", main="Influence Plot",
              sub="Circle size is proportional to Cook's distance")
## Warning in applyDefaults(id, defaults = list(method = "noteworthy", n = 2, :
## unnamed id arguments, will be ignored

##       StudRes        Hat     CookD
## 33   7.227187 0.03951233 0.2218225
## 39  -3.485991 0.43112829 1.1006954
## 163  4.332917 0.29797784 0.9282088
## 221  2.284564 0.30782517 0.2851865
spreadLevelPlot(mod2)

## 
## Suggested power transformation:  1.372288