Analyzing Bodyfat using regression techniques

What is this dataset?

This dataset lists each patient's ID number together with their bodyfat (the dependent variable) and the following independent variables:

1. Density
2. Age
3. Weight
4. Height
5. Adiposity
6. Neck
7. Chest
8. Abdomen
9. Hip
10. Thigh
11. Knee
12. Ankle
13. Biceps
14. Forearm
15. Wrist

Unfortunately, I forgot where I found this dataset so that’s bad luck for anyone who wants the dataset

What does this data look like?

# Import the raw measurements; readr supplies read_csv().
library(readr)
BodyFat <- read_csv(file = "C:/Documents/My Excel/BodyFat.csv")
## Warning: Missing column names filled in: 'X18' [18]
## Parsed with column specification:
## cols(
##   IDNO = col_integer(),
##   BODYFAT = col_double(),
##   DENSITY = col_double(),
##   AGE = col_integer(),
##   WEIGHT = col_double(),
##   HEIGHT = col_double(),
##   ADIPOSITY = col_double(),
##   NECK = col_double(),
##   CHEST = col_double(),
##   ABDOMEN = col_double(),
##   HIP = col_double(),
##   THIGH = col_double(),
##   KNEE = col_double(),
##   ANKLE = col_double(),
##   BICEPS = col_double(),
##   FOREARM = col_double(),
##   WRIST = col_double(),
##   X18 = col_character()
## )
# The CSV has an unnamed 18th column that readr imports as "X18"; drop it.
BodyFat$X18 <- NULL
# View() opens a data viewer window, so only call it in an interactive session.
if (interactive()) {
  View(BodyFat)
}
# str() prints the structure itself and returns NULL invisibly; wrapping it in
# print() only appends a stray "NULL" line after the structure listing.
str(BodyFat)
## Classes 'tbl_df', 'tbl' and 'data.frame':    252 obs. of  17 variables:
##  $ IDNO     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ BODYFAT  : num  12.6 6.9 24.6 10.9 27.8 20.6 19 12.8 5.1 12 ...
##  $ DENSITY  : num  1.07 1.09 1.04 1.08 1.03 ...
##  $ AGE      : int  23 22 22 26 24 24 26 25 25 23 ...
##  $ WEIGHT   : num  154 173 154 185 184 ...
##  $ HEIGHT   : num  67.8 72.2 66.2 72.2 71.2 ...
##  $ ADIPOSITY: num  23.7 23.4 24.7 24.9 25.6 26.5 26.2 23.6 24.6 25.8 ...
##  $ NECK     : num  36.2 38.5 34 37.4 34.4 39 36.4 37.8 38.1 42.1 ...
##  $ CHEST    : num  93.1 93.6 95.8 101.8 97.3 ...
##  $ ABDOMEN  : num  85.2 83 87.9 86.4 100 94.4 90.7 88.5 82.5 88.6 ...
##  $ HIP      : num  94.5 98.7 99.2 101.2 101.9 ...
##  $ THIGH    : num  59 58.7 59.6 60.1 63.2 66 58.4 60 62.9 63.1 ...
##  $ KNEE     : num  37.3 37.3 38.9 37.3 42.2 42 38.3 39.4 38.3 41.7 ...
##  $ ANKLE    : num  21.9 23.4 24 22.8 24 25.6 22.9 23.2 23.8 25 ...
##  $ BICEPS   : num  32 30.5 28.8 32.4 32.2 35.7 31.9 30.5 35.9 35.6 ...
##  $ FOREARM  : num  27.4 28.9 25.2 29.4 27.7 30.6 27.8 29 31.1 30 ...
##  $ WRIST    : num  17.1 18.2 16.6 18.2 17.7 18.8 17.7 18.8 18.2 19.2 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 18
##   .. ..$ IDNO     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ BODYFAT  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ DENSITY  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ AGE      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ WEIGHT   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ HEIGHT   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ADIPOSITY: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ NECK     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ CHEST    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ABDOMEN  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ HIP      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ THIGH    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ KNEE     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ANKLE    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ BICEPS   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ FOREARM  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ WRIST    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ X18      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
## NULL
# Per-column minimum, quartiles, median, mean and maximum for every variable.
summary(BodyFat)
##       IDNO           BODYFAT         DENSITY           AGE       
##  Min.   :  1.00   Min.   : 0.00   Min.   :0.995   Min.   :22.00  
##  1st Qu.: 63.75   1st Qu.:12.80   1st Qu.:1.041   1st Qu.:35.75  
##  Median :126.50   Median :19.00   Median :1.055   Median :43.00  
##  Mean   :126.50   Mean   :18.94   Mean   :1.056   Mean   :44.88  
##  3rd Qu.:189.25   3rd Qu.:24.60   3rd Qu.:1.070   3rd Qu.:54.00  
##  Max.   :252.00   Max.   :45.10   Max.   :1.109   Max.   :81.00  
##      WEIGHT          HEIGHT        ADIPOSITY          NECK      
##  Min.   :118.5   Min.   :29.50   Min.   :18.10   Min.   :31.10  
##  1st Qu.:159.0   1st Qu.:68.25   1st Qu.:23.10   1st Qu.:36.40  
##  Median :176.5   Median :70.00   Median :25.05   Median :38.00  
##  Mean   :178.9   Mean   :70.15   Mean   :25.44   Mean   :37.99  
##  3rd Qu.:197.0   3rd Qu.:72.25   3rd Qu.:27.32   3rd Qu.:39.42  
##  Max.   :363.1   Max.   :77.75   Max.   :48.90   Max.   :51.20  
##      CHEST           ABDOMEN            HIP            THIGH      
##  Min.   : 79.30   Min.   : 69.40   Min.   : 85.0   Min.   :47.20  
##  1st Qu.: 94.35   1st Qu.: 84.58   1st Qu.: 95.5   1st Qu.:56.00  
##  Median : 99.65   Median : 90.95   Median : 99.3   Median :59.00  
##  Mean   :100.82   Mean   : 92.56   Mean   : 99.9   Mean   :59.41  
##  3rd Qu.:105.38   3rd Qu.: 99.33   3rd Qu.:103.5   3rd Qu.:62.35  
##  Max.   :136.20   Max.   :148.10   Max.   :147.7   Max.   :87.30  
##       KNEE           ANKLE          BICEPS         FOREARM     
##  Min.   :33.00   Min.   :19.1   Min.   :24.80   Min.   :21.00  
##  1st Qu.:36.98   1st Qu.:22.0   1st Qu.:30.20   1st Qu.:27.30  
##  Median :38.50   Median :22.8   Median :32.05   Median :28.70  
##  Mean   :38.59   Mean   :23.1   Mean   :32.27   Mean   :28.66  
##  3rd Qu.:39.92   3rd Qu.:24.0   3rd Qu.:34.33   3rd Qu.:30.00  
##  Max.   :49.10   Max.   :33.9   Max.   :45.00   Max.   :34.90  
##      WRIST      
##  Min.   :15.80  
##  1st Qu.:17.60  
##  Median :18.30  
##  Mean   :18.23  
##  3rd Qu.:18.80  
##  Max.   :21.40

Listing the unique ages for this dataset

# Distinct ages in the sample: first how many there are, then the values.
distinct_ages <- unique(BodyFat$AGE)
length(distinct_ages)
## [1] 51
distinct_ages
##  [1] 23 22 26 24 25 27 32 30 35 34 28 33 31 29 41 49 40 50 46 45 44 48 39
## [24] 43 47 51 42 54 58 62 61 56 57 55 69 81 66 67 64 70 72 53 38 52 36 37
## [47] 60 63 65 68 74

Plotting age vs bodyfat

Looking at the graphs, there is some upward correlation between age and bodyfat, but the data is mostly scattered all over the place. There is some correlation but not so much since various people of various ages can have different bodyfat levels depending on their diet and health

library(ggplot2)

# Age against body fat; colour tracks body fat and point size tracks age.
# x and y are inherited from ggplot(), so the layer only adds colour and size.
ggplot(data = BodyFat, mapping = aes(x = AGE, y = BODYFAT)) +
  geom_point(mapping = aes(color = BODYFAT, size = AGE))

# Same variables drawn with jittered points.
ggplot(data = BodyFat, mapping = aes(x = AGE, y = BODYFAT)) +
  geom_jitter() # There is some correlation b/t age and bodyfat

Plotting age vs weight

No correlation whatsoever. This tells me that age has nothing to do with weight.

# Age against weight; colour tracks weight and point size tracks age.
# Aesthetic names are case-sensitive: a capitalised `X` is ignored with an
# "unknown aesthetics" warning, so the layer relies on the x/y mapping
# inherited from ggplot() and only adds colour and size.
ggplot(BodyFat, aes(x = AGE, y = WEIGHT)) +
  geom_jitter(aes(color = WEIGHT, size = AGE))

ggplot(BodyFat, aes(x = AGE, y = WEIGHT)) +
  geom_jitter() # No correlation b/t age and weight

# Distribution of weight across the whole sample.
ggplot(BodyFat) +
  geom_density(aes(x = WEIGHT))

From the density graph shown above, the weights are concentrated roughly between 150 and 200 lbs (the summary above gives a median of 176.5 lbs).

Plotting weight vs height

There is a strong correlation between height and weight as shown by these 2 graphs

# Weight against height; colour tracks weight and point size tracks height.
# Aesthetic names are case-sensitive: a capitalised `X` is ignored with an
# "unknown aesthetics" warning, so the layer relies on the x/y mapping
# inherited from ggplot() and only adds colour and size.
ggplot(BodyFat, aes(x = WEIGHT, y = HEIGHT)) +
  geom_jitter(aes(color = WEIGHT, size = HEIGHT))

ggplot(BodyFat, aes(x = WEIGHT, y = HEIGHT)) +
  geom_jitter()

Which patients have the highest and lowest bodyfat? Top 5/bottom 5

# Five patients with the highest body fat. TRUE is spelled out because T is an
# ordinary variable that can be reassigned; head() returns at most five
# indices, so a shorter table would not produce NA rows.
BodyFat[head(order(BodyFat$BODYFAT, decreasing = TRUE), 5L), ] # What can I say about this?
## # A tibble: 5 x 17
##    IDNO BODYFAT DENSITY   AGE WEIGHT HEIGHT ADIPOSITY  NECK CHEST ABDOMEN
##   <int>   <dbl>   <dbl> <int>  <dbl>  <dbl>     <dbl> <dbl> <dbl>   <dbl>
## 1   216    45.1   0.995    51   219    64        37.6  41.2  120.    122.
## 2    36    38.2   1.01     49   192.   65        32    38.4  118.    113.
## 3   192    36.5   1.01     42   244.   76        29.8  41.8  115.    114.
## 4   169    34.7   1.02     35   228.   69.5      33.3  40.4  115.    116.
## 5    39    33.8   1.02     46   363.   72.2      48.9  51.2  136.    148.
## # ... with 7 more variables: HIP <dbl>, THIGH <dbl>, KNEE <dbl>,
## #   ANKLE <dbl>, BICEPS <dbl>, FOREARM <dbl>, WRIST <dbl>
# Five patients with the lowest body fat. FALSE is spelled out because F is an
# ordinary variable that can be reassigned; head() returns at most five
# indices, so a shorter table would not produce NA rows.
# NOTE(review): the lowest record has BODYFAT = 0 (IDNO 182), which looks like
# a data-entry problem -- confirm before relying on it.
BodyFat[head(order(BodyFat$BODYFAT, decreasing = FALSE), 5L), ] # What can I say about this?
## # A tibble: 5 x 17
##    IDNO BODYFAT DENSITY   AGE WEIGHT HEIGHT ADIPOSITY  NECK CHEST ABDOMEN
##   <int>   <dbl>   <dbl> <int>  <dbl>  <dbl>     <dbl> <dbl> <dbl>   <dbl>
## 1   182     0      1.11    40   118.   68        18.1  33.8  79.3    69.4
## 2   172     1.9    1.10    35   126.   65.5      20.6  34    90.8    75  
## 3   171     4.1    1.09    35   152.   67.8      23.4  37    92.2    81.9
## 4    26     4.6    1.09    27   159.   71.5      21.9  35.7  89.6    79.7
## 5    29     4.7    1.09    27   133.   64.8      22.4  36.4  93.5    73.9
## # ... with 7 more variables: HIP <dbl>, THIGH <dbl>, KNEE <dbl>,
## #   ANKLE <dbl>, BICEPS <dbl>, FOREARM <dbl>, WRIST <dbl>

Calculating the Regression lines

# Simple linear regression of body fat on age.
BfvsAge <- lm(formula = BODYFAT ~ AGE, data = BodyFat)
# Check whether R-squared backs up the age vs body-fat scatter plot above.
summary(object = BfvsAge)
## 
## Call:
## lm(formula = BODYFAT ~ AGE, data = BodyFat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0697  -5.7025   0.2846   4.8301  25.0739 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.95546    1.73576   6.312 1.25e-09 ***
## AGE          0.17786    0.03724   4.776 3.04e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.435 on 250 degrees of freedom
## Multiple R-squared:  0.08362,    Adjusted R-squared:  0.07996 
## F-statistic: 22.81 on 1 and 250 DF,  p-value: 3.045e-06
# Simple linear regression of body fat on weight.
BfvsWeight <- lm(formula = BODYFAT ~ WEIGHT, data = BodyFat)
summary(object = BfvsWeight)
## 
## Call:
## lm(formula = BODYFAT ~ WEIGHT, data = BodyFat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.434  -4.315   0.079   4.540  19.681 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.99515    2.38906  -4.184 3.97e-05 ***
## WEIGHT       0.16171    0.01318  12.273  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.135 on 250 degrees of freedom
## Multiple R-squared:  0.376,  Adjusted R-squared:  0.3735 
## F-statistic: 150.6 on 1 and 250 DF,  p-value: < 2.2e-16
# Simple linear regression of body fat on height.
BfvsHeight <- lm(formula = BODYFAT ~ HEIGHT, data = BodyFat)
# TODO: explain why R-squared is so low for this model.
summary(object = BfvsHeight)
## 
## Call:
## lm(formula = BODYFAT ~ HEIGHT, data = BodyFat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.3437  -6.1422   0.4013   5.5710  25.0021 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.1654     9.3635   3.435 0.000693 ***
## HEIGHT       -0.1886     0.1333  -1.415 0.158453    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.735 on 250 degrees of freedom
## Multiple R-squared:  0.00794,    Adjusted R-squared:  0.003972 
## F-statistic: 2.001 on 1 and 250 DF,  p-value: 0.1585

Analysis of the Regressions

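I calculated the 3 regressions shown above, where bodyfat (the dependent variable) is regressed on Age, Weight, and Height, each taken on its own as the independent variable. For bodyfat vs Age, \(R^2 \approx 8.4\%\), and for bodyfat vs Height, \(R^2 < 1\%\). This tells me that age and height each explain very little of the variation in bodyfat on their own (the age slope is statistically significant, but the fit is weak; the height slope is not significant). This makes sense because bodyfat depends on many other factors, such as diet and exercise.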

Now for bodyfat vs Weight. The \(R^2\) was around 37 percent, which is much higher than for Age and Height. What this is telling me is that some of the patients are heavy due to being fat, and some are heavy due to having lots of muscle. Therefore this model has some predictive value, but not much.

Calculating Bodyfat vs all independent variables

# Multiple regression of body fat on all fifteen measurements at once
# (IDNO is only an identifier and is left out).
BfvsALL <- lm(
  formula = BODYFAT ~ DENSITY + AGE + WEIGHT + HEIGHT + ADIPOSITY +
    NECK + CHEST + ABDOMEN + HIP + THIGH + KNEE + ANKLE + BICEPS +
    FOREARM + WRIST,
  data = BodyFat
)
summary(object = BfvsALL)
## 
## Call:
## lm(formula = BODYFAT ~ DENSITY + AGE + WEIGHT + HEIGHT + ADIPOSITY + 
##     NECK + CHEST + ABDOMEN + HIP + THIGH + KNEE + ANKLE + BICEPS + 
##     FOREARM + WRIST, data = BodyFat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7632 -0.3308 -0.0954  0.2078 13.9487 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.190e+02  9.802e+00  42.750   <2e-16 ***
## DENSITY     -3.816e+02  7.559e+00 -50.481   <2e-16 ***
## AGE          1.078e-02  8.808e-03   1.224    0.222    
## WEIGHT       1.197e-02  1.467e-02   0.816    0.415    
## HEIGHT      -1.782e-02  3.019e-02  -0.590    0.556    
## ADIPOSITY   -5.493e-02  8.113e-02  -0.677    0.499    
## NECK        -2.062e-02  6.427e-02  -0.321    0.749    
## CHEST        2.993e-02  2.856e-02   1.048    0.296    
## ABDOMEN      2.260e-02  3.016e-02   0.749    0.454    
## HIP          1.611e-02  4.023e-02   0.401    0.689    
## THIGH        1.354e-03  3.980e-02   0.034    0.973    
## KNEE        -3.978e-02  6.705e-02  -0.593    0.554    
## ANKLE       -7.170e-02  6.073e-02  -1.181    0.239    
## BICEPS      -6.291e-02  4.688e-02  -1.342    0.181    
## FOREARM      4.324e-02  5.447e-02   0.794    0.428    
## WRIST        3.640e-02  1.480e-01   0.246    0.806    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.166 on 236 degrees of freedom
## Multiple R-squared:  0.9787, Adjusted R-squared:  0.9774 
## F-statistic: 723.9 on 15 and 236 DF,  p-value: < 2.2e-16

Analysis of this regression equation

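From this regression, I got an adjusted \(R^2\) of 97.74% (multiple \(R^2\) of 97.87%), so the model fits the data extremely well. However, the coefficient table shows that DENSITY is the only predictor with a statistically significant coefficient (p < 2e-16); every other measurement has a p-value above 0.18. The high \(R^2\) is therefore driven almost entirely by density rather than by all of the independent variables together.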