# the following libraries will be used in order to perform the necessary code
library(tidyverse)
library(ggplot2)
# Loading the data into R
setwd("C:/Users/kpeter81/OneDrive - montgomerycollege.edu/Datasets")
yrbss <- read_csv("yrbss - yrbss.csv")Project #1 - DATA 110
Impact of Height, Age, and Physical Activity on Youth Weight
Introduction
From 1991-2013, data was collected on 13,583 youth across the United States by the US Centers for Disease Control and Prevention (data was accessed through OpenIntro). Their ages ranged from 12-18, and data was gathered on their age, gender, height (in meters), weight (in kilograms), physical activity, and 8 other variables. Physical activity was measured based on how many days per week the participant was physically active in some way. In this project, I will explore which of the numerical variables impacted the participants weight through visualizations and multiple linear regression.
Load the Libraries
Exploring the data
# Examining the structure of the dataset
str(yrbss)spc_tbl_ [13,583 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ age : num [1:13583] 14 14 15 15 15 15 15 14 15 15 ...
$ gender : chr [1:13583] "female" "female" "female" "female" ...
$ grade : chr [1:13583] "9" "9" "9" "9" ...
$ hispanic : chr [1:13583] "not" "not" "hispanic" "not" ...
$ race : chr [1:13583] "Black or African American" "Black or African American" "Native Hawaiian or Other Pacific Islander" "Black or African American" ...
$ height : num [1:13583] NA NA 1.73 1.6 1.5 1.57 1.65 1.88 1.75 1.37 ...
$ weight : num [1:13583] NA NA 84.4 55.8 46.7 ...
$ helmet_12m : chr [1:13583] "never" "never" "never" "never" ...
$ text_while_driving_30d : chr [1:13583] "0" NA "30" "0" ...
$ physically_active_7d : num [1:13583] 4 2 7 0 2 1 4 4 5 0 ...
$ hours_tv_per_school_day : chr [1:13583] "5+" "5+" "5+" "2" ...
$ strength_training_7d : num [1:13583] 0 0 0 0 1 0 2 0 3 0 ...
$ school_night_hours_sleep: chr [1:13583] "8" "6" "<5" "6" ...
- attr(*, "spec")=
.. cols(
.. age = col_double(),
.. gender = col_character(),
.. grade = col_character(),
.. hispanic = col_character(),
.. race = col_character(),
.. height = col_double(),
.. weight = col_double(),
.. helmet_12m = col_character(),
.. text_while_driving_30d = col_character(),
.. physically_active_7d = col_double(),
.. hours_tv_per_school_day = col_character(),
.. strength_training_7d = col_double(),
.. school_night_hours_sleep = col_character()
.. )
- attr(*, "problems")=<externalptr>
# List of the variable names
names(yrbss) [1] "age" "gender"
[3] "grade" "hispanic"
[5] "race" "height"
[7] "weight" "helmet_12m"
[9] "text_while_driving_30d" "physically_active_7d"
[11] "hours_tv_per_school_day" "strength_training_7d"
[13] "school_night_hours_sleep"
# the number of NAs in each column
colSums(is.na(yrbss)) age gender grade
77 12 79
hispanic race height
231 2805 1004
weight helmet_12m text_while_driving_30d
1004 311 918
physically_active_7d hours_tv_per_school_day strength_training_7d
273 338 1176
school_night_hours_sleep
1248
Cleaning the Data
I am going to create a new dataset (yrbss1) that will make visualization of the data, as well as the multiple linear regression, far easier. First, I will coerce 2 discrete quantitative variables (physical_activity_7d and age) to become qualitative. I will also filter out NA values from the variables I am using.
yrbss1 <- yrbss |>
mutate(physical_activity_days = as.factor(physically_active_7d)) |> # physical activity as a factor
mutate (age_fct = as.factor(age)) |> # age as a factor (12, 13, 14, 15, 16, 17, & 18)
# Removing NA values
filter(!is.na(physical_activity_days)) |>
filter (!is.na(height)) |>
filter (!is.na(age)) |>
filter(!is.na(weight))
head(yrbss1)# A tibble: 6 × 15
age gender grade hispanic race height weight helmet_12m
<dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
1 15 female 9 hispanic Native Hawaiian or Other… 1.73 84.4 never
2 15 female 9 not Black or African American 1.6 55.8 never
3 15 female 9 not Black or African American 1.5 46.7 did not r…
4 15 female 9 not Black or African American 1.57 67.1 did not r…
5 15 female 9 not Black or African American 1.65 132. did not r…
6 14 male 9 not Black or African American 1.88 71.2 never
# ℹ 7 more variables: text_while_driving_30d <chr>, physically_active_7d <dbl>,
# hours_tv_per_school_day <chr>, strength_training_7d <dbl>,
# school_night_hours_sleep <chr>, physical_activity_days <fct>, age_fct <fct>
Visualizing the data
I am going to create separate visualizations to see the individual relationships weight and each variable: physical activity, height, and age.
# Visualization of the relationship between physical activity and weight
activity_boxplot <- yrbss1 |>
ggplot(aes(x = physical_activity_days, y = weight, fill = physical_activity_days)) +
geom_boxplot(outlier.alpha = .3, outlier.shape = 1) +
theme_grey() +
labs(title = "Relationship between Physical Activity and Weight \nAmong youth ages 12-18", x = "Days of Physical Activity", y = "Weight (kgs)", caption = "Data Source: OpenIntro, data gathered by the CDC") +
scale_fill_brewer(palette = "Blues")
activity_boxplotBased on this graph, there seems to be no clear relationship between weight and the number of days a week that people exercise. Whether youth not at all or all 7 days a week, the median weight was generally about the same. In addition, the distribution of outliers was realitively even across all groups. This could be due to a range of factors, a prevalent one likely being the gender of the individual and genetics.
# Visualization of the relationship between height and weight
height_density <- yrbss1 |>
ggplot(aes(x = height, y = weight)) +
geom_smooth(color = "#569691", fill = "#84bdb8") +
theme_gray () +
#scale_fill_continuous(palette = "Blues", aesthetics = "colour") +
labs(title = "Relationship between Height and Weight \nAmong youth ages 12-18", x = "Height (meters)", y = "Weight (kgs)", caption = "Data Source: OpenIntro, data gathered by the CDC")
height_density`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
This graph shows a strong positive correlation between the height of the participant and their respective weight, which is helpful for considering which variables to include in the linear regression model. As Height incrased, there was a clear increase in weight as well. This makes sense logically, the taller an individual is, the more likely that they weigh heavier than someone shorter.
# Visualization of the relationship between age and weight
lm_activity <- yrbss1 |>
ggplot(aes(x = age_fct, y = weight, fill = age_fct)) +
geom_boxplot(outlier.alpha = .2, outlier.shape = 1) +
theme_grey() +
labs(title = "Relationship between Age and Weight \nAmong youth ages 12-18", x = "Age", y = "Weight (kgs)", caption = "Data Source: OpenIntro, data gathered by the CDC") +
scale_fill_brewer(palette = "Greens")
lm_activityThis visualization shows a slight positive trend between age and weight, which will be useful to know when creating our multiple linear regression model. It is also interesting that, from this graph, we can observe older age groups contain more outliers on the upper end of the weight variable than younger groups. Meaning, among older teens, there are more participants that are significantly outside the overall pattern of the dataset.
Multiple Linear Regression
Now that I’ve examined each of the variables individual relationship with the participants weight, I will use multiple linear regression to see how each of them contribute to the participants weight.
# Multiple Linear Regression model using age, height, and number of days physically active in a week
linear_reg <- lm(weight ~ age + height + physically_active_7d, data = yrbss1) #lm(y ~ x)
summary(linear_reg)
Call:
lm(formula = weight ~ age + height + physically_active_7d, data = yrbss1)
Residuals:
Min 1Q Median 3Q Max
-41.313 -9.447 -3.044 6.090 93.657
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -93.3892 2.5147 -37.14 < 2e-16 ***
age 1.4365 0.1044 13.76 < 2e-16 ***
height 82.3747 1.2731 64.70 < 2e-16 ***
physically_active_7d -0.3228 0.0519 -6.22 5.12e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 14.33 on 12360 degrees of freedom
Multiple R-squared: 0.2794, Adjusted R-squared: 0.2793
F-statistic: 1598 on 3 and 12360 DF, p-value: < 2.2e-16
Adjusted R-squared: 0.2793
p-value: < 2.2e-16
Equation: y = 1.4365(age) + -0.3228(physically_active_7d) + 82.3747(height) - 93.3892
This multiple linear model has statistically significant p-values for each of the variables, which suggests all three variables (age, physical activity, and height) are meaningful to explain the weight of youth ages 12-18. The adjusted R-Squared value states that about 28% of the variation in the observations can be explained by the model, 72% of the variation is not explained by the model.
So, age, amount of physical activity, and height can explain about 28% of the variation in observations.
Conclusion
When cleaning this dataset, I had to ensure that there were no NA values in any of the variables I would be using, so I utilized the filter() function on the age, height, weight, and physically_active_7d variables. Then, in order to allow the visualization through boxplots, I had to mutate 2 new variables (physical_activity_days & age_fct) and coerce the original numeric data into qualitative data. This was only possible because a) the variables were discrete and b) they each had limited ranges. Only ages 12-18 were studied, and the maximum number of physically active days per week was 7.
Once cleaning the data was cleaned, I was able to begin creating visualizations. I was very surprised with the outcome of my first visualization (a boxplot displaying the relationship between weight and physical activity) displayed no clear relationship between the two variables. I would’ve assumed that increased physical activity would decrease weight, but I suppose that gender may play a role. While girls may engage in physical activity to decrease weight, physical activity for boys might occur to increase overall body weight and strengthen muscle. There might be other factors that explain this lack of strong correlation as well, but further research would need to be conducted to explain this surprising result. My other two graphs had far more expected results, height had a very strong positive correlation with weight, and age did as well though not quite as strong.
I would’ve liked to be able to visualize my multiple linear regression and to see how well the model matches the various data points. Unfortunately I wasn’t sure how to do that, hopefully I can include that in later projects.