This project analyzes FitBit data collected from 35 participants over two months. The participants consented to sharing data on calories burned, steps, activity, sleep, heart rate, and more. The data sets come from Kaggle: https://www.kaggle.com/datasets/arashnic/fitbit
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Read the daily activity and daily sleep exports into data frames.
# NOTE(review): the two files live in different Fitabase export folders
# (3.12.16-4.11.16 vs 4.12.16-5.12.16) - confirm the date windows overlap
# before relating sleep to activity on a per-day basis.
daily_activity <- read.csv(
  file = "~/R/Case Study/Fitabase Data 3.12.16-4.11.16/dailyActivity_merged.csv"
)
sleep_day <- read.csv(
  file = "~/R/Case Study/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv"
)
# Preview the column names and types of the activity data.
daily_activity %>% glimpse()
## Rows: 457
## Columns: 15
## $ Id <dbl> 1503960366, 1503960366, 1503960366, 150396036…
## $ ActivityDate <chr> "3/25/2016", "3/26/2016", "3/27/2016", "3/28/…
## $ TotalSteps <int> 11004, 17609, 12736, 13231, 12041, 10970, 122…
## $ TotalDistance <dbl> 7.11, 11.55, 8.53, 8.93, 7.85, 7.16, 7.86, 7.…
## $ TrackerDistance <dbl> 7.11, 11.55, 8.53, 8.93, 7.85, 7.16, 7.86, 7.…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance <dbl> 2.57, 6.92, 4.66, 3.19, 2.16, 2.36, 2.29, 3.3…
## $ ModeratelyActiveDistance <dbl> 0.46, 0.73, 0.16, 0.79, 1.09, 0.51, 0.49, 0.8…
## $ LightActiveDistance <dbl> 4.07, 3.91, 3.71, 4.95, 4.61, 4.29, 5.04, 3.6…
## $ SedentaryActiveDistance <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.0…
## $ VeryActiveMinutes <int> 33, 89, 56, 39, 28, 30, 33, 47, 40, 15, 43, 3…
## $ FairlyActiveMinutes <int> 12, 17, 5, 20, 28, 13, 12, 21, 11, 30, 18, 18…
## $ LightlyActiveMinutes <int> 205, 274, 268, 224, 243, 223, 239, 200, 244, …
## $ SedentaryMinutes <int> 804, 588, 605, 1080, 763, 1174, 820, 866, 636…
## $ Calories <int> 1819, 2154, 1944, 1932, 1886, 1820, 1889, 186…
# Preview the column names and types of the sleep data.
sleep_day %>% glimpse()
## Rows: 413
## Columns: 5
## $ Id <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 150…
## $ SleepDay <chr> "4/12/2016 12:00:00 AM", "4/13/2016 12:00:00 AM", "…
## $ TotalSleepRecords <int> 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ TotalMinutesAsleep <int> 327, 384, 412, 340, 700, 304, 360, 325, 361, 430, 2…
## $ TotalTimeInBed <int> 346, 407, 442, 367, 712, 320, 377, 364, 384, 449, 3…
# Min / quartiles / mean / max for the headline daily_activity measures:
# steps, distance, sedentary minutes and calories.
summary(
  daily_activity[, c("TotalSteps", "TotalDistance", "SedentaryMinutes", "Calories")]
)
## TotalSteps TotalDistance SedentaryMinutes Calories
## Min. : 0 Min. : 0.000 Min. : 32.0 Min. : 0
## 1st Qu.: 1988 1st Qu.: 1.410 1st Qu.: 728.0 1st Qu.:1776
## Median : 5986 Median : 4.090 Median :1057.0 Median :2062
## Mean : 6547 Mean : 4.664 Mean : 995.3 Mean :2189
## 3rd Qu.:10198 3rd Qu.: 7.160 3rd Qu.:1285.0 3rd Qu.:2667
## Max. :28497 Max. :27.530 Max. :1440.0 Max. :4562
# Min / quartiles / mean / max for the sleep_day measures:
# number of sleep records, minutes asleep and minutes in bed.
summary(
  sleep_day[, c("TotalSleepRecords", "TotalMinutesAsleep", "TotalTimeInBed")]
)
## TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## Min. :1.000 Min. : 58.0 Min. : 61.0
## 1st Qu.:1.000 1st Qu.:361.0 1st Qu.:403.0
## Median :1.000 Median :433.0 Median :463.0
## Mean :1.119 Mean :419.5 Mean :458.6
## 3rd Qu.:1.000 3rd Qu.:490.0 3rd Qu.:526.0
## Max. :3.000 Max. :796.0 Max. :961.0
# Correlation (cor() default method) between minutes in bed and minutes asleep
time_asleep_and_time_in_bed_cor <- with(
  sleep_day,
  cor(x = TotalTimeInBed, y = TotalMinutesAsleep)
)
# Scatter plot of the two sleep measures with a smoothed trend line; the
# rounded correlation is written into the upper-left area of the panel.
ggplot(sleep_day, aes(x = TotalTimeInBed, y = TotalMinutesAsleep)) +
  geom_point() +
  geom_smooth() +
  annotate(
    "text",
    x = 250,
    y = 700,
    size = 6,
    label = paste("r =", round(time_asleep_and_time_in_bed_cor, 2))
  ) +
  labs(
    title = "Time in Bed Vs. Time Asleep",
    x = "Total Time in Bed",
    y = "Total Time Asleep"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
We can see a strong positive relationship between time in bed and time asleep with a correlation of 0.93.
# Combining sleep and daily activity data sets.
#
# Joining on Id alone pairs EVERY sleep record of a participant with EVERY
# activity day of that participant (a many-to-many fan-out). That duplicates
# activity rows in proportion to how many sleep logs a participant has and
# skews every statistic computed on the result. Match on participant AND
# calendar day instead.
#
# A left join from daily_activity keeps exactly one row per participant-day
# and guarantees the activity columns contain no NA; a full join on the day
# key would add sleep-only days whose activity columns are NA and would make
# cor()/mean()/median() on those columns return NA.
# The sleep columns are NA on days without a sleep log, so sleep statistics
# computed on this frame need na.rm = TRUE / use = "complete.obs".
sleep_and_activity_data <- daily_activity %>%
  mutate(Date = as.Date(ActivityDate, format = "%m/%d/%Y")) %>%
  left_join(
    sleep_day %>%
      # Drop exact duplicate sleep rows so they cannot re-introduce fan-out.
      distinct() %>%
      # SleepDay carries a trailing time ("4/12/2016 12:00:00 AM"); as.Date()
      # ignores characters after the part matched by `format`.
      mutate(Date = as.Date(SleepDay, format = "%m/%d/%Y")),
    by = c("Id", "Date")
  )
# Checking there are 35 participants
n_distinct(sleep_and_activity_data$Id)
## [1] 35
# Finding the steps and sedentary minutes correlation.
# The coefficient is computed on daily_activity - the same data frame that is
# plotted below - so the annotated r describes the points actually shown.
# (It was previously computed on the Id-only join, where each activity day is
# repeated once per sleep record of the participant.)
steps_and_sedentary_minutes_corr <- cor(
  x = daily_activity$TotalSteps,
  y = daily_activity$SedentaryMinutes
)
# Plotting the relationship: scatter plot with a linear fit and r annotated
ggplot(data = daily_activity, mapping = aes(x = TotalSteps, y = SedentaryMinutes)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  annotate(
    "text",
    x = 25000,
    y = 1300,
    label = paste("r =", round(steps_and_sedentary_minutes_corr, 2)),
    size = 6
  ) +
  scale_x_continuous(breaks = seq(0, 30000, by = 2500)) +
  scale_y_continuous(breaks = seq(0, 1500, by = 500)) +
  labs(
    title = "Steps and Sedentary Minutes Correlation",
    x = "Total Steps",
    y = "Sedentary Minutes"
  )
## `geom_smooth()` using formula = 'y ~ x'
There is a slight negative relationship between steps and sedentary time: as total steps increase, time spent sedentary decreases slightly.
# Mean and median of total steps taken.
# Computed on daily_activity (one row per participant-day). The Id-only join
# repeats each activity day once per sleep record of that participant, which
# over-weights participants with many sleep logs and shifts these statistics.
# Narrative text quoting the median should be re-checked after re-knitting.
mean_steps <- mean(daily_activity$TotalSteps)
median_steps <- median(daily_activity$TotalSteps)
# Plotting the distribution: histogram with per-bin counts and a dashed line
# at the median.
ggplot(data = daily_activity, mapping = aes(x = TotalSteps)) +
  # bins is set explicitly (to the former default) in both layers so the count
  # labels line up with the bars and the "Pick better value" message goes away.
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  # Constant intercept, so it is passed as a parameter rather than through
  # aes(); `linewidth` replaces the deprecated `size` for line geoms.
  geom_vline(
    xintercept = median_steps,
    color = "black",
    linetype = "dashed",
    linewidth = 1
  ) +
  # y = Inf pins the label to the top of the panel regardless of bin heights.
  annotate(
    "text",
    x = median_steps,
    y = Inf,
    vjust = 1.5,
    hjust = -0.05,
    label = paste("Median =", round(median_steps, 2)),
    color = "black"
  ) +
  scale_x_continuous(breaks = seq(0, 30000, by = 2500)) +
  # after_stat(count) replaces the deprecated ..count.. notation.
  stat_bin(
    bins = 30,
    geom = "text",
    aes(label = after_stat(count)),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Total Steps Taken Distribution",
    x = "Total Steps",
    y = "Frequency"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The median number of steps per day among the participants is 7485. It was most common for the participants to get fewer than 1,000 steps. The second most common step count was around 3,000, followed by 10,000. This seems to indicate that the participants were usually either completely sedentary or very active.
# Finding correlation between steps and calories burned
steps_and_calories_corr <- cor(daily_activity$TotalSteps, daily_activity$Calories)
# Plotting the relationship.
# The plot uses daily_activity, the same data the correlation is computed on.
# Plotting the Id-only join instead would fit the regression line to rows in
# which each activity day is repeated once per sleep record, so the line would
# not correspond to the annotated r.
ggplot(data = daily_activity, mapping = aes(x = TotalSteps, y = Calories)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  scale_x_continuous(breaks = seq(0, 30000, by = 2500)) +
  annotate(
    "text",
    x = 25000,
    y = 1500,
    label = paste("r =", round(steps_and_calories_corr, 2))
  ) +
  labs(
    title = "Relationship Between Total Steps and Calories Burned",
    x = "Total Steps"
  )
## `geom_smooth()` using formula = 'y ~ x'
Steps taken and calories burned have a moderately positive relationship, with a correlation of 0.58.
# Creating new data frame for activity level and calories.
# Built from daily_activity (one row per participant-day). The Id-only join
# repeats each activity day once per sleep record of the participant, which
# would weight the per-level correlations and regression lines by how many
# sleep logs a participant has. Narrative text quoting these correlations
# should be re-checked after re-knitting.
activity_level <- data.frame(
  Sedentary = daily_activity$SedentaryMinutes,
  Lightly_Active = daily_activity$LightlyActiveMinutes,
  Fairly_Active = daily_activity$FairlyActiveMinutes,
  Very_Active = daily_activity$VeryActiveMinutes,
  Calories = daily_activity$Calories
)
# Pivoting the data frame to long form: one row per (day, activity level),
# with the level stored as an ordered-for-display factor so the facets appear
# from least to most intense.
activity_level_long <- activity_level %>%
  pivot_longer(
    cols = c(Sedentary, Lightly_Active, Fairly_Active, Very_Active),
    names_to = "Activity Level",
    values_to = "Minutes"
  ) %>%
  mutate(
    `Activity Level` = factor(
      `Activity Level`,
      levels = c("Sedentary", "Lightly_Active", "Fairly_Active", "Very_Active")
    )
  )
# Finding correlations for each activity level (minutes vs calories)
correlations <- activity_level_long %>%
  group_by(`Activity Level`) %>%
  summarize(Correlation = cor(Minutes, Calories))
# Plotting the relationship: one panel per activity level on shared axes,
# each with a linear fit and its own correlation label.
ggplot(
  data = activity_level_long,
  mapping = aes(x = Minutes, y = Calories, color = `Activity Level`)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_text(
    data = correlations,
    aes(x = 1250, y = 7500, label = paste("r =", round(Correlation, 2))),
    color = "black"
  ) +
  facet_wrap(~`Activity Level`, scales = "fixed") +
  labs(
    title = "Calories Burned by Activity Level"
  )
## `geom_smooth()` using formula = 'y ~ x'
The very active activity level has the strongest correlation with calories burned at 0.5, while the sedentary activity level has the weakest correlation with calories burned at 0.05.