This project analyzes FitBit data collected from 35 participants over two months. The participants consented to sharing data on calories burned, steps, activity, sleep, heart rate, and more. The data sets come from Kaggle: https://www.kaggle.com/datasets/arashnic/fitbit

Loading Packages

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Load the daily activity and daily sleep exports into data frames.
# NOTE(review): per the folder names, the activity file is from the
# 3.12.16-4.11.16 export and the sleep file from the 4.12.16-5.12.16 export;
# confirm that pairing these two windows is intended.
daily_activity <- read.csv(
  file = "~/R/Case Study/Fitabase Data 3.12.16-4.11.16/dailyActivity_merged.csv"
)
sleep_day <- read.csv(
  file = "~/R/Case Study/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv"
)

# Preview column names, types, and leading values of the activity data.
daily_activity %>% glimpse()
## Rows: 457
## Columns: 15
## $ Id                       <dbl> 1503960366, 1503960366, 1503960366, 150396036…
## $ ActivityDate             <chr> "3/25/2016", "3/26/2016", "3/27/2016", "3/28/…
## $ TotalSteps               <int> 11004, 17609, 12736, 13231, 12041, 10970, 122…
## $ TotalDistance            <dbl> 7.11, 11.55, 8.53, 8.93, 7.85, 7.16, 7.86, 7.…
## $ TrackerDistance          <dbl> 7.11, 11.55, 8.53, 8.93, 7.85, 7.16, 7.86, 7.…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance       <dbl> 2.57, 6.92, 4.66, 3.19, 2.16, 2.36, 2.29, 3.3…
## $ ModeratelyActiveDistance <dbl> 0.46, 0.73, 0.16, 0.79, 1.09, 0.51, 0.49, 0.8…
## $ LightActiveDistance      <dbl> 4.07, 3.91, 3.71, 4.95, 4.61, 4.29, 5.04, 3.6…
## $ SedentaryActiveDistance  <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.0…
## $ VeryActiveMinutes        <int> 33, 89, 56, 39, 28, 30, 33, 47, 40, 15, 43, 3…
## $ FairlyActiveMinutes      <int> 12, 17, 5, 20, 28, 13, 12, 21, 11, 30, 18, 18…
## $ LightlyActiveMinutes     <int> 205, 274, 268, 224, 243, 223, 239, 200, 244, …
## $ SedentaryMinutes         <int> 804, 588, 605, 1080, 763, 1174, 820, 866, 636…
## $ Calories                 <int> 1819, 2154, 1944, 1932, 1886, 1820, 1889, 186…
# Preview column names, types, and leading values of the sleep data.
sleep_day %>% glimpse()
## Rows: 413
## Columns: 5
## $ Id                 <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 150…
## $ SleepDay           <chr> "4/12/2016 12:00:00 AM", "4/13/2016 12:00:00 AM", "…
## $ TotalSleepRecords  <int> 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ TotalMinutesAsleep <int> 327, 384, 412, 340, 700, 304, 360, 325, 361, 430, 2…
## $ TotalTimeInBed     <int> 346, 407, 442, 367, 712, 320, 377, 364, 384, 449, 3…
# Min / quartiles / mean / max for the step, distance, sedentary-time and
# calorie columns of daily_activity
summary(
  daily_activity[, c(
    "TotalSteps",
    "TotalDistance",
    "SedentaryMinutes",
    "Calories"
  )]
)
##    TotalSteps    TotalDistance    SedentaryMinutes    Calories   
##  Min.   :    0   Min.   : 0.000   Min.   :  32.0   Min.   :   0  
##  1st Qu.: 1988   1st Qu.: 1.410   1st Qu.: 728.0   1st Qu.:1776  
##  Median : 5986   Median : 4.090   Median :1057.0   Median :2062  
##  Mean   : 6547   Mean   : 4.664   Mean   : 995.3   Mean   :2189  
##  3rd Qu.:10198   3rd Qu.: 7.160   3rd Qu.:1285.0   3rd Qu.:2667  
##  Max.   :28497   Max.   :27.530   Max.   :1440.0   Max.   :4562
# Min / quartiles / mean / max for the record-count, minutes-asleep and
# time-in-bed columns of sleep_day
summary(
  sleep_day[, c(
    "TotalSleepRecords",
    "TotalMinutesAsleep",
    "TotalTimeInBed"
  )]
)
##  TotalSleepRecords TotalMinutesAsleep TotalTimeInBed 
##  Min.   :1.000     Min.   : 58.0      Min.   : 61.0  
##  1st Qu.:1.000     1st Qu.:361.0      1st Qu.:403.0  
##  Median :1.000     Median :433.0      Median :463.0  
##  Mean   :1.119     Mean   :419.5      Mean   :458.6  
##  3rd Qu.:1.000     3rd Qu.:490.0      3rd Qu.:526.0  
##  Max.   :3.000     Max.   :796.0      Max.   :961.0

Time in Bed Vs. Time Asleep

# Correlation between total time in bed and total minutes asleep
time_asleep_and_time_in_bed_cor <- with(
  sleep_day,
  cor(TotalTimeInBed, TotalMinutesAsleep)
)

# Scatter plot with the default smoother; the rounded correlation is printed
# on the panel at a fixed position
ggplot(sleep_day, aes(x = TotalTimeInBed, y = TotalMinutesAsleep)) +
  geom_point() +
  geom_smooth() +
  annotate(
    "text",
    x = 250,
    y = 700,
    size = 6,
    label = paste("r =", round(time_asleep_and_time_in_bed_cor, 2))
  ) +
  labs(
    title = "Time in Bed Vs. Time Asleep",
    x = "Total Time in Bed",
    y = "Total Time Asleep"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

We can see a strong positive relationship between time in bed and time asleep with a correlation of 0.93.

# Combine activity and sleep records, one row per participant-day.
#
# Joining on Id alone pairs every sleep record of a participant with every
# activity record of that participant, so rows are duplicated many times over
# and any statistic computed on the result is weighted by that duplication.
# The join therefore matches on both participant and calendar date. Every
# daily_activity row is kept exactly once; sleep columns are NA on days with
# no matching sleep record, and participants that appear only in sleep_day
# are not retained.
# NOTE: figures quoted in the narrative that were produced from the Id-only
# join need to be regenerated by re-knitting the report.
sleep_and_activity_data <- daily_activity %>%
  mutate(Date = as.Date(ActivityDate, format = "%m/%d/%Y")) %>%
  left_join(
    sleep_day %>%
      # SleepDay carries a trailing "12:00:00 AM"; as.Date() ignores any
      # characters after the part consumed by the format.
      mutate(Date = as.Date(SleepDay, format = "%m/%d/%Y")) %>%
      # Keep one sleep row per participant-day so the join cannot fan out.
      distinct(Id, Date, .keep_all = TRUE),
    by = c("Id", "Date")
  )

# Checking there are 35 participants
n_distinct(sleep_and_activity_data$Id)
## [1] 35

Steps and Sedentary Time Correlation

# Correlation between daily steps and sedentary minutes. It is computed on
# daily_activity (one row per participant-day) so that the annotated r
# describes exactly the rows drawn in the plot below.
steps_and_sedentary_minutes_corr <- cor(
  x = daily_activity$TotalSteps,
  y = daily_activity$SedentaryMinutes
)

# Scatter plot with a straight-line fit and the rounded correlation as a label
ggplot(
  data = daily_activity,
  mapping = aes(x = TotalSteps, y = SedentaryMinutes)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  annotate(
    "text",
    x = 25000,
    y = 1300,
    label = paste("r =", round(steps_and_sedentary_minutes_corr, 2)),
    size = 6
  ) +
  scale_x_continuous(breaks = seq(0, 30000, by = 2500)) +
  scale_y_continuous(breaks = seq(0, 1500, by = 500)) +
  labs(
    title = "Steps and Sedentary Minutes Correlation",
    x = "Total Steps",
    y = "Sedentary Minutes"
  )
## `geom_smooth()` using formula = 'y ~ x'

There is a slightly negative relationship between steps and sedentary time: as total steps increase, sedentary time decreases slightly.

Distribution of Total Steps Taken

# Mean and median of daily steps, one observation per participant-day.
# These are taken from daily_activity rather than from the joined table,
# where activity rows are repeated for each sleep record of the same
# participant and therefore distort both statistics and the bin counts.
# NOTE: the median and bin frequencies quoted in the narrative came from the
# joined table; re-knit the report to refresh them.
mean_steps <- mean(daily_activity$TotalSteps)
median_steps <- median(daily_activity$TotalSteps)

# Histogram of daily steps with the median marked and per-bin counts printed
ggplot(data = daily_activity, mapping = aes(x = TotalSteps)) +
  geom_histogram(fill = "skyblue", color = "black") +
  # xintercept is a constant, so it is passed as a parameter (one line)
  # instead of through aes() (one overplotted line per data row).
  geom_vline(
    xintercept = median_steps,
    color = "black",
    linetype = "dashed",
    linewidth = 1
  ) +
  # y = Inf pins the label to the top of the panel so it does not depend on
  # how tall the bars happen to be.
  annotate(
    "text",
    x = median_steps,
    y = Inf,
    hjust = -0.1,
    vjust = 1.5,
    label = paste("Median =", round(median_steps, 2)),
    color = "black"
  ) +
  scale_x_continuous(breaks = seq(0, 30000, by = 2500)) +
  stat_bin(
    geom = "text",
    aes(label = after_stat(count)),
    vjust = -0.5,
    size = 3
  ) +
  labs(
    title = "Total Steps Taken Distribution",
    x = "Total Steps",
    y = "Frequency"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The median number of steps per day among the participants is 7485. It was most common for the participants to take fewer than 1,000 steps. The second most common step count was around 3,000, followed by 10,000. This seems to indicate that the participants were usually either completely sedentary or very active.

Relationship Between Total Steps and Calories Burned

# Correlation between daily steps and calories burned
steps_and_calories_corr <- cor(
  daily_activity$TotalSteps,
  daily_activity$Calories
)

# Scatter plot with a straight-line fit. The plot reads daily_activity, the
# same rows the correlation above is computed from, so the fitted line and
# the annotated r describe one data set (one row per participant-day).
ggplot(data = daily_activity, mapping = aes(x = TotalSteps, y = Calories)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  scale_x_continuous(breaks = seq(0, 30000, by = 2500)) +
  annotate(
    "text",
    x = 25000,
    y = 1500,
    label = paste("r =", round(steps_and_calories_corr, 2))
  ) +
  labs(
    title = "Relationship Between Total Steps and Calories Burned",
    x = "Total Steps"
  )
## `geom_smooth()` using formula = 'y ~ x'

Steps taken and calories burned have a moderately positive relationship, with a correlation of 0.58.

# Minutes spent at each activity level alongside calories burned, one row per
# participant-day. The columns are taken from daily_activity rather than from
# the joined table, where activity rows are repeated for each sleep record of
# the same participant and would weight the correlations below unevenly.
# NOTE: correlations quoted in the narrative came from the joined table;
# re-knit the report to refresh them.
activity_level <- daily_activity %>%
  select(
    Sedentary = SedentaryMinutes,
    Lightly_Active = LightlyActiveMinutes,
    Fairly_Active = FairlyActiveMinutes,
    Very_Active = VeryActiveMinutes,
    Calories
  )

# Reshape to long form: one row per (participant-day, activity level).
# The factor levels fix the order in which the levels are displayed.
activity_level_long <- activity_level %>%
  pivot_longer(
    cols = c(Sedentary, Lightly_Active, Fairly_Active, Very_Active),
    names_to = "Activity Level",
    values_to = "Minutes"
  ) %>%
  mutate(
    `Activity Level` = factor(
      `Activity Level`,
      levels = c("Sedentary", "Lightly_Active", "Fairly_Active", "Very_Active")
    )
  )

# Correlation between minutes and calories within each activity level
correlations <- activity_level_long %>%
  group_by(`Activity Level`) %>%
  summarize(Correlation = cor(Minutes, Calories))

Relationship Between Activity Level and Calories Burned

# One panel per activity level: minutes vs calories with a straight-line fit,
# plus that level's rounded correlation printed at a fixed position
ggplot(
  activity_level_long,
  aes(x = Minutes, y = Calories, color = `Activity Level`)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_text(
    data = correlations,
    mapping = aes(
      x = 1250,
      y = 7500,
      label = paste("r =", round(Correlation, 2))
    ),
    color = "black"
  ) +
  facet_wrap(vars(`Activity Level`), scales = "fixed") +
  labs(title = "Calories Burned by Activity Level")
## `geom_smooth()` using formula = 'y ~ x'

The very active activity level has the strongest correlation with calories burned at 0.5, while the sedentary activity level has the weakest correlation with calories burned at 0.05.