Fitbit is an American consumer electronics and fitness company. It produces wireless-enabled wearable technology physical fitness monitors and activity trackers such as smartwatches, pedometers and monitors for heart rate, quality of sleep, and stairs climbed as well as related software.
This Kaggle dataset compiles many different activity/sleep/calories stats from users in 2016. I analyzed this dataset with R programming and used RStudio to build custom visualizations to understand correlations and trends with the data.
install.packages("tidyverse")
library(tidyverse)
library(readr)
library(lubridate)
install.packages("skimr")
library(skimr)
library(dplyr)
==================================================
downloaded 413 KB
* installing *binary* package ‘tidyverse’ ...
* DONE (tidyverse)
The downloaded source packages are in
‘/tmp/RtmpvIVAeB/downloaded_packages’
> library(tidyverse)
── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.3 ✓ purrr 0.3.4
✓ tibble 3.1.0 ✓ dplyr 1.0.5
✓ tidyr 1.1.3 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.1
── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
# Load the daily activity export, then inspect its structure, column names
# and contents.
daily_activity <- "dailyActivity_merged.csv" %>% read_csv()
View(daily_activity)
daily_activity %>% glimpse()
names(daily_activity)
print(daily_activity)
[1] "Id" "Date" "TotalSteps"
[4] "TotalDistance" "TrackerDistance" "LoggedDistance"
[7] "VeryActiveDistance" "ModeratelyActiveDistance" "LightActiveDistance"
[10] "SedentaryActiveDistance" "VeryActiveMinutes" "FairlyActiveMinutes"
[13] "LightlyActiveMinutes" "SedentaryMinutes" "Calories"
[16] "WeekDay"
# Load the per-day step totals and preview the first rows.
daily_steps <- "dailySteps_merged.csv" %>% read_csv()
View(daily_steps)
daily_steps %>% head()
A tibble: 6 x 3
Id ActivityDay StepTotal
<dbl> <chr> <dbl>
1 1503960366 4/12/2016 13162
2 1503960366 4/13/2016 10735
3 1503960366 4/14/2016 10460
4 1503960366 4/15/2016 9762
5 1503960366 4/16/2016 12669
6 1503960366 4/17/2016 9705
# Load the per-day calorie totals and print a skimr summary of the table.
daily_calories <- "dailyCalories_merged.csv" %>% read_csv()
View(daily_calories)
# Called directly (not piped) so skimr reports the table under its own name.
skim_without_charts(daily_calories)
── Data Summary ────────────────────────
Values
Name daily_calories
Number of rows 940
Number of columns 3
_______________________
Column type frequency:
character 1
numeric 2
________________________
Group variables None
...
# Load the per-day sleep records and print per-column summary statistics.
daily_sleep <- "sleepDay_merged.csv" %>% read_csv()
View(daily_sleep)
daily_sleep %>% summary()
Id SleepDay TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
Min. :1.504e+09 Length:413 Min. :1.000 Min. : 58.0 Min. : 61.0
1st Qu.:3.977e+09 Class :character 1st Qu.:1.000 1st Qu.:361.0 1st Qu.:403.0
Median :4.703e+09 Mode :character Median :1.000 Median :433.0 Median :463.0
Mean :5.001e+09 Mean :1.119 Mean :419.5 Mean :458.6
3rd Qu.:6.962e+09 3rd Qu.:1.000 3rd Qu.:490.0 3rd Qu.:526.0
Max. :8.792e+09 Max. :3.000 Max. :796.0 Max. :961.0
# Number of distinct participant Ids in each table.
n_distinct(daily_activity[["Id"]])
n_distinct(daily_sleep[["Id"]])
n_distinct(daily_steps[["Id"]])
n_distinct(daily_calories[["Id"]])
[1] 33 [1] 24 [1] 33 [1] 33 - The daily sleep table has the fewest participants
# Number of observations (rows) in each table.
daily_activity %>% nrow()
daily_sleep %>% nrow()
daily_steps %>% nrow()
daily_calories %>% nrow()
[1] 940 [1] 410 [1] 940 [1] 940 - The daily sleep table has the fewest observations
# Number of distinct participant Ids across all four tables combined.
length(unique(c(
  daily_activity$Id,
  daily_steps$Id,
  daily_sleep$Id,
  daily_calories$Id
)))
[1] 33 - There are 33 unique participant IDs
# Clean up the sleep table's date column.
#
# SleepDay is read as character; the separate() calls below split it on
# spaces into a date, a time and an AM/PM marker, and then split the date on
# "/" into month, day and year.
#
# Earlier attempts to convert the column with as.Date()/strptime() were
# removed: one did not parse, one left a dangling pipe in front of an
# assignment, and their format strings ("%d%m%Y", "%d/%m/%Y") do not match
# month/day/year text. They would also have overwritten the strings that
# separate() needs.
daily_sleep %>%
  select(SleepDay)

daily_sleep <- dplyr::rename(daily_sleep, SleepDate = SleepDay)
daily_sleep %>%
  select(SleepDate)

# Split "<date> <time> <AM/PM>" into three columns.
daily_sleep <- separate(
  daily_sleep,
  col = SleepDate,
  into = c("SleepDate", "Time", "AMPM"),
  sep = " "
)
## this separation worked, now we will extract & separate month, day, and year
daily_sleep <- separate(
  daily_sleep,
  col = SleepDate,
  into = c("Month", "Day", "Year"),
  sep = "/"
)
daily_sleep %>%
  select(Year)
# A tibble: 6 x 7
Id Month Day Year TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
<dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 1503960366 4 12 2016 1 327 346
2 1503960366 4 13 2016 2 384 407
3 1503960366 4 15 2016 1 412 442 ...
# Drop the time-of-day columns left over from splitting the timestamp, then
# show the trimmed table.
daily_sleep <- daily_sleep %>%
  select(-c(AMPM, Time))
print(daily_sleep)
# Give the three daily tables a common `Date` column (parsed from
# month/day/year text with mdy()) and add the weekday name of each date.

# Activity: also shorten LoggedActivitiesDistance to LoggedDistance.
daily_activity <- daily_activity %>%
  rename(
    Date = ActivityDate,
    LoggedDistance = LoggedActivitiesDistance
  ) %>%
  mutate(
    Date = mdy(Date),
    WeekDay = weekdays(Date)
  )
names(daily_activity)
class(daily_activity$Date)
select(daily_activity, WeekDay)

# Steps: rename StepTotal to TotalSteps to match the activity table.
daily_steps <- daily_steps %>%
  rename(Date = ActivityDay, TotalSteps = StepTotal) %>%
  mutate(
    Date = mdy(Date),
    WeekDay = weekdays(Date)
  )
select(daily_steps, WeekDay)

# Calories.
daily_calories <- daily_calories %>%
  rename(Date = ActivityDay) %>%
  mutate(
    Date = mdy(Date),
    WeekDay = weekdays(Date)
  )
select(daily_calories, WeekDay)
> print(paste("NAs: ", sum(is.na(daily_activity)))) # check for NAs
[1] "NAs: 0"
> print(paste("Duplicated rows: ", sum(duplicated(daily_activity)))) # check for Duplicates
[1] "Duplicated rows: 0"
> print(paste("NAs: ", sum(is.na(daily_calories))))
[1] "NAs: 0"
> print(paste("Duplicated rows: ", sum(duplicated(daily_calories))))
[1] "Duplicated rows: 0"
> print(paste("NAs: ", sum(is.na(daily_steps))))
[1] "NAs: 0"
> print(paste("Duplicated rows: ", sum(duplicated(daily_steps))))
[1] "Duplicated rows: 0"
> print(paste("NAs: ", sum(is.na(daily_sleep))))
[1] "NAs: 0"
> print(paste("Duplicated rows: ", sum(duplicated(daily_sleep))))
[1] "Duplicated rows: 3"
# Keep only the first occurrence of each fully duplicated sleep row, then
# re-check that no duplicates remain.
daily_sleep <- filter(daily_sleep, !duplicated(daily_sleep))
remaining_duplicates <- sum(duplicated(daily_sleep))
print(paste("Duplicated rows: ", remaining_duplicates))
> print(paste("Duplicated rows: ", sum(duplicated(daily_sleep))))
[1] "Duplicated rows: 0"
# Per-weekday averages of steps, distance, sleep and time in bed, plus a
# separate per-weekday calorie average.
#
# group_by() on a character column returns the groups in alphabetical order
# (Friday, Monday, Saturday, ...). The summaries are therefore sorted into
# calendar order with arrange(match(...)) rather than by overwriting the
# WeekDay column, which would attach each average to the wrong day.
#
# NOTE(review): assumes weekdays() produced English day names (it is
# locale-dependent) - confirm the session locale.
# NOTE(review): `daily_activity_sleep` is not created anywhere in the visible
# script; presumably it joins daily_activity and daily_sleep - confirm it is
# built before this point and keyed on both Id and date.
weekday_order <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)

weekday_average <- daily_activity_sleep %>%
  group_by(WeekDay) %>%
  summarize(
    AvgSteps = mean(TotalSteps),
    AvgDistance = mean(TotalDistance),
    AvgSleep = mean(TotalMinutesAsleep),
    AvgInactivity = mean(TotalTimeInBed)
  ) %>%
  arrange(match(WeekDay, weekday_order))

weekday_avg_cals <- daily_calories %>%
  group_by(WeekDay) %>%
  summarize(AvgCalories = mean(Calories)) %>%
  arrange(match(WeekDay, weekday_order))
> head(weekday_average)
# A tibble: 6 x 5
WeekDay AvgSteps AvgDistance AvgSleep AvgInactivity
<chr> <dbl> <dbl> <dbl> <dbl>
1 Monday 8231. 5.77 419. 458.
2 Tuesday 8644. 6.10 419. 459.
3 Wednesday 8622. 6.07 419. 458.
4 Thursday 6585. 4.70 419. 459.
5 Friday 7736. 5.46 420. 458.
6 Saturday 9017. 6.34 419. 458.
# Scatter plot of average sleep (y) against average calories (x).
# The axis labels now match the mapped aesthetics; they were previously
# swapped (x showed calories but was labelled "Average Sleep").
# NOTE(review): `daily_average` is not defined in the visible script -
# confirm where it is built and that it has AvgCalories and AvgSleep columns.
ggplot(data = daily_average, aes(x = AvgCalories, y = AvgSleep)) +
  geom_point() +
  labs(
    x = "Average Calories",
    y = "Average Sleep",
    title = "Daily Average Sleep Vs. Calories Among Users"
  )
# Scatter plot of total time in bed (y) against minutes asleep (x), drawn
# with semi-transparent filled circles.
daily_sleep %>%
  ggplot(aes(x = TotalMinutesAsleep, y = TotalTimeInBed, fill = "origin")) +
  geom_point(shape = 21, size = 2, alpha = 0.5) +
  labs(
    title = "Daily Sleep Vs. Inactivity Among Users",
    x = "Minutes Asleep",
    y = "Minutes Inactive"
  )
- Strong positive correlation shown here - total time spent in bed and total time asleep go hand-in-hand
# Scatter plot of calories burned (y) against total distance (x).
daily_activity %>%
  ggplot(aes(x = TotalDistance, y = Calories)) +
  geom_point(size = 2, alpha = 0.5) +
  labs(
    title = "Daily Distance Vs. Calories Burned Among Users",
    x = "Distance",
    y = "Calories"
  )
- Here, we can see a weaker positive correlation between distance and calories burned, minus the outliers
# Jittered scatter of calories burned (y) against total steps (x), with a
# green smoothed trend line on top.
daily_activity %>%
  ggplot(aes(x = TotalSteps, y = Calories)) +
  geom_jitter() +
  geom_smooth(colour = "green") +
  labs(
    title = "Daily Steps Vs. Calories Burned Among Users",
    x = "Steps",
    y = "Calories"
  )
- The same is true for the relationship between daily steps and calories
# Horizontal bar chart of average steps per weekday. Bars are labelled
# Mon..Sun by position, so weekday_average must be in that row order.
# Each bar gets its own hatch density and angle.
barplot(
  height = weekday_average$AvgSteps,
  names.arg = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"),
  horiz = TRUE,
  main = "Weekly Average Steps",
  col = "dark green",
  density = c(4, 5, 9, 8, 7, 30, 7),
  angle = c(7, 45, 90, 11, 36, 6, 8)
)
- Saturday, which I outlined in the textured bar graphs, is the day out of the week with the most step activity
# Horizontal bar chart of average calories per weekday. Bars are labelled
# Mon..Sun by position, so weekday_avg_cals must be in that row order.
# Each bar gets its own hatch density and angle.
barplot(
  height = weekday_avg_cals$AvgCalories,
  names.arg = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"),
  horiz = TRUE,
  main = "Weekly Average Calories",
  col = "blue",
  density = c(4, 5, 9, 8, 7, 30, 7),
  angle = c(7, 45, 90, 11, 36, 6, 8)
)
- The two most active days for burning calories are Saturday and Wednesday, with Saturday ahead by only 1.045 calories
# Vertical bar chart of average distance per weekday, one colour per bar.
# Bars are labelled Mon..Sun by position, so weekday_average must be in that
# row order.
barplot(
  height = weekday_average$AvgDistance,
  names.arg = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"),
  main = "Weekly Average Distance",
  col = c(
    "white", "light blue", "pink", "purple",
    "maroon", "dark green", "black"
  )
)
- Saturday, shown in green, is the week day with the most distance covered, which aligns with the steps activity observation
# Label each daily step record with an activity band based on TotalSteps:
#   under 5000      -> "sedentary"
#   5000 to 7499    -> "lightly active"
#   7500 to 9999    -> "fairly active"
#   10000 and over  -> "very active"
# case_when() uses the first matching condition, so each band only needs its
# upper bound. The previous bounds (`< 7499`, `< 9999`) left 7499 and 9999
# steps unmatched, which produced NA for those rows.
user_activity <- daily_steps %>%
  mutate(UserType = case_when(
    TotalSteps < 5000 ~ "sedentary",
    TotalSteps < 7500 ~ "lightly active",
    TotalSteps < 10000 ~ "fairly active",
    TotalSteps >= 10000 ~ "very active"
  ))
# Share of daily records in each activity band, as a proportion
# (TotalPercent) and as a formatted percentage string (Percentages).
activity_percentage <- user_activity %>%
  count(UserType, name = "total") %>%
  mutate(TotalPercent = total / sum(total)) %>%
  select(UserType, TotalPercent) %>%
  mutate(Percentages = scales::percent(TotalPercent))
head(activity_percentage)
# A tibble: 4 x 3
UserType TotalPercent Percentages
<chr> <dbl> <chr>
1 very active 0.173 17.34%
2 fairly active 0.182 18.19%
3 lightly active 0.322 32.23%
4 sedentary 0.322 32.23%
# Put the activity bands in most-active-to-least-active order for display.
# The summary rows arrive sorted alphabetically by UserType, so the labels
# must not be overwritten with a hard-coded vector - that would attach each
# label to another band's percentage. The existing labels are converted to
# an ordered-level factor and the rows sorted by it, so every percentage
# stays with its own band.
activity_levels <- c(
  "very active", "fairly active", "lightly active", "sedentary"
)
activity_percentage <- activity_percentage %>%
  mutate(UserType = factor(UserType, levels = activity_levels)) %>%
  arrange(UserType)
head(activity_percentage)
# Pie chart of the activity-band shares: a single stacked bar wrapped onto
# polar coordinates, with each slice labelled by its percentage.
ggplot(activity_percentage, aes(x = "", y = TotalPercent, fill = UserType)) +
  geom_col(width = 1) +
  geom_text(
    aes(label = Percentages),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar("y", start = -5, direction = -2) +
  scale_fill_manual(values = c("pink2", "yellow2", "orange1", "red2")) +
  ggtitle("Distribution of User Activity") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15))
- Here we can see that most users of the fitness app are either sedentary(inactive) or lightly active
- Fairly active and very active users are the minority, only making up 35.53% of the overall 100%
- Saturday is the most active day of the week for users
- There are positive correlations between total time spent in bed & total time asleep, calories & distance, steps & calories
- Sleep statistics are very similar across the span of a week, ranging from 418-420 minutes for sleep averages