library(tidyverse) #helps wrangle data
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Use the conflicted package to manage conflicts
library(conflicted)
# Set dplyr::filter and dplyr::lag as the default choices
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
conflict_prefer("lag", "dplyr")
## [conflicted] Will prefer dplyr::lag over any other package.
Create a dataframe for each of the csv data:
daily_activity <- read.csv("dailyActivity_merged.csv")
sleep_day <- read.csv("sleepDay_merged.csv")
weight <- read.csv("weightLogInfo_merged.csv")
hourly_steps <- read.csv("hourlySteps_merged.csv")
head(daily_activity)
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366 4/12/2016 13162 8.50 8.50
## 2 1503960366 4/13/2016 10735 6.97 6.97
## 3 1503960366 4/14/2016 10460 6.74 6.74
## 4 1503960366 4/15/2016 9762 6.28 6.28
## 5 1503960366 4/16/2016 12669 8.16 8.16
## 6 1503960366 4/17/2016 9705 6.48 6.48
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1 0 1.88 0.55
## 2 0 1.57 0.69
## 3 0 2.44 0.40
## 4 0 2.14 1.26
## 5 0 2.71 0.41
## 6 0 3.19 0.78
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1 6.06 0 25
## 2 4.71 0 21
## 3 3.91 0 30
## 4 2.83 0 29
## 5 5.04 0 36
## 6 2.51 0 38
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1 13 328 728 1985
## 2 19 217 776 1797
## 3 11 181 1218 1776
## 4 34 209 726 1745
## 5 10 221 773 1863
## 6 20 164 539 1728
colnames(daily_activity)
## [1] "Id" "ActivityDate"
## [3] "TotalSteps" "TotalDistance"
## [5] "TrackerDistance" "LoggedActivitiesDistance"
## [7] "VeryActiveDistance" "ModeratelyActiveDistance"
## [9] "LightActiveDistance" "SedentaryActiveDistance"
## [11] "VeryActiveMinutes" "FairlyActiveMinutes"
## [13] "LightlyActiveMinutes" "SedentaryMinutes"
## [15] "Calories"
head(sleep_day)
## Id SleepDay TotalSleepRecords TotalMinutesAsleep
## 1 1503960366 4/12/2016 12:00:00 AM 1 327
## 2 1503960366 4/13/2016 12:00:00 AM 2 384
## 3 1503960366 4/15/2016 12:00:00 AM 1 412
## 4 1503960366 4/16/2016 12:00:00 AM 2 340
## 5 1503960366 4/17/2016 12:00:00 AM 1 700
## 6 1503960366 4/19/2016 12:00:00 AM 1 304
## TotalTimeInBed
## 1 346
## 2 407
## 3 442
## 4 367
## 5 712
## 6 320
head(weight)
## Id Date WeightKg WeightPounds Fat BMI
## 1 1503960366 5/2/2016 11:59:59 PM 52.6 115.9631 22 22.65
## 2 1503960366 5/3/2016 11:59:59 PM 52.6 115.9631 NA 22.65
## 3 1927972279 4/13/2016 1:08:52 AM 133.5 294.3171 NA 47.54
## 4 2873212765 4/21/2016 11:59:59 PM 56.7 125.0021 NA 21.45
## 5 2873212765 5/12/2016 11:59:59 PM 57.3 126.3249 NA 21.69
## 6 4319703577 4/17/2016 11:59:59 PM 72.4 159.6147 25 27.45
## IsManualReport LogId
## 1 True 1.462234e+12
## 2 True 1.462320e+12
## 3 False 1.460510e+12
## 4 True 1.461283e+12
## 5 True 1.463098e+12
## 6 True 1.460938e+12
head(hourly_steps)
## Id ActivityHour StepTotal
## 1 1503960366 4/12/2016 12:00:00 AM 373
## 2 1503960366 4/12/2016 1:00:00 AM 160
## 3 1503960366 4/12/2016 2:00:00 AM 151
## 4 1503960366 4/12/2016 3:00:00 AM 0
## 5 1503960366 4/12/2016 4:00:00 AM 0
## 6 1503960366 4/12/2016 5:00:00 AM 0
sum(duplicated(daily_activity))
## [1] 0
sum(duplicated(sleep_day))
## [1] 3
sum(duplicated(weight))
## [1] 0
sum(duplicated(hourly_steps))
## [1] 0
Looks like we have 3 duplicated entries in sleep_day. Let’s remove them.
sleep_day <- sleep_day %>%
distinct() %>%
drop_na()
sum(duplicated(sleep_day))
## [1] 0
Great, there are no more duplicates.
# Convert date formats in daily_activity
daily_activity <- daily_activity %>%
rename(Date = ActivityDate) %>%
mutate(Date = as.Date(Date, format = "%d/%m/%y"))
# Convert date formats in daily_sleep
sleep_day <- sleep_day %>%
rename(Date = SleepDay) %>%
mutate(Date = as.Date(Date, format = "%d/%m/%y"))
# Convert date formats in weight
weight <- weight %>%
select(-LogId) %>%
mutate(Date = as.Date(Date, format = "%d/%m/%y")) %>%
mutate(IsManualReport = as.factor(IsManualReport))
head(daily_activity)
## Id Date TotalSteps TotalDistance TrackerDistance
## 1 1503960366 2020-12-04 13162 8.50 8.50
## 2 1503960366 <NA> 10735 6.97 6.97
## 3 1503960366 <NA> 10460 6.74 6.74
## 4 1503960366 <NA> 9762 6.28 6.28
## 5 1503960366 <NA> 12669 8.16 8.16
## 6 1503960366 <NA> 9705 6.48 6.48
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1 0 1.88 0.55
## 2 0 1.57 0.69
## 3 0 2.44 0.40
## 4 0 2.14 1.26
## 5 0 2.71 0.41
## 6 0 3.19 0.78
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1 6.06 0 25
## 2 4.71 0 21
## 3 3.91 0 30
## 4 2.83 0 29
## 5 5.04 0 36
## 6 2.51 0 38
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1 13 328 728 1985
## 2 19 217 776 1797
## 3 11 181 1218 1776
## 4 34 209 726 1745
## 5 10 221 773 1863
## 6 20 164 539 1728
head(sleep_day)
## Id Date TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## 1 1503960366 2020-12-04 1 327 346
## 2 1503960366 <NA> 2 384 407
## 3 1503960366 <NA> 1 412 442
## 4 1503960366 <NA> 2 340 367
## 5 1503960366 <NA> 1 700 712
## 6 1503960366 <NA> 1 304 320
head(weight)
## Id Date WeightKg WeightPounds Fat BMI IsManualReport
## 1 1503960366 2020-02-05 52.6 115.9631 22 22.65 True
## 2 1503960366 2020-03-05 52.6 115.9631 NA 22.65 True
## 3 1927972279 <NA> 133.5 294.3171 NA 47.54 False
## 4 2873212765 <NA> 56.7 125.0021 NA 21.45 True
## 5 2873212765 2020-12-05 57.3 126.3249 NA 21.69 True
## 6 4319703577 <NA> 72.4 159.6147 25 27.45 True
How many unique participants are there in each dataframe?
# Count unique IDs
n_distinct(daily_activity$Id)
## [1] 33
n_distinct(sleep_day$Id)
## [1] 24
n_distinct(weight$Id)
## [1] 8
n_distinct(hourly_steps$Id)
## [1] 33
How many observations are there in each dataframe?
nrow(daily_activity)
## [1] 940
nrow(sleep_day)
## [1] 410
nrow(weight)
## [1] 67
nrow(hourly_steps)
## [1] 22099
daily_activity %>%
select(TotalSteps,
TotalDistance,
SedentaryMinutes) %>%
summary()
## TotalSteps TotalDistance SedentaryMinutes
## Min. : 0 Min. : 0.000 Min. : 0.0
## 1st Qu.: 3790 1st Qu.: 2.620 1st Qu.: 729.8
## Median : 7406 Median : 5.245 Median :1057.5
## Mean : 7638 Mean : 5.490 Mean : 991.2
## 3rd Qu.:10727 3rd Qu.: 7.713 3rd Qu.:1229.5
## Max. :36019 Max. :28.030 Max. :1440.0
sleep_day %>%
select(TotalSleepRecords,
TotalMinutesAsleep,
TotalTimeInBed) %>%
summary()
## TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## Min. :1.00 Min. : 58.0 Min. : 61.0
## 1st Qu.:1.00 1st Qu.:361.0 1st Qu.:403.8
## Median :1.00 Median :432.5 Median :463.0
## Mean :1.12 Mean :419.2 Mean :458.5
## 3rd Qu.:1.00 3rd Qu.:490.0 3rd Qu.:526.0
## Max. :3.00 Max. :796.0 Max. :961.0
weight %>%
select(WeightKg, BMI) %>%
summary()
## WeightKg BMI
## Min. : 52.60 Min. :21.45
## 1st Qu.: 61.40 1st Qu.:23.96
## Median : 62.50 Median :24.39
## Mean : 72.04 Mean :25.19
## 3rd Qu.: 85.05 3rd Qu.:25.56
## Max. :133.50 Max. :47.54
hourly_steps %>%
select(StepTotal) %>%
summary()
## StepTotal
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 40.0
## Mean : 320.2
## 3rd Qu.: 357.0
## Max. :10554.0
ggplot(data=daily_activity, aes(x=TotalSteps, y=SedentaryMinutes)) + geom_point() + geom_smooth() + labs(title = "Total Steps vs. Sedentary Minutes")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Expected result: The greater the number of total steps, the more active
a person is and hence the person would have lower sedentary minutes. But
it’s hard to tell from this graph. Let’s use Pearson correlation to
confirm.
cor(x = daily_activity[,"TotalSteps"],
y = daily_activity[,"SedentaryMinutes"],
use = "everything",
method = "pearson")
## [1] -0.3274835
This indicates a moderate negative linear relationship between Total Steps and Sedentary Minutes.
ggplot(data=sleep_day, aes(x=TotalMinutesAsleep, y=TotalTimeInBed)) + geom_point() + geom_smooth() + labs(title = "Minutes asleep vs. Time in bed")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Calculate Pearson's correlation:
cor(x = sleep_day[,"TotalMinutesAsleep"],
y = sleep_day[,"TotalTimeInBed"],
use = "everything",
method = "pearson")
## [1] 0.9304224
As expected, there is a very strong positive linear relationship between TotalMinutesAsleep and TotalTimeInBed. This shows that people are generally not having trouble sleeping.
activity_sleep <- merge(daily_activity, sleep_day, by = c ("Id", "Date"))
glimpse(activity_sleep)
## Rows: 4,662
## Columns: 18
## $ Id <dbl> 1503960366, 1503960366, 1503960366, 150396036…
## $ Date <date> 2020-01-05, 2020-02-05, 2020-03-05, 2020-05-…
## $ TotalSteps <int> 10602, 14727, 15103, 14070, 12159, 11992, 100…
## $ TotalDistance <dbl> 6.81, 9.71, 9.66, 8.90, 8.03, 7.71, 6.58, 7.7…
## $ TrackerDistance <dbl> 6.81, 9.71, 9.66, 8.90, 8.03, 7.71, 6.58, 7.7…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance <dbl> 2.29, 3.21, 3.73, 2.92, 1.97, 2.46, 3.53, 3.4…
## $ ModeratelyActiveDistance <dbl> 1.60, 0.57, 1.05, 1.08, 0.25, 2.12, 0.32, 0.5…
## $ LightActiveDistance <dbl> 2.92, 5.92, 4.88, 4.88, 5.81, 3.13, 2.73, 3.7…
## $ SedentaryActiveDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveMinutes <int> 33, 41, 50, 45, 24, 37, 44, 46, 46, 36, 25, 4…
## $ FairlyActiveMinutes <int> 35, 15, 24, 24, 6, 46, 8, 11, 31, 23, 13, 28,…
## $ LightlyActiveMinutes <int> 246, 277, 254, 250, 289, 175, 203, 206, 214, …
## $ SedentaryMinutes <int> 730, 798, 816, 857, 754, 833, 574, 835, 746, …
## $ Calories <int> 1820, 2004, 1990, 1959, 1896, 1821, 1740, 181…
## $ TotalSleepRecords <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, …
## $ TotalMinutesAsleep <int> 369, 277, 273, 247, 334, 331, 594, 338, 383, …
## $ TotalTimeInBed <int> 396, 309, 296, 264, 367, 349, 611, 342, 403, …
Q: Do participants who sleep more also take more steps or fewer steps per day?
ggplot(data=activity_sleep, aes(x=TotalTimeInBed, y=TotalSteps)) + geom_point() + geom_smooth() + labs(title = "Time in bed vs. Total steps")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Calculate Pearson's correlation:
cor(x = activity_sleep[,"TotalTimeInBed"],
y = activity_sleep[,"TotalSteps"],
use = "everything",
method = "pearson")
## [1] -0.001130026
There is an almost negligible relationship between the 2 variables. Let’s try something else.
activity_sleep_weight <- merge(activity_sleep, weight, by = c ("Id", "Date"))
glimpse(activity_sleep_weight)
## Rows: 6,316
## Columns: 23
## $ Id <dbl> 1503960366, 1503960366, 1927972279, 192797227…
## $ Date <date> 2020-02-05, 2020-03-05, NA, NA, NA, NA, NA, …
## $ TotalSteps <int> 14727, 15103, 356, 356, 356, 2163, 2163, 2090…
## $ TotalDistance <dbl> 9.71, 9.66, 0.25, 0.25, 0.25, 1.50, 1.50, 1.4…
## $ TrackerDistance <dbl> 9.71, 9.66, 0.25, 0.25, 0.25, 1.50, 1.50, 1.4…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance <dbl> 3.21, 3.73, 0.00, 0.00, 0.00, 0.00, 0.00, 0.0…
## $ ModeratelyActiveDistance <dbl> 0.57, 1.05, 0.00, 0.00, 0.00, 0.40, 0.40, 0.2…
## $ LightActiveDistance <dbl> 5.92, 4.88, 0.25, 0.25, 0.25, 1.10, 1.10, 1.1…
## $ SedentaryActiveDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveMinutes <int> 41, 50, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ FairlyActiveMinutes <int> 15, 24, 0, 0, 0, 9, 9, 6, 9, 0, 0, 0, 0, 0, 0…
## $ LightlyActiveMinutes <int> 277, 254, 32, 32, 32, 88, 88, 75, 88, 32, 0, …
## $ SedentaryMinutes <int> 798, 816, 986, 986, 986, 1292, 1292, 1358, 12…
## $ Calories <int> 2004, 1990, 2151, 2151, 2151, 2383, 2383, 232…
## $ TotalSleepRecords <int> 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, …
## $ TotalMinutesAsleep <int> 277, 273, 475, 296, 166, 475, 166, 166, 296, …
## $ TotalTimeInBed <int> 309, 296, 499, 315, 178, 499, 178, 178, 315, …
## $ WeightKg <dbl> 52.6, 52.6, 133.5, 133.5, 133.5, 133.5, 133.5…
## $ WeightPounds <dbl> 115.9631, 115.9631, 294.3171, 294.3171, 294.3…
## $ Fat <int> 22, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ BMI <dbl> 22.65, 22.65, 47.54, 47.54, 47.54, 47.54, 47.…
## $ IsManualReport <fct> True, True, False, False, False, False, False…
Q: Do people with a higher BMI take less steps?
# Reframe the BMI into categorical ranges
activity_sleep_weight %>%
reframe(
BMI = factor(case_when(
BMI < 18.5 ~ "Underweight",
BMI >= 18.5 & BMI < 22.9 ~ "Normal",
BMI >= 23 & BMI < 29.9 ~ "Overweight",
BMI >= 30 ~ "Obese",
),levels = c("Obese","Overweight","Normal","Underweight")),
TotalSteps) %>%
# Create a box plot visualization
ggplot(aes(BMI,TotalSteps,fill=BMI)) +
geom_boxplot() +
labs(title = "BMI vs TotalSteps",x = NULL) +
coord_flip() +
theme(legend.position = "none",
text = element_text(size = 20),
plot.title = element_text(hjust = 0.5))
options(repr.plot.width = 10, repr.plot.height = 8)
Although our dataset doesn’t contain people who are underweight, it is clear that people who take more steps are generally healthier (having a lower BMI).
# Convert date formats in hourly_steps
h_steps <- hourly_steps %>%
rename(date_time = ActivityHour) %>%
mutate(date_time = as.POSIXct(date_time, format = "%d/%m/%Y %I:%M:%S %p", tz = Sys.timezone())) %>%
separate(date_time, into = c("date", "time"), sep = " ", remove = FALSE) %>%
mutate(date = ymd(date)) %>%
#Drop null values
drop_na()
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 357 rows [1, 457, 481,
## 505, 529, 553, 577, 601, 625, 649, 673, 697, 718, 1174, 1198, 1222, 1246, 1270,
## 1294, 1318, ...].
head(h_steps)
## Id date_time date time StepTotal
## 1 1503960366 2016-12-04 01:00:00 2016-12-04 01:00:00 160
## 2 1503960366 2016-12-04 02:00:00 2016-12-04 02:00:00 151
## 3 1503960366 2016-12-04 03:00:00 2016-12-04 03:00:00 0
## 4 1503960366 2016-12-04 04:00:00 2016-12-04 04:00:00 0
## 5 1503960366 2016-12-04 05:00:00 2016-12-04 05:00:00 0
## 6 1503960366 2016-12-04 06:00:00 2016-12-04 06:00:00 0
# Create a scatter plot to visualize the hourly total steps
ggplot(h_steps, aes(
x = time, y = StepTotal)) + geom_point() +
labs(title = "Hourly Total Steps", x = "") +
theme(axis.text.x = element_text(angle = 90),
text = element_text(size = 20),
plot.title = element_text(hjust = 0.5))
options(repr.plot.width = 10, repr.plot.height = 8)
# Create a bar plot to visualize the hourly average steps
h_steps %>%
group_by(time) %>%
summarize(average_steps = mean(StepTotal)) %>%
ggplot() +
geom_col(mapping = aes(x = time, y = average_steps, fill = average_steps)) +
labs(title = "Hourly Average Steps", x = "", y = "") +
scale_fill_gradient(low = "blue", high = "red") +
theme(axis.text.x = element_text(angle = 90),
text = element_text(size = 20),
plot.title = element_text(hjust = 0.5))
options(repr.plot.width = 10, repr.plot.height = 8)
Users’ total steps exceeding 7,500 occurred predominantly between 7:00-12:00 and 14:00-21:00, with a notable peak at 14:00 where steps exceeded 10,000.
The time periods where average steps per hour are consistently above 400 were noted from 8:00 to 19:00. Two distinct peak periods where average steps surpassed 500 were observed between 12:00-14:00 and 17:00-19:00.
The data reveals distinct patterns in users’ total and average steps throughout the day. The time periods of heightened physical activity align with typical daily routines. The peaks in total steps exceeding 7,500 occur in the morning and early evening, suggesting that users tend to be more active during these times, possibly engaging in routines that involve higher levels of physical activity such as morning exercise or evening walks.
The consistent average steps above 400 between 8:00-19:00 indicate a relatively active routine throughout the day. The identified peak periods with average steps surpassing 500 at 12:00-14:00 and 17:00-19:00 could reflect specific activities or behaviors that lead to increased physical activity levels during lunch breaks and early evening hours.
Overall, the data highlights the importance of time segmentation in understanding users’ activity patterns and trends. By recognizing these peak periods of physical activity, tailored interventions or recommendations can be implemented to promote sustained activity levels and enhance overall health and wellness.