Introduction

This post follows on from my investigation of the lubridate package (part of the tidyverse) and puts some of its features into practice by looking at running activity data over time.

I was interested to investigate some ways of plotting my running activities over time. The data is a CSV export from Garmin Connect.

Load packages needed

library(lubridate)
library(dplyr)
library(ggplot2)
library(skimr)
library(summarytools)
library(RColorBrewer)

Read in data

# Load the Garmin Connect CSV export; text columns stay as character vectors
activities <- read.csv(
  file = "../data/Activities.csv",
  stringsAsFactors = FALSE
)

Initial view of data

Notes:

# Number of rows and columns in the raw export
dim(activities)
[1] 660  42
# Alternative overviews (left commented out)
#str(activities)
#summary(activities)
# Compact per-column preview: name, type and the first few values
glimpse(activities)
Observations: 660
Variables: 42
$ Activity.Type            <chr> "running", "running", "running", "running", "running", "running", "running", "running...
$ Date                     <chr> "2018-08-01 18:19:10", "2018-07-31 06:23:37", "2018-07-29 08:11:23", "2018-07-28 10:2...
$ Favorite                 <chr> "false", "false", "false", "false", "false", "false", "false", "false", "false", "fal...
$ Title                    <chr> "Cape Town Running", "Cape Town Running", "Cape Town Running", "Cape Town Running", "...
$ Distance                 <dbl> 10.27, 7.36, 24.92, 14.13, 7.21, 8.13, 10.82, 10.16, 1.68, 18.97, 10.23, 12.01, 9.07,...
$ Calories                 <chr> "351", "311", "1,731", "1,308", "243", "262", "472", "510", "125", "1,235", "610", "6...
$ Time                     <chr> "50:36", "31:40", "4:17:17", "2:49:24", "38:03", "40:48", "49:51", "51:09", "12:15", ...
$ Avg.HR                   <chr> "130", "159", "139", "150", "123", "127", "153", "159", "160", "156", "166", "164", "...
$ Max.HR                   <chr> "154", "180", "185", "188", "141", "157", "178", "174", "173", "179", "190", "187", "...
$ Aerobic.TE               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Avg.Run.Cadence          <chr> "168", "180", "140", "136", "172", "170", "172", "168", "152", "152", "170", "168", "...
$ Max.Run.Cadence          <chr> "180", "194", "248", "248", "178", "178", "248", "176", "168", "246", "184", "182", "...
$ Avg.Pace                 <chr> "4:56", "4:18", "10:20", "11:59", "5:17", "5:01", "4:36", "5:02", "7:18", "7:49", "5:...
$ Best.Pace                <chr> "1:19", "3:15", "4:11", "5:00", "4:16", "3:39", "3:03", "3:42", "5:00", "0:20", "3:49...
$ Elev.Gain                <chr> "91", "7", "1,779", "1,190", "36", "42", "18", "91", "32", "1,288", "170", "206", "9"...
$ Elev.Loss                <chr> "92", "6", "1,581", "1,130", "37", "42", "18", "89", "148", "1,179", "171", "207", "8...
$ Avg.Stride.Length        <dbl> 1.21, 1.29, 0.68, 0.61, 1.10, 1.17, 1.25, 1.18, 0.90, 0.84, 1.13, 1.18, 1.33, 1.07, 1...
$ Avg.Vertical.Ratio       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Avg.Vertical.Oscillation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Avg.Ground.Contact.Time  <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Avg.GCT.Balance          <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Avg.Run.Cadence.1        <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Max.Run.Cadence.1        <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Normalized.Power...NP..  <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ L.R.Balance              <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Training.Stress.Score.   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Max.Avg.Power..20.min.   <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Power                    <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Max.Power                <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Total.Strokes            <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Avg..Swolf               <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Avg.Stroke.Rate          <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Max.Depth                <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Bottom.Time              <chr> "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00...
$ Min.Water.Temp           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Gas.Type                 <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Surface.Interval         <chr> "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00...
$ Decompression            <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "...
$ Weight                   <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Current                  <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Surface.Conditions       <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Water.Type               <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
#skim(activities)

Tidy up data

Extract variables of interest

Many of the columns are empty, so let’s create a smaller subset of those we are interested in.

# Keep only the columns used in the rest of the analysis
running <- activities[
  ,
  c(
    "Activity.Type", "Date", "Distance", "Calories", "Time",
    "Avg.Pace", "Elev.Gain"
  ),
  drop = FALSE
]
# Report the class of every retained column
lapply(running, class)
$Activity.Type
[1] "character"

$Date
[1] "character"

$Distance
[1] "numeric"

$Calories
[1] "character"

$Time
[1] "character"

$Avg.Pace
[1] "character"

$Elev.Gain
[1] "character"

Transform data types

read.csv imported some of the numbers, dates and times as characters, so let’s fix this. Also, convert Activity.Type to a factor to see if there is more than one level.

# Numeric values
# Remove the thousands separator (","), then convert to numeric.
# Entries that are still not numbers afterwards become NA (with a warning).
running$Calories <- as.numeric(gsub(",", "", running$Calories))
running$Elev.Gain <- as.numeric(gsub(",", "", running$Elev.Gain))
# Date-time column
# Parse the timestamp from `running` itself (not from `activities`) so the
# conversion does not depend on the two data frames having aligned rows.
running$Date <- ymd_hms(running$Date)
# TODO Figure out how to convert different time durations to actual times
# (Time holds both "mm:ss" and "h:mm:ss" values)
#running$Duration <- ms(running$Time)
#running$Avg.Pace <- ms(running$Avg.Pace)
# NOTE: Best.Pace is not part of the `running` subset; it would have to be
# added to the subset before this line can be enabled
#running$Best.Pace <- ms(running$Best.Pace)
# Convert activity type to factor and list its levels
running$Activity.Type <- as.factor(running$Activity.Type)
levels(running$Activity.Type)
[1] "running"        "street_running" "track_running"  "trail_running" 

View data again using summarytools package

# Render a per-column summary of the data frame (stats, frequencies, missing)
view(dfSummary(running), method = "render")

Data Frame Summary

running

N: 660
No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
1 Activity.Type [factor] 1. running 2. street_running 3. track_running 4. trail_running 653 (98.9%) 2 (0.3%) 2 (0.3%) 3 (0.4%) 660 (100%) 0 (0%)
2 Date [POSIXct, POSIXt] min : 2014-12-24 07:43:13 med : 2017-03-25 12:42:45 max : 2018-08-01 18:19:10 range : 3y 7m 8d 10H 35M 57S 660 distinct val. 660 (100%) 0 (0%)
3 Distance [numeric] mean (sd) : 10.43 (7.67) min < med < max : 0.23 < 8.14 < 65.6 IQR (CV) : 4.24 (0.74) 465 distinct values 660 (100%) 0 (0%)
4 Calories [numeric] mean (sd) : 627.37 (406.27) min < med < max : 14 < 516 < 3033 IQR (CV) : 281.5 (0.65) 460 distinct values 659 (99.85%) 1 (0.15%)
5 Time [character] 1. 30:21 2. 30:45 3. 39:04 4. 39:15 5. 40:52 6. 1:12:03 7. 1:13:37 8. 1:20:13 9. 1:26:24 10. 20:17 [ 577 others ] 3 (0.4%) 3 (0.4%) 3 (0.4%) 3 (0.4%) 3 (0.4%) 2 (0.3%) 2 (0.3%) 2 (0.3%) 2 (0.3%) 2 (0.3%) 635 (95.2%) 660 (100%) 0 (0%)
6 Avg.Pace [character] 1. 5:05 2. 5:14 3. 5:02 4. 4:56 5. 5:03 6. 5:08 7. 4:57 8. 5:07 9. 4:55 10. 5:01 [ 205 others ] 19 (2.9%) 17 (2.6%) 16 (2.4%) 15 (2.3%) 15 (2.3%) 14 (2.1%) 13 (2.0%) 13 (2.0%) 12 (1.8%) 12 (1.8%) 514 (77.6%) 660 (100%) 0 (0%)
7 Elev.Gain [numeric] mean (sd) : 194.71 (266.72) min < med < max : 1 < 87 < 1779 IQR (CV) : 202 (1.37) 311 distinct values 645 (97.73%) 15 (2.27%)

Generated by summarytools 0.8.7 (R version 3.4.4)
2018-09-15

Subset again to keep only the three most recent calendar years of data (2016 onwards; 2018 is a partial year)

# Keep activities from 1 January 2016 onwards.
# The cut-off is built as an explicit UTC date-time: comparing a POSIXct column
# with a bare string makes R parse the string in the session's local time
# zone, so the boundary could shift from machine to machine.
running <- subset(running, Date >= as.POSIXct("2016-01-01", tz = "UTC"))

Create new variables

# Split the activity timestamp into calendar components
running[["year"]] <- year(running[["Date"]])
running[["month"]] <- month(running[["Date"]], label = TRUE)
running[["week"]] <- week(running[["Date"]])
running[["wday"]] <- wday(running[["Date"]], label = TRUE, abbr = FALSE)
running[["hour"]] <- hour(running[["Date"]])
# Re-level the weekday factor in reverse calendar order (Sunday first,
# Monday last)
week <- c(
  "Sunday", "Saturday", "Friday", "Thursday", "Wednesday", "Tuesday", "Monday"
)
running[["wday"]] <- factor(running[["wday"]], levels = week)

Exploratory visualisation

Use dplyr to group and summarise and create some plots.

# Number of runs and total distance for every month of every year
year_month <- summarise(
  group_by(running, year, month),
  total_runs = n(),
  total_distance = sum(Distance)
)
# TODO: plot the monthly totals

Heatmaps

Weekly pattern - average distance run at different times during the week

# Group by week day, then hour, and calculate the mean distance per group
heatmap_dist <- running %>%
  group_by(wday, hour) %>%
  summarise(mean_dist = mean(Distance))
# Build the full week-day x hour grid (7 x 24 rows) so that slots without any
# activity still get a tile in the heatmap. `grid` is not created anywhere
# else in the visible code, so it is defined here; the column names Var1/Var2
# are the ones the join below refers to. Week days are kept as a factor in the
# same level order as running$wday.
grid <- expand.grid(
  Var1 = levels(running$wday),
  Var2 = 0:23,
  stringsAsFactors = TRUE
)
# Join grid and heatmap_dist, keeping every grid row
heatmap_dist_full <- heatmap_dist %>%
  right_join(grid, by = c("wday" = "Var1", "hour" = "Var2"))
Column `wday`/`Var1` has different attributes on LHS and RHS of join
# Slots with no activity come out of the join as NA; show them as 0
heatmap_dist_full[is.na(heatmap_dist_full)] <- 0
# Plot heatmap using ggplot: a 100-step "YlGnBu" ramp with white prepended
pal <- colorRampPalette(brewer.pal(9, "YlGnBu"))(100)
ggplot(heatmap_dist_full, aes(x = hour, y = wday)) +
  geom_tile(aes(fill = mean_dist), colour = "white") +
  scale_fill_gradientn(name = "Mean distance", colours = c("white", pal)) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  ggtitle("Heatmap showing weekly activity pattern")

Mmmm…that doesn’t look right - what happened at 2pm on a Sunday?! Let’s investigate.

# Pull out the suspiciously long activities (distance of 50 or more)
running_outliers <- running[which(running$Distance >= 50), ]
running_outliers

Firstly, look at the average pace of entries 345 and 352: this is rather quick for a run! As this is my own data, I know where these outliers come from. These were actually bike rides that I didn’t classify correctly. Secondly, the 65 km entry on 13 August was when I used my Garmin watch to measure how far we travelled on a boat in Brazil :) So, let’s remove these three entries.

# Drop the misclassified activities by keeping only distances of at most 60
running <- subset(running, Distance <= 60)
# Rebuild the heatmap data from the cleaned data set
# (same steps as before, repeated here for ease of reference)
# Mean distance for every week day / hour combination
heatmap_dist <- summarise(
  group_by(running, wday, hour),
  mean_dist = mean(Distance)
)
# Join grid and heatmap_dist, keeping every row of grid
heatmap_dist_full <- right_join(
  heatmap_dist,
  grid,
  by = c("wday" = "Var1", "hour" = "Var2")
)
Column `wday`/`Var1` has different attributes on LHS and RHS of join
# Empty week day / hour slots are NA after the join; set them to 0
heatmap_dist_full[is.na(heatmap_dist_full)] <- 0
# Draw the heatmap with ggplot
pal <- colorRampPalette(brewer.pal(9, "YlGnBu"))(100)
ggplot(data = heatmap_dist_full, mapping = aes(x = hour, y = wday)) +
  geom_tile(aes(fill = mean_dist), colour = "white") +
  scale_fill_gradientn(name = "Mean distance", colours = c("white", pal)) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  ggtitle("Heatmap showing weekly activity pattern")

# Save plot (no plot argument is given, so ggsave uses its default plot)
ggsave("../results/heatmap_weekly_dist.png", width = 10, height = 7)

Calendar heat map

# Run the local helper script (presumably the one that defines calendarHeat())
source("calendarHeat.R")
# Calendar heat map of distance by activity date
# NOTE(review): the rendered output shows "the condition has length > 1 and
# only the first element will be used" after this chunk; R >= 4.2 turns that
# condition into an error, so calendarHeat.R may need updating - verify.
calendarHeat(
  running$Date,
  running$Distance,
  varname = "running activities and distances",
  color = "y2b",
  ncolors = 20
)
Loading required package: lattice
Loading required package: grid
Loading required package: chron

Attaching package: ‘chron’

The following objects are masked from ‘package:lubridate’:

    days, hours, minutes, seconds, years

the condition has length > 1 and only the first element will be used