This follows on from investigating the lubridate package within the tidyverse and putting some of the features into practice by looking at running activity data over time.
I was interested to investigate some ways of plotting my running activities over time. The data is a CSV export from Garmin Connect.
library(lubridate)
library(dplyr)
library(ggplot2)
library(skimr)
library(summarytools)
library(RColorBrewer)
# Load the Garmin Connect CSV export; keep text columns as character vectors
# instead of converting them to factors.
activities <- read.csv(
  file = "../data/Activities.csv",
  stringsAsFactors = FALSE
)
Notes:
# Number of rows and columns in the imported data
dim(activities)
[1] 660 42
# Alternative overviews, kept for reference:
#str(activities)
#summary(activities)
# Compact preview of every column: name, type and the first few values
glimpse(activities)
Observations: 660
Variables: 42
$ Activity.Type <chr> "running", "running", "running", "running", "running", "running", "running", "running...
$ Date <chr> "2018-08-01 18:19:10", "2018-07-31 06:23:37", "2018-07-29 08:11:23", "2018-07-28 10:2...
$ Favorite <chr> "false", "false", "false", "false", "false", "false", "false", "false", "false", "fal...
$ Title <chr> "Cape Town Running", "Cape Town Running", "Cape Town Running", "Cape Town Running", "...
$ Distance <dbl> 10.27, 7.36, 24.92, 14.13, 7.21, 8.13, 10.82, 10.16, 1.68, 18.97, 10.23, 12.01, 9.07,...
$ Calories <chr> "351", "311", "1,731", "1,308", "243", "262", "472", "510", "125", "1,235", "610", "6...
$ Time <chr> "50:36", "31:40", "4:17:17", "2:49:24", "38:03", "40:48", "49:51", "51:09", "12:15", ...
$ Avg.HR <chr> "130", "159", "139", "150", "123", "127", "153", "159", "160", "156", "166", "164", "...
$ Max.HR <chr> "154", "180", "185", "188", "141", "157", "178", "174", "173", "179", "190", "187", "...
$ Aerobic.TE <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Avg.Run.Cadence <chr> "168", "180", "140", "136", "172", "170", "172", "168", "152", "152", "170", "168", "...
$ Max.Run.Cadence <chr> "180", "194", "248", "248", "178", "178", "248", "176", "168", "246", "184", "182", "...
$ Avg.Pace <chr> "4:56", "4:18", "10:20", "11:59", "5:17", "5:01", "4:36", "5:02", "7:18", "7:49", "5:...
$ Best.Pace <chr> "1:19", "3:15", "4:11", "5:00", "4:16", "3:39", "3:03", "3:42", "5:00", "0:20", "3:49...
$ Elev.Gain <chr> "91", "7", "1,779", "1,190", "36", "42", "18", "91", "32", "1,288", "170", "206", "9"...
$ Elev.Loss <chr> "92", "6", "1,581", "1,130", "37", "42", "18", "89", "148", "1,179", "171", "207", "8...
$ Avg.Stride.Length <dbl> 1.21, 1.29, 0.68, 0.61, 1.10, 1.17, 1.25, 1.18, 0.90, 0.84, 1.13, 1.18, 1.33, 1.07, 1...
$ Avg.Vertical.Ratio <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Avg.Vertical.Oscillation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Avg.Ground.Contact.Time <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Avg.GCT.Balance <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Avg.Run.Cadence.1 <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Max.Run.Cadence.1 <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Normalized.Power...NP.. <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ L.R.Balance <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Training.Stress.Score. <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Max.Avg.Power..20.min. <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Power <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Max.Power <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Total.Strokes <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Avg..Swolf <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Avg.Stroke.Rate <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Max.Depth <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Bottom.Time <chr> "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00...
$ Min.Water.Temp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ Gas.Type <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Surface.Interval <chr> "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00", "0:00...
$ Decompression <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "...
$ Weight <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Current <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Surface.Conditions <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
$ Water.Type <chr> "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "--", "...
#skim(activities)
Many of the columns are empty, so let’s create a smaller subset of those we are interested in.
# Keep only the columns that are needed for the analysis
keep_cols <- c(
  "Activity.Type", "Date", "Distance", "Calories", "Time",
  "Avg.Pace", "Elev.Gain"
)
running <- activities[, keep_cols, drop = FALSE]
# Check which class each retained column was imported as
lapply(running, class)
$Activity.Type
[1] "character"
$Date
[1] "character"
$Distance
[1] "numeric"
$Calories
[1] "character"
$Time
[1] "character"
$Avg.Pace
[1] "character"
$Elev.Gain
[1] "character"
read.csv imported some of the numbers, dates and times as characters, so let’s fix this. Also, convert Activity.Type to factor to see if there is more than one level.
# Numeric values
# Remove the thousands separator, then convert to numeric. Entries that are
# still not a number after that are coerced to NA by as.numeric().
running$Calories <- as.numeric(gsub(",", "", running$Calories, fixed = TRUE))
running$Elev.Gain <- as.numeric(gsub(",", "", running$Elev.Gain, fixed = TRUE))

# Date-time column
# Parse from `running` itself (not from `activities`) so the parsed values
# always stay aligned with the rows of the data frame they are written to.
running$Date <- ymd_hms(running$Date)

# TODO Figure out how to convert different time durations to actual times.
# Time mixes "mm:ss" and "h:mm:ss" values, so a single ms() call is not enough.
# Best.Pace is not part of the `running` subset; select it first if needed.
#running$Duration <- ms(running$Time)
#running$Avg.Pace <- ms(running$Avg.Pace)
#running$Best.Pace <- ms(running$Best.Pace)

# Convert activity type to factor and list its levels
running$Activity.Type <- as.factor(running$Activity.Type)
levels(running$Activity.Type)
[1] "running" "street_running" "track_running" "trail_running"
View data again using summarytools package
# Summarise every column of the cleaned subset with summarytools and display
# it using the "render" method.
view(
  dfSummary(running),
  method = "render"
)
No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing |
---|---|---|---|---|---|---|
1 | Activity.Type [factor] | 1. running 2. street_running 3. track_running 4. trail_running | 653 (98.9%) 2 (0.3%) 2 (0.3%) 3 (0.4%) | 660 (100%) | 0 (0%) | |
2 | Date [POSIXct, POSIXt] | min : 2014-12-24 07:43:13 med : 2017-03-25 12:42:45 max : 2018-08-01 18:19:10 range : 3y 7m 8d 10H 35M 57S | 660 distinct val. | 660 (100%) | 0 (0%) | |
3 | Distance [numeric] | mean (sd) : 10.43 (7.67) min < med < max : 0.23 < 8.14 < 65.6 IQR (CV) : 4.24 (0.74) | 465 distinct values | 660 (100%) | 0 (0%) | |
4 | Calories [numeric] | mean (sd) : 627.37 (406.27) min < med < max : 14 < 516 < 3033 IQR (CV) : 281.5 (0.65) | 460 distinct values | 659 (99.85%) | 1 (0.15%) | |
5 | Time [character] | 1. 30:21 2. 30:45 3. 39:04 4. 39:15 5. 40:52 6. 1:12:03 7. 1:13:37 8. 1:20:13 9. 1:26:24 10. 20:17 [ 577 others ] | 3 (0.4%) 3 (0.4%) 3 (0.4%) 3 (0.4%) 3 (0.4%) 2 (0.3%) 2 (0.3%) 2 (0.3%) 2 (0.3%) 2 (0.3%) 635 (95.2%) | 660 (100%) | 0 (0%) | |
6 | Avg.Pace [character] | 1. 5:05 2. 5:14 3. 5:02 4. 4:56 5. 5:03 6. 5:08 7. 4:57 8. 5:07 9. 4:55 10. 5:01 [ 205 others ] | 19 (2.9%) 17 (2.6%) 16 (2.4%) 15 (2.3%) 15 (2.3%) 14 (2.1%) 13 (2.0%) 13 (2.0%) 12 (1.8%) 12 (1.8%) 514 (77.6%) | 660 (100%) | 0 (0%) | |
7 | Elev.Gain [numeric] | mean (sd) : 194.71 (266.72) min < med < max : 1 < 87 < 1779 IQR (CV) : 202 (1.37) | 311 distinct values | 645 (97.73%) | 15 (2.27%) |
Generated by summarytools 0.8.7 (R version 3.4.4)
2018-09-15
# Keep activities from 2016 onwards. The cutoff is built as a UTC date-time
# because ymd_hms() parses into UTC by default; comparing against a bare
# string would convert the string in the session's local time zone and could
# shift the cutoff by the local UTC offset.
running <- subset(running, Date >= as.POSIXct("2016-01-01", tz = "UTC"))

# Derive calendar features from the Date column
running$year <- year(running$Date)
running$month <- month(running$Date, label = TRUE)
running$week <- week(running$Date)
running$wday <- wday(running$Date, label = TRUE, abbr = FALSE)
running$hour <- hour(running$Date)

# Weekday order used for the heatmap axis. NOTE: this vector shares its name
# with the `running$week` column and with lubridate's week() function.
week <- c("Sunday", "Saturday", "Friday", "Thursday", "Wednesday", "Tuesday",
          "Monday")
running$wday <- factor(running$wday, levels = week)
Use dplyr to group and summarise and create some plots.
# Create summary: number of runs and total distance per year and month
year_month <- running %>%
  group_by(year, month) %>%
  summarise(total_runs = n(), total_distance = sum(Distance)) %>%
  ungroup()  # drop the leftover grouping so later verbs act on all rows
# Plot data
# TODO
# Create a complete grid of every weekday/hour combination, even those without
# any activity. The weekday column is built as a factor with the same levels
# and orderedness as running$wday so that both join keys carry identical
# attributes; a mismatch is what triggers dplyr's "different attributes"
# warning on the join. The columns are left unnamed so expand.grid() calls
# them Var1 and Var2.
hour <- 0:23
wday_levels <- levels(running$wday)
grid <- expand.grid(
  factor(wday_levels, levels = wday_levels, ordered = is.ordered(running$wday)),
  hour
)
# Group by week day, then hour and count the activities in each group
heatmap_runs <- running %>%
  group_by(wday, hour) %>%
  summarise(no_runs = n()) %>%
  ungroup()
# Join grid and heatmap_runs. Without this the result would only contain the
# weekday/hour cells that have data, leaving gaps in the heatmap instead of
# cells that can be shown as 0 runs.
heatmap_runs_full <- heatmap_runs %>%
  right_join(grid, by = c("wday" = "Var1", "hour" = "Var2"))
Column `wday`/`Var1` has different attributes on LHS and RHS of join
# Cells without any activity come out of the join as NA; show them as 0
heatmap_runs_full <- replace(heatmap_runs_full, is.na(heatmap_runs_full), 0)
# Plot heatmap using ggplot: one tile per weekday/hour, shaded by run count
ggplot(heatmap_runs_full, aes(x = hour, y = wday, fill = no_runs)) +
  geom_tile(colour = "white") +
  scale_fill_gradient(name = "No. of runs", low = "white", high = "red") +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  ggtitle("Heatmap showing weekly activity pattern")
# Using ColorBrewer to generate palette
display.brewer.all()  # show the available palettes for reference
# Interpolate the 9-colour "YlOrRd" palette to 100 shades
pal <- colorRampPalette(brewer.pal(9, "YlOrRd"))(100)
# Keep the plot in a variable so that exactly this plot is handed to ggsave(),
# instead of relying on whichever plot happens to be the most recent one.
heatmap_counts_plot <- ggplot(heatmap_runs_full, aes(x = hour, y = wday)) +
  geom_tile(aes(fill = no_runs), colour = "white") +
  scale_fill_gradientn(name = "No. of runs", colours = c("white", pal)) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  ggtitle("Heatmap showing weekly activity pattern")
print(heatmap_counts_plot)
# Save plot
ggsave("../results/heatmap_weekly_counts.png", plot = heatmap_counts_plot,
       width = 10, height = 7)
# Group by week day, then hour and calculate the mean distance per group
heatmap_dist <- running %>%
  group_by(wday, hour) %>%
  summarise(mean_dist = mean(Distance)) %>%
  ungroup()
# Join grid and heatmap_dist.
# Both weekday keys are converted to character first so that they share one
# type regardless of how `grid` stores the weekday (mismatched key attributes
# make the join warn). The weekday order from `week` is re-applied afterwards.
heatmap_dist_full <- heatmap_dist %>%
  mutate(wday = as.character(wday)) %>%
  right_join(
    mutate(grid, Var1 = as.character(Var1)),
    by = c("wday" = "Var1", "hour" = "Var2")
  ) %>%
  mutate(wday = factor(wday, levels = week))
Column `wday`/`Var1` has different attributes on LHS and RHS of join
# Weekday/hour cells without any run have no mean distance; display them as 0
heatmap_dist_full <- replace(heatmap_dist_full, is.na(heatmap_dist_full), 0)
# Plot heatmap using ggplot, with a 100-shade "YlGnBu" palette
pal <- colorRampPalette(brewer.pal(9, "YlGnBu"))(100)
ggplot(heatmap_dist_full, aes(x = hour, y = wday, fill = mean_dist)) +
  geom_tile(colour = "white") +
  scale_fill_gradientn(name = "Mean distance", colours = c("white", pal)) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  ggtitle("Heatmap showing weekly activity pattern")
Mmmm…that doesn’t look right - what happened at 2pm on a Sunday?! Let’s investigate.
# List the suspiciously long activities (Distance of 50 or more). which()
# drops NA comparisons and keeps the original row names, so individual
# entries can still be identified by their row number.
running_outliers <- running[which(running$Distance >= 50), ]
running_outliers
Firstly, look at the average pace of entries 345 and 352. This is rather quick for a run! As this is my own data, I know what these outliers are from! These were actually cycles where I didn’t classify the activities correctly. Secondly, the 65km entry on 13 August was when I used my Garmin watch to measure how far we travelled on a boat in Brazil :) So, let’s remove these three entries.
# Remove exactly the activities that were listed as outliers (Distance >= 50).
# Using the complementary condition guarantees that every listed entry is
# dropped; a cutoff of 60 would keep any outlier between 50 and 60.
running <- subset(running, Distance < 50)
# And create the graphs again above.
# Repeated here just for ease of reference.
# Group by week day, then hour and calculate the mean distance per group
heatmap_dist <- running %>%
  group_by(wday, hour) %>%
  summarise(mean_dist = mean(Distance)) %>%
  ungroup()
# Join grid and heatmap_dist.
# Both weekday keys are converted to character first so that they share one
# type regardless of how `grid` stores the weekday (mismatched key attributes
# make the join warn). The weekday order from `week` is re-applied afterwards.
heatmap_dist_full <- heatmap_dist %>%
  mutate(wday = as.character(wday)) %>%
  right_join(
    mutate(grid, Var1 = as.character(Var1)),
    by = c("wday" = "Var1", "hour" = "Var2")
  ) %>%
  mutate(wday = factor(wday, levels = week))
Column `wday`/`Var1` has different attributes on LHS and RHS of join
# Replace NA with 0 (weekday/hour cells without any run)
heatmap_dist_full[is.na(heatmap_dist_full)] <- 0
# Plot heatmap using ggplot, with a 100-shade "YlGnBu" palette
pal <- colorRampPalette(brewer.pal(9, "YlGnBu"))(100)
# Keep the plot in a variable so that exactly this plot is handed to ggsave(),
# instead of relying on whichever plot happens to be the most recent one.
heatmap_dist_plot <- ggplot(heatmap_dist_full, aes(x = hour, y = wday)) +
  geom_tile(aes(fill = mean_dist), colour = "white") +
  scale_fill_gradientn(name = "Mean distance", colours = c("white", pal)) +
  scale_x_continuous(breaks = seq(0, 23, 1)) +
  ggtitle("Heatmap showing weekly activity pattern")
print(heatmap_dist_plot)
# Save plot
ggsave("../results/heatmap_weekly_dist.png", plot = heatmap_dist_plot,
       width = 10, height = 7)
# Load the calendarHeat() helper from the local script
source("calendarHeat.R")
# Pass plain Date values: running$Date is POSIXct, which has a two-element
# class vector. That presumably is what triggers the "condition has length > 1"
# warning inside calendarHeat() (its source is not shown here; TODO confirm).
calendarHeat(as.Date(running$Date), running$Distance,
             varname = "running activities and distances",
             color = "y2b", ncolors = 20)
Loading required package: lattice
Loading required package: grid
Loading required package: chron
Attaching package: ‘chron’
The following objects are masked from ‘package:lubridate’:
days, hours, minutes, seconds, years
the condition has length > 1 and only the first element will be used
<<<<<<< HEAD
======= >>>>>>> b1a1cff6c76adb84db272af543df1ec3940595db