# clear up the environment
rm(list = ls())
# chunk options
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
fig.align = "center",
comment = "#>"
)
In this Capstone project, I will process a dataset built from my own activity records in Strava. Strava is an app for recording exercise activities such as cycling, running, swimming, and hiking. Wikipedia describes the app as follows:
“Strava records data for a user’s activities, which can then be shared with the user’s followers or shared publicly. If an activity is shared publicly, Strava automatically groups activities that occur at the same time and place (such as taking part in a marathon, sportive or group ride). An activity’s recorded information may include a route summary, elevation (net and unidirectional), speed (average, minimum, maximum), timing (total and moving time), power and heart rate. Activities can be recorded using the mobile app or from devices manufactured by third parties like Garmin, Google Fit, Suunto, and Wahoo. Activities can also be entered manually via the Strava website.”
This project describes and plots statistics from my activities in 2020 - 2021. Nothing in this article is confidential: the GPX files were recorded before I moved out, so none of the start points are located at my current address.
The purpose of this project is to summarize and extract insights from one person's activity records. This should be useful for anyone interested in processing their own records to build a better exercise plan or to compare their performance with others or with their own past results.
Feel free to copy, manipulate, or train your data skills with this article and dataset. Check out my Strava Profile.
knitr::include_graphics("images/strava-logo.jpg")
# Import Data and Pre-Processing
Import the libraries that will be used in this project.
# load basic library
options(scipen = 99) #disable scientific notation
library(tidyverse) #collection of R packages for data science
library(dplyr) #grammar of data manipulation
library(readr) #reading data
library(glue) #setting tooltip
library(scales) # scale for plot
#load library for plotting map
library(XML) #to read GPX file
library(OpenStreetMap) #plotting to Open Street
library(lubridate) #handling date columns
library(ggmap) #plotting map
library(ggplot2) #static plot
library(raster) #plotting map, to add background data to map
library(sp) #plotting map complementary
library(mapview) #interactive map plotting
library(leaflet) #interactive map plotting
library(plotly) #interactive plot
#Import Dataset
activities <- read_csv(file = "strava-data/activities.csv")
#Glimpse Dataset
glimpse(activities)
Based on the glimpse above, we remove unnecessary columns to make the data easier and lighter to process. Only the following columns are kept for the project (in their default order):
#preprocessing data
activities_clean <-
activities %>%
select('Activity ID',
'Activity Date',
'Activity Type',
'Elapsed Time...15',
'Distance...7',
'Moving Time',
'Max Speed',
'Average Speed',
'Elevation Gain',
'Average Watts',
'Calories',
'Bike',
'Gear') %>%
rename('activity_id' = 'Activity ID',
'date' = 'Activity Date',
'type' = 'Activity Type',
'elapsed_time' = 'Elapsed Time...15',
'distance' = 'Distance...7',
'moving_time' = 'Moving Time',
'max_speed' = 'Max Speed',
'avg_speed' = 'Average Speed',
'elevation_gain' = 'Elevation Gain',
'avg_watts' = 'Average Watts',
'calories' = 'Calories',
'bike_id' = 'Bike',
'gear_id' = 'Gear')
activities_clean
The 'bike_id' and 'gear_id' columns can be merged into one column, since each row stores its unique equipment ID in only one of them. Combine them as 'equip_id'.
#https://stackoverflow.com/questions/14563531/combine-column-to-remove-nas
activities_clean <-
activities_clean %>%
mutate(equip_id = coalesce(bike_id, gear_id)) %>%
select(-(bike_id:gear_id))
activities_clean
#check missing values
colSums(is.na(activities_clean))
#> activity_id date type elapsed_time distance
#> 0 0 0 0 0
#> moving_time max_speed avg_speed elevation_gain avg_watts
#> 0 0 0 1 81
#> calories equip_id
#> 2 7
Based on the counts of missing values above, several columns contain missing values. We drop the rows where 'elevation_gain' or 'calories' is missing.
#remove missing value
activities_clean <-
activities_clean %>%
filter_at(vars(elevation_gain, calories), all_vars(!is.na(.)))
colSums(is.na(activities_clean))
#> activity_id date type elapsed_time distance
#> 0 0 0 0 0
#> moving_time max_speed avg_speed elevation_gain avg_watts
#> 0 0 0 0 79
#> calories equip_id
#> 0 5
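As a side note, filter_at() still works but is superseded in recent dplyr releases; an equivalent way to write the filter above with if_all() (a sketch, assuming dplyr >= 1.0.0) would be:
#equivalent filter with if_all(): keep rows where neither column is NA
activities_clean %>%
  filter(if_all(c(elevation_gain, calories), ~ !is.na(.x)))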
Change the columns to appropriate types for easier processing and data analysis.
#change columns type
activities_clean_final <-
activities_clean %>%
mutate(
type = as_factor(type),
equip_id = as_factor(equip_id),
activity_id = as.character(activity_id)
)
#finding timezone (tz)
#grep("Jakarta", OlsonNames(), value=TRUE)
#change column date to proper type with lubridate
#input is time in UTC
activities_clean_final$date <- mdy_hms(activities_clean_final$date, tz="Asia/Jakarta") + hours(7)
glimpse(activities_clean_final)
#> Rows: 392
#> Columns: 12
#> $ activity_id <chr> "2933969536", "2988726263", "3000222167", "3017948738",…
#> $ date <dttm> 2019-12-15 05:47:11, 2020-01-07 05:39:06, 2020-01-11 0…
#> $ type <fct> Ride, Ride, Ride, Ride, Ride, Ride, Ride, Ride, Run, Ri…
#> $ elapsed_time <dbl> 42445, 5110, 26515, 8187, 5379, 8205, 33633, 10797, 544…
#> $ distance <dbl> 120.12, 16.69, 49.06, 25.30, 23.15, 24.08, 128.72, 76.4…
#> $ moving_time <dbl> 21971, 3520, 13252, 5944, 4851, 5619, 21776, 10615, 378…
#> $ max_speed <dbl> 16.7, 16.1, 22.8, 14.7, 14.7, 13.2, 15.2, 24.4, 7.8, 18…
#> $ avg_speed <dbl> 5.467503, 4.741960, 3.702445, 4.256830, 4.772233, 4.285…
#> $ elevation_gain <dbl> 2237.42700, 483.71408, 1443.58960, 431.74585, 334.57452…
#> $ avg_watts <dbl> 125.76705, 147.11372, 121.75650, 111.58076, 99.00698, 1…
#> $ calories <dbl> 3080.9978, 577.3930, 1799.0790, 739.5098, 535.5155, 651…
#> $ equip_id <fct> 7360096, 7360096, 7360096, 7360096, 7360096, 7360096, 7…
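As a side note, an equivalent way to handle the UTC input (a sketch, assuming the raw 'Activity Date' really is in UTC as the comment above states) is to parse it as UTC and only change the display timezone with lubridate's with_tz(); the resulting instants match the conversion above.
#alternative timezone handling: parse as UTC, then display in Asia/Jakarta
date_alt <- with_tz(mdy_hms(activities_clean$date, tz = "UTC"), tzone = "Asia/Jakarta")
all.equal(date_alt, activities_clean_final$date)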
In this part, further cleansing is required to eliminate abnormal records, such as an implausible maximum speed or a very short moving time or distance, which suggest GPS recording errors.
# find activities of type Ride with a maximum speed above 120 km/hour
activities_clean_final[activities_clean_final$type == "Ride" & activities_clean_final$max_speed > 120,]
# find activities of type Run with a maximum speed above 20 km/hour (a pace below 3 minutes/km), which I definitely could not manage unless I were a world-class runner
activities_clean_final[activities_clean_final$type == "Run" & activities_clean_final$max_speed > 20,]
# find activities with an elapsed time below 15 minutes (elapsed_time is stored in seconds)
activities_clean_final[activities_clean_final$elapsed_time < 15 * 60,]
#applying filter
activities_clean_final <-
activities_clean_final %>%
filter(activity_id != "3075499358")
#create new columns for time constraint
activities_clean_final$year_record <- year(activities_clean_final$date)
activities_clean_final$month_record <- month(activities_clean_final$date, label = TRUE)
activities_clean_final$hour_record <- hour(activities_clean_final$date)
#take only records from 2020 and 2021
activities_clean_final <-
activities_clean_final %>%
filter(year_record %in% c(2020,2021)) %>%
mutate(
year_record= as_factor(year_record),
month_record= as_factor(month_record)#,
#hour_record= as_factor(hour_record)
)
activities_clean_final
The data below will be presented in the Capstone Shiny Dashboard.
#data aggregation
summary_sport <-
activities_clean_final %>%
summarise(total_moving_time = sum(moving_time),
total_calories = sum(calories),
total_distance = sum(distance))
summary_sport
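Since moving_time and elapsed_time are stored in seconds, the totals above are easier to read once converted; a small sketch (using lubridate's seconds_to_period() on the summary_sport columns above) reformats them for the dashboard value boxes.
#express the aggregated totals in friendlier units
summary_sport %>%
  mutate(total_moving_hours = total_moving_time / 3600,
         total_moving_label = as.character(seconds_to_period(total_moving_time)))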
#data aggregation
active_month_2020 <-
activities_clean_final %>%
filter(year_record == 2020)
#get month data
active_month_2020$month_record <- month(active_month_2020$date, label = TRUE)
active_month_2020 <-
active_month_2020 %>%
group_by(month_record) %>%
summarise(time_total = sum(elapsed_time)) %>%
ungroup() %>%
mutate(total_hours = time_total/3600) %>%
arrange(month_record) %>%
mutate(label = glue("Month : {month_record}
Total Hours : {comma(total_hours)}"))
#data aggregation
active_month_2021 <-
activities_clean_final %>%
filter(year_record == 2021)
#get month data
active_month_2021$month_record <- month(active_month_2021$date, label = TRUE)
active_month_2021 <-
active_month_2021 %>%
group_by(month_record) %>%
summarise(time_total = sum(elapsed_time)) %>%
ungroup() %>%
mutate(total_hours = time_total/3600) %>%
arrange(month_record) %>%
mutate(label = glue("Month : {month_record}
Total Hours : {comma(total_hours)}"))
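The 2020 and 2021 blocks above differ only in the year filter; a single grouped aggregation over both years (a sketch reusing the same columns and label format) would avoid the duplication and can later be split or facetted by year_record.
#data aggregation for both years at once
active_month_all <-
activities_clean_final %>%
group_by(year_record, month_record) %>%
summarise(time_total = sum(elapsed_time)) %>%
ungroup() %>%
mutate(total_hours = time_total/3600,
label = glue("Month : {month_record}
Total Hours : {comma(total_hours)}")) %>%
arrange(year_record, month_record)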
#data aggregation
top_sport <-
activities_clean_final %>%
group_by(type) %>%
summarise(time_total = sum(elapsed_time)) %>%
ungroup() %>%
mutate(total_hours = time_total/3600,
label = glue("Activity Type : {type}
Total Hours : {comma(total_hours)}")) %>%
arrange(desc(total_hours))
top_sport
#data aggregation
fav_hour <-
activities_clean_final %>%
group_by(hour_record) %>%
summarise(total_record = n()) %>%
ungroup() %>%
mutate(label = glue("Activity Count : {total_record}
Starting Hour : {hour_record}")) %>%
arrange(hour_record)
#fav hour lineplot
fav_hour_lineplot <-
ggplot(fav_hour,
mapping = aes (x = hour_record,
group = 1 ,
text = label
)
) +
geom_point(aes(y = total_record),
size = 2,
color = "red"
) +
geom_segment(aes(y = 0,xend = hour_record, yend = total_record),
size = 2,
color = "red") +
scale_x_continuous(breaks = seq(0,23,1),
limits = c(0, 23)
) +
scale_y_continuous(breaks = seq(0,150,25),
limits = c(0, 150)
) +
labs(
title = "Favorite Hour",
y = "Activities Started",
x = "Hour"
) +
theme_minimal()
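The text = label aesthetic is ignored by static ggplot2 and only matters once the plot is made interactive; in the dashboard the stored object can be passed to plotly (a sketch assuming the fav_hour_lineplot object built above), showing only the glue label as the tooltip.
#interactive version for the dashboard, using the glue label as tooltip
ggplotly(fav_hour_lineplot, tooltip = "text")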
#data aggregation
sport_calories <-
activities_clean_final %>%
filter(type == "Ride") %>%
group_by(month_record) %>%
summarise(total_calories = sum(calories)) %>%
ungroup() %>%
mutate(label = glue("Total Calories : {comma(total_calories)}
Month : {month_record}")) %>%
arrange(month_record)
#calories lineplot
sport_calories_lineplot <-
ggplot(sport_calories,
mapping = aes (x = month_record,
group = 1 ,
text = label
)
) +
geom_line(aes(y = total_calories),
size = 1,
show.legend = FALSE,
color = "red"
) +
geom_point(aes(y = total_calories),
size = 2,
color = "red"
) +
scale_y_continuous(labels = comma,
breaks = seq(0,35000,5000),
limits = c(0, 35000)
) +
labs(
title = "Calories Burned Each Month",
y = "Total Calories",
x = "Month"
) +
theme_minimal()
#top calories by activity id
top_calories <-
activities_clean_final %>%
arrange(desc(calories)) %>%
select(activity_id, type, calories, moving_time) %>%
mutate(hours = moving_time/3600,
label = glue("Calories : {comma(calories)}
Activity Type : {type}
Moving Hours : {format(round(hours, 2), nsmall = 2)}")) %>%
head(10)
#plotting data
top_calories_barplot <-
ggplot(data = top_calories,
mapping = aes(x = calories,
y = reorder(activity_id, calories),
text = label)) +
geom_col(aes(fill = calories), show.legend = FALSE) +
scale_fill_gradient(low = "orange", high = "red") +
labs(
title = "Most Epic Activities",
y = "Activity ID",
x = "Calories Burned"
) +
scale_x_continuous(labels = comma,
breaks = seq(0,6000,500),
limits = c(0, 6000)) +
theme_minimal()
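Finally, since these aggregations and plot objects are meant for the Capstone Shiny Dashboard, the cleaned table can be saved once so the app does not have to repeat the pre-processing (a sketch; the output file name is only an example).
#save the cleaned data for the Shiny dashboard (example file name)
saveRDS(activities_clean_final, file = "strava-data/activities_clean_final.RDS")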