Intro

Strava.com, a social networking website for endurance athletes, has a series of metrics for its premium users. We will be reverse engineering their Fitness metric - a status of fitness which changes over time based on the level of activity of the athlete. This appears to leverage a metric calculated for each workout called Relative Effort, which was previously named “Suffer Score.”

Fitness

The graph showing Fitness Score over time includes a metric called “Training Impulse” which is consistently 1.3x the Suffer Score for the workout that day. It's unclear why these metrics are so closely related yet carry different names - or why they are scaled differently; however, for our purposes this does not matter. Suffer Score seems to be calculated from the amount of time spent in each HR zone.

Suffer Score

We will start by scraping the page listing each of my workouts (120 in total), pulling the time spent in each HR zone and the Suffer Score for the workout. From here we can run a linear model to study the relationship between HR Zones and Suffer Score. If this relationship is in fact linear, we can study the correlation between the fitness metric and Suffer Score per workout over time by way of a second linear model.

Setup

Install Packages

library(rvest)
library(RSelenium)

## Warning: package 'RSelenium' was built under R version 3.6.3

library(stringr)
library(purrr)
library(lubridate)
library(ggplot2)
library(dplyr)

Store passwords

Stored email and password as variables, not included here for obvious reasons.

Start Session and Log In

# Create a remote-driver client for a Selenium server at localhost:4445 and
# open a Firefox session through it (assumes that server is already running).
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4445L, browserName = "firefox")
remDr$open()

url <- "https://www.strava.com/login"
remDr$navigate(url) # Navigate to login page

# Fixed pause before looking up the form fields.
Sys.sleep(5)
# `email` and `password` must already exist; they are defined outside this listing.
LoginWindow <- remDr$findElement(using = "xpath", value = '//*[(@id = "email")]')
LoginWindow$sendKeysToElement(list(email))
PwWindow <- remDr$findElement(using = "xpath", value = '//*[(@id = "password")]')
PwWindow$sendKeysToElement(list(password))

Sys.sleep(5)
#remDr$findElement(using = "xpath", value = '//*[(@id = "password")]')$clickElement() # Click the "Login" button

# Submit the form by sending the Enter key to the login button; the click-based
# attempt above is left disabled.
submitButton <- remDr$findElement(using = "xpath", value = '//*[(@id = "login-button")]')
submitButton$sendKeysToElement(list("R Cran", key = "enter"))
remDr$getCurrentUrl()

# NOTE(review): there are no parentheses here, so `x` receives the screenshot
# method itself rather than a screenshot; `x` is not used again in this listing.
x <- remDr$screenshot
Sys.sleep(5)

# Training page that lists the workouts; revisited later via `url2`.
url2 = "https://www.strava.com/athlete/training"
remDr$navigate(url2)
remDr$getCurrentUrl()

WebScrape HR & Suffer Score

Pull Workout URLS

Go to training page - scrape workout URLs from 6 pages (20 workouts a page, 120 workouts minus those without HR data).

# Open an athlete profile and read the name text shown on the page.
remDr$navigate("https://www.strava.com/athletes/235069")
page <- read_html(remDr$getPageSource()[[1]])
name <- html_text(html_nodes(page, ".athlete-name"))
name

## [1] "\nchris bloome\n" "chris bloome"

remDr$navigate(url2)

# Reload the training page. The method has to be *called*: a bare
# `remDr$refresh` only prints the method definition and reloads nothing.
remDr$refresh()

# Short fixed pause after the reload (previously two consecutive 2 s sleeps).
Sys.sleep(4)

#workout_type <- remDr$findElement(using = "xpath", value = '//*[(@id = "activity_type")]')

#workout_type$sendKeysToElement(list("Run"))


# Longer fixed pause before the page source is read.
Sys.sleep(20)

# Check that the right page loaded by reading its activity counter.
page <- read_html(remDr$getPageSource()[[1]])
num_workouts <- html_text(html_nodes(page, ".activity-count"))
num_workouts

## [1] "656 Activities"

# Return the href of every link inside the search-results area of a parsed
# training page. Non-workout links are included; they are filtered out later.
extract_workout_urls <- function(page) {
  page %>%
    html_nodes("#search-results a") %>%
    html_attr("href") %>%
    as.vector()
}

# Return the dates listed in the date column of a parsed training page.
# The first five characters of each cell are dropped before parsing as
# month/day/year. as.vector() is kept from the original code; it is expected
# to strip the Date class (leaving day counts), which as_date() restores later.
extract_workout_dates <- function(page) {
  date_text <- page %>%
    html_nodes(".view-col.col-date") %>%
    html_text()
  as.vector(mdy(substr(date_text, 6, nchar(date_text))))
}

# First page: `page` was parsed just before this block.
URL_Series <- extract_workout_urls(page)
Date_Series <- extract_workout_dates(page)

# Five more pages: click "next", wait, re-parse, and append the results.
for (i in 1:5) {
  next_page <- remDr$findElement(using = "xpath", value = '//*[contains(concat( " ", @class, " " ), concat( " ", "next_page", " " ))]')
  #next_page$sendKeysToElement(list("R Cran", key = "enter"))
  next_page$clickElement()
  Sys.sleep(2)

  page <- read_html(remDr$getPageSource()[[1]])

  # Print the pagination text so progress through the pages is visible.
  num_pages <- page %>%
    html_nodes(".pagination") %>%
    html_text()
  print(num_pages)

  URL_Series <- c(URL_Series, extract_workout_urls(page))
  Date_Series <- c(Date_Series, extract_workout_dates(page))
}

## [1] "21-40 of 656\n<U+2190>\n  \n  <U+2192>\n  \n"
## [1] "41-60 of 656\n<U+2190>\n  \n  <U+2192>\n  \n"
## [1] "61-80 of 656\n<U+2190>\n  \n  <U+2192>\n  \n"
## [1] "81-100 of 656\n<U+2190>\n  \n  <U+2192>\n  \n"
## [1] "101-120 of 656\n<U+2190>\n  \n  <U+2192>\n  \n"

Remove all other fields other than genuine workout URLs

Web Scrape contained a number of other links

# Keep only the scraped hrefs that start with the absolute Strava prefix;
# the scrape also picked up other links.
URL_Series_DF <- data.frame(URL_Series)

URL_Series_DF2 <- data.frame(
  URL_Series_DF[grep("^https://www.s", URL_Series_DF[, 1]), ]
)

# Column 2 holds the address of each workout's heart-rate page.
URL_Series_DF2[, 2] <- paste0(URL_Series_DF2[, 1], "/heartrate")

WorkoutCount <- nrow(URL_Series_DF2)

Scrape each workout for time in each zone and Suffer Score

# Visit each workout's heart-rate page and store the scraped Suffer Score and
# time-in-zone strings in columns 3-9 of URL_Series_DF2.
# seq_len() means the loop does not run when no workout URLs were found;
# 1:WorkoutCount would iterate over c(1, 0) in that case.
for (i in seq_len(WorkoutCount)) {
  urlx <- URL_Series_DF2[i, 2]
  remDr$navigate(urlx)
  remDr$getCurrentUrl()
  Sys.sleep(1)
  page <- read_html(remDr$getPageSource()[[1]])

  # Text of the matched nodes with newlines removed.
  ZoneString <- page %>%
    html_nodes(".suffer-score a , .time strong") %>%
    html_text() %>%
    str_replace_all("\n", "")

  # Only pages that yield exactly 7 values are stored; any other row is left
  # unfilled (presumably workouts without heart-rate data).
  if (length(ZoneString) == 7) {
    URL_Series_DF2[i, 3:9] <- ZoneString
  }
}

# Keep the Suffer Score (column 3) and the five zone times (columns 5-9) of
# the scrape; column 4 is not used.
Strava_Table <- URL_Series_DF2[, c(3, 5:9)]

# Start Strava_Table2 with the raw Suffer Score, one row per workout.
# seq_len() avoids iterating over c(1, 0) when there are no workouts.
Strava_Table2 <- data.frame()
for (i in seq_len(WorkoutCount)) {
  Strava_Table2[i, 1] <- Strava_Table[i, 1]
}

Clean data - convert time fields to seconds.

# Convert one scraped time-in-zone string to a number of seconds.
# Recognised shapes: a value with a single "s" (e.g. "45s"), "mm:ss", and
# "h:mm:ss". NA stays NA. Anything else raises a warning and becomes NA so the
# row can be checked by hand, instead of writing a "Check" string into a
# numeric column.
zone_to_seconds <- function(cell) {
  if (is.na(cell)) {
    NA_real_
  } else if (str_count(cell, pattern = "s") == 1) {
    as.numeric(str_remove(cell, "s"))
  } else if (str_count(cell, pattern = ":") == 2) {
    as.numeric(as.duration(lubridate::hms(cell)))
  } else if (str_count(cell, pattern = ":") == 1) {
    as.numeric(as.duration(lubridate::ms(cell)))
  } else {
    warning("Check: unrecognised time-in-zone value '", cell, "'", call. = FALSE)
    NA_real_
  }
}

# Column 1: Suffer Score is scraped as text; store it as a number.
Strava_Table2 <- data.frame(V1 = as.numeric(Strava_Table[[1]]))

# Columns 2-6: time in each of the five zones, in seconds.
for (i in 2:6) {
  Strava_Table2[[i]] <- vapply(
    Strava_Table[[i]],
    zone_to_seconds,
    numeric(1),
    USE.NAMES = FALSE
  )
}

Linear Model - HR to Suffer Score

# Label the six columns, then coerce each one to plain numeric for modelling.
model_cols <- c("Suffer_Score", "Z1", "Z2", "Z3", "Z4", "Z5")
names(Strava_Table2) <- model_cols
Strava_Table3 <- as.data.frame(lapply(Strava_Table2[model_cols], as.numeric))

# Suffer Score as a weighted sum of time in each zone; `0 +` removes the intercept.
Strava_LM <- lm(Suffer_Score ~ 0 + Z1 + Z2 + Z3 + Z4 + Z5, data = Strava_Table3)
summary(Strava_LM)

## 
## Call:
## lm(formula = Suffer_Score ~ 0 + Z1 + Z2 + Z3 + Z4 + Z5, data = Strava_Table3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1065  -3.9876  -0.2565   1.6473  15.0685 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## Z1 0.0013937  0.0004927   2.829  0.00559 ** 
## Z2 0.0072173  0.0004814  14.992  < 2e-16 ***
## Z3 0.0360673  0.0007313  49.320  < 2e-16 ***
## Z4 0.0560018  0.0012160  46.054  < 2e-16 ***
## Z5 0.0629874  0.0055289  11.392  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.213 on 106 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.9966, Adjusted R-squared:  0.9964 
## F-statistic:  6138 on 5 and 106 DF,  p-value: < 2.2e-16

# Distribution of the residuals of the zone model.
hist(Strava_LM$residuals, breaks = 30)

# Estimated coefficient for zone 1.
Strava_LM$coefficients["Z1"]

##          Z1 
## 0.001393704

# Fitted values (x) against the observed Suffer Score (y). The axis labels now
# match the aesthetics; they were previously swapped.
Strava_LM %>%
  ggplot() +
  geom_point(aes(.fitted, Suffer_Score)) +
  geom_smooth(aes(.fitted, Suffer_Score), method = "lm", se = FALSE, color = "lightgrey") +
  labs(x = "Fitted", y = "Actual") +
  theme_bw()

With a .99 \(R^2\) this is exactly as expected. Suffer Score can be calculated by \[Z_1*.001+Z_2 *.007+Z_3*.036+Z_4*.056+Z_5*.063\] That being said, as each Suffer Score is rounded, there is some variation in our model. It is probable a trim or rounding function in the Strava data is creating some static in our data - I would wager that the coefficient for Zone 3 is actually .033, Zone 4 is .05 and Zone 5 is .066.

Fitness Data Aggregation

The webpage hosting fitness over time is more challenging to scrape than other pages on Strava. The data is only visible when the mouse hovers over the graph, and these metrics are not hosted elsewhere on the site. That being said - we do not need all that many observations to come to a conclusion on the relationship between Suffer Score and Fitness. Let's manually pull 35 observations of fitness from 3 different segments over the last year - in July when I was playing rugby without my Heart Rate monitor, and April when I was running regularly and returning to form.

Of note - when digging into the above, I noticed that all my data from the late fall/early winter was off by one day. Workouts were listed one day off on the Fitness over time page when compared to the date listed on the workout itself. I avoided these dates altogether when modeling to keep things simple.

Manually build DF with observations.

# Attach the scraped workout dates to the per-workout table.
Strava_Table3$Date <- Date_Series

# Days for which Fitness was read off the chart by hand: three runs of
# consecutive days.
Date <- c(
  seq(mdy("7/11/2019"), mdy("7/22/2019"), by = 1),
  mdy(c("7/28/2019", "7/29/2019")),
  seq(mdy("4/3/2020"), mdy("4/23/2020"), by = 1)
)

ending_fit <- c(
  67, 68, 66, 65, 63, 62, 60, 59, 58, 56, 55, 54,
  48, 47,
  33, 43, 42, 41, 41, 40, 39, 38, 48, 47, 46, 45, 43, 48, 46, 45, 44, 43, 42, 41, 40
)

# Starting Fitness is the previous row's ending Fitness (0 for the first row).
starting_fit <- c(0, head(ending_fit, -1))

# The first day of each run has no valid previous day, so drop it.
Fit_table <- data.frame(Date, ending_fit, starting_fit)
segment_starts <- mdy(c("7/11/2019", "7/28/2019", "4/3/2020"))
Fit_table <- Fit_table[!(Fit_table$Date %in% segment_starts), ]

Strava_Table3$Date <- as_date(Strava_Table3$Date)

# Joined on the shared column, which dplyr reports in a message.
Combo_Table <- left_join(Fit_table, Strava_Table3)

## Joining, by = "Date"

# Days with no matching workout come out of the join as NA; treat them as 0.
Combo_Table <- replace(Combo_Table, is.na(Combo_Table), 0)

Linear Model - Suffer Score to Fitness

# Regress each day's ending Fitness on its starting Fitness and that day's
# Suffer Score; the `0 +` term fits the model without an intercept.
Strava_LM2 <- lm(ending_fit ~ 0 + starting_fit + Suffer_Score,  data = Combo_Table)
summary(Strava_LM2)

## 
## Call:
## lm(formula = ending_fit ~ 0 + starting_fit + Suffer_Score, data = Combo_Table)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93494 -0.05920  0.01467  0.17748  0.75998 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## starting_fit 0.9763320  0.0014479   674.3   <2e-16 ***
## Suffer_Score 0.0311109  0.0007701    40.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3991 on 30 degrees of freedom
## Multiple R-squared:  0.9999, Adjusted R-squared:  0.9999 
## F-statistic: 2.513e+05 on 2 and 30 DF,  p-value: < 2.2e-16

# Fitted values (x) against the observed ending Fitness (y). The axis labels
# now match the aesthetics; they were previously swapped.
Strava_LM2 %>%
  ggplot() +
  geom_point(aes(.fitted, ending_fit)) +
  geom_smooth(aes(.fitted, ending_fit), method = "lm", se = FALSE, color = "lightgrey") +
  labs(x = "Fitted", y = "Actual") +
  theme_bw()

# Keep each observation's residual next to it.
Combo_Table$Resid <- residuals(Strava_LM2)

# Residuals by date. The call text is kept as-is because the default axis
# labels are taken from it.
plot(Combo_Table$Date, resid(Strava_LM2))

With a .999 \(R^2\) and residuals between -1 and 1, this model also seems to be dead-on. We can say with some confidence that Fitness is calculated by \[FitnessToday = .976*FitnessYesterday + .03*SufferScore\]

Again, because both Fitness and Suffer Score are rounded, there is some variation here. It is probable that these coefficients are .975 and .03 respectively.

Apply Model

Let's apply our model to all workouts since July 2019, to see if it matches the screenshot above.

# Treat missing values as 0, then total the Suffer Score for each date.
Strava_Table3 <- replace(Strava_Table3, is.na(Strava_Table3), 0)
Strava_Table4 <- Strava_Table3 %>%
  group_by(Date) %>%
  summarise(Suffer_Score = sum(Suffer_Score))

# One row per calendar day in the period, with that day's total attached.
# Joined on the shared column, which dplyr reports in a message.
Strava_Final <- data.frame(Date = seq(mdy("7/8/2019"), mdy("5/25/2020"), by = 1))
Strava_Final <- left_join(Strava_Final, Strava_Table4)

## Joining, by = "Date"

# Days without a workout have no Suffer Score after the join; count them as 0.
Strava_Final[is.na(Strava_Final)] <- 0

# Keep only Date and Suffer Score. The earlier `-c(3:7)` drop referred to
# columns that do not exist after the join.
Strava_Final <- Strava_Final[1:2]
names(Strava_Final) <- c("Date", "Suffer_Score")

# Roll the fitted recurrence forward one day at a time:
#   Fitness[i] = Fitness[i - 1] * b_starting_fit + Suffer_Score[i] * b_Suffer_Score
# The series is built in a plain numeric vector rather than cell by cell in
# the data frame. 72 is the seed for the first day (presumably the Fitness
# shown on the chart for that date - confirm).
fit_coef <- unname(Strava_LM2$coefficients)
n_days <- nrow(Strava_Final)
fitness <- numeric(n_days)
fitness[1] <- 72
# seq_len(n)[-1] is empty when there is only one day; 2:nrow() would count down.
for (i in seq_len(n_days)[-1]) {
  fitness[i] <- fitness[i - 1] * fit_coef[1] +
    Strava_Final$Suffer_Score[i] * fit_coef[2]
}
Strava_Final$Fitness <- fitness


Strava_Final %>%
  ggplot() +
  geom_line(aes(x = Date, y = Fitness))

Solving Strava

Christopher Bloome

5/23/2020