Load the required libraries.

# Attach every package with library(): unlike require(), it stops with an
# error when a package is missing instead of returning FALSE and letting the
# script fail later at the first call into that package.
library(ggplot2)
library(knitr)
library(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(table1)

Import data:

# Read the activity export; text columns stay character (no factors).
DF <- read.csv(
  file = "Activities.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)

View first 6 rows of the dataset

Show the first 6 rows and first 7 columns, using the kable() function from the knitr package to generate a clean table.

# Preview the first 6 rows and the first 7 columns as a pipe-style table.
# (The previous index, DF[10:11, 1:8], showed rows 10-11 and 8 columns.)
knitr::kable(head(DF[, 1:7], n = 6), format = "pipe")

View structure

# Compact overview: one line per column with its type and first values.
str(object = DF)
## 'data.frame':    520 obs. of  47 variables:
##  $ Activity.Type           : chr  "Other" "Strength Training" "Other" "Other" ...
##  $ Date                    : chr  "2023-08-02 06:32:35" "2023-08-01 18:45:42" "2023-07-31 16:04:49" "2023-07-31 06:18:56" ...
##  $ Favorite                : chr  "false" "true" "false" "false" ...
##  $ Title                   : chr  "Quan Binh Thanh Track Me" "Strength" "Quan Thu Duc Track Me" "Quan Binh Thanh Track Me" ...
##  $ Distance                : chr  "6.99" "0.00" "17.28" "17.27" ...
##  $ Calories                : chr  "0" "135" "0" "0" ...
##  $ Time                    : chr  "00:26:32" "00:43:23" "00:54:59" "00:51:28" ...
##  $ Avg.HR                  : int  0 92 0 0 120 94 0 0 102 130 ...
##  $ Max.HR                  : int  0 138 0 0 151 146 0 0 117 141 ...
##  $ Aerobic.TE              : chr  "--" "0.8" "--" "--" ...
##  $ Avg.Bike.Cadence        : chr  "--" "--" "--" "--" ...
##  $ Max.Bike.Cadence        : chr  "--" "--" "--" "--" ...
##  $ Avg.Speed               : chr  "15.8" "--" "18.9" "20.1" ...
##  $ Max.Speed               : chr  "46.2" "--" "63.0" "61.0" ...
##  $ Total.Ascent            : chr  "366" "--" "141" "146" ...
##  $ Total.Descent           : chr  "120" "--" "225" "128" ...
##  $ Avg.Stride.Length       : num  0 0 0 0 0.7 0 0 0 0.69 0.71 ...
##  $ Avg.Vertical.Ratio      : num  0 0 0 0 7 0 0 0 0 8.1 ...
##  $ Avg.Vertical.Oscillation: num  0 0 0 0 5.1 0 0 0 0 5.9 ...
##  $ Avg.Ground.Contact.Time : int  0 0 0 0 318 0 0 0 0 280 ...
##  $ Avg.GCT.Balance         : chr  "--" "--" "--" "--" ...
##  $ Avg.GAP                 : chr  "--" "--" "--" "--" ...
##  $ Normalized.Power...NP.. : chr  "--" "--" "--" "--" ...
##  $ Training.Stress.Score.  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Avg.Power               : int  0 0 0 0 113 0 0 0 0 171 ...
##  $ Max.Power               : chr  "0" "0" "0" "0" ...
##  $ Grit                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Flow                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Total.Strokes           : chr  "--" "--" "--" "--" ...
##  $ Avg..Swolf              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Avg.Stroke.Rate         : chr  "0" "0" "0" "0" ...
##  $ Total.Reps              : chr  "0" "36" "0" "0" ...
##  $ Total.Sets              : chr  "--" "1" "--" "--" ...
##  $ Dive.Time               : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ Min.Temp                : num  0 31 0 0 29 31 0 0 30 29 ...
##  $ Surface.Interval        : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ Decompression           : chr  "No" "No" "No" "No" ...
##  $ Best.Lap.Time           : chr  "26:31.67" "43:22.57" "54:58.74" "51:27.99" ...
##  $ Number.of.Laps          : int  1 1 1 1 3 1 1 1 1 2 ...
##  $ Max.Temp                : num  0 33 0 0 32 32 0 0 31 30 ...
##  $ Avg.Resp                : chr  "--" "--" "--" "--" ...
##  $ Min.Resp                : chr  "--" "--" "--" "--" ...
##  $ Max.Resp                : chr  "--" "--" "--" "--" ...
##  $ Moving.Time             : chr  "00:24:35" "00:43:23" "00:46:02" "00:45:32" ...
##  $ Elapsed.Time            : chr  "00:26:32" "00:43:23" "00:54:59" "00:51:28" ...
##  $ Min.Elevation           : chr  "56" "--" "-3" "-1" ...
##  $ Max.Elevation           : chr  "414" "--" "175" "49" ...

This data needs cleaning. We first make a copy of the dataset and do the data wrangling on the copy.

Assign the data to a new dataset called df, then edit and clean the new dataset.

  • A new copy of DF data called df.
# Work on `df` from here on so the raw import `DF` stays untouched.
df <- DF
  • Change to lower characters.
# Lower-case every column name.
colnames(df) <- tolower(colnames(df))

Remove the trailing dots (..) from the names of two columns

# Strip the trailing dots left in two column names by read.csv's name mangling.
names(df)[names(df) == "normalized.power...np.."] <- "normalized.power.np"
names(df)[names(df) == "training.stress.score."] <- "training.stress.score"

View names of variables

# List the cleaned column names.
names(df)
##  [1] "activity.type"            "date"                    
##  [3] "favorite"                 "title"                   
##  [5] "distance"                 "calories"                
##  [7] "time"                     "avg.hr"                  
##  [9] "max.hr"                   "aerobic.te"              
## [11] "avg.bike.cadence"         "max.bike.cadence"        
## [13] "avg.speed"                "max.speed"               
## [15] "total.ascent"             "total.descent"           
## [17] "avg.stride.length"        "avg.vertical.ratio"      
## [19] "avg.vertical.oscillation" "avg.ground.contact.time" 
## [21] "avg.gct.balance"          "avg.gap"                 
## [23] "normalized.power.np"      "training.stress.score"   
## [25] "avg.power"                "max.power"               
## [27] "grit"                     "flow"                    
## [29] "total.strokes"            "avg..swolf"              
## [31] "avg.stroke.rate"          "total.reps"              
## [33] "total.sets"               "dive.time"               
## [35] "min.temp"                 "surface.interval"        
## [37] "decompression"            "best.lap.time"           
## [39] "number.of.laps"           "max.temp"                
## [41] "avg.resp"                 "min.resp"                
## [43] "max.resp"                 "moving.time"             
## [45] "elapsed.time"             "min.elevation"           
## [47] "max.elevation"

Handle some missing values

Replace the values 0 and "--" with NA

# Blank out the missing-value markers column by column.
# - Character columns: "--" is the export's placeholder, and zeros are matched
#   by pattern ("0", "0.0", "0.00", ...). A plain `== 0` on a character column
#   is a string comparison with "0", which leaves values such as "0.00" behind.
# - All other columns: an exact 0 is treated as "not recorded".
# which() drops NA comparisons, so cells that are already NA are left alone.
df[] <- lapply(df, function(column) {
  if (is.character(column)) {
    is_placeholder <- column == "--" | grepl("^0+(\\.0+)?$", column)
  } else {
    is_placeholder <- column == 0
  }
  column[which(is_placeholder)] <- NA
  column
})

Extract the year and month using the `strptime()` function

# Parse the timestamp text, then store it as POSIXct. strptime() alone returns
# POSIXlt (a list-based class), which is awkward as a data-frame column;
# POSIXct is a plain numeric vector and formats identically.
df$date1 <- as.POSIXct(strptime(df$date, format = "%Y-%m-%d %H:%M:%S"))
Create a year.month column
# format() already yields a character vector such as "2023-08".
df[["year.month"]] <- format(df[["date1"]], format = "%Y-%m")

View the first values of year.month

# First six year-month labels.
head(df[["year.month"]], n = 6L)
## [1] "2023-08" "2023-08" "2023-07" "2023-07" "2023-07" "2023-07"

Hence, we have created a variable with month of the years

Check the type of the `year.month` variable.

# Storage type of the year-month column.
typeof(df[["year.month"]])
## [1] "character"

We need to convert it from “character” to factor.

# Store year.month as a factor, then list its levels.
df[["year.month"]] <- factor(df[["year.month"]])
levels(df[["year.month"]])
##  [1] "2022-10" "2022-11" "2023-01" "2023-02" "2023-03" "2023-04" "2023-05"
##  [8] "2023-06" "2023-07" "2023-08"

Convert some character columns to numeric

# Coerce the measurement columns below to numeric. Each target name must match
# an existing column exactly: a misspelt name on the left-hand side (previously
# `avg.vertical.oscillatio`) makes mutate() add a new column instead of
# converting the intended one.
df <- df |>
  mutate(
    avg.bike.cadence = as.numeric(avg.bike.cadence),
    max.bike.cadence = as.numeric(max.bike.cadence),
    distance = as.numeric(distance),
    calories = as.numeric(calories),
    avg.resp = as.numeric(avg.resp),
    avg.stride.length = as.numeric(avg.stride.length),
    avg.vertical.ratio = as.numeric(avg.vertical.ratio),
    avg.vertical.oscillation = as.numeric(avg.vertical.oscillation),
    avg.ground.contact.time = as.numeric(avg.ground.contact.time)
  )

Manage pace

  • First view 6 first values of pace
# First six raw avg.speed entries (a mix of speeds and mm:ss paces).
head(df[["avg.speed"]], n = 6)
## [1] "15.8"  NA      "18.9"  "20.1"  "10:46" NA
  • Note: Pace is a time value in `mm:ss` form: the characters before the colon are the minutes and the last two characters are the seconds.

  • Extract the first 2 characters to make them minutes and last two characters to make seconds.

  • Create a function that gets the first 2 of the 5 characters of pace

# Minutes part of a pace string.
#
# x: character vector of paces such as "10:46" or "9:30".
# Returns the first two characters of each element, minus a trailing ":".
# A single-digit-minute pace like "9:30" therefore yields "9" rather than
# "9:", which as.numeric() would turn into NA. NA inputs stay NA.
first_2cha <- function(x) {
  sub(":$", "", substr(x, 1, 2))
}
  • Applying function to minutes column of pace.
# first_2cha() is vectorised, so call it on the whole column; sapply() would
# also attach the input strings as element names.
df$speed.mi <- first_2cha(df$avg.speed)
  • Create a function that gets the last 2 of the 5 characters of pace.
# Seconds part of a pace string.
#
# x: character vector of paces such as "10:46" or "9:30".
# Returns the last two characters of each element. Counting from the end
# (instead of fixed positions 4-5) keeps "9:30" -> "30"; fixed positions would
# give "0". NA inputs stay NA.
last_2cha <- function(x) {
  n_chars <- nchar(x)
  substr(x, n_chars - 1, n_chars)
}
  • Applying function to seconds of pace.
# last_2cha() is vectorised, so call it on the whole column; sapply() would
# also attach the input strings as element names.
df$speed.se <- last_2cha(df$avg.speed)
  • Convert pace to numeric datatype.
# Turn the extracted minute and second strings into numbers.
df <- mutate(
  df,
  speed.mi = as.numeric(speed.mi),
  speed.se = as.numeric(speed.se)
)
  • create column named pace (minutes)
# Pace in minutes (minutes + seconds / 60), rounded to a whole number.
df[["speed"]] <- round(df[["speed.mi"]] + df[["speed.se"]] / 60, digits = 0)

Time processing

library(lubridate)
# Convert each clock-style column to whole minutes: parse with hms(), turn the
# period into seconds, round to whole seconds, then round seconds / 60.
df[c("time.dur", "moving.time.dur", "elapsed.time.dur")] <- lapply(
  df[c("time", "moving.time", "elapsed.time")],
  function(clock) {
    whole_seconds <- round(period_to_seconds(hms(clock)), digits = 0)
    round(whole_seconds / 60, digits = 0)
  }
)

Analysing walking and running

Remove the "Other" and "Motorcycling" activity types because they are not Garmin watch data.

  • subset function can be used in base R.
# Keep every row whose activity type is neither "Other" nor "Motorcycling".
df.wr <- df[!(df$activity.type %in% c("Other", "Motorcycling")), ]
  • Select only walking and running
# Keep only the walking and running sessions that have both a heart rate and
# a pace.
# - Start from df.wr so the activity-type filter applied just before is not
#   discarded.
# - The former third argument (`!favorite == 'false'`) was taken by subset()
#   as its `select` (column) argument, evaluated to TRUE, and filtered
#   nothing; it is dropped so the selection matches what was actually run.
# - Missing values are tested with is.na() instead of comparison to the
#   string "NA".
df.wr <- subset(
  df.wr,
  title %in% c("Quan Binh Thanh Walking", "Quan Binh Thanh Running") &
    !is.na(avg.hr) &
    !is.na(speed)
)

Describe variable

Use the `table1` package to generate a clean table:

# Summary table of distance, calories, heart rate and respiration rate,
# split by activity type.
table1(
  ~ distance + calories + avg.hr + avg.resp | activity.type,
  data = df.wr,
  topclass = "Rtable1-zebra"
)
Running
(N=51)
Walking
(N=114)
Overall
(N=165)
distance
Mean (SD) 2.31 (1.04) 1.24 (0.929) 1.57 (1.08)
Median [Min, Max] 2.27 [0.520, 5.33] 0.920 [0.220, 4.91] 1.13 [0.220, 5.33]
calories
Mean (SD) 150 (65.3) 76.9 (53.2) 99.6 (66.4)
Median [Min, Max] 144 [36.0, 354] 59.0 [15.0, 310] 74.0 [15.0, 354]
avg.hr
Mean (SD) 112 (14.1) 97.8 (10.6) 102 (13.5)
Median [Min, Max] 106 [88.0, 142] 96.5 [67.0, 128] 101 [67.0, 142]
avg.resp
Mean (SD) 29.0 (3.46) 21.4 (2.68) 23.0 (4.25)
Median [Min, Max] 30.0 [23.0, 32.0] 21.0 [17.0, 27.0] 23.0 [17.0, 32.0]
Missing 46 (90.2%) 96 (84.2%) 142 (86.1%)

Data visualization

  • Density plot
# Overlaid, semi-transparent distance densities, one per activity type.
ggplot(data = df.wr, mapping = aes(x = distance, fill = activity.type)) +
  geom_density(alpha = 0.4)

  • Average heart rate and distance
# Distance against average heart rate, coloured by activity type, with a
# smoothed trend per group.
ggplot(
  data = df.wr,
  mapping = aes(x = avg.hr, y = distance, colour = activity.type)
) +
  geom_point() +
  geom_smooth()

- type of running and pace

# Pace against average heart rate, coloured by activity type, with a
# smoothed trend per group.
ggplot(
  data = df.wr,
  mapping = aes(x = avg.hr, y = speed, colour = activity.type)
) +
  geom_point() +
  geom_smooth()

distance vs month dates

# One box of session distances per year-month.
ggplot(data = df.wr, mapping = aes(x = year.month, y = distance)) +
  geom_boxplot()

### relationship between heart rate and respiratory rate:

# Pearson correlation test between average heart rate and respiration rate.
cor.test(x = df.wr$avg.hr, y = df.wr$avg.resp)
## 
##  Pearson's product-moment correlation
## 
## data:  df.wr$avg.hr and df.wr$avg.resp
## t = 11.567, df = 21, p-value = 1.429e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8390340 0.9701255
## sample estimates:
##       cor 
## 0.9296985

Relationship of heart rate and respiratory rate during activity.

# Fit respiration rate on heart rate, then draw the scatter plot with the
# fitted line on top.
reg <- lm(avg.resp ~ avg.hr, data = df.wr)
plot(
  df.wr$avg.hr,
  df.wr$avg.resp,
  xlim = c(60, 150)
)
abline(reg)