Loading library.

require(ggplot2)
require(knitr)
require(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(table1)

Import data:

# Load the raw activity export; text columns are kept as character vectors.
DF <- read.csv(
  file = "Activities.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)

View first 6 rows of the dataset

Show the first 6 rows and first 7 columns, using the kable() function from the knitr package to generate a clean table.

# Render the first six rows of the first seven columns as a pipe table.
knitr::kable(head(DF[, 1:7]), format = "pipe")

View structure

# Print a compact summary of every column's type and first values.
str(DF)
## 'data.frame':    540 obs. of  47 variables:
##  $ Activity.Type           : chr  "Walking" "Running" "Walking" "Walking" ...
##  $ Date                    : chr  "2023-08-24 20:46:05" "2023-08-24 20:27:47" "2023-08-24 20:14:06" "2023-08-23 21:07:01" ...
##  $ Favorite                : chr  "true" "true" "true" "true" ...
##  $ Title                   : chr  "Quan Binh Thanh Walking" "Quan Binh Thanh Running" "Quan Binh Thanh Walking" "Quan Binh Thanh Walking" ...
##  $ Distance                : chr  "0.47" "2.00" "0.78" "0.43" ...
##  $ Calories                : chr  "41" "130" "48" "33" ...
##  $ Time                    : chr  "00:09:38.6" "00:17:42" "00:13:07" "00:07:39.6" ...
##  $ Avg.HR                  : int  108 135 87 103 130 98 93 112 112 136 ...
##  $ Max.HR                  : int  121 159 102 111 148 114 121 127 136 150 ...
##  $ Aerobic.TE              : chr  "0.4" "2.6" "0.4" "0.4" ...
##  $ Avg.Cadence             : chr  "65" "168" "88" "84" ...
##  $ Max.Cadence             : chr  "133" "202" "154" "123" ...
##  $ Avg.Pace                : chr  "20:32" "8:50" "16:52" "17:39" ...
##  $ Best.Pace               : chr  "10:16" "3:51" "8:33" "11:32" ...
##  $ Total.Ascent            : chr  "1" "3" "3" "3" ...
##  $ Total.Descent           : chr  "7" "9" "3" "4" ...
##  $ Avg.Stride.Length       : num  0.75 0.68 0.67 0.67 0.61 0.72 0 0.73 0.62 0.65 ...
##  $ Avg.Vertical.Ratio      : num  0 8.2 0 0 8.8 0 0 8.3 0 9 ...
##  $ Avg.Vertical.Oscillation: num  0 5.6 0 0 5.4 0 0 6.2 0 5.8 ...
##  $ Avg.Ground.Contact.Time : int  0 288 0 0 300 0 0 348 0 283 ...
##  $ Avg.GCT.Balance         : chr  "--" "47.9% L / 52.1% R" "--" "--" ...
##  $ Avg.GAP                 : chr  "22:57" "8:42" "16:47" "18:55" ...
##  $ Normalized.Power...NP.. : chr  "--" "188" "--" "--" ...
##  $ Training.Stress.Score.  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Avg.Power               : int  0 168 0 0 154 0 0 158 0 163 ...
##  $ Max.Power               : chr  "0" "387" "0" "0" ...
##  $ Grit                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Flow                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Total.Strokes           : chr  "--" "--" "--" "--" ...
##  $ Avg..Swolf              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Avg.Stroke.Rate         : chr  "0" "0" "0" "0" ...
##  $ Total.Reps              : chr  "0" "0" "0" "0" ...
##  $ Total.Sets              : chr  "--" "--" "--" "--" ...
##  $ Dive.Time               : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ Min.Temp                : num  32 32 32 32 32 31 0 29 30 30 ...
##  $ Surface.Interval        : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ Decompression           : chr  "No" "No" "No" "No" ...
##  $ Best.Lap.Time           : chr  "09:38.59" "00:02.24" "13:06.85" "07:39.62" ...
##  $ Number.of.Laps          : int  1 3 1 1 3 1 1 2 1 2 ...
##  $ Max.Temp                : num  33 32 32 33 32 32 0 30 30 31 ...
##  $ Avg.Resp                : chr  "24" "33" "18" "22" ...
##  $ Min.Resp                : chr  "21" "21" "14" "18" ...
##  $ Max.Resp                : chr  "28" "40" "23" "27" ...
##  $ Moving.Time             : chr  "00:06:12" "00:16:53" "00:10:27" "00:06:15" ...
##  $ Elapsed.Time            : chr  "00:09:38.6" "00:17:42" "00:13:07" "00:07:39.6" ...
##  $ Min.Elevation           : chr  "-1" "7" "8" "8" ...
##  $ Max.Elevation           : chr  "8" "15" "12" "12" ...

This data should be cleaned; first, a copy of the data is made.

Assign a new dataset called df, then make the edits and cleaning on the new dataset.

  • A new copy of DF data called df.
# Work on a copy so that the raw import in DF is left unmodified.
df <- DF
  • Change to lower characters.
# Lower-case every column name so later code can refer to them uniformly.
colnames(df) <- tolower(colnames(df))

Remove the last characters with (..) of two columns

# Strip the trailing dots left by read.csv() name mangling on two columns.
names(df)[names(df) == "normalized.power...np.."] <- "normalized.power.np"
names(df)[names(df) == "training.stress.score."] <- "training.stress.score"

Handle some missing values

Replaced value 0 and -- with NA

# Treat both placeholder values, 0 and "--", as missing in a single pass.
df[df == 0 | df == "--"] <- NA

Extract the year and month using the `strptime()` function

# Parse the timestamp text into POSIXct. strptime() returns POSIXlt, a
# list-based class that is awkward as a data frame column; POSIXct is the
# recommended class for that use.
# NOTE(review): the timestamps carry no zone, so they are parsed as UTC to
# keep the result independent of the local zone's DST rules — confirm.
df$date1 <- as.POSIXct(df$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
Create a year.month column
# Year-month label ("YYYY-MM") of each activity; format() already yields text.
df[["year.month"]] <- format(df[["date1"]], format = "%Y-%m")

head first year.month

# Peek at the first six year-month labels.
head(df[["year.month"]], n = 6L)
## [1] "2023-08" "2023-08" "2023-08" "2023-08" "2023-08" "2023-08"

Hence, we have created a variable holding the month of the year.

Check the type of the `year.month` variable.

# Report the storage type of the year-month column.
typeof(df[["year.month"]])
## [1] "character"

We need to convert it from “character” to factor.

# Turn the year-month labels into a factor and list its (sorted) levels.
df[["year.month"]] <- factor(df[["year.month"]])
levels(df[["year.month"]])
## [1] "2023-03" "2023-04" "2023-05" "2023-06" "2023-07" "2023-08"

Convert some numeric characters

# Coerce the measurement columns to numeric. Text that does not parse as a
# number becomes NA (with a warning from as.numeric()).
# The target name `avg.vertical.oscillation` is spelled out in full so the
# existing column is converted in place rather than a misspelled duplicate
# column being created.
df <- df |>
  mutate(
    avg.cadence = as.numeric(avg.cadence),
    max.cadence = as.numeric(max.cadence),
    distance = as.numeric(distance),
    calories = as.numeric(calories),
    avg.stride.length = as.numeric(avg.stride.length),
    avg.vertical.ratio = as.numeric(avg.vertical.ratio),
    avg.vertical.oscillation = as.numeric(avg.vertical.oscillation),
    avg.ground.contact.time = as.numeric(avg.ground.contact.time)
  )

Manage pace

  • First view 6 first values of pace
# First six average-pace strings.
head(df[["avg.pace"]], n = 6L)
## [1] "20:32" "8:50"  "16:52" "17:39" "9:34"  "13:50"
  • Note: Pace is a time value in m:ss or mm:ss form: one or two digits of minutes, a colon, then two digits of seconds (e.g. "8:50", "20:32").

  • Extract the first 2 characters to make them minutes and last two characters to make seconds.

  • Creating a function that gets the first 2 of 5 characters of pace

# Return the minutes component of a pace string ("m:ss" or "mm:ss").
#
# Taking a fixed two-character prefix breaks on single-digit minutes
# ("8:50" -> "8:", which is not a number), so the text before the first
# colon is used instead. Assumes there is no hours field.
#
# x: character vector of pace strings; NA is allowed.
# Returns a character vector the same length as x, NA where the input is
# NA or contains no colon.
first_2cha <- function(x) {
  x <- as.character(x)
  has_colon <- grepl(":", x, fixed = TRUE)
  minutes <- rep(NA_character_, length(x))
  minutes[has_colon] <- sub(":.*$", "", x[has_colon])
  minutes
}
  • Applying function to minutes column of pace.
# Minutes component of each pace string. vapply() pins the result to one
# character value per element, unlike sapply(), whose return type depends on
# the input (e.g. an empty column yields a list).
df$pace.mi <- vapply(df$avg.pace, first_2cha, character(1), USE.NAMES = FALSE)
  • Creating a function that gets the last 2 of 5 characters of pace.
# Return the seconds component of a pace string ("m:ss" or "mm:ss").
#
# Fixed character positions 4-5 only work for five-character strings; for
# "8:50" they yield "0" instead of "50". The text after the last colon is
# used instead.
#
# x: character vector of pace strings; NA is allowed.
# Returns a character vector the same length as x, NA where the input is
# NA or contains no colon.
last_2cha <- function(x) {
  x <- as.character(x)
  has_colon <- grepl(":", x, fixed = TRUE)
  seconds <- rep(NA_character_, length(x))
  seconds[has_colon] <- sub("^.*:", "", x[has_colon])
  seconds
}
  • Applying function to seconds of pace.
# Seconds component of each pace string. vapply() pins the result to one
# character value per element, unlike sapply(), whose return type depends on
# the input (e.g. an empty column yields a list).
df$pace.se <- vapply(df$avg.pace, last_2cha, character(1), USE.NAMES = FALSE)
  • Convert pace to numeric datatype.
# Convert the extracted minute and second strings to numbers.
df <- mutate(
  df,
  pace.mi = as.numeric(pace.mi),
  pace.se = as.numeric(pace.se)
)
  • create column named pace (minutes)
# Pace in minutes (minutes + seconds / 60), rounded to the nearest whole
# minute. The argument is spelled `digits` in full rather than relying on
# partial argument matching.
df$pace <- round(df$pace.mi + df$pace.se / 60, digits = 0)

Time processing

library(lubridate)
# Convert "HH:MM:SS" duration strings (fractional seconds allowed) to whole
# minutes. Rounding happens once, on the minute value: rounding the seconds
# first and then the minutes can push a value across the half-minute
# boundary (449.6 s -> 450 s -> 7.5 min -> 8, although 7.49 min rounds to 7).
to_whole_minutes <- function(x) {
  round(period_to_seconds(hms(x)) / 60, digits = 0)
}

df$time.dur <- to_whole_minutes(df$time)

df$moving.time.dur <- to_whole_minutes(df$moving.time)

df$elapsed.time.dur <- to_whole_minutes(df$elapsed.time)

Analysing walking and running

Remove the 'Other' and 'Motorcycling' activity types because they are not Garmin watch data.

  • subset function can be used in base R.
# Keep every row whose activity type is neither "Other" nor "Motorcycling".
df.wr <- df[!(df$activity.type %in% c("Other", "Motorcycling")), ]
  • Select only walking and running
# Keep favourite walking/running activities that have a heart rate and pace.
# - All conditions are combined with `&` in the `subset` argument. A second
#   positional argument to subset() is `select` (column selection), so a
#   condition passed there does not filter any rows.
# - Missing values are tested with is.na(); comparing against the string
#   "NA" only drops them by accident of NA propagation.
# - Starts from df.wr so the activity-type exclusion applied just before
#   this step is kept instead of being overwritten.
df.wr <- subset(
  df.wr,
  title %in% c("Quan Binh Thanh Walking", "Quan Binh Thanh Running") &
    favorite != "false" &
    !is.na(avg.hr) &
    !is.na(pace)
)

Describe variable

Using the `table1` package to generate a clean table:

# Descriptive statistics of the key measures, split by activity type.
table1(
  ~ distance + calories + avg.hr + max.hr + avg.cadence + max.cadence +
    time.dur + moving.time.dur + elapsed.time.dur + pace | activity.type,
  data = df.wr,
  topclass = "Rtable1-zebra"
)
Running
(N=35)
Walking
(N=144)
Overall
(N=179)
distance
Mean (SD) 2.03 (0.978) 1.17 (0.861) 1.34 (0.946)
Median [Min, Max] 1.72 [0.520, 5.29] 0.910 [0.220, 4.91] 0.990 [0.220, 5.29]
calories
Mean (SD) 132 (61.2) 72.9 (49.3) 84.5 (56.8)
Median [Min, Max] 114 [36.0, 326] 57.5 [15.0, 310] 62.0 [15.0, 326]
avg.hr
Mean (SD) 118 (14.3) 98.2 (9.95) 102 (13.4)
Median [Min, Max] 120 [88.0, 142] 98.0 [67.0, 128] 101 [67.0, 142]
max.hr
Mean (SD) 137 (17.0) 117 (14.9) 121 (17.1)
Median [Min, Max] 138 [107, 161] 115 [91.0, 230] 117 [91.0, 230]
avg.cadence
Mean (SD) 138 (33.0) 93.5 (18.6) 102 (28.3)
Median [Min, Max] 158 [73.0, 175] 98.0 [30.0, 124] 100 [30.0, 175]
max.cadence
Mean (SD) 187 (35.2) 170 (45.7) 173 (44.2)
Median [Min, Max] 185 [120, 246] 146 [120, 250] 154 [120, 250]
time.dur
Mean (SD) 25.9 (14.5) 17.7 (12.1) 19.3 (13.0)
Median [Min, Max] 22.0 [7.00, 81.0] 14.0 [4.00, 72.0] 15.0 [4.00, 81.0]
moving.time.dur
Mean (SD) 24.1 (12.7) 14.1 (9.79) 16.1 (11.1)
Median [Min, Max] 21.0 [7.00, 68.0] 11.0 [3.00, 56.0] 12.0 [3.00, 68.0]
elapsed.time.dur
Mean (SD) 26.2 (14.5) 17.9 (12.0) 19.5 (12.9)
Median [Min, Max] 22.0 [7.00, 81.0] 14.0 [4.00, 72.0] 15.0 [4.00, 81.0]
pace
Mean (SD) 12.7 (2.04) 15.9 (3.70) 15.3 (3.67)
Median [Min, Max] 12.0 [10.0, 17.0] 15.0 [10.0, 27.0] 15.0 [10.0, 27.0]

Data visualization

  • Density plot
# Overlaid distance densities, one translucent fill per activity type.
ggplot(df.wr, aes(x = distance, fill = activity.type)) +
  geom_density(alpha = 0.4)

  • Average heart rate and energy (the plot below puts distance on the y-axis)
# Distance against average heart rate, with a smoother per activity type.
ggplot(df.wr, aes(x = avg.hr, y = distance, colour = activity.type)) +
  geom_point() +
  geom_smooth()

- type of running and pace

# Pace against average heart rate, with a smoother per activity type.
ggplot(df.wr, aes(x = avg.hr, y = pace, colour = activity.type)) +
  geom_point() +
  geom_smooth()

distance vs month dates

# Distribution of distance for each year-month.
ggplot(df.wr, aes(x = year.month, y = distance)) +
  geom_boxplot()