require(ggplot2)
require(knitr)
require(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(table1)
# Read the activity export from the working directory; text columns are kept
# as character vectors rather than factors.
DF <- read.csv(
  "Activities.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
Show the first 6 rows and the first 7 columns, using the kable() function
from the knitr package
to generate a clean table.
# Preview: first six rows of the first seven columns, rendered as a pipe table.
DF[, 1:7] |>
  head() |>
  knitr::kable(format = "pipe")

# Structure overview: one row per column of DF holding its name, its storage
# type (typeof) and its first values joined into a single string.
dat.str.ka <- data.frame(
  variable = names(DF),
  classe = sapply(DF, typeof),
  first_values = sapply(
    DF,
    function(column) paste0(head(column), collapse = ", ")
  ),
  row.names = NULL
)
kable(dat.str.ka, format = "pipe")
variable | classe | first_values |
---|---|---|
Activity.Type | character | Treadmill Running, Strength Training, Walking, Strength Training, Strength Training, Treadmill Running |
Date | character | 2023-07-03 21:10:43, 2023-07-03 18:51:15, 2023-07-02 20:04:16, 2023-07-02 10:08:36, 2023-07-01 16:07:05, 2023-06-29 18:50:17 |
Favorite | character | true, true, true, true, true, true |
Title | character | Treadmill Running, Strength, Quan Binh Thanh Walking, Strength, Strength, Treadmill Running |
Distance | character | 1.04, 0.00, 3.48, 0.00, 0.00, 1.90 |
Calories | character | 70, 107, 247, 191, 84, 95 |
Time | character | 00:16:03, 00:51:58, 01:00:59, 01:12:44, 00:30:39, 00:31:11 |
Avg.HR | integer | 102, 78, 103, 88, 87, 87 |
Max.HR | integer | 138, 105, 136, 123, 111, 126 |
Aerobic.TE | character | 1.6, 0.2, 2.0, 0.4, 0.3, 1.2 |
Avg.Run.Cadence | character | 131, –, 78, –, –, 138 |
Max.Run.Cadence | character | 175, –, 150, –, –, 196 |
Avg.Pace | character | 15:23, –, 17:32, –, –, 16:23 |
Best.Pace | character | 8:40, –, 10:49, –, –, 8:56 |
Total.Ascent | character | –, –, 48, –, –, – |
Total.Descent | character | –, –, 54, –, –, – |
Avg.Stride.Length | double | 0.58, 0, 0.73, 0, 0, 0.53 |
Avg.Vertical.Ratio | double | 8, 0, 0, 0, 0, 5.9 |
Avg.Vertical.Oscillation | double | 4.9, 0, 0, 0, 0, 4.2 |
Avg.Ground.Contact.Time | integer | 293, 0, 0, 0, 0, 292 |
Avg.GCT.Balance | character | 49.6% L / 50.4% R, –, –, –, –, 50.1% L / 49.9% R |
Avg.GAP | character | –, –, 19:16, –, –, – |
Normalized.Power...NP.. | character | 138, –, –, –, –, 132 |
Training.Stress.Score. | double | 0, 0, 0, 0, 0, 0 |
Avg.Power | integer | 95, 0, 0, 0, 0, 89 |
Max.Power | character | 254, 0, 0, 0, 0, 255 |
Grit | double | 0, 0, 0, 0, 0, 0 |
Flow | double | 0, 0, 0, 0, 0, 0 |
Total.Strokes | character | –, –, –, –, –, – |
Avg..Swolf | integer | 0, 0, 0, 0, 0, 0 |
Avg.Stroke.Rate | character | 0, 0, 0, 0, 0, 0 |
Total.Reps | character | 0, 12, 0, 80, 17, 0 |
Total.Sets | character | –, 1, –, 1, 1, – |
Dive.Time | character | 0:00, 0:00, 0:00, 0:00, 0:00, 0:00 |
Min.Temp | double | 29, 31, 27, 32, 31, 29 |
Surface.Interval | character | 0:00, 0:00, 0:00, 0:00, 0:00, 0:00 |
Decompression | character | No, No, No, No, No, No |
Best.Lap.Time | character | 00:59.48, 51:57.65, 10:23.99, 01:12:44.02, 30:39.34, 12:25.84 |
Number.of.Laps | integer | 2, 1, 4, 1, 1, 2 |
Max.Temp | double | 32, 33, 31, 33, 33, 29 |
Avg.Resp | character | –, –, –, –, –, – |
Min.Resp | character | –, –, –, –, –, – |
Max.Resp | character | –, –, –, –, –, – |
Moving.Time | character | 00:13:14, 00:51:58, 00:40:21, 01:12:44, 00:30:39, 00:22:29 |
Elapsed.Time | character | 00:16:03, 00:51:58, 01:00:59, 01:12:44, 00:30:39, 00:31:11 |
Min.Elevation | character | –, –, -41, –, –, – |
Max.Elevation | character | –, –, -28, –, –, – |
This data should be cleaned. First, a copy of the data is made,
`df`
; edits and cleaning are then made on
the new dataset `df`.
# Work on a copy named `df` so the raw import in `DF` stays untouched.
# (The target must be `df`, not `.df`: every later statement reads `df`.)
df <- DF
# Lower-case every column name.
names(df) <- tolower(names(df))
Remove the trailing dots (`..`) from the names of two columns
# Strip the trailing dots from two column names; a name that is not present
# is simply left alone.
names(df)[names(df) == "normalized.power...np.."] <- "normalized.power.np"
names(df)[names(df) == "training.stress.score."] <- "training.stress.score"
Replace the values `0`
and `--`
with
`NA`.
# Blank out every cell equal to 0, then every cell holding the "--"
# placeholder. The comparison follows each column's type: in a character
# column only the exact text "0" matches, so text such as "0.00" is kept.
df <- replace(df, df == 0, NA)
df <- replace(df, df == "--", NA)
# Convert the measurement columns to numeric. Every target name must match an
# existing column exactly: a misspelt name on the left-hand side would add a
# new column instead of converting the existing one.
df <- df |>
  mutate(
    avg.run.cadence = as.numeric(avg.run.cadence),
    max.run.cadence = as.numeric(max.run.cadence),
    distance = as.numeric(distance),
    calories = as.numeric(calories),
    avg.stride.length = as.numeric(avg.stride.length),
    avg.vertical.ratio = as.numeric(avg.vertical.ratio),
    avg.vertical.oscillation = as.numeric(avg.vertical.oscillation),
    avg.ground.contact.time = as.numeric(avg.ground.contact.time)
  )
# Drop the "Other" and "Motorcycling" activity types because they are not
# Garmin watch data. The subset() function from base R is used for this.
df <- subset(df, !activity.type %in% c("Other", "Motorcycling"))

# Look at the average pace of the first six remaining rows.
head(df, 6)[, "avg.pace"]
## [1] "15:23" NA "17:32" NA NA "16:23"
Note: Pace is a time value whose first 2 characters are the minutes and whose last two characters are the seconds.
Extract the first 2 characters to get the minutes
and the last two characters to get the seconds.
# Pace strings look like "15:23" (minutes:seconds), but the minutes part is
# not always two characters wide (e.g. "8:40"). Both helpers therefore split
# at the colon instead of cutting at fixed positions: with fixed positions
# "8:40" yields "8:" for the minutes (NA once converted) and "0" for the
# seconds.

# Function returning the minutes part of `pace`: the text before the colon
# (the first 2 of 5 characters for a pace such as "15:23"). NA stays NA.
first_2cha <- function(x) {
  sub(":.*$", "", x)
}

# Create the `minutes` column of `pace`.
df$pace.mi <- first_2cha(df$avg.pace)

# Function returning the seconds part of `pace`: the text after the colon
# (the last 2 characters). NA stays NA.
last_2cha <- function(x) {
  sub("^.*:", "", x)
}

# Create the `seconds` column of `pace`.
df$pace.se <- last_2cha(df$avg.pace)

# Convert both parts of `pace` to the numeric datatype.
df <- df |>
  mutate(
    pace.mi = as.numeric(pace.mi),
    pace.se = as.numeric(pace.se)
  )

# Pace in minutes, rounded to the nearest whole minute.
df$pace <- round(df$pace.mi + df$pace.se / 60, digits = 0)
library(lubridate)
# Durations in whole minutes: each "hh:mm:ss" string is parsed, turned into
# seconds, rounded to whole seconds, divided by 60 and rounded again.
df <- df |>
  mutate(
    time.dur = round(
      round(period_to_seconds(hms(time)), digits = 0) / 60,
      digits = 0
    ),
    moving.time.dur = round(
      round(period_to_seconds(hms(moving.time)), digits = 0) / 60,
      digits = 0
    ),
    elapsed.time.dur = round(
      round(period_to_seconds(hms(elapsed.time)), digits = 0) / 60,
      digits = 0
    )
  )
# Walking and running sessions titled "Quan Binh Thanh ...", excluding rows
# whose favorite flag is "false" and rows without an average heart rate or a
# pace.
#
# All conditions are combined with `&` in the single `subset` argument: the
# third positional argument of subset() is `select` (column selection), so a
# second condition passed there is silently ignored. Missing values are
# tested with is.na(); comparing against the string "NA" does not test
# missingness.
#
# NOTE(review): summary output rendered with the earlier form of this call
# (which still reports missing pace values) will no longer match.
df.wr <- subset(
  df,
  title %in% c("Quan Binh Thanh Walking", "Quan Binh Thanh Running") &
    favorite != "false" &
    !is.na(avg.hr) &
    !is.na(pace)
)
# Descriptive statistics of the session measurements, split by activity type
# and rendered with the zebra-striped table style.
table1(
  ~ distance + calories + avg.hr + max.hr + avg.run.cadence +
    max.run.cadence + time.dur + moving.time.dur + elapsed.time.dur +
    pace | activity.type,
  data = df.wr,
  topclass = "Rtable1-zebra"
)
| | Running (N=81) | Walking (N=100) | Overall (N=181) |
---|---|---|---|
distance | |||
Mean (SD) | 2.04 (0.987) | 1.26 (0.947) | 1.61 (1.04) |
Median [Min, Max] | 1.82 [0.470, 5.33] | 0.920 [0.170, 4.91] | 1.33 [0.170, 5.33] |
calories | |||
Mean (SD) | 132 (63.5) | 77.5 (54.4) | 102 (64.6) |
Median [Min, Max] | 118 [22.0, 354] | 58.5 [15.0, 310] | 83.0 [15.0, 354] |
avg.hr | |||
Mean (SD) | 119 (15.6) | 98.3 (10.8) | 108 (16.8) |
Median [Min, Max] | 123 [88.0, 155] | 98.0 [67.0, 128] | 104 [67.0, 155] |
max.hr | |||
Mean (SD) | 137 (17.5) | 117 (12.6) | 126 (18.0) |
Median [Min, Max] | 141 [103, 169] | 115 [91.0, 149] | 124 [91.0, 169] |
avg.run.cadence | |||
Mean (SD) | 139 (35.6) | 98.1 (16.3) | 116 (33.5) |
Median [Min, Max] | 161 [71.0, 180] | 100 [27.0, 124] | 106 [27.0, 180] |
max.run.cadence | |||
Mean (SD) | 191 (33.5) | 170 (44.9) | 179 (41.4) |
Median [Min, Max] | 188 [120, 255] | 146 [123, 248] | 184 [120, 255] |
time.dur | |||
Mean (SD) | 25.6 (15.9) | 18.4 (12.8) | 21.6 (14.7) |
Median [Min, Max] | 21.0 [3.00, 81.0] | 14.0 [4.00, 72.0] | 16.0 [3.00, 81.0] |
moving.time.dur | |||
Mean (SD) | 23.4 (13.6) | 15.3 (10.9) | 18.9 (12.8) |
Median [Min, Max] | 20.0 [3.00, 68.0] | 11.0 [2.00, 56.0] | 14.0 [2.00, 68.0] |
elapsed.time.dur | |||
Mean (SD) | 25.9 (16.0) | 18.5 (12.9) | 21.8 (14.7) |
Median [Min, Max] | 22.0 [3.00, 81.0] | 14.5 [4.00, 72.0] | 16.0 [3.00, 81.0] |
pace | |||
Mean (SD) | 13.9 (2.29) | 15.1 (2.92) | 14.7 (2.78) |
Median [Min, Max] | 14.0 [10.0, 20.0] | 15.0 [10.0, 26.0] | 15.0 [10.0, 26.0] |
Missing | 31 (38.3%) | 1 (1.0%) | 32 (17.7%) |
# Density of session distance, one semi-transparent curve per activity type.
ggplot(data = df.wr, mapping = aes(x = distance, fill = activity.type)) +
  geom_density(alpha = 0.4)

# Distance against average heart rate, coloured by activity type, with a
# smoothed trend per group.
ggplot(
  data = df.wr,
  mapping = aes(x = avg.hr, y = distance, colour = activity.type)
) +
  geom_point() +
  geom_smooth()
- type of running and pace
# Pace against average heart rate, coloured by activity type, with a smoothed
# trend per group.
ggplot(
  data = df.wr,
  mapping = aes(x = avg.hr, y = pace, colour = activity.type)
) +
  geom_point() +
  geom_smooth()