require(ggplot2)
require(knitr)
require(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(table1)
# Load the exported activity log; text columns are kept as plain character
# vectors rather than factors.
DF <- read.csv(
  file = "Activities.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
Show the first 6 rows and the first 7 columns, using the kable() function
from the knitr package to generate a clean table.
# Render the first six rows of the first seven columns as a pipe table.
DF[, 1:7] |>
  head() |>
  knitr::kable(format = "pipe")

# Print the type and a few sample values of every column.
str(DF)
## 'data.frame': 540 obs. of 47 variables:
## $ Activity.Type : chr "Walking" "Running" "Walking" "Walking" ...
## $ Date : chr "2023-08-24 20:46:05" "2023-08-24 20:27:47" "2023-08-24 20:14:06" "2023-08-23 21:07:01" ...
## $ Favorite : chr "true" "true" "true" "true" ...
## $ Title : chr "Quan Binh Thanh Walking" "Quan Binh Thanh Running" "Quan Binh Thanh Walking" "Quan Binh Thanh Walking" ...
## $ Distance : chr "0.47" "2.00" "0.78" "0.43" ...
## $ Calories : chr "41" "130" "48" "33" ...
## $ Time : chr "00:09:38.6" "00:17:42" "00:13:07" "00:07:39.6" ...
## $ Avg.HR : int 108 135 87 103 130 98 93 112 112 136 ...
## $ Max.HR : int 121 159 102 111 148 114 121 127 136 150 ...
## $ Aerobic.TE : chr "0.4" "2.6" "0.4" "0.4" ...
## $ Avg.Cadence : chr "65" "168" "88" "84" ...
## $ Max.Cadence : chr "133" "202" "154" "123" ...
## $ Avg.Pace : chr "20:32" "8:50" "16:52" "17:39" ...
## $ Best.Pace : chr "10:16" "3:51" "8:33" "11:32" ...
## $ Total.Ascent : chr "1" "3" "3" "3" ...
## $ Total.Descent : chr "7" "9" "3" "4" ...
## $ Avg.Stride.Length : num 0.75 0.68 0.67 0.67 0.61 0.72 0 0.73 0.62 0.65 ...
## $ Avg.Vertical.Ratio : num 0 8.2 0 0 8.8 0 0 8.3 0 9 ...
## $ Avg.Vertical.Oscillation: num 0 5.6 0 0 5.4 0 0 6.2 0 5.8 ...
## $ Avg.Ground.Contact.Time : int 0 288 0 0 300 0 0 348 0 283 ...
## $ Avg.GCT.Balance : chr "--" "47.9% L / 52.1% R" "--" "--" ...
## $ Avg.GAP : chr "22:57" "8:42" "16:47" "18:55" ...
## $ Normalized.Power...NP.. : chr "--" "188" "--" "--" ...
## $ Training.Stress.Score. : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Avg.Power : int 0 168 0 0 154 0 0 158 0 163 ...
## $ Max.Power : chr "0" "387" "0" "0" ...
## $ Grit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Flow : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Total.Strokes : chr "--" "--" "--" "--" ...
## $ Avg..Swolf : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Avg.Stroke.Rate : chr "0" "0" "0" "0" ...
## $ Total.Reps : chr "0" "0" "0" "0" ...
## $ Total.Sets : chr "--" "--" "--" "--" ...
## $ Dive.Time : chr "0:00" "0:00" "0:00" "0:00" ...
## $ Min.Temp : num 32 32 32 32 32 31 0 29 30 30 ...
## $ Surface.Interval : chr "0:00" "0:00" "0:00" "0:00" ...
## $ Decompression : chr "No" "No" "No" "No" ...
## $ Best.Lap.Time : chr "09:38.59" "00:02.24" "13:06.85" "07:39.62" ...
## $ Number.of.Laps : int 1 3 1 1 3 1 1 2 1 2 ...
## $ Max.Temp : num 33 32 32 33 32 32 0 30 30 31 ...
## $ Avg.Resp : chr "24" "33" "18" "22" ...
## $ Min.Resp : chr "21" "21" "14" "18" ...
## $ Max.Resp : chr "28" "40" "23" "27" ...
## $ Moving.Time : chr "00:06:12" "00:16:53" "00:10:27" "00:06:15" ...
## $ Elapsed.Time : chr "00:09:38.6" "00:17:42" "00:13:07" "00:07:39.6" ...
## $ Min.Elevation : chr "-1" "7" "8" "8" ...
## $ Max.Elevation : chr "8" "15" "12" "12" ...
This data should be cleaned. First, a copy of the data is made,
`df`;
all edits and cleaning are then done on
the new dataset `df`.
# Work on a copy named `df` so the raw import in `DF` stays untouched.
# (The copy must be bound to `df`, not `.df`: every later statement reads and
# writes `df`.)
df <- DF

# Lower-case every column name so later code can refer to columns uniformly.
names(df) <- tolower(names(df))
Remove the trailing dots (..) from the names of two columns
# Strip the trailing dots that read.csv() produced for two column headers.
# A column that is not present is simply left alone.
names(df)[names(df) == "normalized.power...np.."] <- "normalized.power.np"
names(df)[names(df) == "training.stress.score."] <- "training.stress.score"
Replace the values `0`
and `--`
with
`NA`.
# Mark every cell equal to 0 as missing, then every cell holding the "--"
# placeholder. replace(x, mask, NA) performs the same `x[mask] <- NA`
# assignment and returns the modified data frame.
df <- replace(df, df == 0, NA)
df <- replace(df, df == "--", NA)
# Parse the "YYYY-MM-DD HH:MM:SS" text in `date` into a date-time column.
# as.POSIXct() is used instead of strptime() because strptime() returns a
# POSIXlt object (a list of fields), which is an awkward data-frame column;
# POSIXct is a plain numeric vector. The time zone is pinned to UTC so the
# parsed wall-clock values do not depend on the session's locale/DST rules.
df$date1 <- as.POSIXct(df$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
# Create the year.month column: a "YYYY-MM" label taken from each parsed
# activity timestamp. format() already returns character, so no extra
# as.character() is needed.
df$year.month <- format(df$date1, "%Y-%m")
head(df$year.month)
## [1] "2023-08" "2023-08" "2023-08" "2023-08" "2023-08" "2023-08"
Hence, we have created a variable holding the year and month of each activity.
#### Check the type of the `year.month` variable
# Report the storage type of the year.month column.
typeof(df[["year.month"]])
## [1] "character"
We need to convert it from “character” to factor.
# Turn the year-month labels into a factor, then list its levels.
df[["year.month"]] <- as.factor(df[["year.month"]])
df[["year.month"]] |>
  levels()
## [1] "2023-03" "2023-04" "2023-05" "2023-06" "2023-07" "2023-08"
# Columns that must be numeric for the summaries and plots further down.
# Listing them once and converting with across() avoids the hand-typed
# `name = as.numeric(name)` pairs, where a misspelt left-hand side (previously
# `avg.vertical.oscillatio`) silently adds a new column instead of converting
# the intended one.
numeric_cols <- c(
  "avg.cadence",
  "max.cadence",
  "distance",
  "calories",
  "avg.stride.length",
  "avg.vertical.ratio",
  "avg.vertical.oscillation",
  "avg.ground.contact.time"
)

# Convert each listed column in place; column order is unchanged.
df <- df |>
  mutate(across(all_of(numeric_cols), as.numeric))
# Show the first six average-pace strings.
head(df[["avg.pace"]], n = 6)
## [1] "20:32" "8:50" "16:52" "17:39" "9:34" "13:50"
Note: Pace is a time value of the form mm:ss — the first two characters are the minutes and the last two characters are the seconds.
Extract the first 2 characters as the minutes
and the last two characters as the seconds.
Create a function that gets the first 2 of the 5 characters of
`pace`
# Return the minutes part of a pace string.
#
# x: character vector of paces written as "minutes:seconds" (e.g. "20:32").
# Returns a character vector the same length as `x`.
#
# The text before the first ":" is returned, so a pace with single-digit
# minutes such as "8:50" yields "8" rather than "8:" (which would turn into
# NA when converted to a number). Values without a colon keep the historical
# behaviour of returning their first two characters; NA stays NA.
first_2cha <- function(x) {
  colon_at <- as.integer(regexpr(":", x, fixed = TRUE))
  has_colon <- !is.na(colon_at) & colon_at > 0L
  # Last character to keep: just before the colon, else fixed width 2.
  last_char <- ifelse(has_colon, colon_at - 1L, 2L)
  substr(x, 1L, last_char)
}
Create the `minutes`
column of
`pace`.
# Minutes part of every average pace. first_2cha() works on whole vectors, so
# it is called directly: sapply() would attach the pace strings as element
# names and would return an empty list (not a character vector) when there
# are no rows.
df$pace.mi <- first_2cha(df$avg.pace)
# Return the seconds part of a pace string.
#
# x: character vector of paces written as "minutes:seconds" (e.g. "20:32").
# Returns a character vector the same length as `x`.
#
# The two characters after the first ":" are returned, so "8:50" yields "50"
# (fixed positions 4-5 would give "0"). Values without a colon keep the
# historical behaviour of returning characters 4 to 5; NA stays NA.
last_2cha <- function(x) {
  colon_at <- as.integer(regexpr(":", x, fixed = TRUE))
  has_colon <- !is.na(colon_at) & colon_at > 0L
  # First character to keep: right after the colon, else fixed position 4.
  first_char <- ifelse(has_colon, colon_at + 1L, 4L)
  substr(x, first_char, first_char + 1L)
}
# Create the seconds column of pace. The result is stored in `df` (the data
# frame used everywhere else), and last_2cha() is called directly on the
# whole column because it is already vectorised.
df$pace.se <- last_2cha(df$avg.pace)
# Convert the pace minutes and seconds to numeric datatype.
df <- df |>
  mutate(
    pace.mi = as.numeric(pace.mi),
    pace.se = as.numeric(pace.se)
  )

# Pace in minutes (minutes + seconds / 60), rounded to the nearest whole
# minute. The argument is spelled `digits` in full rather than relying on
# partial matching of `digit`.
df$pace <- round(df$pace.mi + df$pace.se / 60, digits = 0)
library(lubridate)
# Convert "HH:MM:SS" clock strings to whole minutes: parse with hms(), take
# the length in seconds rounded to a whole second, then divide by 60 and
# round again.
minutes_from_hms <- function(clock) {
  whole_seconds <- round(period_to_seconds(hms(clock)), digits = 0)
  round(whole_seconds / 60, digits = 0)
}

df$time.dur <- minutes_from_hms(df$time)
df$moving.time.dur <- minutes_from_hms(df$moving.time)
df$elapsed.time.dur <- minutes_from_hms(df$elapsed.time)
# Remove the "Other" and "Motorcycling" activity types because they are not
# Garmin watch data. The subset() function from base R can be used for this.
df.wr <- subset(df, !activity.type %in% c("Other", "Motorcycling"))

# Keep only the Quan Binh Thanh walking/running activities that are not
# marked as non-favourite and that have both an average heart rate and a pace.
#
# Fixes relative to the earlier form of this step:
#  * it continues from `df.wr`, so the activity-type filter above is kept
#    instead of being overwritten by a fresh subset of `df`;
#  * the favourite test is part of the row condition. Passed as subset()'s
#    third positional argument it is taken as `select` (column choice) and
#    filters no rows;
#  * missing values are tested with is.na() rather than compared with the
#    string "NA".
# NOTE(review): group counts in tables rendered before this change may no
# longer match once the favourite filter actually applies.
df.wr <- subset(
  df.wr,
  title %in% c("Quan Binh Thanh Walking", "Quan Binh Thanh Running") &
    tolower(favorite) != "false" &
    !is.na(avg.hr) &
    !is.na(pace)
)
# Descriptive statistics of the activity measures, one column per activity
# type, rendered with the zebra-striped table1 style.
table1(
  ~ distance + calories + avg.hr + max.hr + avg.cadence + max.cadence +
    time.dur + moving.time.dur + elapsed.time.dur + pace | activity.type,
  data = df.wr,
  topclass = "Rtable1-zebra"
)
 | Running (N=35) | Walking (N=144) | Overall (N=179) |
---|---|---|---|
distance | |||
Mean (SD) | 2.03 (0.978) | 1.17 (0.861) | 1.34 (0.946) |
Median [Min, Max] | 1.72 [0.520, 5.29] | 0.910 [0.220, 4.91] | 0.990 [0.220, 5.29] |
calories | |||
Mean (SD) | 132 (61.2) | 72.9 (49.3) | 84.5 (56.8) |
Median [Min, Max] | 114 [36.0, 326] | 57.5 [15.0, 310] | 62.0 [15.0, 326] |
avg.hr | |||
Mean (SD) | 118 (14.3) | 98.2 (9.95) | 102 (13.4) |
Median [Min, Max] | 120 [88.0, 142] | 98.0 [67.0, 128] | 101 [67.0, 142] |
max.hr | |||
Mean (SD) | 137 (17.0) | 117 (14.9) | 121 (17.1) |
Median [Min, Max] | 138 [107, 161] | 115 [91.0, 230] | 117 [91.0, 230] |
avg.cadence | |||
Mean (SD) | 138 (33.0) | 93.5 (18.6) | 102 (28.3) |
Median [Min, Max] | 158 [73.0, 175] | 98.0 [30.0, 124] | 100 [30.0, 175] |
max.cadence | |||
Mean (SD) | 187 (35.2) | 170 (45.7) | 173 (44.2) |
Median [Min, Max] | 185 [120, 246] | 146 [120, 250] | 154 [120, 250] |
time.dur | |||
Mean (SD) | 25.9 (14.5) | 17.7 (12.1) | 19.3 (13.0) |
Median [Min, Max] | 22.0 [7.00, 81.0] | 14.0 [4.00, 72.0] | 15.0 [4.00, 81.0] |
moving.time.dur | |||
Mean (SD) | 24.1 (12.7) | 14.1 (9.79) | 16.1 (11.1) |
Median [Min, Max] | 21.0 [7.00, 68.0] | 11.0 [3.00, 56.0] | 12.0 [3.00, 68.0] |
elapsed.time.dur | |||
Mean (SD) | 26.2 (14.5) | 17.9 (12.0) | 19.5 (12.9) |
Median [Min, Max] | 22.0 [7.00, 81.0] | 14.0 [4.00, 72.0] | 15.0 [4.00, 81.0] |
pace | |||
Mean (SD) | 12.7 (2.04) | 15.9 (3.70) | 15.3 (3.67) |
Median [Min, Max] | 12.0 [10.0, 17.0] | 15.0 [10.0, 27.0] | 15.0 [10.0, 27.0] |
# Density of distance, one translucent curve per activity type.
ggplot(df.wr, aes(x = distance, fill = activity.type)) +
  geom_density(alpha = 0.4)

# Distance against average heart rate, with a smoothed trend per activity
# type.
ggplot(df.wr, aes(x = avg.hr, y = distance, colour = activity.type)) +
  geom_point() +
  geom_smooth()
- type of running and pace
# Pace against average heart rate, with a smoothed trend per activity type.
ggplot(df.wr, aes(x = avg.hr, y = pace, colour = activity.type)) +
  geom_point() +
  geom_smooth()

# Distribution of distance for each year-month.
ggplot(df.wr, aes(x = year.month, y = distance)) +
  geom_boxplot()