require(ggplot2)
require(knitr)
require(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(table1)
# Load the activity export; text columns are kept as character vectors.
DF <- read.csv(
  file = "Activities.csv",
  header = TRUE,
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
Show rows 10–11 and the first 8 columns of the data, using the `kable()`
function from the knitr package
to generate a clean table.
# Render rows 10-11 and columns 1-8 as a pipe-style table, then inspect the
# structure of the whole data set.
DF[10:11, 1:8] |>
  head() |>
  knitr::kable(format = "pipe")
str(DF)
## 'data.frame': 520 obs. of 47 variables:
## $ Activity.Type : chr "Other" "Strength Training" "Other" "Other" ...
## $ Date : chr "2023-08-02 06:32:35" "2023-08-01 18:45:42" "2023-07-31 16:04:49" "2023-07-31 06:18:56" ...
## $ Favorite : chr "false" "true" "false" "false" ...
## $ Title : chr "Quan Binh Thanh Track Me" "Strength" "Quan Thu Duc Track Me" "Quan Binh Thanh Track Me" ...
## $ Distance : chr "6.99" "0.00" "17.28" "17.27" ...
## $ Calories : chr "0" "135" "0" "0" ...
## $ Time : chr "00:26:32" "00:43:23" "00:54:59" "00:51:28" ...
## $ Avg.HR : int 0 92 0 0 120 94 0 0 102 130 ...
## $ Max.HR : int 0 138 0 0 151 146 0 0 117 141 ...
## $ Aerobic.TE : chr "--" "0.8" "--" "--" ...
## $ Avg.Bike.Cadence : chr "--" "--" "--" "--" ...
## $ Max.Bike.Cadence : chr "--" "--" "--" "--" ...
## $ Avg.Speed : chr "15.8" "--" "18.9" "20.1" ...
## $ Max.Speed : chr "46.2" "--" "63.0" "61.0" ...
## $ Total.Ascent : chr "366" "--" "141" "146" ...
## $ Total.Descent : chr "120" "--" "225" "128" ...
## $ Avg.Stride.Length : num 0 0 0 0 0.7 0 0 0 0.69 0.71 ...
## $ Avg.Vertical.Ratio : num 0 0 0 0 7 0 0 0 0 8.1 ...
## $ Avg.Vertical.Oscillation: num 0 0 0 0 5.1 0 0 0 0 5.9 ...
## $ Avg.Ground.Contact.Time : int 0 0 0 0 318 0 0 0 0 280 ...
## $ Avg.GCT.Balance : chr "--" "--" "--" "--" ...
## $ Avg.GAP : chr "--" "--" "--" "--" ...
## $ Normalized.Power...NP.. : chr "--" "--" "--" "--" ...
## $ Training.Stress.Score. : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Avg.Power : int 0 0 0 0 113 0 0 0 0 171 ...
## $ Max.Power : chr "0" "0" "0" "0" ...
## $ Grit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Flow : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Total.Strokes : chr "--" "--" "--" "--" ...
## $ Avg..Swolf : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Avg.Stroke.Rate : chr "0" "0" "0" "0" ...
## $ Total.Reps : chr "0" "36" "0" "0" ...
## $ Total.Sets : chr "--" "1" "--" "--" ...
## $ Dive.Time : chr "0:00" "0:00" "0:00" "0:00" ...
## $ Min.Temp : num 0 31 0 0 29 31 0 0 30 29 ...
## $ Surface.Interval : chr "0:00" "0:00" "0:00" "0:00" ...
## $ Decompression : chr "No" "No" "No" "No" ...
## $ Best.Lap.Time : chr "26:31.67" "43:22.57" "54:58.74" "51:27.99" ...
## $ Number.of.Laps : int 1 1 1 1 3 1 1 1 1 2 ...
## $ Max.Temp : num 0 33 0 0 32 32 0 0 31 30 ...
## $ Avg.Resp : chr "--" "--" "--" "--" ...
## $ Min.Resp : chr "--" "--" "--" "--" ...
## $ Max.Resp : chr "--" "--" "--" "--" ...
## $ Moving.Time : chr "00:24:35" "00:43:23" "00:46:02" "00:45:32" ...
## $ Elapsed.Time : chr "00:26:32" "00:43:23" "00:54:59" "00:51:28" ...
## $ Min.Elevation : chr "56" "--" "-3" "-1" ...
## $ Max.Elevation : chr "414" "--" "175" "49" ...
This data should be cleaned. We first make a copy of the dataset, `df`, and do all the wrangling, editing, and cleaning on that new dataset.
# Work on a copy so the raw import in DF stays untouched, and use lower-case
# column names from here on.
df <- DF
names(df) <- tolower(names(df))
Remove the trailing dots (`..`) from the names of two columns.
# Tidy the two column names that carry surplus dots, then list all names.
names(df)[names(df) == "normalized.power...np.."] <- "normalized.power.np"
names(df)[names(df) == "training.stress.score."] <- "training.stress.score"
colnames(df)
## [1] "activity.type" "date"
## [3] "favorite" "title"
## [5] "distance" "calories"
## [7] "time" "avg.hr"
## [9] "max.hr" "aerobic.te"
## [11] "avg.bike.cadence" "max.bike.cadence"
## [13] "avg.speed" "max.speed"
## [15] "total.ascent" "total.descent"
## [17] "avg.stride.length" "avg.vertical.ratio"
## [19] "avg.vertical.oscillation" "avg.ground.contact.time"
## [21] "avg.gct.balance" "avg.gap"
## [23] "normalized.power.np" "training.stress.score"
## [25] "avg.power" "max.power"
## [27] "grit" "flow"
## [29] "total.strokes" "avg..swolf"
## [31] "avg.stroke.rate" "total.reps"
## [33] "total.sets" "dive.time"
## [35] "min.temp" "surface.interval"
## [37] "decompression" "best.lap.time"
## [39] "number.of.laps" "max.temp"
## [41] "avg.resp" "min.resp"
## [43] "max.resp" "moving.time"
## [45] "elapsed.time" "min.elevation"
## [47] "max.elevation"
Replace the values `0` and `--` with `NA`.
# Treat zeros and the "--" placeholder as missing values. The comparison is
# applied to every cell of the data frame, whatever the column type.
df[df == 0] <- NA
df[df == "--"] <- NA
# Parse the timestamp text. strptime() returns POSIXlt, a list-based class
# that is awkward as a data frame column, so the result is stored as POSIXct.
df$date1 <- as.POSIXct(strptime(df$date, format = "%Y-%m-%d %H:%M:%S"))
# Create the `year.month` column ("YYYY-MM") from the parsed timestamp.
# format() already returns character, so no further conversion is needed.
df$year.month <- format(df$date1, "%Y-%m")
head(df$year.month)
## [1] "2023-08" "2023-08" "2023-07" "2023-07" "2023-07" "2023-07"
Hence, we have created a variable holding the year and month of each activity.
# Check the type of the `year.month` variable.
typeof(df$year.month)
## [1] "character"
We need to convert it from “character” to `factor`.
# Store year.month as a factor and list its levels.
df[["year.month"]] <- factor(df[["year.month"]])
levels(df[["year.month"]])
## [1] "2022-10" "2022-11" "2023-01" "2023-02" "2023-03" "2023-04" "2023-05"
## [8] "2023-06" "2023-07" "2023-08"
# Convert the measurement columns to numeric in place. Every column keeps its
# own name: avg.vertical.oscillation is overwritten rather than copied into a
# new, misspelled `avg.vertical.oscillatio` column.
df <- df |>
  mutate(
    across(
      c(
        avg.bike.cadence, max.bike.cadence, distance, calories, avg.resp,
        avg.stride.length, avg.vertical.ratio, avg.vertical.oscillation,
        avg.ground.contact.time
      ),
      as.numeric
    )
  )
# avg.speed is left as text: it mixes speeds ("15.8") with paces ("10:46").
head(df, 6)[, "avg.speed"]
## [1] "15.8" NA "18.9" "20.1" "10:46" NA
Note: Pace is a time value whose first two characters are the minutes and whose last two characters are the seconds.
Extract the first two characters to make the `minutes`
and the last two characters to make the `seconds`.
Create a function that gets the first 2 of the 5 characters of `pace`.
#' Extract the first two characters of each string.
#'
#' @param x A character vector, e.g. a pace such as "10:46".
#' @return A character vector the same length as `x`.
first_2cha <- function(x) {
  substr(x, start = 1, stop = 2)
}
Create the `minutes` column of `pace`.
# Minutes part of the pace. substr() is vectorised, so the helper is applied
# to the whole column at once; sapply() would additionally attach the input
# strings as names to the result.
df$speed.mi <- first_2cha(df$avg.speed)
#' Extract characters 4-5 of each string (the seconds of a "mm:ss" pace).
#'
#' @param x A character vector, e.g. a pace such as "10:46".
#' @return A character vector the same length as `x`.
last_2cha <- function(x) {
  substr(x, 4, 5)
}
# Create the `seconds` column of `pace`. The helper is vectorised, so it is
# applied to the whole column without sapply().
df$speed.se <- last_2cha(df$avg.speed)
# Convert the minutes and seconds of `pace` to the numeric data type, then
# combine them into a pace in minutes, rounded to a whole number.
df <- df |>
  mutate(speed.mi = as.numeric(speed.mi), speed.se = as.numeric(speed.se))
df$speed <- round(df$speed.mi + df$speed.se / 60, digits = 0)
library(lubridate)
# Durations in whole minutes: parse the time text with hms(), convert it to
# seconds, then divide by 60.
df$time.dur <- round(
  round(period_to_seconds(hms(df$time)), digits = 0) / 60,
  digits = 0
)
df$moving.time.dur <- round(
  round(period_to_seconds(hms(df$moving.time)), digits = 0) / 60,
  digits = 0
)
df$elapsed.time.dur <- round(
  round(period_to_seconds(hms(df$elapsed.time)), digits = 0) / 60,
  digits = 0
)
# Remove the `Other` and `Motorcycling` activity types because they are not
# Garmin watch data; the subset() function from base R can be used for this.
# Then keep the walking and running sessions that have both an average heart
# rate and a pace.
# NOTE(review): an earlier version passed `!favorite == 'false'` as the third
# argument of subset(), which is `select` (columns), not a row filter; it
# evaluated to TRUE and kept every column, so it never filtered rows. It is
# left out here to keep the row set unchanged. If only favourites are wanted,
# add `favorite == "true"` to the row condition.
df.wr <- df |>
  subset(!activity.type %in% c("Other", "Motorcycling")) |>
  subset(title %in% c("Quan Binh Thanh Walking", "Quan Binh Thanh Running")) |>
  subset(!is.na(avg.hr) & !is.na(speed))
table1(
  ~ distance + calories + avg.hr + avg.resp | activity.type,
  data = df.wr,
  topclass = "Rtable1-zebra"
)
| | Running (N=51) | Walking (N=114) | Overall (N=165) |
---|---|---|---|
distance | |||
Mean (SD) | 2.31 (1.04) | 1.24 (0.929) | 1.57 (1.08) |
Median [Min, Max] | 2.27 [0.520, 5.33] | 0.920 [0.220, 4.91] | 1.13 [0.220, 5.33] |
calories | |||
Mean (SD) | 150 (65.3) | 76.9 (53.2) | 99.6 (66.4) |
Median [Min, Max] | 144 [36.0, 354] | 59.0 [15.0, 310] | 74.0 [15.0, 354] |
avg.hr | |||
Mean (SD) | 112 (14.1) | 97.8 (10.6) | 102 (13.5) |
Median [Min, Max] | 106 [88.0, 142] | 96.5 [67.0, 128] | 101 [67.0, 142] |
avg.resp | |||
Mean (SD) | 29.0 (3.46) | 21.4 (2.68) | 23.0 (4.25) |
Median [Min, Max] | 30.0 [23.0, 32.0] | 21.0 [17.0, 27.0] | 23.0 [17.0, 32.0] |
Missing | 46 (90.2%) | 96 (84.2%) | 142 (86.1%) |
# Density of distance, filled by activity type.
ggplot(data = df.wr, mapping = aes(x = distance, fill = activity.type)) +
  geom_density(alpha = 0.4)
# Distance against average heart rate, coloured by activity type, with a
# smoothed trend.
ggplot(df.wr, aes(x = avg.hr, y = distance, col = activity.type)) +
  geom_point() +
  geom_smooth()
- type of running and pace
# Pace against average heart rate, coloured by activity type, with a smoothed
# trend.
ggplot(df.wr, aes(x = avg.hr, y = speed, col = activity.type)) +
  geom_point() +
  geom_smooth()
# Distribution of distance for each year-month.
ggplot(df.wr, aes(x = year.month, y = distance)) +
  geom_boxplot()
### relationship between heart rate and respiratory rate:
# Correlation test between average heart rate and average respiration rate.
cor.test(x = df.wr$avg.hr, y = df.wr$avg.resp)
##
## Pearson's product-moment correlation
##
## data: df.wr$avg.hr and df.wr$avg.resp
## t = 11.567, df = 21, p-value = 1.429e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8390340 0.9701255
## sample estimates:
## cor
## 0.9296985
# Fit a linear model of respiration rate on heart rate, then draw the scatter
# plot with the fitted line on top.
reg <- lm(formula = avg.resp ~ avg.hr, data = df.wr)
plot(x = df.wr$avg.hr, y = df.wr$avg.resp, xlim = c(60, 150))
abline(reg = reg)