require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(haven)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# main script
####
# Import the activity export; text columns stay character (no factors).
dfB <- read.csv(
  file = "Activities.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Inspect the column names and types of the raw import.
str(dfB)
## 'data.frame': 480 obs. of 47 variables:
## $ Activity.Type : chr "Walking" "Running" "Walking" "Other" ...
## $ Date : chr "2023-06-25 20:37:17" "2023-06-25 20:21:30" "2023-06-25 20:04:04" "2023-06-25 17:16:48" ...
## $ Favorite : chr "true" "true" "true" "false" ...
## $ Title : chr "Quan Binh Thanh Walking" "Quan Binh Thanh Running" "Quan Binh Thanh Walking" "Navigate" ...
## $ Distance : chr "0.78" "1.75" "0.99" "0.00" ...
## $ Calories : chr "58" "114" "71" "--" ...
## $ Time : chr "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
## $ Avg.HR : int 94 127 93 65 69 83 85 101 131 91 ...
## $ Max.HR : int 120 138 126 65 103 134 110 128 151 101 ...
## $ Aerobic.TE : chr "0.4" "2.6" "0.7" "0.0" ...
## $ Avg.Cadence : chr "74" "168" "97" "--" ...
## $ Max.Cadence : chr "247" "186" "158" "--" ...
## $ Avg.Pace : chr "19:39" "8:42" "17:13" "--" ...
## $ Best.Pace : chr "9:18" "6:23" "10:16" "--" ...
## $ Total.Ascent : chr "1" "3" "18" "--" ...
## $ Total.Descent : chr "10" "3" "12" "--" ...
## $ Avg.Stride.Length : num 0.69 0.81 0.6 0 0 0 0 0.68 0.69 0.66 ...
## $ Avg.Vertical.Ratio : num 0 6.3 0 0 0 0 0 0 0 0 ...
## $ Avg.Vertical.Oscillation: num 0 7.5 0 0 0 0 0 0 7.9 0 ...
## $ Avg.Ground.Contact.Time : int 0 271 0 0 0 0 0 0 269 0 ...
## $ Avg.GCT.Balance : chr "--" "49.5% L / 50.5% R" "--" "--" ...
## $ Avg.GAP : chr "20:58" "8:39" "18:18" "--" ...
## $ Normalized.Power...NP.. : chr "--" "211" "--" "--" ...
## $ Training.Stress.Score. : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Avg.Power : int 0 201 0 0 0 0 0 0 208 0 ...
## $ Max.Power : chr "0" "281" "0" "0" ...
## $ Grit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Flow : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Total.Strokes : chr "--" "--" "--" "--" ...
## $ Avg..Swolf : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Avg.Stroke.Rate : chr "0" "0" "0" "0" ...
## $ Total.Reps : chr "0" "0" "0" "0" ...
## $ Total.Sets : chr "--" "--" "--" "--" ...
## $ Dive.Time : chr "0:00" "0:00" "0:00" "0:00" ...
## $ Min.Temp : num 29 29 29 30 30 32 32 29 30 31 ...
## $ Surface.Interval : chr "0:00" "0:00" "0:00" "0:00" ...
## $ Decompression : chr "No" "No" "No" "No" ...
## $ Best.Lap.Time : chr "15:19.19" "06:37.33" "17:05.81" "00:03.28" ...
## $ Number.of.Laps : int 1 2 1 1 1 1 1 1 2 1 ...
## $ Max.Temp : num 31 30 31 30 34 33 33 32 32 32 ...
## $ Avg.Resp : chr "--" "--" "--" "--" ...
## $ Min.Resp : chr "--" "--" "--" "--" ...
## $ Max.Resp : chr "--" "--" "--" "--" ...
## $ Moving.Time : chr "00:10:06" "00:14:27" "00:11:41" "00:00:00" ...
## $ Elapsed.Time : chr "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
## $ Min.Elevation : chr "-21" "-13" "-28" "-17" ...
## $ Max.Elevation : chr "-11" "-10" "-11" "-17" ...
class(dfB)
## [1] "data.frame"
# Create dfb as a working copy so the raw import (dfB) stays untouched.
dfb <- dfB
# Convert all column names to lower case.
names(dfb) <- tolower(names(dfb))
# Rename the two columns whose names end in stray dots.
colnames(dfb)[colnames(dfb) == "normalized.power...np.."] <- "normalized.power.np"
colnames(dfb)[colnames(dfb) == "training.stress.score."] <- "training.stress.score"
# Replace 0 and the "--" placeholder with NA.
# Note: in character columns only the exact string "0" matches; values such as
# "0.00" or "0:00" are left as they are.
dfb[dfb == 0] <- NA
dfb[dfb == "--"] <- NA
# Convert calories to numeric. Commas are stripped first because as.numeric()
# turns strings such as "1,025" into NA with a coercion warning.
# NOTE(review): assumes commas in this column are thousands separators, not
# decimal marks -- confirm against the raw file.
dfb$calories <- as.numeric(gsub(",", "", dfb$calories, fixed = TRUE))
## Warning: NAs introduced by coercion
str(dfb)
## 'data.frame': 480 obs. of 47 variables:
## $ activity.type : chr "Walking" "Running" "Walking" "Other" ...
## $ date : chr "2023-06-25 20:37:17" "2023-06-25 20:21:30" "2023-06-25 20:04:04" "2023-06-25 17:16:48" ...
## $ favorite : chr "true" "true" "true" "false" ...
## $ title : chr "Quan Binh Thanh Walking" "Quan Binh Thanh Running" "Quan Binh Thanh Walking" "Navigate" ...
## $ distance : chr "0.78" "1.75" "0.99" "0.00" ...
## $ calories : num 58 114 71 NA 37 140 93 75 120 55 ...
## $ time : chr "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
## $ avg.hr : int 94 127 93 65 69 83 85 101 131 91 ...
## $ max.hr : int 120 138 126 65 103 134 110 128 151 101 ...
## $ aerobic.te : chr "0.4" "2.6" "0.7" "0.0" ...
## $ avg.cadence : chr "74" "168" "97" NA ...
## $ max.cadence : chr "247" "186" "158" NA ...
## $ avg.pace : chr "19:39" "8:42" "17:13" NA ...
## $ best.pace : chr "9:18" "6:23" "10:16" NA ...
## $ total.ascent : chr "1" "3" "18" NA ...
## $ total.descent : chr "10" "3" "12" NA ...
## $ avg.stride.length : num 0.69 0.81 0.6 NA NA NA NA 0.68 0.69 0.66 ...
## $ avg.vertical.ratio : num NA 6.3 NA NA NA NA NA NA NA NA ...
## $ avg.vertical.oscillation: num NA 7.5 NA NA NA NA NA NA 7.9 NA ...
## $ avg.ground.contact.time : int NA 271 NA NA NA NA NA NA 269 NA ...
## $ avg.gct.balance : chr NA "49.5% L / 50.5% R" NA NA ...
## $ avg.gap : chr "20:58" "8:39" "18:18" NA ...
## $ normalized.power.np : chr NA "211" NA NA ...
## $ training.stress.score : num NA NA NA NA NA NA NA NA NA NA ...
## $ avg.power : int NA 201 NA NA NA NA NA NA 208 NA ...
## $ max.power : chr NA "281" NA NA ...
## $ grit : num NA NA NA NA NA NA NA NA NA NA ...
## $ flow : num NA NA NA NA NA NA NA NA NA NA ...
## $ total.strokes : chr NA NA NA NA ...
## $ avg..swolf : int NA NA NA NA NA NA NA NA NA NA ...
## $ avg.stroke.rate : chr NA NA NA NA ...
## $ total.reps : chr NA NA NA NA ...
## $ total.sets : chr NA NA NA NA ...
## $ dive.time : chr "0:00" "0:00" "0:00" "0:00" ...
## $ min.temp : num 29 29 29 30 30 32 32 29 30 31 ...
## $ surface.interval : chr "0:00" "0:00" "0:00" "0:00" ...
## $ decompression : chr "No" "No" "No" "No" ...
## $ best.lap.time : chr "15:19.19" "06:37.33" "17:05.81" "00:03.28" ...
## $ number.of.laps : int 1 2 1 1 1 1 1 1 2 1 ...
## $ max.temp : num 31 30 31 30 34 33 33 32 32 32 ...
## $ avg.resp : chr NA NA NA NA ...
## $ min.resp : chr NA NA NA NA ...
## $ max.resp : chr NA NA NA NA ...
## $ moving.time : chr "00:10:06" "00:14:27" "00:11:41" "00:00:00" ...
## $ elapsed.time : chr "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
## $ min.elevation : chr "-21" "-13" "-28" "-17" ...
## $ max.elevation : chr "-11" "-10" "-11" "-17" ...
typeof(dfb)
## [1] "list"
typeof(dfb$date)
## [1] "character"
typeof(dfb$date)
## [1] "character"
## tao dfg datafAME
## tao ra co ngay thang
# Pace variable: turn avg.pace ("mm:ss" text) into minutes.
# A temporary copy is split because separate() removes its input column,
# and avg.pace itself should be kept.
dfb$avg.pace1 <- dfb$avg.pace
dfb <- dfb %>%
  # Values without a ":" (e.g. "7.3") leave pace.2 as NA, so pace.m is NA for
  # those rows; separate() reports them in a warning.
  separate(col = avg.pace1, into = c("pace.1", "pace.2"), sep = ":") %>%
  mutate(across(all_of(c("pace.1", "pace.2")), as.numeric)) %>%
  # Minutes plus seconds expressed in minutes, rounded to a whole minute.
  mutate(pace.m = round((pace.1 * 60 + pace.2) / 60, digits = 0))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 102 rows [5, 24, 25, 46,
## 47, 48, 54, 62, 64, 66, 67, 68, 110, 131, 132, 133, 138, 154, 162, 175, ...].
typeof(dfb$pace.m)
## [1] "double"
#dfb <- dfb %>% separate(col = avg.pace1, into = c('pace.1', 'pace.2'), sep =":") %>% mutate_at(c('pace.1', 'pace.2'), as.numeric) %>% mutate(pace.m = (pace.1*60 + pace.2)/60) %>% mutate(pace.m =round(pace.m, digits = 0))
# Parse elapsed and moving time ("hh:mm:ss" text) into lubridate Periods and
# derive whole minutes from the hour and minute parts (seconds are dropped).
dfb <- dfb %>%
  mutate(
    elapsed.time.t = hms(elapsed.time),
    elapsed.time.m = hour(elapsed.time.t) * 60 + minute(elapsed.time.t),
    moving.time.t = hms(moving.time),
    moving.time.m = hour(moving.time.t) * 60 + minute(moving.time.t)
  )
dfb$moving.time.m
## [1] 10 14 11 0 10 57 36 9 15 12 11 14 10 51 20 11 26 6
## [19] 16 12 22 38 61 73 47 8 15 9 9 14 10 38 28 5 15 10
## [37] 62 10 12 12 8 16 8 12 10 43 33 23 0 20 68 17 18 193
## [55] 10 12 17 16 10 0 10 345 31 92 0 110 63 126 9 14 11 10
## [73] 15 6 34 33 11 14 12 16 6 28 52 7 14 10 48 19 6 10
## [91] 61 30 42 9 15 20 25 30 3 50 45 11 20 12 30 13 8 7
## [109] 15 10 52 31 7 13 10 77 10 9 10 76 8 15 11 29 30 18
## [127] 23 32 42 42 18 31 25 34 12 3 10 105 6 26 10 15 10 35
## [145] 7 0 56 6 31 13 10 10 10 40 21 18 29 1 10 9 14 160
## [163] 21 9 25 9 8 8 7 10 7 7 20 9 572 575 10 11 12 518
## [181] 434 47 26 32 32 7 21 38 1 4 11 16 20 0 5 7 1 6
## [199] 4 15 40 27 9 1 1 3 0 1 0 0 10 3 13 11 8 29
## [217] 9 3 3 9 8 8 20 629 29 14 7 13 49 32 13 12 33 68
## [235] 21 21 19 40 15 0 0 41 39 25 31 17 17 20 15 17 29 36
## [253] 31 51 29 24 19 0 34 31 19 11 15 20 10 21 15 68 23 25
## [271] 30 12 14 25 11 16 15 5 29 16 14 21 8 7 5 19 12 3
## [289] 35 29 30 10 12 8 8 32 17 19 6 11 35 9 2 22 4 1
## [307] 8 5 1 13 7 7 4 12 20 15 15 5 9 12 2 5 59 4
## [325] 19 12 55 15 29 22 29 18 20 3 66 11 6 30 29 48 27 31
## [343] 9 4 56 3 2 28 29 30 13 10 8 13 21 41 14 10 10 17
## [361] 8 8 25 21 27 126 0 4 4 20 29 36 40 28 4 31 65 7
## [379] 28 29 52 15 11 36 24 7 56 19 33 50 20 40 67 7 6 55
## [397] 25 32 29 25 58 68 20 62 26 31 67 52 33 55 31 32 35 24
## [415] 4 2 23 29 53 18 39 39 44 25 52 33 55 22 52 41 33 33
## [433] 37 51 5 25 3 25 61 21 43 196 30 48 47 29 131 220 61 107
## [451] 77 0 41 29 91 82 62 28 28 37 54 53 60 32 68 72 55 0
## [469] 61 47 34 25 0 90 27 0 28 8 58 52
#dfb$moving.time.t <- dfb$moving.time
#dfb$moving.time.t <- hms(dfb$moving.time)
#hour(dfb$moving.time.t) *60 + minute(dfb$moving.time.t)
#typeof(dfb$moving.time1)
# Parse the timestamp text into POSIXct. strptime() returns POSIXlt, a
# list-based class that is awkward as a data frame column. The text carries no
# zone information, so a fixed zone is used for parsing and formatting alike.
dfb$date <- as.POSIXct(dfb$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
# Create the date/time variables.
# "%M" is minutes ("%S" would put the seconds into hour.minute).
dfb$hour.minute <- format(dfb$date, "%H:%M")
dfb$year.month <- format(dfb$date, "%Y-%m")
# Weekday as an ordered factor, Monday first. "%u" gives the weekday number
# (1 = Monday ... 7 = Sunday) regardless of the session locale, whereas the
# names returned by weekdays() are locale-dependent and would not match the
# English levels in a non-English session.
dfb$week.days <- ordered(
  format(dfb$date, "%u"),
  levels = as.character(1:7),
  labels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)
#table1(~distance|week.days, data = dfb)
####______
# Create the duration of one run (walk) in whole minutes, using the lubridate
# functions hms() and period_to_seconds().
library(lubridate)
dfb <- dfb %>%
  mutate(
    time.duration = round(
      round(period_to_seconds(hms(time)), digits = 0) / 60,
      digits = 0
    )
  )
###__________
#Kiem tra du lieu
#Cau truc
str(dfb)
## 'data.frame': 480 obs. of 58 variables:
## $ activity.type : chr "Walking" "Running" "Walking" "Other" ...
## $ date : POSIXlt, format: "2023-06-25 20:37:17" "2023-06-25 20:21:30" ...
## $ favorite : chr "true" "true" "true" "false" ...
## $ title : chr "Quan Binh Thanh Walking" "Quan Binh Thanh Running" "Quan Binh Thanh Walking" "Navigate" ...
## $ distance : chr "0.78" "1.75" "0.99" "0.00" ...
## $ calories : num 58 114 71 NA 37 140 93 75 120 55 ...
## $ time : chr "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
## $ avg.hr : int 94 127 93 65 69 83 85 101 131 91 ...
## $ max.hr : int 120 138 126 65 103 134 110 128 151 101 ...
## $ aerobic.te : chr "0.4" "2.6" "0.7" "0.0" ...
## $ avg.cadence : chr "74" "168" "97" NA ...
## $ max.cadence : chr "247" "186" "158" NA ...
## $ avg.pace : chr "19:39" "8:42" "17:13" NA ...
## $ best.pace : chr "9:18" "6:23" "10:16" NA ...
## $ total.ascent : chr "1" "3" "18" NA ...
## $ total.descent : chr "10" "3" "12" NA ...
## $ avg.stride.length : num 0.69 0.81 0.6 NA NA NA NA 0.68 0.69 0.66 ...
## $ avg.vertical.ratio : num NA 6.3 NA NA NA NA NA NA NA NA ...
## $ avg.vertical.oscillation: num NA 7.5 NA NA NA NA NA NA 7.9 NA ...
## $ avg.ground.contact.time : int NA 271 NA NA NA NA NA NA 269 NA ...
## $ avg.gct.balance : chr NA "49.5% L / 50.5% R" NA NA ...
## $ avg.gap : chr "20:58" "8:39" "18:18" NA ...
## $ normalized.power.np : chr NA "211" NA NA ...
## $ training.stress.score : num NA NA NA NA NA NA NA NA NA NA ...
## $ avg.power : int NA 201 NA NA NA NA NA NA 208 NA ...
## $ max.power : chr NA "281" NA NA ...
## $ grit : num NA NA NA NA NA NA NA NA NA NA ...
## $ flow : num NA NA NA NA NA NA NA NA NA NA ...
## $ total.strokes : chr NA NA NA NA ...
## $ avg..swolf : int NA NA NA NA NA NA NA NA NA NA ...
## $ avg.stroke.rate : chr NA NA NA NA ...
## $ total.reps : chr NA NA NA NA ...
## $ total.sets : chr NA NA NA NA ...
## $ dive.time : chr "0:00" "0:00" "0:00" "0:00" ...
## $ min.temp : num 29 29 29 30 30 32 32 29 30 31 ...
## $ surface.interval : chr "0:00" "0:00" "0:00" "0:00" ...
## $ decompression : chr "No" "No" "No" "No" ...
## $ best.lap.time : chr "15:19.19" "06:37.33" "17:05.81" "00:03.28" ...
## $ number.of.laps : int 1 2 1 1 1 1 1 1 2 1 ...
## $ max.temp : num 31 30 31 30 34 33 33 32 32 32 ...
## $ avg.resp : chr NA NA NA NA ...
## $ min.resp : chr NA NA NA NA ...
## $ max.resp : chr NA NA NA NA ...
## $ moving.time : chr "00:10:06" "00:14:27" "00:11:41" "00:00:00" ...
## $ elapsed.time : chr "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
## $ min.elevation : chr "-21" "-13" "-28" "-17" ...
## $ max.elevation : chr "-11" "-10" "-11" "-17" ...
## $ pace.1 : num 19 8 17 NA 7.3 NA NA 26 8 14 ...
## $ pace.2 : num 39 42 13 NA NA NA NA 0 43 59 ...
## $ pace.m : num 20 9 17 NA NA NA NA 26 9 15 ...
## $ elapsed.time.t :Formal class 'Period' [package "lubridate"] with 6 slots
## .. ..@ .Data : num 19 13 6 3.3 3 40 36 49 58 42 ...
## .. ..@ year : num 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ month : num 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ day : num 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ hour : num 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ minute: num 15 15 17 0 43 57 36 23 15 13 ...
## $ elapsed.time.m : num 15 15 17 0 43 57 36 23 15 13 ...
## $ moving.time.t :Formal class 'Period' [package "lubridate"] with 6 slots
## .. ..@ .Data : num 6 27 41 0 14 40 36 29 21 12 ...
## .. ..@ year : num 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ month : num 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ day : num 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ hour : num 0 0 0 0 0 0 0 0 0 0 ...
## .. ..@ minute: num 10 14 11 0 10 57 36 9 15 12 ...
## $ moving.time.m : num 10 14 11 0 10 57 36 9 15 12 ...
## $ hour.minute : chr "20:17" "20:30" "20:04" "17:48" ...
## $ year.month : chr "2023-06" "2023-06" "2023-06" "2023-06" ...
## $ week.days : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 7 7 7 7 7 7 6 5 5 5 ...
## $ time.duration : num 15 15 17 0 24 58 37 19 16 14 ...
ncol(dfb) #xem cot
## [1] 58
nrow(dfb) # Dem so quan sat
## [1] 480
# Re check dfb
head(dfb)
## activity.type date favorite title
## 1 Walking 2023-06-25 20:37:17 true Quan Binh Thanh Walking
## 2 Running 2023-06-25 20:21:30 true Quan Binh Thanh Running
## 3 Walking 2023-06-25 20:04:04 true Quan Binh Thanh Walking
## 4 Other 2023-06-25 17:16:48 false Navigate
## 5 Other 2023-06-25 10:50:45 false Quan 1 Track Me
## 6 Strength Training 2023-06-25 09:18:06 true Strength
## distance calories time avg.hr max.hr aerobic.te avg.cadence max.cadence
## 1 0.78 58 00:15:19 94 120 0.4 74 247
## 2 1.75 114 00:15:13 127 138 2.6 168 186
## 3 0.99 71 00:17:06 93 126 0.7 97 158
## 4 0.00 NA 00:00:03.3 65 65 0.0 <NA> <NA>
## 5 2.94 37 00:24:05 69 103 0.1 <NA> <NA>
## 6 0.00 140 00:57:40 83 134 0.4 <NA> <NA>
## avg.pace best.pace total.ascent total.descent avg.stride.length
## 1 19:39 9:18 1 10 0.69
## 2 8:42 6:23 3 3 0.81
## 3 17:13 10:16 18 12 0.60
## 4 <NA> <NA> <NA> <NA> NA
## 5 7.3 30.2 4 7 NA
## 6 <NA> <NA> <NA> <NA> NA
## avg.vertical.ratio avg.vertical.oscillation avg.ground.contact.time
## 1 NA NA NA
## 2 6.3 7.5 271
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## avg.gct.balance avg.gap normalized.power.np training.stress.score avg.power
## 1 <NA> 20:58 <NA> NA NA
## 2 49.5% L / 50.5% R 8:39 211 NA 201
## 3 <NA> 18:18 <NA> NA NA
## 4 <NA> <NA> <NA> NA NA
## 5 <NA> 4:08 <NA> NA NA
## 6 <NA> <NA> <NA> NA NA
## max.power grit flow total.strokes avg..swolf avg.stroke.rate total.reps
## 1 <NA> NA NA <NA> NA <NA> <NA>
## 2 281 NA NA <NA> NA <NA> <NA>
## 3 <NA> NA NA <NA> NA <NA> <NA>
## 4 <NA> NA NA <NA> NA <NA> <NA>
## 5 <NA> NA NA <NA> NA <NA> <NA>
## 6 <NA> NA NA <NA> NA <NA> 24
## total.sets dive.time min.temp surface.interval decompression best.lap.time
## 1 <NA> 0:00 29 0:00 No 15:19.19
## 2 <NA> 0:00 29 0:00 No 06:37.33
## 3 <NA> 0:00 29 0:00 No 17:05.81
## 4 <NA> 0:00 30 0:00 No 00:03.28
## 5 <NA> 0:00 30 0:00 No 24:04.71
## 6 1 0:00 32 0:00 No 57:39.60
## number.of.laps max.temp avg.resp min.resp max.resp moving.time elapsed.time
## 1 1 31 <NA> <NA> <NA> 00:10:06 00:15:19
## 2 2 30 <NA> <NA> <NA> 00:14:27 00:15:13
## 3 1 31 <NA> <NA> <NA> 00:11:41 00:17:06
## 4 1 30 <NA> <NA> <NA> 00:00:00 00:00:03.3
## 5 1 34 <NA> <NA> <NA> 00:10:14 00:43:03
## 6 1 33 <NA> <NA> <NA> 00:57:40 00:57:40
## min.elevation max.elevation pace.1 pace.2 pace.m elapsed.time.t
## 1 -21 -11 19.0 39 20 15M 19S
## 2 -13 -10 8.0 42 9 15M 13S
## 3 -28 -11 17.0 13 17 17M 6S
## 4 -17 -17 NA NA NA 3.3S
## 5 -23 -15 7.3 NA NA 43M 3S
## 6 <NA> <NA> NA NA NA 57M 40S
## elapsed.time.m moving.time.t moving.time.m hour.minute year.month week.days
## 1 15 10M 6S 10 20:17 2023-06 Sunday
## 2 15 14M 27S 14 20:30 2023-06 Sunday
## 3 17 11M 41S 11 20:04 2023-06 Sunday
## 4 0 0S 0 17:48 2023-06 Sunday
## 5 43 10M 14S 10 10:45 2023-06 Sunday
## 6 57 57M 40S 57 09:06 2023-06 Sunday
## time.duration
## 1 15
## 2 15
## 3 17
## 4 0
## 5 24
## 6 58
library(ggplot2)
# Select the running or walking activities in Binh Thanh.
# Rows are kept only when stride length and heart rate are present, calories
# are below 500 and elapsed time is at most 120 minutes; a row whose condition
# evaluates to NA (e.g. missing calories) is dropped as well.
df_dy <- dfb %>%
  filter(
    !is.na(avg.stride.length),
    !is.na(avg.hr),
    stringr::str_detect(title, "Binh Thanh"),
    calories < 500,
    elapsed.time.m <= 120,
    title %in% c("Quan Binh Thanh Running", "Quan Binh Thanh Walking")
  ) %>%
  mutate(
    title = as.factor(title),
    calories = as.numeric(calories),
    distance = as.numeric(distance)
  ) %>%
  # Applied to the whole data frame, so unused levels of every factor column
  # (not only title) are dropped.
  droplevels()
#table1
table1(~avg.hr+distance+calories+pace.m+moving.time.m+elapsed.time.m|title, data=df_dy )
Quan Binh Thanh Running (N=81) |
Quan Binh Thanh Walking (N=99) |
Overall (N=180) |
|
---|---|---|---|
avg.hr | |||
Mean (SD) | 119 (15.6) | 98.3 (10.8) | 108 (16.9) |
Median [Min, Max] | 123 [88.0, 155] | 98.0 [67.0, 128] | 105 [67.0, 155] |
distance | |||
Mean (SD) | 2.04 (0.987) | 1.23 (0.925) | 1.60 (1.03) |
Median [Min, Max] | 1.82 [0.470, 5.33] | 0.920 [0.170, 4.91] | 1.32 [0.170, 5.33] |
calories | |||
Mean (SD) | 132 (63.5) | 75.7 (51.9) | 101 (63.9) |
Median [Min, Max] | 118 [22.0, 354] | 58.0 [15.0, 310] | 82.5 [15.0, 354] |
pace.m | |||
Mean (SD) | 12.0 (3.03) | 15.1 (2.92) | 13.7 (3.34) |
Median [Min, Max] | 11.0 [7.00, 20.0] | 15.0 [10.0, 26.0] | 14.0 [7.00, 26.0] |
Missing | 0 (0%) | 1 (1.0%) | 1 (0.6%) |
moving.time.m | |||
Mean (SD) | 22.9 (13.6) | 14.6 (10.7) | 18.3 (12.7) |
Median [Min, Max] | 20.0 [3.00, 67.0] | 10.0 [2.00, 56.0] | 14.0 [2.00, 67.0] |
elapsed.time.m | |||
Mean (SD) | 25.4 (16.0) | 17.5 (12.2) | 21.1 (14.5) |
Median [Min, Max] | 21.0 [3.00, 81.0] | 14.0 [3.00, 72.0] | 15.0 [3.00, 81.0] |
# Make sure distance and calories are numeric before plotting.
df_dy <- df_dy %>%
  mutate(across(all_of(c("distance", "calories")), as.numeric))
# Box plots of calories, distance and pace for each activity title.
ggplot(data = df_dy, aes(x = title, y = calories)) +
  geom_boxplot()
ggplot(data = df_dy, aes(x = title, y = distance)) +
  geom_boxplot()
ggplot(data = df_dy, aes(x = title, y = pace.m)) +
  geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).
# only running or walking in Binh Thanh
df_dy$calories
## [1] 58 114 71 75 120 55 59 94 54 119 84 119 38 133 62 179 45 119
## [19] 41 54 117 55 172 36 100 50 57 90 59 54 127 46 66 111 56 61
## [37] 116 32 70 113 75 124 240 42 113 59 91 37 52 156 43 111 100 90
## [55] 66 124 65 277 180 35 103 51 51 83 55 37 95 62 164 80 180 53
## [73] 34 202 58 70 262 47 267 38 184 61 55 72 56 114 43 136 73 41
## [91] 53 54 51 52 46 115 39 52 89 62 65 22 55 41 50 177 35 31
## [109] 15 47 59 36 117 79 50 76 185 63 71 81 129 78 107 164 100 135
## [127] 103 143 310 67 96 81 106 110 54 93 50 49 69 42 55 97 153 264
## [145] 101 155 164 53 33 230 184 146 144 85 77 111 36 82 90 326 120 160
## [163] 278 143 195 145 148 164 128 96 170 185 115 236 158 118 251 135 354 101
df_dy %>% pull(calories) %>% sort()
## [1] 15 22 31 32 33 34 35 35 36 36 36 37 37 38 38 39 41 41
## [19] 41 42 42 43 43 45 46 46 47 47 49 50 50 50 50 51 51 51
## [37] 52 52 52 53 53 53 54 54 54 54 54 55 55 55 55 55 55 56
## [55] 56 57 58 58 59 59 59 59 61 61 62 62 62 63 65 65 66 66
## [73] 67 69 70 70 71 71 72 73 75 75 76 77 78 79 80 81 81 82
## [91] 83 84 85 89 90 90 90 91 93 94 95 96 96 97 100 100 100 101
## [109] 101 103 103 106 107 110 111 111 111 113 113 114 114 115 115 116 117 117
## [127] 118 119 119 119 120 120 124 124 127 128 129 133 135 135 136 143 143 144
## [145] 145 146 148 153 155 156 158 160 164 164 164 164 170 172 177 179 180 180
## [163] 184 184 185 185 195 202 230 236 240 251 262 264 267 277 278 310 326 354
# Overall box plot of calories (the stray empty `fill =` argument is removed).
df_dy %>%
  ggplot(aes(x = 0, y = calories)) +
  geom_boxplot()
# Same box plot with a single-level factor on the x axis.
ggplot(df_dy, aes(x = factor(0), y = calories)) +
  geom_boxplot()
# Calories against distance, coloured by activity type, with a smoother.
df_dy %>%
  ggplot(aes(x = distance, y = calories, col = activity.type)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#Phan tich hoi quy voi base R
df_dy$avg.cadence <- as.numeric(df_dy$avg.cadence)
typeof(df_dy$calories)
## [1] "double"
df_dy$calories <- as.numeric(df_dy$calories)
ggplot(df_dy, aes(avg.hr, calories))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(df_dy, aes(x=calories))+geom_boxplot()
# Variables under study.
x <- c("calories", "time.duration", "avg.hr", "avg.cadence", "avg.stride.length")
# Preview the selected columns. `df_dy$x` looks for a column literally named
# "x" and yields NULL, so the data frame is indexed with the vector instead.
glimpse(df_dy[, x])
#Kiem tra
summary(df_dy[,x])
## calories time.duration avg.hr avg.cadence
## Min. : 15.0 Min. : 3.00 Min. : 67.0 Min. : 27.0
## 1st Qu.: 54.0 1st Qu.:12.00 1st Qu.: 95.0 1st Qu.: 95.0
## Median : 82.5 Median :16.00 Median :104.5 Median :106.5
## Mean :101.3 Mean :21.42 Mean :107.8 Mean :116.4
## 3rd Qu.:127.2 3rd Qu.:27.25 3rd Qu.:122.0 3rd Qu.:150.5
## Max. :354.0 Max. :81.00 Max. :155.0 Max. :180.0
## avg.stride.length
## Min. :0.2000
## 1st Qu.:0.5975
## Median :0.6800
## Mean :0.6643
## 3rd Qu.:0.7300
## Max. :1.7100
cor(df_dy[,x])
## calories time.duration avg.hr avg.cadence
## calories 1.00000000 0.94143978 0.1551077 0.1055287
## time.duration 0.94143978 1.00000000 -0.1190087 -0.1584721
## avg.hr 0.15510769 -0.11900866 1.0000000 0.7592891
## avg.cadence 0.10552873 -0.15847210 0.7592891 1.0000000
## avg.stride.length -0.01940753 0.08290801 -0.1875751 -0.3890326
## avg.stride.length
## calories -0.01940753
## time.duration 0.08290801
## avg.hr -0.18757514
## avg.cadence -0.38903261
## avg.stride.length 1.00000000
# Draw the correlation (scatter) plots: calories against heart rate,
# duration and distance.
plot(x = df_dy$calories, y = df_dy$avg.hr)
plot(x = df_dy$calories, y = df_dy$time.duration)
plot(x = df_dy$calories, y = df_dy$distance)
# Calories against distance, one colour and one smoother per title.
ggplot(df_dy, aes(x = distance, y = calories, colour = title)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Calories against average heart rate, one colour and one smoother per title.
ggplot(df_dy, aes(x = avg.hr, y = calories, colour = title)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
table1(~avg.hr +calories+distance+avg.stride.length+avg.cadence+avg.vertical.ratio|title, data = df_dy)
Quan Binh Thanh Running (N=81) |
Quan Binh Thanh Walking (N=99) |
Overall (N=180) |
|
---|---|---|---|
avg.hr | |||
Mean (SD) | 119 (15.6) | 98.3 (10.8) | 108 (16.9) |
Median [Min, Max] | 123 [88.0, 155] | 98.0 [67.0, 128] | 105 [67.0, 155] |
calories | |||
Mean (SD) | 132 (63.5) | 75.7 (51.9) | 101 (63.9) |
Median [Min, Max] | 118 [22.0, 354] | 58.0 [15.0, 310] | 82.5 [15.0, 354] |
distance | |||
Mean (SD) | 2.04 (0.987) | 1.23 (0.925) | 1.60 (1.03) |
Median [Min, Max] | 1.82 [0.470, 5.33] | 0.920 [0.170, 4.91] | 1.32 [0.170, 5.33] |
avg.stride.length | |||
Mean (SD) | 0.624 (0.186) | 0.697 (0.102) | 0.664 (0.150) |
Median [Min, Max] | 0.650 [0.200, 1.71] | 0.710 [0.450, 0.970] | 0.680 [0.200, 1.71] |
avg.cadence | |||
Mean (SD) | 139 (35.6) | 98.3 (16.2) | 116 (33.4) |
Median [Min, Max] | 161 [71.0, 180] | 100 [27.0, 124] | 107 [27.0, 180] |
avg.vertical.ratio | |||
Mean (SD) | 12.4 (6.47) | NA (NA) | 12.4 (6.47) |
Median [Min, Max] | 10.1 [3.90, 38.0] | NA [NA, NA] | 10.1 [3.90, 38.0] |
Missing | 45 (55.6%) | 99 (100%) | 144 (80.0%) |
# Regression: linear model of calories on distance, duration, average heart
# rate and activity title.
model <- lm(
  calories ~ distance + time.duration + avg.hr + title,
  data = df_dy
)
summary(model)
##
## Call:
## lm(formula = calories ~ distance + time.duration + avg.hr + title,
## data = df_dy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.706 -4.439 0.863 4.812 30.323
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -39.28340 7.46262 -5.264 4.09e-07 ***
## distance 37.51725 2.36660 15.853 < 2e-16 ***
## time.duration 1.65610 0.17101 9.684 < 2e-16 ***
## avg.hr 0.44015 0.06034 7.295 9.99e-12 ***
## titleQuan Binh Thanh Walking -4.26879 1.78732 -2.388 0.018 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.36 on 175 degrees of freedom
## Multiple R-squared: 0.9833, Adjusted R-squared: 0.9829
## F-statistic: 2569 on 4 and 175 DF, p-value: < 2.2e-16
# Linear model of average heart rate on distance, duration and activity title.
m.hr <- lm(
  avg.hr ~ distance + time.duration + title,
  data = df_dy
)
summary(m.hr)
##
## Call:
## lm(formula = avg.hr ~ distance + time.duration + title, data = df_dy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.0595 -6.7174 -0.0206 7.8080 28.6673
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 120.5232 2.0923 57.602 < 2e-16 ***
## distance 20.6731 2.5124 8.228 4.09e-14 ***
## time.duration -1.6910 0.1714 -9.864 < 2e-16 ***
## titleQuan Binh Thanh Walking -17.3937 1.8073 -9.624 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.44 on 176 degrees of freedom
## Multiple R-squared: 0.624, Adjusted R-squared: 0.6176
## F-statistic: 97.38 on 3 and 176 DF, p-value: < 2.2e-16