R Markdown

require(ggplot2)
## Loading required package: ggplot2
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(haven)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# main script


####
# Load the raw activity export; text columns stay character (no factors).
dfB <- read.csv(
  file = "Activities.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Print the structure (column names, types, first values) of the import.
str(dfB)
## 'data.frame':    480 obs. of  47 variables:
##  $ Activity.Type           : chr  "Walking" "Running" "Walking" "Other" ...
##  $ Date                    : chr  "2023-06-25 20:37:17" "2023-06-25 20:21:30" "2023-06-25 20:04:04" "2023-06-25 17:16:48" ...
##  $ Favorite                : chr  "true" "true" "true" "false" ...
##  $ Title                   : chr  "Quan Binh Thanh Walking" "Quan Binh Thanh Running" "Quan Binh Thanh Walking" "Navigate" ...
##  $ Distance                : chr  "0.78" "1.75" "0.99" "0.00" ...
##  $ Calories                : chr  "58" "114" "71" "--" ...
##  $ Time                    : chr  "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
##  $ Avg.HR                  : int  94 127 93 65 69 83 85 101 131 91 ...
##  $ Max.HR                  : int  120 138 126 65 103 134 110 128 151 101 ...
##  $ Aerobic.TE              : chr  "0.4" "2.6" "0.7" "0.0" ...
##  $ Avg.Cadence             : chr  "74" "168" "97" "--" ...
##  $ Max.Cadence             : chr  "247" "186" "158" "--" ...
##  $ Avg.Pace                : chr  "19:39" "8:42" "17:13" "--" ...
##  $ Best.Pace               : chr  "9:18" "6:23" "10:16" "--" ...
##  $ Total.Ascent            : chr  "1" "3" "18" "--" ...
##  $ Total.Descent           : chr  "10" "3" "12" "--" ...
##  $ Avg.Stride.Length       : num  0.69 0.81 0.6 0 0 0 0 0.68 0.69 0.66 ...
##  $ Avg.Vertical.Ratio      : num  0 6.3 0 0 0 0 0 0 0 0 ...
##  $ Avg.Vertical.Oscillation: num  0 7.5 0 0 0 0 0 0 7.9 0 ...
##  $ Avg.Ground.Contact.Time : int  0 271 0 0 0 0 0 0 269 0 ...
##  $ Avg.GCT.Balance         : chr  "--" "49.5% L / 50.5% R" "--" "--" ...
##  $ Avg.GAP                 : chr  "20:58" "8:39" "18:18" "--" ...
##  $ Normalized.Power...NP.. : chr  "--" "211" "--" "--" ...
##  $ Training.Stress.Score.  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Avg.Power               : int  0 201 0 0 0 0 0 0 208 0 ...
##  $ Max.Power               : chr  "0" "281" "0" "0" ...
##  $ Grit                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Flow                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Total.Strokes           : chr  "--" "--" "--" "--" ...
##  $ Avg..Swolf              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Avg.Stroke.Rate         : chr  "0" "0" "0" "0" ...
##  $ Total.Reps              : chr  "0" "0" "0" "0" ...
##  $ Total.Sets              : chr  "--" "--" "--" "--" ...
##  $ Dive.Time               : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ Min.Temp                : num  29 29 29 30 30 32 32 29 30 31 ...
##  $ Surface.Interval        : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ Decompression           : chr  "No" "No" "No" "No" ...
##  $ Best.Lap.Time           : chr  "15:19.19" "06:37.33" "17:05.81" "00:03.28" ...
##  $ Number.of.Laps          : int  1 2 1 1 1 1 1 1 2 1 ...
##  $ Max.Temp                : num  31 30 31 30 34 33 33 32 32 32 ...
##  $ Avg.Resp                : chr  "--" "--" "--" "--" ...
##  $ Min.Resp                : chr  "--" "--" "--" "--" ...
##  $ Max.Resp                : chr  "--" "--" "--" "--" ...
##  $ Moving.Time             : chr  "00:10:06" "00:14:27" "00:11:41" "00:00:00" ...
##  $ Elapsed.Time            : chr  "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
##  $ Min.Elevation           : chr  "-21" "-13" "-28" "-17" ...
##  $ Max.Elevation           : chr  "-11" "-10" "-11" "-17" ...
class(dfB)
## [1] "data.frame"
# Work on a copy (dfb) so the raw import (dfB) stays untouched.
dfb <- dfB

# Lower-case all column names.
names(dfb) <- tolower(names(dfb))
# Tidy the two column names that end in stray dots after import.
names(dfb)[names(dfb) == "normalized.power...np.."] <- "normalized.power.np"
names(dfb)[names(dfb) == "training.stress.score."] <- "training.stress.score"
# Treat 0 and the "--" placeholder as missing values.
# NOTE(review): in character columns only the exact string "0" matches, so
# values such as "0.00" (distance) or "0.0" (aerobic.te) are left as they are.
dfb[dfb == 0] <- NA
dfb[dfb == "--"] <- NA
# Convert calories to numeric. Strip "," first so that values written with a
# thousands separator (presumably e.g. "1,024" - confirm against the export)
# are parsed instead of being silently coerced to NA.
dfb$calories <- as.numeric(gsub(",", "", dfb$calories, fixed = TRUE))
## Warning: NAs introduced by coercion
# Print the structure of dfb after the clean-up steps above.
str(dfb)
## 'data.frame':    480 obs. of  47 variables:
##  $ activity.type           : chr  "Walking" "Running" "Walking" "Other" ...
##  $ date                    : chr  "2023-06-25 20:37:17" "2023-06-25 20:21:30" "2023-06-25 20:04:04" "2023-06-25 17:16:48" ...
##  $ favorite                : chr  "true" "true" "true" "false" ...
##  $ title                   : chr  "Quan Binh Thanh Walking" "Quan Binh Thanh Running" "Quan Binh Thanh Walking" "Navigate" ...
##  $ distance                : chr  "0.78" "1.75" "0.99" "0.00" ...
##  $ calories                : num  58 114 71 NA 37 140 93 75 120 55 ...
##  $ time                    : chr  "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
##  $ avg.hr                  : int  94 127 93 65 69 83 85 101 131 91 ...
##  $ max.hr                  : int  120 138 126 65 103 134 110 128 151 101 ...
##  $ aerobic.te              : chr  "0.4" "2.6" "0.7" "0.0" ...
##  $ avg.cadence             : chr  "74" "168" "97" NA ...
##  $ max.cadence             : chr  "247" "186" "158" NA ...
##  $ avg.pace                : chr  "19:39" "8:42" "17:13" NA ...
##  $ best.pace               : chr  "9:18" "6:23" "10:16" NA ...
##  $ total.ascent            : chr  "1" "3" "18" NA ...
##  $ total.descent           : chr  "10" "3" "12" NA ...
##  $ avg.stride.length       : num  0.69 0.81 0.6 NA NA NA NA 0.68 0.69 0.66 ...
##  $ avg.vertical.ratio      : num  NA 6.3 NA NA NA NA NA NA NA NA ...
##  $ avg.vertical.oscillation: num  NA 7.5 NA NA NA NA NA NA 7.9 NA ...
##  $ avg.ground.contact.time : int  NA 271 NA NA NA NA NA NA 269 NA ...
##  $ avg.gct.balance         : chr  NA "49.5% L / 50.5% R" NA NA ...
##  $ avg.gap                 : chr  "20:58" "8:39" "18:18" NA ...
##  $ normalized.power.np     : chr  NA "211" NA NA ...
##  $ training.stress.score   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ avg.power               : int  NA 201 NA NA NA NA NA NA 208 NA ...
##  $ max.power               : chr  NA "281" NA NA ...
##  $ grit                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ flow                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total.strokes           : chr  NA NA NA NA ...
##  $ avg..swolf              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ avg.stroke.rate         : chr  NA NA NA NA ...
##  $ total.reps              : chr  NA NA NA NA ...
##  $ total.sets              : chr  NA NA NA NA ...
##  $ dive.time               : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ min.temp                : num  29 29 29 30 30 32 32 29 30 31 ...
##  $ surface.interval        : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ decompression           : chr  "No" "No" "No" "No" ...
##  $ best.lap.time           : chr  "15:19.19" "06:37.33" "17:05.81" "00:03.28" ...
##  $ number.of.laps          : int  1 2 1 1 1 1 1 1 2 1 ...
##  $ max.temp                : num  31 30 31 30 34 33 33 32 32 32 ...
##  $ avg.resp                : chr  NA NA NA NA ...
##  $ min.resp                : chr  NA NA NA NA ...
##  $ max.resp                : chr  NA NA NA NA ...
##  $ moving.time             : chr  "00:10:06" "00:14:27" "00:11:41" "00:00:00" ...
##  $ elapsed.time            : chr  "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
##  $ min.elevation           : chr  "-21" "-13" "-28" "-17" ...
##  $ max.elevation           : chr  "-11" "-10" "-11" "-17" ...
typeof(dfb)
## [1] "list"
typeof(dfb$date)
## [1] "character"
# Derived variables: pace.
# avg.pace holds "mm:ss" text. pace.1 = minutes part, pace.2 = seconds part,
# pace.m = pace in minutes, rounded to the nearest whole minute.
# NOTE(review): some rows hold a value without ":" (e.g. "7.3"); for those
# pace.1 receives the whole value while pace.2 and pace.m are NA.
# `fill = "right"` states that right-padding with NA explicitly rather than
# relying on the default, which pads the same way but raises a warning.
dfb <- dfb %>%
  mutate(avg.pace1 = avg.pace) %>%
  separate(
    col = avg.pace1,
    into = c("pace.1", "pace.2"),
    sep = ":",
    fill = "right"
  ) %>%
  mutate(
    across(c(pace.1, pace.2), as.numeric),
    pace.m = round((pace.1 * 60 + pace.2) / 60, digits = 0)
  )
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 102 rows [5, 24, 25, 46,
## 47, 48, 54, 62, 64, 66, 67, 68, 110, 131, 132, 133, 138, 154, 162, 175, ...].
typeof(dfb$pace.m)
## [1] "double"

# Parse the "hh:mm:ss" text durations into lubridate Periods (*.t) and
# express each as minutes (*.m) from the hour and minute components only;
# the seconds component is not used.
dfb <- dfb %>%
  mutate(
    elapsed.time.t = hms(elapsed.time),
    elapsed.time.m = hour(elapsed.time.t) * 60 + minute(elapsed.time.t),
    moving.time.t = hms(moving.time),
    moving.time.m = hour(moving.time.t) * 60 + minute(moving.time.t)
  )

# Print the moving time (minutes) of every activity.
dfb$moving.time.m
##   [1]  10  14  11   0  10  57  36   9  15  12  11  14  10  51  20  11  26   6
##  [19]  16  12  22  38  61  73  47   8  15   9   9  14  10  38  28   5  15  10
##  [37]  62  10  12  12   8  16   8  12  10  43  33  23   0  20  68  17  18 193
##  [55]  10  12  17  16  10   0  10 345  31  92   0 110  63 126   9  14  11  10
##  [73]  15   6  34  33  11  14  12  16   6  28  52   7  14  10  48  19   6  10
##  [91]  61  30  42   9  15  20  25  30   3  50  45  11  20  12  30  13   8   7
## [109]  15  10  52  31   7  13  10  77  10   9  10  76   8  15  11  29  30  18
## [127]  23  32  42  42  18  31  25  34  12   3  10 105   6  26  10  15  10  35
## [145]   7   0  56   6  31  13  10  10  10  40  21  18  29   1  10   9  14 160
## [163]  21   9  25   9   8   8   7  10   7   7  20   9 572 575  10  11  12 518
## [181] 434  47  26  32  32   7  21  38   1   4  11  16  20   0   5   7   1   6
## [199]   4  15  40  27   9   1   1   3   0   1   0   0  10   3  13  11   8  29
## [217]   9   3   3   9   8   8  20 629  29  14   7  13  49  32  13  12  33  68
## [235]  21  21  19  40  15   0   0  41  39  25  31  17  17  20  15  17  29  36
## [253]  31  51  29  24  19   0  34  31  19  11  15  20  10  21  15  68  23  25
## [271]  30  12  14  25  11  16  15   5  29  16  14  21   8   7   5  19  12   3
## [289]  35  29  30  10  12   8   8  32  17  19   6  11  35   9   2  22   4   1
## [307]   8   5   1  13   7   7   4  12  20  15  15   5   9  12   2   5  59   4
## [325]  19  12  55  15  29  22  29  18  20   3  66  11   6  30  29  48  27  31
## [343]   9   4  56   3   2  28  29  30  13  10   8  13  21  41  14  10  10  17
## [361]   8   8  25  21  27 126   0   4   4  20  29  36  40  28   4  31  65   7
## [379]  28  29  52  15  11  36  24   7  56  19  33  50  20  40  67   7   6  55
## [397]  25  32  29  25  58  68  20  62  26  31  67  52  33  55  31  32  35  24
## [415]   4   2  23  29  53  18  39  39  44  25  52  33  55  22  52  41  33  33
## [433]  37  51   5  25   3  25  61  21  43 196  30  48  47  29 131 220  61 107
## [451]  77   0  41  29  91  82  62  28  28  37  54  53  60  32  68  72  55   0
## [469]  61  47  34  25   0  90  27   0  28   8  58  52
#dfb$moving.time.t <- dfb$moving.time 

#dfb$moving.time.t <-  hms(dfb$moving.time)
#hour(dfb$moving.time.t) *60 + minute(dfb$moving.time.t)
#typeof(dfb$moving.time1) 

# Parse the timestamp text into POSIXct, the recommended class for date-time
# columns in a data frame (strptime() would return the list-based POSIXlt).
dfb$date <- as.POSIXct(dfb$date, format = "%Y-%m-%d %H:%M:%S")
# Date/time helper variables.
# Start time of day as "HH:MM" (%M is minutes; %S would be seconds).
dfb$hour.minute <- format(dfb$date, "%H:%M")

dfb$year.month <- format(dfb$date, "%Y-%m")
# Day of week as an ordered factor running Monday < ... < Sunday.
# NOTE(review): weekdays() returns names in the session locale, so these
# English levels only match in an English locale - confirm before reuse.
dfb$week.days <- factor(
  weekdays(dfb$date),
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  ),
  ordered = TRUE
)

#table1(~distance|week.days, data = dfb)
####______
# Duration of each activity in minutes, from the "time" column: hms() parses
# the text, period_to_seconds() converts it to seconds; the value is rounded
# to whole seconds and then to whole minutes.
dfb$time.duration <- round(
  round(period_to_seconds(hms(dfb$time)), digits = 0) / 60,
  digits = 0
)

###__________
# Data check
# Structure
str(dfb)
## 'data.frame':    480 obs. of  58 variables:
##  $ activity.type           : chr  "Walking" "Running" "Walking" "Other" ...
##  $ date                    : POSIXlt, format: "2023-06-25 20:37:17" "2023-06-25 20:21:30" ...
##  $ favorite                : chr  "true" "true" "true" "false" ...
##  $ title                   : chr  "Quan Binh Thanh Walking" "Quan Binh Thanh Running" "Quan Binh Thanh Walking" "Navigate" ...
##  $ distance                : chr  "0.78" "1.75" "0.99" "0.00" ...
##  $ calories                : num  58 114 71 NA 37 140 93 75 120 55 ...
##  $ time                    : chr  "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
##  $ avg.hr                  : int  94 127 93 65 69 83 85 101 131 91 ...
##  $ max.hr                  : int  120 138 126 65 103 134 110 128 151 101 ...
##  $ aerobic.te              : chr  "0.4" "2.6" "0.7" "0.0" ...
##  $ avg.cadence             : chr  "74" "168" "97" NA ...
##  $ max.cadence             : chr  "247" "186" "158" NA ...
##  $ avg.pace                : chr  "19:39" "8:42" "17:13" NA ...
##  $ best.pace               : chr  "9:18" "6:23" "10:16" NA ...
##  $ total.ascent            : chr  "1" "3" "18" NA ...
##  $ total.descent           : chr  "10" "3" "12" NA ...
##  $ avg.stride.length       : num  0.69 0.81 0.6 NA NA NA NA 0.68 0.69 0.66 ...
##  $ avg.vertical.ratio      : num  NA 6.3 NA NA NA NA NA NA NA NA ...
##  $ avg.vertical.oscillation: num  NA 7.5 NA NA NA NA NA NA 7.9 NA ...
##  $ avg.ground.contact.time : int  NA 271 NA NA NA NA NA NA 269 NA ...
##  $ avg.gct.balance         : chr  NA "49.5% L / 50.5% R" NA NA ...
##  $ avg.gap                 : chr  "20:58" "8:39" "18:18" NA ...
##  $ normalized.power.np     : chr  NA "211" NA NA ...
##  $ training.stress.score   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ avg.power               : int  NA 201 NA NA NA NA NA NA 208 NA ...
##  $ max.power               : chr  NA "281" NA NA ...
##  $ grit                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ flow                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total.strokes           : chr  NA NA NA NA ...
##  $ avg..swolf              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ avg.stroke.rate         : chr  NA NA NA NA ...
##  $ total.reps              : chr  NA NA NA NA ...
##  $ total.sets              : chr  NA NA NA NA ...
##  $ dive.time               : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ min.temp                : num  29 29 29 30 30 32 32 29 30 31 ...
##  $ surface.interval        : chr  "0:00" "0:00" "0:00" "0:00" ...
##  $ decompression           : chr  "No" "No" "No" "No" ...
##  $ best.lap.time           : chr  "15:19.19" "06:37.33" "17:05.81" "00:03.28" ...
##  $ number.of.laps          : int  1 2 1 1 1 1 1 1 2 1 ...
##  $ max.temp                : num  31 30 31 30 34 33 33 32 32 32 ...
##  $ avg.resp                : chr  NA NA NA NA ...
##  $ min.resp                : chr  NA NA NA NA ...
##  $ max.resp                : chr  NA NA NA NA ...
##  $ moving.time             : chr  "00:10:06" "00:14:27" "00:11:41" "00:00:00" ...
##  $ elapsed.time            : chr  "00:15:19" "00:15:13" "00:17:06" "00:00:03.3" ...
##  $ min.elevation           : chr  "-21" "-13" "-28" "-17" ...
##  $ max.elevation           : chr  "-11" "-10" "-11" "-17" ...
##  $ pace.1                  : num  19 8 17 NA 7.3 NA NA 26 8 14 ...
##  $ pace.2                  : num  39 42 13 NA NA NA NA 0 43 59 ...
##  $ pace.m                  : num  20 9 17 NA NA NA NA 26 9 15 ...
##  $ elapsed.time.t          :Formal class 'Period' [package "lubridate"] with 6 slots
##   .. ..@ .Data : num  19 13 6 3.3 3 40 36 49 58 42 ...
##   .. ..@ year  : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ month : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ day   : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ hour  : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ minute: num  15 15 17 0 43 57 36 23 15 13 ...
##  $ elapsed.time.m          : num  15 15 17 0 43 57 36 23 15 13 ...
##  $ moving.time.t           :Formal class 'Period' [package "lubridate"] with 6 slots
##   .. ..@ .Data : num  6 27 41 0 14 40 36 29 21 12 ...
##   .. ..@ year  : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ month : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ day   : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ hour  : num  0 0 0 0 0 0 0 0 0 0 ...
##   .. ..@ minute: num  10 14 11 0 10 57 36 9 15 12 ...
##  $ moving.time.m           : num  10 14 11 0 10 57 36 9 15 12 ...
##  $ hour.minute             : chr  "20:17" "20:30" "20:04" "17:48" ...
##  $ year.month              : chr  "2023-06" "2023-06" "2023-06" "2023-06" ...
##  $ week.days               : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 7 7 7 7 7 7 6 5 5 5 ...
##  $ time.duration           : num  15 15 17 0 24 58 37 19 16 14 ...
ncol(dfb) # number of columns
## [1] 58
nrow(dfb) # number of observations (rows)
## [1] 480
# Preview the first rows of dfb
head(dfb)
##       activity.type                date favorite                   title
## 1           Walking 2023-06-25 20:37:17     true Quan Binh Thanh Walking
## 2           Running 2023-06-25 20:21:30     true Quan Binh Thanh Running
## 3           Walking 2023-06-25 20:04:04     true Quan Binh Thanh Walking
## 4             Other 2023-06-25 17:16:48    false                Navigate
## 5             Other 2023-06-25 10:50:45    false         Quan 1 Track Me
## 6 Strength Training 2023-06-25 09:18:06     true                Strength
##   distance calories       time avg.hr max.hr aerobic.te avg.cadence max.cadence
## 1     0.78       58   00:15:19     94    120        0.4          74         247
## 2     1.75      114   00:15:13    127    138        2.6         168         186
## 3     0.99       71   00:17:06     93    126        0.7          97         158
## 4     0.00       NA 00:00:03.3     65     65        0.0        <NA>        <NA>
## 5     2.94       37   00:24:05     69    103        0.1        <NA>        <NA>
## 6     0.00      140   00:57:40     83    134        0.4        <NA>        <NA>
##   avg.pace best.pace total.ascent total.descent avg.stride.length
## 1    19:39      9:18            1            10              0.69
## 2     8:42      6:23            3             3              0.81
## 3    17:13     10:16           18            12              0.60
## 4     <NA>      <NA>         <NA>          <NA>                NA
## 5      7.3      30.2            4             7                NA
## 6     <NA>      <NA>         <NA>          <NA>                NA
##   avg.vertical.ratio avg.vertical.oscillation avg.ground.contact.time
## 1                 NA                       NA                      NA
## 2                6.3                      7.5                     271
## 3                 NA                       NA                      NA
## 4                 NA                       NA                      NA
## 5                 NA                       NA                      NA
## 6                 NA                       NA                      NA
##     avg.gct.balance avg.gap normalized.power.np training.stress.score avg.power
## 1              <NA>   20:58                <NA>                    NA        NA
## 2 49.5% L / 50.5% R    8:39                 211                    NA       201
## 3              <NA>   18:18                <NA>                    NA        NA
## 4              <NA>    <NA>                <NA>                    NA        NA
## 5              <NA>    4:08                <NA>                    NA        NA
## 6              <NA>    <NA>                <NA>                    NA        NA
##   max.power grit flow total.strokes avg..swolf avg.stroke.rate total.reps
## 1      <NA>   NA   NA          <NA>         NA            <NA>       <NA>
## 2       281   NA   NA          <NA>         NA            <NA>       <NA>
## 3      <NA>   NA   NA          <NA>         NA            <NA>       <NA>
## 4      <NA>   NA   NA          <NA>         NA            <NA>       <NA>
## 5      <NA>   NA   NA          <NA>         NA            <NA>       <NA>
## 6      <NA>   NA   NA          <NA>         NA            <NA>         24
##   total.sets dive.time min.temp surface.interval decompression best.lap.time
## 1       <NA>      0:00       29             0:00            No      15:19.19
## 2       <NA>      0:00       29             0:00            No      06:37.33
## 3       <NA>      0:00       29             0:00            No      17:05.81
## 4       <NA>      0:00       30             0:00            No      00:03.28
## 5       <NA>      0:00       30             0:00            No      24:04.71
## 6          1      0:00       32             0:00            No      57:39.60
##   number.of.laps max.temp avg.resp min.resp max.resp moving.time elapsed.time
## 1              1       31     <NA>     <NA>     <NA>    00:10:06     00:15:19
## 2              2       30     <NA>     <NA>     <NA>    00:14:27     00:15:13
## 3              1       31     <NA>     <NA>     <NA>    00:11:41     00:17:06
## 4              1       30     <NA>     <NA>     <NA>    00:00:00   00:00:03.3
## 5              1       34     <NA>     <NA>     <NA>    00:10:14     00:43:03
## 6              1       33     <NA>     <NA>     <NA>    00:57:40     00:57:40
##   min.elevation max.elevation pace.1 pace.2 pace.m elapsed.time.t
## 1           -21           -11   19.0     39     20        15M 19S
## 2           -13           -10    8.0     42      9        15M 13S
## 3           -28           -11   17.0     13     17         17M 6S
## 4           -17           -17     NA     NA     NA           3.3S
## 5           -23           -15    7.3     NA     NA         43M 3S
## 6          <NA>          <NA>     NA     NA     NA        57M 40S
##   elapsed.time.m moving.time.t moving.time.m hour.minute year.month week.days
## 1             15        10M 6S            10       20:17    2023-06    Sunday
## 2             15       14M 27S            14       20:30    2023-06    Sunday
## 3             17       11M 41S            11       20:04    2023-06    Sunday
## 4              0            0S             0       17:48    2023-06    Sunday
## 5             43       10M 14S            10       10:45    2023-06    Sunday
## 6             57       57M 40S            57       09:06    2023-06    Sunday
##   time.duration
## 1            15
## 2            15
## 3            17
## 4             0
## 5            24
## 6            58
# Analysis subset: running and walking activities titled "Quan Binh Thanh".
# Rows must have a stride length and an average heart rate, fewer than 500
# calories and an elapsed time of at most 120 minutes.
# Matching the two exact titles makes a separate "Binh Thanh" substring
# filter unnecessary, and calories is already numeric at this point, so only
# distance needs converting.
df_dy <- dfb %>%
  filter(
    title %in% c("Quan Binh Thanh Running", "Quan Binh Thanh Walking"),
    !is.na(avg.stride.length),
    !is.na(avg.hr),
    calories < 500,
    elapsed.time.m <= 120
  ) %>%
  mutate(
    title = factor(title),
    distance = as.numeric(distance)
  ) %>%
  # Drop unused levels from every factor column of the subset.
  droplevels()

# Descriptive statistics by activity title.
table1(~avg.hr+distance+calories+pace.m+moving.time.m+elapsed.time.m|title, data=df_dy )
Quan Binh Thanh Running
(N=81)
Quan Binh Thanh Walking
(N=99)
Overall
(N=180)
avg.hr
Mean (SD) 119 (15.6) 98.3 (10.8) 108 (16.9)
Median [Min, Max] 123 [88.0, 155] 98.0 [67.0, 128] 105 [67.0, 155]
distance
Mean (SD) 2.04 (0.987) 1.23 (0.925) 1.60 (1.03)
Median [Min, Max] 1.82 [0.470, 5.33] 0.920 [0.170, 4.91] 1.32 [0.170, 5.33]
calories
Mean (SD) 132 (63.5) 75.7 (51.9) 101 (63.9)
Median [Min, Max] 118 [22.0, 354] 58.0 [15.0, 310] 82.5 [15.0, 354]
pace.m
Mean (SD) 12.0 (3.03) 15.1 (2.92) 13.7 (3.34)
Median [Min, Max] 11.0 [7.00, 20.0] 15.0 [10.0, 26.0] 14.0 [7.00, 26.0]
Missing 0 (0%) 1 (1.0%) 1 (0.6%)
moving.time.m
Mean (SD) 22.9 (13.6) 14.6 (10.7) 18.3 (12.7)
Median [Min, Max] 20.0 [3.00, 67.0] 10.0 [2.00, 56.0] 14.0 [2.00, 67.0]
elapsed.time.m
Mean (SD) 25.4 (16.0) 17.5 (12.2) 21.1 (14.5)
Median [Min, Max] 21.0 [3.00, 81.0] 14.0 [3.00, 72.0] 15.0 [3.00, 81.0]
# Make sure distance and calories are numeric before plotting.
df_dy <- df_dy %>%
  mutate(across(all_of(c("distance", "calories")), as.numeric))

# Box plots of calories, distance and pace.m, one box per activity title.
ggplot(df_dy, aes(title, calories)) +
  geom_boxplot()

ggplot(df_dy, aes(title, distance)) +
  geom_boxplot()

ggplot(df_dy, aes(title, pace.m)) +
  geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).

# Print the calories of the subset (running or walking in Binh Thanh only).
df_dy$calories
##   [1]  58 114  71  75 120  55  59  94  54 119  84 119  38 133  62 179  45 119
##  [19]  41  54 117  55 172  36 100  50  57  90  59  54 127  46  66 111  56  61
##  [37] 116  32  70 113  75 124 240  42 113  59  91  37  52 156  43 111 100  90
##  [55]  66 124  65 277 180  35 103  51  51  83  55  37  95  62 164  80 180  53
##  [73]  34 202  58  70 262  47 267  38 184  61  55  72  56 114  43 136  73  41
##  [91]  53  54  51  52  46 115  39  52  89  62  65  22  55  41  50 177  35  31
## [109]  15  47  59  36 117  79  50  76 185  63  71  81 129  78 107 164 100 135
## [127] 103 143 310  67  96  81 106 110  54  93  50  49  69  42  55  97 153 264
## [145] 101 155 164  53  33 230 184 146 144  85  77 111  36  82  90 326 120 160
## [163] 278 143 195 145 148 164 128  96 170 185 115 236 158 118 251 135 354 101
# Calories in ascending order.
sort(pull(df_dy, calories))
##   [1]  15  22  31  32  33  34  35  35  36  36  36  37  37  38  38  39  41  41
##  [19]  41  42  42  43  43  45  46  46  47  47  49  50  50  50  50  51  51  51
##  [37]  52  52  52  53  53  53  54  54  54  54  54  55  55  55  55  55  55  56
##  [55]  56  57  58  58  59  59  59  59  61  61  62  62  62  63  65  65  66  66
##  [73]  67  69  70  70  71  71  72  73  75  75  76  77  78  79  80  81  81  82
##  [91]  83  84  85  89  90  90  90  91  93  94  95  96  96  97 100 100 100 101
## [109] 101 103 103 106 107 110 111 111 111 113 113 114 114 115 115 116 117 117
## [127] 118 119 119 119 120 120 124 124 127 128 129 133 135 135 136 143 143 144
## [145] 145 146 148 153 155 156 158 160 164 164 164 164 170 172 177 179 180 180
## [163] 184 184 185 185 195 202 230 236 240 251 262 264 267 277 278 310 326 354
# Overall box plot of calories at a single dummy numeric x position.
ggplot(df_dy, aes(x = 0, y = calories)) +
  geom_boxplot()

# Same box plot with a single-level factor on the x axis.
ggplot(df_dy, aes(x = factor(0), y = calories)) +
  geom_boxplot()

# Calories against distance, coloured by activity type, with the default
# geom_smooth() fit drawn for each colour group.
ggplot(df_dy, aes(x = distance, y = calories, colour = activity.type)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Regression analysis with base R
df_dy$avg.cadence <- as.numeric(df_dy$avg.cadence)
# calories is already a double, so no further conversion is needed.
typeof(df_dy$calories)
## [1] "double"

ggplot(df_dy, aes(avg.hr, calories))+geom_point()+geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(df_dy, aes(x=calories))+geom_boxplot()

# Names of the variables under study.
x <- c(
  "calories", "time.duration", "avg.hr", "avg.cadence", "avg.stride.length"
)

# Inspect the selected columns. `df_dy$x` would look for a column literally
# named "x" and return NULL, so index by the names stored in `x` instead.
glimpse(df_dy[, x])
# Check: summary statistics of the columns named in x
summary(df_dy[,x])
##     calories     time.duration       avg.hr       avg.cadence   
##  Min.   : 15.0   Min.   : 3.00   Min.   : 67.0   Min.   : 27.0  
##  1st Qu.: 54.0   1st Qu.:12.00   1st Qu.: 95.0   1st Qu.: 95.0  
##  Median : 82.5   Median :16.00   Median :104.5   Median :106.5  
##  Mean   :101.3   Mean   :21.42   Mean   :107.8   Mean   :116.4  
##  3rd Qu.:127.2   3rd Qu.:27.25   3rd Qu.:122.0   3rd Qu.:150.5  
##  Max.   :354.0   Max.   :81.00   Max.   :155.0   Max.   :180.0  
##  avg.stride.length
##  Min.   :0.2000   
##  1st Qu.:0.5975   
##  Median :0.6800   
##  Mean   :0.6643   
##  3rd Qu.:0.7300   
##  Max.   :1.7100
# Correlation matrix of the columns named in x. Rows with a missing value in
# any of these columns are left out, so a single NA does not turn the matrix
# entries into NA; with no missing values the result is unchanged.
cor(df_dy[, x], use = "complete.obs")
##                      calories time.duration     avg.hr avg.cadence
## calories           1.00000000    0.94143978  0.1551077   0.1055287
## time.duration      0.94143978    1.00000000 -0.1190087  -0.1584721
## avg.hr             0.15510769   -0.11900866  1.0000000   0.7592891
## avg.cadence        0.10552873   -0.15847210  0.7592891   1.0000000
## avg.stride.length -0.01940753    0.08290801 -0.1875751  -0.3890326
##                   avg.stride.length
## calories                -0.01940753
## time.duration            0.08290801
## avg.hr                  -0.18757514
## avg.cadence             -0.38903261
## avg.stride.length        1.00000000
# Scatter plots of calories against heart rate, duration and distance.
plot(x = df_dy$calories, y = df_dy$avg.hr)

plot(x = df_dy$calories, y = df_dy$time.duration)

plot(x = df_dy$calories, y = df_dy$distance)

# Calories against distance, coloured by title, with a smoother per title.
ggplot(df_dy, aes(x = distance, y = calories, colour = title)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Calories against average heart rate, coloured by title.
ggplot(df_dy, aes(x = avg.hr, y = calories, colour = title)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Descriptive table of the regression variables by title.
table1(
  ~ avg.hr + calories + distance + avg.stride.length + avg.cadence +
    avg.vertical.ratio | title,
  data = df_dy
)
Quan Binh Thanh Running
(N=81)
Quan Binh Thanh Walking
(N=99)
Overall
(N=180)
avg.hr
Mean (SD) 119 (15.6) 98.3 (10.8) 108 (16.9)
Median [Min, Max] 123 [88.0, 155] 98.0 [67.0, 128] 105 [67.0, 155]
calories
Mean (SD) 132 (63.5) 75.7 (51.9) 101 (63.9)
Median [Min, Max] 118 [22.0, 354] 58.0 [15.0, 310] 82.5 [15.0, 354]
distance
Mean (SD) 2.04 (0.987) 1.23 (0.925) 1.60 (1.03)
Median [Min, Max] 1.82 [0.470, 5.33] 0.920 [0.170, 4.91] 1.32 [0.170, 5.33]
avg.stride.length
Mean (SD) 0.624 (0.186) 0.697 (0.102) 0.664 (0.150)
Median [Min, Max] 0.650 [0.200, 1.71] 0.710 [0.450, 0.970] 0.680 [0.200, 1.71]
avg.cadence
Mean (SD) 139 (35.6) 98.3 (16.2) 116 (33.4)
Median [Min, Max] 161 [71.0, 180] 100 [27.0, 124] 107 [27.0, 180]
avg.vertical.ratio
Mean (SD) 12.4 (6.47) NA (NA) 12.4 (6.47)
Median [Min, Max] 10.1 [3.90, 38.0] NA [NA, NA] 10.1 [3.90, 38.0]
Missing 45 (55.6%) 99 (100%) 144 (80.0%)
# Regression
# Linear model of calories on distance, duration, average heart rate and
# activity title.
model <- lm(
  formula = calories ~ distance + time.duration + avg.hr + title,
  data = df_dy
)
summary(model)
## 
## Call:
## lm(formula = calories ~ distance + time.duration + avg.hr + title, 
##     data = df_dy)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.706  -4.439   0.863   4.812  30.323 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -39.28340    7.46262  -5.264 4.09e-07 ***
## distance                      37.51725    2.36660  15.853  < 2e-16 ***
## time.duration                  1.65610    0.17101   9.684  < 2e-16 ***
## avg.hr                         0.44015    0.06034   7.295 9.99e-12 ***
## titleQuan Binh Thanh Walking  -4.26879    1.78732  -2.388    0.018 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.36 on 175 degrees of freedom
## Multiple R-squared:  0.9833, Adjusted R-squared:  0.9829 
## F-statistic:  2569 on 4 and 175 DF,  p-value: < 2.2e-16
# Linear model of average heart rate on distance, duration and title.
m.hr <- lm(
  formula = avg.hr ~ distance + time.duration + title,
  data = df_dy
)
summary(m.hr)
## 
## Call:
## lm(formula = avg.hr ~ distance + time.duration + title, data = df_dy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.0595  -6.7174  -0.0206   7.8080  28.6673 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  120.5232     2.0923  57.602  < 2e-16 ***
## distance                      20.6731     2.5124   8.228 4.09e-14 ***
## time.duration                 -1.6910     0.1714  -9.864  < 2e-16 ***
## titleQuan Binh Thanh Walking -17.3937     1.8073  -9.624  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.44 on 176 degrees of freedom
## Multiple R-squared:  0.624,  Adjusted R-squared:  0.6176 
## F-statistic: 97.38 on 3 and 176 DF,  p-value: < 2.2e-16