Load my libraries
library("tidyverse")
library("janitor")
library("lubridate")
library("dplyr")
Read my CSV files into R
# Daily pedometer summaries, parsed with readr::read_csv().
dirty_pedometer <- read_csv(
  file = "com.samsung.shealth.tracker.pedometer_day_summary.202205141347.csv"
)
# Heart-rate readings, parsed with base R's read.csv().
hr <- read.csv(
  file = "com.samsung.shealth.tracker.heart_rate.202205141347.csv"
)
Take a look at my data
glimpse(dirty_pedometer)
## Rows: 3,752
## Columns: 19
## $ step_count <dbl> 2854, 1215, 13817, 3845, 3118, 3610, 11874, 18, 13…
## $ binning_data <chr> "5af9518a-f435-49e8-aa21-498288115d0b.binning_data…
## $ active_time <dbl> 1384027, 659314, 6907844, 2070113, 1631525, 191735…
## $ recommendation <dbl> 10000, 10000, 6000, 10000, 10000, 10000, 10000, 10…
## $ run_step_count <dbl> 3, 0, 13, 3, 21, 0, 3046, 0, 6187, 0, 3952, 16, 30…
## $ update_time <time> 01:36:00, 26:49:00, 18:31:00, 00:02:00, 26:35:00,…
## $ source_package_name <chr> "com.sec.android.app.shealth", "com.sec.android.ap…
## $ create_time <time> 37:17:00, 52:04:00, 56:51:00, 00:02:00, 27:37:00,…
## $ source_info <chr> NA, NA, "70941998-9f62-4bd0-929a-3aa796780061.sour…
## $ speed <dbl> 1.750000, 1.972222, 1.401954, 2.318954, 1.367646, …
## $ distance <dbl> 1959.611, 771.151, 9684.481, 2672.490, 2231.350, 2…
## $ calorie <dbl> 93.9490000, 37.8900000, 438.6090000, 125.2899860, …
## $ walk_step_count <dbl> 2851, 1215, 13804, 3842, 3097, 3610, 8828, 18, 763…
## $ deviceuuid <chr> "cMpZmcg7wE", "cMpZmcg7wE", "VfS0qUERdZ", "aGTdXSg…
## $ pkg_name <chr> "com.sec.android.app.shealth", "com.sec.android.ap…
## $ healthy_step <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ achievement <chr> "5af9518a-f435-49e8-aa21-498288115d0b.achievement.…
## $ datauuid <chr> "5af9518a-f435-49e8-aa21-498288115d0b", "029e8438-…
## $ day_time <dbl> 1.53066e+12, 1.53593e+12, 1.53429e+12, 1.53179e+12…
colnames(dirty_pedometer)
## [1] "step_count" "binning_data" "active_time"
## [4] "recommendation" "run_step_count" "update_time"
## [7] "source_package_name" "create_time" "source_info"
## [10] "speed" "distance" "calorie"
## [13] "walk_step_count" "deviceuuid" "pkg_name"
## [16] "healthy_step" "achievement" "datauuid"
## [19] "day_time"
clean up column names
# clean_names() returns a new data frame rather than modifying its input, so
# the result must be assigned back or the cleaned names are thrown away.
dirty_pedometer <- clean_names(dirty_pedometer)
# Print the cleaned tibble to check the result.
dirty_pedometer
## # A tibble: 3,752 × 19
## step_count binning_data active_time recommendation run_step_count update_time
## <dbl> <chr> <dbl> <dbl> <dbl> <time>
## 1 2854 5af9518a-f4… 1384027 10000 3 01:36
## 2 1215 029e8438-d4… 659314 10000 0 26:49
## 3 13817 70941998-9f… 6907844 6000 13 18:31
## 4 3845 35253cb3-13… 2070113 10000 3 00:02
## 5 3118 674015c9-1f… 1631525 10000 21 26:35
## 6 3610 a4c446d0-3d… 1917359 10000 0 00:01
## 7 11874 37c413cb-67… 5377074 10000 3046 18:32
## 8 18 7794ec31-68… 10909 10000 0 13:52
## 9 13821 5bc9d2a8-9d… 5823435 10000 6187 57:55
## 10 3465 cae133e9-0e… 1797552 10000 0 00:02
## # … with 3,742 more rows, and 13 more variables: source_package_name <chr>,
## # create_time <time>, source_info <chr>, speed <dbl>, distance <dbl>,
## # calorie <dbl>, walk_step_count <dbl>, deviceuuid <chr>, pkg_name <chr>,
## # healthy_step <dbl>, achievement <chr>, datauuid <chr>, day_time <dbl>
Look back at the names
colnames(dirty_pedometer)
## [1] "step_count" "binning_data" "active_time"
## [4] "recommendation" "run_step_count" "update_time"
## [7] "source_package_name" "create_time" "source_info"
## [10] "speed" "distance" "calorie"
## [13] "walk_step_count" "deviceuuid" "pkg_name"
## [16] "healthy_step" "achievement" "datauuid"
## [19] "day_time"
Keep only the columns I need and remove the rest
# Keep the step, activity and distance measures plus the day timestamp.
select_pedometer <- dirty_pedometer %>%
  select(
    step_count,
    active_time,
    run_step_count,
    walk_step_count,
    distance,
    day_time
  )
view new data set with only the columns I am interested in
head(select_pedometer)
## # A tibble: 6 × 6
## step_count active_time run_step_count walk_step_count distance day_time
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2854 1384027 3 2851 1960. 1530660000000
## 2 1215 659314 0 1215 771. 1535930000000
## 3 13817 6907844 13 13804 9684. 1534290000000
## 4 3845 2070113 3 3842 2672. 1531790000000
## 5 3118 1631525 21 3097 2231. 1535670000000
## 6 3610 1917359 0 3610 2500. 1531270000000
try to figure out how to convert the date
Create a variable from the day_time column, which is displayed in scientific notation
# Pull day_time out as a plain numeric vector and inspect its structure.
myDate <- as.numeric(select_pedometer[["day_time"]])
str(myDate)
## num [1:3752] 1.53e+12 1.54e+12 1.53e+12 1.53e+12 1.54e+12 ...
Create another variable that divides day_time by 1000 and counts the result as seconds elapsed since Jan 01, 1970
# day_time is divided by 1000 (treated as milliseconds -> seconds) and read as
# seconds since 1970-01-01. The time zone is pinned to UTC so the parsed
# timestamps print the same on every machine instead of following the
# session's local time zone.
mydate1 <- select_pedometer$day_time / 1000
my_posix <- as.POSIXct(mydate1, origin = "1970-01-01", tz = "UTC")
str(my_posix)
## POSIXct[1:3752], format: "2018-07-03 23:20:00" "2018-09-02 23:13:20" "2018-08-14 23:40:00" ...
Then convert to just a date without the time.
# Drop the time-of-day part and keep only the calendar date.
# NOTE(review): the timestamps printed above fall shortly before midnight
# (23:20, 23:13, 23:40), so truncating to a date may land one day early if
# day_time lost precision in the export -- confirm against the raw data.
myHealth_date <- my_posix %>% as.Date()
str(myHealth_date)
## Date[1:3752], format: "2018-07-03" "2018-09-02" "2018-08-14" "2018-07-17" "2018-08-30" ...
Make sure it’s right
head(myHealth_date,2)
## [1] "2018-07-03" "2018-09-02"
Now our heart rate (hr) data
colnames(hr)
## [1] "source"
## [2] "tag_id"
## [3] "com.samsung.health.heart_rate.heart_beat_count"
## [4] "com.samsung.health.heart_rate.start_time"
## [5] "com.samsung.health.heart_rate.custom"
## [6] "com.samsung.health.heart_rate.binning_data"
## [7] "com.samsung.health.heart_rate.update_time"
## [8] "com.samsung.health.heart_rate.create_time"
## [9] "com.samsung.health.heart_rate.max"
## [10] "com.samsung.health.heart_rate.min"
## [11] "com.samsung.health.heart_rate.time_offset"
## [12] "com.samsung.health.heart_rate.deviceuuid"
## [13] "com.samsung.health.heart_rate.comment"
## [14] "com.samsung.health.heart_rate.pkg_name"
## [15] "com.samsung.health.heart_rate.end_time"
## [16] "com.samsung.health.heart_rate.datauuid"
## [17] "com.samsung.health.heart_rate.heart_rate"
Select only the heart rate column
# Reduce the heart-rate table to its single heart-rate measurement column.
Heart_Rate <- hr %>%
  select("com.samsung.health.heart_rate.heart_rate")
Keep only the first 3,752 rows so that it has the same number of rows as the existing dataset
# Take as many leading heart-rate rows as the pedometer table has, so the two
# can be bound column-wise; the count is derived from the data instead of
# being hard-coded as 3752.
# NOTE(review): rows are paired by position only, not by date -- confirm that
# each heart-rate reading really belongs to the pedometer day it ends up
# next to.
heart_rate1 <- slice(Heart_Rate, seq_len(nrow(select_pedometer)))
make sure it’s just the 1 column
colnames(heart_rate1)
## [1] "com.samsung.health.heart_rate.heart_rate"
Now that that’s all cleaned up, we’ll make sure dplyr is loaded to build our data frame
library(dplyr)
Turn the date variable and the heart_rate variable into columns
# Date vector that will become the "date" column.
date <- myHealth_date
# Unwrap the one-column data frame into a named list so its original column
# name carries over when it is bound to the pedometer table.
heart_rate <- as.list(heart_rate1)
cbind them with our original dataframe to create the cleaned dataframe
# Bind the date vector and the heart-rate list onto the pedometer columns,
# side by side, to form the combined data frame.
Samsung_Health <- cbind(
  select_pedometer,
  date,
  heart_rate
)
Make sure I’ve got all my columns
colnames(Samsung_Health)
## [1] "step_count"
## [2] "active_time"
## [3] "run_step_count"
## [4] "walk_step_count"
## [5] "distance"
## [6] "day_time"
## [7] "date"
## [8] "com.samsung.health.heart_rate.heart_rate"
take a peek at my dataset
glimpse(Samsung_Health)
## Rows: 3,752
## Columns: 8
## $ step_count <dbl> 2854, 1215, 13817, 3845, 3118…
## $ active_time <dbl> 1384027, 659314, 6907844, 207…
## $ run_step_count <dbl> 3, 0, 13, 3, 21, 0, 3046, 0, …
## $ walk_step_count <dbl> 2851, 1215, 13804, 3842, 3097…
## $ distance <dbl> 1959.611, 771.151, 9684.481, …
## $ day_time <dbl> 1.53066e+12, 1.53593e+12, 1.5…
## $ date <date> 2018-07-03, 2018-09-02, 2018…
## $ com.samsung.health.heart_rate.heart_rate <int> 79, 49, 76, 62, 51, 43, 50, 5…
Now let’s visualize some of this data
library(ggplot2)
# Smoothed trend of heart rate against distance.
ggplot(
  data = Samsung_Health,
  mapping = aes(x = distance, y = com.samsung.health.heart_rate.heart_rate)
) +
  geom_smooth() +
  labs(
    x = "distance(Meter)",
    y = "heart rate(Bpm)",
    title = "Heart Rate vs. Distance"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Smoothed trend of heart rate against running steps.
ggplot(
  data = Samsung_Health,
  mapping = aes(
    x = run_step_count,
    y = com.samsung.health.heart_rate.heart_rate
  )
) +
  geom_smooth() +
  labs(
    x = "run steps",
    y = "heart rate(Bpm)",
    title = "Heart Rate vs. Run Steps"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Smoothed trend of heart rate against walking steps.
ggplot(
  data = Samsung_Health,
  mapping = aes(
    x = walk_step_count,
    y = com.samsung.health.heart_rate.heart_rate
  )
) +
  geom_smooth() +
  labs(
    x = "walk steps",
    y = "heart rate(Bpm)",
    title = "Heart Rate vs. Walk Steps"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Smoothed trend of distance over time.
ggplot(
  data = Samsung_Health,
  mapping = aes(x = date, y = distance)
) +
  geom_smooth() +
  labs(
    x = "date",
    y = "distance(Meter)",
    title = "Distance vs. Date"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Smoothed trend of total step count over time.
ggplot(
  data = Samsung_Health,
  mapping = aes(x = date, y = step_count)
) +
  geom_smooth() +
  labs(
    x = "date",
    y = "total step count",
    title = "Total Step Count vs. Date"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Smoothed trend of running steps over time.
ggplot(
  data = Samsung_Health,
  mapping = aes(x = date, y = run_step_count)
) +
  geom_smooth() +
  labs(
    x = "date",
    y = "run steps",
    title = "Run Steps vs. Date"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
library(readr)
Make a copy of my clean data in a CSV file
# Save the combined data frame to disk as a CSV file.
write_csv(x = Samsung_Health, file = "SamsungHealthClean.csv")