Load my libraries

library("tidyverse")
library("janitor")
library("lubridate")
library("dplyr")

Read my CSV files into R

# Daily pedometer summary export, parsed with readr.
dirty_pedometer <- read_csv(
  "com.samsung.shealth.tracker.pedometer_day_summary.202205141347.csv"
)
# Heart-rate export, parsed with base R's reader.
hr <- read.csv(
  "com.samsung.shealth.tracker.heart_rate.202205141347.csv"
)

Take a look at my data

# Compact overview: one line per column with its type and first values.
dirty_pedometer %>%
  glimpse()
## Rows: 3,752
## Columns: 19
## $ step_count          <dbl> 2854, 1215, 13817, 3845, 3118, 3610, 11874, 18, 13…
## $ binning_data        <chr> "5af9518a-f435-49e8-aa21-498288115d0b.binning_data…
## $ active_time         <dbl> 1384027, 659314, 6907844, 2070113, 1631525, 191735…
## $ recommendation      <dbl> 10000, 10000, 6000, 10000, 10000, 10000, 10000, 10…
## $ run_step_count      <dbl> 3, 0, 13, 3, 21, 0, 3046, 0, 6187, 0, 3952, 16, 30…
## $ update_time         <time> 01:36:00, 26:49:00, 18:31:00, 00:02:00, 26:35:00,…
## $ source_package_name <chr> "com.sec.android.app.shealth", "com.sec.android.ap…
## $ create_time         <time> 37:17:00, 52:04:00, 56:51:00, 00:02:00, 27:37:00,…
## $ source_info         <chr> NA, NA, "70941998-9f62-4bd0-929a-3aa796780061.sour…
## $ speed               <dbl> 1.750000, 1.972222, 1.401954, 2.318954, 1.367646, …
## $ distance            <dbl> 1959.611, 771.151, 9684.481, 2672.490, 2231.350, 2…
## $ calorie             <dbl> 93.9490000, 37.8900000, 438.6090000, 125.2899860, …
## $ walk_step_count     <dbl> 2851, 1215, 13804, 3842, 3097, 3610, 8828, 18, 763…
## $ deviceuuid          <chr> "cMpZmcg7wE", "cMpZmcg7wE", "VfS0qUERdZ", "aGTdXSg…
## $ pkg_name            <chr> "com.sec.android.app.shealth", "com.sec.android.ap…
## $ healthy_step        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ achievement         <chr> "5af9518a-f435-49e8-aa21-498288115d0b.achievement.…
## $ datauuid            <chr> "5af9518a-f435-49e8-aa21-498288115d0b", "029e8438-…
## $ day_time            <dbl> 1.53066e+12, 1.53593e+12, 1.53429e+12, 1.53179e+12…
# List the column names of the raw pedometer table.
names(dirty_pedometer)
##  [1] "step_count"          "binning_data"        "active_time"        
##  [4] "recommendation"      "run_step_count"      "update_time"        
##  [7] "source_package_name" "create_time"         "source_info"        
## [10] "speed"               "distance"            "calorie"            
## [13] "walk_step_count"     "deviceuuid"          "pkg_name"           
## [16] "healthy_step"        "achievement"         "datauuid"           
## [19] "day_time"

clean up column names

# clean_names() returns a new data frame rather than modifying its input, so
# the result is assigned back; otherwise the cleaned names would be discarded.
dirty_pedometer <- clean_names(dirty_pedometer)
# Print the cleaned tibble to inspect it.
dirty_pedometer
## # A tibble: 3,752 × 19
##    step_count binning_data active_time recommendation run_step_count update_time
##         <dbl> <chr>              <dbl>          <dbl>          <dbl> <time>     
##  1       2854 5af9518a-f4…     1384027          10000              3 01:36      
##  2       1215 029e8438-d4…      659314          10000              0 26:49      
##  3      13817 70941998-9f…     6907844           6000             13 18:31      
##  4       3845 35253cb3-13…     2070113          10000              3 00:02      
##  5       3118 674015c9-1f…     1631525          10000             21 26:35      
##  6       3610 a4c446d0-3d…     1917359          10000              0 00:01      
##  7      11874 37c413cb-67…     5377074          10000           3046 18:32      
##  8         18 7794ec31-68…       10909          10000              0 13:52      
##  9      13821 5bc9d2a8-9d…     5823435          10000           6187 57:55      
## 10       3465 cae133e9-0e…     1797552          10000              0 00:02      
## # … with 3,742 more rows, and 13 more variables: source_package_name <chr>,
## #   create_time <time>, source_info <chr>, speed <dbl>, distance <dbl>,
## #   calorie <dbl>, walk_step_count <dbl>, deviceuuid <chr>, pkg_name <chr>,
## #   healthy_step <dbl>, achievement <chr>, datauuid <chr>, day_time <dbl>

Look back at the names

# Re-check the column names after the clean-up step.
names(dirty_pedometer)
##  [1] "step_count"          "binning_data"        "active_time"        
##  [4] "recommendation"      "run_step_count"      "update_time"        
##  [7] "source_package_name" "create_time"         "source_info"        
## [10] "speed"               "distance"            "calorie"            
## [13] "walk_step_count"     "deviceuuid"          "pkg_name"           
## [16] "healthy_step"        "achievement"         "datauuid"           
## [19] "day_time"

remove columns

# Keep only the step, activity-time, distance and timestamp columns.
select_pedometer <- dirty_pedometer %>%
  select(
    step_count,
    active_time,
    run_step_count,
    walk_step_count,
    distance,
    day_time
  )

view new data set with only the columns I am interested in

# Show the first six rows of the reduced table.
head(select_pedometer, n = 6)
## # A tibble: 6 × 6
##   step_count active_time run_step_count walk_step_count distance      day_time
##        <dbl>       <dbl>          <dbl>           <dbl>    <dbl>         <dbl>
## 1       2854     1384027              3            2851    1960. 1530660000000
## 2       1215      659314              0            1215     771. 1535930000000
## 3      13817     6907844             13           13804    9684. 1534290000000
## 4       3845     2070113              3            3842    2672. 1531790000000
## 5       3118     1631525             21            3097    2231. 1535670000000
## 6       3610     1917359              0            3610    2500. 1531270000000

try to figure out how to convert the date

Create a variable from the day_time column, which holds timestamps in milliseconds since January 1, 1970 and is displayed in scientific notation

# Pull day_time out as a plain numeric vector and inspect its structure.
myDate <- as.numeric(select_pedometer[["day_time"]])
str(myDate)
##  num [1:3752] 1.53e+12 1.54e+12 1.53e+12 1.53e+12 1.54e+12 ...

Create another variable that divides the day_time values by 1000 (milliseconds to seconds) and converts them to date-times counted from January 1, 1970

# day_time is treated as milliseconds; dividing by 1000 gives seconds.
mydate1 <- select_pedometer$day_time / 1000
# Count the seconds from 1970-01-01. The time zone is fixed to UTC so the
# printed timestamps do not depend on the time zone of the machine running
# the script.
my_posix <- as.POSIXct(mydate1, origin = "1970-01-01", tz = "UTC")
str(my_posix)
##  POSIXct[1:3752], format: "2018-07-03 23:20:00" "2018-09-02 23:13:20" "2018-08-14 23:40:00" ...

Then convert to just a date without the time.

# as.Date() drops the time-of-day part of each timestamp, leaving a Date vector.
myHealth_date <- as.Date(my_posix)
# Confirm the result is a Date vector.
str(myHealth_date)
##  Date[1:3752], format: "2018-07-03" "2018-09-02" "2018-08-14" "2018-07-17" "2018-08-30" ...

Make sure it’s right

# Spot-check the first two converted dates.
head(myHealth_date, n = 2)
## [1] "2018-07-03" "2018-09-02"

now our hr data

# List the column names of the heart-rate table.
names(hr)
##  [1] "source"                                        
##  [2] "tag_id"                                        
##  [3] "com.samsung.health.heart_rate.heart_beat_count"
##  [4] "com.samsung.health.heart_rate.start_time"      
##  [5] "com.samsung.health.heart_rate.custom"          
##  [6] "com.samsung.health.heart_rate.binning_data"    
##  [7] "com.samsung.health.heart_rate.update_time"     
##  [8] "com.samsung.health.heart_rate.create_time"     
##  [9] "com.samsung.health.heart_rate.max"             
## [10] "com.samsung.health.heart_rate.min"             
## [11] "com.samsung.health.heart_rate.time_offset"     
## [12] "com.samsung.health.heart_rate.deviceuuid"      
## [13] "com.samsung.health.heart_rate.comment"         
## [14] "com.samsung.health.heart_rate.pkg_name"        
## [15] "com.samsung.health.heart_rate.end_time"        
## [16] "com.samsung.health.heart_rate.datauuid"        
## [17] "com.samsung.health.heart_rate.heart_rate"

select only the hr column

# Reduce the heart-rate table to the single measurement column.
Heart_Rate <- hr %>%
  select("com.samsung.health.heart_rate.heart_rate")

Select only the first 3752 rows so that the heart rate data has the same number of rows as the pedometer dataset

# Keep as many heart-rate rows as there are pedometer rows so the two tables
# can be column-bound; the count comes from the data instead of a hard-coded
# number, so it stays correct if the export changes size.
# NOTE(review): this pairs rows purely by position, not by date — confirm that
# the first heart-rate readings really correspond to the pedometer days.
heart_rate1 <- slice(Heart_Rate, seq_len(nrow(select_pedometer)))

make sure it’s just the 1 column

# Verify that only the heart-rate column remains.
names(heart_rate1)
## [1] "com.samsung.health.heart_rate.heart_rate"

Now that that’s all cleaned up, we’ll make sure dplyr is loaded to build our data frame

library(dplyr)

Make the date variable and the heart_rate variable into columns

# Copy the converted dates under the column name wanted in the final data frame.
# NOTE(review): `date` is also the name of a base R function — consider a more
# specific name.
date <- c(myHealth_date)
# c() on a data frame returns a plain list of its columns, so this is a
# one-element list that keeps the original column name.
heart_rate <- c(heart_rate1)

cbind them with our original dataframe to create the cleaned dataframe

# Place the pedometer columns, the dates and the heart-rate values side by
# side; rows are matched by position.
Samsung_Health <- cbind(
  select_pedometer,
  date,
  heart_rate
)

Make sure I’ve got all my columns

# List the columns of the combined data frame.
names(Samsung_Health)
## [1] "step_count"                              
## [2] "active_time"                             
## [3] "run_step_count"                          
## [4] "walk_step_count"                         
## [5] "distance"                                
## [6] "day_time"                                
## [7] "date"                                    
## [8] "com.samsung.health.heart_rate.heart_rate"

take a peek at my dataset

# Compact overview of the combined data frame: column types and first values.
Samsung_Health %>%
  glimpse()
## Rows: 3,752
## Columns: 8
## $ step_count                               <dbl> 2854, 1215, 13817, 3845, 3118…
## $ active_time                              <dbl> 1384027, 659314, 6907844, 207…
## $ run_step_count                           <dbl> 3, 0, 13, 3, 21, 0, 3046, 0, …
## $ walk_step_count                          <dbl> 2851, 1215, 13804, 3842, 3097…
## $ distance                                 <dbl> 1959.611, 771.151, 9684.481, …
## $ day_time                                 <dbl> 1.53066e+12, 1.53593e+12, 1.5…
## $ date                                     <date> 2018-07-03, 2018-09-02, 2018…
## $ com.samsung.health.heart_rate.heart_rate <int> 79, 49, 76, 62, 51, 43, 50, 5…

Now let’s visualize some of this data

library(ggplot2)
# Smoothed trend of heart rate against distance.
ggplot(
  data = Samsung_Health,
  mapping = aes(
    x = distance,
    y = com.samsung.health.heart_rate.heart_rate
  )
) +
  geom_smooth() +
  labs(
    x = "distance(Meter)",
    y = "heart rate(Bpm)",
    title = "Heart Rate vs. Distance"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend of heart rate against running steps.
ggplot(
  data = Samsung_Health,
  mapping = aes(
    x = run_step_count,
    y = com.samsung.health.heart_rate.heart_rate
  )
) +
  geom_smooth() +
  labs(
    x = "run steps",
    y = "heart rate(Bpm)",
    title = "Heart Rate vs. Run Steps"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend of heart rate against walking steps.
ggplot(
  data = Samsung_Health,
  mapping = aes(
    x = walk_step_count,
    y = com.samsung.health.heart_rate.heart_rate
  )
) +
  geom_smooth() +
  labs(
    x = "walk steps",
    y = "heart rate(Bpm)",
    title = "Heart Rate vs. Walk Steps"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend of distance over time.
ggplot(
  data = Samsung_Health,
  mapping = aes(x = date, y = distance)
) +
  geom_smooth() +
  labs(
    x = "date",
    y = "distance(Meter)",
    title = "Distance vs. Date"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend of total step count over time.
ggplot(
  data = Samsung_Health,
  mapping = aes(x = date, y = step_count)
) +
  geom_smooth() +
  labs(
    x = "date",
    y = "total step count",
    title = "Total Step Count vs. Date"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend of running steps over time.
ggplot(
  data = Samsung_Health,
  mapping = aes(x = date, y = run_step_count)
) +
  geom_smooth() +
  labs(
    x = "date",
    y = "run steps",
    title = "Run Steps vs. Date"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

library(readr)

Make a copy of my clean data in a CSV file

# Save the combined data frame to disk as a CSV file.
write_csv(
  x = Samsung_Health,
  file = "SamsungHealthClean.csv"
)