Introduction

Data Cleaning and Manipulation

Loading packages

library (tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (lubridate)
library (dplyr)

Importing the csv files

Daily_Activity <- read.csv ("dailyActivity_merged.csv")
Daily_Sleep <- read.csv ("sleepDay_merged.csv")

Checking the structure of the data frames

str (Daily_Activity)
## 'data.frame':    940 obs. of  15 variables:
##  $ Id                      : num  1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
##  $ ActivityDate            : chr  "4/12/2016" "4/13/2016" "4/14/2016" "4/15/2016" ...
##  $ TotalSteps              : int  13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
##  $ TotalDistance           : num  8.5 6.97 6.74 6.28 8.16 ...
##  $ TrackerDistance         : num  8.5 6.97 6.74 6.28 8.16 ...
##  $ LoggedActivitiesDistance: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveDistance      : num  1.88 1.57 2.44 2.14 2.71 ...
##  $ ModeratelyActiveDistance: num  0.55 0.69 0.4 1.26 0.41 ...
##  $ LightActiveDistance     : num  6.06 4.71 3.91 2.83 5.04 ...
##  $ SedentaryActiveDistance : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveMinutes       : int  25 21 30 29 36 38 42 50 28 19 ...
##  $ FairlyActiveMinutes     : int  13 19 11 34 10 20 16 31 12 8 ...
##  $ LightlyActiveMinutes    : int  328 217 181 209 221 164 233 264 205 211 ...
##  $ SedentaryMinutes        : int  728 776 1218 726 773 539 1149 775 818 838 ...
##  $ Calories                : int  1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
str (Daily_Sleep)
## 'data.frame':    413 obs. of  5 variables:
##  $ Id                : num  1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
##  $ SleepDay          : chr  "4/12/2016 12:00:00 AM" "4/13/2016 12:00:00 AM" "4/15/2016 12:00:00 AM" "4/16/2016 12:00:00 AM" ...
##  $ TotalSleepRecords : int  1 2 1 2 1 1 1 1 1 1 ...
##  $ TotalMinutesAsleep: int  327 384 412 340 700 304 360 325 361 430 ...
##  $ TotalTimeInBed    : int  346 407 442 367 712 320 377 364 384 449 ...

Correcting the inconsistencies on the date columns

Changing the data type of the date columns
Daily_Activity$ActivityDate <- as.Date(Daily_Activity$ActivityDate, "%m/%d/%Y")
Daily_Sleep$SleepDay <- as.Date(Daily_Sleep$SleepDay, "%m/%d/%Y")
Renaming the date columns
Daily_Activity <- Daily_Activity %>%
  rename (Date = ActivityDate)

Daily_Sleep <- Daily_Sleep%>%
  rename (Date = SleepDay)
Inspecting correction
str (Daily_Activity)
## 'data.frame':    940 obs. of  15 variables:
##  $ Id                      : num  1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
##  $ Date                    : Date, format: "2016-04-12" "2016-04-13" ...
##  $ TotalSteps              : int  13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
##  $ TotalDistance           : num  8.5 6.97 6.74 6.28 8.16 ...
##  $ TrackerDistance         : num  8.5 6.97 6.74 6.28 8.16 ...
##  $ LoggedActivitiesDistance: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveDistance      : num  1.88 1.57 2.44 2.14 2.71 ...
##  $ ModeratelyActiveDistance: num  0.55 0.69 0.4 1.26 0.41 ...
##  $ LightActiveDistance     : num  6.06 4.71 3.91 2.83 5.04 ...
##  $ SedentaryActiveDistance : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveMinutes       : int  25 21 30 29 36 38 42 50 28 19 ...
##  $ FairlyActiveMinutes     : int  13 19 11 34 10 20 16 31 12 8 ...
##  $ LightlyActiveMinutes    : int  328 217 181 209 221 164 233 264 205 211 ...
##  $ SedentaryMinutes        : int  728 776 1218 726 773 539 1149 775 818 838 ...
##  $ Calories                : int  1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
str (Daily_Sleep)
## 'data.frame':    413 obs. of  5 variables:
##  $ Id                : num  1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
##  $ Date              : Date, format: "2016-04-12" "2016-04-13" ...
##  $ TotalSleepRecords : int  1 2 1 2 1 1 1 1 1 1 ...
##  $ TotalMinutesAsleep: int  327 384 412 340 700 304 360 325 361 430 ...
##  $ TotalTimeInBed    : int  346 407 442 367 712 320 377 364 384 449 ...

Further investigation and cleaning

Checking for null values
sum (is.na(Daily_Activity)) 
## [1] 0
sum (is.na(Daily_Sleep))
## [1] 0
Checking for duplicates
sum (duplicated(Daily_Activity)) 
## [1] 0
sum (duplicated(Daily_Sleep))
## [1] 3
Removing duplicates from Daily_Sleep
Daily_sleep <- unique (Daily_Sleep)
sum (duplicated(Daily_sleep))
## [1] 0
Checking the number of Participants
n_distinct (Daily_Activity$Id)
## [1] 33
n_distinct (Daily_sleep$Id)
## [1] 24

Merging data

Daily_Activity_Sleep <- merge (Daily_Activity, Daily_sleep, by = c ("Id", "Date"))

Adding more columns

Daily_Activity_Sleep$Weekday <- format (as.Date(Daily_Activity_Sleep$Date), "%A") #Adds Weekday column

Daily_Activity_Sleep$TotalActiveMinutes = rowSums (Daily_Activity_Sleep [ c ("VeryActiveMinutes", #Adds Total active minutes column
                                                                          "FairlyActiveMinutes",
                                                                          "LightlyActiveMinutes")])

colnames (Daily_Activity_Sleep)
##  [1] "Id"                       "Date"                    
##  [3] "TotalSteps"               "TotalDistance"           
##  [5] "TrackerDistance"          "LoggedActivitiesDistance"
##  [7] "VeryActiveDistance"       "ModeratelyActiveDistance"
##  [9] "LightActiveDistance"      "SedentaryActiveDistance" 
## [11] "VeryActiveMinutes"        "FairlyActiveMinutes"     
## [13] "LightlyActiveMinutes"     "SedentaryMinutes"        
## [15] "Calories"                 "TotalSleepRecords"       
## [17] "TotalMinutesAsleep"       "TotalTimeInBed"          
## [19] "Weekday"                  "TotalActiveMinutes"
str (Daily_Activity_Sleep)
## 'data.frame':    410 obs. of  20 variables:
##  $ Id                      : num  1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
##  $ Date                    : Date, format: "2016-04-12" "2016-04-13" ...
##  $ TotalSteps              : int  13162 10735 9762 12669 9705 15506 10544 9819 14371 10039 ...
##  $ TotalDistance           : num  8.5 6.97 6.28 8.16 6.48 ...
##  $ TrackerDistance         : num  8.5 6.97 6.28 8.16 6.48 ...
##  $ LoggedActivitiesDistance: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveDistance      : num  1.88 1.57 2.14 2.71 3.19 ...
##  $ ModeratelyActiveDistance: num  0.55 0.69 1.26 0.41 0.78 ...
##  $ LightActiveDistance     : num  6.06 4.71 2.83 5.04 2.51 ...
##  $ SedentaryActiveDistance : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveMinutes       : int  25 21 29 36 38 50 28 19 41 39 ...
##  $ FairlyActiveMinutes     : int  13 19 34 10 20 31 12 8 21 5 ...
##  $ LightlyActiveMinutes    : int  328 217 209 221 164 264 205 211 262 238 ...
##  $ SedentaryMinutes        : int  728 776 726 773 539 775 818 838 732 709 ...
##  $ Calories                : int  1985 1797 1745 1863 1728 2035 1786 1775 1949 1788 ...
##  $ TotalSleepRecords       : int  1 2 1 2 1 1 1 1 1 1 ...
##  $ TotalMinutesAsleep      : int  327 384 412 340 700 304 360 325 361 430 ...
##  $ TotalTimeInBed          : int  346 407 442 367 712 320 377 364 384 449 ...
##  $ Weekday                 : chr  "Tuesday" "Wednesday" "Friday" "Saturday" ...
##  $ TotalActiveMinutes      : num  366 257 272 267 222 345 245 238 324 282 ...
n_distinct (Daily_Activity_Sleep$Id)
## [1] 24

Analysis

Quick summary statistics

Daily_Activity_Sleep %>%
  select (TotalSteps,
          TotalDistance,
          SedentaryMinutes,
          TotalActiveMinutes,
          Calories,
          TotalSleepRecords,
          TotalMinutesAsleep,
          TotalTimeInBed)%>%
  summary ()
##    TotalSteps    TotalDistance    SedentaryMinutes TotalActiveMinutes
##  Min.   :   17   Min.   : 0.010   Min.   :   0.0   Min.   :  2.0     
##  1st Qu.: 5189   1st Qu.: 3.592   1st Qu.: 631.2   1st Qu.:206.5     
##  Median : 8913   Median : 6.270   Median : 717.0   Median :263.5     
##  Mean   : 8515   Mean   : 6.012   Mean   : 712.1   Mean   :259.5     
##  3rd Qu.:11370   3rd Qu.: 8.005   3rd Qu.: 782.8   3rd Qu.:315.5     
##  Max.   :22770   Max.   :17.540   Max.   :1265.0   Max.   :540.0     
##     Calories    TotalSleepRecords TotalMinutesAsleep TotalTimeInBed 
##  Min.   : 257   Min.   :1.00      Min.   : 58.0      Min.   : 61.0  
##  1st Qu.:1841   1st Qu.:1.00      1st Qu.:361.0      1st Qu.:403.8  
##  Median :2207   Median :1.00      Median :432.5      Median :463.0  
##  Mean   :2389   Mean   :1.12      Mean   :419.2      Mean   :458.5  
##  3rd Qu.:2920   3rd Qu.:1.00      3rd Qu.:490.0      3rd Qu.:526.0  
##  Max.   :4900   Max.   :3.00      Max.   :796.0      Max.   :961.0

Comparing average distance (Very active distance, Moderately active distance and Light active distance)

Daily_Activity_Sleep %>%
  summarise (Very_active_distance = mean (VeryActiveDistance), 
             Moderately_active_distance = mean (ModeratelyActiveDistance),
             Light_active_distance = mean (LightActiveDistance)) %>%
  round(3)
##   Very_active_distance Moderately_active_distance Light_active_distance
## 1                1.446                      0.744                 3.791

Average steps by weekday

Daily_Activity_Sleep$Weekday <- factor (Daily_Activity_Sleep$Weekday, levels = c ("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday",
                                          "Friday", "Saturday")) 
Daily_Activity_Sleep %>%  
  group_by (Weekday) %>%
  summarise (Total_Steps = mean (TotalSteps)) 
## # A tibble: 7 × 2
##   Weekday   Total_Steps
##   <fct>           <dbl>
## 1 Sunday          7298.
## 2 Monday          9273.
## 3 Tuesday         9183.
## 4 Wednesday       8023.
## 5 Thursday        8184.
## 6 Friday          7901.
## 7 Saturday        9871.

Average minutes asleep by weekday

Daily_Activity_Sleep %>%  
  group_by (Weekday) %>%
  summarise (Sleep = mean (TotalMinutesAsleep)) 
## # A tibble: 7 × 2
##   Weekday   Sleep
##   <fct>     <dbl>
## 1 Sunday     453.
## 2 Monday     420.
## 3 Tuesday    405.
## 4 Wednesday  435.
## 5 Thursday   401.
## 6 Friday     405.
## 7 Saturday   419.
Number of days per Id
Daily_Activity_Sleep %>% 
  group_by (Id) %>%
  summarise (Number_of_days = n ()) %>%
  print (n = 24)
## # A tibble: 24 × 2
##            Id Number_of_days
##         <dbl>          <int>
##  1 1503960366             25
##  2 1644430081              4
##  3 1844505072              3
##  4 1927972279              5
##  5 2026352035             28
##  6 2320127002              1
##  7 2347167796             15
##  8 3977333714             28
##  9 4020332650              8
## 10 4319703577             26
## 11 4388161847             23
## 12 4445114986             28
## 13 4558609924              5
## 14 4702921684             27
## 15 5553957443             31
## 16 5577150313             26
## 17 6117666160             18
## 18 6775888955              3
## 19 6962181067             31
## 20 7007744171              2
## 21 7086361926             24
## 22 8053475328              3
## 23 8378563200             31
## 24 8792009665             15

Consistency classification

Daily_Activity_Sleep %>% 
  group_by(Id) %>%
  summarise (Number_of_days = n ()) %>%
  mutate (Consistency = case_when(
    Number_of_days >= 1 & Number_of_days <= 10 ~ "Inconsistent",
    Number_of_days >= 11 & Number_of_days <= 20 ~ "Moderate", 
    Number_of_days >= 21 & Number_of_days <= 31 ~ "Consistent" 
  )) %>%
  count(Consistency)
## # A tibble: 3 × 2
##   Consistency      n
##   <chr>        <int>
## 1 Consistent      12
## 2 Inconsistent     9
## 3 Moderate         3

Exporting data

Activity_Sleep <- Daily_Activity_Sleep
write.csv(Activity_Sleep, file = 'Activity_Sleep.csv')