Load Libraries

# Install tidyverse only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

Load CSV files

# Read the daily-activity and sleep CSV exports into data frames.
# Both paths are relative, so they are resolved against the working directory.
daily_activity <- read.csv(file = "dailyActivity_merged.csv")
sleep_day <- read.csv(file = "sleepDay_merged.csv")

Explore the data

The daily_activity data

# Preview the first six rows of the activity data.
head(daily_activity, n = 6L)
##           Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366    4/12/2016      13162          8.50            8.50
## 2 1503960366    4/13/2016      10735          6.97            6.97
## 3 1503960366    4/14/2016      10460          6.74            6.74
## 4 1503960366    4/15/2016       9762          6.28            6.28
## 5 1503960366    4/16/2016      12669          8.16            8.16
## 6 1503960366    4/17/2016       9705          6.48            6.48
##   LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1                        0               1.88                     0.55
## 2                        0               1.57                     0.69
## 3                        0               2.44                     0.40
## 4                        0               2.14                     1.26
## 5                        0               2.71                     0.41
## 6                        0               3.19                     0.78
##   LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1                6.06                       0                25
## 2                4.71                       0                21
## 3                3.91                       0                30
## 4                2.83                       0                29
## 5                5.04                       0                36
## 6                2.51                       0                38
##   FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1                  13                  328              728     1985
## 2                  19                  217              776     1797
## 3                  11                  181             1218     1776
## 4                  34                  209              726     1745
## 5                  10                  221              773     1863
## 6                  20                  164              539     1728

Columns in the daily_activity data.

# List the column names of the activity data frame.
names(daily_activity)
##  [1] "Id"                       "ActivityDate"            
##  [3] "TotalSteps"               "TotalDistance"           
##  [5] "TrackerDistance"          "LoggedActivitiesDistance"
##  [7] "VeryActiveDistance"       "ModeratelyActiveDistance"
##  [9] "LightActiveDistance"      "SedentaryActiveDistance" 
## [11] "VeryActiveMinutes"        "FairlyActiveMinutes"     
## [13] "LightlyActiveMinutes"     "SedentaryMinutes"        
## [15] "Calories"

The sleep_day data

# Preview the first six rows of the sleep data.
head(sleep_day, n = 6L)
##           Id              SleepDay TotalSleepRecords TotalMinutesAsleep
## 1 1503960366 4/12/2016 12:00:00 AM                 1                327
## 2 1503960366 4/13/2016 12:00:00 AM                 2                384
## 3 1503960366 4/15/2016 12:00:00 AM                 1                412
## 4 1503960366 4/16/2016 12:00:00 AM                 2                340
## 5 1503960366 4/17/2016 12:00:00 AM                 1                700
## 6 1503960366 4/19/2016 12:00:00 AM                 1                304
##   TotalTimeInBed
## 1            346
## 2            407
## 3            442
## 4            367
## 5            712
## 6            320

Columns in the sleep_day data.

# List the column names of the sleep data frame.
names(sleep_day)
## [1] "Id"                 "SleepDay"           "TotalSleepRecords" 
## [4] "TotalMinutesAsleep" "TotalTimeInBed"

Summarize the data

Count the distinct participant Ids in each data set

# Number of distinct Ids in the activity data.
daily_activity %>% pull(Id) %>% n_distinct()
## [1] 33
# Number of distinct Ids in the sleep data.
sleep_day %>% pull(Id) %>% n_distinct()
## [1] 24

Compare observations

# Row count of the activity data (first element of its dimensions).
dim(daily_activity)[1L]
## [1] 940
# Row count of the sleep data.
dim(sleep_day)[1L]
## [1] 413

For the daily activity dataframe:

# Summary statistics for the step, distance and sedentary-minute columns.
summary(daily_activity[c("TotalSteps", "TotalDistance", "SedentaryMinutes")])
##    TotalSteps    TotalDistance    SedentaryMinutes
##  Min.   :    0   Min.   : 0.000   Min.   :   0.0  
##  1st Qu.: 3790   1st Qu.: 2.620   1st Qu.: 729.8  
##  Median : 7406   Median : 5.245   Median :1057.5  
##  Mean   : 7638   Mean   : 5.490   Mean   : 991.2  
##  3rd Qu.:10727   3rd Qu.: 7.713   3rd Qu.:1229.5  
##  Max.   :36019   Max.   :28.030   Max.   :1440.0

For the sleep dataframe:

# Summary statistics for the sleep-record count, minutes asleep and time in bed.
summary(
  sleep_day[c("TotalSleepRecords", "TotalMinutesAsleep", "TotalTimeInBed")]
)
##  TotalSleepRecords TotalMinutesAsleep TotalTimeInBed 
##  Min.   :1.000     Min.   : 58.0      Min.   : 61.0  
##  1st Qu.:1.000     1st Qu.:361.0      1st Qu.:403.0  
##  Median :1.000     Median :433.0      Median :463.0  
##  Mean   :1.119     Mean   :419.5      Mean   :458.6  
##  3rd Qu.:1.000     3rd Qu.:490.0      3rd Qu.:526.0  
##  Max.   :3.000     Max.   :796.0      Max.   :961.0

Visualize the data

# Scatter plot of total steps against sedentary minutes.
ggplot(daily_activity, aes(TotalSteps, SedentaryMinutes)) +
  geom_point()

# Scatter plot of total minutes asleep against total time in bed.
ggplot(sleep_day, aes(TotalMinutesAsleep, TotalTimeInBed)) +
  geom_point()

Merging the datasets

# Join sleep and activity records on participant AND calendar day.
#
# Merging on "Id" alone pairs every sleep row with every activity row of the
# same participant (413 sleep rows fan out to 12,441 merged rows), so each
# night of sleep ends up matched with unrelated activity days.
#
# SleepDay looks like "4/12/2016 12:00:00 AM" and ActivityDate like
# "4/12/2016"; as.Date() with an explicit format ignores the trailing time
# text, so both parse to the same Date and can serve as a shared key.
# All original columns are kept; a `Date` key column is added.
# NOTE: any printed output recorded after this statement was produced with the
# Id-only merge and needs to be regenerated.
combined_data <- merge(
  transform(sleep_day, Date = as.Date(SleepDay, format = "%m/%d/%Y")),
  transform(daily_activity, Date = as.Date(ActivityDate, format = "%m/%d/%Y")),
  by = c("Id", "Date")
)

Number of participants in the merged data set.

# Number of distinct Ids present in the merged data.
combined_data %>% pull(Id) %>% n_distinct()
## [1] 24

Explore merged data

# List the column names of the merged data frame.
names(combined_data)
##  [1] "Id"                       "SleepDay"                
##  [3] "TotalSleepRecords"        "TotalMinutesAsleep"      
##  [5] "TotalTimeInBed"           "ActivityDate"            
##  [7] "TotalSteps"               "TotalDistance"           
##  [9] "TrackerDistance"          "LoggedActivitiesDistance"
## [11] "VeryActiveDistance"       "ModeratelyActiveDistance"
## [13] "LightActiveDistance"      "SedentaryActiveDistance" 
## [15] "VeryActiveMinutes"        "FairlyActiveMinutes"     
## [17] "LightlyActiveMinutes"     "SedentaryMinutes"        
## [19] "Calories"
# Compact overview of the merged data: row/column counts plus each column's
# type and first values.
combined_data %>% glimpse()
## Rows: 12,441
## Columns: 19
## $ Id                       <dbl> 1503960366, 1503960366, 1503960366, 150396036…
## $ SleepDay                 <chr> "4/12/2016 12:00:00 AM", "4/12/2016 12:00:00 …
## $ TotalSleepRecords        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ TotalMinutesAsleep       <int> 327, 327, 327, 327, 327, 327, 327, 327, 327, …
## $ TotalTimeInBed           <int> 346, 346, 346, 346, 346, 346, 346, 346, 346, …
## $ ActivityDate             <chr> "5/7/2016", "5/6/2016", "5/1/2016", "4/30/201…
## $ TotalSteps               <int> 11992, 12159, 10602, 14673, 13162, 10735, 153…
## $ TotalDistance            <dbl> 7.71, 8.03, 6.81, 9.25, 8.50, 6.97, 9.80, 8.9…
## $ TrackerDistance          <dbl> 7.71, 8.03, 6.81, 9.25, 8.50, 6.97, 9.80, 8.9…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance       <dbl> 2.46, 1.97, 2.29, 3.56, 1.88, 1.57, 5.29, 2.9…
## $ ModeratelyActiveDistance <dbl> 2.12, 0.25, 1.60, 1.42, 0.55, 0.69, 0.57, 1.0…
## $ LightActiveDistance      <dbl> 3.13, 5.81, 2.92, 4.27, 6.06, 4.71, 3.94, 4.8…
## $ SedentaryActiveDistance  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveMinutes        <int> 37, 24, 33, 52, 25, 21, 73, 45, 48, 16, 31, 7…
## $ FairlyActiveMinutes      <int> 46, 6, 35, 34, 13, 19, 14, 24, 28, 12, 23, 11…
## $ LightlyActiveMinutes     <int> 175, 289, 246, 217, 328, 217, 216, 250, 189, …
## $ SedentaryMinutes         <int> 833, 754, 730, 712, 728, 776, 814, 857, 782, …
## $ Calories                 <int> 1821, 1896, 1820, 1947, 1985, 1797, 2013, 195…

Visualize combined data

# Calories against minutes spent at each activity level; points are also
# colored by Calories. Only the first plot carries a title.
ggplot(combined_data, aes(VeryActiveMinutes, Calories, color = Calories)) +
  geom_point() +
  ggtitle("Calories by very Active Minutes")

ggplot(combined_data, aes(FairlyActiveMinutes, Calories, color = Calories)) +
  geom_point()

ggplot(combined_data, aes(LightlyActiveMinutes, Calories, color = Calories)) +
  geom_point()

Calorie Findings: Based on the above visualizations, the number of minutes spent at each activity intensity appears to be correlated with the calories burned. This could be a valuable message for the Bellabeat marketing team.