Bellabeat - Capstone Project

Preparing Data

Load the packages

# One-time setup: install whichever of the packages loaded below are missing.
# Package names must be quoted with plain ASCII quotes, one vector per call.
required_pkgs <- c(
  "readr", "tidyverse", "janitor", "lubridate",
  "here", "skimr", "ggrepel", "ggpubr"
)
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ dplyr   1.0.8
## ✓ tibble  3.1.6     ✓ stringr 1.4.0
## ✓ tidyr   1.2.0     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(here)
## here() starts at /Users/saurabh
library(skimr)
library(ggrepel)
library(ggpubr)

#Import the datasets

# Daily activity table: steps, distances, activity minutes and calories.
dailyActivity_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv"
)
## Rows: 940 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ActivityDate
## dbl (14): Id, TotalSteps, TotalDistance, TrackerDistance, LoggedActivitiesDi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Daily calories table (Id, ActivityDay, Calories).
dailyCalories_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/dailyCalories_merged.csv"
)
## Rows: 940 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityDay
## dbl (2): Id, Calories
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Daily minutes and distances broken down by intensity level.
dailyIntensities_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/dailyIntensities_merged.csv"
)
## Rows: 940 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityDay
## dbl (9): Id, SedentaryMinutes, LightlyActiveMinutes, FairlyActiveMinutes, Ve...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Daily step totals (Id, ActivityDay, StepTotal).
dailySteps_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/dailySteps_merged.csv"
)
## Rows: 940 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityDay
## dbl (2): Id, StepTotal
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Heart-rate readings (Id, Time, Value); by far the largest table loaded here.
heartrate_seconds_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/heartrate_seconds_merged.csv"
)
## Rows: 2483658 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Time
## dbl (2): Id, Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Hourly calories (Id, ActivityHour, Calories).
hourlyCalories_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/hourlyCalories_merged.csv"
)
## Rows: 22099 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityHour
## dbl (2): Id, Calories
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Hourly intensity (Id, ActivityHour, TotalIntensity, AverageIntensity).
hourlyIntensities_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/hourlyIntensities_merged.csv"
)
## Rows: 22099 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityHour
## dbl (3): Id, TotalIntensity, AverageIntensity
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Hourly step totals (Id, ActivityHour, StepTotal).
hourlySteps_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/hourlySteps_merged.csv"
)
## Rows: 22099 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityHour
## dbl (2): Id, StepTotal
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Minute-level calories, narrow layout (Id, ActivityMinute, Calories).
minuteCaloriesNarrow_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/minuteCaloriesNarrow_merged.csv"
)
## Rows: 1325580 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityMinute
## dbl (2): Id, Calories
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Minute-level calories, wide layout (Id, ActivityHour, Calories00, Calories01, ...).
minuteCaloriesWide_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/minuteCaloriesWide_merged.csv"
)
## Rows: 21645 Columns: 62
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ActivityHour
## dbl (61): Id, Calories00, Calories01, Calories02, Calories03, Calories04, Ca...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Minute-level intensity, narrow layout (Id, ActivityMinute, Intensity).
minuteIntensitiesNarrow_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/minuteIntensitiesNarrow_merged.csv"
)
## Rows: 1325580 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityMinute
## dbl (2): Id, Intensity
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Minute-level intensity, wide layout (Id, ActivityHour, Intensity00, Intensity01, ...).
minuteIntensitiesWide_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/minuteIntensitiesWide_merged.csv"
)
## Rows: 21645 Columns: 62
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ActivityHour
## dbl (61): Id, Intensity00, Intensity01, Intensity02, Intensity03, Intensity0...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Minute-level METs, narrow layout (Id, ActivityMinute, METs).
minuteMETsNarrow_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/minuteMETsNarrow_merged.csv"
)
## Rows: 1325580 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityMinute
## dbl (2): Id, METs
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Minute-level sleep log (Id, date, value, logId).
minuteSleep_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/minuteSleep_merged.csv"
)
## Rows: 188521 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): date
## dbl (3): Id, value, logId
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Minute-level steps, narrow layout (Id, ActivityMinute, Steps).
minuteStepsNarrow_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/minuteStepsNarrow_merged.csv"
)
## Rows: 1325580 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityMinute
## dbl (2): Id, Steps
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Minute-level steps, wide layout (Id, ActivityHour, Steps00, Steps01, ...).
minuteStepsWide_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/minuteStepsWide_merged.csv"
)
## Rows: 21645 Columns: 62
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ActivityHour
## dbl (61): Id, Steps00, Steps01, Steps02, Steps03, Steps04, Steps05, Steps06,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Daily sleep table (sleep records, minutes asleep, time in bed).
sleepDay_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv"
)
## Rows: 413 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): SleepDay
## dbl (4): Id, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Weight log (weight in kg and pounds, Fat, BMI, manual-report flag).
weightLogInfo_merged <- read_csv(
  file = "Downloads/Fitabase Data 4.12.16-5.12.16/weightLogInfo_merged.csv"
)
## Rows: 67 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date
## dbl (6): Id, WeightKg, WeightPounds, Fat, BMI, LogId
## lgl (1): IsManualReport
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Read Imported data

# Preview the first rows of the daily activity table.
dailyActivity_merged %>% head()
## # A tibble: 6 × 15
##        Id ActivityDate TotalSteps TotalDistance TrackerDistance LoggedActivitie…
##     <dbl> <chr>             <dbl>         <dbl>           <dbl>            <dbl>
## 1  1.50e9 4/12/2016         13162          8.5             8.5                 0
## 2  1.50e9 4/13/2016         10735          6.97            6.97                0
## 3  1.50e9 4/14/2016         10460          6.74            6.74                0
## 4  1.50e9 4/15/2016          9762          6.28            6.28                0
## 5  1.50e9 4/16/2016         12669          8.16            8.16                0
## 6  1.50e9 4/17/2016          9705          6.48            6.48                0
## # … with 9 more variables: VeryActiveDistance <dbl>,
## #   ModeratelyActiveDistance <dbl>, LightActiveDistance <dbl>,
## #   SedentaryActiveDistance <dbl>, VeryActiveMinutes <dbl>,
## #   FairlyActiveMinutes <dbl>, LightlyActiveMinutes <dbl>,
## #   SedentaryMinutes <dbl>, Calories <dbl>
# Preview the first rows of the daily calories table.
dailyCalories_merged %>% head()
## # A tibble: 6 × 3
##           Id ActivityDay Calories
##        <dbl> <chr>          <dbl>
## 1 1503960366 4/12/2016       1985
## 2 1503960366 4/13/2016       1797
## 3 1503960366 4/14/2016       1776
## 4 1503960366 4/15/2016       1745
## 5 1503960366 4/16/2016       1863
## 6 1503960366 4/17/2016       1728
# Preview the first rows of the daily intensities table.
dailyIntensities_merged %>% head()
## # A tibble: 6 × 10
##           Id ActivityDay SedentaryMinutes LightlyActiveMinutes FairlyActiveMinu…
##        <dbl> <chr>                  <dbl>                <dbl>             <dbl>
## 1 1503960366 4/12/2016                728                  328                13
## 2 1503960366 4/13/2016                776                  217                19
## 3 1503960366 4/14/2016               1218                  181                11
## 4 1503960366 4/15/2016                726                  209                34
## 5 1503960366 4/16/2016                773                  221                10
## 6 1503960366 4/17/2016                539                  164                20
## # … with 5 more variables: VeryActiveMinutes <dbl>,
## #   SedentaryActiveDistance <dbl>, LightActiveDistance <dbl>,
## #   ModeratelyActiveDistance <dbl>, VeryActiveDistance <dbl>
# Preview the first rows of the daily steps table.
dailySteps_merged %>% head()
## # A tibble: 6 × 3
##           Id ActivityDay StepTotal
##        <dbl> <chr>           <dbl>
## 1 1503960366 4/12/2016       13162
## 2 1503960366 4/13/2016       10735
## 3 1503960366 4/14/2016       10460
## 4 1503960366 4/15/2016        9762
## 5 1503960366 4/16/2016       12669
## 6 1503960366 4/17/2016        9705
# Preview the first rows of the heart-rate table.
heartrate_seconds_merged %>% head()
## # A tibble: 6 × 3
##           Id Time                 Value
##        <dbl> <chr>                <dbl>
## 1 2022484408 4/12/2016 7:21:00 AM    97
## 2 2022484408 4/12/2016 7:21:05 AM   102
## 3 2022484408 4/12/2016 7:21:10 AM   105
## 4 2022484408 4/12/2016 7:21:20 AM   103
## 5 2022484408 4/12/2016 7:21:25 AM   101
## 6 2022484408 4/12/2016 7:22:05 AM    95
# Preview the first rows of the daily sleep table.
sleepDay_merged %>% head()
## # A tibble: 6 × 5
##           Id SleepDay           TotalSleepRecor… TotalMinutesAsl… TotalTimeInBed
##        <dbl> <chr>                         <dbl>            <dbl>          <dbl>
## 1 1503960366 4/12/2016 12:00:0…                1              327            346
## 2 1503960366 4/13/2016 12:00:0…                2              384            407
## 3 1503960366 4/15/2016 12:00:0…                1              412            442
## 4 1503960366 4/16/2016 12:00:0…                2              340            367
## 5 1503960366 4/17/2016 12:00:0…                1              700            712
## 6 1503960366 4/19/2016 12:00:0…                1              304            320
# Preview the first rows of the weight log.
weightLogInfo_merged %>% head()
## # A tibble: 6 × 8
##           Id Date       WeightKg WeightPounds   Fat   BMI IsManualReport   LogId
##        <dbl> <chr>         <dbl>        <dbl> <dbl> <dbl> <lgl>            <dbl>
## 1 1503960366 5/2/2016 …     52.6         116.    22  22.6 TRUE           1.46e12
## 2 1503960366 5/3/2016 …     52.6         116.    NA  22.6 TRUE           1.46e12
## 3 1927972279 4/13/2016…    134.          294.    NA  47.5 FALSE          1.46e12
## 4 2873212765 4/21/2016…     56.7         125.    NA  21.5 TRUE           1.46e12
## 5 2873212765 5/12/2016…     57.3         126.    NA  21.7 TRUE           1.46e12
## 6 4319703577 4/17/2016…     72.4         160.    25  27.5 TRUE           1.46e12
# Preview the first rows of the hourly intensities table.
hourlyIntensities_merged %>% head()
## # A tibble: 6 × 4
##           Id ActivityHour          TotalIntensity AverageIntensity
##        <dbl> <chr>                          <dbl>            <dbl>
## 1 1503960366 4/12/2016 12:00:00 AM             20            0.333
## 2 1503960366 4/12/2016 1:00:00 AM               8            0.133
## 3 1503960366 4/12/2016 2:00:00 AM               7            0.117
## 4 1503960366 4/12/2016 3:00:00 AM               0            0    
## 5 1503960366 4/12/2016 4:00:00 AM               0            0    
## 6 1503960366 4/12/2016 5:00:00 AM               0            0

Look for missing values

# Total number of NA cells in each table (0 means nothing is missing).
dailyActivity_merged %>% is.na() %>% sum()
## [1] 0
dailyCalories_merged %>% is.na() %>% sum()
## [1] 0
dailyIntensities_merged %>% is.na() %>% sum()
## [1] 0
dailySteps_merged %>% is.na() %>% sum()
## [1] 0
heartrate_seconds_merged %>% is.na() %>% sum()
## [1] 0
sleepDay_merged %>% is.na() %>% sum()
## [1] 0
weightLogInfo_merged %>% is.na() %>% sum()
## [1] 65
hourlyIntensities_merged %>% is.na() %>% sum()
## [1] 0
  • If the value is 0 there are no missing values.
  • The weight df has 65 missing values, the rest of them aren’t missing any at all.

Counting Participants

# Number of distinct participant Ids present in each table.
length(unique(dailyActivity_merged$Id))
## [1] 33
length(unique(dailyCalories_merged$Id))
## [1] 33
length(unique(dailyIntensities_merged$Id))
## [1] 33
length(unique(dailySteps_merged$Id))
## [1] 33
length(unique(heartrate_seconds_merged$Id))
## [1] 14
length(unique(sleepDay_merged$Id))
## [1] 24
length(unique(weightLogInfo_merged$Id))
## [1] 8
length(unique(hourlyIntensities_merged$Id))
## [1] 33

#Formatting Dates

# ActivityDate strings look like "4/12/2016", i.e. a four-digit year, so the
# year must be parsed with %Y. With %y only the leading "20" is consumed and
# every record is silently dated to 2020 instead of 2016.
dailyActivity_merged$ActivityDate <- as.POSIXct(
  dailyActivity_merged$ActivityDate,
  format = "%m/%d/%Y",
  tz = Sys.timezone()
)
# Short mm/dd/yy label for the day, in the same layout as sleepDay_merged$date.
dailyActivity_merged$date <- format(
  dailyActivity_merged$ActivityDate,
  format = "%m/%d/%y"
)

# SleepDay strings carry a 12-hour clock time ("4/12/2016 12:00:00 AM").
sleepDay_merged$SleepDay <- as.POSIXct(
  sleepDay_merged$SleepDay,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = Sys.timezone()
)
sleepDay_merged$date <- format(sleepDay_merged$SleepDay, format = "%m/%d/%y")

# Split "4/12/2016 1:00:00 AM" at the first space into Date and Hour columns
# (both stay character).
dhourlyInt <- hourlyIntensities_merged %>%
  tidyr::extract(ActivityHour, c("Date", "Hour"), "([^ ]+) (.*)")

Exploratory Analysis

Looking at summaries

# Column-wise summary statistics for the daily activity table.
dailyActivity_merged %>% summary()
##        Id             ActivityDate                   TotalSteps   
##  Min.   :1.504e+09   Min.   :2020-04-12 00:00:00   Min.   :    0  
##  1st Qu.:2.320e+09   1st Qu.:2020-04-19 00:00:00   1st Qu.: 3790  
##  Median :4.445e+09   Median :2020-04-26 00:00:00   Median : 7406  
##  Mean   :4.855e+09   Mean   :2020-04-26 06:53:37   Mean   : 7638  
##  3rd Qu.:6.962e+09   3rd Qu.:2020-05-04 00:00:00   3rd Qu.:10727  
##  Max.   :8.878e+09   Max.   :2020-05-12 00:00:00   Max.   :36019  
##  TotalDistance    TrackerDistance  LoggedActivitiesDistance VeryActiveDistance
##  Min.   : 0.000   Min.   : 0.000   Min.   :0.0000           Min.   : 0.000    
##  1st Qu.: 2.620   1st Qu.: 2.620   1st Qu.:0.0000           1st Qu.: 0.000    
##  Median : 5.245   Median : 5.245   Median :0.0000           Median : 0.210    
##  Mean   : 5.490   Mean   : 5.475   Mean   :0.1082           Mean   : 1.503    
##  3rd Qu.: 7.713   3rd Qu.: 7.710   3rd Qu.:0.0000           3rd Qu.: 2.053    
##  Max.   :28.030   Max.   :28.030   Max.   :4.9421           Max.   :21.920    
##  ModeratelyActiveDistance LightActiveDistance SedentaryActiveDistance
##  Min.   :0.0000           Min.   : 0.000      Min.   :0.000000       
##  1st Qu.:0.0000           1st Qu.: 1.945      1st Qu.:0.000000       
##  Median :0.2400           Median : 3.365      Median :0.000000       
##  Mean   :0.5675           Mean   : 3.341      Mean   :0.001606       
##  3rd Qu.:0.8000           3rd Qu.: 4.782      3rd Qu.:0.000000       
##  Max.   :6.4800           Max.   :10.710      Max.   :0.110000       
##  VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
##  Min.   :  0.00    Min.   :  0.00      Min.   :  0.0        Min.   :   0.0  
##  1st Qu.:  0.00    1st Qu.:  0.00      1st Qu.:127.0        1st Qu.: 729.8  
##  Median :  4.00    Median :  6.00      Median :199.0        Median :1057.5  
##  Mean   : 21.16    Mean   : 13.56      Mean   :192.8        Mean   : 991.2  
##  3rd Qu.: 32.00    3rd Qu.: 19.00      3rd Qu.:264.0        3rd Qu.:1229.5  
##  Max.   :210.00    Max.   :143.00      Max.   :518.0        Max.   :1440.0  
##     Calories        date          
##  Min.   :   0   Length:940        
##  1st Qu.:1828   Class :character  
##  Median :2134   Mode  :character  
##  Mean   :2304                     
##  3rd Qu.:2793                     
##  Max.   :4900
# Column-wise summary statistics for the daily calories table.
dailyCalories_merged %>% summary()
##        Id            ActivityDay           Calories   
##  Min.   :1.504e+09   Length:940         Min.   :   0  
##  1st Qu.:2.320e+09   Class :character   1st Qu.:1828  
##  Median :4.445e+09   Mode  :character   Median :2134  
##  Mean   :4.855e+09                      Mean   :2304  
##  3rd Qu.:6.962e+09                      3rd Qu.:2793  
##  Max.   :8.878e+09                      Max.   :4900
# Column-wise summary statistics for the daily intensities table.
dailyIntensities_merged %>% summary()
##        Id            ActivityDay        SedentaryMinutes LightlyActiveMinutes
##  Min.   :1.504e+09   Length:940         Min.   :   0.0   Min.   :  0.0       
##  1st Qu.:2.320e+09   Class :character   1st Qu.: 729.8   1st Qu.:127.0       
##  Median :4.445e+09   Mode  :character   Median :1057.5   Median :199.0       
##  Mean   :4.855e+09                      Mean   : 991.2   Mean   :192.8       
##  3rd Qu.:6.962e+09                      3rd Qu.:1229.5   3rd Qu.:264.0       
##  Max.   :8.878e+09                      Max.   :1440.0   Max.   :518.0       
##  FairlyActiveMinutes VeryActiveMinutes SedentaryActiveDistance
##  Min.   :  0.00      Min.   :  0.00    Min.   :0.000000       
##  1st Qu.:  0.00      1st Qu.:  0.00    1st Qu.:0.000000       
##  Median :  6.00      Median :  4.00    Median :0.000000       
##  Mean   : 13.56      Mean   : 21.16    Mean   :0.001606       
##  3rd Qu.: 19.00      3rd Qu.: 32.00    3rd Qu.:0.000000       
##  Max.   :143.00      Max.   :210.00    Max.   :0.110000       
##  LightActiveDistance ModeratelyActiveDistance VeryActiveDistance
##  Min.   : 0.000      Min.   :0.0000           Min.   : 0.000    
##  1st Qu.: 1.945      1st Qu.:0.0000           1st Qu.: 0.000    
##  Median : 3.365      Median :0.2400           Median : 0.210    
##  Mean   : 3.341      Mean   :0.5675           Mean   : 1.503    
##  3rd Qu.: 4.782      3rd Qu.:0.8000           3rd Qu.: 2.053    
##  Max.   :10.710      Max.   :6.4800           Max.   :21.920
# Column-wise summary statistics for the daily steps table.
dailySteps_merged %>% summary()
##        Id            ActivityDay          StepTotal    
##  Min.   :1.504e+09   Length:940         Min.   :    0  
##  1st Qu.:2.320e+09   Class :character   1st Qu.: 3790  
##  Median :4.445e+09   Mode  :character   Median : 7406  
##  Mean   :4.855e+09                      Mean   : 7638  
##  3rd Qu.:6.962e+09                      3rd Qu.:10727  
##  Max.   :8.878e+09                      Max.   :36019
# Column-wise summary statistics for the daily sleep table.
sleepDay_merged %>% summary()
##        Id               SleepDay                   TotalSleepRecords
##  Min.   :1.504e+09   Min.   :2016-04-12 00:00:00   Min.   :1.000    
##  1st Qu.:3.977e+09   1st Qu.:2016-04-19 00:00:00   1st Qu.:1.000    
##  Median :4.703e+09   Median :2016-04-27 00:00:00   Median :1.000    
##  Mean   :5.001e+09   Mean   :2016-04-26 12:40:05   Mean   :1.119    
##  3rd Qu.:6.962e+09   3rd Qu.:2016-05-04 00:00:00   3rd Qu.:1.000    
##  Max.   :8.792e+09   Max.   :2016-05-12 00:00:00   Max.   :3.000    
##  TotalMinutesAsleep TotalTimeInBed      date          
##  Min.   : 58.0      Min.   : 61.0   Length:413        
##  1st Qu.:361.0      1st Qu.:403.0   Class :character  
##  Median :433.0      Median :463.0   Mode  :character  
##  Mean   :419.5      Mean   :458.6                     
##  3rd Qu.:490.0      3rd Qu.:526.0                     
##  Max.   :796.0      Max.   :961.0
# Column-wise summary statistics for the hourly intensities table.
hourlyIntensities_merged %>% summary()
##        Id            ActivityHour       TotalIntensity   AverageIntensity
##  Min.   :1.504e+09   Length:22099       Min.   :  0.00   Min.   :0.0000  
##  1st Qu.:2.320e+09   Class :character   1st Qu.:  0.00   1st Qu.:0.0000  
##  Median :4.445e+09   Mode  :character   Median :  3.00   Median :0.0500  
##  Mean   :4.848e+09                      Mean   : 12.04   Mean   :0.2006  
##  3rd Qu.:6.962e+09                      3rd Qu.: 16.00   3rd Qu.:0.2667  
##  Max.   :8.878e+09                      Max.   :180.00   Max.   :3.0000
  • I can see a lot of information from the summaries.
  • The timeframe is 1 month, from 4/12/16 to 5/12/16
  • They burned an average of about 2,300 calories per day (mean 2,304)
  • The average steps taken per day is 7638
  • The max record of sedentary minutes is 1440

#Calculating unused device days

  • Let’s first see how many days the 33 participants did not use their devices.
  • To do this, I will count the days where SedentaryMinutes = 1440, since 1440 is the total number of minutes in a day.
  • It is highly unlikely that participants did not move at all for a whole day. It makes more sense that the fitbit was not used during those days.

# Per participant, count the days logged as fully sedentary (1440 minutes),
# which this analysis treats as days the device was not worn.
deviceUnused <- dailyActivity_merged %>%
  filter(SedentaryMinutes == 1440) %>%
  count(Id, name = "countofUnusedDays")
print(deviceUnused)
## # A tibble: 17 × 2
##            Id countofUnusedDays
##         <dbl>             <int>
##  1 1503960366                 1
##  2 1844505072                 9
##  3 1927972279                13
##  4 4020332650                14
##  5 4057192912                 1
##  6 4319703577                 1
##  7 4388161847                 1
##  8 4702921684                 1
##  9 5577150313                 2
## 10 6117666160                 5
## 11 6290855005                 4
## 12 6775888955                 9
## 13 7007744171                 1
## 14 7086361926                 1
## 15 8253242879                 1
## 16 8583815059                 6
## 17 8792009665                 9
# How many participant records exist for each calendar day.
print(count(dailyActivity_merged, ActivityDate, name = "count"))
## # A tibble: 31 × 2
##    ActivityDate        count
##    <dttm>              <int>
##  1 2020-04-12 00:00:00    33
##  2 2020-04-13 00:00:00    33
##  3 2020-04-14 00:00:00    33
##  4 2020-04-15 00:00:00    33
##  5 2020-04-16 00:00:00    32
##  6 2020-04-17 00:00:00    32
##  7 2020-04-18 00:00:00    32
##  8 2020-04-19 00:00:00    32
##  9 2020-04-20 00:00:00    32
## 10 2020-04-21 00:00:00    32
## # … with 21 more rows
# Length of the tracking period in days, taken from the data rather than
# hard-coded, so the percentage stays correct if the export window changes.
tracking_days <- n_distinct(dailyIntensities_merged$ActivityDay)

# Per participant: number of fully sedentary (1440-minute) days and that
# number as a percentage of the tracking period.
percentDaysUnused <- dailyIntensities_merged %>%
  filter(SedentaryMinutes == 1440) %>%
  group_by(Id) %>%
  summarise(
    unusedDays = n(),
    percentUnused = (unusedDays / tracking_days) * 100
  )

# Preview the per-participant unused-day percentages.
percentDaysUnused %>% head()
## # A tibble: 6 × 3
##           Id unusedDays percentUnused
##        <dbl>      <int>         <dbl>
## 1 1503960366          1          3.23
## 2 1844505072          9         29.0 
## 3 1927972279         13         41.9 
## 4 4020332650         14         45.2 
## 5 4057192912          1          3.23
## 6 4319703577          1          3.23
  • It looks like two users went almost half the month without using their fitbit.
  • About half of the participants (17 of 33) missed at least one day; the other 16 used it every day.

Visualization

Percent of fitbit usage

Now let’s make a pie chart to show total fitbit usage from all users, showing how many missed 1 day of use, missed up to a week of use, missed more than a week of use, and how many did not miss any days.

# Number of participants for each distinct count of unused days.
piechart <- count(percentDaysUnused, unusedDays, name = "totalParticipants")
piechart %>% head()
## # A tibble: 6 × 2
##   unusedDays totalParticipants
##        <int>             <int>
## 1          1                 8
## 2          2                 1
## 3          4                 1
## 4          5                 1
## 5          6                 1
## 6          9                 3
# Derive the pie slices from the data instead of typing percentages by hand.
# percentDaysUnused holds one row per participant with at least one unused
# day; everyone else in the table it was built from used the device daily.
total_participants <- n_distinct(dailyIntensities_merged$Id)
unused_days <- percentDaysUnused$unusedDays
slices <- c(
  total_participants - length(unused_days),
  sum(unused_days == 1),
  sum(unused_days >= 2 & unused_days <= 7),
  sum(unused_days > 7)
)
lbls <- c("Used everyday", "Unused for 1 day", "Unused 2 to 7 days", "Unused >7 days")
# Share of participants in each group, rounded to whole percent for the labels.
pct <- round(slices / sum(slices) * 100)
lbls <- paste0(lbls, " ", pct, "%")
pie(slices, labels = lbls, col = rainbow(length(lbls)),
    main = "Percents of Fitbit Usage")

  • Roughly half of the users (17 of 33) have at least one day on which the fitbit appears not to have been worn.
  • We could use the app to remind users to wear their device daily for the most health benefit.

Activity Level

Now let’s make a new table to filter out the unused days and group users by activity level and look at that with a bar chart.

# Per participant, total minutes in each activity level over the days that are
# not fully sedentary, plus each level's share of that participant's minutes.
participantActivity <- dailyActivity_merged %>%
  filter(SedentaryMinutes != 1440) %>%
  group_by(Id) %>%
  summarize(
    total_very_active_mins = sum(VeryActiveMinutes),
    total_fairly_active_mins = sum(FairlyActiveMinutes),
    total_lightly_active_mins = sum(LightlyActiveMinutes),
    total_sedentary_mins = sum(SedentaryMinutes)
  ) %>%
  mutate(
    total_mins = total_very_active_mins + total_fairly_active_mins +
      total_lightly_active_mins + total_sedentary_mins,
    percent_very_active = (total_very_active_mins / total_mins) * 100,
    # NOTE(review): "faily" is a typo for "fairly"; the name is kept because
    # later code refers to this column.
    percent_faily_active = (total_fairly_active_mins / total_mins) * 100,
    percent_lightly_active = (total_lightly_active_mins / total_mins) * 100,
    percent_sedentary_active = (total_sedentary_mins / total_mins) * 100
  )
# Preview the per-participant activity totals and shares.
participantActivity %>% head()
## # A tibble: 6 × 10
##           Id total_very_acti… total_fairly_ac… total_lightly_a… total_sedentary…
##        <dbl>            <dbl>            <dbl>            <dbl>            <dbl>
## 1 1503960366             1200              594             6818            24853
## 2 1624580081              269              180             4758            38990
## 3 1644430081              287              641             5354            34856
## 4 1844505072                4               40             3579            24445
## 5 1927972279               41               24             1196            22120
## 6 2022484408             1125              600             7981            34490
## # … with 5 more variables: total_mins <dbl>, percent_very_active <dbl>,
## #   percent_faily_active <dbl>, percent_lightly_active <dbl>,
## #   percent_sedentary_active <dbl>
# Cut-offs: the average share across all participants for each activity level.
very_cutoff <- mean(participantActivity$percent_very_active)
fairly_cutoff <- mean(participantActivity$percent_faily_active)
lightly_cutoff <- mean(participantActivity$percent_lightly_active)
sedentary_cutoff <- mean(participantActivity$percent_sedentary_active)

# Label each participant with the first level (checked from most to least
# active) in which their share exceeds the average; later levels are not
# considered once one matches.
participantActivity <- participantActivity %>%
  mutate(
    intensity = case_when(
      percent_very_active > very_cutoff ~ "Very Active",
      percent_faily_active > fairly_cutoff ~ "Fairly Active",
      percent_lightly_active > lightly_cutoff ~ "Lightly Active",
      percent_sedentary_active > sedentary_cutoff ~ "Sedentary"
    )
  )

# Number of participants carrying each intensity label.
newActivity <- count(participantActivity, intensity, name = "count")

# Bar chart of participants per intensity label. The counts are already
# computed, so geom_col() (bar height = y value) is the right geom;
# geom_histogram(stat = "identity") only worked by overriding the histogram's
# binning and emitted an "unknown parameters" warning.
ggplot(newActivity, aes(x = intensity, y = count, fill = intensity)) +
  geom_col() +
  ylab("Number of Participants") +
  xlab("Intensity Type") +
  labs(title = "Number of Participants by Intensity") +
  theme(legend.position = "none")

Hourly Intensities

# Average TotalIntensity for each clock hour, using only complete rows.
newHourlyInt <- dhourlyInt %>%
  drop_na() %>%
  group_by(Hour) %>%
  summarise(mean_total_int = mean(TotalIntensity))

# Preview the hourly averages.
newHourlyInt %>% head()
## # A tibble: 6 × 2
##   Hour        mean_total_int
##   <chr>                <dbl>
## 1 1:00:00 AM            1.42
## 2 1:00:00 PM           18.8 
## 3 10:00:00 AM          17.6 
## 4 10:00:00 PM           9.06
## 5 11:00:00 AM          16.9 
## 6 11:00:00 PM           5.00
# Hour is a character label such as "1:00:00 PM", so by default the x axis is
# sorted alphabetically ("1:00:00 AM", "1:00:00 PM", "10:00:00 AM", ...).
# Convert each label to its 0-23 hour and order the axis by that, so the bars
# run from midnight to 11 PM.
newHourlyInt %>%
  mutate(
    # "12" maps to 0 via %% 12; PM labels are shifted by 12 hours.
    hour_24 = as.integer(sub(":.*", "", Hour)) %% 12L + 12L * grepl("PM", Hour),
    Hour = reorder(Hour, hour_24)
  ) %>%
  ggplot(aes(x = Hour, y = mean_total_int)) +
  # The averages are precomputed, so draw them directly with geom_col().
  geom_col(fill = "cyan4") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Average Total Intensity vs. Time", y = "Average Intensity")

Looks like people are most active between 5:00 PM & 7:00 PM. _This would be a good time to send out that notification to remind and motivate the sedentary users (or all users) to be more active._

Looking at Sleep Pattern

# Pair each sleep record with the activity record of the SAME participant on
# the SAME calendar day. Joining on Id alone matches every sleep record of a
# participant with every one of their activity days, which multiplies rows and
# distorts the sleep-vs-sedentary comparison.
# The day key is month/day only: both tables cover the same single window
# (April 12 to May 12), so month/day identifies a day uniquely.
sleep_by_day <- sleepDay_merged %>%
  mutate(day_key = format(SleepDay, "%m/%d"))
activity_by_day <- dailyActivity_merged %>%
  mutate(day_key = format(ActivityDate, "%m/%d"))

# Fully sedentary days (1440 minutes) are dropped as non-wear days.
merged_data <- merge(sleep_by_day, activity_by_day, by = c("Id", "day_key")) %>%
  filter(SedentaryMinutes != 1440)

# Scatter of minutes asleep against sedentary minutes with a smoothed trend.
ggplot(merged_data, aes(TotalMinutesAsleep, SedentaryMinutes)) +
  geom_point(color = "orange") +
  geom_smooth(color = "blue") +
  ggtitle("Minutes Asleep vs.Sedentary Minutes") +
  xlab("Total Minutes Asleep") +
  ylab("Sedentary Minutes")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Conclusion

The Bellabeat time device is similar to the Fitbit watches, and the apps are also comparable. The following strategies can be used to increase usage of both devices.