Introduction
This is an analysis of smart device usage data of about thirty (30)
Fitbit users who consented to the submission of personal tracker data.
The aim is to gain insight into how people are already using their smart
devices and then use this insight to make high-level recommendations for
how these trends can inform Bellabeat marketing strategy.
Data Cleaning and Manipulation
Loading packages
library (tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (lubridate)
library (dplyr)
Importing the csv files
Daily_Activity <- read.csv ("dailyActivity_merged.csv")
Daily_Sleep <- read.csv ("sleepDay_merged.csv")
Checking the structure of the data frames
str (Daily_Activity)
## 'data.frame': 940 obs. of 15 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate : chr "4/12/2016" "4/13/2016" "4/14/2016" "4/15/2016" ...
## $ TotalSteps : int 13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
## $ TotalDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ TrackerDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.44 2.14 2.71 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 0.4 1.26 0.41 ...
## $ LightActiveDistance : num 6.06 4.71 3.91 2.83 5.04 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : int 25 21 30 29 36 38 42 50 28 19 ...
## $ FairlyActiveMinutes : int 13 19 11 34 10 20 16 31 12 8 ...
## $ LightlyActiveMinutes : int 328 217 181 209 221 164 233 264 205 211 ...
## $ SedentaryMinutes : int 728 776 1218 726 773 539 1149 775 818 838 ...
## $ Calories : int 1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
str (Daily_Sleep)
## 'data.frame': 413 obs. of 5 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ SleepDay : chr "4/12/2016 12:00:00 AM" "4/13/2016 12:00:00 AM" "4/15/2016 12:00:00 AM" "4/16/2016 12:00:00 AM" ...
## $ TotalSleepRecords : int 1 2 1 2 1 1 1 1 1 1 ...
## $ TotalMinutesAsleep: int 327 384 412 340 700 304 360 325 361 430 ...
## $ TotalTimeInBed : int 346 407 442 367 712 320 377 364 384 449 ...
Correcting the inconsistencies on the date columns
Changing the data type of the date columns
Daily_Activity$ActivityDate <- as.Date(Daily_Activity$ActivityDate, "%m/%d/%Y")
Daily_Sleep$SleepDay <- as.Date(Daily_Sleep$SleepDay, "%m/%d/%Y")
Renaming the date columns
Daily_Activity <- Daily_Activity %>%
rename (Date = ActivityDate)
Daily_Sleep <- Daily_Sleep%>%
rename (Date = SleepDay)
Inspecting correction
str (Daily_Activity)
## 'data.frame': 940 obs. of 15 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ Date : Date, format: "2016-04-12" "2016-04-13" ...
## $ TotalSteps : int 13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
## $ TotalDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ TrackerDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.44 2.14 2.71 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 0.4 1.26 0.41 ...
## $ LightActiveDistance : num 6.06 4.71 3.91 2.83 5.04 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : int 25 21 30 29 36 38 42 50 28 19 ...
## $ FairlyActiveMinutes : int 13 19 11 34 10 20 16 31 12 8 ...
## $ LightlyActiveMinutes : int 328 217 181 209 221 164 233 264 205 211 ...
## $ SedentaryMinutes : int 728 776 1218 726 773 539 1149 775 818 838 ...
## $ Calories : int 1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
str (Daily_Sleep)
## 'data.frame': 413 obs. of 5 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ Date : Date, format: "2016-04-12" "2016-04-13" ...
## $ TotalSleepRecords : int 1 2 1 2 1 1 1 1 1 1 ...
## $ TotalMinutesAsleep: int 327 384 412 340 700 304 360 325 361 430 ...
## $ TotalTimeInBed : int 346 407 442 367 712 320 377 364 384 449 ...
Further investigation and cleaning
Checking for null values
sum (is.na(Daily_Activity))
## [1] 0
sum (is.na(Daily_Sleep))
## [1] 0
Checking for duplicates
sum (duplicated(Daily_Activity))
## [1] 0
sum (duplicated(Daily_Sleep))
## [1] 3
Removing duplicates from Daily_Sleep
Daily_sleep <- unique (Daily_Sleep)
sum (duplicated(Daily_sleep))
## [1] 0
Checking the number of Participants
n_distinct (Daily_Activity$Id)
## [1] 33
n_distinct (Daily_sleep$Id)
## [1] 24
Merging data
Daily_Activity_Sleep <- merge (Daily_Activity, Daily_sleep, by = c ("Id", "Date"))
Adding more columns
Daily_Activity_Sleep$Weekday <- format (as.Date(Daily_Activity_Sleep$Date), "%A") #Adds Weekday column
Daily_Activity_Sleep$TotalActiveMinutes = rowSums (Daily_Activity_Sleep [ c ("VeryActiveMinutes", #Adds Total active minutes column
"FairlyActiveMinutes",
"LightlyActiveMinutes")])
colnames (Daily_Activity_Sleep)
## [1] "Id" "Date"
## [3] "TotalSteps" "TotalDistance"
## [5] "TrackerDistance" "LoggedActivitiesDistance"
## [7] "VeryActiveDistance" "ModeratelyActiveDistance"
## [9] "LightActiveDistance" "SedentaryActiveDistance"
## [11] "VeryActiveMinutes" "FairlyActiveMinutes"
## [13] "LightlyActiveMinutes" "SedentaryMinutes"
## [15] "Calories" "TotalSleepRecords"
## [17] "TotalMinutesAsleep" "TotalTimeInBed"
## [19] "Weekday" "TotalActiveMinutes"
str (Daily_Activity_Sleep)
## 'data.frame': 410 obs. of 20 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ Date : Date, format: "2016-04-12" "2016-04-13" ...
## $ TotalSteps : int 13162 10735 9762 12669 9705 15506 10544 9819 14371 10039 ...
## $ TotalDistance : num 8.5 6.97 6.28 8.16 6.48 ...
## $ TrackerDistance : num 8.5 6.97 6.28 8.16 6.48 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.14 2.71 3.19 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 1.26 0.41 0.78 ...
## $ LightActiveDistance : num 6.06 4.71 2.83 5.04 2.51 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : int 25 21 29 36 38 50 28 19 41 39 ...
## $ FairlyActiveMinutes : int 13 19 34 10 20 31 12 8 21 5 ...
## $ LightlyActiveMinutes : int 328 217 209 221 164 264 205 211 262 238 ...
## $ SedentaryMinutes : int 728 776 726 773 539 775 818 838 732 709 ...
## $ Calories : int 1985 1797 1745 1863 1728 2035 1786 1775 1949 1788 ...
## $ TotalSleepRecords : int 1 2 1 2 1 1 1 1 1 1 ...
## $ TotalMinutesAsleep : int 327 384 412 340 700 304 360 325 361 430 ...
## $ TotalTimeInBed : int 346 407 442 367 712 320 377 364 384 449 ...
## $ Weekday : chr "Tuesday" "Wednesday" "Friday" "Saturday" ...
## $ TotalActiveMinutes : num 366 257 272 267 222 345 245 238 324 282 ...
n_distinct (Daily_Activity_Sleep$Id)
## [1] 24
Analysis
Quick summary statistics
Daily_Activity_Sleep %>%
select (TotalSteps,
TotalDistance,
SedentaryMinutes,
TotalActiveMinutes,
Calories,
TotalSleepRecords,
TotalMinutesAsleep,
TotalTimeInBed)%>%
summary ()
## TotalSteps TotalDistance SedentaryMinutes TotalActiveMinutes
## Min. : 17 Min. : 0.010 Min. : 0.0 Min. : 2.0
## 1st Qu.: 5189 1st Qu.: 3.592 1st Qu.: 631.2 1st Qu.:206.5
## Median : 8913 Median : 6.270 Median : 717.0 Median :263.5
## Mean : 8515 Mean : 6.012 Mean : 712.1 Mean :259.5
## 3rd Qu.:11370 3rd Qu.: 8.005 3rd Qu.: 782.8 3rd Qu.:315.5
## Max. :22770 Max. :17.540 Max. :1265.0 Max. :540.0
## Calories TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## Min. : 257 Min. :1.00 Min. : 58.0 Min. : 61.0
## 1st Qu.:1841 1st Qu.:1.00 1st Qu.:361.0 1st Qu.:403.8
## Median :2207 Median :1.00 Median :432.5 Median :463.0
## Mean :2389 Mean :1.12 Mean :419.2 Mean :458.5
## 3rd Qu.:2920 3rd Qu.:1.00 3rd Qu.:490.0 3rd Qu.:526.0
## Max. :4900 Max. :3.00 Max. :796.0 Max. :961.0
Comparing average distance (Very active distance, Moderately active
distance and Light active distance)
Daily_Activity_Sleep %>%
summarise (Very_active_distance = mean (VeryActiveDistance),
Moderately_active_distance = mean (ModeratelyActiveDistance),
Light_active_distance = mean (LightActiveDistance)) %>%
round(3)
## Very_active_distance Moderately_active_distance Light_active_distance
## 1 1.446 0.744 3.791
Average steps by weekday
Daily_Activity_Sleep$Weekday <- factor (Daily_Activity_Sleep$Weekday, levels = c ("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday"))
Daily_Activity_Sleep %>%
group_by (Weekday) %>%
summarise (Total_Steps = mean (TotalSteps))
## # A tibble: 7 × 2
## Weekday Total_Steps
## <fct> <dbl>
## 1 Sunday 7298.
## 2 Monday 9273.
## 3 Tuesday 9183.
## 4 Wednesday 8023.
## 5 Thursday 8184.
## 6 Friday 7901.
## 7 Saturday 9871.
Average minutes asleep by weekday
Daily_Activity_Sleep %>%
group_by (Weekday) %>%
summarise (Sleep = mean (TotalMinutesAsleep))
## # A tibble: 7 × 2
## Weekday Sleep
## <fct> <dbl>
## 1 Sunday 453.
## 2 Monday 420.
## 3 Tuesday 405.
## 4 Wednesday 435.
## 5 Thursday 401.
## 6 Friday 405.
## 7 Saturday 419.
Number of days per Id
Daily_Activity_Sleep %>%
group_by (Id) %>%
summarise (Number_of_days = n ()) %>%
print (n = 24)
## # A tibble: 24 × 2
## Id Number_of_days
## <dbl> <int>
## 1 1503960366 25
## 2 1644430081 4
## 3 1844505072 3
## 4 1927972279 5
## 5 2026352035 28
## 6 2320127002 1
## 7 2347167796 15
## 8 3977333714 28
## 9 4020332650 8
## 10 4319703577 26
## 11 4388161847 23
## 12 4445114986 28
## 13 4558609924 5
## 14 4702921684 27
## 15 5553957443 31
## 16 5577150313 26
## 17 6117666160 18
## 18 6775888955 3
## 19 6962181067 31
## 20 7007744171 2
## 21 7086361926 24
## 22 8053475328 3
## 23 8378563200 31
## 24 8792009665 15
Consistency classification
Daily_Activity_Sleep %>%
group_by(Id) %>%
summarise (Number_of_days = n ()) %>%
mutate (Consistency = case_when(
Number_of_days >= 1 & Number_of_days <= 10 ~ "Inconsistent",
Number_of_days >= 11 & Number_of_days <= 20 ~ "Moderate",
Number_of_days >= 21 & Number_of_days <= 31 ~ "Consistent"
)) %>%
count(Consistency)
## # A tibble: 3 × 2
## Consistency n
## <chr> <int>
## 1 Consistent 12
## 2 Inconsistent 9
## 3 Moderate 3
Exporting data
Activity_Sleep <- Daily_Activity_Sleep
write.csv(Activity_Sleep, file = 'Activity_Sleep.csv')