# Install tidyverse only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Load the two data sets; both paths are relative to the working directory.
daily_activity <- read.csv("dailyActivity_merged.csv")
sleep_day <- read.csv("sleepDay_merged.csv")
The daily_activity data
# Show the first six rows of the activity table.
head(daily_activity, n = 6L)
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366 4/12/2016 13162 8.50 8.50
## 2 1503960366 4/13/2016 10735 6.97 6.97
## 3 1503960366 4/14/2016 10460 6.74 6.74
## 4 1503960366 4/15/2016 9762 6.28 6.28
## 5 1503960366 4/16/2016 12669 8.16 8.16
## 6 1503960366 4/17/2016 9705 6.48 6.48
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1 0 1.88 0.55
## 2 0 1.57 0.69
## 3 0 2.44 0.40
## 4 0 2.14 1.26
## 5 0 2.71 0.41
## 6 0 3.19 0.78
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1 6.06 0 25
## 2 4.71 0 21
## 3 3.91 0 30
## 4 2.83 0 29
## 5 5.04 0 36
## 6 2.51 0 38
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1 13 328 728 1985
## 2 19 217 776 1797
## 3 11 181 1218 1776
## 4 34 209 726 1745
## 5 10 221 773 1863
## 6 20 164 539 1728
Columns in the daily_activity data.
# List the column names of the activity table.
names(daily_activity)
## [1] "Id" "ActivityDate"
## [3] "TotalSteps" "TotalDistance"
## [5] "TrackerDistance" "LoggedActivitiesDistance"
## [7] "VeryActiveDistance" "ModeratelyActiveDistance"
## [9] "LightActiveDistance" "SedentaryActiveDistance"
## [11] "VeryActiveMinutes" "FairlyActiveMinutes"
## [13] "LightlyActiveMinutes" "SedentaryMinutes"
## [15] "Calories"
The sleep_day data
# Show the first six rows of the sleep table.
head(sleep_day, n = 6L)
## Id SleepDay TotalSleepRecords TotalMinutesAsleep
## 1 1503960366 4/12/2016 12:00:00 AM 1 327
## 2 1503960366 4/13/2016 12:00:00 AM 2 384
## 3 1503960366 4/15/2016 12:00:00 AM 1 412
## 4 1503960366 4/16/2016 12:00:00 AM 2 340
## 5 1503960366 4/17/2016 12:00:00 AM 1 700
## 6 1503960366 4/19/2016 12:00:00 AM 1 304
## TotalTimeInBed
## 1 346
## 2 407
## 3 442
## 4 367
## 5 712
## 6 320
Columns in the sleep_day data.
# List the column names of the sleep table.
names(sleep_day)
## [1] "Id" "SleepDay" "TotalSleepRecords"
## [4] "TotalMinutesAsleep" "TotalTimeInBed"
Count the distinct participant Ids in each data set
# Number of distinct participant Ids in the activity table.
daily_activity %>%
  pull(Id) %>%
  n_distinct()
## [1] 33
# Number of distinct participant Ids in the sleep table.
sleep_day %>%
  pull(Id) %>%
  n_distinct()
## [1] 24
Compare the number of observations (rows) in each data set
# Row count of the activity table.
dim(daily_activity)[1L]
## [1] 940
# Row count of the sleep table.
dim(sleep_day)[1L]
## [1] 413
Summary statistics for the daily activity data frame:
# Five-number summary plus mean for steps, distance and sedentary minutes.
summary(
  select(daily_activity, TotalSteps, TotalDistance, SedentaryMinutes)
)
## TotalSteps TotalDistance SedentaryMinutes
## Min. : 0 Min. : 0.000 Min. : 0.0
## 1st Qu.: 3790 1st Qu.: 2.620 1st Qu.: 729.8
## Median : 7406 Median : 5.245 Median :1057.5
## Mean : 7638 Mean : 5.490 Mean : 991.2
## 3rd Qu.:10727 3rd Qu.: 7.713 3rd Qu.:1229.5
## Max. :36019 Max. :28.030 Max. :1440.0
Summary statistics for the sleep data frame:
# Five-number summary plus mean for sleep records, minutes asleep and time in bed.
summary(
  select(sleep_day, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed)
)
## TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## Min. :1.000 Min. : 58.0 Min. : 61.0
## 1st Qu.:1.000 1st Qu.:361.0 1st Qu.:403.0
## Median :1.000 Median :433.0 Median :463.0
## Mean :1.119 Mean :419.5 Mean :458.6
## 3rd Qu.:1.000 3rd Qu.:490.0 3rd Qu.:526.0
## Max. :3.000 Max. :796.0 Max. :961.0
# Scatter plot of total steps against sedentary minutes (activity table).
daily_activity %>%
  ggplot(aes(x = TotalSteps, y = SedentaryMinutes)) +
  geom_point()
# Scatter plot of minutes asleep against time in bed (sleep table).
sleep_day %>%
  ggplot(aes(x = TotalMinutesAsleep, y = TotalTimeInBed)) +
  geom_point()
# Join sleep and activity records for the same participant on the same day.
# Merging on "Id" alone pairs every sleep row with every activity row of that
# participant, which multiplies rows and duplicates points in later plots.
# SleepDay carries a trailing time of day ("4/12/2016 12:00:00 AM"); as.Date()
# with this format parses the leading date and ignores the remainder.
# All original columns are kept; a shared "Date" key column is added.
combined_data <- merge(
  transform(sleep_day, Date = as.Date(SleepDay, format = "%m/%d/%Y")),
  transform(daily_activity, Date = as.Date(ActivityDate, format = "%m/%d/%Y")),
  by = c("Id", "Date")
)
Number of participants in the combined data set.
# Number of distinct participant Ids that survive the merge.
combined_data %>%
  pull(Id) %>%
  n_distinct()
## [1] 24
# List the column names of the merged table.
names(combined_data)
## [1] "Id" "SleepDay"
## [3] "TotalSleepRecords" "TotalMinutesAsleep"
## [5] "TotalTimeInBed" "ActivityDate"
## [7] "TotalSteps" "TotalDistance"
## [9] "TrackerDistance" "LoggedActivitiesDistance"
## [11] "VeryActiveDistance" "ModeratelyActiveDistance"
## [13] "LightActiveDistance" "SedentaryActiveDistance"
## [15] "VeryActiveMinutes" "FairlyActiveMinutes"
## [17] "LightlyActiveMinutes" "SedentaryMinutes"
## [19] "Calories"
# Compact overview of the merged table: dimensions, column types, leading values.
combined_data %>%
  glimpse()
## Rows: 12,441
## Columns: 19
## $ Id <dbl> 1503960366, 1503960366, 1503960366, 150396036…
## $ SleepDay <chr> "4/12/2016 12:00:00 AM", "4/12/2016 12:00:00 …
## $ TotalSleepRecords <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ TotalMinutesAsleep <int> 327, 327, 327, 327, 327, 327, 327, 327, 327, …
## $ TotalTimeInBed <int> 346, 346, 346, 346, 346, 346, 346, 346, 346, …
## $ ActivityDate <chr> "5/7/2016", "5/6/2016", "5/1/2016", "4/30/201…
## $ TotalSteps <int> 11992, 12159, 10602, 14673, 13162, 10735, 153…
## $ TotalDistance <dbl> 7.71, 8.03, 6.81, 9.25, 8.50, 6.97, 9.80, 8.9…
## $ TrackerDistance <dbl> 7.71, 8.03, 6.81, 9.25, 8.50, 6.97, 9.80, 8.9…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance <dbl> 2.46, 1.97, 2.29, 3.56, 1.88, 1.57, 5.29, 2.9…
## $ ModeratelyActiveDistance <dbl> 2.12, 0.25, 1.60, 1.42, 0.55, 0.69, 0.57, 1.0…
## $ LightActiveDistance <dbl> 3.13, 5.81, 2.92, 4.27, 6.06, 4.71, 3.94, 4.8…
## $ SedentaryActiveDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveMinutes <int> 37, 24, 33, 52, 25, 21, 73, 45, 48, 16, 31, 7…
## $ FairlyActiveMinutes <int> 46, 6, 35, 34, 13, 19, 14, 24, 28, 12, 23, 11…
## $ LightlyActiveMinutes <int> 175, 289, 246, 217, 328, 217, 216, 250, 189, …
## $ SedentaryMinutes <int> 833, 754, 730, 712, 728, 776, 814, 857, 782, …
## $ Calories <int> 1821, 1896, 1820, 1947, 1985, 1797, 2013, 195…
# Calories against minutes spent at each activity intensity, one scatter plot
# per intensity level. Every plot carries a title in the same pattern so the
# three figures can be told apart when viewed side by side.
ggplot(
  combined_data,
  aes(x = VeryActiveMinutes, y = Calories, color = Calories)
) +
  geom_point() +
  ggtitle("Calories by very Active Minutes")

ggplot(
  combined_data,
  aes(x = FairlyActiveMinutes, y = Calories, color = Calories)
) +
  geom_point() +
  ggtitle("Calories by Fairly Active Minutes")

ggplot(
  combined_data,
  aes(x = LightlyActiveMinutes, y = Calories, color = Calories)
) +
  geom_point() +
  ggtitle("Calories by Lightly Active Minutes")
Calorie Findings: Based on the above visualizations, the number of minutes spent at each activity intensity appears to be correlated with calories burned. This could be a valuable message for the Bellabeat marketing team.