Load all packages necessary to complete the analysis. Either set the working directory or make sure the current working directory is correct.
library(tidyverse) #data manipulation, exploration, and visualization
library(lubridate) #works with dates and times
library(ggplot2) #visualize data
library(dplyr) #good for data manipulation
library(tidyr) #data cleaning
library(reshape2)
###Set working directory
getwd() #displays your working directory
setwd("C:/Users/aligr/OneDrive/Documents/R/Bellabeat")
Read data into R from the working directory. Data is in the form of 18 csv files. Each data frame represents the output of data, such as sleep monitoring, physical activity, and number of calories burned, generated from 33 participants using their Fitbit smart device. The time frame is April 12, 2016 - May 12, 2016. The data frames include daily and hourly output, as well as data measured by the minute or second.
Daily_Activity <- read_csv("dailyActivity_merged.csv")
Daily_Calories <- read_csv("dailyCalories_merged.csv")
Daily_Intensities <- read_csv("dailyIntensities_merged.csv")
Daily_Steps <- read_csv("dailySteps_merged.csv")
Sec_Heartrate <- read_csv("heartrate_seconds_merged.csv")
Hourly_Calories <- read_csv("hourlyCalories_merged.csv")
Hourly_Intensities <- read_csv("hourlyIntensities_merged.csv")
Hourly_Steps <- read_csv("hourlySteps_merged.csv")
Min_Calories_Narrow <- read_csv("minuteCaloriesNarrow_merged.csv")
Min_Calories_Wide <- read_csv("minuteCaloriesWide_merged.csv")
Min_Intensities_Wide <- read_csv("minuteIntensitiesWide_merged.csv")
Min_Intensities_Narrow <- read_csv("minuteIntensitiesNarrow_merged.csv")
Min_METs <- read_csv("minuteMETsNarrow_merged.csv")
Min_Sleep <- read_csv("minuteSleep_merged.csv")
Min_Steps_Narrow <- read_csv("minuteStepsNarrow_merged.csv")
Min_Steps_Wide <- read_csv("minuteStepsWide_merged.csv")
Daily_Sleep <- read_csv("sleepDay_merged.csv")
WeightLog <- read_csv("weightLogInfo_merged.csv")
List <- list(Daily_Activity,Daily_Calories,Daily_Intensities,Daily_Sleep,Daily_Steps,Hourly_Calories,Hourly_Intensities,Hourly_Steps,Min_Calories_Narrow,Min_Calories_Wide,Min_Intensities_Narrow,Min_Intensities_Wide,Min_METs,Min_Sleep,Min_Steps_Narrow,Min_Steps_Wide,Sec_Heartrate,WeightLog)
List_Colnames<- lapply(List,names)
Common_Cols <- Reduce(intersect,List_Colnames)
Common_Cols
List_Colnames
A column shared between all the data sets is Id. There is also a date or date/time column in all data sets but it is named differently across the data sets.
By viewing the data sets I see that the Daily_Activity table includes the same data as the Daily_Calories, Daily_Intensities, and Daily_Steps tables. Those tables are therefore redundant and not needed.
Daily_Activity Table
str(Daily_Activity)
Daily_Activity$ActivityDate <- as.Date(Daily_Activity$ActivityDate, format="%m/%d/%Y")
Daily_Activity$DayofWeek <- weekdays(as.Date(Daily_Activity$ActivityDate))
Daily_Activity$Week <- ifelse(Daily_Activity$ActivityDate >= as.Date("2016-04-12") & Daily_Activity$ActivityDate <= as.Date("2016-04-18"),"Week 1",
ifelse(Daily_Activity$ActivityDate >= as.Date("2016-04-19") & Daily_Activity$ActivityDate <= as.Date("2016-04-25"), "Week 2",
ifelse(Daily_Activity$ActivityDate >= as.Date("2016-04-26") & Daily_Activity$ActivityDate <= as.Date("2016-05-02"), "Week 3",
ifelse(Daily_Activity$ActivityDate >= as.Date("2016-05-03") & Daily_Activity$ActivityDate <= as.Date("2016-05-09"), "Week 4",
ifelse(Daily_Activity$ActivityDate >= as.Date("2016-05-10") & Daily_Activity$ActivityDate <= as.Date("2016-05-12"), "Week 5",0)))))
lapply(Daily_Activity,class)
Daily_Activity<-Daily_Activity %>%
distinct()
The date is a character data type rather than date. This was changed and new columns were added, such as the Week column. The time frame the data covers is not a full 5 weeks so the 5th week is only a few days. This is taken into account while analyzing the data.
Daily_Sleep Table
str(Daily_Sleep)
Daily_Sleep <- Daily_Sleep %>%
rename(ActivityDate = SleepDay)
Daily_Sleep$ActivityDate <- as.Date(Daily_Sleep$ActivityDate, format="%m/%d/%Y")
Daily_Sleep$DayofWeek <- weekdays(as.Date(Daily_Sleep$ActivityDate))
Daily_Sleep$HoursinBed <- Daily_Sleep$TotalTimeInBed/60
Daily_Sleep$Hours_Asleep <- Daily_Sleep$TotalMinutesAsleep/60
lapply(Daily_Sleep,class)
Daily_Sleep<-Daily_Sleep %>%
distinct()
Hourly_Calories table
str(Hourly_Calories)
Hourly_Calories <- Hourly_Calories %>%
rename(ActivityDateTime = ActivityHour)
Hourly_Calories$ActivityDateTime <- parse_date_time(Hourly_Calories$ActivityDateTime,"%m/%d/%Y %I:%M:%S %p")
Hourly_Calories$Hour <- hour(Hourly_Calories$ActivityDateTime)
Hourly_Calories$Date <- as.Date(Hourly_Calories$ActivityDateTime)
Hourly_Calories$DayofWeek <- weekdays(as.Date(Hourly_Calories$ActivityDateTime))
lapply(Hourly_Calories,class)
summary(Hourly_Calories)
Hourly_Calories<-Hourly_Calories %>%
distinct()
Hourly_Intensities table
str(Hourly_Intensities)
Hourly_Intensities <- Hourly_Intensities %>%
rename(ActivityDateTime = ActivityHour)
Hourly_Intensities$ActivityDateTime <- parse_date_time(Hourly_Intensities$ActivityDateTime,"%m/%d/%Y %I:%M:%S %p")
Hourly_Intensities$Hour <- hour(Hourly_Intensities$ActivityDateTime)
Hourly_Intensities$Date <- as.Date(Hourly_Intensities$ActivityDateTime)
Hourly_Intensities$DayofWeek <- weekdays(as.Date(Hourly_Intensities$ActivityDateTime))
lapply(Hourly_Intensities,class)
Hourly_Intensities<-Hourly_Intensities %>%
distinct()
Hourly_Steps table
str(Hourly_Steps)
Hourly_Steps <- Hourly_Steps %>%
rename(ActivityDateTime = ActivityHour)
Hourly_Steps$ActivityDateTime <- parse_date_time(Hourly_Steps$ActivityDateTime, "%m/%d/%Y %I:%M:%S %p")
Hourly_Steps$Hour <- hour(Hourly_Steps$ActivityDateTime)
Hourly_Steps$Date <- as.Date(Hourly_Steps$ActivityDateTime)
Hourly_Steps$DayofWeek <- weekdays(as.Date(Hourly_Steps$ActivityDateTime))
lapply(Hourly_Steps,class)
Hourly_Steps<-Hourly_Steps %>%
distinct()
str(WeightLog)
WeightLog <- WeightLog %>%
rename(WeightDateTime = Date)
WeightLog$WeightDateTime <- parse_date_time(WeightLog$WeightDateTime, "%m/%d/%Y %I:%M:%S %p")
WeightLog$Minute <- minute(WeightLog$WeightDateTime)
WeightLog$Hour <- hour(WeightLog$WeightDateTime)
WeightLog$Date <- as.Date(WeightLog$WeightDateTime)
WeightLog$DayofWeek <- weekdays(as.Date(WeightLog$WeightDateTime))
lapply(WeightLog,class)
str(Min_Sleep)
Min_Sleep <- Min_Sleep %>%
rename(SleepDateTime = date)
Min_Sleep$SleepDateTime <- parse_date_time(Min_Sleep$SleepDateTime, "%m/%d/%Y %I:%M:%S %p")
Min_Sleep$Minute <- minute(Min_Sleep$SleepDateTime)
Min_Sleep$Hour <- hour(Min_Sleep$SleepDateTime)
Min_Sleep$Date <- as.Date(Min_Sleep$SleepDateTime)
Min_Sleep$DayofWeek <- weekdays(as.Date(Min_Sleep$SleepDateTime))
Min_Sleep$Time <- format(Min_Sleep$SleepDateTime,"%H:%M:%S")
lapply(Min_Sleep, class)
Min_Sleep<-Min_Sleep %>%
distinct()
summary(Daily_Activity)
lapply(Daily_Activity,class)
n_distinct(Daily_Activity$Id)
## [1] 33
Daily Total Steps
summary(Daily_Activity$TotalSteps)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 3790 7406 7638 10727 36019
Daily_Activity %>%
ggplot(aes(TotalSteps))+
geom_histogram(color = "#000000", fill = "#0099F8")+
geom_vline(aes(xintercept = mean(TotalSteps)), color = "#000000", linewidth=.75)+
geom_vline(aes(xintercept = median(TotalSteps)), color = "#983270", linewidth=.75)+
labs(
title="Daily Total Steps of All IDs",
x="Total Steps Taken in One Day",
y="Number of Instances Across All IDs"
)+
annotate("text", x=8300, y=95, label="Mean", angle=270)+
annotate("text", x=6700, y= 95, label="Median", angle=90, color="#983270")
The mean and median are close together. There are a few outliers, but nothing skewing the data significantly.
Daily Minutes of Very Active Activity
summary(Daily_Activity$VeryActiveMinutes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 4.00 21.16 32.00 210.00
Daily_Activity %>%
ggplot(aes(x=VeryActiveMinutes))+
geom_histogram(color = "#000000", fill = "#0099F8")+
geom_vline(aes(xintercept = mean(VeryActiveMinutes)), color = "#000000", linewidth=.75)+
geom_vline(aes(xintercept = median(VeryActiveMinutes)), color = "#983270", linewidth=.75)+
labs(
title="Daily Minutes of 'Very Active' Activity of All IDs",
x="Total Number of Minutes Per Day",
y="Number of Instances Across All IDs"
)+
annotate("text", x=25, y=120, label="Mean", angle=270)+
annotate("text", x=9, y= 120, label="Median", angle=270, color="#983270")
Using the mean would make it appear that people are getting a significantly greater amount of Very Active activity per day. However, the median is 4 minutes. The histogram illustrates this significant difference.
Daily Minutes of Fairly Active Activity
summary(Daily_Activity$FairlyActiveMinutes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 6.00 13.56 19.00 143.00
Daily_Activity %>%
ggplot(aes(x=FairlyActiveMinutes))+
geom_histogram(color = "#000000", fill = "#0099F8")+
geom_vline(aes(xintercept = mean(FairlyActiveMinutes)), color = "#000000", linewidth=.75)+
geom_vline(aes(xintercept = median(FairlyActiveMinutes)), color = "#983270", linewidth=.75)+
labs(
title="Daily Minutes of 'Fairly Active' Activity of All IDs",
x="Total Number of Minutes Per Day",
y="Number of Instances Across All IDs"
)+
annotate("text", x=16, y=200, label="Mean", angle=270)+
annotate("text", x=9, y= 200, label="Median", angle=270, color="#983270")
We see a similar situation to the Very Active minutes histogram.
Daily Minutes of Lightly Active Activity
summary(Daily_Activity$LightlyActiveMinutes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 127.0 199.0 192.8 264.0 518.0
Daily_Activity %>%
ggplot(aes(x=LightlyActiveMinutes))+
geom_histogram(color = "#000000", fill = "#0099F8")+
geom_vline(aes(xintercept = mean(LightlyActiveMinutes)), color = "#000000", linewidth=.75)+
geom_vline(aes(xintercept = median(LightlyActiveMinutes)), color = "#983270", linewidth=.75)+
labs(
title="Daily Minutes of 'Lightly Active' Activity of All IDs",
x="Total Number of Minutes Per Day",
y="Number of Instances Across All IDs"
)+
annotate("text", x=180, y=80, label="Mean", angle=90)+
annotate("text", x=210, y= 80, label="Median", angle=270, color="#983270")
The mean and median are close together. The mean is slightly less than the median. This is likely due to the large number of instance where daily minutes are less than 10 followed by a more normal distribution.
Daily Minutes of Sedentary Activity
summary(Daily_Activity$SedentaryMinutes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 729.8 1057.5 991.2 1229.5 1440.0
Daily_Activity %>% ggplot(aes(x=SedentaryMinutes))+
geom_histogram(color = "#000000", fill = "#0099F8")+
geom_vline(aes(xintercept = mean(SedentaryMinutes)), color = "#000000", linewidth=.75)+
geom_vline(aes(xintercept = median(SedentaryMinutes)), color = "#983270", linewidth=.75)+
labs(
title="Daily Minutes of 'Sedentary' Activity of All IDs",
x="Total Number of Minutes Per Day",
y="Number of Instances Across All IDs"
)+
annotate("text", x=955, y=80, label="Mean", angle=90)+
annotate("text", x=1090, y= 80, label="Median", angle=270, color="#983270")
Daily Number of Calories Burned
summary(Daily_Activity$Calories)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1828 2134 2304 2793 4900
Daily_Activity %>% ggplot(aes(x=Calories))+
geom_histogram(color = "#000000", fill = "#0099F8")+
geom_vline(aes(xintercept = mean(Calories)), color = "#000000", linewidth=.75)+
geom_vline(aes(xintercept = median(Calories)), color = "#983270", linewidth=.75)+
labs(
title="Daily Number of Calories Burned of All IDs",
x="Total Calories Per Day",
y="Number of Instances Across All IDs"
)+
annotate("text", x=2405, y=25, label="Mean", angle=270)+
annotate("text", x=2010, y= 25, label="Median", angle=90, color="#983270")
The mean will be used for the following variables: TotalSteps, LightlyActiveMinutes. The median will be used for the following variables: VeryActiveMinutes, FairlyActiveMinutes, SedentaryMinutes, Calories.
Obtaining average/median values of Daily_Activity data set (grouped by ID)
Avg_Dly_Act_by_ID <- Daily_Activity %>%
group_by(Id) %>%
summarise(Avg_Steps = mean(TotalSteps),
Med_Cals = median(Calories),
Avg_Dist = mean(TotalDistance),
Med_VeryActiveMin = median(VeryActiveMinutes),
Med_FairlyActiveMin = median(FairlyActiveMinutes),
Avg_LightlyActiveMin = mean(LightlyActiveMinutes),
Med_SedentaryMin = median(SedentaryMinutes),
Med_VA_FA = median(VeryActiveMinutes+FairlyActiveMinutes))
Obtaining average/median values of Daily_Activity table by day of week (grouped by ID)
Avg_Dly_Act_by_ID_DayofWeek <- Daily_Activity %>%
group_by(Id, DayofWeek) %>%
summarise(Avg_Steps = mean(TotalSteps),
Med_Cals = median(Calories),
Avg_Dist = mean(TotalDistance),
Med_VeryActiveMin = median(VeryActiveMinutes),
Med_FairlyActiveMin = median(FairlyActiveMinutes),
Avg_LightlyActiveMin = mean(LightlyActiveMinutes),
Med_SedentaryMin = median(SedentaryMinutes),
Med_VA_FA = median(VeryActiveMinutes+FairlyActiveMinutes)) %>%
arrange(Id, DayofWeek)
Obtaining average/median values of Daily_Activity table by day of week (NOT grouped by ID)
Avg_Dly_Act_by_DayofWeek <- Daily_Activity %>%
group_by(DayofWeek) %>%
summarise(Avg_Steps = mean(TotalSteps),
Med_Cals = median(Calories),
Avg_Dist = mean(TotalDistance),
Med_VeryActiveMin = median(VeryActiveMinutes),
Med_FairlyActiveMin = median(FairlyActiveMinutes),
Avg_LightlyActiveMin = mean(LightlyActiveMinutes),
Med_SedentaryMin = median(SedentaryMinutes),
Med_VA_FA = median(VeryActiveMinutes+FairlyActiveMinutes)) %>%
arrange(DayofWeek)
Obtaining average/median values of Daily_Activity table by week number (NOT grouped by ID)
Avg_Dly_Act_by_WkNum <- Daily_Activity %>%
group_by(Week) %>%
summarise(Avg_Steps = mean(TotalSteps),
Med_Cals = median(Calories),
Avg_Dist = mean(TotalDistance),
Med_VeryActiveMin = median(VeryActiveMinutes),
Med_FairlyActiveMin = median(FairlyActiveMinutes),
Avg_LightlyActiveMin = mean(LightlyActiveMinutes),
Med_SedentaryMin = median(SedentaryMinutes),
Med_VA_FA = median(VeryActiveMinutes+FairlyActiveMinutes)) %>%
arrange(Week)
Obtaining average/median values of Daily_Activity table by week number (grouped by ID)
Avg_Dly_Act_by_ID_WkNum <- Daily_Activity %>%
group_by(Id, Week) %>%
summarise(Avg_Steps = mean(TotalSteps),
Med_Cals = median(Calories),
Avg_Dist = mean(TotalDistance),
Med_VeryActiveMin = median(VeryActiveMinutes),
Med_FairlyActiveMin = median(FairlyActiveMinutes),
Avg_LightlyActiveMin = mean(LightlyActiveMinutes),
Med_SedentaryMin = median(SedentaryMinutes),
Med_VA_FA = median(VeryActiveMinutes+FairlyActiveMinutes)) %>%
arrange(Id, Week)
The correlation between each activity type and the number of calories burned will be determined. A stronger positive correlation would mean the more minutes a user has of an activity type, the more calories they burn. From this, we are making the assumption that the more strongly correlated an activity type is to calories, the more vigorous the activity is.
Daily_Activity %>%
ggplot(aes(x=VeryActiveMinutes, y=Calories))+
geom_point()+
geom_smooth()+
theme_minimal()
cor(Daily_Activity$VeryActiveMinutes, Daily_Activity$Calories)
## [1] 0.6158383
Daily_Activity %>%
ggplot(aes(x=FairlyActiveMinutes, y=Calories))+
geom_point()+
geom_smooth()+
theme_minimal()
cor(Daily_Activity$FairlyActiveMinutes, Daily_Activity$Calories)
## [1] 0.2976235
Daily_Activity %>%
ggplot(aes(x=LightlyActiveMinutes, y=Calories))+
geom_point()+
geom_smooth()+
theme_minimal()
cor(Daily_Activity$LightlyActiveMinutes, Daily_Activity$Calories)
## [1] 0.2867175
Daily_Activity %>%
ggplot(aes(x=SedentaryMinutes, y=Calories))+
geom_point()+
geom_smooth()+
theme_minimal()
cor(Daily_Activity$SedentaryMinutes, Daily_Activity$Calories)
## [1] -0.106973
cor(Daily_Activity$TotalSteps,Daily_Activity$Calories)
## [1] 0.5915681
cor(Daily_Activity$TotalDistance,Daily_Activity$Calories)
## [1] 0.6449619
cor(Daily_Activity$VeryActiveDistance,Daily_Activity$Calories)
## [1] 0.4919586
cor(Daily_Activity$ModeratelyActiveDistance,Daily_Activity$Calories)
## [1] 0.2167899
cor(Daily_Activity$LightActiveDistance,Daily_Activity$Calories)
## [1] 0.4669168
cor(Daily_Activity$SedentaryActiveDistance,Daily_Activity$Calories)
## [1] 0.04365187
Total Distance correlates the best to the number of calories burned from all the other variables. VeryActiveMinutes was the next bestn followed by TotalSteps. FairlyActive and Lightly Active have very similar correlations and Sedentary Minutes, as expected, has the least amount of correlation.
From this, VeryActiveMinutes will equal “intense” activity; VeryActive and FairlyActive will equal “moderate” activity (since LightlyActive is even less correlation than FairlyActive and given the title “lightly” we will not categorize this as moderate intensity. The VeryActive minutes (which are vigorous) will be added to the Fairly Active(moderate) to make up the “moderate” activity minutes)
ggplot(data=Avg_Dly_Act_by_ID_WkNum)+
geom_point(mapping=aes(x=Week,y=Med_VeryActiveMin))+
facet_wrap("Id")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
geom_hline(yintercept=10.71, linetype="dashed", color="blue")+
labs(title="Amount of Vigorous Activity Per Week for Each User",
x="Week Number",
y="Number of Minutes")
10.71 is 75 minutes/7 days. Either 75 minutes/week of vigorous-intensity aerobic activity or 150 minutes/week of moderate-intensity aerobic activity is recommended by the CDC (https://www.cdc.gov/physicalactivity/basics/adults/index.htm).
ggplot(data=Avg_Dly_Act_by_ID_WkNum)+
geom_point(mapping=aes(x=Week,y=Med_VA_FA))+
facet_wrap("Id")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
geom_hline(yintercept=21.42, linetype="dashed", color="blue")+
labs(title="Amount of Moderate/Vigorous Activity Per Week for Each User",
x="Week Number",
y="Number of Minutes")
21.42 is 150 minutes/7 days. Either 150 minutes/week of moderate-intensity aerobic activity or 90 minutes/week of vigorous-intensity aerobic activity is recommended by the CDC (https://www.cdc.gov/physicalactivity/basics/adults/index.htm).
Plotting average VeryActiveMin to see if people are exercising more on certain days
ggplot(data=Avg_Dly_Act_by_ID_DayofWeek)+
geom_point(mapping=aes(x=DayofWeek,y=Med_VeryActiveMin))+
facet_wrap("Id")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
geom_hline(yintercept=10, linetype="dashed", color="blue")+
labs(title="Number of Minutes of Very Active Activity by Day of Week",
x="Date",
y="Number of Steps")
Many users are very consistent, while others show more activity certain days of the week.
Daily_Act_Wkly<-Daily_Activity %>%
group_by(Id, Week) %>%
summarise(TotalFairlyActMin = sum(FairlyActiveMinutes),
TotalVeryActMin = sum(VeryActiveMinutes),
TotalActiveMin = sum(VeryActiveMinutes)+sum(FairlyActiveMinutes)) %>%
arrange(Id, Week)
Daily_Act_Wkly_Long <- melt(Daily_Act_Wkly,
id.vars = c("Id","Week"))
Daily_Act_Wkly_Long <- Daily_Act_Wkly_Long %>%
rename(Minutes = value,
ActivityType = variable)
ggplot(data=Daily_Act_Wkly_Long)+
geom_line(mapping=aes(x=Week, y=Minutes, group=ActivityType, color=ActivityType))+
facet_wrap("Id") +
theme(axis.text.x = element_text(angle=90,hjust=1))+
geom_hline(yintercept=150, linetype="dashed", color="blue")+
labs(
title="Minutes of Activity Per Week For Each User"
)
Daily_Act_Wkly_Long$Over150 <- ifelse(Daily_Act_Wkly_Long$ActivityType=="TotalActiveMin" & Daily_Act_Wkly_Long$Minutes>150, "Yes","No")
Daily_Act_Wkly_Long$Over75 <- ifelse(Daily_Act_Wkly_Long$ActivityType=="TotalVeryActMin" & Daily_Act_Wkly_Long$Minutes>75, "Yes","No")
WeeklyMinGoal <- Daily_Act_Wkly_Long %>%
group_by(Id) %>%
summarise(countover150 = sum(Over150=="Yes"),
countover75 = sum(Over75=="Yes"))
WeeklyMinGoal_Long <- melt(WeeklyMinGoal,
id.vars = c("Id"))
WeeklyMinGoal_Long <- WeeklyMinGoal_Long %>%
rename(Num_Weeks_Achieved = value,
Goal = variable)
These datasets help us to see how many weeks each user achieved 75 minutes of vigorous weekly activity or 150 minutes of moderate weekly activity
Plotting the number of instances where user achieved over 150 minutes of weekly activity
WeeklyGoal <- Daily_Act_Wkly_Long %>%
group_by(Id) %>%
summarise(NumWeeksOver150 = sum(Over150=="Yes"),
NumWeeksOver75 = sum(Over75=="Yes")) %>%
arrange(Id)
WeeklyGoal$Id<-as.character(WeeklyGoal$Id)
WeeklyGoal %>%
ggplot(aes(x=reorder(Id,-NumWeeksOver150), y=NumWeeksOver150))+
geom_bar(stat="identity", color="blue", fill="blue")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
labs(title="Number of Weeks Each User Achieved Over 150 Active Minutes",
x="User Id",
y="Number of Weeks")
WeeklyGoal %>%
ggplot(aes(x=reorder(Id,-NumWeeksOver75), y=NumWeeksOver75))+
geom_bar(stat="identity", color="blue", fill="blue")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
labs(title="Number of Weeks Each User Achieved Over 75 Minutes of Vigorous Activity",
x="User Id",
y="Number of Weeks")
WeeklyGoal %>%
ggplot(aes(x=NumWeeksOver150))+
geom_bar(color="blue",fill="blue")+
scale_y_continuous(breaks=seq(0,10,by=1))+
scale_x_continuous(breaks=seq(0,5,by=1))+
labs(title="Count of Users by Number of Weeks Achieved 150 Minutes of Activity",
x="Number of Weeks",
y="Number of Users")
WeeklyGoal %>%
ggplot(aes(x=NumWeeksOver75))+
geom_bar(color="blue",fill="blue")+
scale_y_continuous(breaks=seq(0,10,by=1))+
scale_x_continuous(breaks=seq(0,5,by=1))+
labs(title="Count of Users by Number of Weeks Achieved 75 Minutes of Activity",
x="Number of Weeks",
y="Number of Users")
Week 5 isn’t a full week so it can’t be counted. Overall, 12 of the 33 people met the guideline of either 150 minutes of moderate activity of 75 minutes of vigorous activity at least 4 of the 5 weeks. Of those 12, 11 of them had over 75 minutes of VeryActiveMinutes in at least 4 of the 5 weeks.
ggplot(data=Avg_Dly_Act_by_ID_WkNum)+
geom_point(mapping=aes(x=Week,y=Avg_Steps))+
facet_wrap("Id")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
geom_hline(yintercept=10000, linetype="dashed", color="blue")+
labs(title="Number of Steps Per Week for Each User",
x="Week Number",
y="Number of Steps")
We see some users did not have any activity or steps taken some weeks. Could be due to not wearing the fitbit. Some users were very consistent while others were not.
How long are people wearing their fitbits while they are awake?
Days_Used<- Daily_Activity %>%
group_by(Id,ActivityDate) %>%
reframe(
MinutesUse = sum(VeryActiveMinutes+FairlyActiveMinutes+LightlyActiveMinutes+SedentaryMinutes),
HoursUse=MinutesUse/60) %>%
arrange(MinutesUse)
How long are people wearing their fitbits while they are asleep? Create new dataframe to merge with Days_Used
Min_Sleep2<-Min_Sleep %>%
group_by(Id,Date) %>%
summarise(Minutes_Asleep=n()) %>%
mutate(Hours_Asleep=Minutes_Asleep/60) %>%
rename("ActivityDate"="Date")
Merging the Days_Used data set with Daily_Sleep to get the total time per day the users wore their fitbits. Remove any NA values from the Hours_Asleep column. Create new column adding together sleep time and time awake.
Days_Used_v2<-merge(x=Days_Used,y=Min_Sleep2,by=c("Id","ActivityDate"), all=TRUE)
Days_Used_v2$Hours_Asleep[is.na(Days_Used_v2$Hours_Asleep)] <- 0
Days_Used_v2$TotalHoursUse=Days_Used_v2$HoursUse+Days_Used_v2$Hours_Asleep
The fitbit is not accurately recording data. There are multiple instances where the user wore the fitbit for the full 24 hours but no sleep was registered. To explore this further, the number of instances where active use is 24 hours is counted.
length(which(Days_Used_v2$HoursUse==24))
## [1] 478
length(which(Days_Used_v2$TotalHoursUse>24))
## [1] 0
People using smart devices want accurate data. If Bellabeat’s products are able to record more accurate data than what is shown in this dataset, may want to advertise their focus on accuracy.
Days where people did not use their fitbit - provide motivation to wear the device (tied in with point about accuracy of the data) Create a date table
start_date <- as.Date("2016-04-12")
end_date <- as.Date("2016-05-12")
ActivityDate <- seq(start_date, end_date,"days")
Date_Range<-(ActivityDate)
Id<-(unique(Daily_Activity$Id))
Ids<-data_frame(Id)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Date_Table <- data_frame(
Id=rep(Ids$Id, each=length(Date_Range)),
ActivityDate=rep(Date_Range, length(Ids$Id)))
Days_Used_v3 <- merge(x=Days_Used_v2,y=Date_Table,by=c("Id","ActivityDate"), all=TRUE)
Days_Not_Used<-Days_Used_v3 %>%
group_by(Id) %>%
summarise(
DaysNotUsed=sum(is.na(MinutesUse))) %>%
arrange(desc(DaysNotUsed))
length(which(Days_Not_Used$DaysNotUsed>0))
## [1] 19
12 out of 33 people did not use their Fitbits everyday. Of those 12, 4 did not use them for greater than 10 of the 31 days and 1 did not use them for 27 of the 31 days. This is a significant number of people that did not use their Fitbits consistently.
Did people stop using their Fitbits as time went on during the time period covered by the data?
IdsActiveEachDay<-Days_Used_v2 %>%
group_by(ActivityDate) %>%
summarise(NumIdsActive=sum(!is.na(MinutesUse))) %>%
arrange(desc(NumIdsActive))
Number of people using their Fitbit dropping as time went on
Visually show the days people used their Fitbit
Days_Used_v4<-Days_Used_v3[!(is.na(Days_Used_v3$MinutesUse)), ]
Days_Used_v4$Id <- as.character(Days_Used_v4$Id)
Days_Used_Chart<-Days_Used_v4[!(is.na(Days_Used_v4$MinutesUse)),]
ggplot(Days_Used_Chart, aes(x=ActivityDate, y=fct_infreq(Id)))+
geom_point(shape=22, fill="black", size=3)+
theme(legend.position = "none")+
labs(title="Number of Days Fitbit Used",
x="Date",
y="User ID")
This chart really shows that a lot of people did not continue to use their fitbit. They stopped part of the way through.
For the 7 participants that do not have the full 5 weeks plotted in the previous analysis, that is due to them not wearing their fitbits.
Assessing consistency of steps with standard deviation
ConsistencyofSteps <- Daily_Activity %>%
group_by(Id) %>%
summarise(SD = sd(TotalSteps),
AVG_Steps = mean(TotalSteps)) %>%
arrange(SD)
Plotting the Number of Steps being taken each week
ggplot(Avg_Dly_Act_by_ID_WkNum, aes(x=Avg_Steps, fill=Week))+
geom_histogram(binwidth=1000, color="black") + facet_wrap("Week")+
labs(title="Average Number of Steps Taken Daily Each Week",
x="Average Number of Steps",
y="Count of User ID")
There is quite a bit of variation in people’s use of the fitbit week by week. Week 3 appears to be the week users were taking the most steps. Can bellabeat provide a better way to motivate people to use their devices?
Number of steps taken each day for each user on a scatterplot
ggplot(data=Daily_Activity)+
geom_point(mapping=aes(x=ActivityDate,y=TotalSteps))+
facet_wrap("Id")+
geom_hline(yintercept=10000, linetype="dashed", color="blue")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
labs(title="Number of Steps Taken Each Day for Each User",
x="Date",
y="Number of Steps")
Plotting average steps by day of week for each user. Seeing if there are patterns/trends for when individual users are taking more or less steps .
ggplot(data=Avg_Dly_Act_by_ID_DayofWeek)+
geom_point(mapping=aes(x=DayofWeek,y=Avg_Steps))+
facet_wrap("Id")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
geom_hline(yintercept=10000, linetype="dashed", color="blue")+
labs(title="Average Number of Steps Taken by Each User by Day of Week",
x="Date",
y="Number of Steps")
We see some users are not using their Fitbits everyday. Some users are more consistent than others across days of the week. Some users are more active on certain days of the week. We see that the scatter plots showing number of minutes of veryactive activity are very similar to scatterplots of number of steps. People probably get their most steps in when are most active.
Is there a certain time of day people are taking more steps?
HourlySteps2 <- Hourly_Steps %>%
group_by(Hour,Id) %>%
summarise(AVGSteps = mean(StepTotal)) %>%
arrange(Id,AVGSteps)
HourlySteps2 %>%
ggplot(aes(x=Hour,y=AVGSteps))+
geom_point()+
geom_smooth()+
theme_light()+
scale_x_continuous(breaks=seq(0,24,by=1))+
labs(title="Average Number of Steps by Hour",
x="Hour",
y="Average Number of Steps")
11am-5pm people are taking their greatest number of steps on average.
HourlySteps3<-HourlySteps2 %>%
group_by(Id) %>%
summarise(max = max(AVGSteps)) %>%
arrange(desc(max))
MergeHourlySteps<-inner_join(HourlySteps2,HourlySteps3, by=c('Id'='Id','AVGSteps'='max'))
ggplot(data=MergeHourlySteps)+
geom_bar(mapping=aes(x=Hour))+
theme_light()+
scale_x_continuous(breaks=seq(0,24,by=1))+
labs(title="Hour Users are Taking their Max Number of Steps",
x="Hour",
y="Max Number of Steps")
Users’ average steps may be higher in the late morning/afternoon but their max step counts are highest in the morning.
Certain time of day people burning more calories?
Hourly_Calories2 <- Hourly_Calories %>%
group_by(Hour,Id) %>%
summarise(AvgCal = mean(Calories)) %>%
arrange(Id,AvgCal)
Hourly_Calories2 %>%
ggplot(aes(x=Hour,y=AvgCal))+
geom_point()+
geom_smooth()+
theme_light()+
scale_x_continuous(breaks=seq(0,24,by=1))+
labs(title="Average Number of Calories Burned Each Hour",
x="Hour",
y="Average Number of Calories Burned")
Closely resembles the Hourly_Steps graph seen previously.
Certain time of day people most active?
Hourly_Intensities2 <- Hourly_Intensities %>%
group_by(Hour,Id) %>%
summarise(AvgIntensity = mean(TotalIntensity)) %>%
arrange(Id,AvgIntensity)
Hourly_Intensities2 %>%
ggplot(aes(x=Hour,y=AvgIntensity))+
geom_point()+
geom_smooth()+
theme_light()+
scale_x_continuous(breaks=seq(0,24,by=1))+
labs(title="Average Intensity Each Hour",
x="Hour",
y="Average Intensity")
Closely resembles the Hourly_Steps graph seen previously. Not much insight here can just use the hourly steps graph.
Are people using the Fitbit to track their sleep?
n_distinct(Daily_Sleep$Id)
## [1] 24
NumDaysUsedforSleep <- Daily_Sleep %>%
group_by(Id) %>%
summarise(
CountDays = n_distinct(ActivityDate)) %>%
arrange(CountDays)
DaysUsedforSleep<-Days_Used_Chart %>%
subset(select=c(Id, ActivityDate, Hours_Asleep)) %>%
na.omit()
DaysUsedforSleep["Hours_Asleep"][DaysUsedforSleep["Hours_Asleep"] == 0] <- NA
DaysUsedforSleep<-na.omit(DaysUsedforSleep)
ggplot(DaysUsedforSleep, aes(x=ActivityDate, y=fct_infreq(Id)))+
geom_point(shape=22, fill="black", size=4.5)+
theme(legend.position = "none")+
labs(title="Number of Days Sleep Tracked",
x="Date",
y="User ID")
Only 24 of the 33 people used the fitbit to track their sleep. Of those people only 3 of them used it to track their sleep the full 31 days
ggplot(NumDaysUsedforSleep, aes(x=CountDays))+
geom_histogram(binwidth=1, color="blue", fill="white")+
labs(title="Number of Days Users Tracked their Sleep",
x="Number of Days Tracked",
y="Number of Users")
27.27% of people did not use their fitibit to track their sleep. 27.27% of people used it to track sleep for 1-10 day. 9.09% of people used it to track sleep for 11-20 days. 36.36% of people used it to track sleep for 21-31 days. Why aren’t people using it to track their sleep? Is it uncomfortable to wear at night? Does the data make sense/is it valuable to them?
NumGoodSleepDays<-Daily_Sleep %>%
group_by(Id) %>%
summarise(GoodSleepDays = sum(Hours_Asleep>=7))
UseForSleep<-merge(x=NumGoodSleepDays, y=NumDaysUsedforSleep, by=c("Id"))
UseForSleep$BadSleepDays<-UseForSleep$CountDays-UseForSleep$GoodSleepDays
UseForSleep$PercentBadDays<-round(UseForSleep$BadSleepDays/UseForSleep$CountDays,2)
UseForSleep$PercentGoodDays<-round(UseForSleep$GoodSleepDays/UseForSleep$CountDays,2)
Of those that used it to track sleep, 11 of 24 participants had 50% of more bad sleep nights (less than 7 hours of sleep per night).
Creating a visual representation of this:
UseForSleep_Long <- melt(UseForSleep,
id.vars = c("Id"))
UseForSleep_Long<-UseForSleep_Long[!(UseForSleep_Long$variable=="CountDays"| UseForSleep_Long$variable=="PercentGoodDays"|UseForSleep_Long$variable=="PercentBadDays" ),]
UseForSleep_Long$fake<-1
ggplot(data=UseForSleep_Long,aes(fill=variable, y=value, x=fake))+
geom_bar(position="stack", stat="identity") +
facet_wrap("Id") +
scale_fill_manual(values = c("GoodSleepDays"="#90EE90",
"BadSleepDays" = "#F08080"))+
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
labs(title="Good Sleep Days vs. Bad Sleep Days",
subtitle="Good Sleep Day = 7 or more hours of sleep",
x="",
y="Number of Days")
Correlation between sleep and other variables?
Days_Used_v5<-merge(x=Days_Used_v4,y=Daily_Activity,by=c("Id","ActivityDate"), all=TRUE)
Days_Used_v5 %>%
ggplot(aes(x=Hours_Asleep,y=VeryActiveMinutes))+
geom_point()
Days_Used_v5.2<- Days_Used_v5[!(is.na(Days_Used_v5$Hours_Asleep) | is.na(Days_Used_v5$VeryActiveMinutes)), ]
cor(Days_Used_v5.2$Hours_Asleep, Days_Used_v5.2$VeryActiveMinutes)
## [1] 0.06796441
Days_Used_v5.3<- Days_Used_v5[!(is.na(Days_Used_v5$Hours_Asleep) | is.na(Days_Used_v5$FairlyActiveMinutes)), ]
cor(Days_Used_v5.3$Hours_Asleep, Days_Used_v5.3$FairlyActiveMinutes)
## [1] 0.1577412
Days_Used_v5.4<- Days_Used_v5[!(is.na(Days_Used_v5$Hours_Asleep) | is.na(Days_Used_v5$LightlyActiveMinutes)), ]
cor(Days_Used_v5.4$Hours_Asleep, Days_Used_v5.4$LightlyActiveMinutes)
## [1] 0.1761975
Days_Used_v5.5<- Days_Used_v5[!(is.na(Days_Used_v5$Hours_Asleep) | is.na(Days_Used_v5$SedentaryMinutes)), ]
cor(Days_Used_v5.5$Hours_Asleep, Days_Used_v5.5$SedentaryMinutes)
## [1] -0.848661
Days_Used_v5.6<- Days_Used_v5[!(is.na(Days_Used_v5$Hours_Asleep) | is.na(Days_Used_v5$TotalSteps)), ]
cor(Days_Used_v5.6$Hours_Asleep, Days_Used_v5.6$TotalSteps)
## [1] 0.1107692
SedentaryMinutes was the variable with the strongest correlation to Hours_Asleep.
Days_Used_v5 %>%
ggplot(aes(x=Hours_Asleep, y=SedentaryMinutes))+
geom_point()+
geom_smooth()+
theme_minimal()
However, the fitbit incorrectly is recording sleep as sedentaryminutes. Lets get rid of all values equal to or over 960 minutes (16 hours awake x 60 minutes). This assumes people are getting at least 8 hours of sleep/movement per day.
Days_Used_v6<-Days_Used_v5[!(Days_Used_v5$SedentaryMinutes>960),]
Days_Used_v6 %>%
ggplot(aes(x=Hours_Asleep, y=SedentaryMinutes))+
geom_point()+
geom_smooth()+
theme_minimal()
Days_Used_v6.2<- Days_Used_v6[!(is.na(Days_Used_v6$Hours_Asleep) | is.na(Days_Used_v6$SedentaryMinutes)), ]
cor(Days_Used_v6.2$Hours_Asleep, Days_Used_v6.2$SedentaryMinutes)
## [1] -0.302556
The correlation decreased but is still significant. Ultimately, since the data is inacurrate in this respect, we will disregard this for the analysis. The other variables correlation coefficients are not strong enought to suggest a relationship between hours of sleep and the number of active minutes/steps participants are taking in a day.
Visual representation of the UseForSleep table.
Days_Used_v5$SleepQuality<-ifelse(Days_Used_v5$Hours_Asleep>=7,"Good","Bad")
Days_Used_v5_Sleep<-Days_Used_v5
Days_Used_v5_Sleep["Hours_Asleep"][Days_Used_v5_Sleep["Hours_Asleep"] == 0] <- NA
Days_Used_v5_Sleep<-Days_Used_v5_Sleep %>% mutate(SleepQuality =
case_when(Hours_Asleep >= 7 ~ "Good",
Hours_Asleep < 7 ~ "Bad",
Hours_Asleep == NA ~ "NA"))
Days_Used_v5_Sleep %>%
ggplot(aes(x=ActivityDate,y=Hours_Asleep, color=SleepQuality))+
geom_point()+
facet_wrap("Id")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
labs(title="Good Sleep Days vs Bad Sleep Days of Each User",
x="Date",
y="Hours Asleep")
## Warning: Removed 499 rows containing missing values (`geom_point()`).
How many people are using the fitbit to track their weight?
n_distinct(WeightLog$Id)
## [1] 8
Only 8 of the 33 people are using it to track their weight. Is it hard to enter in their weight? Is this not their goal? Is anyone losing weight? Gaining weight?
ggplot(data=WeightLog)+
geom_point(mapping=aes(x=Date,y=WeightPounds))+
facet_wrap("Id")+
theme(axis.text.x = element_text(angle=90,hjust=1))+
labs(title="Weight Tracked by Each User",
x="Date",
y="Weight (Ibs)")
Of the 8 people who logged their weight, only 2 did so consistently. None of the IDs show significant changes in weight.