Step 3 - Process
- In the process phase, we have to process the data via
cleaning. Then, we have to verify that it is pertinent, correct, and
without error.
- We have to check whether there are any missing or null values in the
dataset.
- We have to transform the dataset into the format needed for later
analysis.
Installing and Loading Packages
# Install only the packages that are not already available, so that
# re-running the document does not re-download every package each time.
cran_repo <- "http://cran.us.r-project.org"
cran_packages <- c(
  "readxl", "dplyr", "readr", "janitor", "lubridate", "tidyverse",
  "leaflet", "gridExtra", "ggcorrplot", "ggrepel", "remotes", "devtools"
)
# requireNamespace() reports availability without attaching the package;
# the library() calls below do the attaching.
is_installed <- vapply(
  cran_packages, requireNamespace, logical(1),
  quietly = TRUE
)
missing_packages <- cran_packages[!is_installed]
if (length(missing_packages) > 0) {
  # An explicit repos value is needed when running non-interactively.
  install.packages(missing_packages, repos = cran_repo)
}
# RWordPress (and the XMLRPC package listed with it) is installed from GitHub.
if (!requireNamespace("RWordPress", quietly = TRUE)) {
  devtools::install_github(c("duncantl/XMLRPC", "duncantl/RWordPress"))
}
library(readxl)
library(dplyr)
library(readr)
library(janitor)
library(lubridate)
library(tidyverse)
library(leaflet)
library(gridExtra)
library(ggcorrplot)
library(ggrepel)
library(RWordPress)
Loading Datasets
# Directory holding the exported CSV files. Change it here (one place) to
# run the analysis from another location.
data_dir <- "/Users/bethanyleach/Desktop/bellabeats_data_info"

# Read one CSV file from `data_dir` with readr::read_csv().
read_fitbit_csv <- function(file_name) {
  read_csv(file.path(data_dir, file_name))
}

daily_activity <- read_fitbit_csv("dailyActivity_merged.csv")
daily_calories <- read_fitbit_csv("dailyCalories_merged.csv")
daily_intensities <- read_fitbit_csv("dailyIntensities_merged.csv")
hourly_intensities <- read_fitbit_csv("hourlyIntensities_merged.csv")
daily_steps <- read_fitbit_csv("dailySteps_merged.csv")
hourly_steps <- read_fitbit_csv("hourlySteps_merged.csv")
heartrate_seconds <- read_fitbit_csv("heartrate_seconds_merged.csv")
sleep_day <- read_fitbit_csv("sleepDay_merged.csv")
weight_log_info <- read_fitbit_csv("weightLogInfo_merged.csv")
min_intensity <- read_fitbit_csv("minuteIntensitiesNarrow_merged.csv")
# Number of distinct Id values in each dataset. The trailing comment on each
# call records the value returned when the document was run.
n_distinct(daily_activity$Id) #33
## [1] 33
n_distinct(daily_calories$Id) #33
## [1] 33
n_distinct(daily_intensities$Id) #33
## [1] 33
n_distinct(daily_steps$Id) #33
## [1] 33
n_distinct(heartrate_seconds$Id) #14
## [1] 14
n_distinct(sleep_day$Id) #24
## [1] 24
n_distinct(weight_log_info$Id) #8
## [1] 8
n_distinct(hourly_intensities$Id) #33
## [1] 33
# Total number of missing (NA) cells in each dataset.
sum(is.na(daily_activity))
## [1] 0
sum(is.na(daily_calories))
## [1] 0
sum(is.na(daily_intensities))
## [1] 0
sum(is.na(daily_steps))
## [1] 0
sum(is.na(heartrate_seconds))
## [1] 0
sum(is.na(sleep_day))
## [1] 0
sum(is.na(weight_log_info))
## [1] 65
sum(is.na(hourly_intensities))
## [1] 0
# Number of rows that exactly duplicate an earlier row in each dataset.
sum(duplicated(daily_activity))
## [1] 0
sum(duplicated(daily_calories))
## [1] 0
sum(duplicated(daily_intensities))
## [1] 0
sum(duplicated(daily_steps))
## [1] 0
sum(duplicated(heartrate_seconds))
## [1] 0
sum(duplicated(sleep_day))
## [1] 3
sum(duplicated(weight_log_info))
## [1] 0
sum(duplicated(hourly_intensities))
## [1] 0
# Drop rows of the sleep data that exactly duplicate another row.
sleep_day <- distinct(sleep_day)

# Parse the text timestamps as POSIXct in the machine's time zone and add
# text columns (`date`, `hour`) derived from the parsed values.
local_tz <- Sys.timezone()

activity_daily <- daily_activity
activity_daily$ActivityDate <- as.POSIXct(
  activity_daily$ActivityDate,
  format = "%m/%d/%Y",
  tz = local_tz
)
activity_daily$date <- format(activity_daily$ActivityDate, format = "%m/%d/%y")

sleep_day$SleepDay <- as.POSIXct(
  sleep_day$SleepDay,
  format = "%m/%d/%Y",
  tz = local_tz
)
sleep_day$date <- format(sleep_day$SleepDay, format = "%m/%d/%y")

heart_rate <- heartrate_seconds
heart_rate$Time <- as.POSIXct(
  heart_rate$Time,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = local_tz
)
heart_rate$hour <- format(heart_rate$Time, format = "%H:%M:%S")
heart_rate$date <- format(heart_rate$Time, format = "%m/%d/%y")
head(activity_daily)
## # A tibble: 6 × 16
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## <dbl> <dttm> <dbl> <dbl> <dbl>
## 1 1503960366 2016-04-12 00:00:00 13162 8.5 8.5
## 2 1503960366 2016-04-13 00:00:00 10735 6.97 6.97
## 3 1503960366 2016-04-14 00:00:00 10460 6.74 6.74
## 4 1503960366 2016-04-15 00:00:00 9762 6.28 6.28
## 5 1503960366 2016-04-16 00:00:00 12669 8.16 8.16
## 6 1503960366 2016-04-17 00:00:00 9705 6.48 6.48
## # ℹ 11 more variables: LoggedActivitiesDistance <dbl>,
## # VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## # LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## # VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## # LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## # date <chr>
head(sleep_day)
## # A tibble: 6 × 6
## Id SleepDay TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## <dbl> <dttm> <dbl> <dbl> <dbl>
## 1 1.50e9 2016-04-12 00:00:00 1 327 346
## 2 1.50e9 2016-04-13 00:00:00 2 384 407
## 3 1.50e9 2016-04-15 00:00:00 1 412 442
## 4 1.50e9 2016-04-16 00:00:00 2 340 367
## 5 1.50e9 2016-04-17 00:00:00 1 700 712
## 6 1.50e9 2016-04-19 00:00:00 1 304 320
## # ℹ 1 more variable: date <chr>
head(heart_rate)
## # A tibble: 6 × 5
## Id Time Value hour date
## <dbl> <dttm> <dbl> <chr> <chr>
## 1 2022484408 2016-04-12 07:21:00 97 07:21:00 04/12/16
## 2 2022484408 2016-04-12 07:21:05 102 07:21:05 04/12/16
## 3 2022484408 2016-04-12 07:21:10 105 07:21:10 04/12/16
## 4 2022484408 2016-04-12 07:21:20 103 07:21:20 04/12/16
## 5 2022484408 2016-04-12 07:21:25 101 07:21:25 04/12/16
## 6 2022484408 2016-04-12 07:22:05 95 07:22:05 04/12/16
# Keep only rows where both total distance and total steps are non-zero.
activity_daily <- filter(activity_daily, TotalDistance != 0, TotalSteps != 0)
- Guiding Questions
- What tools are you choosing and why? I
used RStudio for all data cleaning, transformation, analysis, and
visualization. To begin, I installed and then read all of the packages I
deemed necessary for this project. R is a straightforward programming
language and therefore can easily be used to load, organize, modify, and
clean datasets and then to construct meaningful data visualizations in
order to convey the technical information to stakeholders.
- Have you ensured your data’s integrity?
Since the data in this dataset was collected via survey, I am not able
to determine its accuracy or level of integrity.
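- What steps have you taken to ensure that your data is
clean? To ensure that my data is clean, I checked every dataset for
duplicates and NA values and removed the duplicate rows found in the
sleep data. The only NA values found were in the weight log, and they
were left in place. In addition, I formatted data and converted columns
to different formats.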
- How can you verify that your data is clean and ready to
analyze? I can verify that my data is clean and ready to
analyze by using code to confirm that the removal of NA values and
duplicates was executed successfully.
- Have you documented your cleaning process so you can
review and share those results? I documented my cleaning
process in labeled code chunks since I used RMarkdown to write and
publish this project.
Step 4 - Analyze and Step 5- Share
Now that the data is stored appropriately and has been
prepared for analysis, it’s time to start putting it to work. During
this phase, I will summarize the data in order to point out
comprehensive insight about said data so that I can complete the
business task.
Once I’ve analyzed the data, I will share what I found via
visualizations.
nrow(activity_daily)
## [1] 862
nrow(sleep_day)
## [1] 410
nrow(heart_rate)
## [1] 2483658
nrow(hourly_intensities)
## [1] 22099
summary(activity_daily)
## Id ActivityDate TotalSteps
## Min. :1.504e+09 Min. :2016-04-12 00:00:00.00 Min. : 8
## 1st Qu.:2.320e+09 1st Qu.:2016-04-18 00:00:00.00 1st Qu.: 4927
## Median :4.445e+09 Median :2016-04-26 00:00:00.00 Median : 8054
## Mean :4.861e+09 Mean :2016-04-26 02:15:18.78 Mean : 8329
## 3rd Qu.:6.962e+09 3rd Qu.:2016-05-03 00:00:00.00 3rd Qu.:11096
## Max. :8.878e+09 Max. :2016-05-12 00:00:00.00 Max. :36019
## TotalDistance TrackerDistance LoggedActivitiesDistance VeryActiveDistance
## Min. : 0.010 Min. : 0.010 Min. :0.000 Min. : 0.000
## 1st Qu.: 3.373 1st Qu.: 3.373 1st Qu.:0.000 1st Qu.: 0.000
## Median : 5.590 Median : 5.590 Median :0.000 Median : 0.410
## Mean : 5.986 Mean : 5.971 Mean :0.118 Mean : 1.639
## 3rd Qu.: 7.905 3rd Qu.: 7.880 3rd Qu.:0.000 3rd Qu.: 2.277
## Max. :28.030 Max. :28.030 Max. :4.942 Max. :21.920
## ModeratelyActiveDistance LightActiveDistance SedentaryActiveDistance
## Min. :0.0000 Min. : 0.000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.: 2.350 1st Qu.:0.000000
## Median :0.3100 Median : 3.580 Median :0.000000
## Mean :0.6189 Mean : 3.643 Mean :0.001752
## 3rd Qu.:0.8675 3rd Qu.: 4.897 3rd Qu.:0.000000
## Max. :6.4800 Max. :10.710 Max. :0.110000
## VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:147.0 1st Qu.: 721.2
## Median : 7.00 Median : 8.00 Median :208.5 Median :1020.5
## Mean : 23.04 Mean : 14.79 Mean :210.3 Mean : 955.2
## 3rd Qu.: 35.00 3rd Qu.: 21.00 3rd Qu.:272.0 3rd Qu.:1189.0
## Max. :210.00 Max. :143.00 Max. :518.0 Max. :1440.0
## Calories date
## Min. : 52 Length:862
## 1st Qu.:1857 Class :character
## Median :2220 Mode :character
## Mean :2362
## 3rd Qu.:2832
## Max. :4900
summary(sleep_day)
## Id SleepDay TotalSleepRecords
## Min. :1.504e+09 Min. :2016-04-12 00:00:00.00 Min. :1.00
## 1st Qu.:3.977e+09 1st Qu.:2016-04-19 00:00:00.00 1st Qu.:1.00
## Median :4.703e+09 Median :2016-04-27 00:00:00.00 Median :1.00
## Mean :4.995e+09 Mean :2016-04-26 11:38:55.60 Mean :1.12
## 3rd Qu.:6.962e+09 3rd Qu.:2016-05-04 00:00:00.00 3rd Qu.:1.00
## Max. :8.792e+09 Max. :2016-05-12 00:00:00.00 Max. :3.00
## TotalMinutesAsleep TotalTimeInBed date
## Min. : 58.0 Min. : 61.0 Length:410
## 1st Qu.:361.0 1st Qu.:403.8 Class :character
## Median :432.5 Median :463.0 Mode :character
## Mean :419.2 Mean :458.5
## 3rd Qu.:490.0 3rd Qu.:526.0
## Max. :796.0 Max. :961.0
summary(heart_rate)
## Id Time Value
## Min. :2.022e+09 Min. :2016-04-12 00:00:00.00 Min. : 36.00
## 1st Qu.:4.388e+09 1st Qu.:2016-04-19 06:18:10.00 1st Qu.: 63.00
## Median :5.554e+09 Median :2016-04-26 20:28:50.00 Median : 73.00
## Mean :5.514e+09 Mean :2016-04-26 19:43:52.24 Mean : 77.33
## 3rd Qu.:6.962e+09 3rd Qu.:2016-05-04 08:00:20.00 3rd Qu.: 88.00
## Max. :8.878e+09 Max. :2016-05-12 16:20:00.00 Max. :203.00
## hour date
## Length:2483658 Length:2483658
## Class :character Class :character
## Mode :character Mode :character
##
##
##
summary(hourly_intensities)
## Id ActivityHour TotalIntensity AverageIntensity
## Min. :1.504e+09 Length:22099 Min. : 0.00 Min. :0.0000
## 1st Qu.:2.320e+09 Class :character 1st Qu.: 0.00 1st Qu.:0.0000
## Median :4.445e+09 Mode :character Median : 3.00 Median :0.0500
## Mean :4.848e+09 Mean : 12.04 Mean :0.2006
## 3rd Qu.:6.962e+09 3rd Qu.: 16.00 3rd Qu.:0.2667
## Max. :8.878e+09 Max. :180.00 Max. :3.0000
# Smoothed trend of daily calories over the activity date.
calories_activity_date <- ggplot(
  activity_daily,
  aes(x = ActivityDate, y = Calories)
) +
  geom_smooth() +
  labs(x = "Date of Activity", y = "Calories burned (kcal)") +
  ggtitle("Relationship Between Activity Date and Calories Burned") +
  theme(plot.title = element_text(hjust = 0.5))
calories_activity_date
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

- Explanation
- Based off this graph, a consistently large amount of
calories (~2400) were being burned during the springtime (mid April to
early May), but then there was a huge drop off by mid May.
This implies that either fitbit users met their fitness goals or
they weren’t tracking their movement as frequently. Moving forward,
marketers who are focused on users who regularly track their fitness
data should release their campaigns during Springtime because those
users might be trying to exercise more frequently before
Summertime.
- Activity Type Deep Dive
- In this dataset, there are four different types of activity (Very
Active, Fairly Active, Lightly Active, and Sedentary). In order to learn
more about users’ habits, I will calculate the average minutes tracked
for each designation of activity level.
# Mean minutes recorded in each of the four activity-level columns, drawn as
# one bar per level.
activity_daily_categs <- select(
  activity_daily,
  VeryActiveMinutes, FairlyActiveMinutes, LightlyActiveMinutes, SedentaryMinutes
)
activity_daily_categs_avgs <- c(
  VeryActive = mean(activity_daily_categs[["VeryActiveMinutes"]]),
  FairlyActive = mean(activity_daily_categs[["FairlyActiveMinutes"]]),
  LightlyActive = mean(activity_daily_categs[["LightlyActiveMinutes"]]),
  Sedentary = mean(activity_daily_categs[["SedentaryMinutes"]])
)
activity_daily_labels <- c("Very", "Fairly", "Lightly", "Sedentary")
activity_daily_avg_dataframe <- data.frame(
  activity_daily_categs_avgs,
  activity_daily_labels
)
# The averages are mapped to x and the labels to y; coord_flip() then swaps
# the displayed axes.
activity_daily_avg_barchart <- ggplot(activity_daily_avg_dataframe) +
  geom_col(
    aes(x = activity_daily_categs_avgs, y = activity_daily_labels),
    fill = "turquoise"
  ) +
  coord_flip() +
  labs(x = "Average Time Exercising (mins)", y = "Type of Activity") +
  ggtitle("Type of Activity vs Average Time Exercising") +
  theme(plot.title = element_text(hjust = 0.5))
activity_daily_avg_barchart

- Explanation
- **This bar plot clearly shows that users are mainly inactive while
tracking and the average for that category (Sedentary) is 4.5 times the
next average (Lightly Active).** It’s possible that these fitbit
users spend a large part of their day stationary (working at a desk/in
an office). Also, these users may be spending lengthy periods of time in
daily commutes. In addition, it’s possible that this bar chart also
encompasses time users were sleeping. Therefore, I’d need a more
detailed dataset in order to determine if the fitbits were tracking
‘active minutes’ while users were sleeping.
# Scatter of minutes asleep against minutes in bed, with a straight-line
# (lm) trend and no confidence band.
in_bed_sleep_graph <- ggplot(
  sleep_day,
  aes(x = TotalMinutesAsleep, y = TotalTimeInBed)
) +
  geom_point(color = "orange") +
  geom_smooth(color = "blue", method = lm, se = FALSE) +
  labs(x = "Total Time Asleep (min)", y = "Total Time in Bed (min)") +
  ggtitle("Relationship Between Total Time Asleep and Total Time in Bed") +
  theme(plot.title = element_text(hjust = 0.5))
in_bed_sleep_graph
## `geom_smooth()` using formula = 'y ~ x'

- Explanation
- There is clearly a strong positive correlation between the
users’ total minutes asleep and their total time in bed. However, there
are outliers in the middle and upper portions of the plot. The outliers
here are users who spent more time in bed, but weren’t actually
sleeping.
- This scenario presents an area of improvement for Bellabeat.
Although it’s true that exercise is helpful when trying to stay healthy,
sleep is also equally important. These users are not meeting the
recommendation of 8hrs of sleep per day. Therefore, Bellabeat could
encourage customers to set a daily bed time by adding a “Go to Sleep”
notification to their smart device. Doing so would likely lead to users
improving their health holistically.
# Combine activity and sleep rows that share the same user Id and date text.
activity_daily_sleep_day_merge <- merge(
  activity_daily,
  sleep_day,
  by = c("Id", "date")
)
# Scatter of sedentary minutes against minutes asleep with a straight-line
# (lm) trend and no confidence band.
sedentary_time_asleep_graph <- ggplot(
  activity_daily_sleep_day_merge,
  aes(x = SedentaryMinutes, y = TotalMinutesAsleep)
) +
  geom_point(color = "orange") +
  geom_smooth(color = "blue", method = lm, se = FALSE) +
  labs(x = "Inactive Minutes", y = "Total Minutes Asleep") +
  ggtitle("Inactive Minutes vs Total Minutes Asleep") +
  theme(plot.title = element_text(hjust = 0.5))
sedentary_time_asleep_graph
## `geom_smooth()` using formula = 'y ~ x'

# R^2 of the straight-line fit between sedentary minutes and minutes asleep.
# With a single predictor, R^2 is the same whichever variable is the response.
ml_sed_sleep <- lm(
  SedentaryMinutes ~ TotalMinutesAsleep,
  data = activity_daily_sleep_day_merge
)
summary(ml_sed_sleep)[["r.squared"]]
## [1] 0.3612889
- Explanation
- By definition, the coefficient of determination (\(r^2\)) measures how
much of the variability in one factor is explained by its relationship
to another factor. The coefficient of determination is expressed as a
value between 0 and 1, or 0 to 100 percent.
- This scatter plot looks at the relationship between a
user’s inactive minutes (SedentaryMinutes) and the total time they were
asleep (TotalMinutesAsleep). The coefficient of determination is 0.361
and this graph shows that the more minutes a user is inactive, the fewer
minutes they are asleep. Based on the negative slope of this
trend line, there is a negative relationship between the two variables,
suggesting that more inactivity is associated with less sleep (an
association, not proof of cause).
# Weekday name of each activity date. weekdays() is applied to the POSIXct
# column directly so the name is taken in the column's own time zone; going
# through as.Date() first converts via a UTC default, which can move a local
# midnight timestamp onto the previous day.
activity_daily_sleep_day_merge$day_of_week <- weekdays(
  activity_daily_sleep_day_merge$ActivityDate
)
# Total tracked minutes: the sum of the four activity-level minute columns.
activity_daily_sleep_day_merge$sum_minutes <- with(
  activity_daily_sleep_day_merge,
  VeryActiveMinutes + FairlyActiveMinutes + LightlyActiveMinutes + SedentaryMinutes
)
str(activity_daily_sleep_day_merge)
## 'data.frame': 410 obs. of 22 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ date : chr "04/12/16" "04/13/16" "04/15/16" "04/16/16" ...
## $ ActivityDate : POSIXct, format: "2016-04-12" "2016-04-13" ...
## $ TotalSteps : num 13162 10735 9762 12669 9705 ...
## $ TotalDistance : num 8.5 6.97 6.28 8.16 6.48 ...
## $ TrackerDistance : num 8.5 6.97 6.28 8.16 6.48 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.14 2.71 3.19 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 1.26 0.41 0.78 ...
## $ LightActiveDistance : num 6.06 4.71 2.83 5.04 2.51 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : num 25 21 29 36 38 50 28 19 41 39 ...
## $ FairlyActiveMinutes : num 13 19 34 10 20 31 12 8 21 5 ...
## $ LightlyActiveMinutes : num 328 217 209 221 164 264 205 211 262 238 ...
## $ SedentaryMinutes : num 728 776 726 773 539 775 818 838 732 709 ...
## $ Calories : num 1985 1797 1745 1863 1728 ...
## $ SleepDay : POSIXct, format: "2016-04-12" "2016-04-13" ...
## $ TotalSleepRecords : num 1 2 1 2 1 1 1 1 1 1 ...
## $ TotalMinutesAsleep : num 327 384 412 340 700 304 360 325 361 430 ...
## $ TotalTimeInBed : num 346 407 442 367 712 320 377 364 384 449 ...
## $ day_of_week : chr "Tuesday" "Wednesday" "Friday" "Saturday" ...
## $ sum_minutes : num 1094 1033 998 1040 761 ...
# Order weekdays Monday to Sunday so the bars appear in calendar order.
weekday_order <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)
activity_daily_sleep_day_merge$day_of_week <- factor(
  activity_daily_sleep_day_merge$day_of_week,
  levels = weekday_order
)
activity_daily_sleep_day_merge$TotalHoursAsleep <-
  activity_daily_sleep_day_merge$TotalMinutesAsleep / 60
# Mean hours asleep per weekday, with a reference line at 7 hours.
hrs_sleep_weekday <- ggplot(
  activity_daily_sleep_day_merge,
  aes(x = day_of_week, y = TotalHoursAsleep)
) +
  geom_bar(stat = "summary", fill = "turquoise", fun = "mean") +
  labs(x = "Day of the Week", y = "Average Hours Slept") +
  ggtitle("Average Time Slept (Hours) by Day of the Week") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_hline(yintercept = 7)
hrs_sleep_weekday

mean(activity_daily_sleep_day_merge$TotalHoursAsleep)
## [1] 6.98622
# Treat Id as text so each user gets a separate bar.
activity_daily_sleep_day_merge$Id <- as.character(
  activity_daily_sleep_day_merge$Id
)
# Mean hours asleep per user, with a reference line at 7 hours; the x-axis
# text and ticks are hidden.
hrs_sleep_user <- ggplot(
  activity_daily_sleep_day_merge,
  aes(x = Id, y = TotalHoursAsleep)
) +
  geom_bar(stat = "summary", fill = "turquoise", fun = "mean") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  geom_hline(yintercept = 7) +
  labs(x = "User", y = "Average Hours Slept") +
  ggtitle("Average Time Slept (Hours) by User")
hrs_sleep_user

- Explanation
- On average, users get 7 hours of sleep per night. The first bar
plot clearly shows that users get the most sleep on Sunday. The overall
7-hour average aligns with the recommendation that adults try to sleep 7
hours per night.
- The second bar plot shows the average hours asleep by user. This
plot clearly shows a significant amount of variation in the average
hours a user spends sleeping with a handful of users regularly sleeping
less than 7 hours per night and a couple of users sleeping more than 8
hours per night. The horizontal line at 7 hours featured in both plots
is present to show the recommended daily hours of sleep for adults per
The Mayo Clinic. This second bar plot can be used to pinpoint users who
may be helped by sleep-related research. I omitted the user id so as to
keep the bar plot clean, but the information is easily accessible in
the dataframe I used to create the plot.
# Count the distinct dates each user has activity rows for, then label each
# user by how many days that is.
user_info_activity_daily <- activity_daily %>%
  group_by(Id) %>%
  summarize(num_days = n_distinct(date)) %>%
  mutate(
    Usage = case_when(
      num_days >= 27 ~ "Daily Usage",
      num_days >= 20 ~ "Regular Usage",
      num_days >= 13 ~ "Moderate Usage",
      num_days >= 6 ~ "Intermittent Usage",
      num_days < 6 ~ "Seldom Usage"
    )
  )

# Number and percentage of users in each usage category. y_position is the
# midpoint of each category's cumulative percentage span.
user_info_activity_daily_totals <- user_info_activity_daily %>%
  count(Usage, name = "Total") %>%
  mutate(
    Percentage = (Total / sum(Total)) * 100,
    y_position = cumsum(Percentage) - 0.5 * Percentage
  )

# Pie chart of the usage categories, labelled with the number of users in
# each. The title names the quantity actually shown (days of use).
usage_pie_chart <- ggplot(
  user_info_activity_daily_totals,
  aes(x = "", y = Percentage, fill = Usage)
) +
  coord_polar(theta = "y") +
  geom_col(color = "black") +
  geom_text(
    aes(label = Total),
    color = "black",
    size = 7,
    position = position_stack(vjust = 0.5),
    show.legend = FALSE
  ) +
  theme_void() +
  ggtitle("Usage Distribution of Tracking Device by Number of Days Used")
usage_pie_chart

Explanation
- Of the 33 distinct users, 21 of them (63.6%) used their devices
on a daily (27 or more days) basis, 5 of them (15.2%) used them
regularly (20 or more days), 6 of them (18.2%) used them moderately (13
or more days), and 1 person (3%) seldom used the device (fewer than 6
days). I found it interesting that only one person wore their tracker
fewer than 6 days a month and the next lowest user wore their device 17
days a month. This data implies that the tracker is worn by most users
and therefore it could be a helpful tool for developing healthy habits.
In addition, this device could be helpful in tracking users’
workouts.
- This pie chart visually shows how often users wear their devices
and therefore can be helpful in explaining to stakeholders (Urska Srsen,
Sando Mur, Bellabeat marketing team) how users interact with their
devices daily. In addition, this pie chart can be used to infer the
population of users that prioritize tracking their activity. Plus, this
pie chart can be used to create motivational information for users who
wear their trackers infrequently.
# Build a scatter plot of daily calories against one minutes column of
# `activity_daily`, with a straight-line (lm) trend and no confidence band.
#   minutes_column: name of the column to put on the x-axis (string).
#   axis_label:     text shown as the x-axis label.
calories_vs_minutes_plot <- function(minutes_column, axis_label) {
  ggplot(activity_daily, aes(x = .data[[minutes_column]], y = Calories)) +
    geom_point(color = "magenta") +
    geom_smooth(color = "navy", method = lm, se = FALSE) +
    labs(x = axis_label, y = "Calories Burned (kcal)")
}

sedentary_calories_plot <- calories_vs_minutes_plot(
  "SedentaryMinutes", "Sedentary Minutes"
)
lightly_active_calories_plot <- calories_vs_minutes_plot(
  "LightlyActiveMinutes", "Lightly Active Minutes"
)
fairly_active_calories_plot <- calories_vs_minutes_plot(
  "FairlyActiveMinutes", "Fairly Active Minutes"
)
very_active_calories_plot <- calories_vs_minutes_plot(
  "VeryActiveMinutes", "Very Active Minutes"
)

# Show the four plots side by side under one shared title.
grid.arrange(
  sedentary_calories_plot,
  lightly_active_calories_plot,
  fairly_active_calories_plot,
  very_active_calories_plot,
  nrow = 1,
  top = "Relationship Between Type of Activity and Calories Burned"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# R^2 (coefficient of determination) of a straight-line fit between each
# activity-minutes column and calories burned.
# Note: each model uses the minutes column as the response and Calories as
# the predictor, the reverse of the plots' axes. With a single predictor the
# R^2 value is the same either way.
ml_1 <- lm(SedentaryMinutes~Calories, data = activity_daily)
summary(ml_1)$r.squared
## [1] 0.0007985348
ml_2 <- lm(LightlyActiveMinutes~Calories, data = activity_daily)
summary(ml_2)$r.squared
## [1] 0.03197434
ml_3 <- lm(FairlyActiveMinutes~Calories, data = activity_daily)
summary(ml_3)$r.squared
## [1] 0.06966728
ml_4 <- lm(VeryActiveMinutes~Calories, data = activity_daily)
summary(ml_4)$r.squared
## [1] 0.3748316
## [1] 0.3748316
- Explanation
- After studying these four scatter plots, the largest
coefficient of determination is 0.375 (Very Active Minutes vs Calories
Burned). In this graph, it’s clear that a fitbit user who has a higher
number of active minutes usually burns more calories per day. The
inverse is also true, which means that the fewer active minutes a user
has, the fewer calories the user will burn per day.
# Split ActivityHour at its first space into a Date part and a Time part.
hourly_steps_adj <- data.frame(hourly_steps)
hourly_steps_adj[c("Date", "Time")] <- str_split_fixed(
  hourly_steps_adj$ActivityHour, " ", 2
)

# Mean step count for each distinct Time value.
hourly_steps_adj_amt <- as_tibble(hourly_steps_adj) %>%
  group_by(Time) %>%
  summarize(mean_steps = mean(StepTotal))

# Chronological order for the x-axis:
# "12:00:00 AM", "1:00:00 AM", ..., "11:00:00 AM", "12:00:00 PM", ..., "11:00:00 PM".
hour_levels <- paste0(
  rep(c(12, 1:11), times = 2),
  ":00:00 ",
  rep(c("AM", "PM"), each = 12)
)
hourly_steps_adj_amt$TimeDetails <- factor(
  hourly_steps_adj_amt$Time,
  levels = hour_levels
)

# One bar per hour, shaded by the mean step count. A single geom_col() layer
# is enough; an extra geom_bar(stat = "identity") layer would only draw the
# same bars underneath it.
hourly_steps_adj_plot <- ggplot(
  hourly_steps_adj_amt,
  aes(x = TimeDetails, y = mean_steps)
) +
  geom_col(aes(fill = mean_steps)) +
  scale_fill_gradient(low = "lavender", high = "purple") +
  ggtitle("Average Number of Steps by Hour of the Day") +
  labs(x = "Hour", y = "Average Number of Steps") +
  theme(
    axis.text.x = element_text(angle = 45),
    plot.title = element_text(hjust = 0.5)
  )
hourly_steps_adj_plot

- Explanation
- Based off this visualization, it’s clear that the users are the
most active from 7:00AM-7:00PM with the highest average number of steps
taken by users occurring during the 6 o’clock evening hour (599.17).
Users take the fewest number of steps during the late night and early
morning hours, which makes sense. The lowest average number of steps
taken by users occurred during the 3 o’clock morning hour (6.43). The
pattern of user activity seen in this plot makes sense as users are
going to be more active during the day - whether that be exercising or
moving around at work - than they would be at night. Gathering this
information would be beneficial to stakeholders because it provides them
with a clear understanding of users’ habits and then they can create
marketing materials to motivate users to be active during specific times
of the day.
# Scatter of total steps against calories with a straight-line (lm) trend
# and no confidence band.
total_steps_vs_total_calories <- ggplot(
  activity_daily,
  aes(x = TotalSteps, y = Calories)
) +
  geom_point(color = "green") +
  geom_smooth(color = "purple", method = lm, se = FALSE) +
  labs(x = "Total Steps", y = "Calories Burned (kcal)") +
  ggtitle("Total Steps Taken vs Calories Burned") +
  theme(plot.title = element_text(hjust = 0.5))
total_steps_vs_total_calories
## `geom_smooth()` using formula = 'y ~ x'

# R^2 of the straight-line fit between total steps and calories.
# With a single predictor, R^2 is the same whichever variable is the response.
ml_step_cal <- lm(
  TotalSteps ~ Calories,
  data = activity_daily
)
summary(ml_step_cal)[["r.squared"]]
## [1] 0.3144111
# Correlation matrix of total steps and calories (rows with missing values
# are excluded), drawn as a labelled lower-triangle correlation plot.
steps_calories <- select(activity_daily, TotalSteps, Calories)
cor_matrix_2 <- cor(steps_calories, use = "complete.obs")
ggcorrplot(
  cor_matrix_2,
  hc.order = TRUE,
  type = "lower",
  colors = c("lightblue", "white", "purple"),
  lab = TRUE,
  lab_size = 6,
  title = "Correlation plot for Total Steps Taken and Calories Burned"
)

- Explanation
- The coefficient of determination is 0.314. From this visualization,
it is very clear that the users who took the most steps tended to burn
the most calories. That being said, there is a lot of spread/variance
taking place here (the range of calories burned is very large - roughly
3000 kcal).