Import Necessary R Libraries

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate) 
library(dplyr)
library(ggplot2)
library(tidyr)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

Import All Required CSV Data Files

We need five CSV files:

dailyActivity_merged
dailyCalories_merged
hourlyIntensities_merged
sleepDay_merged
weightLogInfo_merged

# Load the daily activity table. ActivityDate arrives as character and is
# converted to a date-time in the cleaning step.
activity <- read_csv(
  file = "/cloud/project/fitbit csv data files/dailyActivity_merged.csv"
)

## Rows: 940 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ActivityDate
## dbl (14): Id, TotalSteps, TotalDistance, TrackerDistance, LoggedActivitiesDi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the daily calories table (one Calories value per Id and ActivityDay).
calories <- read_csv(
  file = "/cloud/project/fitbit csv data files/dailyCalories_merged.csv"
)

## Rows: 940 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityDay
## dbl (2): Id, Calories
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the hourly intensities table. ActivityHour arrives as character and is
# converted to a date-time in the cleaning step.
intensities <- read_csv(
  file = "/cloud/project/fitbit csv data files/hourlyIntensities_merged.csv"
)

## Rows: 22099 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityHour
## dbl (3): Id, TotalIntensity, AverageIntensity
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the daily sleep table. SleepDay arrives as character and is converted
# to a date-time in the cleaning step.
sleep <- read_csv(
  file = "/cloud/project/fitbit csv data files/sleepDay_merged.csv"
)

## Rows: 413 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): SleepDay
## dbl (4): Id, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the weight log table (weight, fat, BMI and log metadata per entry).
weight <- read_csv(
  file = "/cloud/project/fitbit csv data files/weightLogInfo_merged.csv"
)

## Rows: 67 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date
## dbl (6): Id, WeightKg, WeightPounds, Fat, BMI, LogId
## lgl (1): IsManualReport
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first six rows of the activity table.
activity %>% head(n = 6)

## # A tibble: 6 × 15
##           Id ActivityDate TotalSteps TotalDistance TrackerDistance
##        <dbl> <chr>             <dbl>         <dbl>           <dbl>
## 1 1503960366 4/12/2016         13162          8.5             8.5 
## 2 1503960366 4/13/2016         10735          6.97            6.97
## 3 1503960366 4/14/2016         10460          6.74            6.74
## 4 1503960366 4/15/2016          9762          6.28            6.28
## 5 1503960366 4/16/2016         12669          8.16            8.16
## 6 1503960366 4/17/2016          9705          6.48            6.48
## # ℹ 10 more variables: LoggedActivitiesDistance <dbl>,
## #   VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## #   LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## #   VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## #   LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>

# Preview the first six rows of the weight log.
weight %>% head(n = 6)

## # A tibble: 6 × 8
##           Id Date       WeightKg WeightPounds   Fat   BMI IsManualReport   LogId
##        <dbl> <chr>         <dbl>        <dbl> <dbl> <dbl> <lgl>            <dbl>
## 1 1503960366 5/2/2016 …     52.6         116.    22  22.6 TRUE           1.46e12
## 2 1503960366 5/3/2016 …     52.6         116.    NA  22.6 TRUE           1.46e12
## 3 1927972279 4/13/2016…    134.          294.    NA  47.5 FALSE          1.46e12
## 4 2873212765 4/21/2016…     56.7         125.    NA  21.5 TRUE           1.46e12
## 5 2873212765 5/12/2016…     57.3         126.    NA  21.7 TRUE           1.46e12
## 6 4319703577 4/17/2016…     72.4         160.    25  27.5 TRUE           1.46e12

Clean & Transform Data

The timestamp columns are read in as character strings, so we convert them to proper date-time values before analysis.

Intensities Data Frame

# intensities
# Parse the 12-hour "month/day/year hour:minute:second AM/PM" strings into
# POSIXct. The source strings carry no time zone, so a fixed zone ("UTC") is
# used instead of Sys.timezone(): the parsed wall-clock values then do not
# depend on the machine running the script and cannot land in a local
# daylight-saving gap (which would produce NA).
intensities$ActivityHour <- as.POSIXct(
  intensities$ActivityHour,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = "UTC"
)
# Character helper columns: 24-hour clock time and two-digit-year date.
intensities$time <- format(intensities$ActivityHour, format = "%H:%M:%S")
intensities$date <- format(intensities$ActivityHour, format = "%m/%d/%y")

Activities Data Frame

# activity
# Parse the "month/day/year" strings into POSIXct (midnight of that day).
# A fixed zone ("UTC") replaces Sys.timezone() so the result is the same on
# every machine and midnight can never fall into a local daylight-saving gap.
activity$ActivityDate <- as.POSIXct(
  activity$ActivityDate,
  format = "%m/%d/%Y",
  tz = "UTC"
)
# Character date (two-digit year); used together with Id as the merge key.
activity$date <- format(activity$ActivityDate, format = "%m/%d/%y")

Sleep Data Frame

# sleep
# Parse the 12-hour "month/day/year hour:minute:second AM/PM" strings into
# POSIXct. A fixed zone ("UTC") replaces Sys.timezone() so the parsed values
# are the same on every machine and cannot land in a local daylight-saving
# gap (which would produce NA).
sleep$SleepDay <- as.POSIXct(
  sleep$SleepDay,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = "UTC"
)
# Character date (two-digit year); used together with Id as the merge key.
sleep$date <- format(sleep$SleepDay, format = "%m/%d/%y")

Analyze Data

First, we check the sample size of each data set, because some users may not have provided every type of data (for example, sleep).

# Count the distinct respondent Ids in each data set, starting with activity.
activity %>%
  pull(Id) %>%
  n_distinct()

## [1] 33

# Distinct respondent Ids in the calories table.
calories %>%
  pull(Id) %>%
  n_distinct()

## [1] 33

# Distinct respondent Ids in the intensities table.
intensities %>%
  pull(Id) %>%
  n_distinct()

## [1] 33

# Distinct respondent Ids in the sleep table.
sleep %>%
  pull(Id) %>%
  n_distinct()

## [1] 24

# Distinct respondent Ids in the weight log.
weight %>%
  pull(Id) %>%
  n_distinct()

## [1] 8

Check for abnormal changes in respondents' weights.

# Lowest and highest logged weight (kg) per respondent; a wide gap between
# the two would indicate an abnormal change. The summary columns are left
# unnamed, so they take their names from the expressions themselves.
summarise(
  group_by(weight, Id),
  min(WeightKg),
  max(WeightKg)
)

## # A tibble: 8 × 3
##           Id `min(WeightKg)` `max(WeightKg)`
##        <dbl>           <dbl>           <dbl>
## 1 1503960366            52.6            52.6
## 2 1927972279           134.            134. 
## 3 2873212765            56.7            57.3
## 4 4319703577            72.3            72.4
## 5 4558609924            69.1            70.3
## 6 5577150313            90.7            90.7
## 7 6962181067            61              62.5
## 8 8877689391            84              85.8

Check the summary of all data frames (Min/Quartiles/Median/Mean/Max)

Activity Data Frame

# Min, quartiles, median, mean and max of the headline daily activity columns.
summary(
  select(activity, TotalSteps, TotalDistance, SedentaryMinutes, Calories)
)

##    TotalSteps    TotalDistance    SedentaryMinutes    Calories   
##  Min.   :    0   Min.   : 0.000   Min.   :   0.0   Min.   :   0  
##  1st Qu.: 3790   1st Qu.: 2.620   1st Qu.: 729.8   1st Qu.:1828  
##  Median : 7406   Median : 5.245   Median :1057.5   Median :2134  
##  Mean   : 7638   Mean   : 5.490   Mean   : 991.2   Mean   :2304  
##  3rd Qu.:10727   3rd Qu.: 7.713   3rd Qu.:1229.5   3rd Qu.:2793  
##  Max.   :36019   Max.   :28.030   Max.   :1440.0   Max.   :4900

Calories Data Frame

# Min, quartiles, median, mean and max of daily calories.
summary(select(calories, Calories))

##     Calories   
##  Min.   :   0  
##  1st Qu.:1828  
##  Median :2134  
##  Mean   :2304  
##  3rd Qu.:2793  
##  Max.   :4900

Sleep Data Frame

# Min, quartiles, median, mean and max of the sleep measures.
summary(
  select(sleep, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed)
)

##  TotalSleepRecords TotalMinutesAsleep TotalTimeInBed 
##  Min.   :1.000     Min.   : 58.0      Min.   : 61.0  
##  1st Qu.:1.000     1st Qu.:361.0      1st Qu.:403.0  
##  Median :1.000     Median :433.0      Median :463.0  
##  Mean   :1.119     Mean   :419.5      Mean   :458.6  
##  3rd Qu.:1.000     3rd Qu.:490.0      3rd Qu.:526.0  
##  Max.   :3.000     Max.   :796.0      Max.   :961.0

Active Minutes Per Category

# Min, quartiles, median, mean and max of minutes spent at each active level.
summary(
  select(activity, VeryActiveMinutes, FairlyActiveMinutes, LightlyActiveMinutes)
)

##  VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes
##  Min.   :  0.00    Min.   :  0.00      Min.   :  0.0       
##  1st Qu.:  0.00    1st Qu.:  0.00      1st Qu.:127.0       
##  Median :  4.00    Median :  6.00      Median :199.0       
##  Mean   : 21.16    Mean   : 13.56      Mean   :192.8       
##  3rd Qu.: 32.00    3rd Qu.: 19.00      3rd Qu.:264.0       
##  Max.   :210.00    Max.   :143.00      Max.   :518.0

Weight Data Frame

# Min, quartiles, median, mean and max of logged weight (kg) and BMI.
summary(select(weight, WeightKg, BMI))

##     WeightKg           BMI       
##  Min.   : 52.60   Min.   :21.45  
##  1st Qu.: 61.40   1st Qu.:23.96  
##  Median : 62.50   Median :24.39  
##  Mean   : 72.04   Mean   :25.19  
##  3rd Qu.: 85.05   3rd Qu.:25.56  
##  Max.   :133.50   Max.   :47.54

Insights about these summaries:

Average sedentary time: about 16.5 hours/day (991 minutes)
Average No. of Steps/Day: 7638. [CDC recommendation 10,000 Steps/Day]
Most of the participants' activity is light.
Average Participant Burns: 97 Cal/Hour
Average Participants Sleep: 7 Hours/Day

Merge Data

Before visualizing the data, the sleep and activity datasets are merged with an inner join on the Id and date columns.

# Inner join: keep only the (Id, date) pairs that appear in both tables.
merged_data <- merge(
  x = sleep,
  y = activity,
  by = c("Id", "date"),
  all = FALSE
)
head(merged_data)

##           Id     date   SleepDay TotalSleepRecords TotalMinutesAsleep
## 1 1503960366 04/12/16 2016-04-12                 1                327
## 2 1503960366 04/13/16 2016-04-13                 2                384
## 3 1503960366 04/15/16 2016-04-15                 1                412
## 4 1503960366 04/16/16 2016-04-16                 2                340
## 5 1503960366 04/17/16 2016-04-17                 1                700
## 6 1503960366 04/19/16 2016-04-19                 1                304
##   TotalTimeInBed ActivityDate TotalSteps TotalDistance TrackerDistance
## 1            346   2016-04-12      13162          8.50            8.50
## 2            407   2016-04-13      10735          6.97            6.97
## 3            442   2016-04-15       9762          6.28            6.28
## 4            367   2016-04-16      12669          8.16            8.16
## 5            712   2016-04-17       9705          6.48            6.48
## 6            320   2016-04-19      15506          9.88            9.88
##   LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1                        0               1.88                     0.55
## 2                        0               1.57                     0.69
## 3                        0               2.14                     1.26
## 4                        0               2.71                     0.41
## 5                        0               3.19                     0.78
## 6                        0               3.53                     1.32
##   LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1                6.06                       0                25
## 2                4.71                       0                21
## 3                2.83                       0                29
## 4                5.04                       0                36
## 5                2.51                       0                38
## 6                5.03                       0                50
##   FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1                  13                  328              728     1985
## 2                  19                  217              776     1797
## 3                  34                  209              726     1745
## 4                  10                  221              773     1863
## 5                  20                  164              539     1728
## 6                  31                  264              775     2035

Data Visualization & Share

1. Scatter Plot (Total Steps Vs Calories Burned)

To check if there is a correlation between total number of steps taken and calories burned. The more steps each participant takes, the more calories they burn.

# Daily steps against calories burned, with the default smoother overlaid.
ggplot(activity, aes(x = TotalSteps, y = Calories)) +
  geom_point(colour = "purple") +
  geom_smooth() +
  labs(title = "Total Steps vs. Calories")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

2. Scatter Plot (Total Time Asleep Vs Total Time In Bed)

We get a positive correlation between total time asleep vs total time in bed. To improve sleep quality for its users, bellabeat should consider having a section where users can customize their sleep schedule to ensure consistency.

# Minutes asleep against minutes in bed, one point per sleep record.
ggplot(sleep, aes(x = TotalMinutesAsleep, y = TotalTimeInBed)) +
  geom_point(color = "#E5A9A9") +
  labs(title = "Total time asleep vs Total time in bed")

3. Scatter Plot (Sedentary Minutes Vs Total Minutes Asleep)

# Sedentary minutes against minutes asleep on the merged sleep/activity rows.
ggplot(merged_data, aes(x = SedentaryMinutes, y = TotalMinutesAsleep)) +
  geom_point(color = "#D66BA0") +
  labs(title = "Sleep Duration and Sedentary Time")

# Correlation (cor() default method) between minutes asleep and sedentary
# minutes.
with(merged_data, cor(TotalMinutesAsleep, SedentaryMinutes))

## [1] -0.599394

Observing the graph, it’s evident that there’s a negative correlation between SedentaryMinutes and TotalMinutesAsleep. This implies that individuals who are less active tend to get less sleep.

Next, let’s explore the influence of the day of the week on our activity levels and sleep patterns.

4. Aggregate Data By Day Of The Week To Summarize Averages

# Tag every merged row with its weekday, derived from SleepDay.
merged_data <- merged_data %>%
  mutate(day = wday(SleepDay, label = TRUE))

# Average each activity and sleep measure per weekday.
# NOTE(review): AvgAwakeTimeInBed is the mean of TotalTimeInBed, i.e. total
# time in bed rather than awake-only time. The name is kept as-is because
# the printed output (and possibly later code) refers to it.
summarized_activity_sleep <- merged_data %>%
  group_by(day) %>%
  summarise(
    AvgDailySteps = mean(TotalSteps),
    AvgAsleepMinutes = mean(TotalMinutesAsleep),
    AvgAwakeTimeInBed = mean(TotalTimeInBed),
    AvgSedentaryMinutes = mean(SedentaryMinutes),
    AvgLightlyActiveMinutes = mean(LightlyActiveMinutes),
    AvgFairlyActiveMinutes = mean(FairlyActiveMinutes),
    AvgVeryActiveMinutes = mean(VeryActiveMinutes),
    AvgCalories = mean(Calories)
  )
head(summarized_activity_sleep)

## # A tibble: 6 × 9
##   day   AvgDailySteps AvgAsleepMinutes AvgAwakeTimeInBed AvgSedentaryMinutes
##   <ord>         <dbl>            <dbl>             <dbl>               <dbl>
## 1 Sun           7298.             453.              504.                688.
## 2 Mon           9340.             419.              456.                718.
## 3 Tue           9183.             405.              443.                740.
## 4 Wed           8023.             435.              470.                714.
## 5 Thu           8205.             402.              436.                701.
## 6 Fri           7901.             405.              445.                743.
## # ℹ 4 more variables: AvgLightlyActiveMinutes <dbl>,
## #   AvgFairlyActiveMinutes <dbl>, AvgVeryActiveMinutes <dbl>, AvgCalories <dbl>

5. Bar Graph Chart (AVG Daily Steps/Day)

According to this chart, participants are most active on Saturdays and least active on Sundays.

# One bar per weekday showing the average daily step count.
ggplot(summarized_activity_sleep, aes(x = day, y = AvgDailySteps)) +
  geom_col(fill = "purple") +
  labs(title = "Daily Step Count")

FitBit Data Analysis for Bellabeat Marketing Strategy

Import Necessary R Libraries

Import All Required CSV Data Files

Clean & Transform Data

Intensities Data Frame

Activities Data Frame

Sleep Data Frame

Analyze Data

Check abnormal changes in respondents weights.

Check the summary of all data frames (Min/Quartiles/Median/Mean/Max)

Activity Data Frame

Calories Data Frame

Sleep Data Frame

Active Minutes Per Category

Weight Data Frame

Insights about these summaries:

Merge Data

Data Visualization & Share

1. Scatter Plot (Total Steps Vs Calories Burned)

2. Scatter Plot (Total Time Asleep Vs Total Time In Bed)

3. Scatter Plot (Sedentary Minutes Vs Total Minutes Asleep)

4. Aggregate Data By Day Of The Week To Summarize Averages

5. Bar Graph Chart (AVG Daily Steps/Day)

Share Insights

Data Analysis Recommendations:

Thanks!