“Bellabeat is a high-tech manufacturer of health-focused products for women. Bellabeat wants to expand its business, so it wants to analyse the usage of one of its products in order to gain insight into how people are already using their smart devices. Then, using this information, Urška Sršen (cofounder and Chief Creative Officer of Bellabeat) would like high-level recommendations for how these trends can inform Bellabeat's marketing strategy.”
Stakeholders: Urška Sršen, Bellabeat’s co-founder and Chief Creative Officer; Sando Mur, mathematician and Bellabeat’s co-founder; and the Bellabeat marketing analytics team.
This is the case study for my Google Data Analytics Capstone. In it, I analyze smart device data for one of Bellabeat’s products to gain insight into how consumers are using their smart devices. Bellabeat is a high-tech manufacturer of health-focused products for women. The insights I discover will then help guide or enhance the company's marketing strategy.
Data preparation:
Source, Licensing, Privacy
Source: FitBit Fitness Tracker Data - (Dataset made available through Mobius)
License: CC0: Public Domain
Privacy: This Kaggle data set contains personal fitness tracker data from 30
Fitbit users who consented to the submission of their personal tracker data, including minute-level output for physical activity, heart rate, and sleep monitoring. It includes information about daily activity, steps, and heart rate that can be used to explore users’ habits.
if (!require('tidyverse'))
{
install.packages('tidyverse');
library(tidyverse);
}
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Attach 'here', installing it first when it is not available.
# The guard has to test 'here' itself: a guard on 'tidyverse' (attached just
# above) never enters the body, so 'here' would never be installed or attached.
if (!require("here")) {
  install.packages("here")
  library(here)
}
if(!require('skimr'))
{
install.packages('skimr');
library(skimr);
}
## Loading required package: skimr
if(!require('janitor'))
{
install.packages('janitor');
library(janitor)
}
## Loading required package: janitor
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
if(!require('lubridate'))
{
install.packages('lubridate');
library(lubridate)
}
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
if(!require('dplyr'))
{
install.packages('dplyr');
library(dplyr);
}
if(!require('ggplot2'))
{
install.packages('ggplot2');
library(ggplot2);
}
if(!require('tidyr'))
{
install.packages('tidyr');
library(tidyr);
}
if(!require('corrplot'))
{
install.packages('corrplot');
library(corrplot);
}
## Loading required package: corrplot
## corrplot 0.90 loaded
if(!require('ggpubr'))
{
install.packages('ggpubr');
library(ggpubr);
}
## Loading required package: ggpubr
if(!require('chron'))
{
install.packages('chron');
library(chron);
}
## Loading required package: chron
##
## Attaching package: 'chron'
## The following objects are masked from 'package:lubridate':
##
## days, hours, minutes, seconds, years
if(!require('hms'))
{
install.packages('hms');
library(hms);
}
## Loading required package: hms
##
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
##
## hms
# Attach 'kableExtra', installing it first when it is not available.
# install.packages() needs the package name as a string; the bare symbol
# kableExtra would be evaluated as an object and fail with "object not found".
if (!require("kableExtra")) {
  install.packages("kableExtra")
  library(kableExtra)
}
## Loading required package: kableExtra
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
if(!require('magrittr'))
{
install.packages('magrittr');
library(magrittr);
}
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
if(!require('scales'))
{
install.packages('scales');
library(scales);
}
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
if(!require('devtools'))
{
install.packages('devtools');
library(devtools)
}
## Loading required package: devtools
## Loading required package: usethis
if(!require('readxl'))
{
install.packages('readxl');
library(readxl);
}
## Loading required package: readxl
if(!require('tinytex'))
{
install.packages('tinytex');
library(tinytex)
}
## Loading required package: tinytex
As I explored all the files, I found that many files' columns are already present in the dailyActivity_merged file, so here we take only the files that we need to analyze. Those files are:
dailyActivity_merged
hourlyIntensities_merged
heartrate_seconds_merged
sleepDay_merged
weightLogInfo_merged
###- importing files:
# Read each CSV export (file names are resolved against the working directory).
Activity           <- read.csv(file = "dailyActivity_merged.csv")
Hourly_Calories    <- read.csv(file = "hourlyCalories_merged.csv")
Hourly_Intensities <- read.csv(file = "hourlyIntensities_merged.csv")
Hourly_Steps       <- read.csv(file = "hourlySteps_merged.csv")
Heartrate_Seconds  <- read.csv(file = "heartrate_seconds_merged.csv")
Sleep_Day          <- read.csv(file = "sleepDay_merged.csv")
WeightLogInfo      <- read.csv(file = "weightLogInfo_merged.csv")

# Preview the first rows of every table, in the same order they were read.
head(x = Activity)
head(x = Hourly_Calories)
head(x = Hourly_Intensities)
head(x = Hourly_Steps)
head(x = Heartrate_Seconds)
head(x = Sleep_Day)
head(x = WeightLogInfo)
# Number of distinct participant Ids found in each table.
n_distinct(Activity$Id)
## [1] 33
n_distinct(Hourly_Intensities$Id)
## [1] 33
n_distinct(Heartrate_Seconds$Id)
## [1] 7
n_distinct(Sleep_Day$Id)
## [1] 24
n_distinct(WeightLogInfo$Id)
## [1] 8
n_distinct(Hourly_Calories$Id)
## [1] 33
n_distinct(Hourly_Steps$Id)
## [1] 33
# Does a table contain at least one fully duplicated row?
# anyDuplicated() returns 0 when there is none, so "> 0" yields TRUE/FALSE.
anyDuplicated(Activity) > 0
## [1] FALSE
anyDuplicated(Hourly_Calories) > 0
## [1] FALSE
anyDuplicated(Hourly_Intensities) > 0
## [1] FALSE
anyDuplicated(Hourly_Steps) > 0
## [1] FALSE
anyDuplicated(Heartrate_Seconds) > 0
## [1] TRUE
anyDuplicated(Sleep_Day) > 0
## [1] TRUE
anyDuplicated(WeightLogInfo) > 0
## [1] FALSE
# Remove exact duplicate rows from both tables that were reported as
# containing duplicates (Heartrate_Seconds and Sleep_Day), not Sleep_Day alone.
Heartrate_Seconds <- distinct(Heartrate_Seconds)
Sleep_Day <- distinct(Sleep_Day)
# Re-check Sleep_Day after de-duplication.
any(duplicated(Sleep_Day))
## [1] FALSE
In most of the files there are unnecessary columns that we will not use, so we are going to delete them first.
# ---- Clean the raw tables ----

# Drop the distance columns that are not used in this analysis.
Activity <- subset(
  Activity,
  select = -c(TrackerDistance, LoggedActivitiesDistance, SedentaryActiveDistance)
)
head(Activity)

# Drop only the pounds column. Id is kept so that every weight record stays
# attached to its participant (it is selected again further down).
WeightLogInfo <- subset(WeightLogInfo, select = -c(WeightPounds))
head(WeightLogInfo)

# Parse the activity date and derive the weekday name from it.
Activity <- Activity %>%
  mutate(
    ActivityDate = mdy(ActivityDate),
    Day_of_Week = weekdays(ActivityDate)
  )
head(Activity)

# Label each sleep record by hours asleep: 6-9 h is "Good Sleep".
Sleep_Day <- Sleep_Day %>%
  mutate(
    Sleep_Amount = case_when(
      TotalMinutesAsleep / 60 >= 6.0 & TotalMinutesAsleep / 60 <= 9.0 ~ "Good Sleep",
      TotalMinutesAsleep / 60 < 6.0 ~ "Under Sleep",
      TotalMinutesAsleep / 60 > 9.0 ~ "Over sleep"
    )
  )
head(Sleep_Day)

Sleep_Day <- subset(
  Sleep_Day,
  select = -c(SleepDay, TotalSleepRecords, TotalTimeInBed)
)
head(Sleep_Day)

# Store Id as text in every table. The column itself is converted:
# as.character('Id') would replace every Id with the literal string "Id",
# which makes all participants indistinguishable in later joins.
Activity <- mutate(Activity, Id = as.character(Id))
Hourly_Intensities <- mutate(Hourly_Intensities, Id = as.character(Id))
Hourly_Calories <- mutate(Hourly_Calories, Id = as.character(Id))
Hourly_Steps <- mutate(Hourly_Steps, Id = as.character(Id))
Heartrate_Seconds <- mutate(Heartrate_Seconds, Id = as.character(Id))
Sleep_Day <- mutate(Sleep_Day, Id = as.character(Id))
WeightLogInfo <- mutate(WeightLogInfo, Id = as.character(Id))
summary(Activity)
## Id ActivityDate TotalSteps TotalDistance
## Length:940 Min. :2016-04-12 Min. : 0 Min. : 0.000
## Class :character 1st Qu.:2016-04-19 1st Qu.: 3790 1st Qu.: 2.620
## Mode :character Median :2016-04-26 Median : 7406 Median : 5.245
## Mean :2016-04-26 Mean : 7638 Mean : 5.490
## 3rd Qu.:2016-05-04 3rd Qu.:10727 3rd Qu.: 7.713
## Max. :2016-05-12 Max. :36019 Max. :28.030
## VeryActiveDistance ModeratelyActiveDistance LightActiveDistance
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 1.945
## Median : 0.210 Median :0.2400 Median : 3.365
## Mean : 1.503 Mean :0.5675 Mean : 3.341
## 3rd Qu.: 2.053 3rd Qu.:0.8000 3rd Qu.: 4.782
## Max. :21.920 Max. :6.4800 Max. :10.710
## VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:127.0 1st Qu.: 729.8
## Median : 4.00 Median : 6.00 Median :199.0 Median :1057.5
## Mean : 21.16 Mean : 13.56 Mean :192.8 Mean : 991.2
## 3rd Qu.: 32.00 3rd Qu.: 19.00 3rd Qu.:264.0 3rd Qu.:1229.5
## Max. :210.00 Max. :143.00 Max. :518.0 Max. :1440.0
## Calories Day_of_Week
## Min. : 0 Length:940
## 1st Qu.:1828 Class :character
## Median :2134 Mode :character
## Mean :2304
## 3rd Qu.:2793
## Max. :4900
# ---- Timestamp handling ----
# Each timestamp column is parsed to POSIXct with the "%m/%d/%Y %I:%M:%S %p"
# format and then reduced directly to a Date (or a clock time).
# The detour through a "%m/%d/%y" string followed by as.Date() without a
# format is avoided: as.Date() only tries "%Y-%m-%d" and "%Y/%m/%d" on text,
# so "04/12/16" would be read as year 4 and many other values become NA.

# Hourly_Calories: calendar date of each hourly record.
Hourly_Calories$ActivityHour <- as.POSIXct(
  Hourly_Calories$ActivityHour,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = Sys.timezone()
)
Hourly_Calories$ActivityHour <- as.Date(
  Hourly_Calories$ActivityHour,
  tz = Sys.timezone()
)

# Heartrate_Seconds: calendar date of each reading.
Heartrate_Seconds$Time <- as.POSIXct(
  Heartrate_Seconds$Time,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = Sys.timezone()
)
Heartrate_Seconds$Time <- as.Date(Heartrate_Seconds$Time, tz = Sys.timezone())

# WeightLogInfo: calendar date of each weight log entry.
WeightLogInfo$Date <- as.POSIXct(
  WeightLogInfo$Date,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = Sys.timezone()
)
WeightLogInfo$Date <- as.Date(WeightLogInfo$Date, tz = Sys.timezone())
head(WeightLogInfo)

# Hourly_Intensities: keep the clock time ("HH:MM:SS") of each record.
# A second format() call on the resulting text would not change it, so the
# column is formatted once.
Hourly_Intensities$ActivityHour <- as.POSIXct(
  Hourly_Intensities$ActivityHour,
  format = "%m/%d/%Y %I:%M:%S %p",
  tz = Sys.timezone()
)
Hourly_Intensities$ActivityHour <- format(
  Hourly_Intensities$ActivityHour,
  format = "%H:%M:%S"
)

head(Hourly_Calories)
We can say that the smart devices work very well for tracking total steps taken and calories burned.
# Attach the sleep columns to the activity table, matching rows on Id only.
# merge() pairs every Activity row with every Sleep_Day row that has the same
# Id, so the result holds one row per matching pair, not one row per day.
# NOTE(review): Sleep_Day has no date column left at this point, so a per-day
# match cannot be expressed here - confirm whether the row fan-out is intended.
Activity <- merge(x = Activity, y = Sleep_Day, by = "Id")
names(Activity)
## [1] "Id" "ActivityDate"
## [3] "TotalSteps" "TotalDistance"
## [5] "VeryActiveDistance" "ModeratelyActiveDistance"
## [7] "LightActiveDistance" "VeryActiveMinutes"
## [9] "FairlyActiveMinutes" "LightlyActiveMinutes"
## [11] "SedentaryMinutes" "Calories"
## [13] "Day_of_Week" "TotalMinutesAsleep"
## [15] "Sleep_Amount"
# Bucket each row by the total distance covered that day.
# The summary of Activity printed earlier shows TotalDistance ranging from 0
# to 28.03, so cut-offs of 4500-7500 placed every row in "LowActive".
# The cut-offs are therefore expressed on the column's own scale (4.5 / 6 / 7.5).
# NOTE(review): assumes TotalDistance is in kilometres and the former cut-offs
# were meant as metres - confirm against the data dictionary.
# The ranges are contiguous, so no value falls between two categories (NA).
Activity <- Activity %>%
  mutate(
    DistanceCategory = case_when(
      TotalDistance < 4.5 ~ "LowActive",
      TotalDistance < 6 ~ "AverageActive",
      TotalDistance < 7.5 ~ "MorethanAverageActive",
      TotalDistance >= 7.5 ~ "VeryActive"
    )
  )
# Bar chart of Calories per Sleep_Amount label (geom_col stacks the values),
# drawn as one panel per DistanceCategory.
ggplot(Activity, aes(x = Sleep_Amount, y = Calories, fill = Sleep_Amount)) +
  geom_col() +
  facet_grid(. ~ DistanceCategory) +
  labs(title = "How much Calories burns affecting by Sleep Quality By Distance Categories") +
  theme(axis.text = element_text(angle = 45))
We can see that the smart device works very well for tracking sleep data and calories burned.
# Bar chart of TotalSteps per weekday (geom_col stacks the values),
# drawn as one panel per DistanceCategory.
ggplot(Activity, aes(x = Day_of_Week, y = TotalSteps, fill = Day_of_Week)) +
  geom_col() +
  facet_grid(. ~ DistanceCategory) +
  labs(title = "How much Steps taken on which day of week") +
  theme(axis.text = element_text(angle = 45))
Observations: from the data visualization we can say that fewer total steps are recorded on Sunday, because it is a holiday; on Tuesday, back in the work routine, more steps are taken; and on the other days an average number of steps is taken.
The app should be updated to track all the data that is needed from the smart devices.
So I would suggest that the company work on the app so that it collects this information consistently and accurately,
because the smart devices are linked to the mobile app, which tracks the consumers' information through the devices.
The age of the consumer is not tracked, so I would like age data to be added to the app to get more accurate results.
# Total active minutes per row = very + fairly + lightly active minutes.
# Plain column addition is already element-wise, so rowwise() is not needed;
# dropping it also avoids leaving Activity as a row-wise data frame, which
# would make every later mutate() run one row at a time.
Activity <- Activity %>%
  mutate(
    TotalActiveMinutes =
      VeryActiveMinutes + FairlyActiveMinutes + LightlyActiveMinutes
  )
# Label each sleep record by hours asleep (minutes / 60):
# below 6 h, above 9 h, or within the 6-9 h band.
Sleep_Day <- Sleep_Day %>%
  mutate(
    Sleep_Amount = case_when(
      TotalMinutesAsleep / 60 < 6.0 ~ "Under Sleep",
      TotalMinutesAsleep / 60 > 9.0 ~ "Over sleep",
      TotalMinutesAsleep / 60 >= 6.0 & TotalMinutesAsleep / 60 <= 9.0 ~ "Good Sleep"
    )
  )

# Working copy of the weight log restricted to the columns analysed below.
New_Weight <- select(WeightLogInfo, Id, Date, WeightKg, BMI)
# Make sure Id is stored as text in every table.
# The Id column itself is converted: as.character('Id') would overwrite every
# row with the literal string "Id" and erase the participant identity.
Activity <- mutate(Activity, Id = as.character(Id))
Hourly_Intensities <- mutate(Hourly_Intensities, Id = as.character(Id))
Hourly_Calories <- mutate(Hourly_Calories, Id = as.character(Id))
Hourly_Steps <- mutate(Hourly_Steps, Id = as.character(Id))
Heartrate_Seconds <- mutate(Heartrate_Seconds, Id = as.character(Id))
Sleep_Day <- mutate(Sleep_Day, Id = as.character(Id))
WeightLogInfo <- mutate(WeightLogInfo, Id = as.character(Id))
summary(Activity)
## Id ActivityDate TotalSteps TotalDistance
## Length:385400 Min. :2016-04-12 Min. : 0 Min. : 0.000
## Class :character 1st Qu.:2016-04-19 1st Qu.: 3790 1st Qu.: 2.620
## Mode :character Median :2016-04-26 Median : 7406 Median : 5.245
## Mean :2016-04-26 Mean : 7638 Mean : 5.490
## 3rd Qu.:2016-05-04 3rd Qu.:10727 3rd Qu.: 7.713
## Max. :2016-05-12 Max. :36019 Max. :28.030
## VeryActiveDistance ModeratelyActiveDistance LightActiveDistance
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 1.945
## Median : 0.210 Median :0.2400 Median : 3.365
## Mean : 1.503 Mean :0.5675 Mean : 3.341
## 3rd Qu.: 2.053 3rd Qu.:0.8000 3rd Qu.: 4.782
## Max. :21.920 Max. :6.4800 Max. :10.710
## VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:127.0 1st Qu.: 729.8
## Median : 4.00 Median : 6.00 Median :199.0 Median :1057.5
## Mean : 21.16 Mean : 13.56 Mean :192.8 Mean : 991.2
## 3rd Qu.: 32.00 3rd Qu.: 19.00 3rd Qu.:264.0 3rd Qu.:1229.5
## Max. :210.00 Max. :143.00 Max. :518.0 Max. :1440.0
## Calories Day_of_Week TotalMinutesAsleep Sleep_Amount
## Min. : 0 Length:385400 Min. : 58.0 Length:385400
## 1st Qu.:1828 Class :character 1st Qu.:361.0 Class :character
## Median :2134 Mode :character Median :432.5 Mode :character
## Mean :2304 Mean :419.2
## 3rd Qu.:2793 3rd Qu.:490.0
## Max. :4900 Max. :796.0
## DistanceCategory TotalActiveMinutes
## Length:385400 Min. : 0.0
## Class :character 1st Qu.:146.8
## Mode :character Median :247.0
## Mean :227.5
## 3rd Qu.:317.2
## Max. :552.0
# Label each row by daily step count. case_when() takes the first matching
# branch, so each upper bound alone is enough to delimit the bands:
# <= 4500, <= 9000, <= 12000, and above 12000.
Activity <- Activity %>%
  mutate(
    Steps_Amount = case_when(
      TotalSteps <= 4500 ~ "Less Walker",
      TotalSteps <= 9000 ~ "Good Walker",
      TotalSteps <= 12000 ~ "Better Walker",
      TotalSteps > 12000 ~ "Best Walker"
    )
  )
# Label each weight record by BMI band.
# The bands are contiguous: with bounds such as "<= 24.9" / ">= 25", a BMI of
# 24.95 (or 18.55, 29.95) matched no branch and was left as NA. Values that
# were already classified keep the same label.
New_Weight <- New_Weight %>%
  mutate(
    Weight_Amount = case_when(
      BMI <= 18.5 ~ "UnderWeight",
      BMI < 25 ~ "NormalWeight",
      BMI < 30 ~ "OverWeight",
      BMI >= 30 ~ "Obesity"
    )
  )
# Label each reading by its Value: 81 and above vs. 80 and below.
# NOTE(review): the labels say "B.P." although the values come from the
# heart-rate table - confirm the intended wording.
Heartrate_Seconds <- Heartrate_Seconds %>%
  mutate(
    Heartrate_Amount = case_when(
      Value >= 81 ~ "High B.P.",
      Value <= 80 ~ "Normal B.P."
    )
  )
# Label each row by calories burned. case_when() takes the first matching
# branch, so each upper bound alone delimits the bands:
# <= 1800, <= 2200, <= 2600, and above 2600.
Activity <- Activity %>%
  mutate(
    Burned_Calories = case_when(
      Calories <= 1800 ~ "Low",
      Calories <= 2200 ~ "Medium",
      Calories <= 2600 ~ "High",
      Calories > 2600 ~ "Very High"
    )
  )
# Label each row by sedentary hours (SedentaryMinutes / 60):
# up to 10 h "Good", over 10 h up to 12 h "Bad", over 12 h "Very Bad".
# The earlier first two branches ("> 626" and "< 627 & SedentaryMinutes")
# together matched every non-zero value as "Good", so "Bad" and "Very Bad"
# could never be assigned and 0 minutes was left as NA.
# NOTE(review): the 10 h boundary for "Good" is taken from the lower bound of
# the "Bad" branch - confirm the intended cut-off.
Activity <- Activity %>%
  mutate(
    Sedentary_Time = case_when(
      SedentaryMinutes / 60 <= 10 ~ "Good",
      SedentaryMinutes / 60 <= 12 ~ "Bad",
      SedentaryMinutes / 60 > 12 ~ "Very Bad"
    )
  )
# Second pass over the timestamp columns: each column is converted to POSIXct
# and then rewritten as formatted text.
# NOTE(review): the same columns are already converted earlier in this file;
# confirm whether this repeat is needed, since it runs on the converted values
# rather than on the raw text.

# Hourly_Calories: ActivityHour ends up as "%m/%d/%y" text.
Hourly_Calories$ActivityHour=as.POSIXct(Hourly_Calories$ActivityHour, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
Hourly_Calories$ActivityHour <- format(Hourly_Calories$ActivityHour, format = "%m/%d/%y")
# Heartrate_Seconds: Time ends up as "%m/%d/%y" text.
Heartrate_Seconds$Time=as.POSIXct(Heartrate_Seconds$Time, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
Heartrate_Seconds$Time <- format(Heartrate_Seconds$Time, format = "%m/%d/%y")
# WeightLogInfo: Date ends up as "%m/%d/%y" text.
WeightLogInfo$Date=as.POSIXct(WeightLogInfo$Date, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
WeightLogInfo$Date <- format(WeightLogInfo$Date, format = "%m/%d/%y")
# Hourly_Intensities.
# NOTE(review): if ActivityHour already holds "%H:%M:%S" text at this point, it
# does not match the timestamp format below and as.POSIXct() yields NA.
Hourly_Intensities$ActivityHour=as.POSIXct(Hourly_Intensities$ActivityHour, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
Hourly_Intensities$ActivityHour <- format(Hourly_Intensities$ActivityHour, format = "%H:%M:%S")
# NOTE(review): the input here is already character text, so the `format`
# argument is presumably not applied - TODO confirm this line has any effect.
Hourly_Intensities$ActivityHour <- format(Hourly_Intensities$ActivityHour, format = "%m/%d/%y")
The app should be updated to track all the data that is needed from the smart devices.
So I would suggest that the company work on the app so that it collects this information consistently and accurately,
because the smart devices are linked to the mobile app, which tracks the consumers' information through the devices.
The age of the consumer is not tracked, so I would like age data to be added to the app to get more accurate results.