BELLA BEAT CASE STUDY

CLEANING(PROCESS)

CHECK FOR ANY DUPLICATES AND MISSING VALUES

Notes: Check is done by different simple functions

# Unique-user check: count the distinct participant Ids in each table.
n_distinct(daily_activity$Id)
## [1] 33
n_distinct(daily_sleep$Id)
## [1] 24
# Count Ids, not whole rows: calling n_distinct() on the full weight_log table
# returns the number of distinct rows (67), not the number of users.
n_distinct(weight_log$Id)
## (re-run to record the user count; the earlier "## [1] 67" was the distinct-row count)
# Missing-value check: total number of NA cells in each table.
daily_activity %>% is.na() %>% sum()
## [1] 0
daily_sleep %>% is.na() %>% sum()
## [1] 0
weight_log %>% is.na() %>% sum() # the 65 NAs are in the Fat column
## [1] 65
# Duplicate check: number of rows that repeat an earlier row in each table.
daily_activity %>% duplicated() %>% sum()
## [1] 0
daily_sleep %>% duplicated() %>% sum()
## [1] 3
weight_log %>% duplicated() %>% sum()
## [1] 0
#Conclusion: daily_activity and daily_sleep have no missing values; weight_log has 65 NAs (in the Fat column). Only the sleep data set contains duplicated rows (3).

REMOVE DUPLICATES

Notes: We also remove NAs from the sleep dataset as a precautionary measure. In the weight dataset we removed the Fat column owing to the huge number of NAs in that column.

# Dataset 2 (sleep): keep one copy of each row, then discard rows with any NA.
daily_sleep_clean <- drop_na(distinct(daily_sleep))
# Row counts after cleaning (n_distinct() on a whole table counts distinct rows).
n_distinct(daily_sleep_clean)
## [1] 410
n_distinct(daily_activity)
## [1] 940
# dataset 2 cleaned

# Dataset 3 (weight): de-duplicate rows and drop the Fat column, which holds
# the table's NA values.
weight_log_clean <- weight_log %>%
  distinct() %>%
  select(-Fat)
# Open the data viewer only in an interactive session; View() can fail when the
# script is run or knitted without a display.
if (interactive()) {
  View(weight_log_clean)
}
# dataset 3 cleaned

#Cleaning dataset 1
# NOTE(review): the result of clean_names() is only printed, not assigned, so
# daily_activity keeps its original column names (later steps still reference
# ActivityDate, TotalSteps, ...). Assigning it would require renaming those
# later references to snake_case as well.
clean_names(daily_activity)
## # A tibble: 940 x 15
##            id activity_date total_steps total_distance tracker_distance
##         <dbl> <chr>               <dbl>          <dbl>            <dbl>
##  1 1503960366 4/12/2016           13162           8.5              8.5 
##  2 1503960366 4/13/2016           10735           6.97             6.97
##  3 1503960366 4/14/2016           10460           6.74             6.74
##  4 1503960366 4/15/2016            9762           6.28             6.28
##  5 1503960366 4/16/2016           12669           8.16             8.16
##  6 1503960366 4/17/2016            9705           6.48             6.48
##  7 1503960366 4/18/2016           13019           8.59             8.59
##  8 1503960366 4/19/2016           15506           9.88             9.88
##  9 1503960366 4/20/2016           10544           6.68             6.68
## 10 1503960366 4/21/2016            9819           6.34             6.34
## # ... with 930 more rows, and 10 more variables:
## #   logged_activities_distance <dbl>, very_active_distance <dbl>,
## #   moderately_active_distance <dbl>, light_active_distance <dbl>,
## #   sedentary_active_distance <dbl>, very_active_minutes <dbl>,
## #   fairly_active_minutes <dbl>, lightly_active_minutes <dbl>,
## #   sedentary_minutes <dbl>, calories <dbl>
# NOTE(review): printed only, not assigned; daily_sleep_clean keeps its
# original column names (later steps still reference SleepDay,
# TotalMinutesAsleep, ...).
clean_names(daily_sleep_clean)
## # A tibble: 410 x 5
##            id sleep_day     total_sleep_reco~ total_minutes_as~ total_time_in_b~
##         <dbl> <chr>                     <dbl>             <dbl>            <dbl>
##  1 1503960366 4/12/2016 12~                 1               327              346
##  2 1503960366 4/13/2016 12~                 2               384              407
##  3 1503960366 4/15/2016 12~                 1               412              442
##  4 1503960366 4/16/2016 12~                 2               340              367
##  5 1503960366 4/17/2016 12~                 1               700              712
##  6 1503960366 4/19/2016 12~                 1               304              320
##  7 1503960366 4/20/2016 12~                 1               360              377
##  8 1503960366 4/21/2016 12~                 1               325              364
##  9 1503960366 4/23/2016 12~                 1               361              384
## 10 1503960366 4/24/2016 12~                 1               430              449
## # ... with 400 more rows

Change the date format

Notes: Change the datatype of the date column, convert its format to yyyy-mm-dd, and rename it “Date”

# Dataset 1: rename ActivityDate to Date, then parse the m/d/Y strings as dates.
daily_activity <- rename(daily_activity, Date = ActivityDate)
daily_activity <- mutate(
  daily_activity,
  Date = as_date(Date, format = "%m/%d/%Y")
)

# Dataset 2: rename SleepDay to Date and parse the 12-hour date-time strings,
# keeping only the date. as_date() ignores `tz` (and warns when it is
# supplied), so the argument is omitted.
daily_sleep_clean <- daily_sleep_clean %>%
  rename(Date = SleepDay) %>%
  mutate(Date = as_date(Date, format = "%m/%d/%Y %I:%M:%S %p"))
# Dataset 3: the column is already named Date; only the type is converted.
weight_log_clean <- weight_log_clean %>%
  mutate(Date = as_date(Date, format = "%m/%d/%Y %I:%M:%S %p"))
# Confirmation: show the first six rows; Date should now be a <date> column.
head(daily_activity, n = 6L)
## # A tibble: 6 x 15
##        Id Date       TotalSteps TotalDistance TrackerDistance LoggedActivitiesD~
##     <dbl> <date>          <dbl>         <dbl>           <dbl>              <dbl>
## 1  1.50e9 2016-04-12      13162          8.5             8.5                   0
## 2  1.50e9 2016-04-13      10735          6.97            6.97                  0
## 3  1.50e9 2016-04-14      10460          6.74            6.74                  0
## 4  1.50e9 2016-04-15       9762          6.28            6.28                  0
## 5  1.50e9 2016-04-16      12669          8.16            8.16                  0
## 6  1.50e9 2016-04-17       9705          6.48            6.48                  0
## # ... with 9 more variables: VeryActiveDistance <dbl>,
## #   ModeratelyActiveDistance <dbl>, LightActiveDistance <dbl>,
## #   SedentaryActiveDistance <dbl>, VeryActiveMinutes <dbl>,
## #   FairlyActiveMinutes <dbl>, LightlyActiveMinutes <dbl>,
## #   SedentaryMinutes <dbl>, Calories <dbl>
# First six rows of the cleaned sleep table.
head(daily_sleep_clean, n = 6L)
## # A tibble: 6 x 5
##           Id Date       TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
##        <dbl> <date>                 <dbl>              <dbl>          <dbl>
## 1 1503960366 2016-04-12                 1                327            346
## 2 1503960366 2016-04-13                 2                384            407
## 3 1503960366 2016-04-15                 1                412            442
## 4 1503960366 2016-04-16                 2                340            367
## 5 1503960366 2016-04-17                 1                700            712
## 6 1503960366 2016-04-19                 1                304            320
# First six rows of the cleaned weight table.
head(weight_log_clean, n = 6L)
## # A tibble: 6 x 7
##           Id Date       WeightKg WeightPounds   BMI IsManualReport         LogId
##        <dbl> <date>        <dbl>        <dbl> <dbl> <lgl>                  <dbl>
## 1 1503960366 2016-05-02     52.6         116.  22.6 TRUE           1462233599000
## 2 1503960366 2016-05-03     52.6         116.  22.6 TRUE           1462319999000
## 3 1927972279 2016-04-13    134.          294.  47.5 FALSE          1460509732000
## 4 2873212765 2016-04-21     56.7         125.  21.5 TRUE           1461283199000
## 5 2873212765 2016-05-12     57.3         126.  21.7 TRUE           1463097599000
## 6 4319703577 2016-04-17     72.4         160.  27.5 TRUE           1460937599000

ANALYZING OUR DATA

Daily activity data

Notes:We Get summaries of our data

# Summary statistics for the step, distance, activity-minute and calorie columns.
summary(
  select(
    daily_activity,
    TotalSteps, TotalDistance, SedentaryMinutes, LightlyActiveMinutes,
    FairlyActiveMinutes, VeryActiveMinutes, Calories
  )
)
##    TotalSteps    TotalDistance    SedentaryMinutes LightlyActiveMinutes
##  Min.   :    0   Min.   : 0.000   Min.   :   0.0   Min.   :  0.0       
##  1st Qu.: 3790   1st Qu.: 2.620   1st Qu.: 729.8   1st Qu.:127.0       
##  Median : 7406   Median : 5.245   Median :1057.5   Median :199.0       
##  Mean   : 7638   Mean   : 5.490   Mean   : 991.2   Mean   :192.8       
##  3rd Qu.:10727   3rd Qu.: 7.713   3rd Qu.:1229.5   3rd Qu.:264.0       
##  Max.   :36019   Max.   :28.030   Max.   :1440.0   Max.   :518.0       
##  FairlyActiveMinutes VeryActiveMinutes    Calories   
##  Min.   :  0.00      Min.   :  0.00    Min.   :   0  
##  1st Qu.:  0.00      1st Qu.:  0.00    1st Qu.:1828  
##  Median :  6.00      Median :  4.00    Median :2134  
##  Mean   : 13.56      Mean   : 21.16    Mean   :2304  
##  3rd Qu.: 19.00      3rd Qu.: 32.00    3rd Qu.:2793  
##  Max.   :143.00      Max.   :210.00    Max.   :4900
#NOTABLE STATISTICS
#average daily steps-7,638 < recommended 10,000(bad)
#average distance covered daily-5.49 miles
#Sedentary Minutes - 991.2 (about 16.5 hours, roughly 2/3 of a day) - can cause health problems
#Lightly Active Minutes - 192.8
#Fairly Active Minutes - 13.56
#Very Active Minutes - 21.16 > recommended 9.7 mins(good)
#Calories - 2304-(average person burns 1800 a day...to lose weight we need 3500 a day)
#On track to lose weight but generally slower

Daily sleep data

# Summary statistics for the three sleep measures.
summary(
  select(daily_sleep_clean, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed)
)
##  TotalSleepRecords TotalMinutesAsleep TotalTimeInBed 
##  Min.   :1.00      Min.   : 58.0      Min.   : 61.0  
##  1st Qu.:1.00      1st Qu.:361.0      1st Qu.:403.8  
##  Median :1.00      Median :432.5      Median :463.0  
##  Mean   :1.12      Mean   :419.2      Mean   :458.5  
##  3rd Qu.:1.00      3rd Qu.:490.0      3rd Qu.:526.0  
##  Max.   :3.00      Max.   :796.0      Max.   :961.0
#NOTABLE STATISTICS
#AVG Total Minutes Asleep - 419.2(6.986667 hours)(good-recommended 6-8 hrs)
#AVG Total Time In Bed - 458.5(7.641667hours)
#According to Health Central, people should not spend more than 1 hour in bed awake.
#This is to prevent a mental link being formed between being awake and being in bed, which can lead to insomnia.

Weight dataset

# Summary statistics for weight (kg) and BMI.
summary(select(weight_log_clean, WeightKg, BMI))
##     WeightKg           BMI       
##  Min.   : 52.60   Min.   :21.45  
##  1st Qu.: 61.40   1st Qu.:23.96  
##  Median : 62.50   Median :24.39  
##  Mean   : 72.04   Mean   :25.19  
##  3rd Qu.: 85.05   3rd Qu.:25.56  
##  Max.   :133.50   Max.   :47.54
#NOTABLE STATISTICS
#AVG BMI - 25.19(overweight)...note that BMI does not directly indicate body fatness

PLOTS (Share phase)

DAILY ACTIVITY DATASET

Total steps vs sedentary minutes

Notes: there is a negative linear relation

# Total steps vs sedentary minutes: summarise every column before plotting.
daily_activity %>% summary()
##        Id                 Date              TotalSteps    TotalDistance   
##  Min.   :1.504e+09   Min.   :2016-04-12   Min.   :    0   Min.   : 0.000  
##  1st Qu.:2.320e+09   1st Qu.:2016-04-19   1st Qu.: 3790   1st Qu.: 2.620  
##  Median :4.445e+09   Median :2016-04-26   Median : 7406   Median : 5.245  
##  Mean   :4.855e+09   Mean   :2016-04-26   Mean   : 7638   Mean   : 5.490  
##  3rd Qu.:6.962e+09   3rd Qu.:2016-05-04   3rd Qu.:10727   3rd Qu.: 7.713  
##  Max.   :8.878e+09   Max.   :2016-05-12   Max.   :36019   Max.   :28.030  
##  TrackerDistance  LoggedActivitiesDistance VeryActiveDistance
##  Min.   : 0.000   Min.   :0.0000           Min.   : 0.000    
##  1st Qu.: 2.620   1st Qu.:0.0000           1st Qu.: 0.000    
##  Median : 5.245   Median :0.0000           Median : 0.210    
##  Mean   : 5.475   Mean   :0.1082           Mean   : 1.503    
##  3rd Qu.: 7.710   3rd Qu.:0.0000           3rd Qu.: 2.053    
##  Max.   :28.030   Max.   :4.9421           Max.   :21.920    
##  ModeratelyActiveDistance LightActiveDistance SedentaryActiveDistance
##  Min.   :0.0000           Min.   : 0.000      Min.   :0.000000       
##  1st Qu.:0.0000           1st Qu.: 1.945      1st Qu.:0.000000       
##  Median :0.2400           Median : 3.365      Median :0.000000       
##  Mean   :0.5675           Mean   : 3.341      Mean   :0.001606       
##  3rd Qu.:0.8000           3rd Qu.: 4.782      3rd Qu.:0.000000       
##  Max.   :6.4800           Max.   :10.710      Max.   :0.110000       
##  VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
##  Min.   :  0.00    Min.   :  0.00      Min.   :  0.0        Min.   :   0.0  
##  1st Qu.:  0.00    1st Qu.:  0.00      1st Qu.:127.0        1st Qu.: 729.8  
##  Median :  4.00    Median :  6.00      Median :199.0        Median :1057.5  
##  Mean   : 21.16    Mean   : 13.56      Mean   :192.8        Mean   : 991.2  
##  3rd Qu.: 32.00    3rd Qu.: 19.00      3rd Qu.:264.0        3rd Qu.:1229.5  
##  Max.   :210.00    Max.   :143.00      Max.   :518.0        Max.   :1440.0  
##     Calories   
##  Min.   :   0  
##  1st Qu.:1828  
##  Median :2134  
##  Mean   :2304  
##  3rd Qu.:2793  
##  Max.   :4900
# Scatter plot of total steps against sedentary minutes with a linear fit.
# The colour mapping is attached to geom_point() only: a continuous colour
# inherited by geom_smooth() cannot be drawn on the single fitted line and is
# dropped (newer ggplot2 releases warn about this).
ggplot(daily_activity, aes(x = TotalSteps, y = SedentaryMinutes)) +
  geom_point(aes(color = Calories)) +
  geom_smooth(method = lm) +
  labs(title = "Total steps vs sedentary minutes") +
  annotate("text", x = 22000, y = 350, label = "negative correlation",
           color = "red", fontface = "bold")
## `geom_smooth()` using formula 'y ~ x'

Calories vs very active minutes

Notes: positive linear relation

# Scatter plot of very active minutes against calories with a linear fit.
# The colour mapping is attached to geom_point() only: a continuous colour
# inherited by geom_smooth() cannot be drawn on the single fitted line and is
# dropped (newer ggplot2 releases warn about this).
ggplot(daily_activity, aes(x = VeryActiveMinutes, y = Calories)) +
  geom_point(aes(color = Calories)) +
  geom_smooth(method = lm) +
  labs(title = "Very active minutes vs Calories") +
  annotate("text", x = 120, y = 1000, label = "positive correlation",
           color = "green", fontface = "bold")
## `geom_smooth()` using formula 'y ~ x'

Steps vs Calories

Notes: Positive Linear Relation

# Steps vs Calories: scatter plot with a linear fit.
# The colour mapping is attached to geom_point() only: a continuous colour
# inherited by geom_smooth() cannot be drawn on the single fitted line and is
# dropped (newer ggplot2 releases warn about this).
ggplot(daily_activity, aes(x = TotalSteps, y = Calories)) +
  geom_point(aes(color = Calories)) +
  geom_smooth(method = lm) +
  labs(title = "Steps vs Calories") +
  annotate("text", x = 30000, y = 1000, label = "positive correlation",
           color = "green", fontface = "bold")
## `geom_smooth()` using formula 'y ~ x'

DAILY SLEEP

Total Minutes Asleep vs TotalTime In Bed

Notes: Positive Linear Relation

# Scatter plot of minutes asleep against time in bed, with a linear fit.
ggplot(
  data = daily_sleep_clean,
  mapping = aes(x = TotalMinutesAsleep, y = TotalTimeInBed)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Total Minutes Asleep vs TotalTime In Bed") +
  annotate(
    "text",
    x = 500, y = 230,
    label = "positive correlation",
    color = "green", fontface = "bold"
  )
## `geom_smooth()` using formula 'y ~ x'

WEIGHT LOG

Weight vs BMI

Notes: Positive Linear Relation

# Scatter plot of weight (kg) against BMI, with a linear fit.
ggplot(
  data = weight_log_clean,
  mapping = aes(x = WeightKg, y = BMI)
) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Weight vs BMI") +
  annotate(
    "text",
    x = 70, y = 38,
    label = "positive correlation",
    color = "green", fontface = "bold"
  )
## `geom_smooth()` using formula 'y ~ x'

RECOMMENDATIONS FOR APP

  • Encourage Users to try and make 10,000 steps per day so as to improve their health and get interactive with Leaf
  • Since the weight vs BMI graph had few entries, encourage customers to enter their weight and height so as to make the process seamless
  • Send users alerts and notifications of their remaining time on intense activities so as to stimulate the use of the app while promoting health
  • Send notifications that the users have surpassed one hour on the bed after waking up
  • Since most users' daily calorie burn is only around average, introduce a calorie tracker that shows calories consumed and used per day while sending notifications
  • Set notifications to users showing them how sedentary they have been and send recommendations for healthier activities
  • Introduction of a friend feature to enable users to track each other's data (with consent)

RECOMMENDATIONS FOR MEMBERSHIP

  • Reduced offers and not charging for premium membership for a short period of time to enable users to fully experience BellaBeat
  • Encourage referrals at a reduced price
  • Offer discounts for product with membership

CITATIONS

Gornall, Lucy. “How to Lose Weight: How Many Calories Should i Eat to Lose Weight?” GoodtoKnow, 12 Aug. 2020 https://www.goodto.com/wellbeing/diets-exercise/what-is-calorie-how-many-lose-weigt-425557

Grey, Heather. “Heart Rates Can Vary by 70 Bpm: What That Means for Your Health.” Healthline, Healthline Media, 9 Feb. 2020, https://www.healthline.com/health-news/what-your-heart-rate-says-about-your-health

“CDC — How Much Sleep Do I Need? — Sleep and Sleep Disorders.” Centers for Disease Control and Prevention, Centers for Disease Control and Prevention, 2 Mar. 2017, https://www.cdc.gov/sleep/about_sleep/how_much_sleep.html.

Reed, Martin. “Spend Less Time In Bed If You Want More Sleep.” Healthcentral.com, 7 May 2017, https://www.healthcentral.com/article/spend-less-time-in-bed-if-you-want-more-sleep