Notes: The following packages were used (a short description is given next to each)
library(tidyverse)#For data cleaning and organizing
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)#For plotting
library(knitr)#For use with rmarkdown in presentation
library(lubridate)#For date use and conversion
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)#For importing data
library(dplyr)#For making manipulations
library(rmarkdown)#For creating a sharable document
library(janitor)#For easing data cleaning
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)#For easy summary
Notes: Three data sets were imported for this analysis
#initially only the two data sets that covered 24 hours were used
#the weight data set was added to give context
# Import the daily activity export into a tibble.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
daily_activity <- read_csv(
  file = "C:\\Users\\Lusui\\OneDrive - CM Advocates, LLP\\Documents\\R\\Fitabase Data 4.12.16-5.12.16\\dailyActivity_merged.csv"
)
## Rows: 940 Columns: 15
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): ActivityDate
## dbl (14): Id, TotalSteps, TotalDistance, TrackerDistance, LoggedActivitiesDi...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the daily sleep export into a tibble.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
daily_sleep <- read_csv(
  file = "C:\\Users\\Lusui\\OneDrive - CM Advocates, LLP\\Documents\\R\\Fitabase Data 4.12.16-5.12.16\\sleepDay_merged.csv"
)
## Rows: 413 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): SleepDay
## dbl (4): Id, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the weight log export into a tibble.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
weight_log <- read_csv(
  file = "C:\\Users\\Lusui\\OneDrive - CM Advocates, LLP\\Documents\\R\\Fitabase Data 4.12.16-5.12.16\\weightLogInfo_merged.csv"
)
## Rows: 67 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): Date
## dbl (6): Id, WeightKg, WeightPounds, Fat, BMI, LogId
## lgl (1): IsManualReport
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Notes: Preview just the first 6 rows of each data set, then look at its structure
# Preview the first six rows of the activity data.
head(daily_activity, n = 6)
## # A tibble: 6 x 15
## Id ActivityDate TotalSteps TotalDistance TrackerDistance LoggedActivitie~
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1.50e9 4/12/2016 13162 8.5 8.5 0
## 2 1.50e9 4/13/2016 10735 6.97 6.97 0
## 3 1.50e9 4/14/2016 10460 6.74 6.74 0
## 4 1.50e9 4/15/2016 9762 6.28 6.28 0
## 5 1.50e9 4/16/2016 12669 8.16 8.16 0
## 6 1.50e9 4/17/2016 9705 6.48 6.48 0
## # ... with 9 more variables: VeryActiveDistance <dbl>,
## # ModeratelyActiveDistance <dbl>, LightActiveDistance <dbl>,
## # SedentaryActiveDistance <dbl>, VeryActiveMinutes <dbl>,
## # FairlyActiveMinutes <dbl>, LightlyActiveMinutes <dbl>,
## # SedentaryMinutes <dbl>, Calories <dbl>
# Preview the first six rows of the sleep data.
head(daily_sleep, n = 6)
## # A tibble: 6 x 5
## Id SleepDay TotalSleepRecor~ TotalMinutesAsle~ TotalTimeInBed
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1503960366 4/12/2016 12:00:~ 1 327 346
## 2 1503960366 4/13/2016 12:00:~ 2 384 407
## 3 1503960366 4/15/2016 12:00:~ 1 412 442
## 4 1503960366 4/16/2016 12:00:~ 2 340 367
## 5 1503960366 4/17/2016 12:00:~ 1 700 712
## 6 1503960366 4/19/2016 12:00:~ 1 304 320
# Preview the first six rows of the weight log.
head(weight_log, n = 6)
## # A tibble: 6 x 8
## Id Date WeightKg WeightPounds Fat BMI IsManualReport LogId
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 1503960366 5/2/2016~ 52.6 116. 22 22.6 TRUE 1.46e12
## 2 1503960366 5/3/2016~ 52.6 116. NA 22.6 TRUE 1.46e12
## 3 1927972279 4/13/201~ 134. 294. NA 47.5 FALSE 1.46e12
## 4 2873212765 4/21/201~ 56.7 125. NA 21.5 TRUE 1.46e12
## 5 2873212765 5/12/201~ 57.3 126. NA 21.7 TRUE 1.46e12
## 6 4319703577 4/17/201~ 72.4 160. 25 27.5 TRUE 1.46e12
#Structure
# Show each column's type and first values for the activity data.
str(daily_activity)
## spec_tbl_df [940 x 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:940] 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate : chr [1:940] "4/12/2016" "4/13/2016" "4/14/2016" "4/15/2016" ...
## $ TotalSteps : num [1:940] 13162 10735 10460 9762 12669 ...
## $ TotalDistance : num [1:940] 8.5 6.97 6.74 6.28 8.16 ...
## $ TrackerDistance : num [1:940] 8.5 6.97 6.74 6.28 8.16 ...
## $ LoggedActivitiesDistance: num [1:940] 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num [1:940] 1.88 1.57 2.44 2.14 2.71 ...
## $ ModeratelyActiveDistance: num [1:940] 0.55 0.69 0.4 1.26 0.41 ...
## $ LightActiveDistance : num [1:940] 6.06 4.71 3.91 2.83 5.04 ...
## $ SedentaryActiveDistance : num [1:940] 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : num [1:940] 25 21 30 29 36 38 42 50 28 19 ...
## $ FairlyActiveMinutes : num [1:940] 13 19 11 34 10 20 16 31 12 8 ...
## $ LightlyActiveMinutes : num [1:940] 328 217 181 209 221 164 233 264 205 211 ...
## $ SedentaryMinutes : num [1:940] 728 776 1218 726 773 ...
## $ Calories : num [1:940] 1985 1797 1776 1745 1863 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. ActivityDate = col_character(),
## .. TotalSteps = col_double(),
## .. TotalDistance = col_double(),
## .. TrackerDistance = col_double(),
## .. LoggedActivitiesDistance = col_double(),
## .. VeryActiveDistance = col_double(),
## .. ModeratelyActiveDistance = col_double(),
## .. LightActiveDistance = col_double(),
## .. SedentaryActiveDistance = col_double(),
## .. VeryActiveMinutes = col_double(),
## .. FairlyActiveMinutes = col_double(),
## .. LightlyActiveMinutes = col_double(),
## .. SedentaryMinutes = col_double(),
## .. Calories = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Show each column's type and first values for the sleep data.
str(daily_sleep)
## spec_tbl_df [413 x 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:413] 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ SleepDay : chr [1:413] "4/12/2016 12:00:00 AM" "4/13/2016 12:00:00 AM" "4/15/2016 12:00:00 AM" "4/16/2016 12:00:00 AM" ...
## $ TotalSleepRecords : num [1:413] 1 2 1 2 1 1 1 1 1 1 ...
## $ TotalMinutesAsleep: num [1:413] 327 384 412 340 700 304 360 325 361 430 ...
## $ TotalTimeInBed : num [1:413] 346 407 442 367 712 320 377 364 384 449 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. SleepDay = col_character(),
## .. TotalSleepRecords = col_double(),
## .. TotalMinutesAsleep = col_double(),
## .. TotalTimeInBed = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Show each column's type and first values for the weight log.
str(weight_log)
## spec_tbl_df [67 x 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:67] 1.50e+09 1.50e+09 1.93e+09 2.87e+09 2.87e+09 ...
## $ Date : chr [1:67] "5/2/2016 11:59:59 PM" "5/3/2016 11:59:59 PM" "4/13/2016 1:08:52 AM" "4/21/2016 11:59:59 PM" ...
## $ WeightKg : num [1:67] 52.6 52.6 133.5 56.7 57.3 ...
## $ WeightPounds : num [1:67] 116 116 294 125 126 ...
## $ Fat : num [1:67] 22 NA NA NA NA 25 NA NA NA NA ...
## $ BMI : num [1:67] 22.6 22.6 47.5 21.5 21.7 ...
## $ IsManualReport: logi [1:67] TRUE TRUE FALSE TRUE TRUE TRUE ...
## $ LogId : num [1:67] 1.46e+12 1.46e+12 1.46e+12 1.46e+12 1.46e+12 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. Date = col_character(),
## .. WeightKg = col_double(),
## .. WeightPounds = col_double(),
## .. Fat = col_double(),
## .. BMI = col_double(),
## .. IsManualReport = col_logical(),
## .. LogId = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Notes: The checks are done with a few simple functions
#unique users check
n_distinct(daily_activity$Id)
## [1] 33
n_distinct(daily_sleep$Id)
## [1] 24
# Count distinct users via the Id column. Calling n_distinct() on the whole
# tibble counts distinct rows instead, which is why it returned 67 (the row
# count) rather than the number of users.
n_distinct(weight_log$Id)
## (output not re-run: the 67 printed previously was the number of rows, not users)
# Check for missing values: total number of NA cells in each data set.
sum(is.na(daily_activity))
## [1] 0
sum(is.na(daily_sleep))
## [1] 0
sum(is.na(weight_log))#65 NAs, attributed by the author to the Fat column
## [1] 65
# Check for duplicates: number of rows that exactly repeat an earlier row.
sum(duplicated(daily_activity))
## [1] 0
sum(duplicated(daily_sleep))
## [1] 3
sum(duplicated(weight_log))
## [1] 0
#Conclusion: the activity and sleep data sets have no missing values, the weight data set has 65 NAs (in the Fat column), and the sleep data set contains 3 duplicated rows
Notes: We also remove NAs as a precautionary measure. On the weight data set we removed the Fat column owing to the huge number of NAs in this column
# Clean the sleep data: drop exact duplicate rows first, then drop any row
# that still contains an NA.
daily_sleep_clean <- drop_na(distinct(daily_sleep))
# n_distinct() on a whole tibble counts distinct rows, so these are row counts:
# 410 sleep rows remain (413 read minus the 3 duplicates) and 940 activity rows.
n_distinct(daily_sleep_clean)
## [1] 410
n_distinct(daily_activity)
## [1] 940
#dataset 2 cleaned
# Clean the weight log: drop exact duplicate rows, then remove the Fat column
# (65 of its values are NA according to the check above).
weight_log_clean <- weight_log %>%
  distinct() %>%
  select(-Fat)
# View() opens an interactive data viewer, so only call it when a user is
# actually at the console (it is skipped when the script is run in batch).
if (interactive()) {
  View(weight_log_clean)
}
#dataset 3 cleaned
#Cleaning dataset 1
# NOTE(review): the snake_case result of clean_names() is only printed, not
# assigned, so daily_activity keeps its original column names (the later code
# still refers to ActivityDate, TotalSteps, ...).
clean_names(daily_activity)
## # A tibble: 940 x 15
## id activity_date total_steps total_distance tracker_distance
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1503960366 4/12/2016 13162 8.5 8.5
## 2 1503960366 4/13/2016 10735 6.97 6.97
## 3 1503960366 4/14/2016 10460 6.74 6.74
## 4 1503960366 4/15/2016 9762 6.28 6.28
## 5 1503960366 4/16/2016 12669 8.16 8.16
## 6 1503960366 4/17/2016 9705 6.48 6.48
## 7 1503960366 4/18/2016 13019 8.59 8.59
## 8 1503960366 4/19/2016 15506 9.88 9.88
## 9 1503960366 4/20/2016 10544 6.68 6.68
## 10 1503960366 4/21/2016 9819 6.34 6.34
## # ... with 930 more rows, and 10 more variables:
## # logged_activities_distance <dbl>, very_active_distance <dbl>,
## # moderately_active_distance <dbl>, light_active_distance <dbl>,
## # sedentary_active_distance <dbl>, very_active_minutes <dbl>,
## # fairly_active_minutes <dbl>, lightly_active_minutes <dbl>,
## # sedentary_minutes <dbl>, calories <dbl>
# NOTE(review): the snake_case result of clean_names() is only printed, not
# assigned, so daily_sleep_clean keeps its original column names (the later
# code still refers to SleepDay, TotalMinutesAsleep, ...).
clean_names(daily_sleep_clean)
## # A tibble: 410 x 5
## id sleep_day total_sleep_reco~ total_minutes_as~ total_time_in_b~
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1503960366 4/12/2016 12~ 1 327 346
## 2 1503960366 4/13/2016 12~ 2 384 407
## 3 1503960366 4/15/2016 12~ 1 412 442
## 4 1503960366 4/16/2016 12~ 2 340 367
## 5 1503960366 4/17/2016 12~ 1 700 712
## 6 1503960366 4/19/2016 12~ 1 304 320
## 7 1503960366 4/20/2016 12~ 1 360 377
## 8 1503960366 4/21/2016 12~ 1 325 364
## 9 1503960366 4/23/2016 12~ 1 361 384
## 10 1503960366 4/24/2016 12~ 1 430 449
## # ... with 400 more rows
Notes: Change the data type of the date column, convert its format to yyyy-mm-dd and rename it “Date”
# dataset 1: rename ActivityDate to Date, then parse the month/day/year text
# into a Date value.
daily_activity <- mutate(
  rename(daily_activity, Date = ActivityDate),
  Date = as_date(Date, format = "%m/%d/%Y")
)
#dataset 2
# Rename SleepDay to Date and parse the "m/d/Y h:m:s AM/PM" text into a Date.
# The former `tz = Sys.timezone()` argument is dropped: as_date() ignores it
# and only emitted "`tz` argument is ignored by `as_date()`".
daily_sleep_clean <- daily_sleep_clean %>%
  rename(Date = SleepDay) %>%
  mutate(Date = as_date(Date, format = "%m/%d/%Y %I:%M:%S %p"))
#dataset 3
# Parse the "m/d/Y h:m:s AM/PM" text in Date into a Date value.
# The former `tz = Sys.timezone()` argument is dropped: as_date() ignores it
# and only emitted "`tz` argument is ignored by `as_date()`".
weight_log_clean <- weight_log_clean %>%
  mutate(Date = as_date(Date, format = "%m/%d/%Y %I:%M:%S %p"))
#confirmation
# Preview the first six rows to confirm the Date column of the activity data.
head(daily_activity, n = 6)
## # A tibble: 6 x 15
## Id Date TotalSteps TotalDistance TrackerDistance LoggedActivitiesD~
## <dbl> <date> <dbl> <dbl> <dbl> <dbl>
## 1 1.50e9 2016-04-12 13162 8.5 8.5 0
## 2 1.50e9 2016-04-13 10735 6.97 6.97 0
## 3 1.50e9 2016-04-14 10460 6.74 6.74 0
## 4 1.50e9 2016-04-15 9762 6.28 6.28 0
## 5 1.50e9 2016-04-16 12669 8.16 8.16 0
## 6 1.50e9 2016-04-17 9705 6.48 6.48 0
## # ... with 9 more variables: VeryActiveDistance <dbl>,
## # ModeratelyActiveDistance <dbl>, LightActiveDistance <dbl>,
## # SedentaryActiveDistance <dbl>, VeryActiveMinutes <dbl>,
## # FairlyActiveMinutes <dbl>, LightlyActiveMinutes <dbl>,
## # SedentaryMinutes <dbl>, Calories <dbl>
# Preview the first six rows to confirm the Date column of the sleep data.
head(daily_sleep_clean, n = 6)
## # A tibble: 6 x 5
## Id Date TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## <dbl> <date> <dbl> <dbl> <dbl>
## 1 1503960366 2016-04-12 1 327 346
## 2 1503960366 2016-04-13 2 384 407
## 3 1503960366 2016-04-15 1 412 442
## 4 1503960366 2016-04-16 2 340 367
## 5 1503960366 2016-04-17 1 700 712
## 6 1503960366 2016-04-19 1 304 320
# Preview the first six rows to confirm the Date column of the weight log.
head(weight_log_clean, n = 6)
## # A tibble: 6 x 7
## Id Date WeightKg WeightPounds BMI IsManualReport LogId
## <dbl> <date> <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 1503960366 2016-05-02 52.6 116. 22.6 TRUE 1462233599000
## 2 1503960366 2016-05-03 52.6 116. 22.6 TRUE 1462319999000
## 3 1927972279 2016-04-13 134. 294. 47.5 FALSE 1460509732000
## 4 2873212765 2016-04-21 56.7 125. 21.5 TRUE 1461283199000
## 5 2873212765 2016-05-12 57.3 126. 21.7 TRUE 1463097599000
## 6 4319703577 2016-04-17 72.4 160. 27.5 TRUE 1460937599000
Notes: We get summaries of our data
# Summary statistics (min, quartiles, median, mean, max) for the main
# activity, intensity and calorie columns.
summary(
  select(
    daily_activity,
    TotalSteps, TotalDistance, SedentaryMinutes, LightlyActiveMinutes,
    FairlyActiveMinutes, VeryActiveMinutes, Calories
  )
)
## TotalSteps TotalDistance SedentaryMinutes LightlyActiveMinutes
## Min. : 0 Min. : 0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.: 3790 1st Qu.: 2.620 1st Qu.: 729.8 1st Qu.:127.0
## Median : 7406 Median : 5.245 Median :1057.5 Median :199.0
## Mean : 7638 Mean : 5.490 Mean : 991.2 Mean :192.8
## 3rd Qu.:10727 3rd Qu.: 7.713 3rd Qu.:1229.5 3rd Qu.:264.0
## Max. :36019 Max. :28.030 Max. :1440.0 Max. :518.0
## FairlyActiveMinutes VeryActiveMinutes Calories
## Min. : 0.00 Min. : 0.00 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:1828
## Median : 6.00 Median : 4.00 Median :2134
## Mean : 13.56 Mean : 21.16 Mean :2304
## 3rd Qu.: 19.00 3rd Qu.: 32.00 3rd Qu.:2793
## Max. :143.00 Max. :210.00 Max. :4900
#NOTABLE STATISTICS
#Average daily steps - 7,638 < recommended 10,000 (bad)
#Average distance covered daily - 5.49 miles (unit as assumed by the author; the data itself does not state it)
#Sedentary Minutes - 991.2 (about 16.5 hours, roughly 69% of a 1,440-minute day) - can cause health problems
#Lightly Active Minutes - 192.8
#Fairly Active Minutes - 13.56
#Very Active Minutes - 21.16 > recommended 9.7 mins (good)
#Calories - 2304 (average person burns 1800 a day...to lose weight we need 3500 a day)
#On track to lose weight, but generally slower
# Summary statistics for the sleep-record count, minutes asleep and time in bed.
summary(
  select(daily_sleep_clean, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed)
)
## TotalSleepRecords TotalMinutesAsleep TotalTimeInBed
## Min. :1.00 Min. : 58.0 Min. : 61.0
## 1st Qu.:1.00 1st Qu.:361.0 1st Qu.:403.8
## Median :1.00 Median :432.5 Median :463.0
## Mean :1.12 Mean :419.2 Mean :458.5
## 3rd Qu.:1.00 3rd Qu.:490.0 3rd Qu.:526.0
## Max. :3.00 Max. :796.0 Max. :961.0
#NOTABLE STATISTICS
#AVG Total Minutes Asleep - 419.2(6.986667 hours)(good-recommended 6-8 hrs)
#AVG Total Time In Bed - 458.5(7.641667hours)
#According to Health Central, people should not spend more than 1 hour in bed awake.
#This is to prevent a mental link being formed between being awake and being in bed, which can lead to insomnia.
# Summary statistics for body weight (kg) and BMI.
summary(
  select(weight_log_clean, WeightKg, BMI)
)
## WeightKg BMI
## Min. : 52.60 Min. :21.45
## 1st Qu.: 61.40 1st Qu.:23.96
## Median : 62.50 Median :24.39
## Mean : 72.04 Mean :25.19
## 3rd Qu.: 85.05 3rd Qu.:25.56
## Max. :133.50 Max. :47.54
#NOTABLE STATISTICS
#AVG BMI - 25.19(overweight)...does not generally indicate body fatness
Notes: Positive Linear Relation
# Scatter plot of minutes asleep against time in bed, with a fitted linear
# trend line and a text label marking the relationship.
ggplot(
  data = daily_sleep_clean,
  mapping = aes(x = TotalMinutesAsleep, y = TotalTimeInBed)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Total Minutes Asleep vs TotalTime In Bed") +
  annotate(
    "text",
    x = 500, y = 230,
    label = "positive correlation",
    color = "green", fontface = "bold"
  )
## `geom_smooth()` using formula 'y ~ x'
Notes: Positive Linear Relation
# Scatter plot of weight (kg) against BMI, with a fitted linear trend line
# and a text label marking the relationship.
ggplot(
  data = weight_log_clean,
  mapping = aes(x = WeightKg, y = BMI)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Weight vs BMI") +
  annotate(
    "text",
    x = 70, y = 38,
    label = "positive correlation",
    color = "green", fontface = "bold"
  )
## `geom_smooth()` using formula 'y ~ x'
Gornall, Lucy. “How to Lose Weight: How Many Calories Should i Eat to Lose Weight?” GoodtoKnow, 12 Aug. 2020 https://www.goodto.com/wellbeing/diets-exercise/what-is-calorie-how-many-lose-weigt-425557
Grey, Heather. “Heart Rates Can Vary by 70 Bpm: What That Means for Your Health.” Healthline, Healthline Media, 9 Feb. 2020, https://www.healthline.com/health-news/what-your-heart-rate-says-about-your-health
“CDC — How Much Sleep Do I Need? — Sleep and Sleep Disorders.” Centers for Disease Control and Prevention, Centers for Disease Control and Prevention, 2 Mar. 2017, https://www.cdc.gov/sleep/about_sleep/how_much_sleep.html.
Reed, Martin. “Spend Less Time In Bed If You Want More Sleep.” Healthcentral.com, 7 May 2017, https://www.healthcentral.com/article/spend-less-time-in-bed-if-you-want-more-sleep