library(readxl)
# activity: daily data on steps, distance, calories, and active minutes.
# Unlike the other tables, this one is read from an Excel workbook.
activity <- read_excel("Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.xlsx")
library(readr)
# heartrate: heart-rate readings, one row per Id and Time with a numeric Value.
heartrate <- read_csv("Fitabase Data 4.12.16-5.12.16/heartrate_seconds_merged.csv")
## Rows: 2483658 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Time
## dbl (2): Id, Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# weight: logged body weight, recorded in both kilograms and pounds.
weight <- read_csv("Fitabase Data 4.12.16-5.12.16/weightLogInfo_merged.csv")
## Rows: 67 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Date
## dbl (6): Id, WeightKg, WeightPounds, Fat, BMI, LogId
## lgl (1): IsManualReport
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# sleep: total time spent in bed and total time asleep for each day.
sleep <- read_csv("Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv")
## Rows: 413 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): SleepDay
## dbl (4): Id, TotalSleepRecords, TotalMinutesAsleep, TotalTimeInBed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# METs: metabolic equivalent of task, recorded for every minute.
METs <- read_csv("Fitabase Data 4.12.16-5.12.16/minuteMETsNarrow_merged.csv")
## Rows: 1325580 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityMinute
## dbl (2): Id, METs
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(pacman) #package used for managing other packages
library(tidyverse) #package used for data cleaning and manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.1
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
glimpse(activity) # List every column of the activity table with its type and first values
## Rows: 940
## Columns: 15
## $ Id                       <dbl> 1503960366, 1503960366, 1503960366, 150396036…
## $ ActivityDate             <dttm> 2016-04-12, 2016-04-13, 2016-04-14, 2016-04-…
## $ TotalSteps               <dbl> 13162, 10735, 10460, 9762, 12669, 9705, 13019…
## $ TotalDistance            <dbl> 8.50, 6.97, 6.74, 6.28, 8.16, 6.48, 8.59, 9.8…
## $ TrackerDistance          <dbl> 8.50, 6.97, 6.74, 6.28, 8.16, 6.48, 8.59, 9.8…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance       <dbl> 1.88, 1.57, 2.44, 2.14, 2.71, 3.19, 3.25, 3.5…
## $ ModeratelyActiveDistance <dbl> 0.55, 0.69, 0.40, 1.26, 0.41, 0.78, 0.64, 1.3…
## $ LightActiveDistance      <dbl> 6.06, 4.71, 3.91, 2.83, 5.04, 2.51, 4.71, 5.0…
## $ SedentaryActiveDistance  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveMinutes        <dbl> 25, 21, 30, 29, 36, 38, 42, 50, 28, 19, 66, 4…
## $ FairlyActiveMinutes      <dbl> 13, 19, 11, 34, 10, 20, 16, 31, 12, 8, 27, 21…
## $ LightlyActiveMinutes     <dbl> 328, 217, 181, 209, 221, 164, 233, 264, 205, …
## $ SedentaryMinutes         <dbl> 728, 776, 1218, 726, 773, 539, 1149, 775, 818…
## $ Calories                 <dbl> 1985, 1797, 1776, 1745, 1863, 1728, 1921, 203…
## Filter and clean the activity data
# Keep only the columns that will be used for data visualization.
# ActivityDate is already a date-time (read from Excel), so as_date() needs no
# format string. Its second argument is a time zone; passing "%Y-%m-%d" there
# raised an "unknown timezone" warning on every run.
activity_1 <- activity %>%
  mutate(
    date = as_date(ActivityDate),
    month_year = format(date, "%b %Y")
  ) %>%
  select(
    Id, date, TotalSteps, TotalDistance, VeryActiveMinutes,
    FairlyActiveMinutes, LightlyActiveMinutes, SedentaryMinutes, Calories,
    month_year
  ) %>%
  # One participant is left out of the activity analysis.
  # NOTE(review): the reason for the exclusion is not recorded in this script.
  filter(!(Id %in% 4057192912))
head(activity_1)
## # A tibble: 6 × 10
##           Id date       TotalSteps TotalDistance VeryActiveMinutes
##        <dbl> <date>          <dbl>         <dbl>             <dbl>
## 1 1503960366 2016-04-12      13162          8.5                 25
## 2 1503960366 2016-04-13      10735          6.97                21
## 3 1503960366 2016-04-14      10460          6.74                30
## 4 1503960366 2016-04-15       9762          6.28                29
## 5 1503960366 2016-04-16      12669          8.16                36
## 6 1503960366 2016-04-17       9705          6.48                38
## # ℹ 5 more variables: FairlyActiveMinutes <dbl>, LightlyActiveMinutes <dbl>,
## #   SedentaryMinutes <dbl>, Calories <dbl>, month_year <chr>
glimpse(METs) # List every column of the METs table with its type and first values
## Rows: 1,325,580
## Columns: 3
## $ Id             <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 1503960…
## $ ActivityMinute <chr> "4/12/2016 12:00:00 AM", "4/12/2016 12:01:00 AM", "4/12…
## $ METs           <dbl> 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 10, 10,…
# Parse the 12-hour "month/day/year hour:minute:second AM/PM" text stamp into a
# date-time column. `.keep = "unused"` drops ActivityMinute, the only column
# consumed by the expression, and leaves every other column in place.
METs_1 <- mutate(
  METs,
  date_time = parse_date_time(ActivityMinute, "%m/%d/%Y %I:%M:%S %p"),
  .keep = "unused"
)
head(METs_1)
## # A tibble: 6 × 3
##           Id  METs date_time          
##        <dbl> <dbl> <dttm>             
## 1 1503960366    10 2016-04-12 00:00:00
## 2 1503960366    10 2016-04-12 00:01:00
## 3 1503960366    10 2016-04-12 00:02:00
## 4 1503960366    10 2016-04-12 00:03:00
## 5 1503960366    10 2016-04-12 00:04:00
## 6 1503960366    12 2016-04-12 00:05:00
glimpse(weight) # List every column of the weight table with its type and first values
## Rows: 67
## Columns: 8
## $ Id             <dbl> 1503960366, 1503960366, 1927972279, 2873212765, 2873212…
## $ Date           <chr> "5/2/2016 23:59", "5/3/2016 23:59", "4/13/2016 1:08", "…
## $ WeightKg       <dbl> 52.6, 52.6, 133.5, 56.7, 57.3, 72.4, 72.3, 69.7, 70.3, …
## $ WeightPounds   <dbl> 115.9631, 115.9631, 294.3171, 125.0021, 126.3249, 159.6…
## $ Fat            <dbl> 22, NA, NA, NA, NA, 25, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ BMI            <dbl> 22.65, 22.65, 47.54, 21.45, 21.69, 27.45, 27.38, 27.25,…
## $ IsManualReport <lgl> TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, …
## $ LogId          <dbl> 1.46223e+12, 1.46232e+12, 1.46051e+12, 1.46128e+12, 1.4…
# Average logged weight (kg) per participant.
# The identifier is stored as text under the name ID; rows with a missing
# value are removed before averaging.
weight_1 <- weight %>%
  select(ID = Id, WeightKg) %>%
  mutate(ID = as.character(ID)) %>%
  drop_na() %>%
  group_by(ID) %>%
  summarise(avg_weight = mean(WeightKg))
head(weight_1)
## # A tibble: 6 × 2
##   ID         avg_weight
##   <chr>           <dbl>
## 1 1503960366       52.6
## 2 1927972279      134. 
## 3 2873212765       57  
## 4 4319703577       72.4
## 5 4558609924       69.6
## 6 5577150313       90.7
glimpse(sleep) # List every column of the sleep table with its type and first values
## Rows: 413
## Columns: 5
## $ Id                 <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 150…
## $ SleepDay           <chr> "4/12/2016 12:00:00 AM", "4/13/2016 12:00:00 AM", "…
## $ TotalSleepRecords  <dbl> 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ TotalMinutesAsleep <dbl> 327, 384, 412, 340, 700, 304, 360, 325, 361, 430, 2…
## $ TotalTimeInBed     <dbl> 346, 407, 442, 367, 712, 320, 377, 364, 384, 449, 3…
# Per-day sleep records.
#   PST  - minutes in bed that were not spent asleep (plotted later as
#          "Possible Stressed Time").
#   Date - calendar date parsed from SleepDay.
# SleepDay looks like "4/12/2016 12:00:00 AM". as.Date() reads the leading
# month/day/year and ignores the trailing clock time, so the text no longer
# has to be split first. The previous separate() call split three
# space-delimited tokens into two names and warned on every row.
sleep_1 <- sleep %>%
  mutate(
    PST = TotalTimeInBed - TotalMinutesAsleep,
    Date = as.Date(SleepDay, format = "%m/%d/%Y")
  ) %>%
  select(-TotalSleepRecords, -SleepDay) %>%
  drop_na() %>%
  # Participants left out of the sleep analysis.
  # NOTE(review): the reason for these exclusions is not recorded in this script.
  filter(!(Id %in% c(
    1644430081, 1844505072, 1927972279, 4020332650, 2320127002,
    4558609924, 6775888955, 7007744171, 8053475328
  )))
head(sleep_1)
## # A tibble: 6 × 5
##           Id TotalMinutesAsleep TotalTimeInBed   PST Date      
##        <dbl>              <dbl>          <dbl> <dbl> <date>    
## 1 1503960366                327            346    19 2016-04-12
## 2 1503960366                384            407    23 2016-04-13
## 3 1503960366                412            442    30 2016-04-15
## 4 1503960366                340            367    27 2016-04-16
## 5 1503960366                700            712    12 2016-04-17
## 6 1503960366                304            320    16 2016-04-19
glimpse(heartrate)    # List every column of the heartrate table with its type and first values
## Rows: 2,483,658
## Columns: 3
## $ Id    <dbl> 2022484408, 2022484408, 2022484408, 2022484408, 2022484408, 2022…
## $ Time  <chr> "4/12/2016 7:21:00 AM", "4/12/2016 7:21:05 AM", "4/12/2016 7:21:…
## $ Value <dbl> 97, 102, 105, 103, 101, 95, 91, 93, 94, 93, 92, 89, 83, 61, 60, …
# Parse the 12-hour text stamp in Time into a date-time column.
# `.keep = "unused"` drops Time, the only column consumed by the expression,
# and leaves every other column in place.
heartrate_1 <- mutate(
  heartrate,
  date_time = parse_date_time(Time, "%m/%d/%Y %I:%M:%S %p"),
  .keep = "unused"
)
head(heartrate_1)
## # A tibble: 6 × 3
##           Id Value date_time          
##        <dbl> <dbl> <dttm>             
## 1 2022484408    97 2016-04-12 07:21:00
## 2 2022484408   102 2016-04-12 07:21:05
## 3 2022484408   105 2016-04-12 07:21:10
## 4 2022484408   103 2016-04-12 07:21:20
## 5 2022484408   101 2016-04-12 07:21:25
## 6 2022484408    95 2016-04-12 07:22:05
library(ggplot2)

# Minutes asleep per day, drawn as one line panel per participant.
sleep_1 %>%
  ggplot(aes(x = Date, y = TotalMinutesAsleep)) +
  geom_line() +
  facet_wrap(~Id) +
  scale_x_date(date_breaks = "2 week") +
  labs(
    title = "Time Asleep per Day",
    subtitle = "April & May",
    x = "",
    y = "DailySleep(mins)"
  ) +
  # Tilt and shrink the date labels so they fit under the small panels.
  theme(axis.text.x = element_text(angle = 45, hjust = 0.8, size = 8))

# Minutes in bed but not asleep (PST) per day, one line panel per participant.
sleep_1 %>%
  ggplot(aes(x = Date, y = PST)) +
  geom_line() +
  facet_wrap(~Id) +
  scale_x_date(date_breaks = "2 week") +
  labs(
    title = "Users on Bed without Sleeping",
    subtitle = "April & May",
    x = "Date",
    y = "Possible Stressed Time"
  ) +
  # Tilt and shrink the date labels so they fit under the small panels.
  theme(axis.text.x = element_text(angle = 45, hjust = 0.8, size = 8))

# Daily total distance per participant, with the line coloured by that day's
# fairly-active minutes (light green = few, dark green = many).
ggplot(
  data = activity_1,
  mapping = aes(x = date, y = TotalDistance, colour = FairlyActiveMinutes)
) +
  geom_line() +
  facet_wrap(vars(Id)) +
  scale_x_date(date_minor_breaks = "month") +
  scale_color_gradient(low = "#94E873", high = "#00441B") +
  labs(
    title = "User's Daily Activity",
    subtitle = "April & May",
    x = "Date",
    y = "Distances(Km)"
  ) +
  # One theme() call carries the settings of the three stacked calls: stacked
  # theme() calls merge element properties, so the result is the same.
  theme(
    axis.text.x = element_text(angle = 45, size = 5),
    axis.text.y = element_text(size = 5)
  )

# Step chart of heart-rate readings over time, one free-scaled panel per
# participant. Colour is mapped to the numeric Id on a continuous blue
# gradient, so each panel is drawn in a single shade.
heartrate_1 %>%
  ggplot(aes(x = date_time, y = Value, colour = Id)) +
  geom_step() +
  facet_wrap(vars(Id), scales = "free") +
  scale_color_gradient(low = "#132B43", high = "#2A77B2") +
  labs(
    title = "Time Series of User's Heartbeat",
    subtitle = "April & May",
    x = "Date_time",
    y = "Pulse Value per second"
  ) +
  # The complete theme goes first so the tweaks below are not overwritten.
  theme_light() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 0.8, size = 5),
    axis.text.y = element_text(size = 5)
  )

# Daily calories per participant, with the line coloured by calendar month.
# NOTE(review): the axis label states KJ; confirm the unit of the Calories
# column against the data dictionary.
activity_1 %>%
  ggplot(aes(x = date, y = Calories, colour = month_year)) +
  geom_line() +
  facet_wrap(vars(Id)) +
  scale_color_hue(direction = 1) +
  labs(
    title = "Showing Calories Burnt by Users",
    subtitle = "April & May",
    x = "Date",
    y = "Calories (KJ)"
  ) +
  # The complete theme goes first so the tweaks below are not overwritten.
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, size = 5),
    axis.text.y = element_text(size = 5)
  )

# Bar chart of each participant's average weight; bars darken as the
# average increases.
weight_1 %>%
  ggplot(aes(x = ID, y = avg_weight, fill = avg_weight)) +
  geom_col() +
  scale_fill_gradient(low = "#1881EB", high = "#08243A") +
  labs(
    title = "Average Weight of Users",
    x = "ID",
    y = "Average Weight"
  ) +
  theme_minimal()

# Mean and standard deviation of the daily total distance.
print(with(activity_1, mean(TotalDistance)))
## [1] 5.500929
print(with(activity_1, sd(TotalDistance)))
## [1] 3.927566
# Mean and standard deviation of time asleep, converted from minutes to hours.
print(with(sleep_1, mean(TotalMinutesAsleep)) / 60)
## [1] 7.143887
print(with(sleep_1, sd(TotalMinutesAsleep)) / 60)
## [1] 1.676945
# Mean and standard deviation of minutes in bed but not asleep (PST).
print(with(sleep_1, mean(PST)))
## [1] 38.38522
print(with(sleep_1, sd(PST)))
## [1] 40.85727