Loading the CSV file to create data frames
# Install readr only when it is not already available, so re-running the
# document does not trigger a fresh download and install every time.
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(readr)
# Read the daily activity export (path is relative to the working directory)
# into a tibble; column types are guessed by readr.
dailyActivity_merged <- read_csv(file = "dailyActivity_merged.csv")
## Rows: 940 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityDate
## dbl (14): Id, TotalSteps, TotalDistance, TrackerDistance, LoggedActivitiesDi...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show every column with its type and first few values.
dailyActivity_merged %>%
  glimpse()
## Rows: 940
## Columns: 15
## $ Id <dbl> 1503960366, 1503960366, 1503960366, 150396036…
## $ ActivityDate <chr> "4/12/2016", "4/13/2016", "4/14/2016", "4/15/…
## $ TotalSteps <dbl> 13162, 10735, 10460, 9762, 12669, 9705, 13019…
## $ TotalDistance <dbl> 8.50, 6.97, 6.74, 6.28, 8.16, 6.48, 8.59, 9.8…
## $ TrackerDistance <dbl> 8.50, 6.97, 6.74, 6.28, 8.16, 6.48, 8.59, 9.8…
## $ LoggedActivitiesDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveDistance <dbl> 1.88, 1.57, 2.44, 2.14, 2.71, 3.19, 3.25, 3.5…
## $ ModeratelyActiveDistance <dbl> 0.55, 0.69, 0.40, 1.26, 0.41, 0.78, 0.64, 1.3…
## $ LightActiveDistance <dbl> 6.06, 4.71, 3.91, 2.83, 5.04, 2.51, 4.71, 5.0…
## $ SedentaryActiveDistance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ VeryActiveMinutes <dbl> 25, 21, 30, 29, 36, 38, 42, 50, 28, 19, 66, 4…
## $ FairlyActiveMinutes <dbl> 13, 19, 11, 34, 10, 20, 16, 31, 12, 8, 27, 21…
## $ LightlyActiveMinutes <dbl> 328, 217, 181, 209, 221, 164, 233, 264, 205, …
## $ SedentaryMinutes <dbl> 728, 776, 1218, 726, 773, 539, 1149, 775, 818…
## $ Calories <dbl> 1985, 1797, 1776, 1745, 1863, 1728, 1921, 203…
Because I want to analyze only how often distance was tracked versus
logged, I will first create two data frames, one for each recording
method, each including only distances above zero. I rename the distance
columns to a common name so that the two frames can be combined again
with the logged and tracked distances in the same column. The method by
which each distance was recorded is preserved with every entry in a new
column called `recording_method`.
# Days with a non-zero tracker-recorded distance. The distance column is
# renamed to the shared name `Distance` while selecting, and each row is
# tagged with how it was recorded.
tracking_df <- dailyActivity_merged %>%
  select(Id, ActivityDate, Distance = TrackerDistance) %>%
  filter(Distance > 0) %>%
  mutate(recording_method = "tracked")
tracking_df %>%
  glimpse()
## Rows: 862
## Columns: 4
## $ Id <dbl> 1503960366, 1503960366, 1503960366, 1503960366, 15039…
## $ ActivityDate <chr> "4/12/2016", "4/13/2016", "4/14/2016", "4/15/2016", "…
## $ Distance <dbl> 8.50, 6.97, 6.74, 6.28, 8.16, 6.48, 8.59, 9.88, 6.68,…
## $ recording_method <chr> "tracked", "tracked", "tracked", "tracked", "tracked"…
# Days with a non-zero manually logged distance. The distance column is
# renamed to the shared name `Distance` while selecting, and each row is
# tagged with how it was recorded.
logging_df <- dailyActivity_merged %>%
  select(Id, ActivityDate, Distance = LoggedActivitiesDistance) %>%
  filter(Distance > 0) %>%
  mutate(recording_method = "logged")
logging_df %>%
  glimpse()
## Rows: 32
## Columns: 4
## $ Id <dbl> 6775888955, 6962181067, 6962181067, 6962181067, 70077…
## $ ActivityDate <chr> "4/26/2016", "4/21/2016", "4/25/2016", "5/9/2016", "4…
## $ Distance <dbl> 1.959596, 4.081692, 2.785175, 3.167822, 4.869783, 4.8…
## $ recording_method <chr> "logged", "logged", "logged", "logged", "logged", "lo…
# Stack the tracked and logged rows into one long data frame. The two inputs
# share the same four columns and can never match on `recording_method`, so a
# row-wise bind gives the same result as the previous key-less full_join()
# without an implicit join on every column (including the double `Distance`).
distance_recording_method_df <- bind_rows(tracking_df, logging_df)
# Preview the first six rows of the combined data frame.
head(distance_recording_method_df, n = 6L)
## # A tibble: 6 × 4
## Id ActivityDate Distance recording_method
## <dbl> <chr> <dbl> <chr>
## 1 1503960366 4/12/2016 8.5 tracked
## 2 1503960366 4/13/2016 6.97 tracked
## 3 1503960366 4/14/2016 6.74 tracked
## 4 1503960366 4/15/2016 6.28 tracked
## 5 1503960366 4/16/2016 8.16 tracked
## 6 1503960366 4/17/2016 6.48 tracked