R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
#reading dailyActivity files for Mar - May
dailyActivity_merged_mar_apr <- read_csv("dailyActivity_merged_mar_apr.csv")
## Rows: 457 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ActivityDate
## dbl (14): Id, TotalSteps, TotalDistance, TrackerDistance, LoggedActivitiesDi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dailyActivity_merged_apr_may <- read_csv("dailyActivity_merged_apr_may.csv")
## Rows: 940 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ActivityDate
## dbl (14): librabId, TotalSteps, TotalDistance, TrackerDistance, LoggedActivi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#combining dailyActivity info for march, apr, and may

combined_dailyActivity <- bind_rows(dailyActivity_merged_mar_apr, dailyActivity_merged_apr_may)
print(combined_dailyActivity)
## # A tibble: 1,397 × 16
##            Id ActivityDate TotalSteps TotalDistance TrackerDistance
##         <dbl> <chr>             <dbl>         <dbl>           <dbl>
##  1 1503960366 3/25/2016         11004          7.11            7.11
##  2 1503960366 3/26/2016         17609         11.6            11.6 
##  3 1503960366 3/27/2016         12736          8.53            8.53
##  4 1503960366 3/28/2016         13231          8.93            8.93
##  5 1503960366 3/29/2016         12041          7.85            7.85
##  6 1503960366 3/30/2016         10970          7.16            7.16
##  7 1503960366 3/31/2016         12256          7.86            7.86
##  8 1503960366 4/1/2016          12262          7.87            7.87
##  9 1503960366 4/2/2016          11248          7.25            7.25
## 10 1503960366 4/3/2016          10016          6.37            6.37
## # ℹ 1,387 more rows
## # ℹ 11 more variables: LoggedActivitiesDistance <dbl>,
## #   VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## #   LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## #   VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## #   LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## #   librabId <dbl>

#Verified the number of unique user IDs in each consolidated table (limit 33 according to Kaggle info)

unique_Ids <- unique(combined_dailyActivity$Id)
print(unique_Ids)
##  [1] 1503960366 1624580081 1644430081 1844505072 1927972279 2022484408
##  [7] 2026352035 2320127002 2347167796 2873212765 2891001357 3372868164
## [13] 3977333714 4020332650 4057192912 4319703577 4388161847 4445114986
## [19] 4558609924 4702921684 5553957443 5577150313 6117666160 6290855005
## [25] 6391747486 6775888955 6962181067 7007744171 7086361926 8053475328
## [31] 8253242879 8378563200 8583815059 8792009665 8877689391         NA

#35 (not 33) unique Ids printed

#checked for duplicate ids

 is_duplicate <- duplicated(combined_dailyActivity)
duplicate_rows <- combined_dailyActivity[is_duplicate, ]
print(duplicate_rows)
## # A tibble: 0 × 16
## # ℹ 16 variables: Id <dbl>, ActivityDate <chr>, TotalSteps <dbl>,
## #   TotalDistance <dbl>, TrackerDistance <dbl>, LoggedActivitiesDistance <dbl>,
## #   VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## #   LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## #   VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## #   LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## #   librabId <dbl>

#All data/time columns were converted from string/object types to datetime format.

combined_dailyActivity %>% mutate(ActivityDate = mdy(ActivityDate))
## # A tibble: 1,397 × 16
##            Id ActivityDate TotalSteps TotalDistance TrackerDistance
##         <dbl> <date>            <dbl>         <dbl>           <dbl>
##  1 1503960366 2016-03-25        11004          7.11            7.11
##  2 1503960366 2016-03-26        17609         11.6            11.6 
##  3 1503960366 2016-03-27        12736          8.53            8.53
##  4 1503960366 2016-03-28        13231          8.93            8.93
##  5 1503960366 2016-03-29        12041          7.85            7.85
##  6 1503960366 2016-03-30        10970          7.16            7.16
##  7 1503960366 2016-03-31        12256          7.86            7.86
##  8 1503960366 2016-04-01        12262          7.87            7.87
##  9 1503960366 2016-04-02        11248          7.25            7.25
## 10 1503960366 2016-04-03        10016          6.37            6.37
## # ℹ 1,387 more rows
## # ℹ 11 more variables: LoggedActivitiesDistance <dbl>,
## #   VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## #   LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## #   VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## #   LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## #   librabId <dbl>

#Checked for missing values (can drop those rows with missing values or use imputation - filling missing values with mean or median depending upon the variable).

missing_values_per_column <- colSums(is.na(combined_dailyActivity))
print(missing_values_per_column)
##                       Id             ActivityDate               TotalSteps 
##                      940                        0                        0 
##            TotalDistance          TrackerDistance LoggedActivitiesDistance 
##                        0                        0                        0 
##       VeryActiveDistance ModeratelyActiveDistance      LightActiveDistance 
##                        0                        0                        0 
##  SedentaryActiveDistance        VeryActiveMinutes      FairlyActiveMinutes 
##                        0                        0                        0 
##     LightlyActiveMinutes         SedentaryMinutes                 Calories 
##                        0                        0                        0 
##                 librabId 
##                      457

#inspecting the data it looks like some id’s are called librabID instead of just ID. Fix this, rename the incorrectly named column (librabId) to the correct name (Id) and then coalesce (combine) the two ID columns into a single, clean Id column.

combined_dailyActivity_fixed <- combined_dailyActivity %>%
mutate(Id = coalesce(Id, librabId))

#May need to remove the redundant ‘librabId’ column

#Checked for missing values again

missing_values_per_column <- colSums(is.na(combined_dailyActivity_fixed))
print(missing_values_per_column)
##                       Id             ActivityDate               TotalSteps 
##                        0                        0                        0 
##            TotalDistance          TrackerDistance LoggedActivitiesDistance 
##                        0                        0                        0 
##       VeryActiveDistance ModeratelyActiveDistance      LightActiveDistance 
##                        0                        0                        0 
##  SedentaryActiveDistance        VeryActiveMinutes      FairlyActiveMinutes 
##                        0                        0                        0 
##     LightlyActiveMinutes         SedentaryMinutes                 Calories 
##                        0                        0                        0 
##                 librabId 
##                      457

#Looked for outliers or inconsistencies (example: zero steps in a day highly unlikely - flag or remove - may have been days the tracker was not worn)

#Filtered out rows where total steps were zero using the following code:
combined_dailyActivity <- combined_dailyActivity %>%
filter(TotalSteps > 0)

Calculate key descriptive statistics, excluding the ‘Id’ column

summary_stats_long <- combined_dailyActivity %>%
  summarise(
    across(
      .cols = c(where(is.numeric), -starts_with("Id")), 
      .fns = list(
      mean = mean, 
      median = median, 
      max = max, 
      min = min
      ),
    .names = "{.col}_{.fn}",
    na.rm = TRUE 
  )
) %>%
pivot_longer(
  cols = everything(), 
  names_to = "Metric", 
  values_to = "Value" 
)
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

#Removed the Sedentary Active Distance column (noisy variable) because the values are zero or close to zero and since this distance is recorded without an associated step count that triggers an “active” classification, it usually points to passive forms of movement like driving or riding, arm movement or typing, or gps jitters.Using this code:

combined_dailyActivity <- combined_dailyActivity %>%
select(-SedentaryActiveDistance)
print(combined_dailyActivity_fixed)
## # A tibble: 1,397 × 16
##            Id ActivityDate TotalSteps TotalDistance TrackerDistance
##         <dbl> <chr>             <dbl>         <dbl>           <dbl>
##  1 1503960366 3/25/2016         11004          7.11            7.11
##  2 1503960366 3/26/2016         17609         11.6            11.6 
##  3 1503960366 3/27/2016         12736          8.53            8.53
##  4 1503960366 3/28/2016         13231          8.93            8.93
##  5 1503960366 3/29/2016         12041          7.85            7.85
##  6 1503960366 3/30/2016         10970          7.16            7.16
##  7 1503960366 3/31/2016         12256          7.86            7.86
##  8 1503960366 4/1/2016          12262          7.87            7.87
##  9 1503960366 4/2/2016          11248          7.25            7.25
## 10 1503960366 4/3/2016          10016          6.37            6.37
## # ℹ 1,387 more rows
## # ℹ 11 more variables: LoggedActivitiesDistance <dbl>,
## #   VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## #   LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## #   VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## #   LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## #   librabId <dbl>

#User Segmentation (Activity Levels): Categorized users based on their average daily steps to understand the user base. #Inactive: < 5,000 steps/day, Average/Lightly Active: 5,000 to 10,000 steps/day, Active: > 10,000 steps/day # Calculate the average steps for each unique Id

activity_level_count <- combined_dailyActivity_fixed %>%
# Group the data by user ID
group_by(Id) %>%
# Summarize to find the average steps taken per day for each user
summarise(
Average_Steps = mean(TotalSteps, na.rm = TRUE)
) %>%
#Categorize each user's average steps into activity levels
mutate(
Activity_Level = case_when(
Average_Steps < 5000 ~ "Inactive",
Average_Steps >= 5000 & Average_Steps <= 10000 ~ "Lightly Active",
Average_Steps > 10000 ~ "Active"
    )
) %>%
#Count how many users fall into each category
# Ungroup before counting to get the total number of users per category
ungroup() %>%
count(Activity_Level, name = "UserCount")
# Display the resulting count table
print(activity_level_count)
## # A tibble: 3 × 2
##   Activity_Level UserCount
##   <chr>              <int>
## 1 Active                 7
## 2 Inactive              13
## 3 Lightly Active        15

#Calculate the percentage for each category

activity_level_data_for_pie <- activity_level_count %>%
mutate(
# Calculate the percentage of total users
  percentage = (UserCount / sum(UserCount))*100)
print(activity_level_data_for_pie)
## # A tibble: 3 × 3
##   Activity_Level UserCount percentage
##   <chr>              <int>      <dbl>
## 1 Active                 7       20  
## 2 Inactive              13       37.1
## 3 Lightly Active        15       42.9

#Convert the ActivityDate column from character to Date format and save it

combined_dailyActivity_fixed <- combined_dailyActivity_fixed %>%
mutate(ActivityDate = mdy(ActivityDate))
write_csv(combined_dailyActivity_fixed, "combined_dailyActivity_cleaned.csv")

#Extract the Day of the Week (This line will now work)

combined_dailyActivity_fixed <- combined_dailyActivity_fixed %>%
mutate(DayOfWeek = wday(ActivityDate, label = TRUE, abbr = FALSE))

#Calculate the mean steps for each day of the week

DailyStepsSummary <- combined_dailyActivity_fixed %>%
group_by(DayOfWeek) %>%
summarise(
  mean_steps = mean(TotalSteps, na.rm = TRUE),
  .groups = 'drop'
  )
print(combined_dailyActivity_fixed)    
## # A tibble: 1,397 × 17
##            Id ActivityDate TotalSteps TotalDistance TrackerDistance
##         <dbl> <date>            <dbl>         <dbl>           <dbl>
##  1 1503960366 2016-03-25        11004          7.11            7.11
##  2 1503960366 2016-03-26        17609         11.6            11.6 
##  3 1503960366 2016-03-27        12736          8.53            8.53
##  4 1503960366 2016-03-28        13231          8.93            8.93
##  5 1503960366 2016-03-29        12041          7.85            7.85
##  6 1503960366 2016-03-30        10970          7.16            7.16
##  7 1503960366 2016-03-31        12256          7.86            7.86
##  8 1503960366 2016-04-01        12262          7.87            7.87
##  9 1503960366 2016-04-02        11248          7.25            7.25
## 10 1503960366 2016-04-03        10016          6.37            6.37
## # ℹ 1,387 more rows
## # ℹ 12 more variables: LoggedActivitiesDistance <dbl>,
## #   VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## #   LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## #   VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## #   LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## #   librabId <dbl>, DayOfWeek <ord>
print(DailyStepsSummary)
## # A tibble: 7 × 2
##   DayOfWeek mean_steps
##   <ord>          <dbl>
## 1 Sunday         6607.
## 2 Monday         7541.
## 3 Tuesday        7084.
## 4 Wednesday      7548.
## 5 Thursday       7268.
## 6 Friday         7188.
## 7 Saturday       7752.

Including Plots

#Created the bar chart for Daily Steps Summary

ggplot(data = DailyStepsSummary, aes(x = DayOfWeek, y = mean_steps)) +
# Use geom_col for a bar chart where the height is determined by 'mean_steps'
geom_col(fill = "tan") +
# Add labels above the bars for precision
geom_text(aes(label = round(mean_steps, 0)), vjust = -0.5, size = 3) +
# Add Labels and Title
labs(
  title = "Average Daily Steps by Day of the Week",
  subtitle = "Analysis of User Activity Patterns",
  x = "Day of the Week",
  y = "Average Total Steps"
) +
# Enhance the theme for presentation
theme_minimal() +
# Adjust y-axis limit to accommodate the labels slightly above the bars
scale_y_continuous(expand = expansion(mult = c(0, 0.1)))

#Create the scatter plot of daily calories burned versus sedentary minutes with a trend line

combined_dailyActivity_fixed %>%
ggplot(aes(x = SedentaryMinutes, y = Calories)) +
# 1. Add points (scatter plot)
geom_point(alpha = 0.5, color = "tan") + # alpha makes overlapping points visible
# 2. Add a trend line (geom_smooth)
# 'method = "lm"' creates a linear regression line.
# 'se = FALSE' removes the shaded confidence interval.
geom_smooth(method = "lm", se = FALSE, color = "darkcyan", linetype = "dashed") +
# 3. Add Labels and Title
labs(
  title = "Daily Calories Burned vs. Sedentary Minutes",
  subtitle = "Analysis shows a slight positive relationship (or lack of correlation)",
  x = "Sedentary Minutes (per Day)",
  y = "Total Calories Burned (per Day)"
) +
# 4. Enhance the theme
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
  )
## `geom_smooth()` using formula = 'y ~ x'

#Create the scatter plot of daily calories burned versus total steps with a trend line

calorie_totalSteps_plot <- combined_dailyActivity_fixed %>%
ggplot(aes(x = TotalSteps, y = Calories)) +
# 1. Add points (scatter plot)
geom_point(alpha = 0.5, color = "tan") + # alpha makes overlapping points visible
# 2. Add a trend line (geom_smooth)
# 'method = "lm"' creates a linear regression line.
# 'se = FALSE' removes the shaded confidence interval.
geom_smooth(method = "lm", se = FALSE, color = "darkcyan", linetype = "dashed") +
# 3. Add Labels and Title
  labs(
    title = "Daily Calories Burned vs. Total Steps",
    subtitle = "Analysis shows a positive relationship",
    x = "Total Steps (per Day)",
    y = "Total Calories Burned (per Day)"
    ) +
# 4. Enhance the theme
  theme_minimal() +
    theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5)
    )
print(calorie_totalSteps_plot)
## `geom_smooth()` using formula = 'y ~ x'

write_csv(combined_dailyActivity_fixed, "combined_dailyActivity_cleaned.csv")