This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
#reading dailyActivity files for Mar - May
dailyActivity_merged_mar_apr <- read_csv("dailyActivity_merged_mar_apr.csv")
## Rows: 457 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityDate
## dbl (14): Id, TotalSteps, TotalDistance, TrackerDistance, LoggedActivitiesDi...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dailyActivity_merged_apr_may <- read_csv("dailyActivity_merged_apr_may.csv")
## Rows: 940 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ActivityDate
## dbl (14): librabId, TotalSteps, TotalDistance, TrackerDistance, LoggedActivi...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#combining dailyActivity info for march, apr, and may
combined_dailyActivity <- bind_rows(dailyActivity_merged_mar_apr, dailyActivity_merged_apr_may)
print(combined_dailyActivity)
## # A tibble: 1,397 × 16
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1503960366 3/25/2016 11004 7.11 7.11
## 2 1503960366 3/26/2016 17609 11.6 11.6
## 3 1503960366 3/27/2016 12736 8.53 8.53
## 4 1503960366 3/28/2016 13231 8.93 8.93
## 5 1503960366 3/29/2016 12041 7.85 7.85
## 6 1503960366 3/30/2016 10970 7.16 7.16
## 7 1503960366 3/31/2016 12256 7.86 7.86
## 8 1503960366 4/1/2016 12262 7.87 7.87
## 9 1503960366 4/2/2016 11248 7.25 7.25
## 10 1503960366 4/3/2016 10016 6.37 6.37
## # ℹ 1,387 more rows
## # ℹ 11 more variables: LoggedActivitiesDistance <dbl>,
## # VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## # LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## # VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## # LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## # librabId <dbl>
#Verified the number of unique user IDs in each consolidated table (limit 33 according to Kaggle info)
unique_Ids <- unique(combined_dailyActivity$Id)
print(unique_Ids)
## [1] 1503960366 1624580081 1644430081 1844505072 1927972279 2022484408
## [7] 2026352035 2320127002 2347167796 2873212765 2891001357 3372868164
## [13] 3977333714 4020332650 4057192912 4319703577 4388161847 4445114986
## [19] 4558609924 4702921684 5553957443 5577150313 6117666160 6290855005
## [25] 6391747486 6775888955 6962181067 7007744171 7086361926 8053475328
## [31] 8253242879 8378563200 8583815059 8792009665 8877689391 NA
#35 (not 33) unique Ids printed
#checked for duplicate ids
is_duplicate <- duplicated(combined_dailyActivity)
duplicate_rows <- combined_dailyActivity[is_duplicate, ]
print(duplicate_rows)
## # A tibble: 0 × 16
## # ℹ 16 variables: Id <dbl>, ActivityDate <chr>, TotalSteps <dbl>,
## # TotalDistance <dbl>, TrackerDistance <dbl>, LoggedActivitiesDistance <dbl>,
## # VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## # LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## # VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## # LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## # librabId <dbl>
#All data/time columns were converted from string/object types to datetime format.
combined_dailyActivity %>% mutate(ActivityDate = mdy(ActivityDate))
## # A tibble: 1,397 × 16
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## <dbl> <date> <dbl> <dbl> <dbl>
## 1 1503960366 2016-03-25 11004 7.11 7.11
## 2 1503960366 2016-03-26 17609 11.6 11.6
## 3 1503960366 2016-03-27 12736 8.53 8.53
## 4 1503960366 2016-03-28 13231 8.93 8.93
## 5 1503960366 2016-03-29 12041 7.85 7.85
## 6 1503960366 2016-03-30 10970 7.16 7.16
## 7 1503960366 2016-03-31 12256 7.86 7.86
## 8 1503960366 2016-04-01 12262 7.87 7.87
## 9 1503960366 2016-04-02 11248 7.25 7.25
## 10 1503960366 2016-04-03 10016 6.37 6.37
## # ℹ 1,387 more rows
## # ℹ 11 more variables: LoggedActivitiesDistance <dbl>,
## # VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## # LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## # VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## # LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## # librabId <dbl>
#Checked for missing values (can drop those rows with missing values or use imputation - filling missing values with mean or median depending upon the variable).
missing_values_per_column <- colSums(is.na(combined_dailyActivity))
print(missing_values_per_column)
## Id ActivityDate TotalSteps
## 940 0 0
## TotalDistance TrackerDistance LoggedActivitiesDistance
## 0 0 0
## VeryActiveDistance ModeratelyActiveDistance LightActiveDistance
## 0 0 0
## SedentaryActiveDistance VeryActiveMinutes FairlyActiveMinutes
## 0 0 0
## LightlyActiveMinutes SedentaryMinutes Calories
## 0 0 0
## librabId
## 457
#inspecting the data it looks like some id’s are called librabID instead of just ID. Fix this, rename the incorrectly named column (librabId) to the correct name (Id) and then coalesce (combine) the two ID columns into a single, clean Id column.
combined_dailyActivity_fixed <- combined_dailyActivity %>%
mutate(Id = coalesce(Id, librabId))
#May need to remove the redundant ‘librabId’ column
#Checked for missing values again
missing_values_per_column <- colSums(is.na(combined_dailyActivity_fixed))
print(missing_values_per_column)
## Id ActivityDate TotalSteps
## 0 0 0
## TotalDistance TrackerDistance LoggedActivitiesDistance
## 0 0 0
## VeryActiveDistance ModeratelyActiveDistance LightActiveDistance
## 0 0 0
## SedentaryActiveDistance VeryActiveMinutes FairlyActiveMinutes
## 0 0 0
## LightlyActiveMinutes SedentaryMinutes Calories
## 0 0 0
## librabId
## 457
#Looked for outliers or inconsistencies (example: zero steps in a day highly unlikely - flag or remove - may have been days the tracker was not worn)
#Filtered out rows where total steps were zero using the following code:
combined_dailyActivity <- combined_dailyActivity %>%
filter(TotalSteps > 0)
summary_stats_long <- combined_dailyActivity %>%
summarise(
across(
.cols = c(where(is.numeric), -starts_with("Id")),
.fns = list(
mean = mean,
median = median,
max = max,
min = min
),
.names = "{.col}_{.fn}",
na.rm = TRUE
)
) %>%
pivot_longer(
cols = everything(),
names_to = "Metric",
values_to = "Value"
)
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
#Removed the Sedentary Active Distance column (noisy variable) because the values are zero or close to zero and since this distance is recorded without an associated step count that triggers an “active” classification, it usually points to passive forms of movement like driving or riding, arm movement or typing, or gps jitters.Using this code:
combined_dailyActivity <- combined_dailyActivity %>%
select(-SedentaryActiveDistance)
print(combined_dailyActivity_fixed)
## # A tibble: 1,397 × 16
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1503960366 3/25/2016 11004 7.11 7.11
## 2 1503960366 3/26/2016 17609 11.6 11.6
## 3 1503960366 3/27/2016 12736 8.53 8.53
## 4 1503960366 3/28/2016 13231 8.93 8.93
## 5 1503960366 3/29/2016 12041 7.85 7.85
## 6 1503960366 3/30/2016 10970 7.16 7.16
## 7 1503960366 3/31/2016 12256 7.86 7.86
## 8 1503960366 4/1/2016 12262 7.87 7.87
## 9 1503960366 4/2/2016 11248 7.25 7.25
## 10 1503960366 4/3/2016 10016 6.37 6.37
## # ℹ 1,387 more rows
## # ℹ 11 more variables: LoggedActivitiesDistance <dbl>,
## # VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## # LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## # VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## # LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## # librabId <dbl>
#User Segmentation (Activity Levels): Categorized users based on their average daily steps to understand the user base. #Inactive: < 5,000 steps/day, Average/Lightly Active: 5,000 to 10,000 steps/day, Active: > 10,000 steps/day # Calculate the average steps for each unique Id
activity_level_count <- combined_dailyActivity_fixed %>%
# Group the data by user ID
group_by(Id) %>%
# Summarize to find the average steps taken per day for each user
summarise(
Average_Steps = mean(TotalSteps, na.rm = TRUE)
) %>%
#Categorize each user's average steps into activity levels
mutate(
Activity_Level = case_when(
Average_Steps < 5000 ~ "Inactive",
Average_Steps >= 5000 & Average_Steps <= 10000 ~ "Lightly Active",
Average_Steps > 10000 ~ "Active"
)
) %>%
#Count how many users fall into each category
# Ungroup before counting to get the total number of users per category
ungroup() %>%
count(Activity_Level, name = "UserCount")
# Display the resulting count table
print(activity_level_count)
## # A tibble: 3 × 2
## Activity_Level UserCount
## <chr> <int>
## 1 Active 7
## 2 Inactive 13
## 3 Lightly Active 15
#Calculate the percentage for each category
activity_level_data_for_pie <- activity_level_count %>%
mutate(
# Calculate the percentage of total users
percentage = (UserCount / sum(UserCount))*100)
print(activity_level_data_for_pie)
## # A tibble: 3 × 3
## Activity_Level UserCount percentage
## <chr> <int> <dbl>
## 1 Active 7 20
## 2 Inactive 13 37.1
## 3 Lightly Active 15 42.9
#Convert the ActivityDate column from character to Date format and save it
combined_dailyActivity_fixed <- combined_dailyActivity_fixed %>%
mutate(ActivityDate = mdy(ActivityDate))
write_csv(combined_dailyActivity_fixed, "combined_dailyActivity_cleaned.csv")
#Extract the Day of the Week (This line will now work)
combined_dailyActivity_fixed <- combined_dailyActivity_fixed %>%
mutate(DayOfWeek = wday(ActivityDate, label = TRUE, abbr = FALSE))
#Calculate the mean steps for each day of the week
DailyStepsSummary <- combined_dailyActivity_fixed %>%
group_by(DayOfWeek) %>%
summarise(
mean_steps = mean(TotalSteps, na.rm = TRUE),
.groups = 'drop'
)
print(combined_dailyActivity_fixed)
## # A tibble: 1,397 × 17
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## <dbl> <date> <dbl> <dbl> <dbl>
## 1 1503960366 2016-03-25 11004 7.11 7.11
## 2 1503960366 2016-03-26 17609 11.6 11.6
## 3 1503960366 2016-03-27 12736 8.53 8.53
## 4 1503960366 2016-03-28 13231 8.93 8.93
## 5 1503960366 2016-03-29 12041 7.85 7.85
## 6 1503960366 2016-03-30 10970 7.16 7.16
## 7 1503960366 2016-03-31 12256 7.86 7.86
## 8 1503960366 2016-04-01 12262 7.87 7.87
## 9 1503960366 2016-04-02 11248 7.25 7.25
## 10 1503960366 2016-04-03 10016 6.37 6.37
## # ℹ 1,387 more rows
## # ℹ 12 more variables: LoggedActivitiesDistance <dbl>,
## # VeryActiveDistance <dbl>, ModeratelyActiveDistance <dbl>,
## # LightActiveDistance <dbl>, SedentaryActiveDistance <dbl>,
## # VeryActiveMinutes <dbl>, FairlyActiveMinutes <dbl>,
## # LightlyActiveMinutes <dbl>, SedentaryMinutes <dbl>, Calories <dbl>,
## # librabId <dbl>, DayOfWeek <ord>
print(DailyStepsSummary)
## # A tibble: 7 × 2
## DayOfWeek mean_steps
## <ord> <dbl>
## 1 Sunday 6607.
## 2 Monday 7541.
## 3 Tuesday 7084.
## 4 Wednesday 7548.
## 5 Thursday 7268.
## 6 Friday 7188.
## 7 Saturday 7752.
#Created the bar chart for Daily Steps Summary
ggplot(data = DailyStepsSummary, aes(x = DayOfWeek, y = mean_steps)) +
# Use geom_col for a bar chart where the height is determined by 'mean_steps'
geom_col(fill = "tan") +
# Add labels above the bars for precision
geom_text(aes(label = round(mean_steps, 0)), vjust = -0.5, size = 3) +
# Add Labels and Title
labs(
title = "Average Daily Steps by Day of the Week",
subtitle = "Analysis of User Activity Patterns",
x = "Day of the Week",
y = "Average Total Steps"
) +
# Enhance the theme for presentation
theme_minimal() +
# Adjust y-axis limit to accommodate the labels slightly above the bars
scale_y_continuous(expand = expansion(mult = c(0, 0.1)))
#Create the scatter plot of daily calories burned versus sedentary minutes with a trend line
combined_dailyActivity_fixed %>%
ggplot(aes(x = SedentaryMinutes, y = Calories)) +
# 1. Add points (scatter plot)
geom_point(alpha = 0.5, color = "tan") + # alpha makes overlapping points visible
# 2. Add a trend line (geom_smooth)
# 'method = "lm"' creates a linear regression line.
# 'se = FALSE' removes the shaded confidence interval.
geom_smooth(method = "lm", se = FALSE, color = "darkcyan", linetype = "dashed") +
# 3. Add Labels and Title
labs(
title = "Daily Calories Burned vs. Sedentary Minutes",
subtitle = "Analysis shows a slight positive relationship (or lack of correlation)",
x = "Sedentary Minutes (per Day)",
y = "Total Calories Burned (per Day)"
) +
# 4. Enhance the theme
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5)
)
## `geom_smooth()` using formula = 'y ~ x'
#Create the scatter plot of daily calories burned versus total steps with a trend line
calorie_totalSteps_plot <- combined_dailyActivity_fixed %>%
ggplot(aes(x = TotalSteps, y = Calories)) +
# 1. Add points (scatter plot)
geom_point(alpha = 0.5, color = "tan") + # alpha makes overlapping points visible
# 2. Add a trend line (geom_smooth)
# 'method = "lm"' creates a linear regression line.
# 'se = FALSE' removes the shaded confidence interval.
geom_smooth(method = "lm", se = FALSE, color = "darkcyan", linetype = "dashed") +
# 3. Add Labels and Title
labs(
title = "Daily Calories Burned vs. Total Steps",
subtitle = "Analysis shows a positive relationship",
x = "Total Steps (per Day)",
y = "Total Calories Burned (per Day)"
) +
# 4. Enhance the theme
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5)
)
print(calorie_totalSteps_plot)
## `geom_smooth()` using formula = 'y ~ x'
write_csv(combined_dailyActivity_fixed, "combined_dailyActivity_cleaned.csv")