# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(here)
## here() starts at D:/Bellabeat-data-to-clean
library(ggplot2)
library(knitr)
library(pander)
We start by loading the required libraries: dplyr and tidyr for data manipulation; janitor for cleaning column names; skimr for summary statistics; stringr for string manipulation; lubridate for working with dates; here for locating the project directory; ggplot2 for data visualization; and knitr and pander for reporting and displaying results.
# Set the working directory to the folder containing CSV files.
# NOTE(review): an absolute setwd() ties the script to one machine; it is kept
# because the relative paths used further down depend on it.
setwd("D:\\Bellabeat-data-to-clean")
# List all CSV files in the directory (base names only, not recursive)
file_list <- list.files(pattern = "\\.csv$")
# Fail early with a clear message: with no input files the processing loop
# would do nothing and later steps would fail because `cleaned_data` is
# never created.
if (length(file_list) == 0) {
  stop("No CSV files found in ", getwd(), call. = FALSE)
}
In this section, we set the working directory to where the data files are located and list all CSV files in that directory.
# Folder (relative to the working directory) that receives the cleaned files
cleaned_folder <- "cleaned_files"
# Create it only when it is not already there
folder_missing <- !dir.exists(cleaned_folder)
if (folder_missing) {
  dir.create(cleaned_folder)
}
Here, we create a new directory named cleaned_files, unless it
already exists, to store the cleaned dataset(s).
# Function to clean and analyze the data
#
# Arguments:
#   data       A data frame, e.g. the contents of one CSV file.
#   file_name  Optional label (such as the source file name); used only in
#              the missing-data report.
#
# Returns the cleaned data frame: rows containing any NA removed, duplicate
# rows removed, character values trimmed, numeric values rounded to two
# decimals, and column names standardised with janitor::clean_names().
#
# Side effects: prints a missing-data report (when the input has NAs) and a
# skimr summary of the cleaned data.
clean_data <- function(data, file_name) {
  # Count missing values per column BEFORE incomplete rows are dropped.
  # Counting afterwards always yields zero, so the report could never run.
  missing_data_summary <- colSums(is.na(data))

  # Remove rows that contain a missing value in any column
  data <- data %>%
    filter(complete.cases(.))

  # Remove duplicate rows
  data <- data %>% distinct()

  # Trim whitespaces around entries in character cells
  data <- data %>%
    mutate(across(where(is.character), ~str_trim(.)))

  # Round numeric columns to 2 decimal places
  data <- data %>%
    mutate(across(where(is.numeric), ~round(., 2)))

  # Analyze missing data and suggest filling methods.
  # message() accepts several pieces of text; print() does not (its second
  # argument is `digits`), so the original multi-argument print() would fail.
  if (any(missing_data_summary > 0)) {
    source_label <- if (missing(file_name)) "the data" else file_name
    columns_with_na <- names(missing_data_summary)[missing_data_summary > 0]
    message(
      "Columns with missing data in ", source_label, ": ",
      paste(columns_with_na, collapse = ", ")
    )
    message("Suggested methods to fill missing data depend on column types, e.g.:")
    message(" - Numeric columns: Mean/Median/Mode")
    message(" - Categorical columns: Mode or most frequent value")
    message(" - Datetime columns: Impute based on trends")
  }

  # Data summary: print explicitly, because a value computed in the middle of
  # a function body is discarded rather than auto-printed.
  print(skim_without_charts(data))

  # Clean column names and return the result
  clean_names(data)
}
This function performs several steps to clean the data:
1. Remove rows that contain any missing value using complete.cases().
2. Remove duplicate rows with distinct().
3. Trim whitespace in character columns.
4. Round numeric columns to two decimal places.
5. Report columns with missing data and suggest methods for imputation.
6. Generate a summary of the data using skimr.
7. Clean column names using clean_names() from the janitor package.
# Loop through each file: read, clean, save, and summarise.
# After the loop, `cleaned_data` holds the cleaned contents of the last file
# processed; the analysis further down works on that object.

# Columns the activity summary needs. Files without them are still cleaned
# and saved, but the summary is skipped instead of stopping the whole loop.
summary_columns <- c("total_steps", "calories", "very_active_minutes")

for (file in file_list) {
  # Read the CSV file
  data <- read.csv(file)

  # Clean the data
  cleaned_data <- clean_data(data, file)

  # Save the cleaned data (paste() inserts a space: "cleaned <file>")
  cleaned_file_path <- file.path(cleaned_folder, paste("cleaned", file))
  write.csv(cleaned_data, cleaned_file_path, row.names = FALSE)

  # Summary statistics, only when the file has the activity columns
  absent_columns <- setdiff(summary_columns, names(cleaned_data))
  if (length(absent_columns) > 0) {
    message(
      "Skipping summary for ", file, "; missing columns: ",
      paste(absent_columns, collapse = ", ")
    )
    next
  }

  summary_stats <- cleaned_data %>%
    summarize(
      avg_steps = mean(total_steps),
      avg_calories = mean(calories),
      avg_very_active_minutes = mean(very_active_minutes)
    )
  #pander(summary_stats, caption = "Summary Statistics")
  print(summary_stats)
}
## avg_steps avg_calories avg_very_active_minutes
## 1 7637.911 2303.61 21.16489
# Visualizing smoothed daily total steps over time
#
# activity_date is stored as text in month/day/year form (e.g. "4/16/2016"),
# so the format is given explicitly. as.Date() without a format only tries
# year-first layouts and would mis-parse these values or turn them into NA.
#
# Warnings are deliberately not suppressed, so that any rows ggplot2 drops
# (for example because a date failed to parse) stay visible.
ggplot(
  cleaned_data,
  aes(x = as.Date(activity_date, format = "%m/%d/%Y"), y = total_steps)
) +
  geom_point(alpha = 0.5, color = "blue") +
  geom_smooth(method = "loess", color = "red", se = FALSE) +
  labs(
    title = "Smoothed Daily Total Steps Over Time",
    x = "Date",
    y = "Total Steps"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The loop iterates over each CSV file, reads the data, cleans it
using the clean_data() function, saves the cleaned data
in a new folder, and prints summary statistics. The plot that
follows the loop shows daily total steps over time for the last file processed.
# Correlation between total steps and calories, computed only on rows where
# both values are present
correlation <- cor(
  x = cleaned_data$total_steps,
  y = cleaned_data$calories,
  use = "complete.obs"
)
# Report the coefficient rounded to two decimals
correlation_label <- paste(
  "Correlation between Total Steps and Calories:",
  round(correlation, 2)
)
print(correlation_label)
## [1] "Correlation between Total Steps and Calories: 0.59"
In this section, we calculate the correlation
between total steps and calories burned using the cor()
function, and display the result.
# Calories against total steps: semi-transparent points plus a fitted linear
# trend line with its confidence band
steps_calories_plot <- ggplot(
  data = cleaned_data,
  mapping = aes(x = total_steps, y = calories)
) +
  geom_point(color = "blue", alpha = 0.5) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  theme_minimal() +
  labs(
    title = "Relationship between Total Steps and Calories",
    x = "Total Steps",
    y = "Calories Burned"
  )
# Display the plot
steps_calories_plot
## `geom_smooth()` using formula = 'y ~ x'
This plot visualizes the relationship between total steps and calories burned with a scatter plot and a linear trend line.
# Classify each daily record by its step count.
# Thresholds: more than 10000 steps is "Highly Active", 5000 to 10000
# (inclusive) is "Moderately Active", anything lower is "Sedentary".
# The result is stored as a factor with levels ordered from least to most
# active, so plots show the categories in that order instead of alphabetically.
activity_levels <- c("Sedentary", "Moderately Active", "Highly Active")
cleaned_data <- cleaned_data %>%
  mutate(
    activity_level = case_when(
      total_steps > 10000 ~ "Highly Active",
      total_steps >= 5000 ~ "Moderately Active",
      TRUE ~ "Sedentary"
    ),
    activity_level = factor(activity_level, levels = activity_levels)
  )

# Bar chart of activity levels.
# Each row of cleaned_data is one daily record (there are many rows per
# date), so the bars count daily records, not distinct users; the labels say so.
ggplot(cleaned_data, aes(x = activity_level, fill = activity_level)) +
  geom_bar() +
  labs(
    title = "Distribution of Daily Records by Activity Level",
    x = "Activity Level",
    y = "Number of Daily Records"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("Highly Active" = "green",
                               "Moderately Active" = "orange",
                               "Sedentary" = "red")) +
  theme(legend.position = "none")
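In this section, we categorize each daily activity record by its total steps into three activity levels (more than 10,000 steps: Highly Active; 5,000 to 10,000 steps: Moderately Active; fewer than 5,000 steps: Sedentary) and visualize the distribution with a bar chart.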
# Total steps recorded on each date, busiest dates first
steps_by_date <- cleaned_data %>%
  group_by(activity_date) %>%
  summarize(daily_steps = sum(total_steps))
arrange(steps_by_date, desc(daily_steps))
## # A tibble: 31 × 2
## activity_date daily_steps
## <chr> <dbl>
## 1 4/16/2016 277733
## 2 4/12/2016 271816
## 3 4/23/2016 267124
## 4 4/21/2016 263795
## 5 4/20/2016 261215
## 6 4/30/2016 258726
## 7 4/27/2016 258516
## 8 4/19/2016 257557
## 9 4/14/2016 255538
## 10 4/25/2016 253849
## # ℹ 21 more rows
Here, we summarize the total steps for each day and identify the peak activity periods.
# Distribution of calories burned within each activity level, as box plots
calories_boxplot <- ggplot(
  data = cleaned_data,
  mapping = aes(x = activity_level, y = calories)
) +
  geom_boxplot() +
  labs(title = "Calories Burned by Activity Level")
# Display the plot
calories_boxplot
This box plot visualizes how calories burned vary by activity level.
A moderate positive correlation (about 0.59) between total steps and calories burned indicates that users who take more steps tend to burn more calories.
Visualizing total steps over time reveals peak activity periods, which can guide Bellabeat in sending targeted notifications and creating promotions.
The analysis suggests that the more active a user is, the more calories they tend to burn.
By analyzing peak activity periods, Bellabeat can identify when users are most active and can tailor engagement efforts accordingly.
The insights derived from the analysis suggest that Bellabeat can enhance its marketing strategy by targeting users based on their activity levels, motivating them with step and calorie-based goals, and engaging them during peak activity times. Additionally, improving the app’s features to provide personalized and actionable insights will encourage users to increase their activity levels and achieve their fitness goals.