1. Instilling packages
packages <- c(“tidyverse”, “here”, “janitor”, “lubridate”, “skimr”, “readr”, “dplyr”) invisible(lapply(packages, function(x) if (!require(x, character.only = TRUE)) install.packages(x)))
library(tidyverse) library(here) library(janitor) library(lubridate) library(skimr) library(readr) library(dplyr)
2. Import csv files
For analysis we use data for April and May 2016 because this period contains more parameters for assessing trends.
Upload zip file DATA with all 18 cvs files.
Сheck files list and save it in txt format
library(here)
## here() starts at /cloud/project
# Get a list of all files in the DATA folder
file_list <- list.files(path = here("DATA"), full.names = TRUE)
file_list <- file_list[file.exists(file_list)]
# Print the list of files on the screen
print(file_list)
## [1] "/cloud/project/DATA/dailyActivity_merged.csv"
## [2] "/cloud/project/DATA/dailyCalories_merged.csv"
## [3] "/cloud/project/DATA/dailyIntensities_merged.csv"
## [4] "/cloud/project/DATA/dailySteps_merged.csv"
## [5] "/cloud/project/DATA/heartrate_seconds_merged.csv"
## [6] "/cloud/project/DATA/hourlyCalories_merged.csv"
## [7] "/cloud/project/DATA/hourlyData_merged.csv"
## [8] "/cloud/project/DATA/hourlyIntensities_merged.csv"
## [9] "/cloud/project/DATA/hourlySteps_merged.csv"
## [10] "/cloud/project/DATA/minuteCaloriesNarrow_merged.csv"
## [11] "/cloud/project/DATA/minuteCaloriesWide_merged.csv"
## [12] "/cloud/project/DATA/minuteIntensitiesNarrow_merged.csv"
## [13] "/cloud/project/DATA/minuteIntensitiesWide_merged.csv"
## [14] "/cloud/project/DATA/minuteMETsNarrow_merged.csv"
## [15] "/cloud/project/DATA/minuteSleep_merged.csv"
## [16] "/cloud/project/DATA/minuteStepsNarrow_merged.csv"
## [17] "/cloud/project/DATA/minuteStepsWide_merged.csv"
## [18] "/cloud/project/DATA/sleepDay_merged.csv"
## [19] "/cloud/project/DATA/weightLogInfo_merged.csv"
# Save the list of files to a text file
writeLines(file_list, "file_list.txt")
# Load all CSV files into a list, skipping non-existent files
data_list <- lapply(file_list[file.exists(file_list)], read.csv)
# Assign names to data.frames based on file names
names(data_list) <- basename(file_list[file.exists(file_list)])
3. Extract and compare column names
# Extract column names for each file
column_names <- lapply(data_list, names)
# Print column names for each file
for (name in names(column_names)) {
cat("Columns in", name, ":\n")
print(column_names[[name]])
cat("\n")
}
## Columns in dailyActivity_merged.csv :
## [1] "Id" "ActivityDate"
## [3] "TotalSteps" "TotalDistance"
## [5] "TrackerDistance" "LoggedActivitiesDistance"
## [7] "VeryActiveDistance" "ModeratelyActiveDistance"
## [9] "LightActiveDistance" "SedentaryActiveDistance"
## [11] "VeryActiveMinutes" "FairlyActiveMinutes"
## [13] "LightlyActiveMinutes" "SedentaryMinutes"
## [15] "Calories"
##
## Columns in dailyCalories_merged.csv :
## [1] "Id" "ActivityDay" "Calories"
##
## Columns in dailyIntensities_merged.csv :
## [1] "Id" "ActivityDay"
## [3] "SedentaryMinutes" "LightlyActiveMinutes"
## [5] "FairlyActiveMinutes" "VeryActiveMinutes"
## [7] "SedentaryActiveDistance" "LightActiveDistance"
## [9] "ModeratelyActiveDistance" "VeryActiveDistance"
##
## Columns in dailySteps_merged.csv :
## [1] "Id" "ActivityDay" "StepTotal"
##
## Columns in heartrate_seconds_merged.csv :
## [1] "Id" "Time" "Value"
##
## Columns in hourlyCalories_merged.csv :
## [1] "Id" "ActivityHour" "Calories"
##
## Columns in hourlyData_merged.csv :
## [1] "Id" "ActivityHour" "Calories" "TotalIntensity"
## [5] "AverageIntensity" "StepTotal"
##
## Columns in hourlyIntensities_merged.csv :
## [1] "Id" "ActivityHour" "TotalIntensity" "AverageIntensity"
##
## Columns in hourlySteps_merged.csv :
## [1] "Id" "ActivityHour" "StepTotal"
##
## Columns in minuteCaloriesNarrow_merged.csv :
## [1] "Id" "ActivityMinute" "Calories"
##
## Columns in minuteCaloriesWide_merged.csv :
## [1] "Id" "ActivityHour" "Calories00" "Calories01" "Calories02"
## [6] "Calories03" "Calories04" "Calories05" "Calories06" "Calories07"
## [11] "Calories08" "Calories09" "Calories10" "Calories11" "Calories12"
## [16] "Calories13" "Calories14" "Calories15" "Calories16" "Calories17"
## [21] "Calories18" "Calories19" "Calories20" "Calories21" "Calories22"
## [26] "Calories23" "Calories24" "Calories25" "Calories26" "Calories27"
## [31] "Calories28" "Calories29" "Calories30" "Calories31" "Calories32"
## [36] "Calories33" "Calories34" "Calories35" "Calories36" "Calories37"
## [41] "Calories38" "Calories39" "Calories40" "Calories41" "Calories42"
## [46] "Calories43" "Calories44" "Calories45" "Calories46" "Calories47"
## [51] "Calories48" "Calories49" "Calories50" "Calories51" "Calories52"
## [56] "Calories53" "Calories54" "Calories55" "Calories56" "Calories57"
## [61] "Calories58" "Calories59"
##
## Columns in minuteIntensitiesNarrow_merged.csv :
## [1] "Id" "ActivityMinute" "Intensity"
##
## Columns in minuteIntensitiesWide_merged.csv :
## [1] "Id" "ActivityHour" "Intensity00" "Intensity01" "Intensity02"
## [6] "Intensity03" "Intensity04" "Intensity05" "Intensity06" "Intensity07"
## [11] "Intensity08" "Intensity09" "Intensity10" "Intensity11" "Intensity12"
## [16] "Intensity13" "Intensity14" "Intensity15" "Intensity16" "Intensity17"
## [21] "Intensity18" "Intensity19" "Intensity20" "Intensity21" "Intensity22"
## [26] "Intensity23" "Intensity24" "Intensity25" "Intensity26" "Intensity27"
## [31] "Intensity28" "Intensity29" "Intensity30" "Intensity31" "Intensity32"
## [36] "Intensity33" "Intensity34" "Intensity35" "Intensity36" "Intensity37"
## [41] "Intensity38" "Intensity39" "Intensity40" "Intensity41" "Intensity42"
## [46] "Intensity43" "Intensity44" "Intensity45" "Intensity46" "Intensity47"
## [51] "Intensity48" "Intensity49" "Intensity50" "Intensity51" "Intensity52"
## [56] "Intensity53" "Intensity54" "Intensity55" "Intensity56" "Intensity57"
## [61] "Intensity58" "Intensity59"
##
## Columns in minuteMETsNarrow_merged.csv :
## [1] "Id" "ActivityMinute" "METs"
##
## Columns in minuteSleep_merged.csv :
## [1] "Id" "date" "value" "logId"
##
## Columns in minuteStepsNarrow_merged.csv :
## [1] "Id" "ActivityMinute" "Steps"
##
## Columns in minuteStepsWide_merged.csv :
## [1] "Id" "ActivityHour" "Steps00" "Steps01" "Steps02"
## [6] "Steps03" "Steps04" "Steps05" "Steps06" "Steps07"
## [11] "Steps08" "Steps09" "Steps10" "Steps11" "Steps12"
## [16] "Steps13" "Steps14" "Steps15" "Steps16" "Steps17"
## [21] "Steps18" "Steps19" "Steps20" "Steps21" "Steps22"
## [26] "Steps23" "Steps24" "Steps25" "Steps26" "Steps27"
## [31] "Steps28" "Steps29" "Steps30" "Steps31" "Steps32"
## [36] "Steps33" "Steps34" "Steps35" "Steps36" "Steps37"
## [41] "Steps38" "Steps39" "Steps40" "Steps41" "Steps42"
## [46] "Steps43" "Steps44" "Steps45" "Steps46" "Steps47"
## [51] "Steps48" "Steps49" "Steps50" "Steps51" "Steps52"
## [56] "Steps53" "Steps54" "Steps55" "Steps56" "Steps57"
## [61] "Steps58" "Steps59"
##
## Columns in sleepDay_merged.csv :
## [1] "Id" "SleepDay" "TotalSleepRecords"
## [4] "TotalMinutesAsleep" "TotalTimeInBed"
##
## Columns in weightLogInfo_merged.csv :
## [1] "Id" "Date" "WeightKg" "WeightPounds"
## [5] "Fat" "BMI" "IsManualReport" "LogId"
4. Check and select files based on the number of unique IDs
# Compare column names across all files
common_columns <- Reduce(intersect, column_names)
cat("Common columns across all files:\n")
## Common columns across all files:
print(common_columns)
## [1] "Id"
For all files common column is Id what is unique and represent every
user.
The size of the sample could potentially introduce a sampling bias.
While a sample size of 33 remains valid, a larger sample size would
provide a better reflection of the overall population and consequently
enhance the confidence interval. To avoid bias, we check all files for
the number of Id and select only those that contain information about
more than 33 users.
# Function to count the number of unique IDs in a file
count_unique_ids <- function(file_path) {
if (file.exists(file_path)) {
print(paste("Processing:", file_path)) # Debug: Check the file being processed
data <- tryCatch(read_csv(file_path, col_types = cols_only(Id = col_character())),
error = function(e) {
print(paste("Error reading file:", file_path)) # Debug: Log errors
NULL
})
if (!is.null(data)) {
print(paste("Unique IDs in", file_path, ":", length(unique(data$Id)))) # Debug: Show count
return(length(unique(data$Id)))
}
}
return(0)
}
# Filter files that have more than 33 unique IDs
files_with_enough_ids <- file_list[sapply(file_list, count_unique_ids) >= 33]
## [1] "Processing: /cloud/project/DATA/dailyActivity_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/dailyActivity_merged.csv"
## [1] "Processing: /cloud/project/DATA/dailyCalories_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/dailyCalories_merged.csv"
## [1] "Processing: /cloud/project/DATA/dailyIntensities_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/dailyIntensities_merged.csv"
## [1] "Processing: /cloud/project/DATA/dailySteps_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/dailySteps_merged.csv"
## [1] "Processing: /cloud/project/DATA/heartrate_seconds_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/heartrate_seconds_merged.csv"
## [1] "Processing: /cloud/project/DATA/hourlyCalories_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/hourlyCalories_merged.csv"
## [1] "Processing: /cloud/project/DATA/hourlyData_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/hourlyData_merged.csv"
## [1] "Processing: /cloud/project/DATA/hourlyIntensities_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/hourlyIntensities_merged.csv"
## [1] "Processing: /cloud/project/DATA/hourlySteps_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/hourlySteps_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteCaloriesNarrow_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteCaloriesNarrow_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteCaloriesWide_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteCaloriesWide_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteIntensitiesNarrow_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteIntensitiesNarrow_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteIntensitiesWide_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteIntensitiesWide_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteMETsNarrow_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteMETsNarrow_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteSleep_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteSleep_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteStepsNarrow_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteStepsNarrow_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteStepsWide_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteStepsWide_merged.csv"
## [1] "Processing: /cloud/project/DATA/sleepDay_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/sleepDay_merged.csv"
## [1] "Processing: /cloud/project/DATA/weightLogInfo_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/weightLogInfo_merged.csv"
# Print the names of files that meet the condition
print(files_with_enough_ids)
## character(0)
5. Merge the datasets containing time-related information
After examining the column names, merging the three datasets containing time-related information into a single dataset using the “Id” and “ActivityHour” columns.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load hourly data from three CSV files
hourly_calories <- read.csv(here("DATA", "hourlyCalories_merged.csv")) # Data on hourly calorie burn
hourly_intensities <- read.csv(here("DATA", "hourlyIntensities_merged.csv")) # Data on hourly activity intensities
hourly_steps <- read.csv(here("DATA", "hourlySteps_merged.csv")) # Data on hourly step counts
# Merge the datasets by 'Id' and 'ActivityHour' columns
# The result will include only rows with matching 'Id' and 'ActivityHour' across all three datasets
# Merge with hourly intensities
merged_hourly_data <- hourly_calories |>
inner_join(hourly_intensities, by = c("Id", "ActivityHour")) |>
# Merge with hourly steps
inner_join(hourly_steps, by = c("Id", "ActivityHour"))
# Save the merged dataset as a new CSV file
write.csv(merged_hourly_data, here("DATA", "hourlyData_merged.csv"), row.names = FALSE) # Save without row numbers
# Print a message indicating successful merging
cat("Data merged successfully into 'hourlyData_merged.csv'")
## Data merged successfully into 'hourlyData_merged.csv'
7. Convert the various dates and times in ‘hourlyData_merged’ to the correct format and add DayOfWeek column
# Load necessary libraries
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)
library(here)
# Load the raw merged hourly data from the CSV file
hourly_data_raw <- read.csv(here("DATA", "hourlyData_merged.csv"))
# Convert the 'ActivityHour' column from a character string to a datetime object
# Using the 'mdy_hms' function from the 'lubridate' package to parse dates in "Month-Day-Year Hour:Minute:Second" format
hourly_data_raw$ActivityHour <- mdy_hms(hourly_data_raw$ActivityHour)
# Extract the day of the week from the 'ActivityHour' datetime column
# 'label = TRUE' ensures the result is a factor with day names, and 'abbr = FALSE' ensures full names are used
hourly_data_raw$DayOfWeek <- wday(hourly_data_raw$ActivityHour, label = TRUE, abbr = FALSE, week_start = 1)
# Display the first few rows of the processed data to verify the transformations
head(hourly_data_raw)
## Id ActivityHour Calories TotalIntensity AverageIntensity
## 1 1503960366 2016-04-12 00:00:00 81 20 0.333333
## 2 1503960366 2016-04-12 01:00:00 61 8 0.133333
## 3 1503960366 2016-04-12 02:00:00 59 7 0.116667
## 4 1503960366 2016-04-12 03:00:00 47 0 0.000000
## 5 1503960366 2016-04-12 04:00:00 48 0 0.000000
## 6 1503960366 2016-04-12 05:00:00 48 0 0.000000
## StepTotal DayOfWeek
## 1 373 Tuesday
## 2 160 Tuesday
## 3 151 Tuesday
## 4 0 Tuesday
## 5 0 Tuesday
## 6 0 Tuesday
8. Read and сheck files with daily data
# Reading the files
daily_activity <- read.csv("DATA/dailyActivity_merged.csv")
daily_calories <- read.csv("DATA/dailyCalories_merged.csv")
daily_intensities <- read.csv("DATA/dailyIntensities_merged.csv")
daily_steps <- read.csv("DATA/dailySteps_merged.csv")
# Display the first 5 rows of each file
head(daily_activity, 5)
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366 4/12/2016 13162 8.50 8.50
## 2 1503960366 4/13/2016 10735 6.97 6.97
## 3 1503960366 4/14/2016 10460 6.74 6.74
## 4 1503960366 4/15/2016 9762 6.28 6.28
## 5 1503960366 4/16/2016 12669 8.16 8.16
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1 0 1.88 0.55
## 2 0 1.57 0.69
## 3 0 2.44 0.40
## 4 0 2.14 1.26
## 5 0 2.71 0.41
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1 6.06 0 25
## 2 4.71 0 21
## 3 3.91 0 30
## 4 2.83 0 29
## 5 5.04 0 36
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1 13 328 728 1985
## 2 19 217 776 1797
## 3 11 181 1218 1776
## 4 34 209 726 1745
## 5 10 221 773 1863
head(daily_calories, 5)
## Id ActivityDay Calories
## 1 1503960366 4/12/2016 1985
## 2 1503960366 4/13/2016 1797
## 3 1503960366 4/14/2016 1776
## 4 1503960366 4/15/2016 1745
## 5 1503960366 4/16/2016 1863
head(daily_intensities, 5)
## Id ActivityDay SedentaryMinutes LightlyActiveMinutes
## 1 1503960366 4/12/2016 728 328
## 2 1503960366 4/13/2016 776 217
## 3 1503960366 4/14/2016 1218 181
## 4 1503960366 4/15/2016 726 209
## 5 1503960366 4/16/2016 773 221
## FairlyActiveMinutes VeryActiveMinutes SedentaryActiveDistance
## 1 13 25 0
## 2 19 21 0
## 3 11 30 0
## 4 34 29 0
## 5 10 36 0
## LightActiveDistance ModeratelyActiveDistance VeryActiveDistance
## 1 6.06 0.55 1.88
## 2 4.71 0.69 1.57
## 3 3.91 0.40 2.44
## 4 2.83 1.26 2.14
## 5 5.04 0.41 2.71
head(daily_steps, 5)
## Id ActivityDay StepTotal
## 1 1503960366 4/12/2016 13162
## 2 1503960366 4/13/2016 10735
## 3 1503960366 4/14/2016 10460
## 4 1503960366 4/15/2016 9762
## 5 1503960366 4/16/2016 12669
# View the column names for each file
colnames(daily_activity)
## [1] "Id" "ActivityDate"
## [3] "TotalSteps" "TotalDistance"
## [5] "TrackerDistance" "LoggedActivitiesDistance"
## [7] "VeryActiveDistance" "ModeratelyActiveDistance"
## [9] "LightActiveDistance" "SedentaryActiveDistance"
## [11] "VeryActiveMinutes" "FairlyActiveMinutes"
## [13] "LightlyActiveMinutes" "SedentaryMinutes"
## [15] "Calories"
colnames(daily_calories)
## [1] "Id" "ActivityDay" "Calories"
colnames(daily_intensities)
## [1] "Id" "ActivityDay"
## [3] "SedentaryMinutes" "LightlyActiveMinutes"
## [5] "FairlyActiveMinutes" "VeryActiveMinutes"
## [7] "SedentaryActiveDistance" "LightActiveDistance"
## [9] "ModeratelyActiveDistance" "VeryActiveDistance"
colnames(daily_steps)
## [1] "Id" "ActivityDay" "StepTotal"
‘daily_activity’ has all columns from all file data, so chose it for father analyse.
9. Convert ActivityDate to Date format Add the DayOfWeek column
library(lubridate)
# Convert ActivityDate to Date format if necessary
daily_activity$ActivityDate <- as.Date(daily_activity$ActivityDate, format = "%m/%d/%Y")
# Add the DayOfWeek column as numeric (1=Sunday, 2=Monday, ..., 7=Saturday)
daily_activity$DayOfWeek <- wday(daily_activity$ActivityDate, label = FALSE, week_start = 1)
# Convert DayOfWeek to text (1=Sunday, ..., 7=Saturday)
days <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
daily_activity$DayOfWeekText <- days[daily_activity$DayOfWeek]
# View the first 5 rows to confirm the change
head(daily_activity, 5)
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366 2016-04-12 13162 8.50 8.50
## 2 1503960366 2016-04-13 10735 6.97 6.97
## 3 1503960366 2016-04-14 10460 6.74 6.74
## 4 1503960366 2016-04-15 9762 6.28 6.28
## 5 1503960366 2016-04-16 12669 8.16 8.16
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1 0 1.88 0.55
## 2 0 1.57 0.69
## 3 0 2.44 0.40
## 4 0 2.14 1.26
## 5 0 2.71 0.41
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1 6.06 0 25
## 2 4.71 0 21
## 3 3.91 0 30
## 4 2.83 0 29
## 5 5.04 0 36
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories DayOfWeek
## 1 13 328 728 1985 2
## 2 19 217 776 1797 3
## 3 11 181 1218 1776 4
## 4 34 209 726 1745 5
## 5 10 221 773 1863 6
## DayOfWeekText
## 1 Monday
## 2 Tuesday
## 3 Wednesday
## 4 Thursday
## 5 Friday
10. Clean data ‘hourly_data_row’ and ‘daily_activity’
# Check for missing values in the entire dataset
sum(is.na(daily_activity)) # For daily_activity
## [1] 0
sum(is.na(hourly_data_raw)) # For hourly_data_raw
## [1] 0
# Check for missing values per column
colSums(is.na(daily_activity))
## Id ActivityDate TotalSteps
## 0 0 0
## TotalDistance TrackerDistance LoggedActivitiesDistance
## 0 0 0
## VeryActiveDistance ModeratelyActiveDistance LightActiveDistance
## 0 0 0
## SedentaryActiveDistance VeryActiveMinutes FairlyActiveMinutes
## 0 0 0
## LightlyActiveMinutes SedentaryMinutes Calories
## 0 0 0
## DayOfWeek DayOfWeekText
## 0 0
colSums(is.na(hourly_data_raw))
## Id ActivityHour Calories TotalIntensity
## 0 0 0 0
## AverageIntensity StepTotal DayOfWeek
## 0 0 0
# Show rows with missing values
daily_activity[!complete.cases(daily_activity), ]
## [1] Id ActivityDate TotalSteps
## [4] TotalDistance TrackerDistance LoggedActivitiesDistance
## [7] VeryActiveDistance ModeratelyActiveDistance LightActiveDistance
## [10] SedentaryActiveDistance VeryActiveMinutes FairlyActiveMinutes
## [13] LightlyActiveMinutes SedentaryMinutes Calories
## [16] DayOfWeek DayOfWeekText
## <0 rows> (or 0-length row.names)
hourly_data_raw[!complete.cases(hourly_data_raw), ]
## [1] Id ActivityHour Calories TotalIntensity
## [5] AverageIntensity StepTotal DayOfWeek
## <0 rows> (or 0-length row.names)
# Remove rows with missing values
daily_activity_clean <- na.omit(daily_activity)
hourly_data_clean <- na.omit(hourly_data_raw)
# Replace missing values with 0 (or another value)
daily_activity[is.na(daily_activity)] <- 0
hourly_data_raw[is.na(hourly_data_raw)] <- 0
# Find duplicates
sum(duplicated(daily_activity)) # For daily_activity
## [1] 0
sum(duplicated(hourly_data_raw)) # For hourly_data_raw
## [1] 0
# View duplicate rows
daily_activity[duplicated(daily_activity), ]
## [1] Id ActivityDate TotalSteps
## [4] TotalDistance TrackerDistance LoggedActivitiesDistance
## [7] VeryActiveDistance ModeratelyActiveDistance LightActiveDistance
## [10] SedentaryActiveDistance VeryActiveMinutes FairlyActiveMinutes
## [13] LightlyActiveMinutes SedentaryMinutes Calories
## [16] DayOfWeek DayOfWeekText
## <0 rows> (or 0-length row.names)
hourly_data_raw[duplicated(hourly_data_raw), ]
## [1] Id ActivityHour Calories TotalIntensity
## [5] AverageIntensity StepTotal DayOfWeek
## <0 rows> (or 0-length row.names)
# Remove duplicate rows
daily_activity <- daily_activity[!duplicated(daily_activity), ]
hourly_data_raw <- hourly_data_raw[!duplicated(hourly_data_raw), ]
# Check the structure of the dataset
str(daily_activity)
## 'data.frame': 940 obs. of 17 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate : Date, format: "2016-04-12" "2016-04-13" ...
## $ TotalSteps : int 13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
## $ TotalDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ TrackerDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.44 2.14 2.71 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 0.4 1.26 0.41 ...
## $ LightActiveDistance : num 6.06 4.71 3.91 2.83 5.04 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : int 25 21 30 29 36 38 42 50 28 19 ...
## $ FairlyActiveMinutes : int 13 19 11 34 10 20 16 31 12 8 ...
## $ LightlyActiveMinutes : int 328 217 181 209 221 164 233 264 205 211 ...
## $ SedentaryMinutes : int 728 776 1218 726 773 539 1149 775 818 838 ...
## $ Calories : int 1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
## $ DayOfWeek : num 2 3 4 5 6 7 1 2 3 4 ...
## $ DayOfWeekText : chr "Monday" "Tuesday" "Wednesday" "Thursday" ...
str(hourly_data_raw)
## 'data.frame': 22099 obs. of 7 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityHour : POSIXct, format: "2016-04-12 00:00:00" "2016-04-12 01:00:00" ...
## $ Calories : int 81 61 59 47 48 48 48 47 68 141 ...
## $ TotalIntensity : int 20 8 7 0 0 0 0 0 13 30 ...
## $ AverageIntensity: num 0.333 0.133 0.117 0 0 ...
## $ StepTotal : int 373 160 151 0 0 0 0 0 250 1864 ...
## $ DayOfWeek : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 2 2 2 2 2 2 2 2 2 2 ...
# Convert columns to correct data types (example for date format)
daily_activity$ActivityDate <- as.Date(daily_activity$ActivityDate, format = "%m/%d/%Y")
hourly_data_raw$ActivityHour <- as.POSIXct(hourly_data_raw$ActivityHour, format = "%m/%d/%Y %I:%M:%S %p", tz = "UTC")
# Summary statistics to detect outliers
summary(daily_activity)
## Id ActivityDate TotalSteps TotalDistance
## Min. :1.504e+09 Min. :2016-04-12 Min. : 0 Min. : 0.000
## 1st Qu.:2.320e+09 1st Qu.:2016-04-19 1st Qu.: 3790 1st Qu.: 2.620
## Median :4.445e+09 Median :2016-04-26 Median : 7406 Median : 5.245
## Mean :4.855e+09 Mean :2016-04-26 Mean : 7638 Mean : 5.490
## 3rd Qu.:6.962e+09 3rd Qu.:2016-05-04 3rd Qu.:10727 3rd Qu.: 7.713
## Max. :8.878e+09 Max. :2016-05-12 Max. :36019 Max. :28.030
## TrackerDistance LoggedActivitiesDistance VeryActiveDistance
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 2.620 1st Qu.:0.0000 1st Qu.: 0.000
## Median : 5.245 Median :0.0000 Median : 0.210
## Mean : 5.475 Mean :0.1082 Mean : 1.503
## 3rd Qu.: 7.710 3rd Qu.:0.0000 3rd Qu.: 2.053
## Max. :28.030 Max. :4.9421 Max. :21.920
## ModeratelyActiveDistance LightActiveDistance SedentaryActiveDistance
## Min. :0.0000 Min. : 0.000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.: 1.945 1st Qu.:0.000000
## Median :0.2400 Median : 3.365 Median :0.000000
## Mean :0.5675 Mean : 3.341 Mean :0.001606
## 3rd Qu.:0.8000 3rd Qu.: 4.782 3rd Qu.:0.000000
## Max. :6.4800 Max. :10.710 Max. :0.110000
## VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:127.0 1st Qu.: 729.8
## Median : 4.00 Median : 6.00 Median :199.0 Median :1057.5
## Mean : 21.16 Mean : 13.56 Mean :192.8 Mean : 991.2
## 3rd Qu.: 32.00 3rd Qu.: 19.00 3rd Qu.:264.0 3rd Qu.:1229.5
## Max. :210.00 Max. :143.00 Max. :518.0 Max. :1440.0
## Calories DayOfWeek DayOfWeekText
## Min. : 0 Min. :1.000 Length:940
## 1st Qu.:1828 1st Qu.:2.000 Class :character
## Median :2134 Median :4.000 Mode :character
## Mean :2304 Mean :3.918
## 3rd Qu.:2793 3rd Qu.:6.000
## Max. :4900 Max. :7.000
summary(hourly_data_raw)
## Id ActivityHour Calories
## Min. :1.504e+09 Min. :2016-04-12 00:00:00.00 Min. : 42.00
## 1st Qu.:2.320e+09 1st Qu.:2016-04-19 01:00:00.00 1st Qu.: 63.00
## Median :4.445e+09 Median :2016-04-26 06:00:00.00 Median : 83.00
## Mean :4.848e+09 Mean :2016-04-26 11:46:42.58 Mean : 97.39
## 3rd Qu.:6.962e+09 3rd Qu.:2016-05-03 19:00:00.00 3rd Qu.:108.00
## Max. :8.878e+09 Max. :2016-05-12 15:00:00.00 Max. :948.00
##
## TotalIntensity AverageIntensity StepTotal DayOfWeek
## Min. : 0.00 Min. :0.0000 Min. : 0.0 Monday :2861
## 1st Qu.: 0.00 1st Qu.:0.0000 1st Qu.: 0.0 Tuesday :3600
## Median : 3.00 Median :0.0500 Median : 40.0 Wednesday:3547
## Mean : 12.04 Mean :0.2006 Mean : 320.2 Thursday :3283
## 3rd Qu.: 16.00 3rd Qu.:0.2667 3rd Qu.: 357.0 Friday :2997
## Max. :180.00 Max. :3.0000 Max. :10554.0 Saturday :2915
## Sunday :2896
# Boxplot to visualize outliers
boxplot(daily_activity$Calories, main = "Calories in daily_activity")
boxplot(hourly_data_raw$Calories, main = "Calories in hourly_data_raw")
# Convert columns to numeric if necessary
daily_activity$Calories <- as.numeric(daily_activity$Calories)
hourly_data_raw$Calories <- as.numeric(hourly_data_raw$Calories)
# Handle non-numeric values (e.g., replace non-numeric values with NA)
daily_activity$Calories[is.na(daily_activity$Calories)] <- 0
hourly_data_raw$Calories[is.na(hourly_data_raw$Calories)] <- 0
# Convert to Date format
daily_activity$ActivityDate <- as.Date(daily_activity$ActivityDate, format = "%m/%d/%Y")
hourly_data_raw$ActivityHour <- as.POSIXct(hourly_data_raw$ActivityHour, format = "%m/%d/%Y %I:%M:%S %p")
# Check for zero or negative values in specific columns
sum(daily_activity$Calories <= 0)
## [1] 4
sum(hourly_data_raw$Calories <= 0)
## [1] 0
check 4 rows where colories <= 0
# Filter rows where Calories <= 0
invalid_calories <- daily_activity[daily_activity$Calories <= 0, ]
# View the filtered rows
print(invalid_calories)
## Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 31 1503960366 2016-05-12 0 0 0
## 654 6290855005 2016-05-10 0 0 0
## 818 8253242879 2016-04-30 0 0 0
## 880 8583815059 2016-05-12 0 0 0
## LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 31 0 0 0
## 654 0 0 0
## 818 0 0 0
## 880 0 0 0
## LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 31 0 0 0
## 654 0 0 0
## 818 0 0 0
## 880 0 0 0
## FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 31 0 0 1440 0
## 654 0 0 1440 0
## 818 0 0 1440 0
## 880 0 0 1440 0
## DayOfWeek DayOfWeekText
## 31 4 Wednesday
## 654 2 Monday
## 818 6 Friday
## 880 4 Wednesday
we see any activities in other columns, so it should be removed.
daily_activity <- daily_activity[daily_activity$Calories > 0, ]
11. Rename the dataframe from hourly_data_raw to hourly_activity
# Rename the dataframe from hourly_data_raw to hourly_activity
hourly_activity <- hourly_data_raw
12. Export the data to CSV format for use in Tableau
write.csv(daily_activity, "daily_activity_cleaned.csv", row.names = FALSE)
write.csv(hourly_activity, "hourly_activity_cleaned.csv", row.names = FALSE)