Procesing

1. Instilling packages

packages <- c(“tidyverse”, “here”, “janitor”, “lubridate”, “skimr”, “readr”, “dplyr”) invisible(lapply(packages, function(x) if (!require(x, character.only = TRUE)) install.packages(x)))

library(tidyverse) library(here) library(janitor) library(lubridate) library(skimr) library(readr) library(dplyr)

2. Import csv files

For analysis we use data for April and May 2016 because this period contains more parameters for assessing trends.

Upload zip file DATA with all 18 cvs files.

Сheck files list and save it in txt format

library(here)  
## here() starts at /cloud/project
# Get a list of all files in the DATA folder
file_list <- list.files(path = here("DATA"), full.names = TRUE)
file_list <- file_list[file.exists(file_list)]

# Print the list of files on the screen
print(file_list)
##  [1] "/cloud/project/DATA/dailyActivity_merged.csv"          
##  [2] "/cloud/project/DATA/dailyCalories_merged.csv"          
##  [3] "/cloud/project/DATA/dailyIntensities_merged.csv"       
##  [4] "/cloud/project/DATA/dailySteps_merged.csv"             
##  [5] "/cloud/project/DATA/heartrate_seconds_merged.csv"      
##  [6] "/cloud/project/DATA/hourlyCalories_merged.csv"         
##  [7] "/cloud/project/DATA/hourlyData_merged.csv"             
##  [8] "/cloud/project/DATA/hourlyIntensities_merged.csv"      
##  [9] "/cloud/project/DATA/hourlySteps_merged.csv"            
## [10] "/cloud/project/DATA/minuteCaloriesNarrow_merged.csv"   
## [11] "/cloud/project/DATA/minuteCaloriesWide_merged.csv"     
## [12] "/cloud/project/DATA/minuteIntensitiesNarrow_merged.csv"
## [13] "/cloud/project/DATA/minuteIntensitiesWide_merged.csv"  
## [14] "/cloud/project/DATA/minuteMETsNarrow_merged.csv"       
## [15] "/cloud/project/DATA/minuteSleep_merged.csv"            
## [16] "/cloud/project/DATA/minuteStepsNarrow_merged.csv"      
## [17] "/cloud/project/DATA/minuteStepsWide_merged.csv"        
## [18] "/cloud/project/DATA/sleepDay_merged.csv"               
## [19] "/cloud/project/DATA/weightLogInfo_merged.csv"
# Save the list of files to a text file
writeLines(file_list, "file_list.txt")

# Load all CSV files into a list, skipping non-existent files
data_list <- lapply(file_list[file.exists(file_list)], read.csv)

# Assign names to data.frames based on file names
names(data_list) <- basename(file_list[file.exists(file_list)])

3. Extract and compare column names

# Extract column names for each file
column_names <- lapply(data_list, names)

# Print column names for each file
for (name in names(column_names)) {
  cat("Columns in", name, ":\n")
  print(column_names[[name]])
  cat("\n")
}
## Columns in dailyActivity_merged.csv :
##  [1] "Id"                       "ActivityDate"            
##  [3] "TotalSteps"               "TotalDistance"           
##  [5] "TrackerDistance"          "LoggedActivitiesDistance"
##  [7] "VeryActiveDistance"       "ModeratelyActiveDistance"
##  [9] "LightActiveDistance"      "SedentaryActiveDistance" 
## [11] "VeryActiveMinutes"        "FairlyActiveMinutes"     
## [13] "LightlyActiveMinutes"     "SedentaryMinutes"        
## [15] "Calories"                
## 
## Columns in dailyCalories_merged.csv :
## [1] "Id"          "ActivityDay" "Calories"   
## 
## Columns in dailyIntensities_merged.csv :
##  [1] "Id"                       "ActivityDay"             
##  [3] "SedentaryMinutes"         "LightlyActiveMinutes"    
##  [5] "FairlyActiveMinutes"      "VeryActiveMinutes"       
##  [7] "SedentaryActiveDistance"  "LightActiveDistance"     
##  [9] "ModeratelyActiveDistance" "VeryActiveDistance"      
## 
## Columns in dailySteps_merged.csv :
## [1] "Id"          "ActivityDay" "StepTotal"  
## 
## Columns in heartrate_seconds_merged.csv :
## [1] "Id"    "Time"  "Value"
## 
## Columns in hourlyCalories_merged.csv :
## [1] "Id"           "ActivityHour" "Calories"    
## 
## Columns in hourlyData_merged.csv :
## [1] "Id"               "ActivityHour"     "Calories"         "TotalIntensity"  
## [5] "AverageIntensity" "StepTotal"       
## 
## Columns in hourlyIntensities_merged.csv :
## [1] "Id"               "ActivityHour"     "TotalIntensity"   "AverageIntensity"
## 
## Columns in hourlySteps_merged.csv :
## [1] "Id"           "ActivityHour" "StepTotal"   
## 
## Columns in minuteCaloriesNarrow_merged.csv :
## [1] "Id"             "ActivityMinute" "Calories"      
## 
## Columns in minuteCaloriesWide_merged.csv :
##  [1] "Id"           "ActivityHour" "Calories00"   "Calories01"   "Calories02"  
##  [6] "Calories03"   "Calories04"   "Calories05"   "Calories06"   "Calories07"  
## [11] "Calories08"   "Calories09"   "Calories10"   "Calories11"   "Calories12"  
## [16] "Calories13"   "Calories14"   "Calories15"   "Calories16"   "Calories17"  
## [21] "Calories18"   "Calories19"   "Calories20"   "Calories21"   "Calories22"  
## [26] "Calories23"   "Calories24"   "Calories25"   "Calories26"   "Calories27"  
## [31] "Calories28"   "Calories29"   "Calories30"   "Calories31"   "Calories32"  
## [36] "Calories33"   "Calories34"   "Calories35"   "Calories36"   "Calories37"  
## [41] "Calories38"   "Calories39"   "Calories40"   "Calories41"   "Calories42"  
## [46] "Calories43"   "Calories44"   "Calories45"   "Calories46"   "Calories47"  
## [51] "Calories48"   "Calories49"   "Calories50"   "Calories51"   "Calories52"  
## [56] "Calories53"   "Calories54"   "Calories55"   "Calories56"   "Calories57"  
## [61] "Calories58"   "Calories59"  
## 
## Columns in minuteIntensitiesNarrow_merged.csv :
## [1] "Id"             "ActivityMinute" "Intensity"     
## 
## Columns in minuteIntensitiesWide_merged.csv :
##  [1] "Id"           "ActivityHour" "Intensity00"  "Intensity01"  "Intensity02" 
##  [6] "Intensity03"  "Intensity04"  "Intensity05"  "Intensity06"  "Intensity07" 
## [11] "Intensity08"  "Intensity09"  "Intensity10"  "Intensity11"  "Intensity12" 
## [16] "Intensity13"  "Intensity14"  "Intensity15"  "Intensity16"  "Intensity17" 
## [21] "Intensity18"  "Intensity19"  "Intensity20"  "Intensity21"  "Intensity22" 
## [26] "Intensity23"  "Intensity24"  "Intensity25"  "Intensity26"  "Intensity27" 
## [31] "Intensity28"  "Intensity29"  "Intensity30"  "Intensity31"  "Intensity32" 
## [36] "Intensity33"  "Intensity34"  "Intensity35"  "Intensity36"  "Intensity37" 
## [41] "Intensity38"  "Intensity39"  "Intensity40"  "Intensity41"  "Intensity42" 
## [46] "Intensity43"  "Intensity44"  "Intensity45"  "Intensity46"  "Intensity47" 
## [51] "Intensity48"  "Intensity49"  "Intensity50"  "Intensity51"  "Intensity52" 
## [56] "Intensity53"  "Intensity54"  "Intensity55"  "Intensity56"  "Intensity57" 
## [61] "Intensity58"  "Intensity59" 
## 
## Columns in minuteMETsNarrow_merged.csv :
## [1] "Id"             "ActivityMinute" "METs"          
## 
## Columns in minuteSleep_merged.csv :
## [1] "Id"    "date"  "value" "logId"
## 
## Columns in minuteStepsNarrow_merged.csv :
## [1] "Id"             "ActivityMinute" "Steps"         
## 
## Columns in minuteStepsWide_merged.csv :
##  [1] "Id"           "ActivityHour" "Steps00"      "Steps01"      "Steps02"     
##  [6] "Steps03"      "Steps04"      "Steps05"      "Steps06"      "Steps07"     
## [11] "Steps08"      "Steps09"      "Steps10"      "Steps11"      "Steps12"     
## [16] "Steps13"      "Steps14"      "Steps15"      "Steps16"      "Steps17"     
## [21] "Steps18"      "Steps19"      "Steps20"      "Steps21"      "Steps22"     
## [26] "Steps23"      "Steps24"      "Steps25"      "Steps26"      "Steps27"     
## [31] "Steps28"      "Steps29"      "Steps30"      "Steps31"      "Steps32"     
## [36] "Steps33"      "Steps34"      "Steps35"      "Steps36"      "Steps37"     
## [41] "Steps38"      "Steps39"      "Steps40"      "Steps41"      "Steps42"     
## [46] "Steps43"      "Steps44"      "Steps45"      "Steps46"      "Steps47"     
## [51] "Steps48"      "Steps49"      "Steps50"      "Steps51"      "Steps52"     
## [56] "Steps53"      "Steps54"      "Steps55"      "Steps56"      "Steps57"     
## [61] "Steps58"      "Steps59"     
## 
## Columns in sleepDay_merged.csv :
## [1] "Id"                 "SleepDay"           "TotalSleepRecords" 
## [4] "TotalMinutesAsleep" "TotalTimeInBed"    
## 
## Columns in weightLogInfo_merged.csv :
## [1] "Id"             "Date"           "WeightKg"       "WeightPounds"  
## [5] "Fat"            "BMI"            "IsManualReport" "LogId"

4. Check and select files based on the number of unique IDs

# Compare column names across all files
common_columns <- Reduce(intersect, column_names)
cat("Common columns across all files:\n")
## Common columns across all files:
print(common_columns)
## [1] "Id"

For all files common column is Id what is unique and represent every user.
The size of the sample could potentially introduce a sampling bias. While a sample size of 33 remains valid, a larger sample size would provide a better reflection of the overall population and consequently enhance the confidence interval. To avoid bias, we check all files for the number of Id and select only those that contain information about more than 33 users.

# Function to count the number of unique IDs in a file
count_unique_ids <- function(file_path) {
  if (file.exists(file_path)) {
    print(paste("Processing:", file_path))                                       # Debug: Check the file being processed
    data <- tryCatch(read_csv(file_path, col_types = cols_only(Id = col_character())),
                     error = function(e) {
                       print(paste("Error reading file:", file_path))            # Debug: Log errors
                       NULL
                     })
    if (!is.null(data)) {
      print(paste("Unique IDs in", file_path, ":", length(unique(data$Id))))     # Debug: Show count
      return(length(unique(data$Id)))
    }
  }
  return(0)
}

# Filter files that have more than 33 unique IDs
files_with_enough_ids <- file_list[sapply(file_list, count_unique_ids) >= 33]
## [1] "Processing: /cloud/project/DATA/dailyActivity_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/dailyActivity_merged.csv"
## [1] "Processing: /cloud/project/DATA/dailyCalories_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/dailyCalories_merged.csv"
## [1] "Processing: /cloud/project/DATA/dailyIntensities_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/dailyIntensities_merged.csv"
## [1] "Processing: /cloud/project/DATA/dailySteps_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/dailySteps_merged.csv"
## [1] "Processing: /cloud/project/DATA/heartrate_seconds_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/heartrate_seconds_merged.csv"
## [1] "Processing: /cloud/project/DATA/hourlyCalories_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/hourlyCalories_merged.csv"
## [1] "Processing: /cloud/project/DATA/hourlyData_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/hourlyData_merged.csv"
## [1] "Processing: /cloud/project/DATA/hourlyIntensities_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/hourlyIntensities_merged.csv"
## [1] "Processing: /cloud/project/DATA/hourlySteps_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/hourlySteps_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteCaloriesNarrow_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteCaloriesNarrow_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteCaloriesWide_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteCaloriesWide_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteIntensitiesNarrow_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteIntensitiesNarrow_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteIntensitiesWide_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteIntensitiesWide_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteMETsNarrow_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteMETsNarrow_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteSleep_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteSleep_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteStepsNarrow_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteStepsNarrow_merged.csv"
## [1] "Processing: /cloud/project/DATA/minuteStepsWide_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/minuteStepsWide_merged.csv"
## [1] "Processing: /cloud/project/DATA/sleepDay_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/sleepDay_merged.csv"
## [1] "Processing: /cloud/project/DATA/weightLogInfo_merged.csv"
## [1] "Error reading file: /cloud/project/DATA/weightLogInfo_merged.csv"
# Print the names of files that meet the condition
print(files_with_enough_ids)
## character(0)

5. Merge the datasets containing time-related information

After examining the column names, merging the three datasets containing time-related information into a single dataset using the “Id” and “ActivityHour” columns.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load hourly data from three CSV files
hourly_calories <- read.csv(here("DATA", "hourlyCalories_merged.csv"))           # Data on hourly calorie burn
hourly_intensities <- read.csv(here("DATA", "hourlyIntensities_merged.csv"))     # Data on hourly activity intensities
hourly_steps <- read.csv(here("DATA", "hourlySteps_merged.csv"))                 # Data on hourly step counts

# Merge the datasets by 'Id' and 'ActivityHour' columns
# The result will include only rows with matching 'Id' and 'ActivityHour' across all three datasets
# Merge with hourly intensities
merged_hourly_data <- hourly_calories |>
  inner_join(hourly_intensities, by = c("Id", "ActivityHour")) |>
  # Merge with hourly steps
  inner_join(hourly_steps, by = c("Id", "ActivityHour"))                         

# Save the merged dataset as a new CSV file
write.csv(merged_hourly_data, here("DATA", "hourlyData_merged.csv"), row.names = FALSE)  # Save without row numbers

# Print a message indicating successful merging
cat("Data merged successfully into 'hourlyData_merged.csv'")
## Data merged successfully into 'hourlyData_merged.csv'

7. Convert the various dates and times in ‘hourlyData_merged’ to the correct format and add DayOfWeek column

# Load necessary libraries
library(dplyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(readr)
library(here)

# Load the raw merged hourly data from the CSV file
hourly_data_raw <- read.csv(here("DATA", "hourlyData_merged.csv"))

# Convert the 'ActivityHour' column from a character string to a datetime object
# Using the 'mdy_hms' function from the 'lubridate' package to parse dates in "Month-Day-Year Hour:Minute:Second" format
hourly_data_raw$ActivityHour <- mdy_hms(hourly_data_raw$ActivityHour)

# Extract the day of the week from the 'ActivityHour' datetime column
# 'label = TRUE' ensures the result is a factor with day names, and 'abbr = FALSE' ensures full names are used
hourly_data_raw$DayOfWeek <- wday(hourly_data_raw$ActivityHour, label = TRUE, abbr = FALSE, week_start = 1)

# Display the first few rows of the processed data to verify the transformations
head(hourly_data_raw)
##           Id        ActivityHour Calories TotalIntensity AverageIntensity
## 1 1503960366 2016-04-12 00:00:00       81             20         0.333333
## 2 1503960366 2016-04-12 01:00:00       61              8         0.133333
## 3 1503960366 2016-04-12 02:00:00       59              7         0.116667
## 4 1503960366 2016-04-12 03:00:00       47              0         0.000000
## 5 1503960366 2016-04-12 04:00:00       48              0         0.000000
## 6 1503960366 2016-04-12 05:00:00       48              0         0.000000
##   StepTotal DayOfWeek
## 1       373   Tuesday
## 2       160   Tuesday
## 3       151   Tuesday
## 4         0   Tuesday
## 5         0   Tuesday
## 6         0   Tuesday

8. Read and сheck files with daily data

# Reading the files
daily_activity <- read.csv("DATA/dailyActivity_merged.csv")
daily_calories <- read.csv("DATA/dailyCalories_merged.csv")
daily_intensities <- read.csv("DATA/dailyIntensities_merged.csv")
daily_steps <- read.csv("DATA/dailySteps_merged.csv")

# Display the first 5 rows of each file
head(daily_activity, 5)
##           Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366    4/12/2016      13162          8.50            8.50
## 2 1503960366    4/13/2016      10735          6.97            6.97
## 3 1503960366    4/14/2016      10460          6.74            6.74
## 4 1503960366    4/15/2016       9762          6.28            6.28
## 5 1503960366    4/16/2016      12669          8.16            8.16
##   LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1                        0               1.88                     0.55
## 2                        0               1.57                     0.69
## 3                        0               2.44                     0.40
## 4                        0               2.14                     1.26
## 5                        0               2.71                     0.41
##   LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1                6.06                       0                25
## 2                4.71                       0                21
## 3                3.91                       0                30
## 4                2.83                       0                29
## 5                5.04                       0                36
##   FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 1                  13                  328              728     1985
## 2                  19                  217              776     1797
## 3                  11                  181             1218     1776
## 4                  34                  209              726     1745
## 5                  10                  221              773     1863
head(daily_calories, 5)
##           Id ActivityDay Calories
## 1 1503960366   4/12/2016     1985
## 2 1503960366   4/13/2016     1797
## 3 1503960366   4/14/2016     1776
## 4 1503960366   4/15/2016     1745
## 5 1503960366   4/16/2016     1863
head(daily_intensities, 5)
##           Id ActivityDay SedentaryMinutes LightlyActiveMinutes
## 1 1503960366   4/12/2016              728                  328
## 2 1503960366   4/13/2016              776                  217
## 3 1503960366   4/14/2016             1218                  181
## 4 1503960366   4/15/2016              726                  209
## 5 1503960366   4/16/2016              773                  221
##   FairlyActiveMinutes VeryActiveMinutes SedentaryActiveDistance
## 1                  13                25                       0
## 2                  19                21                       0
## 3                  11                30                       0
## 4                  34                29                       0
## 5                  10                36                       0
##   LightActiveDistance ModeratelyActiveDistance VeryActiveDistance
## 1                6.06                     0.55               1.88
## 2                4.71                     0.69               1.57
## 3                3.91                     0.40               2.44
## 4                2.83                     1.26               2.14
## 5                5.04                     0.41               2.71
head(daily_steps, 5)
##           Id ActivityDay StepTotal
## 1 1503960366   4/12/2016     13162
## 2 1503960366   4/13/2016     10735
## 3 1503960366   4/14/2016     10460
## 4 1503960366   4/15/2016      9762
## 5 1503960366   4/16/2016     12669
# View the column names for each file
colnames(daily_activity)
##  [1] "Id"                       "ActivityDate"            
##  [3] "TotalSteps"               "TotalDistance"           
##  [5] "TrackerDistance"          "LoggedActivitiesDistance"
##  [7] "VeryActiveDistance"       "ModeratelyActiveDistance"
##  [9] "LightActiveDistance"      "SedentaryActiveDistance" 
## [11] "VeryActiveMinutes"        "FairlyActiveMinutes"     
## [13] "LightlyActiveMinutes"     "SedentaryMinutes"        
## [15] "Calories"
colnames(daily_calories)
## [1] "Id"          "ActivityDay" "Calories"
colnames(daily_intensities)
##  [1] "Id"                       "ActivityDay"             
##  [3] "SedentaryMinutes"         "LightlyActiveMinutes"    
##  [5] "FairlyActiveMinutes"      "VeryActiveMinutes"       
##  [7] "SedentaryActiveDistance"  "LightActiveDistance"     
##  [9] "ModeratelyActiveDistance" "VeryActiveDistance"
colnames(daily_steps)
## [1] "Id"          "ActivityDay" "StepTotal"

‘daily_activity’ has all columns from all file data, so chose it for father analyse.

9. Convert ActivityDate to Date format Add the DayOfWeek column

library(lubridate)

# Convert ActivityDate to Date format if necessary
daily_activity$ActivityDate <- as.Date(daily_activity$ActivityDate, format = "%m/%d/%Y")

# Add the DayOfWeek column as numeric (1=Sunday, 2=Monday, ..., 7=Saturday)
daily_activity$DayOfWeek <- wday(daily_activity$ActivityDate, label = FALSE, week_start = 1)

# Convert DayOfWeek to text (1=Sunday, ..., 7=Saturday)
days <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
daily_activity$DayOfWeekText <- days[daily_activity$DayOfWeek]

# View the first 5 rows to confirm the change
head(daily_activity, 5)
##           Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 1 1503960366   2016-04-12      13162          8.50            8.50
## 2 1503960366   2016-04-13      10735          6.97            6.97
## 3 1503960366   2016-04-14      10460          6.74            6.74
## 4 1503960366   2016-04-15       9762          6.28            6.28
## 5 1503960366   2016-04-16      12669          8.16            8.16
##   LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 1                        0               1.88                     0.55
## 2                        0               1.57                     0.69
## 3                        0               2.44                     0.40
## 4                        0               2.14                     1.26
## 5                        0               2.71                     0.41
##   LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 1                6.06                       0                25
## 2                4.71                       0                21
## 3                3.91                       0                30
## 4                2.83                       0                29
## 5                5.04                       0                36
##   FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories DayOfWeek
## 1                  13                  328              728     1985         2
## 2                  19                  217              776     1797         3
## 3                  11                  181             1218     1776         4
## 4                  34                  209              726     1745         5
## 5                  10                  221              773     1863         6
##   DayOfWeekText
## 1        Monday
## 2       Tuesday
## 3     Wednesday
## 4      Thursday
## 5        Friday

10. Clean data ‘hourly_data_row’ and ‘daily_activity’

  1. Check for Missing Values (NA)
# Check for missing values in the entire dataset
sum(is.na(daily_activity))   # For daily_activity
## [1] 0
sum(is.na(hourly_data_raw))  # For hourly_data_raw
## [1] 0
# Check for missing values per column
colSums(is.na(daily_activity))
##                       Id             ActivityDate               TotalSteps 
##                        0                        0                        0 
##            TotalDistance          TrackerDistance LoggedActivitiesDistance 
##                        0                        0                        0 
##       VeryActiveDistance ModeratelyActiveDistance      LightActiveDistance 
##                        0                        0                        0 
##  SedentaryActiveDistance        VeryActiveMinutes      FairlyActiveMinutes 
##                        0                        0                        0 
##     LightlyActiveMinutes         SedentaryMinutes                 Calories 
##                        0                        0                        0 
##                DayOfWeek            DayOfWeekText 
##                        0                        0
colSums(is.na(hourly_data_raw))
##               Id     ActivityHour         Calories   TotalIntensity 
##                0                0                0                0 
## AverageIntensity        StepTotal        DayOfWeek 
##                0                0                0
# Show rows with missing values
daily_activity[!complete.cases(daily_activity), ]
##  [1] Id                       ActivityDate             TotalSteps              
##  [4] TotalDistance            TrackerDistance          LoggedActivitiesDistance
##  [7] VeryActiveDistance       ModeratelyActiveDistance LightActiveDistance     
## [10] SedentaryActiveDistance  VeryActiveMinutes        FairlyActiveMinutes     
## [13] LightlyActiveMinutes     SedentaryMinutes         Calories                
## [16] DayOfWeek                DayOfWeekText           
## <0 rows> (or 0-length row.names)
hourly_data_raw[!complete.cases(hourly_data_raw), ]
## [1] Id               ActivityHour     Calories         TotalIntensity  
## [5] AverageIntensity StepTotal        DayOfWeek       
## <0 rows> (or 0-length row.names)
  1. Remove or Replace Missing Values
# Remove rows with missing values
daily_activity_clean <- na.omit(daily_activity)
hourly_data_clean <- na.omit(hourly_data_raw)

# Replace missing values with 0 (or another value)
daily_activity[is.na(daily_activity)] <- 0
hourly_data_raw[is.na(hourly_data_raw)] <- 0
  1. Check for Duplicates
# Find duplicates
sum(duplicated(daily_activity))   # For daily_activity
## [1] 0
sum(duplicated(hourly_data_raw))  # For hourly_data_raw
## [1] 0
# View duplicate rows
daily_activity[duplicated(daily_activity), ]
##  [1] Id                       ActivityDate             TotalSteps              
##  [4] TotalDistance            TrackerDistance          LoggedActivitiesDistance
##  [7] VeryActiveDistance       ModeratelyActiveDistance LightActiveDistance     
## [10] SedentaryActiveDistance  VeryActiveMinutes        FairlyActiveMinutes     
## [13] LightlyActiveMinutes     SedentaryMinutes         Calories                
## [16] DayOfWeek                DayOfWeekText           
## <0 rows> (or 0-length row.names)
hourly_data_raw[duplicated(hourly_data_raw), ]
## [1] Id               ActivityHour     Calories         TotalIntensity  
## [5] AverageIntensity StepTotal        DayOfWeek       
## <0 rows> (or 0-length row.names)
# Remove duplicate rows
daily_activity <- daily_activity[!duplicated(daily_activity), ]
hourly_data_raw <- hourly_data_raw[!duplicated(hourly_data_raw), ]
  1. Check Data Types and Correct Formats
# Check the structure of the dataset
str(daily_activity)
## 'data.frame':    940 obs. of  17 variables:
##  $ Id                      : num  1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
##  $ ActivityDate            : Date, format: "2016-04-12" "2016-04-13" ...
##  $ TotalSteps              : int  13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
##  $ TotalDistance           : num  8.5 6.97 6.74 6.28 8.16 ...
##  $ TrackerDistance         : num  8.5 6.97 6.74 6.28 8.16 ...
##  $ LoggedActivitiesDistance: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveDistance      : num  1.88 1.57 2.44 2.14 2.71 ...
##  $ ModeratelyActiveDistance: num  0.55 0.69 0.4 1.26 0.41 ...
##  $ LightActiveDistance     : num  6.06 4.71 3.91 2.83 5.04 ...
##  $ SedentaryActiveDistance : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VeryActiveMinutes       : int  25 21 30 29 36 38 42 50 28 19 ...
##  $ FairlyActiveMinutes     : int  13 19 11 34 10 20 16 31 12 8 ...
##  $ LightlyActiveMinutes    : int  328 217 181 209 221 164 233 264 205 211 ...
##  $ SedentaryMinutes        : int  728 776 1218 726 773 539 1149 775 818 838 ...
##  $ Calories                : int  1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
##  $ DayOfWeek               : num  2 3 4 5 6 7 1 2 3 4 ...
##  $ DayOfWeekText           : chr  "Monday" "Tuesday" "Wednesday" "Thursday" ...
str(hourly_data_raw)
## 'data.frame':    22099 obs. of  7 variables:
##  $ Id              : num  1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
##  $ ActivityHour    : POSIXct, format: "2016-04-12 00:00:00" "2016-04-12 01:00:00" ...
##  $ Calories        : int  81 61 59 47 48 48 48 47 68 141 ...
##  $ TotalIntensity  : int  20 8 7 0 0 0 0 0 13 30 ...
##  $ AverageIntensity: num  0.333 0.133 0.117 0 0 ...
##  $ StepTotal       : int  373 160 151 0 0 0 0 0 250 1864 ...
##  $ DayOfWeek       : Ord.factor w/ 7 levels "Monday"<"Tuesday"<..: 2 2 2 2 2 2 2 2 2 2 ...
# Convert columns to correct data types (example for date format)
daily_activity$ActivityDate <- as.Date(daily_activity$ActivityDate, format = "%m/%d/%Y")
hourly_data_raw$ActivityHour <- as.POSIXct(hourly_data_raw$ActivityHour, format = "%m/%d/%Y %I:%M:%S %p", tz = "UTC")
  1. Check for Outliers or Unusual Values
# Summary statistics to detect outliers
summary(daily_activity)
##        Id             ActivityDate          TotalSteps    TotalDistance   
##  Min.   :1.504e+09   Min.   :2016-04-12   Min.   :    0   Min.   : 0.000  
##  1st Qu.:2.320e+09   1st Qu.:2016-04-19   1st Qu.: 3790   1st Qu.: 2.620  
##  Median :4.445e+09   Median :2016-04-26   Median : 7406   Median : 5.245  
##  Mean   :4.855e+09   Mean   :2016-04-26   Mean   : 7638   Mean   : 5.490  
##  3rd Qu.:6.962e+09   3rd Qu.:2016-05-04   3rd Qu.:10727   3rd Qu.: 7.713  
##  Max.   :8.878e+09   Max.   :2016-05-12   Max.   :36019   Max.   :28.030  
##  TrackerDistance  LoggedActivitiesDistance VeryActiveDistance
##  Min.   : 0.000   Min.   :0.0000           Min.   : 0.000    
##  1st Qu.: 2.620   1st Qu.:0.0000           1st Qu.: 0.000    
##  Median : 5.245   Median :0.0000           Median : 0.210    
##  Mean   : 5.475   Mean   :0.1082           Mean   : 1.503    
##  3rd Qu.: 7.710   3rd Qu.:0.0000           3rd Qu.: 2.053    
##  Max.   :28.030   Max.   :4.9421           Max.   :21.920    
##  ModeratelyActiveDistance LightActiveDistance SedentaryActiveDistance
##  Min.   :0.0000           Min.   : 0.000      Min.   :0.000000       
##  1st Qu.:0.0000           1st Qu.: 1.945      1st Qu.:0.000000       
##  Median :0.2400           Median : 3.365      Median :0.000000       
##  Mean   :0.5675           Mean   : 3.341      Mean   :0.001606       
##  3rd Qu.:0.8000           3rd Qu.: 4.782      3rd Qu.:0.000000       
##  Max.   :6.4800           Max.   :10.710      Max.   :0.110000       
##  VeryActiveMinutes FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes
##  Min.   :  0.00    Min.   :  0.00      Min.   :  0.0        Min.   :   0.0  
##  1st Qu.:  0.00    1st Qu.:  0.00      1st Qu.:127.0        1st Qu.: 729.8  
##  Median :  4.00    Median :  6.00      Median :199.0        Median :1057.5  
##  Mean   : 21.16    Mean   : 13.56      Mean   :192.8        Mean   : 991.2  
##  3rd Qu.: 32.00    3rd Qu.: 19.00      3rd Qu.:264.0        3rd Qu.:1229.5  
##  Max.   :210.00    Max.   :143.00      Max.   :518.0        Max.   :1440.0  
##     Calories      DayOfWeek     DayOfWeekText     
##  Min.   :   0   Min.   :1.000   Length:940        
##  1st Qu.:1828   1st Qu.:2.000   Class :character  
##  Median :2134   Median :4.000   Mode  :character  
##  Mean   :2304   Mean   :3.918                     
##  3rd Qu.:2793   3rd Qu.:6.000                     
##  Max.   :4900   Max.   :7.000
summary(hourly_data_raw)
##        Id             ActivityHour                       Calories     
##  Min.   :1.504e+09   Min.   :2016-04-12 00:00:00.00   Min.   : 42.00  
##  1st Qu.:2.320e+09   1st Qu.:2016-04-19 01:00:00.00   1st Qu.: 63.00  
##  Median :4.445e+09   Median :2016-04-26 06:00:00.00   Median : 83.00  
##  Mean   :4.848e+09   Mean   :2016-04-26 11:46:42.58   Mean   : 97.39  
##  3rd Qu.:6.962e+09   3rd Qu.:2016-05-03 19:00:00.00   3rd Qu.:108.00  
##  Max.   :8.878e+09   Max.   :2016-05-12 15:00:00.00   Max.   :948.00  
##                                                                       
##  TotalIntensity   AverageIntensity   StepTotal           DayOfWeek   
##  Min.   :  0.00   Min.   :0.0000   Min.   :    0.0   Monday   :2861  
##  1st Qu.:  0.00   1st Qu.:0.0000   1st Qu.:    0.0   Tuesday  :3600  
##  Median :  3.00   Median :0.0500   Median :   40.0   Wednesday:3547  
##  Mean   : 12.04   Mean   :0.2006   Mean   :  320.2   Thursday :3283  
##  3rd Qu.: 16.00   3rd Qu.:0.2667   3rd Qu.:  357.0   Friday   :2997  
##  Max.   :180.00   Max.   :3.0000   Max.   :10554.0   Saturday :2915  
##                                                      Sunday   :2896
# Boxplot to visualize outliers
boxplot(daily_activity$Calories, main = "Calories in daily_activity")

boxplot(hourly_data_raw$Calories, main = "Calories in hourly_data_raw")

  1. Handle Invalid Formats
# Convert columns to numeric if necessary
daily_activity$Calories <- as.numeric(daily_activity$Calories)
hourly_data_raw$Calories <- as.numeric(hourly_data_raw$Calories)

# Handle non-numeric values (e.g., replace non-numeric values with NA)
daily_activity$Calories[is.na(daily_activity$Calories)] <- 0
hourly_data_raw$Calories[is.na(hourly_data_raw$Calories)] <- 0
  1. Check for Invalid Date Formats
# Convert to Date format
daily_activity$ActivityDate <- as.Date(daily_activity$ActivityDate, format = "%m/%d/%Y")
hourly_data_raw$ActivityHour <- as.POSIXct(hourly_data_raw$ActivityHour, format = "%m/%d/%Y %I:%M:%S %p")
  1. Check for Zero or Negative Values in Numeric Columns
# Check for zero or negative values in specific columns
sum(daily_activity$Calories <= 0)
## [1] 4
sum(hourly_data_raw$Calories <= 0)
## [1] 0

check 4 rows where colories <= 0

# Filter rows where Calories <= 0
invalid_calories <- daily_activity[daily_activity$Calories <= 0, ]

# View the filtered rows
print(invalid_calories)
##             Id ActivityDate TotalSteps TotalDistance TrackerDistance
## 31  1503960366   2016-05-12          0             0               0
## 654 6290855005   2016-05-10          0             0               0
## 818 8253242879   2016-04-30          0             0               0
## 880 8583815059   2016-05-12          0             0               0
##     LoggedActivitiesDistance VeryActiveDistance ModeratelyActiveDistance
## 31                         0                  0                        0
## 654                        0                  0                        0
## 818                        0                  0                        0
## 880                        0                  0                        0
##     LightActiveDistance SedentaryActiveDistance VeryActiveMinutes
## 31                    0                       0                 0
## 654                   0                       0                 0
## 818                   0                       0                 0
## 880                   0                       0                 0
##     FairlyActiveMinutes LightlyActiveMinutes SedentaryMinutes Calories
## 31                    0                    0             1440        0
## 654                   0                    0             1440        0
## 818                   0                    0             1440        0
## 880                   0                    0             1440        0
##     DayOfWeek DayOfWeekText
## 31          4     Wednesday
## 654         2        Monday
## 818         6        Friday
## 880         4     Wednesday

we see any activities in other columns, so it should be removed.

daily_activity <- daily_activity[daily_activity$Calories > 0, ]

11. Rename the dataframe from hourly_data_raw to hourly_activity

# Rename the dataframe from hourly_data_raw to hourly_activity
hourly_activity <- hourly_data_raw

12. Export the data to CSV format for use in Tableau

write.csv(daily_activity, "daily_activity_cleaned.csv", row.names = FALSE)
write.csv(hourly_activity, "hourly_activity_cleaned.csv", row.names = FALSE)