# Rename columns to ensure consistency and match 'daily_activity' dataset
colnames(daily_activity)[colnames(daily_activity) == "TotalSteps"] <- "StepTotal"
colnames(daily_calories)[colnames(daily_calories) == "ActivityDay"] <- "ActivityDate"
colnames(daily_intensities)[colnames(daily_intensities) == "ActivityDay"] <- "ActivityDate"
colnames(daily_steps)[colnames(daily_steps) == "ActivityDay"] <- "ActivityDate"
colnames(heartrate_seconds)[colnames(heartrate_seconds) == "Time"] <- "ActivitySeconds"
colnames(heartrate_seconds)[colnames(heartrate_seconds) == "Value"] <- "Bpm"
colnames(sleep_day)[colnames(sleep_day) == "SleepDay"] <- "ActivityDate"
colnames(minute_intensities)[colnames(minute_intensities) == "ActivityMinute"] <- "ActivityMinutes"
colnames(minute_sleep)[colnames(minute_sleep) == "date"] <- "ActivityMinutes"
colnames(weight_log)[colnames(weight_log) == "Date"] <- "ActivityDate"
# Ensure date columns are in the same format
daily_activity$ActivityDate <- as.Date(daily_activity$ActivityDate, format="%m/%d/%Y")
daily_calories$ActivityDate <- as.Date(daily_calories$ActivityDate, format="%m/%d/%Y")
daily_intensities$ActivityDate <- as.Date(daily_intensities$ActivityDate, format="%m/%d/%Y")
daily_steps$ActivityDate <- as.Date(daily_steps$ActivityDate, format="%m/%d/%Y")
heartrate_seconds$ActivitySeconds <- as.POSIXct(heartrate_seconds$ActivitySeconds, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
hourly_calories$ActivityHour <- as.POSIXct(hourly_calories$ActivityHour, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
hourly_intensities$ActivityHour <- as.POSIXct(hourly_intensities$ActivityHour, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
hourly_steps$ActivityHour <- as.POSIXct(hourly_steps$ActivityHour, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
sleep_day$ActivityDate <- as.Date(sleep_day$ActivityDate, format="%m/%d/%Y")
minute_intensities$ActivityMinutes <- as.POSIXct(minute_intensities$ActivityMinutes, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
minute_sleep$ActivityMinutes <- as.POSIXct(minute_sleep$ActivityMinutes, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
weight_log$ActivityDate <- as.POSIXct(weight_log$ActivityDate, format="%m/%d/%Y %I:%M:%S %p", tz=Sys.timezone())
# Confirm changes in final inspection
str(daily_activity)
## 'data.frame': 940 obs. of 15 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate : Date, format: "2016-04-12" "2016-04-13" ...
## $ StepTotal : int 13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
## $ TotalDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ TrackerDistance : num 8.5 6.97 6.74 6.28 8.16 ...
## $ LoggedActivitiesDistance: num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveDistance : num 1.88 1.57 2.44 2.14 2.71 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 0.4 1.26 0.41 ...
## $ LightActiveDistance : num 6.06 4.71 3.91 2.83 5.04 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ VeryActiveMinutes : int 25 21 30 29 36 38 42 50 28 19 ...
## $ FairlyActiveMinutes : int 13 19 11 34 10 20 16 31 12 8 ...
## $ LightlyActiveMinutes : int 328 217 181 209 221 164 233 264 205 211 ...
## $ SedentaryMinutes : int 728 776 1218 726 773 539 1149 775 818 838 ...
## $ Calories : int 1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
str(daily_calories)
## 'data.frame': 940 obs. of 3 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate: Date, format: "2016-04-12" "2016-04-13" ...
## $ Calories : int 1985 1797 1776 1745 1863 1728 1921 2035 1786 1775 ...
str(daily_intensities)
## 'data.frame': 940 obs. of 10 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate : Date, format: "2016-04-12" "2016-04-13" ...
## $ SedentaryMinutes : int 728 776 1218 726 773 539 1149 775 818 838 ...
## $ LightlyActiveMinutes : int 328 217 181 209 221 164 233 264 205 211 ...
## $ FairlyActiveMinutes : int 13 19 11 34 10 20 16 31 12 8 ...
## $ VeryActiveMinutes : int 25 21 30 29 36 38 42 50 28 19 ...
## $ SedentaryActiveDistance : num 0 0 0 0 0 0 0 0 0 0 ...
## $ LightActiveDistance : num 6.06 4.71 3.91 2.83 5.04 ...
## $ ModeratelyActiveDistance: num 0.55 0.69 0.4 1.26 0.41 ...
## $ VeryActiveDistance : num 1.88 1.57 2.44 2.14 2.71 ...
str(daily_steps)
## 'data.frame': 940 obs. of 3 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate: Date, format: "2016-04-12" "2016-04-13" ...
## $ StepTotal : int 13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
str(heartrate_seconds)
## 'data.frame': 2483658 obs. of 3 variables:
## $ Id : num 2.02e+09 2.02e+09 2.02e+09 2.02e+09 2.02e+09 ...
## $ ActivitySeconds: POSIXct, format: "2016-04-12 07:21:00" "2016-04-12 07:21:05" ...
## $ Bpm : int 97 102 105 103 101 95 91 93 94 93 ...
str(hourly_calories)
## 'data.frame': 22099 obs. of 3 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityHour: POSIXct, format: "2016-04-12 00:00:00" "2016-04-12 01:00:00" ...
## $ Calories : int 81 61 59 47 48 48 48 47 68 141 ...
str(hourly_intensities)
## 'data.frame': 22099 obs. of 4 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityHour : POSIXct, format: "2016-04-12 00:00:00" "2016-04-12 01:00:00" ...
## $ TotalIntensity : int 20 8 7 0 0 0 0 0 13 30 ...
## $ AverageIntensity: num 0.333 0.133 0.117 0 0 ...
str(hourly_steps)
## 'data.frame': 22099 obs. of 3 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityHour: POSIXct, format: "2016-04-12 00:00:00" "2016-04-12 01:00:00" ...
## $ StepTotal : int 373 160 151 0 0 0 0 0 250 1864 ...
str(minute_intensities)
## spc_tbl_ [1,325,580 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:1325580] 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityMinutes: POSIXct[1:1325580], format: "2016-04-12 00:00:00" "2016-04-12 00:01:00" ...
## $ Intensity : num [1:1325580] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. ActivityMinute = col_character(),
## .. Intensity = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(minute_sleep)
## 'data.frame': 188521 obs. of 4 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityMinutes: POSIXct, format: "2016-04-12 02:47:30" "2016-04-12 02:48:30" ...
## $ value : int 3 2 1 1 1 1 1 2 2 2 ...
## $ logId : num 1.14e+10 1.14e+10 1.14e+10 1.14e+10 1.14e+10 ...
str(sleep_day)
## 'data.frame': 413 obs. of 5 variables:
## $ Id : num 1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
## $ ActivityDate : Date, format: "2016-04-12" "2016-04-13" ...
## $ TotalSleepRecords : int 1 2 1 2 1 1 1 1 1 1 ...
## $ TotalMinutesAsleep: int 327 384 412 340 700 304 360 325 361 430 ...
## $ TotalTimeInBed : int 346 407 442 367 712 320 377 364 384 449 ...
str(weight_log)
## 'data.frame': 67 obs. of 8 variables:
## $ Id : num 1.50e+09 1.50e+09 1.93e+09 2.87e+09 2.87e+09 ...
## $ ActivityDate : POSIXct, format: "2016-05-02 23:59:59" "2016-05-03 23:59:59" ...
## $ WeightKg : num 52.6 52.6 133.5 56.7 57.3 ...
## $ WeightPounds : num 116 116 294 125 126 ...
## $ Fat : int 22 NA NA NA NA 25 NA NA NA NA ...
## $ BMI : num 22.6 22.6 47.5 21.5 21.7 ...
## $ IsManualReport: chr "True" "True" "False" "True" ...
## $ LogId : num 1.46e+12 1.46e+12 1.46e+12 1.46e+12 1.46e+12 ...
# Identify missing values
# List of datasets
datasets <- list(
daily_activity = daily_activity,
daily_calories = daily_calories,
daily_intensities = daily_intensities,
daily_steps = daily_steps,
heartrate_seconds = heartrate_seconds,
hourly_calories = hourly_calories,
hourly_intensities = hourly_intensities,
hourly_steps = hourly_steps,
minute_intensities = minute_intensities,
minute_sleep = minute_sleep,
sleep_day = sleep_day,
weight_log = weight_log
)
# Check for missing values in each dataset
for (name in names(datasets)) {
cat("\nChecking for missing values in:", name, "\n")
print(sapply(datasets[[name]], function(x) sum(is.na(x))))
}
##
## Checking for missing values in: daily_activity
## Id ActivityDate StepTotal
## 0 0 0
## TotalDistance TrackerDistance LoggedActivitiesDistance
## 0 0 0
## VeryActiveDistance ModeratelyActiveDistance LightActiveDistance
## 0 0 0
## SedentaryActiveDistance VeryActiveMinutes FairlyActiveMinutes
## 0 0 0
## LightlyActiveMinutes SedentaryMinutes Calories
## 0 0 0
##
## Checking for missing values in: daily_calories
## Id ActivityDate Calories
## 0 0 0
##
## Checking for missing values in: daily_intensities
## Id ActivityDate SedentaryMinutes
## 0 0 0
## LightlyActiveMinutes FairlyActiveMinutes VeryActiveMinutes
## 0 0 0
## SedentaryActiveDistance LightActiveDistance ModeratelyActiveDistance
## 0 0 0
## VeryActiveDistance
## 0
##
## Checking for missing values in: daily_steps
## Id ActivityDate StepTotal
## 0 0 0
##
## Checking for missing values in: heartrate_seconds
## Id ActivitySeconds Bpm
## 0 0 0
##
## Checking for missing values in: hourly_calories
## Id ActivityHour Calories
## 0 0 0
##
## Checking for missing values in: hourly_intensities
## Id ActivityHour TotalIntensity AverageIntensity
## 0 0 0 0
##
## Checking for missing values in: hourly_steps
## Id ActivityHour StepTotal
## 0 0 0
##
## Checking for missing values in: minute_intensities
## Id ActivityMinutes Intensity
## 0 0 0
##
## Checking for missing values in: minute_sleep
## Id ActivityMinutes value logId
## 0 0 0 0
##
## Checking for missing values in: sleep_day
## Id ActivityDate TotalSleepRecords TotalMinutesAsleep
## 0 0 0 0
## TotalTimeInBed
## 0
##
## Checking for missing values in: weight_log
## Id ActivityDate WeightKg WeightPounds Fat
## 0 0 0 0 65
## BMI IsManualReport LogId
## 0 0 0
# Looking for duplicate rows
# List of datasets
datasets <- list(
daily_activity = daily_activity,
daily_calories = daily_calories,
daily_intensities = daily_intensities,
daily_steps = daily_steps,
hourly_calories = hourly_calories,
hourly_intensities = hourly_intensities,
hourly_steps = hourly_steps,
minute_intensities = minute_intensities,
minute_sleep = minute_sleep,
sleep_day = sleep_day,
weight_log = weight_log
)
# Check for duplicate rows in each dataset
for (name in names(datasets)) {
cat("\nChecking for duplicate rows in:", name, "\n")
duplicates <- sum(duplicated(datasets[[name]]))
cat("Number of duplicate rows:", duplicates, "\n")
}
##
## Checking for duplicate rows in: daily_activity
## Number of duplicate rows: 0
##
## Checking for duplicate rows in: daily_calories
## Number of duplicate rows: 0
##
## Checking for duplicate rows in: daily_intensities
## Number of duplicate rows: 0
##
## Checking for duplicate rows in: daily_steps
## Number of duplicate rows: 0
##
## Checking for duplicate rows in: hourly_calories
## Number of duplicate rows: 0
##
## Checking for duplicate rows in: hourly_intensities
## Number of duplicate rows: 0
##
## Checking for duplicate rows in: hourly_steps
## Number of duplicate rows: 0
##
## Checking for duplicate rows in: minute_intensities
## Number of duplicate rows: 0
##
## Checking for duplicate rows in: minute_sleep
## Number of duplicate rows: 543
##
## Checking for duplicate rows in: sleep_day
## Number of duplicate rows: 3
##
## Checking for duplicate rows in: weight_log
## Number of duplicate rows: 0
# Found 3 duplicates in 'sleep_day'
# Identify duplicates
duplicates <- sleep_day[duplicated(sleep_day) | duplicated(sleep_day, fromLast = TRUE), ]
# Print the duplicate rows to confirm removal
print(duplicates)
# Remove duplicate rows, keeping only the first occurrence
sleep_day <- sleep_day[!duplicated(sleep_day), ]
# Found 543 duplicates in 'minute_sleep'
# Identify duplicates
duplicates <- minute_sleep[duplicated(minute_sleep) | duplicated(minute_sleep, fromLast = TRUE), ]
# Print the duplicate rows to confirm removal
print(duplicates)
# Remove duplicate rows, keeping only the first occurrence
minute_sleep <- minute_sleep[!duplicated(minute_sleep), ]
# Running heart rate_seconds investigation separately to reduce likely hood of crash
# Created a function to process and remove duplicates in chunks
count_duplicates_in_chunks <- function(data, chunk_size = 100000) {
num_chunks <- ceiling(nrow(data) / chunk_size)
total_duplicates <- 0
for (i in 1:num_chunks) {
chunk <- data[((i - 1) * chunk_size + 1):min(i * chunk_size, nrow(data)), ]
total_duplicates <- total_duplicates + sum(duplicated(chunk))
}
return(total_duplicates)
}
# Convert heart rate_seconds to data.table
heartrate_seconds_dt <- as.data.table(heartrate_seconds)
# Count duplicates using chunk processing
num_duplicates <- count_duplicates_in_chunks(heartrate_seconds_dt)
# Print the number of duplicates
cat("Number of duplicate rows in heartrate_seconds:", num_duplicates, "\n")
## Number of duplicate rows in heartrate_seconds: 0
# Merge multiple datasets (daily_activity, daily_intensities, daily_calories, daily_steps, sleep_day) into single dataset named 'daily_combined' using common columns "Id" and "ActivityDate"
# Removes any duplicate columns that have a ".y" suffix, ensuring only the original columns are retained in the daily_combined dataset.
daily_combined <- daily_activity %>%
left_join(daily_intensities, by = c("Id", "ActivityDate")) %>%
left_join(daily_calories, by = c("Id", "ActivityDate")) %>%
left_join(daily_steps, by = c("Id", "ActivityDate")) %>%
left_join(sleep_day, by = c("Id", "ActivityDate")) %>%
select(-contains(".y"))
# Remove ".x" suffix from column names
colnames(daily_combined) <- gsub("\\.x$", "", colnames(daily_combined))
# Merge hourly datasets using common columns "Id" and "ActivityHour" and named 'hourly_combined'
# Remove any duplicate columns that have a ".y" suffix, ensuring only the original columns are retained in the hourly_combined dataset.
hourly_combined <- hourly_calories %>%
left_join(hourly_intensities, by = c("Id", "ActivityHour")) %>%
left_join(hourly_steps, by = c("Id", "ActivityHour")) %>%
select(-contains(".y"))