This is to be used after you are familiar with processing files individually and can interpret the outputs.
The code will process any .csv in the directory, so all files should be trimmed to 24 hr periods only (starting from 12:00:00 AM on Day 2) as per the usual process with actigraphy imputation, and be labelled by their ID code and saved as .csv’s.
A file with ~5% missing data imputed with the following (optimal) MICE settings: m = 10, maxit = 5, and random forest selected, takes 5.41 minutes to complete (based on 100 iterations).
This timing was measured on the following hardware in October 2017:
The function does not alter the input documents’ formatting or data. Rather, 2 columns are added at the end of the dataset, labelled data.Activity and data.White.Light. These are the new imputed data that will be used in subsequent analyses.
There are some mechanisms built into the .pdf report naming which will indicate at a glance if you need to: 1. Review the file to check for possible outliers (defined as activity count values exceeding 1800) 2. Exclude the case/discard the file due to less than 7 days of available coverage.
All reports generated should be reviewed to assure the imputed values are appropriate.
The filenames of the reports will display:
“Unlikely_Outliers” if the activity counts do not exceed 1800, or “POSSIBLE_OUTLIERS” if activity values exceed 1800.
“lowmiss” if there is less than 10% of values missing, or "_HIGH_PERCENTAGE_MISSING" if there is more than 10% of values missing.
No message if the sampling period includes all 7 days of the week (i.e. every weekday and both weekend days), or "_NOT_ENOUGH_DAYS" if not all 7 days of the week are covered by the sampling period.
The percentage of missing data, rounded to 2 decimal places, at the end of the filename.
If any of these warning messages are displayed in the filename, examine these reports first and re-run after removing outliers etc. Then move on to examining the remainder of the files.
Set your working directory to the location of the files you want to process. Then run this code to pass the names of the files you want to analyse to R.
The outputs will be saved back into your working directory.
# Batch imputation of actigraphy files.
#
# For every .csv in the working directory this script:
#   1. reads the file and parses its Date/Time columns,
#   2. derives three filename tags (outliers, missingness, day coverage),
#   3. imputes Activity and White.Light with mice (random forest),
#   4. re-saves the file with the imputed columns appended, and
#   5. writes a PDF report of diagnostic plots next to it.

# Load the required packages once, rather than on every iteration
library(VIM)
library(lubridate)
library(dplyr)
library(ggplot2)
library(mice)
library(lattice)

# Assign all spreadsheets in wd()
files <- Sys.glob("*.csv")
# Returns the files in the directory
files

# Run the loop; the result is a list with one "<file> has been processed"
# string per input file.
actig_batch_process <- lapply(files, function(f) {
  # Locate and load the file
  actig <- read.csv(f, header = TRUE, sep = ",")
  # Examine the structure of the data to make sure it contains
  # all the relevant variables of the current file.
  str(actig)
  activity <- actig$Activity

  # Wrangle the dates and times
  Actig_Time <- parse_date_time(actig$Time, "%I:%M:%S %p")
  Actig_Date <- dmy(actig$Date)
  # Day-of-week names; repeated days over the sampling period are combined
  Weekdays <- weekdays(as.Date(Actig_Date))

  # Test for outlying activity values. any() is FALSE for an empty or
  # all-NA column, so such a file is not flagged as containing outliers.
  outlier_test <- if (any(activity > 1800, na.rm = TRUE)) {
    "_POSSIBLE_OUTLIERS"
  } else {
    "_Unlikely_Outliers"
  }

  # Test for too many NA's
  percentage_na <- sum(is.na(activity)) / NROW(activity) * 100
  # isTRUE() guards against NaN when the file has no rows
  percentage_na_test <- if (isTRUE(percentage_na > 10)) {
    "_HIGH_PERCENTAGE_MISSING_"
  } else {
    "_lowmiss_"
  }

  # Test for enough days: all seven day names must be present.
  # Dates that failed to parse (NA) do not count as a day.
  no_weekdays <- length(unique(Weekdays[!is.na(Weekdays)]))
  weekdays_test <- if (no_weekdays < 7) "_NOT_ENOUGH_DAYS" else ""

  # Variables to be imputed, with a predictor variable (white light).
  # The column names are kept as data.Activity / data.White.Light because
  # they are the names that end up in the saved file.
  missing_data <- data.frame(
    data.Activity = activity,
    data.White.Light = actig$White.Light
  )
  # Create the imputed datasets.
  # Alternative (predictive mean matching):
  # tempdata <- mice(missing_data, m = 10, maxit = 5, method = "pmm")
  # Random forest is the preferred method. If you switch to 'tempdata',
  # change every use of 'tempdata2' below as well.
  tempdata2 <- mice(missing_data, m = 10, maxit = 5, method = "rf")
  # Use the first of the m completed datasets
  completedata <- as.data.frame(mice::complete(tempdata2, 1))
  completedata2 <- data.frame(actig, completedata)
  # Overwrites the input file with the imputed columns appended.
  # row.names = FALSE stops an extra index column being added on each run.
  write.csv(completedata2, file = f, row.names = FALSE)

  # Report name: tags followed by the percentage of missing Activity values
  naming <- paste0(
    f, "_Imputation_Report", weekdays_test, outlier_test,
    percentage_na_test, round(percentage_na, digits = 2), ".pdf"
  )

  # Data frame with the variables for the off-wrist plots
  plot_data <- data.frame(
    data.Off.Wrist.Status = actig$Off.Wrist.Status,
    Actig_Time, Actig_Date, Weekdays
  )

  # Save plots as a report; on.exit closes the device even if a plot fails
  pdf(file = naming)
  on.exit(dev.off(), add = TRUE)
  # Missingness pattern across all variables.
  # NOTE(review): names(actig$G) looks like it evaluates to NULL - confirm
  # the intended labels.
  aggr(actig, col = c("navyblue", "red"), numbers = TRUE, sortVars = TRUE,
       prop = FALSE, labels = names(actig$G), cex.axis = 0.7, gap = 1,
       ylab = c("Number of missings", "Combinations"))
  # Off-wrist status by time of day, one panel per calendar date
  print(ggplot(data = plot_data,
               aes(x = Actig_Time, y = data.Off.Wrist.Status)) +
          geom_point() +
          scale_x_datetime(labels = function(x) format(x, format = "%H:%M")) +
          facet_wrap(~Actig_Date))
  # Off-wrist status by time of day, one panel per day of the week
  print(ggplot(data = plot_data,
               aes(x = Actig_Time, y = data.Off.Wrist.Status)) +
          geom_point() +
          scale_x_datetime(labels = function(x) format(x, format = "%H:%M")) +
          facet_wrap(~Weekdays))
  print(stripplot(tempdata2, pch = 20, cex = 1.2,
                  main = "Observed versus Imputed with m = 10"))
  plot(activity,
       col = "darkred",
       cex = 0.70,
       main = "Raw dataset",
       xlab = "Cumulative frequency of 30-second epochs",
       ylab = "Activity counts")
  # Reference line at the outlier threshold
  abline(h = 1800, lty = "twodash", lwd = 2)
  plot(completedata$data.Activity,
       col = "darkblue",
       cex = 0.70,
       main = "Imputed dataset",
       xlab = "Cumulative frequency of 30-second epochs",
       ylab = "Activity counts")
  abline(h = 1800, lty = "twodash", lwd = 2)

  print(paste0(f, " has been processed"))
})
The following will also extract the number of unique dates in each of the files, and the percentage of activity data that is missing in each:
# Summarise one actigraphy file: number of unique dates and the percentage
# of missing Activity values.
#
# Arguments:
#   files  Path to a single .csv file with `Date` and `Activity` columns.
#
# Returns a one-row data frame with the columns
#   files          the path that was passed in
#   unique_days    number of distinct values in the Date column
#   percentage_na  percentage of Activity values that are NA (2 decimals)
#   missing_level  "OK", a warning string when more than 10% is missing,
#                  or NA when the percentage cannot be computed (no rows)
unique_dates_and_missingness <- function(files) {
  actig <- read.csv(files)
  activity <- actig$Activity
  unique_days <- length(unique(actig$Date))
  percentage_na <- round(
    sum(is.na(activity)) / NROW(activity) * 100,
    digits = 2
  )
  # Scalar decision, so plain if/else; NaN (0 / 0 for an empty file) is
  # reported as a missing status rather than compared against the threshold.
  missing_level <- if (is.na(percentage_na)) {
    NA_character_
  } else if (percentage_na > 10) {
    "File contains over 10% missing activity data"
  } else {
    "OK"
  }
  cbind.data.frame(files, unique_days, percentage_na, missing_level)
}
# Summarise every file in `files` and stack the one-row results into a
# single data frame (one row per file).
# NOTE(review): the table is stored under the same name as the function, so
# the function definition has to be re-run before this line is run again.
unique_dates_and_missingness <- do.call(
  what = rbind,
  args = lapply(X = files, FUN = unique_dates_and_missingness)
)
Use the following modified script in the above case. It is essentially just a stripped-down version of the original script.
# Stripped-down batch imputation of actigraphy files (no per-day plots and
# no day-coverage test).
#
# For every .csv in the working directory this script:
#   1. reads the file,
#   2. derives two filename tags (outliers, missingness),
#   3. imputes Activity and White.Light with mice (random forest),
#   4. re-saves the file with the imputed columns appended, and
#   5. writes a PDF report of diagnostic plots next to it.

# Load the required packages once, rather than on every iteration
library(VIM)
library(lubridate)
library(dplyr)
library(ggplot2)
library(mice)
library(lattice)

# Assign all spreadsheets in wd()
files <- Sys.glob("*.csv")
# Returns the files in the directory
files

# Run the loop; the result is a list with one "<file> has been processed"
# string per input file.
actig_batch_process <- lapply(files, function(f) {
  # Locate and load the file
  actig <- read.csv(f, header = TRUE, sep = ",")
  # Examine the structure of the data to make sure it contains
  # all the relevant variables of the current file.
  str(actig)
  activity <- actig$Activity

  # Identify how many days are included.
  # NOTE(review): this global is overwritten on every iteration, so after
  # the loop it only describes the last file. Kept as-is for compatibility;
  # use unique_dates_and_missingness() for a per-file table.
  days_included <<- c(f, length(unique(actig$Date)))

  # Test for outlying activity values. any() is FALSE for an empty or
  # all-NA column, so such a file is not flagged as containing outliers.
  outlier_test <- if (any(activity > 1800, na.rm = TRUE)) {
    "_POSSIBLE_OUTLIERS"
  } else {
    "_Unlikely_Outliers"
  }

  # Test for too many NA's
  percentage_na <- sum(is.na(activity)) / NROW(activity) * 100
  # isTRUE() guards against NaN when the file has no rows
  percentage_na_test <- if (isTRUE(percentage_na > 10)) {
    "_HIGH_PERCENTAGE_MISSING_"
  } else {
    "_lowmiss_"
  }

  # Variables to be imputed, with a predictor variable (white light).
  # The column names are kept as data.Activity / data.White.Light because
  # they are the names that end up in the saved file.
  missing_data <- data.frame(
    data.Activity = activity,
    data.White.Light = actig$White.Light
  )
  # Create the imputed datasets.
  # Alternative (predictive mean matching):
  # tempdata <- mice(missing_data, m = 10, maxit = 5, method = "pmm")
  # Random forest is the preferred method. If you switch to 'tempdata',
  # change every use of 'tempdata2' below as well.
  tempdata2 <- mice(missing_data, m = 10, maxit = 5, method = "rf")
  # Use the first of the m completed datasets
  completedata <- as.data.frame(mice::complete(tempdata2, 1))
  completedata2 <- data.frame(actig, completedata)
  # Overwrites the input file with the imputed columns appended.
  # row.names = FALSE stops an extra index column being added on each run.
  write.csv(completedata2, file = f, row.names = FALSE)

  # Report name: tags followed by the percentage of missing Activity values
  naming <- paste0(
    f, "_Imputation_Report", outlier_test,
    percentage_na_test, round(percentage_na, digits = 2), ".pdf"
  )

  # Save plots as a report; on.exit closes the device even if a plot fails
  pdf(file = naming)
  on.exit(dev.off(), add = TRUE)
  # Missingness pattern across all variables.
  # NOTE(review): names(actig$G) looks like it evaluates to NULL - confirm
  # the intended labels.
  aggr(actig, col = c("navyblue", "red"), numbers = TRUE, sortVars = TRUE,
       prop = FALSE, labels = names(actig$G), cex.axis = 0.7, gap = 1,
       ylab = c("Number of missings", "Combinations"))
  print(stripplot(tempdata2, pch = 20, cex = 1.2,
                  main = "Observed versus Imputed with m = 10"))
  plot(activity,
       col = "darkred",
       cex = 0.70,
       main = "Raw dataset",
       xlab = "Cumulative frequency of 30-second epochs",
       ylab = "Activity counts")
  # Reference line at the outlier threshold
  abline(h = 1800, lty = "twodash", lwd = 2)
  plot(completedata$data.Activity,
       col = "darkblue",
       cex = 0.70,
       main = "Imputed dataset",
       xlab = "Cumulative frequency of 30-second epochs",
       ylab = "Activity counts")
  abline(h = 1800, lty = "twodash", lwd = 2)

  print(paste0(f, " has been processed"))
})