Notes

This is to be used after you are familiar with processing files individually and can interpret the outputs.

The code will process every .csv file in the working directory, so all files should be trimmed to 24 hr periods only (starting from 12:00:00 AM on Day 2) as per the usual process with actigraphy imputation, and be labelled by their ID code and saved as .csv files.

Calculating time needed to process your files

A file with ~5% missing data imputed with the following (optimal) MICE settings: m = 10, maxit = 5, and random forest selected, takes 5.41 minutes to complete (based on 100 iterations).

This is using the following hardware in October 2017:

Core i7-7700K CPU @ 4.20GHz
16GB RAM
Solid State Drive

Output

The function does not alter the input documents’ formatting or data. Rather, 2 columns are added at the end of the dataset, labelled data.Activity and data.White.Light. These are the new imputed data that will be used in subsequent analyses.

There are some mechanisms built into the .pdf report naming which will indicate at a glance if you need to: 1. Review the file to check for possible outliers (defined as activity count values exceeding 1800) 2. Exclude the case/discard the file due to fewer than 7 days of available coverage.

All reports generated should be reviewed to assure the imputed values are appropriate.

The filenames of the reports will display:

“Unlikely_Outliers” if the activity counts do not exceed 1800, or “POSSIBLE_OUTLIERS” if activity values exceed 1800.
“_lowmiss_” if 10% or less of the activity values are missing, or "_HIGH_PERCENTAGE_MISSING_" if more than 10% of the activity values are missing.
No message if the sampling period includes all 7 days of the week (i.e. every weekday and both weekend days), or "_NOT_ENOUGH_DAYS" if fewer than 7 distinct days of the week are covered by the sampling period.
The percentage of missing data, rounded to 2 decimal places at the end of the filename.

If any of these warning messages are displayed in the filename, examine these reports first and re-run after removing outliers etc. Then move on to examining the remainder of the files.

Set your working directory…

Set your working directory to the location of the files you want to process. Then run this code to pass the names of the files you want to analyse to R.

The outputs will be saved back into your working directory.

# Collect the name of every .csv file in the current working directory.
# Wrapping the assignment in parentheses also displays the matched file
# names, so they can be checked before the batch is run.
(files <- Sys.glob("*.csv"))

Run the loop

# Run the loop
#
# For every file name in `files` this:
#   1. reads the actigraphy export,
#   2. derives the filename flags (outliers, missingness, day coverage),
#   3. imputes Activity (with White.Light as a predictor) using MICE,
#   4. overwrites the .csv with the original columns plus the imputed
#      `data.Activity` and `data.White.Light` columns, and
#   5. writes a .pdf report whose name carries the flags and the percentage
#      of missing activity values.
# `actig_batch_process` ends up holding one "<file> has been processed"
# string per input file.

# Load the required packages once, in the same order as before so that
# function masking between the packages is unchanged.
library(VIM)
library(lubridate)
library(dplyr)
library(ggplot2)
library(mice)
library(lattice)

actig_batch_process <- lapply(files, function(f) {

  # Locate and load the file
  data <- read.csv(f, header = TRUE, sep = ",")

  # Examine the structure of the data to make sure it contains
  # all the relevant statistics of your current file.
  str(data)

  # Wrangle the dates and times
  Actig_Time <- parse_date_time(data$Time, "%I:%M:%S %p")
  Actig_Date <- dmy(data$Date)
  Weekdays <- weekdays(as.Date(Actig_Date))

  # Test for outlying activity values (any observed count above 1800)
  outlier_test <- if (any(data$Activity > 1800, na.rm = TRUE)) {
    "_POSSIBLE_OUTLIERS"
  } else {
    "_Unlikely_Outliers"
  }

  # Test for too many NA's
  percentage_na <- sum(is.na(data$Activity)) / NROW(data$Activity) * 100
  # isTRUE() keeps an empty file (0 / 0 = NaN) from aborting the batch here
  percentage_na_test <- if (isTRUE(percentage_na > 10)) {
    "_HIGH_PERCENTAGE_MISSING_"
  } else {
    "_lowmiss_"
  }

  # Test for enough days: all 7 day names must occur. Dates that could not
  # be parsed (NA) are not counted as a day of the week.
  no_weekdays <- length(unique(Weekdays[!is.na(Weekdays)]))
  weekdays_test <- if (no_weekdays < 7) "_NOT_ENOUGH_DAYS" else ""

  # Variables to be imputed, with a predictor variable (white light).
  # The column names are the ones appended to the output file.
  missing_data <- data.frame(
    data.Activity = data$Activity,
    data.White.Light = data$White.Light
  )

  # Create a new dataset containing imputed values

  # tempdata <- mice(missing_data, m = 10, maxit = 5, method = "pmm")
  #                 --BELOW METHOD IS BETTER--

  # OR:

  tempdata2 <- mice(missing_data, m = 10, maxit = 5, method = "rf")

  #   --REMEMBER TO CHANGE THE FOLLOWING CODE TO 'tempdata' OR 'tempdata2'--

  # Summarise the imputation object
  summary(tempdata2)

  # Take the first of the m imputed datasets and append it to the input
  completedata <- mice::complete(tempdata2, 1)
  completedata2 <- data.frame(data, completedata)
  # row.names = FALSE: otherwise an extra unnamed index column is written
  # in front of the original columns every time a file is processed.
  write.csv(completedata2, file = f, row.names = FALSE)

  # Build the report name. The percentage comes straight from the activity
  # column so it is always present and always the *missing* share.
  low_percentage <- round(percentage_na, digits = 2)
  naming <- paste0(f, "_Imputation_Report", weekdays_test, outlier_test,
                   percentage_na_test, low_percentage, ".pdf")

  # Create data frame with variables for plots
  plot_data <- data.frame(
    Off.Wrist.Status = data$Off.Wrist.Status,
    Actig_Time,
    Actig_Date,
    Weekdays
  )

  # Save plots as a report. The device is closed on exit so that a failure
  # in any plot below does not leave the .pdf open for the rest of the batch.
  pdf(file = naming)
  report_device <- dev.cur()
  on.exit(dev.off(report_device), add = TRUE)

  # Missingness overview
  # NOTE(review): `labels = names(data$G)` is kept unchanged - confirm it
  # produces the intended variable labels.
  aggr(data, col = c("navyblue", "red"), numbers = TRUE, sortVars = TRUE,
       prop = FALSE, labels = names(data$G), cex.axis = 0.7, gap = 1,
       ylab = c("Number of missings", "Combinations"))

  # Off-wrist status over the day, first split by calendar date and then by
  # day of the week (which combines duplicated days)
  off_wrist_plot <- ggplot(data = plot_data,
                           aes(x = Actig_Time, y = Off.Wrist.Status)) +
    geom_point() +
    scale_x_datetime(labels = function(x) format(x, format = "%H:%M"))
  print(off_wrist_plot + facet_wrap(~Actig_Date))
  print(off_wrist_plot + facet_wrap(~Weekdays))

  print(stripplot(tempdata2, pch = 20, cex = 1.2,
                  main = "Observed versus Imputed with m = 10"))

  plot(data$Activity,
       col = "darkred",
       cex = 0.70,
       main = "Raw dataset",
       xlab = "Cumulative frequency of 30-second epochs",
       ylab = "Activity counts")
  abline(h = 1800, lty = "twodash", lwd = 2)

  plot(completedata$data.Activity,
       col = "darkblue",
       cex = 0.70,
       main = "Imputed dataset",
       xlab = "Cumulative frequency of 30-second epochs",
       ylab = "Activity counts")
  abline(h = 1800, lty = "twodash", lwd = 2)

  print(paste0(f, "        has been processed"))

})

To extract the number of recorded days in each file and percent missing

The following will also extract the number of unique dates in each of the files, and their percentage missing of activity data:

#' Summarise recorded days and missing activity data for one file
#'
#' @param files Path to a single .csv file with `Date` and `Activity`
#'   columns. (The argument is called `files` because the function is
#'   applied to each element of the `files` vector.)
#' @return A one-row data frame with the columns `files`, `unique_days`
#'   (number of distinct values in `Date`), `percentage_na` (percentage of
#'   missing `Activity` values, rounded to 2 decimal places) and
#'   `missing_level` ("OK", or a warning text when more than 10% is missing;
#'   `NA` when the percentage cannot be computed, e.g. for a file without
#'   rows).
unique_dates_and_missingness <- function(files) {
  data <- read.csv(files)

  unique_days <- length(unique(data$Date))

  percentage_na <- sum(is.na(data$Activity)) / NROW(data$Activity) * 100
  percentage_na <- round(percentage_na, digits = 2)

  # Plain if/else on the scalar; nothing is printed to the console.
  missing_level <- if (is.na(percentage_na)) {
    NA_character_
  } else if (percentage_na > 10) {
    "File contains over 10% missing activity data"
  } else {
    "OK"
  }

  cbind.data.frame(files, unique_days, percentage_na, missing_level)
}
# Summarise every file and stack the one-row results into one data frame.
# The result is stored under the same name as the function, so after this
# line `unique_dates_and_missingness` is the summary table and the function
# has to be defined again before it can be re-run.
unique_dates_and_missingness <- do.call(
  what = rbind,
  args = lapply(X = files, FUN = unique_dates_and_missingness)
)

When the “Off.Wrist.Status” column is missing in the export

Use the following modified script in the above case. It is essentially just a stripped-down version of the original script.

# Collect the name of every .csv file in the current working directory.
# Wrapping the assignment in parentheses also displays the matched file
# names, so they can be checked before the batch is run.
(files <- Sys.glob("*.csv"))
# Run the loop
#
# Variant for exports without an "Off.Wrist.Status" column. For every file
# name in `files` this:
#   1. reads the actigraphy export,
#   2. derives the filename flags (outliers, missingness),
#   3. imputes Activity (with White.Light as a predictor) using MICE,
#   4. overwrites the .csv with the original columns plus the imputed
#      `data.Activity` and `data.White.Light` columns, and
#   5. writes a .pdf report whose name carries the flags and the percentage
#      of missing activity values.
# `actig_batch_process` ends up holding one "<file> has been processed"
# string per input file.

# Load the required packages once, in the same order as before so that
# function masking between the packages is unchanged.
library(VIM)
library(lubridate)
library(dplyr)
library(ggplot2)
library(mice)
library(lattice)

actig_batch_process <- lapply(files, function(f) {

  # Locate and load the file
  data <- read.csv(f, header = TRUE, sep = ",")

  # Examine the structure of the data to make sure it contains
  # all the relevant statistics of your current file.
  str(data)

  # Identify how many days included
  # NOTE(review): this global is overwritten on every iteration, so after the
  # batch it only describes the last file; kept for backward compatibility.
  days_included <<- c(f, length(unique(data$Date)))

  # Test for outlying activity values (any observed count above 1800)
  outlier_test <- if (any(data$Activity > 1800, na.rm = TRUE)) {
    "_POSSIBLE_OUTLIERS"
  } else {
    "_Unlikely_Outliers"
  }

  # Test for too many NA's
  percentage_na <- sum(is.na(data$Activity)) / NROW(data$Activity) * 100
  # isTRUE() keeps an empty file (0 / 0 = NaN) from aborting the batch here
  percentage_na_test <- if (isTRUE(percentage_na > 10)) {
    "_HIGH_PERCENTAGE_MISSING_"
  } else {
    "_lowmiss_"
  }

  # Variables to be imputed, with a predictor variable (white light).
  # The column names are the ones appended to the output file.
  missing_data <- data.frame(
    data.Activity = data$Activity,
    data.White.Light = data$White.Light
  )

  # Create a new dataset containing imputed values

  # tempdata <- mice(missing_data, m = 10, maxit = 5, method = "pmm")
  #                 --BELOW METHOD IS BETTER--

  # OR:

  tempdata2 <- mice(missing_data, m = 10, maxit = 5, method = "rf")

  #   --REMEMBER TO CHANGE THE FOLLOWING CODE TO 'tempdata' OR 'tempdata2'--

  # Summarise the imputation object
  summary(tempdata2)

  # Take the first of the m imputed datasets and append it to the input
  completedata <- mice::complete(tempdata2, 1)
  completedata2 <- data.frame(data, completedata)
  # row.names = FALSE: otherwise an extra unnamed index column is written
  # in front of the original columns every time a file is processed.
  write.csv(completedata2, file = f, row.names = FALSE)

  # Build the report name. The percentage comes straight from the activity
  # column so it is always present and always the *missing* share.
  low_percentage <- round(percentage_na, digits = 2)
  naming <- paste0(f, "_Imputation_Report", outlier_test,
                   percentage_na_test, low_percentage, ".pdf")

  # Save plots as a report. The device is closed on exit so that a failure
  # in any plot below does not leave the .pdf open for the rest of the batch.
  pdf(file = naming)
  report_device <- dev.cur()
  on.exit(dev.off(report_device), add = TRUE)

  # Missingness overview
  # NOTE(review): `labels = names(data$G)` is kept unchanged - confirm it
  # produces the intended variable labels.
  aggr(data, col = c("navyblue", "red"), numbers = TRUE, sortVars = TRUE,
       prop = FALSE, labels = names(data$G), cex.axis = 0.7, gap = 1,
       ylab = c("Number of missings", "Combinations"))

  print(stripplot(tempdata2, pch = 20, cex = 1.2,
                  main = "Observed versus Imputed with m = 10"))

  plot(data$Activity,
       col = "darkred",
       cex = 0.70,
       main = "Raw dataset",
       xlab = "Cumulative frequency of 30-second epochs",
       ylab = "Activity counts")
  abline(h = 1800, lty = "twodash", lwd = 2)

  plot(completedata$data.Activity,
       col = "darkblue",
       cex = 0.70,
       main = "Imputed dataset",
       xlab = "Cumulative frequency of 30-second epochs",
       ylab = "Activity counts")
  abline(h = 1800, lty = "twodash", lwd = 2)

  print(paste0(f, "        has been processed"))

})

Batch_processing_Actigraphy_Imputation

Jonathon Pye