1 Data Preprocessing

1.1 Load all valid GPX datasets and apply supervised GPX track classes

source('../gpx_validation.R') # GPX dataset processing / shared OpenCPU functions

temp <- dat <- list()
data_dir <- "C:/Users/rober/Desktop/Repositories/gpx_processing/data_backup/"
valid_datasets <- c("14-05.gpx","2016-06-18 10.14.52 Stopwatch.gpx","Current.gpx",
                    "Meg.gpx","Scout.gpx","TeamEchoCurrent.gpx","Track_ 018-02-24 145845.gpx",
                    "Track_14-08 1.gpx","Track_2015-07-05 215817.gpx","Track_2015-07-09 151556.gpx","Track_2015-07-09 183806.gpx","Track_2015-07-10 132200.gpx","Track_2015-07-15 204324.gpx","Track_2015-07-24 164157.gpx","Track_2015-07-29 130706.gpx","Track_2016-04-30 115630.gpx","Track_2016-06-18 131319.gpx","Track_2018-02-10 104731 Task 1.gpx","Track_2018-02-10 110234 Task 2.gpx","Track_2018-02-10 113753 Task 14B.gpx","Track_2018-02-10 150828 Task 29B.gpx","Track_ALPHA05-14 125702.gpx","Track_K9MEG06-18 124555.gpx","Track_QUEBEC5-14 154550.gpx","Track_T18 800 001 TASK2.gpx","Track_T18 800 001.gpx")

hardcode_response <- c("Trail","Sweep","Trail","Canine","Canine","Sweep","Sweep","Sweep","Trail","Trail","Offtrail","Offtrail","Trail","Offtrail","None","Trail","Offtrail","Trail","Trail","Sweep","Trail","Offtrail","Canine","Offtrail","Trail","Sweep")

1.2 Run data frame generator and apply 10 minute block aggregate IDs

# build datasets
# meg.gpx returning warnings
for (i in 1:length(valid_datasets)){
  name <- valid_datasets[i]
  cat(name)
  temp[[name]] <- invisible(gpx_validation(paste0(data_dir,valid_datasets[i])))
  temp[[name]]$dt_ts <- strptime(temp[[name]]$DateTime, "%Y-%m-%dT%H:%M:%SZ")
  temp[[name]]$name <- name
  # Add 10 minute identifiers
  temp_2 <- split(temp[[name]], cut(strptime(temp[[name]]$dt_ts,format="%F %R"),"10 mins"))
  temp_3 <- do.call("rbind", temp_2)
  temp_3$time_bucket <- gsub("\\..*","",rownames(temp_3))
  dat[[name]] <- transform(temp_3,time_bucket_index=as.numeric(factor(time_bucket)))
}

rm(temp);rm(temp_2); rm(temp_3)

1.3 Sorting to standard 10 minute blocks/units and creating associated predictors

`%+=%` = function(e1,e2) eval.parent(substitute(e1 <- e1 + e2))

ascent <- function(arr){
  
  lastHeight <- arr[1]
  total_ascent <- 0
  total_descent <- 0
  
  for (i in seq(1:length(arr))){
    if (arr[i] > lastHeight){
      total_ascent %+=% (-1 * (lastHeight - arr[i]))
    }else{
      total_descent %+=% (lastHeight - arr[i])
    }
    lastHeight <- arr[i]
  }
  return (c(total_ascent,total_descent))
}

suppressWarnings(require(anytime))
## Loading required package: anytime
n <- length(dat)
stats <- data.frame("name"=character(n),
                    "min_time"=anytime(n),
                    "max_time"=anytime(n),
                    "response"=character(n),
                    "num_logs"=integer(n),
                    "distance"=double(n),
                    "avg_speed"=double(n),
                    "avg_gradient"=double(n),
                    "avg_pace"=double(n),
                    "total_ascent"=double(n),
                    "total_descent"=double(n)
                    )
# Gather statistics
j <- k <- j <- 1

for(i in dat){
  split_dat <- split(i,i$time_bucket_index)
  for (f in split_dat){
    #browser()
    stats[k,]$name <- i$name[j]
    stats[k,]$response <- hardcode_response[j]
    stats[k,]$min_time <- min(f$dt_ts)
    stats[k,]$max_time <- max(f$dt_ts)
    stats[k,]$num_logs <- nrow(f)
    # New regression stats
    stats[k,]$distance <- max(f$Dist) - min(f$Dist)
    stats[k,]$avg_speed <- mean(f$Speed)
    stats[k,]$avg_gradient <- mean(f$Gradient)
    stats[k,]$avg_pace <- mean(f$Pace)
    stats[k,]$total_ascent <- ascent(f$Elevation)[1]
    stats[k,]$total_descent <- ascent(f$Elevation)[2]
    k<-k+1
  }
  j<-j+1
}

stats$minute_duration <- (stats$max_time-stats$min_time)/60
stats$logs_per_minute <- stats$num_logs/as.numeric(stats$minute_duration)

1.4 Remove outliers and missing data

counts <- table(stats$name,stats$response)
barplot(counts, main="Class Count by GPX", 
    xlab="Number of classes", col=palette(rainbow(n)))

stats_outlier_removed <- stats
stats_outlier_removed<- stats_outlier_removed[is.finite(stats_outlier_removed$logs_per_minute), ]
stats_outlier_removed<-stats_outlier_removed[stats_outlier_removed$minute_duration > 9, ]
stats_outlier_removed<-stats_outlier_removed[stats_outlier_removed$response != "None", ]
counts <- table(stats_outlier_removed$name,stats_outlier_removed$response)
barplot(counts, main="Class Count by GPX", 
    xlab="Number of classes", col=palette(rainbow(n)))

2 Model Testing / Exploration

trainingdata <- stats_outlier_removed

2.1 Predictor visualization and analysis

2.2 Apply transformations

data_format <- function(df){
  df$log_distance <- log(df$distance)
  df$log_avg_speed <- log(df$avg_speed)
  df$log_total_ascent <- log(df$total_ascent)
  df$log_total_descent <- log(df$total_descent)
  df$log_avg_pace <- log(df$avg_pace)
  df$min_time <- NULL
  df$max_time <- NULL
  df$response_alt <- as.integer(as.factor(df$response))
  df$response_f <- as.factor(df$response)
  df <- do.call(data.frame, lapply(df, function(x) {replace(x, is.infinite(x) | is.na(x), 0)}))
  return (df)
}

trainingdata <- data_format(trainingdata)
### Add night/day?  

2.3 Correlation analysis

suppressWarnings(require(dplyr))
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
suppressWarnings(library(reshape2))#melt function

cor_mat <- as.matrix(cor(subset(trainingdata, select = -c(name,minute_duration,response,response_alt,response_f))))
cor_mat_melt <- arrange(melt(cor_mat), -abs(value))
cor_mat_melt <- cor_mat_melt %>% filter(value > .25, value != 1)
toDelete <- seq(1, nrow(cor_mat_melt), 2)
cor_mat_melt[toDelete, ]
##                 Var1             Var2     value
## 1    logs_per_minute         num_logs 0.9993351
## 3          avg_speed         distance 0.9894442
## 5      log_avg_speed     log_distance 0.9783880
## 7       log_avg_pace         avg_pace 0.7943735
## 9      total_descent     total_ascent 0.7672732
## 11 log_total_descent    total_descent 0.7302528
## 13  log_total_ascent     total_ascent 0.6911984
## 15      log_distance        avg_speed 0.6641661
## 17      log_distance  logs_per_minute 0.6551007
## 19      log_distance         num_logs 0.6520552
## 21     log_avg_speed        avg_speed 0.6445795
## 23      log_distance         distance 0.6389638
## 25     log_avg_speed  logs_per_minute 0.6369149
## 27     log_avg_speed         num_logs 0.6324812
## 29   logs_per_minute        avg_speed 0.6152543
## 31         avg_speed         num_logs 0.6084249
## 33     log_avg_speed         distance 0.6017079
## 35   logs_per_minute         distance 0.5894126
## 37          distance         num_logs 0.5839398
## 39 log_total_descent    log_avg_speed 0.5151395
## 41 log_total_descent     total_ascent 0.4921165
## 43 log_total_descent log_total_ascent 0.4791655
## 45   logs_per_minute    total_descent 0.4706471
## 47 log_total_descent     log_distance 0.4695573
## 49     total_descent         num_logs 0.4679433
## 51     log_avg_speed    total_descent 0.4624027
## 53  log_total_ascent    total_descent 0.4526575
## 55   logs_per_minute     total_ascent 0.4283963
## 57      total_ascent         num_logs 0.4215973
## 59     log_avg_speed     total_ascent 0.4134211
## 61      log_distance    total_descent 0.4026903
## 63     total_descent        avg_speed 0.3903178
## 65  log_total_ascent    log_avg_speed 0.3844168
## 67     total_descent         distance 0.3510473
## 69      log_distance     total_ascent 0.3416614
## 71      total_ascent        avg_speed 0.3366814
## 73 log_total_descent        avg_speed 0.3340285
## 75  log_total_ascent     log_distance 0.3269416
## 77 log_total_descent  logs_per_minute 0.3044082
## 79 log_total_descent         distance 0.3003804
## 81 log_total_descent         num_logs 0.2995813
## 83  log_total_ascent        avg_speed 0.2910805
## 85      total_ascent         distance 0.2816207
## 87  log_total_ascent         distance 0.2527659

2.4 Boxplot visualization analysis

par(mfrow=c(1,2), oma = c(1,1,0,0) + 0.1,  mar = c(2,2,1,1) + 0.1)

boxplot(trainingdata$distance ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Distance", cex=0.8, side=1, line=2)
boxplot(trainingdata$log_distance ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Distance (log)", cex=0.8, side=1, line=2)

boxplot(trainingdata$avg_speed ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Avg Speed", cex=0.8, side=1, line=2)
boxplot(trainingdata$log_avg_speed ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Avg Speed (log)", cex=0.8, side=1, line=2)

boxplot(trainingdata$avg_pace ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Avg Pace", cex=0.8, side=1, line=2)
boxplot(trainingdata$log_avg_pace ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Avg Pace (log)", cex=0.8, side=1, line=2)

boxplot(trainingdata$total_ascent ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Total Ascent", cex=0.8, side=1, line=2)
boxplot(trainingdata$log_total_ascent ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Total Ascent (log)", cex=0.8, side=1, line=2)

boxplot(trainingdata$total_descent ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Total Descent", cex=0.8, side=1, line=2)
boxplot(trainingdata$log_total_descent ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Total Descent (log)", cex=0.8, side=1, line=2)

boxplot(trainingdata$avg_gradient ~ trainingdata$response, col="lightgreen", pch=19)
mtext("Avg Gradient", cex=0.8, side=1, line=2)

2.5 Random Forest Model Application

suppressWarnings(library(randomForest))
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
training_subset <- trainingdata[ , -which(names(trainingdata) %in% c("name","response","num_logs","minute_duration","logs_per_minute"))]

set.seed(100)
sample_train <- sample(2, nrow(training_subset), replace = TRUE, prob=c(0.8, 0.2))

random_forest_model <- randomForest(response_f ~ distance + avg_gradient + total_ascent + log_distance + 
    log_avg_speed + log_total_ascent + log_total_descent + log_avg_pace, training_subset[sample_train==1,], ntree=500, importance=TRUE, proximity=TRUE)

plot(random_forest_model)

2.6 Testing model

random_forest_predict <- predict(random_forest_model, training_subset[sample_train==2,])
table(observed = training_subset[sample_train ==2, "response_f"], predicted = random_forest_predict)
##           predicted
## observed   Canine Offtrail Sweep Trail
##   Canine        9        0     0     1
##   Offtrail      4       12     2     5
##   Sweep         0        1     5     4
##   Trail         5        5     8    25

2.7 Save model

save(random_forest_predict, file = "rf_class.rda")