Duplicated results and missing time intervals in “Minutes” column cause a spread of times. We must eliminate duplicate observations in order to discretely categorize readings by hour

require(zoo)
library(dplyr)

setwd("C:/Users/Aidan/Downloads")
trafficData158324 <- read.csv("trafficData158324.csv", header = TRUE, sep = ",")

Determining if there are any duplicate or missing values for TIMESTAMP by calculating if the number of unique values of TIMESTAMP is equal to the total number of rows in the dataset. If all TIMESTAMP values are unique and there are no gaps in the data, calculation should return TRUE:

length(unique(trafficData158324$TIMESTAMP)) == nrow(trafficData158324)
## [1] FALSE

Returns “FALSE” indicating that there are either duplicate values or missing time intervals.

Determining if there are any time gaps in the TIMESTAMP variable. We set ‘n_occur’ to a table of the TIMESTAMP column of the dataset. We then take a substring of the final two characters (the minute values) and set equal to ‘time_gap’.

n_occur <- data.frame(table(trafficData158324$TIMESTAMP))
time_gap <- substr(n_occur$Var1,15,16)
n_occur <- data.frame(table(time_gap))
n_occur
##    time_gap Freq
## 1        00 2673
## 2        05 2649
## 3        10 2656
## 4        15 2670
## 5        20 2678
## 6        25 2680
## 7        30 2680
## 8        35 2674
## 9        40 2669
## 10       45 2673
## 11       50 2677
## 12       55 2680

n_occur then becomes a table with two columns: time_gap and the frequency of each value.

The table returns varying lengths for each 5 minute interval, indicating time gaps in the data.

#Separating the TIMESTAMP variable into Date and Time variables
date_time_combined <- data.frame(x=trafficData158324$TIMESTAMP)
date_time_sep <- data.frame(do.call('rbind', strsplit(as.character(date_time_combined$x),'T',fixed=TRUE)))

colnames(date_time_sep) <- c("Date","Time")
rm(date_time_combined)

#Needs avg_tvt from "3d_scatterplot.R"
avg_tvt <- select(trafficData158324, TIMESTAMP, vehicleCount, avgMeasuredTime) 

new_data_frame <- bind_cols(date_time_sep, avg_tvt)
new_data_frame <- select(new_data_frame, Date, Time, avgMeasuredTime, vehicleCount)
###

#Separating the Date variable into Year, Month and Day variables
date_combined <- data.frame(x=new_data_frame$Date)
date_separated <- data.frame(do.call('rbind', strsplit(as.character(date_combined$x),'-',fixed=TRUE)))

colnames(date_separated) <- c("Year","Month","Day")
rm(date_combined)
###

#Combining separated 'Date', 'Time' with avgMeasuredTime and vehicleCount - SEPARATING COMPLETE
clean_traffic_data <- bind_cols(date_separated, date_time_sep, avg_tvt)
clean_traffic_data <- select(clean_traffic_data, Year, Month, Day, Time, avgMeasuredTime, vehicleCount)
rm(date_separated)
###

#Separating the Time variable into Hour, Minute and Second (always :00) variables
time_combined <- data.frame(x=clean_traffic_data$Time)
time_separated <- data.frame(do.call('rbind', strsplit(as.character(time_combined$x),':',fixed=TRUE)))

colnames(time_separated) <- c("Hour","Minute","Second")
rm(time_combined)
###

#Combining separated 'Hour' and 'Minutes' with avgMeasuredTime and vehicleCount - SEPARATING COMPLETE
clean_traffic_data <- bind_cols(time_separated, clean_traffic_data)
clean_traffic_data <- select(clean_traffic_data, Year, Month, Day, Hour, Minute, avgMeasuredTime, vehicleCount)
rm(time_separated)
###

#Slicing incomplete hours from the start and finish of the dataset
clean_traffic_data <- slice(clean_traffic_data, 7:32068)
###

Applying a rolling mean function across the length of dataset to get a mean value for every hour. ‘Width’ of 12 values, function = mean calculation, ‘by’ calculate mean every 12 values.

tester <- data.matrix(clean_traffic_data)
output <- rollapply(tester,12,(mean),by=12,column=TRUE,align='right')

Creating a spinning 3d Scatterplot

install.packages("rgl", repos="https://cran.r-project.org/web/packages/rgl/index.html")
library(rgl)
output_1 <- select(as.data.frame(output),Hour, avgMeasuredTime, vehicleCount)
output_k <- kmeans(output_1, 5)
plot3d(output_1, col=output_k$cluster, size=2, xlab = "Hour", 
       ylab="Average Time", zlab="Vehicle Count")

Here we can see that the ‘hour’ axis does not contain discrete values, but a spread suggesting there are still gaps in the data. How can we introduce NAs where a time gap exists?