Reading Data

First, we read in the original .csv file, trafficData158324.csv.

# Make the download folder the working directory so the CSV can be referenced
# by its bare file name.
setwd("C:/Users/Aidan/Downloads")
# read.csv() already defaults to header = TRUE and sep = ",".
trafficData158324 <- read.csv(file = "trafficData158324.csv")

#install.packages("scatterplot3d", repos="https://cran.rstudio.com")
#install.packages("Rcpp", dependencies=TRUE, repos="http://cran.rstudio.com/")
#install.packages("dplyr", dependencies=TRUE, repos='http://cran.rstudio.com/')
#install.packages("NbClust", repos="https://cran.rstudio.com/")

library(scatterplot3d)
library(Rcpp)
library(dplyr)
library(NbClust)

Selecting and Clustering Data

Creating ‘avg_st’, a data frame of average time and vehicle count

# Keep only the two measurement columns that are clustered below.
avg_st <- trafficData158324 %>%
  select(avgMeasuredTime, vehicleCount)

Conducting k-means clustering on ‘avg_st’ and plotting with coloured clusters

# kmeans() picks its starting centres at random when `centers` is a number, so
# fix the RNG seed first; otherwise the cluster labels (used for the plot
# colours here and for the agreement check further down) change on every run.
# 1234 matches the default seed used by wssplot().
set.seed(1234)
avg_st.kmeans <- kmeans(avg_st, centers = 5)

# Plot the two columns of avg_st against each other, one colour per cluster.
plot(avg_st, col = avg_st.kmeans$cluster)

Cluster Analysis

Calculating the optimum number of clusters from the within-groups sum of squares (TODO: understand the NbClust library and what this function does)

# Plot the within-groups sum of squares against the number of clusters.
# Call this function with wssplot(avg_st).
#
# Args:
#   avg_st: numeric data frame or matrix, one row per observation.
#   nc:     largest number of clusters to evaluate (default 15); must be >= 1.
#   seed:   value passed to set.seed() before every kmeans() call so each fit
#           starts from the same RNG state.
#
# Returns (invisibly) the numeric vector of length `nc` that is plotted:
# element k is the within-groups sum of squares for k clusters.
wssplot <- function(avg_st, nc = 15, seed = 1234) {
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)

  wss <- numeric(nc)
  # One cluster: total sum of squares about the column means.
  wss[1] <- (nrow(avg_st) - 1) * sum(apply(avg_st, 2, var))

  # seq_len(nc)[-1] is empty when nc == 1, whereas 2:nc would count down
  # (2, 1) and leave wss with more entries than there are x positions.
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- sum(kmeans(avg_st, centers = i)$withinss)
  }

  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}

Within Groups Sum of Squares to Determine Best Clustering Scheme

# Draw the sum-of-squares curve for 1 to 15 clusters (the function defaults,
# written out here for readability).
wssplot(avg_st, nc = 15, seed = 1234)

Calculating the degree of agreement between the cluster solution and the measured data

# Cross-tabulate every distinct avgMeasuredTime value against the k-means
# cluster it was assigned to, then print the summary of that table.
ct.cl <- table(avg_st[["avgMeasuredTime"]], avg_st.kmeans[["cluster"]])
summary(ct.cl)
## Number of cases in table: 32075 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 119528, df = 832, p-value = 0
##  Chi-squared approximation may be incorrect
# Install flexclust only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("flexclust", quietly = TRUE)) {
  install.packages("flexclust", repos = "https://cran.rstudio.com/")
}
## package 'flexclust' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Aidan\AppData\Local\Temp\Rtmp2VSENY\downloaded_packages
library(flexclust)
# Agreement index computed from the cross-tabulation; the recorded output
# labels the value "ARI".
randIndex(x = ct.cl)
##        ARI 
## 0.09483044

Dealing with the TIMESTAMP Column

Separating the TIMESTAMP variable into Date and Time variables

# Split every TIMESTAMP string at the literal "T" and stack the pieces into a
# two-column data frame. The work happens inside local() so no temporary
# objects are left behind in the workspace.
date_time_sep <- local({
  stamp_pieces <- strsplit(as.character(trafficData158324$TIMESTAMP),
                           "T", fixed = TRUE)
  data.frame(do.call("rbind", stamp_pieces))
})

names(date_time_sep) <- c("Date", "Time")

# 'avg_tvt' holds TIMESTAMP, vehicle count and average time from the raw data.
avg_tvt <- trafficData158324 %>%
  select(TIMESTAMP, vehicleCount, avgMeasuredTime)

# Put the split Date/Time columns next to the measurements and keep only the
# four columns of interest, in this order.
new_data_frame <- date_time_sep %>%
  bind_cols(avg_tvt) %>%
  select(Date, Time, avgMeasuredTime, vehicleCount)

# Split the Date column on "-" into Year, Month and Day, then combine those
# with the separated 'Date'/'Time' columns, avgMeasuredTime and vehicleCount.
# local() keeps the intermediate pieces out of the workspace, so nothing needs
# to be removed afterwards.
clean_traffic_data <- local({
  ymd_pieces <- strsplit(as.character(new_data_frame$Date), "-", fixed = TRUE)
  ymd <- data.frame(do.call("rbind", ymd_pieces))
  names(ymd) <- c("Year", "Month", "Day")

  combined <- bind_cols(ymd, date_time_sep, avg_tvt)
  select(combined, Year, Month, Day, Time, avgMeasuredTime, vehicleCount)
})

# Split the Time column on ":" into Hour, Minute and Second, attach those to
# the existing columns, and keep Year..Minute plus the two measurements (the
# Second and Time columns are dropped by the final select). local() keeps the
# intermediate pieces out of the workspace.
clean_traffic_data <- local({
  hms_pieces <- strsplit(as.character(clean_traffic_data$Time), ":",
                         fixed = TRUE)
  hms <- data.frame(do.call("rbind", hms_pieces))
  names(hms) <- c("Hour", "Minute", "Second")

  combined <- bind_cols(hms, clean_traffic_data)
  select(combined, Year, Month, Day, Hour, Minute,
         avgMeasuredTime, vehicleCount)
})

# Drop the incomplete hours at the start and end of the dataset by keeping
# only rows 7 through 32068.
clean_traffic_data <- clean_traffic_data %>%
  slice(7:32068)