First, we read in the original .csv file, trafficData158324.csv
# Switch to the download folder, then load the raw export from it.
# read.csv() already defaults to a header row and comma-separated fields.
setwd("C:/Users/Aidan/Downloads")
trafficData158324 <- read.csv(file = "trafficData158324.csv")
#install.packages("scatterplot3d", repos="https://cran.rstudio.com")
#install.packages("Rcpp", dependencies=TRUE, repos="http://cran.rstudio.com/")
#install.packages("dplyr", dependencies=TRUE, repos='http://cran.rstudio.com/')
#install.packages("NbClust", repos="https://cran.rstudio.com/")
library(scatterplot3d)
library(Rcpp)
library(dplyr)
library(NbClust)
Creating ‘avg_st’, a data frame of average time and vehicle count
# Keep only the two measurement columns that are clustered further down.
avg_st <- trafficData158324 %>%
  select(avgMeasuredTime, vehicleCount)
Conducting k-means clustering on ‘avg_st’ and plotting with coloured clusters
# Fit a five-cluster k-means model; no seed is set here, so the cluster
# labels can differ from run to run.
avg_st.kmeans <- kmeans(x = avg_st, centers = 5)
# Scatter plot of the two columns, one colour per assigned cluster.
plot(avg_st, col = avg_st.kmeans[["cluster"]])
Calculating the optimum number of clusters (TODO: understand the NbClust library and what this function does)
# Plot the within-groups sum of squares for 1..nc k-means clusters.
# Call this function with wssplot(avg_st)
#
# Arguments:
#   avg_st  numeric data frame or matrix to cluster
#   nc      largest number of clusters to try (a single number >= 1)
#   seed    value passed to set.seed() before every kmeans() fit, so each
#           fit starts from the same random state
# Returns (invisibly) the numeric vector of within-groups sums of squares,
# where element i belongs to the i-cluster solution.
wssplot <- function(avg_st, nc = 15, seed = 1234) {
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)

  # Preallocate instead of growing the vector inside the loop.
  wss <- numeric(nc)

  # One cluster: the total sum of squares around the column means.
  wss[1] <- (nrow(avg_st) - 1) * sum(apply(avg_st, 2, var))

  # seq_len(nc)[-1] is empty when nc == 1, whereas 2:nc would count
  # backwards (2, 1) and leave wss longer than the x axis of the plot.
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- sum(kmeans(avg_st, centers = i)$withinss)
  }

  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}
# Plot within-groups sum of squares against the number of clusters for
# avg_st, using wssplot's defaults (nc = 15, seed = 1234).
wssplot(avg_st)
Calculating the degree of agreement between the cluster solution and the measured data
# Cross-tabulate each measured average time against its k-means cluster
# label, then print the table summary (case count and independence test).
ct.cl <- table(avg_st[["avgMeasuredTime"]], avg_st.kmeans[["cluster"]])
summary(ct.cl)
## Number of cases in table: 32075
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 119528, df = 832, p-value = 0
## Chi-squared approximation may be incorrect
# Install flexclust only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("flexclust", quietly = TRUE)) {
  install.packages("flexclust", repos = "https://cran.rstudio.com/")
}
## package 'flexclust' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Aidan\AppData\Local\Temp\Rtmp2VSENY\downloaded_packages
library(flexclust)
# Compute flexclust's Rand index for the cross-tabulation ct.cl; the recorded
# output below labels the result ARI.
randIndex(ct.cl)
## ARI
## 0.09483044
Separating the TIMESTAMP variable into Date and Time variables
# Split every TIMESTAMP string at the literal "T" and stack the pieces into a
# two-column data frame (Date, Time). The work happens inside local() so no
# temporary object is left in the workspace.
date_time_sep <- local({
  stamps <- as.character(trafficData158324$TIMESTAMP)
  pieces <- do.call("rbind", strsplit(stamps, "T", fixed = TRUE))
  setNames(data.frame(pieces), c("Date", "Time"))
})
# avg_tvt holds the raw TIMESTAMP together with the vehicle count and the
# average measured time.
avg_tvt <- trafficData158324 %>%
  select(TIMESTAMP, vehicleCount, avgMeasuredTime)
# Put the split Date/Time columns next to the measurements and keep only the
# four columns needed from here on.
new_data_frame <- bind_cols(date_time_sep, avg_tvt) %>%
  select(Date, Time, avgMeasuredTime, vehicleCount)
# Break the Date column (yyyy-mm-dd, split on "-") into Year, Month and Day.
# local() keeps the intermediate values out of the workspace.
date_separated <- local({
  day_strings <- as.character(new_data_frame$Date)
  day_matrix <- do.call("rbind", strsplit(day_strings, "-", fixed = TRUE))
  setNames(data.frame(day_matrix), c("Year", "Month", "Day"))
})
# Join Year/Month/Day with the Date/Time split and the measurements, then keep
# the date parts, Time, avgMeasuredTime and vehicleCount.
clean_traffic_data <- bind_cols(date_separated, date_time_sep, avg_tvt) %>%
  select(Year, Month, Day, Time, avgMeasuredTime, vehicleCount)
# The split date columns now live in clean_traffic_data.
rm(date_separated)
# Break the Time column (hh:mm:ss, split on ":") into Hour, Minute and Second.
# local() keeps the intermediate values out of the workspace.
time_separated <- local({
  clock_strings <- as.character(clean_traffic_data$Time)
  clock_matrix <- do.call("rbind", strsplit(clock_strings, ":", fixed = TRUE))
  setNames(data.frame(clock_matrix), c("Hour", "Minute", "Second"))
})
# Attach the Hour/Minute/Second columns and keep the date parts, Hour, Minute
# and the two measurements (Second and Time are dropped by the select).
clean_traffic_data <- bind_cols(time_separated, clean_traffic_data) %>%
  select(Year, Month, Day, Hour, Minute, avgMeasuredTime, vehicleCount)
# The split time columns now live in clean_traffic_data.
rm(time_separated)
# Keep rows 7 through 32068 only, trimming the incomplete hours at the start
# and end of the dataset.
clean_traffic_data <- clean_traffic_data %>%
  slice(7:32068)