Synopsis

This report describes the dataset, its quality, and possible improvements to the data.

There are three bus stops near the UCU campus on Kozelnytska str. This dataset contains crawled GPS data for the buses that stop there.

Preparations

library(dplyr)
library(stringr)
library(lubridate)
library(ggplot2)
library(ggmap)
library(geosphere)
library(pander)
# Backup system locale settings
l <- Sys.getlocale()

# Fix for Ukrainian characters
Sys.setlocale('LC_ALL', 'Ukrainian')

Loading data

You can download the latest datasets from this repository.

# GPS records: keep text columns as character vectors. `FALSE` is spelled out
# because the shorthand `F` is an ordinary variable that can be reassigned.
df_data <- read.csv(
  "data-tidy/tidy_busdata_230717_1400.csv",
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)
# Parse the textual timestamps into POSIXct in the Europe/Kiev time zone.
df_data$datetime <- as.POSIXct(
  strptime(df_data$datetime, "%Y-%m-%d %H:%M:%OS", tz = "Europe/Kiev")
)

# Reference tables, all read as UTF-8.
df_routes <- read.csv("data-tidy/routes.csv", encoding = "UTF-8")
df_stops <- read.csv("data-tidy/bus_stops.csv", encoding = "UTF-8")
df_route_stops <- read.csv("data-tidy/df_route_stops.csv", encoding = "UTF-8")
df_route_path <- read.csv("data-tidy/df_route_path.csv", encoding = "UTF-8")
df_vehicles <- read.csv("data-tidy/vehicles.csv", encoding = "UTF-8")

Exploration of Routes

Route stops count

# Number of stops per route, largest first, as a plain data.frame.
df_route_stops_stats <- df_route_stops %>%
  group_by(routeid) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  data.frame()

# Turn routeid into a factor whose level order follows the sorted rows, so
# plots keep the ranking instead of ordering by the numeric id.
df_route_stops_stats[["routeid"]] <- factor(
  as.character(df_route_stops_stats[["routeid"]]),
  levels = df_route_stops_stats[["routeid"]]
)
pander(df_route_stops_stats)
routeid n
713002 78
1527114 73
713010 69
712988 65
949921 59
712991 57
1723724 57
1054553 43
# Horizontal bar chart of stop counts, one bar (and colour) per route.
ggplot(df_route_stops_stats, aes(x = routeid, y = n, fill = routeid)) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  coord_flip()

Route path length

# Fetch a hybrid Google base map centred on the given lon/lat pair.
sq_map <- get_map(
  location = c(24.02324, 49.83826),
  source = "google",
  zoom = 12,
  maptype = "hybrid",
  api_key = google_api_key
)

# Draw every route path in red on the base map, one facet per route (2 x 4).
ggmap(sq_map) +
  geom_path(
    data = df_route_path,
    mapping = aes(x = lon, y = lat),
    color = "red"
  ) +
  facet_wrap("routeid", nrow = 2, ncol = 4)

#' Total haversine length of a path.
#'
#' Sums the haversine distance (geosphere::distHaversine, default Earth
#' radius, i.e. metres) between each pair of consecutive rows of `x`.
#'
#' @param x A data frame with numeric columns `lon` and `lat`; rows are taken
#'   in their existing order as consecutive points of the path.
#' @return A single number: the summed segment lengths, or 0 when `x` has
#'   fewer than two rows (a path without segments has no length).
total_distance <- function(x) {
  n_points <- nrow(x)

  # Without at least two points there are no segments to measure. The earlier
  # implementation failed here because its row ranges degenerated.
  if (n_points < 2) {
    return(0)
  }

  coords <- as.matrix(x[, c("lon", "lat")])

  # Pair every point with its successor: rows 1..n-1 against rows 2..n.
  segment_start <- coords[-n_points, , drop = FALSE]
  segment_end <- coords[-1, , drop = FALSE]

  # distHaversine is vectorised over matching rows of the two matrices.
  sum(distHaversine(segment_start, segment_end))
}

# Path length per route, longest first, as a plain data.frame.
df_route_length <- df_route_path %>%
  group_by(routeid) %>%
  do(data.frame(total_distance = total_distance(.))) %>%
  arrange(desc(total_distance)) %>%
  data.frame()

# Fix the factor level order to the sorted row order so plots keep the ranking.
df_route_length[["routeid"]] <- factor(
  as.character(df_route_length[["routeid"]]),
  levels = df_route_length[["routeid"]]
)
pander(df_route_length)
routeid total_distance
1527114 47902
713010 38283
713002 37783
712988 35217
949921 31652
712991 29541
1723724 29187
1054553 23801
# Horizontal bar chart of total path length per route.
ggplot(df_route_length, aes(x = routeid, y = total_distance, fill = routeid)) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  coord_flip()

Daily vehicle count by route

# Bucket every GPS record into a one-day interval.
df_data$minints <- cut(df_data$datetime, breaks = "1 day")

# Count distinct vehicles per route and day, then summarise those daily counts
# per route and rank the routes by their median.
df_vehicles_by_route <- df_data %>%
  group_by(routeid, minints) %>%
  summarise(veh_n = n_distinct(vehicleid)) %>%
  group_by(routeid) %>%
  summarise(
    min = min(veh_n),
    mean = mean(veh_n),
    median = median(veh_n),
    max = max(veh_n)
  ) %>%
  arrange(desc(median)) %>%
  data.frame()

# Fix the factor level order to the sorted row order so plots keep the ranking.
df_vehicles_by_route[["routeid"]] <- factor(
  as.character(df_vehicles_by_route[["routeid"]]),
  levels = df_vehicles_by_route[["routeid"]]
)
pander(df_vehicles_by_route)
routeid min mean median max
1723724 9 15.5 17 17
712988 11 11 11 11
712991 8 10.5 10.5 14
713002 3 6.5 7 9
1527114 4 5.5 6 6
949921 2 3.667 3.5 5
713010 2 2.167 2 3
1054553 1 1.667 2 2
# Horizontal bar chart of the median daily vehicle count per route.
ggplot(df_vehicles_by_route, aes(x = routeid, y = median, fill = routeid)) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  coord_flip()

GPS records data distribution by routes

# Count GPS records per vehicle, then average those counts within each route
# and rank the routes by that average.
data_route_distr <- df_data %>%
  group_by(routeid, vehicleid) %>%
  summarise(count = n()) %>%
  group_by(routeid) %>%
  summarise(mean_count = mean(count)) %>%
  arrange(desc(mean_count)) %>%
  data.frame()

# Fix the factor level order to the sorted row order so plots keep the ranking.
data_route_distr[["routeid"]] <- factor(
  as.character(data_route_distr[["routeid"]]),
  levels = data_route_distr[["routeid"]]
)
pander(data_route_distr)
routeid mean_count
712988 4213
1527114 3754
949921 3093
713002 2779
712991 2614
1723724 2582
713010 2179
1054553 268.2
# Horizontal bar chart of the mean number of GPS records per vehicle by route.
ggplot(data_route_distr, aes(x = routeid, y = mean_count, fill = routeid)) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  coord_flip()