Synopsis
This report describes the dataset, its quality, and possible improvements to the data.
Near the UCU campus on Kozelnytska str. there are three bus stops. This dataset contains crawled GPS data for the buses that stop there.
Preparations
library(dplyr)
library(stringr)
library(lubridate)
library(ggplot2)
library(ggmap)
library(geosphere)
library(pander)
# Keep a copy of the current locale settings (all categories) as a backup
l <- Sys.getlocale(category = "LC_ALL")
# Switch every locale category to Ukrainian so Ukrainian characters display correctly
Sys.setlocale(category = "LC_ALL", locale = "Ukrainian")
Loading data
You can download the latest datasets from this repository.
# GPS observations; text columns are kept as character vectors rather than
# factors. FALSE is spelled out because `F` is an ordinary, reassignable variable.
df_data <- read.csv(
  "data-tidy/tidy_busdata_230717_1400.csv",
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)
# Parse the timestamp text (seconds may carry a fractional part) as POSIXct
# in the Europe/Kiev time zone.
df_data$datetime <- as.POSIXct(
  strptime(df_data$datetime, "%Y-%m-%d %H:%M:%OS", tz = "Europe/Kiev")
)

# Reference tables shipped alongside the GPS observations.
df_routes <- read.csv("data-tidy/routes.csv", encoding = "UTF-8")
df_stops <- read.csv("data-tidy/bus_stops.csv", encoding = "UTF-8")
df_route_stops <- read.csv("data-tidy/df_route_stops.csv", encoding = "UTF-8")
df_route_path <- read.csv("data-tidy/df_route_path.csv", encoding = "UTF-8")
df_vehicles <- read.csv("data-tidy/vehicles.csv", encoding = "UTF-8")
Exploration of Routes
Route stops count
# Count the rows of df_route_stops per route and sort the routes by that
# count, largest first.
df_route_stops_stats <- df_route_stops %>%
  group_by(routeid) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  data.frame()

# Turn routeid into a factor whose level order follows the sorted rows, so
# plots keep this ordering instead of sorting the ids numerically.
df_route_stops_stats[["routeid"]] <- factor(
  as.character(df_route_stops_stats[["routeid"]]),
  levels = df_route_stops_stats[["routeid"]]
)

pander(df_route_stops_stats)
| routeid | n |
|---------|----|
| 713002 | 78 |
| 1527114 | 73 |
| 713010 | 69 |
| 712988 | 65 |
| 949921 | 59 |
| 712991 | 57 |
| 1723724 | 57 |
| 1054553 | 43 |
# Horizontal bar chart: one bar per route, bar length = number of stops.
ggplot(df_route_stops_stats, aes(x = routeid, y = n, fill = routeid)) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  coord_flip()

Route path length
# Fetch a hybrid Google basemap centred on the given lon/lat pair.
# NOTE(review): google_api_key is not defined in the visible part of the
# report; it is expected to be set elsewhere.
sq_map <- get_map(
  location = c(24.02324, 49.83826),
  source = "google",
  zoom = 12,
  maptype = "hybrid",
  api_key = google_api_key
)

# Draw every route path in red on the basemap, one panel per route (2 x 4).
ggmap(sq_map) +
  geom_path(
    data = df_route_path,
    mapping = aes(x = lon, y = lat),
    color = "red"
  ) +
  facet_wrap(~ routeid, nrow = 2, ncol = 4)

# Total length of a path given as an ordered sequence of points.
#
# x: data frame (or tibble) with numeric columns `lon` and `lat`; rows are
#    taken in the order given, each consecutive pair forming one segment.
#
# Returns the sum of the haversine distances between consecutive rows, in the
# unit of distHaversine's Earth radius (metres by default). A path with fewer
# than two points has no segments, so its length is 0.
total_distance <- function(x) {
  n_points <- nrow(x)
  # Guard: with 0 or 1 rows there is nothing to measure (and negative/shifted
  # indexing below would otherwise misbehave).
  if (n_points < 2L) {
    return(0)
  }
  coords <- cbind(lon = x[["lon"]], lat = x[["lat"]])
  # Segment start points are rows 1..n-1, end points are rows 2..n.
  from <- coords[-n_points, , drop = FALSE]
  to <- coords[-1L, , drop = FALSE]
  # distHaversine is vectorised over matching rows of its two arguments.
  sum(distHaversine(from, to))
}
# Path length of every route, longest first. summarise() replaces the
# superseded do(); each group's coordinates are handed to total_distance()
# as a two-column data frame, which is the input it expects.
df_route_length <- df_route_path %>%
  group_by(routeid) %>%
  summarise(
    total_distance = total_distance(data.frame(lon = lon, lat = lat))
  ) %>%
  arrange(desc(total_distance)) %>%
  data.frame()

# Fix the factor level order to the sorted row order so plots keep it.
df_route_length$routeid <- factor(
  as.character(df_route_length$routeid),
  levels = df_route_length$routeid
)

pander(df_route_length)
| routeid | total_distance |
|---------|----------------|
| 1527114 | 47902 |
| 713010 | 38283 |
| 713002 | 37783 |
| 712988 | 35217 |
| 949921 | 31652 |
| 712991 | 29541 |
| 1723724 | 29187 |
| 1054553 | 23801 |
# Horizontal bar chart: one bar per route, bar length = total path distance.
ggplot(
  df_route_length,
  aes(x = routeid, y = total_distance, fill = routeid)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  coord_flip()

Daily vehicle count by route
# Assign every GPS record to a one-day interval of its timestamp.
df_data[["minints"]] <- cut(df_data[["datetime"]], breaks = "1 day")

# First count the distinct vehicles seen per route per day, then summarise
# those daily counts per route and sort by the median, largest first.
df_vehicles_by_route <- df_data %>%
  group_by(routeid, minints) %>%
  summarise(veh_n = n_distinct(vehicleid)) %>%
  group_by(routeid) %>%
  summarise(
    min = min(veh_n),
    mean = mean(veh_n),
    median = median(veh_n),
    max = max(veh_n)
  ) %>%
  arrange(desc(median)) %>%
  data.frame()

# Fix the factor level order to the sorted row order so plots keep it.
df_vehicles_by_route[["routeid"]] <- factor(
  as.character(df_vehicles_by_route[["routeid"]]),
  levels = df_vehicles_by_route[["routeid"]]
)

pander(df_vehicles_by_route)
| routeid | min | mean | median | max |
|---------|-----|------|--------|-----|
| 1723724 | 9 | 15.5 | 17 | 17 |
| 712988 | 11 | 11 | 11 | 11 |
| 712991 | 8 | 10.5 | 10.5 | 14 |
| 713002 | 3 | 6.5 | 7 | 9 |
| 1527114 | 4 | 5.5 | 6 | 6 |
| 949921 | 2 | 3.667 | 3.5 | 5 |
| 713010 | 2 | 2.167 | 2 | 3 |
| 1054553 | 1 | 1.667 | 2 | 2 |
# Horizontal bar chart: one bar per route, bar length = median daily number
# of distinct vehicles.
ggplot(
  df_vehicles_by_route,
  aes(x = routeid, y = median, fill = routeid)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  coord_flip()

GPS records data distribution by routes
# Count the GPS records of every vehicle on every route, then average those
# per-vehicle counts within each route and sort routes by that average.
data_route_distr <- df_data %>%
  group_by(routeid, vehicleid) %>%
  summarise(count = n()) %>%
  group_by(routeid) %>%
  summarise(mean_count = mean(count)) %>%
  arrange(desc(mean_count)) %>%
  data.frame()

# Fix the factor level order to the sorted row order so plots keep it.
data_route_distr[["routeid"]] <- factor(
  as.character(data_route_distr[["routeid"]]),
  levels = data_route_distr[["routeid"]]
)

pander(data_route_distr)
| routeid | mean_count |
|---------|------------|
| 712988 | 4213 |
| 1527114 | 3754 |
| 949921 | 3093 |
| 713002 | 2779 |
| 712991 | 2614 |
| 1723724 | 2582 |
| 713010 | 2179 |
| 1054553 | 268.2 |
# Horizontal bar chart: one bar per route, bar length = mean number of GPS
# records per vehicle.
ggplot(
  data_route_distr,
  aes(x = routeid, y = mean_count, fill = routeid)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  coord_flip()
