library(tidyverse)
library(lubridate)
library(gridExtra)
library(scales)
library(zoo)
library(forecast)
dfraw <- read_csv(file="dfosf001.csv",
locale = locale(tz ="Australia/Sydney"))
dfraw %>% head()
dfraw %>% glimpse()
dfraw$OFFENCE_MONTH <- dfraw$OFFENCE_MONTH %>% dmy()
df <- dfraw %>%
select(OFFENCE_FINYEAR, OFFENCE_MONTH, OFFENCE_CODE, OFFENCE_DESC, FACE_VALUE, TOTAL_NUMBER, TOTAL_VALUE)
# raw data set specs
dfraw %>% spec()
cols(
OFFENCE_FINYEAR = col_character(),
OFFENCE_MONTH = col_character(),
OFFENCE_CODE = col_integer(),
OFFENCE_DESC = col_character(),
LEGISLATION = col_character(),
SECTION_CLAUSE = col_character(),
FACE_VALUE = col_integer(),
CAMERA_IND = col_character(),
CAMERA_TYPE = col_character(),
LOCATION_CODE = col_integer(),
LOCATION_DETAILS = col_character(),
SCHOOL_ZONE_IND = col_character(),
SPEED_BAND = col_character(),
SPEED_IND = col_character(),
POINT_TO_POINT_IND = col_character(),
RED_LIGHT_CAMERA_IND = col_character(),
SPEED_CAMERA_IND = col_character(),
SEATBELT_IND = col_character(),
MOBILE_PHONE_IND = col_character(),
PARKING_IND = col_character(),
CINS_IND = col_character(),
FOOD_IND = col_character(),
BICYCLE_TOY_ETC_IND = col_character(),
TOTAL_NUMBER = col_integer(),
TOTAL_VALUE = col_integer()
)
# sample cleansed data
df %>% head()
df %>% summarise( "From Date" = min(OFFENCE_MONTH),
"To Date" = max(OFFENCE_MONTH),
"Total Revenue" = sum(TOTAL_VALUE,na.rm = T),
"Number of Offences" = sum(TOTAL_NUMBER),
"Min Offence Value" = min(FACE_VALUE),
"Max Offence Value" = max(FACE_VALUE),
"Number of Records" = n()) -> dfSummary
knitr::kable(dfSummary)
| From Date | To Date | Total Revenue | Number of Offences | Min Offence Value | Max Offence Value | Number of Records |
|---|---|---|---|---|---|---|
| 2013-01-01 | 2019-02-01 | 4379485352 | 17946602 | 20 | 18455 | 289284 |
df %>% group_by(OFFENCE_DESC) %>%
summarise(Value = first(FACE_VALUE)) %>%
top_n(5, Value ) %>%
ggplot(aes(x = reorder(OFFENCE_DESC,Value), y = Value)) +
geom_bar(stat ="identity") + #geom_label(aes(label=Value)) +
theme_light() +
labs (x = "Offence Description") +
coord_flip()
df %>% group_by(OFFENCE_DESC) %>%
summarise(Value = first(FACE_VALUE)) %>%
top_n(-5, Value ) %>%
ggplot(aes(x = reorder(OFFENCE_DESC,Value), y = Value)) +
geom_bar(stat ="identity") + #geom_label(aes(label=Value)) +
theme_light() +
labs (x = "Offence Description") +
coord_flip()
# numbers metric formatter
format_si <- function(...) {
# Format a vector of numeric values according
# to the International System of Units.
# http://en.wikipedia.org/wiki/SI_prefix
#
# Based on code by Ben Tupper
# https://stat.ethz.ch/pipermail/r-help/2012-January/299804.html
# Args:
# ...: Args passed to format()
#
# Returns:
# A function to format a vector of strings using
# SI prefix notation
#
function(x) {
limits <- c(1e-24, 1e-21, 1e-18, 1e-15, 1e-12,
1e-9, 1e-6, 1e-3, 1e0, 1e3,
1e6, 1e9, 1e12, 1e15, 1e18,
1e21, 1e24)
prefix <- c("y", "z", "a", "f", "p",
"n", "µ", "m", " ", "k",
"M", "G", "T", "P", "E",
"Z", "Y")
# Vector with array indices according to position in intervals
i <- findInterval(abs(x), limits)
# Set prefix to " " for very small values < 1e-24
i <- ifelse(i==0, which(limits == 1e0), i)
paste(format(round(x/limits[i], 1),
trim=TRUE, scientific=FALSE, ...),
prefix[i])
}
}
# decluttering theme
theme_dvn <- function() {
retTheme = theme_minimal() +
theme(panel.grid = element_blank())
return(retTheme)
}
# plotting function
topPlot <- function(df, v){
df %>% group_by(OFFENCE_FINYEAR) %>%
summarise(Value = sum(TOTAL_VALUE), Number = sum(TOTAL_NUMBER)) %>%
ggplot(aes(x=OFFENCE_FINYEAR, y=!!as.name(v))) +
geom_bar(stat = "identity") +
labs( y = v,
x = "Financial Year",
title = paste0(v, " of offences by Financial Year")) +
scale_y_continuous(label=format_si()) +
theme_dvn()-> returnPlot
return(returnPlot)
}
df %>% topPlot("Value")
df %>% topPlot("Number")
dfTop <- df %>% group_by(OFFENCE_FINYEAR) %>%
summarise(Value = sum(TOTAL_VALUE), Number = sum(TOTAL_NUMBER))
knitr::kable(dfTop, caption = "Top Offences by Year")
| OFFENCE_FINYEAR | Value | Number |
|---|---|---|
| 2012-2013 | 287861625 | 1304922 |
| 2013-2014 | 650827664 | 2842871 |
| 2014-2015 | 671614637 | 2840545 |
| 2015-2016 | 715113972 | 2937351 |
| 2016-2017 | 766545273 | 3011925 |
| 2017-2018 | 788953652 | 3101149 |
| 2018-2019 | 498568529 | 1907839 |
First must prepare the plots to aggregate by year and generate plot of top
# aggregates data set
df %>% mutate(Year = year(OFFENCE_MONTH)) %>%
group_by(Year, OFFENCE_DESC) %>%
summarise(Value = sum(TOTAL_VALUE)) %>%
#tally (TOTAL_VALUE) %>%
top_n(10, Value) %>%
ungroup() -> df2
# list of plots by year
pl <- c()
for (y in 2013:2019){
df2 %>%
filter(Year == y) %>%
ggplot(aes(x = reorder(OFFENCE_DESC,Value), y = Value)) +
facet_grid(cols = vars( Year)) +
geom_bar(stat ="identity") + #geom_label(aes(label=Value)) +
theme_dvn() +
labs( x = "Offence Type")+
theme(plot.title = element_text(hjust = -4))+
scale_y_continuous(label=format_si())+
coord_flip() -> p
# print( p)
pl[[toString(y)]] = p
}
Display the top offences by year
invisible(print(pl))
$`2013`
$`2014`
$`2015`
$`2016`
$`2017`
$`2018`
$`2019`
### plot a line
df %>% filter(grepl("parking", df$OFFENCE_DESC)) %>%
group_by(OFFENCE_MONTH) %>%
summarise(Value = sum(TOTAL_VALUE)) %>%
ungroup() %>%
mutate(Year = year(OFFENCE_MONTH),
Month=month(OFFENCE_MONTH)) %>%
select(Year, Month, Value) %>%
ggplot(aes(Month, Value, group=Year)) +
geom_line(aes(color = as.factor(Year))) + # facet_grid(rows =vars( Year))
scale_x_continuous(breaks = 1:12) +
scale_y_continuous(label = format_si()) +
theme_minimal() +
labs (
color = "Year",
title = "Revenue of Parking Offences by Year ",
caption = "Significant Increase of Parking Offences Revenue after June 2018 ")
df %>% group_by(OFFENCE_MONTH) %>%
summarise(Value = sum(TOTAL_VALUE),
Number = sum(TOTAL_NUMBER)) %>%
ungroup() %>%
select(Value, Number) -> tsdf
ts1 <- ts(tsdf,
start = c(year(dfSummary$`From Date`), month(dfSummary$`From Date`)),
frequency = 12) %>% head(-1)
plot_ts <- function (obj, endyear = 2019){
return(
autoplot(obj) + theme_dvn() + scale_y_continuous(label = format_si()) + scale_x_continuous(breaks = 2013:endyear)
)
}
plot_ts(ts1[,1]) + labs(y="Value [AUD]")
plot_ts(ts1[,2]) + labs(y="Number of Offences")
revenueFit <- auto.arima(ts1[,1])
numberFit <- auto.arima(ts1[,2])
revenueForecast <- forecast(revenueFit, h =2*12)
plot_ts(revenueForecast, endyear = 2021) + labs(y="Value [AUD]", title="Revenue Forecast")
numberForecast <- forecast(numberFit,h=2*12)
plot_ts(numberForecast, endyear = 2021) + labs(y="Number of Offences", title="Total Offences Forecast")
The forecasting performed in this example is for explaratory purposes only, and is only meant to demostrate the ability to forecast with this data set.
Producing a meaningful and forecast requires deeper analysis of the timeseries decomposition and proper validation the forecast errors and accuracy on a test data set before accepting any of the forecasting results.
R markdown combined with plotting libraries like ggplot and other analysis library, like the time series function, can be used to explore data and communicate findings and insights. The output of the R markdown is not necessiarly interactive, but it can be used to publish the findings into the web as an HTML page, or to share as a Word or PowerPoint documents with stakeholders.
Applying these tools requires proper and careful preperation of the data, and a good handle on plotting library in order to produce correctly formatted visualisations.