The following visualizations are from the Baltimore City Traffic Citations.The data set is downloadable through this link: https://www.dropbox.com/s/a7oanceqcbd1he5/Baltimore_Traffic_Citations.csv?raw=1
Pressing the ‘Code’ button shows / hides the code used in this document.
#Necessary Libraries
library(data.table)
library(dplyr)
library(DescTools)
library(lubridate)
library(scales)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(plotly)
library(stringr)
library(tidyr)
library(leaflet)
#Read in Data
balt <- fread("Baltimore_Citations.csv",na.strings = c(NA,""))
head(balt)
## Citation Tag ExpMM ExpYY State Make Address
## 1: 88206835 5CB9762 03 20 MD TOYT 1300 BLK EAST NORTHERN PKWY WB
## 2: 88221248 4DC2945 03 20 MD VOLK 1300 BLK EAST NORTHERN PKWY WB
## 3: 88221388 9CL8104 09 16 MD HYUN 2800 BLK REISTERSTOWN RD WB
## 4: 88236758 S14LDS 04 20 NJ NIS 1500 BLK E COLD SPRING LN EB
## 5: 88247052 9BN6360 09 20 MD DODG 1600 BLK E BELVEDERE AVE EB
## 6: 88250593 9BF9185 11 20 MD HOND 1300 BLK WOODBOURNE AVE EB
## ViolCode Description ViolFine ViolDate Balance
## 1: 32 Fixed Speed Camera 40 06/17/2019 06:14:00 PM 0
## 2: 32 Fixed Speed Camera 40 06/18/2019 12:12:00 PM 0
## 3: 32 Fixed Speed Camera 40 06/18/2019 12:17:00 PM 40
## 4: 32 Fixed Speed Camera 40 06/19/2019 07:08:00 AM 40
## 5: 32 Fixed Speed Camera 40 06/19/2019 01:12:00 PM 0
## 6: 32 Fixed Speed Camera 40 06/19/2019 03:16:00 PM 0
## PenaltyDate OpenFine OpenPenalty ImportDate
## 1: NA 0 0 07/02/2019 04:03:00 AM
## 2: NA 0 0 07/02/2019 04:03:00 AM
## 3: NA 40 0 07/02/2019 04:03:00 AM
## 4: NA 40 0 07/02/2019 04:03:00 AM
## 5: NA 0 0 07/03/2019 04:04:00 AM
## 6: NA 0 0 07/03/2019 04:04:00 AM
#Count the number of unique vehicles
carcount <- data.frame(count(balt, Tag))
#Set order to decreasing
carcount <- carcount[order(carcount$n,decreasing = TRUE),]
#Identify No Tag rows (Citations without a corresponding license plate)
top3 <- balt[balt$Tag %in% c(NA, "NO TAGST", "NOTAGS T", "Description")]
df_top3 <- count(top3, Description)
#Select rows of which include the string 'TAG'
NoTagRows <- which(carcount$Tag %like any% c("%TAG%"))
#Identify rows with NA values
NARows <- which(is.na(carcount$Tag))
#dataframe for no tags and NAs
NA_and_NoTagRows <- c(NoTagRows, NARows)
#list exceptions to keep
keep <- c("DATAGUY","SCRTAGT","TAG DAT","TAGDAT", "LITAG8R","TAG ART","TAG944","V1NTAGE")
#select the rows to keep
keeprows <- which(carcount$Tag %in% keep)
#remove NAs and No Tag rows
rows_to_drop <- setdiff(NA_and_NoTagRows, keeprows)
#sum of incomplete rows
BadTotal <- sum(carcount[rows_to_drop, "n"])
#removal of rows
carcount<- carcount[-rows_to_drop,]
#Row bind a "No Tags" col
carcount <- rbind(c("No Tags", BadTotal),carcount)
rownames(carcount) <- c(1:nrow(carcount))
#Create citation year col
balt$year <- year(mdy_hms(balt$ViolDate))
#Plot
ggplot(balt, aes(x=year)) +
#histogram
geom_histogram(bins = 8, color="darkblue", fill="lightblue")+
#labels
labs(title = "Histogram of Citations by Year",x = "Year",y = "Citation Count")+
#Y axis labels
scale_y_continuous(labels = comma)+
#Bin size
stat_bin(binwidth = 1,geom='text',color='black', aes(label=..count..),vjust=-.04)+
#Center title text
theme(plot.title = element_text(hjust = 0.5))
Simple histogram of the number citations by year. 2018 is the peak year for citations. A noticeable decline in 2020 due to the pandemic.
#Cast to numeric
carcount$n <- as.numeric(carcount$n)
#plot
ggplot(carcount[2:11,],aes(x = reorder(Tag,n), y=n))+
#bar chart
geom_bar(colour="black", fill="gray76", stat="identity") +
#labels
labs(title = "Number of Citations by Tag (Top 10)", x = "Car License Plate",y = "Citation Count")+
#Center title text
theme(plot.title = element_text(hjust = 0.5))
This bar graph shows the top ten offenders in the data set by license plate.
#new data frame counting the number of each type of offense
df_reasons <- count(balt,Description)
#set order to decreasing
df_reasons <- df_reasons[order(df_reasons$n, decreasing = TRUE),]
#top 10 reasons
top_reasons <- df_reasons$Description[1:10]
#Data frame for violations and violation date
new_df <- balt %>%
#select the row if it is one of the top reasons
filter(Description %in% top_reasons)%>%
select(ViolDate,Description)%>%
#format violation date
mutate(year = year(mdy_hms(ViolDate)))%>%
#group by description and year
group_by(Description,year)%>%
summarise(n = length(Description), .groups = 'keep')%>%
data.frame()
#Data frame for all other offenses
other_df <- balt %>%
#select the row if it is not one of the top reasons
filter(!Description %in% top_reasons)%>%
select(ViolDate)%>%
#format violation date
mutate(year = year(mdy_hms(ViolDate)), Description = "Other")%>%
#group by description and year
group_by(Description, year)%>%
summarise(n = length(Description), .groups = 'keep')%>%
data.frame()
#row bind the two data frames
new_df <- rbind(new_df,other_df)
#Calculate Aggregate total
agg_tot <- new_df%>%
select(Description,n)%>%
group_by(Description)%>%
summarise(Total = sum(n), .groups = 'keep')%>%
data.frame()
#Calculate the sum of the fines
fines_df <- balt %>%
filter(Description %in% top_reasons)%>%
select(Description,ViolFine)%>%
group_by(Description)%>%
summarise(totalFines= sum(ViolFine))%>%
data.frame()
#year as factor
new_df$year <- as.factor(new_df$year)
#adjust Y axis
max_y <- plyr::round_any(max(agg_tot$Total),250000,ceiling)
#plor
ggplot(new_df,aes( x = reorder(Description,n,sum), y = n, fill = year))+
#bar chart
geom_bar(stat="identity", position = position_stack(reverse = TRUE)) +
#flip coordinates + labels
coord_flip()+ labs(title = "Citation Count by Citation Type", x="",y="Citation Count",fill="Year")+
#Center title text
theme_light()+
theme(plot.title = element_text(hjust = .05))+
#Legend
scale_fill_brewer(palette = "Paired",guide = guide_legend(reverse = TRUE))+
#axis labels and breaks
geom_text(data = agg_tot,aes(x=Description,y=Total,label=scales::comma(Total),fill=NULL),hjust=-0.1,size=4)+
scale_y_continuous(labels = comma, breaks = seq(0,max_y,250000), limits = c(0,max_y))
Citation count by citation type by year. The fixed speed camera is credited with over 1.5 million citations. Red light violations were the second highest individual category. Since 2017, the Maryland Department of Transportation has almost doubled the number of citations delivered annualy.
#Data frame for days of the week
days_df <- balt %>%
#select violation date
select(ViolDate)%>%
#format year and weekdays
dplyr::mutate(year(mdy_hms(ViolDate)),
dayoftheweek = weekdays(mdy_hms(ViolDate), abbreviate = TRUE))%>%
#group by year and day of the week
dplyr::group_by(year = balt$year, dayoftheweek)%>%
#count the number of citations
dplyr::summarise(n = length(ViolDate), .groups = 'keep')%>%
data.frame()
#year as factor
days_df$year <- as.factor(days_df$year)
#Set day order to be the order of the days of the week
day_order <- factor(days_df$dayoftheweek, level = c('Mon','Tue','Wed','Thu','Fri','Sat','Sun'))
#put the day order in the data frame
days_df$dayoftheweek<- factor(days_df$dayoftheweek, level = c('Mon','Tue','Wed','Thu','Fri','Sat','Sun'))
#line plot
ggplot(days_df, aes(x=day_order, y = n, group = year))+
#lines
geom_line(aes(color=year), size = 3)+
#labels
labs(title = "Citations by Day and by Year", x = "Days of the Week", y = "Citation Count")+
theme_light()+
#center title text
theme(plot.title = element_text(hjust = .5))+
#create points for each day of the week on the line
geom_point(shape = 21, size = 3, color = "black", fill = "white")+
#y axis labels
scale_y_continuous(labels = comma)+
#legend
scale_color_brewer(palette = "Paired", name = "Year",guide = guide_legend(reverse = TRUE))
2018 was the highest year for citations given. There is a slight increase in citations on Fridays while the weekends have relatively low citation counts
#sum of other violations
other_sum <- sum(balt[!balt$Description %in% top_reasons, "ViolFine"])
#row bind fines df with other to include the 'other' form of violation
fines_df <- rbind(fines_df,c("Other",other_sum))
#total fines as numeric
fines_df$totalFines <- as.numeric(fines_df$totalFines)
#y labels
ylab <- seq(0, max(fines_df$totalFines)/1e6,10)
#Format the label for dollars in terms of millions
my_labs <- paste0("$",ylab, "M")
#x lables
xlab <- seq(0,23,1)
#plot
ggplot(new_df,aes(x = reorder(Description,n,sum), y = n, fill = year))+
#bar chart + coordinate flip
geom_bar(stat = "identity",position = position_stack(reverse = TRUE))+ coord_flip()+theme_light()+
#labels
labs(title = "Citation Count and Total Fines",x="",y="Citation Count", fill = "Year")+
#center title text
theme(plot.title = element_text(hjust = .05))+
#legend
scale_fill_brewer(palette = "Spectral",guide = guide_legend(reverse = TRUE))+
#line graph
geom_line(inherit.aes = FALSE,data = fines_df,aes(x=Description, y = totalFines/20, colour = "Total Fines",group=1),size=1)+
#line color
scale_color_manual(NULL, values = "black") +
#y axis labels
scale_y_continuous(labels = comma,
sec.axis = sec_axis(~.*20,name = "Total Fines",labels = my_labs, breaks =ylab*1e6 ))+
#place a point on the line for the sum of citations for each cartegory
geom_point(inherit.aes = FALSE, data = fines_df,
#axis data
aes(x = Description, y = totalFines/20, group = 1), size = 3, shape = 21, fill = "white",color = "black")+
#legend clarity
theme(legend.background = element_rect(fill="transparent"),
legend.box.background = element_rect(fill= "transparent", colour = NA),
legend.spacing = unit(-1,"lines"))
This chart demonstrates the citation count by type and by year while displaying the number of revenue from the fines on the upper x axis. The points on the line represent the total fines value for each type of citation. The fixed speed camera totals to just over $60 million.
#create a data frame for the states (Maryland, Virginia, or Other)
state_df <- balt%>%
#select the state and the date
select(State, ViolDate)%>%
#format year
mutate(year = year(mdy_hms(ViolDate)),
#Create three categories: MD,VA,and Other
myState = ifelse(State=="MD", "MD", ifelse(State=="VA","VA","Other")))%>%
#group by year and state
group_by(year, myState)%>%
#count number of instances for each state
summarise(n=length(myState), .groups = 'keep')%>%
#group by year
group_by(year)%>%
#calculate the percentage of the total for each observation
mutate(percent_of_total = round(100*n/sum(n),1))%>%
#ungroup so plot can be made
ungroup()%>%
data.frame()
#plotly pie chart
plot_ly(hole=.7)%>%
#title
layout(title= "Citations by State of Vehicle Registration (2013-2015)")%>%
#2015 data
add_trace(data = state_df[state_df$year==2015,],
labels = ~myState,
values = state_df[state_df$year==2015,"n"],
type = "pie",
textposition = "inside",
hovertemplate = "Year: 2015<br>State:%{label}<br>Percent:%{percent}<br>Citation Count:%{value}<extra></extra>")%>%
#2014 data
add_trace(data = state_df[state_df$year==2014,],
labels = ~myState,
values = state_df[state_df$year==2014,"n"],
type = "pie",
textposition = "inside",
hovertemplate = "Year: 2014<br>State:%{label}<br>Percent:%{percent}<br>Citation Count:%{value}<extra></extra>",
domain = list(
x = c(0.16,0.84),
y = c(0.16,0.84)))%>%
#2013 data
add_trace(data = state_df[state_df$year==2013,],
labels = ~myState,
values = state_df[state_df$year==2013,"n"],
type = "pie",
textposition = "inside",
hovertemplate = "Year: 2013<br>State:%{label}<br>Percent:%{percent}<br>Citation Count:%{value}<extra></extra>",
domain = list(
x = c(0.27,0.73),
y = c(0.27,0.73)))
Maryland plates are the biggest offenders in Maryland… Surprise! More states could be added for further detail, this is just a simple example :) Hover over the sectors to see detail
#set breaks for the map
breaks <- c(seq(0,max(days_df$n),by=25000))
#plot
ggplot(days_df,aes(x=year,y=dayoftheweek, fill=n))+
#heatmap
geom_tile(color = "black")+
#text lables: comma
geom_text(aes(label=comma(n)))+
coord_equal(ratio=1)+
#labels
labs(title = "Heatmap: Citations by Day of the Week",
x = "Year",
y = "Days of the Week",
fill = "Citation Count")+
theme_minimal()+
#center title text
theme(plot.title = element_text(hjust = .5))+
#set y axis as days of the week
scale_y_discrete(limits = rev(levels(days_df$dayoftheweek)))+
#set colors and breaks
scale_fill_continuous(low="white",high="red", breaks = breaks)+
#legend
guides(fill=guide_legend(reverse = TRUE, override.aes = list(colour="black")))
Similar to the line graph this heat map shows which days of the week produced the most citations and how the days’ citation counts has changed over the recent years. Starting in 2016 Wednesday became the highest ticketed day for all following years. Midweek rush?
#Three locations in Baltimore and what they are
Loyola <- c(39.3463882,-76.6210078,"College",4000)
Hopkins <- c(39.3297084,-76.6219169,"College",18000)
Alonsos <- c(39.3445305,-76.6308260,"Bar",1000)
#data frame for the locations
gps_df <- data.frame(rbind(Loyola,Hopkins,Alonsos))
#set col names
colnames(gps_df)<- c("Lat","Lon","Type","n")
#set Lat, Long, and n as numeric
gps_df$Lat <- as.numeric(gps_df$Lat)
gps_df$Lon<- as.numeric(gps_df$Lon)
gps_df$n <- as.numeric(gps_df$n)
#create an icon to mark the locations
icon.glyphicon <- makeAwesomeIcon(icon = 'flag', markerColor = 'red', iconColor = 'black')
#create map
leaflet()%>%
addTiles()%>%
#add labels
addAwesomeMarkers(lng = gps_df$Lon,lat = gps_df$Lat,
icon = icon.glyphicon,
popup = paste(row.names(gps_df),gps_df$n),
label = row.names(gps_df))
This is a simple map plot. Could be useful for plotting regional customer locations and / or stores.
Thank you for your attention and listening to my presentation