Data Visualizations from IS460W01

The following visualizations are from the Baltimore City Traffic Citations.The data set is downloadable through this link: https://www.dropbox.com/s/a7oanceqcbd1he5/Baltimore_Traffic_Citations.csv?raw=1

Libraries and Data Source

Pressing the ‘Code’ button shows / hides the code used in this document.

#Necessary Libraries
library(data.table)
library(dplyr)
library(DescTools)
library(lubridate)
library(scales)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(plotly)
library(stringr)
library(tidyr)
library(leaflet)

#Read in Data
balt <- fread("Baltimore_Citations.csv",na.strings = c(NA,""))

Data Preview

head(balt)
##    Citation     Tag ExpMM ExpYY State Make                        Address
## 1: 88206835 5CB9762    03    20    MD TOYT 1300 BLK EAST NORTHERN PKWY WB
## 2: 88221248 4DC2945    03    20    MD VOLK 1300 BLK EAST NORTHERN PKWY WB
## 3: 88221388 9CL8104    09    16    MD HYUN    2800 BLK REISTERSTOWN RD WB
## 4: 88236758  S14LDS    04    20    NJ  NIS   1500 BLK E COLD SPRING LN EB
## 5: 88247052 9BN6360    09    20    MD DODG    1600 BLK E BELVEDERE AVE EB
## 6: 88250593 9BF9185    11    20    MD HOND     1300 BLK WOODBOURNE AVE EB
##    ViolCode        Description ViolFine               ViolDate Balance
## 1:       32 Fixed Speed Camera       40 06/17/2019 06:14:00 PM       0
## 2:       32 Fixed Speed Camera       40 06/18/2019 12:12:00 PM       0
## 3:       32 Fixed Speed Camera       40 06/18/2019 12:17:00 PM      40
## 4:       32 Fixed Speed Camera       40 06/19/2019 07:08:00 AM      40
## 5:       32 Fixed Speed Camera       40 06/19/2019 01:12:00 PM       0
## 6:       32 Fixed Speed Camera       40 06/19/2019 03:16:00 PM       0
##    PenaltyDate OpenFine OpenPenalty             ImportDate
## 1:          NA        0           0 07/02/2019 04:03:00 AM
## 2:          NA        0           0 07/02/2019 04:03:00 AM
## 3:          NA       40           0 07/02/2019 04:03:00 AM
## 4:          NA       40           0 07/02/2019 04:03:00 AM
## 5:          NA        0           0 07/03/2019 04:04:00 AM
## 6:          NA        0           0 07/03/2019 04:04:00 AM

Histogram

#Count the number of unique vehicles
carcount <- data.frame(count(balt, Tag))
#Set order to decreasing
carcount <- carcount[order(carcount$n,decreasing = TRUE),]
#Identify No Tag rows (Citations without a corresponding license plate)
top3 <- balt[balt$Tag %in% c(NA, "NO TAGST", "NOTAGS T", "Description")]
df_top3 <- count(top3, Description)
#Select rows of which include the string 'TAG'
NoTagRows <- which(carcount$Tag %like any% c("%TAG%"))
#Identify rows with NA values
NARows <- which(is.na(carcount$Tag))
#dataframe for no tags and NAs
NA_and_NoTagRows <- c(NoTagRows, NARows)
#list exceptions to keep
keep <- c("DATAGUY","SCRTAGT","TAG DAT","TAGDAT", "LITAG8R","TAG ART","TAG944","V1NTAGE")
#select the rows to keep
keeprows <- which(carcount$Tag %in% keep)
#remove NAs and No Tag rows
rows_to_drop <- setdiff(NA_and_NoTagRows, keeprows)
#sum of incomplete rows
BadTotal <- sum(carcount[rows_to_drop, "n"])
#removal of rows
carcount<- carcount[-rows_to_drop,]
#Row bind a "No Tags" col
carcount <- rbind(c("No Tags", BadTotal),carcount)
rownames(carcount) <- c(1:nrow(carcount))
#Create citation year col
balt$year <- year(mdy_hms(balt$ViolDate))
#Plot
ggplot(balt, aes(x=year)) +
  #histogram
  geom_histogram(bins = 8, color="darkblue", fill="lightblue")+
  #labels
  labs(title = "Histogram of Citations by Year",x = "Year",y = "Citation Count")+
  #Y axis labels 
  scale_y_continuous(labels = comma)+
  #Bin size
  stat_bin(binwidth = 1,geom='text',color='black', aes(label=..count..),vjust=-.04)+
  #Center title text
  theme(plot.title = element_text(hjust = 0.5))

Simple histogram of the number citations by year. 2018 is the peak year for citations. A noticeable decline in 2020 due to the pandemic.

Bar graph 1

#Cast to numeric
carcount$n <- as.numeric(carcount$n)
#plot
ggplot(carcount[2:11,],aes(x = reorder(Tag,n), y=n))+
  #bar chart
  geom_bar(colour="black", fill="gray76", stat="identity") +
  #labels
  labs(title = "Number of Citations by Tag (Top 10)", x = "Car License Plate",y = "Citation Count")+
  #Center title text
  theme(plot.title = element_text(hjust = 0.5))

This bar graph shows the top ten offenders in the data set by license plate.

Bar graph 2

#new data frame counting the number of each type of offense
df_reasons <- count(balt,Description)
#set order to decreasing
df_reasons <- df_reasons[order(df_reasons$n, decreasing = TRUE),] 
#top 10 reasons
top_reasons <- df_reasons$Description[1:10]

#Data frame for violations and violation date
new_df <- balt %>%
  #select the row if it is one of the top reasons
  filter(Description %in% top_reasons)%>%
  select(ViolDate,Description)%>%
  #format violation date 
  mutate(year = year(mdy_hms(ViolDate)))%>%
  #group by description and year
  group_by(Description,year)%>%
  summarise(n = length(Description), .groups = 'keep')%>%
  data.frame()

#Data frame for all other offenses
other_df <- balt %>%
  #select the row if it is not one of the top reasons
  filter(!Description %in% top_reasons)%>%
  select(ViolDate)%>%
  #format violation date 
  mutate(year = year(mdy_hms(ViolDate)), Description = "Other")%>%
  #group by description and year
  group_by(Description, year)%>%
  summarise(n = length(Description), .groups = 'keep')%>%
  data.frame()

#row bind the two data frames
new_df <- rbind(new_df,other_df)

#Calculate Aggregate total
agg_tot <- new_df%>%
  select(Description,n)%>%
  group_by(Description)%>%
  summarise(Total = sum(n), .groups = 'keep')%>%
  data.frame()
#Calculate the sum of the fines
fines_df <- balt %>%
  filter(Description %in% top_reasons)%>%
  select(Description,ViolFine)%>%
  group_by(Description)%>%
  summarise(totalFines= sum(ViolFine))%>%
  data.frame()
#year as factor
new_df$year <- as.factor(new_df$year)
#adjust Y axis 
max_y <- plyr::round_any(max(agg_tot$Total),250000,ceiling)
#plor
ggplot(new_df,aes( x = reorder(Description,n,sum), y = n, fill = year))+
  #bar chart
  geom_bar(stat="identity", position = position_stack(reverse = TRUE)) +
  #flip coordinates + labels
  coord_flip()+ labs(title = "Citation Count by Citation Type", x="",y="Citation Count",fill="Year")+
  #Center title text
  theme_light()+
  theme(plot.title = element_text(hjust = .05))+
  #Legend
  scale_fill_brewer(palette = "Paired",guide = guide_legend(reverse = TRUE))+
  #axis labels and breaks
  geom_text(data = agg_tot,aes(x=Description,y=Total,label=scales::comma(Total),fill=NULL),hjust=-0.1,size=4)+
  scale_y_continuous(labels = comma, breaks = seq(0,max_y,250000), limits = c(0,max_y))

Citation count by citation type by year. The fixed speed camera is credited with over 1.5 million citations. Red light violations were the second highest individual category. Since 2017, the Maryland Department of Transportation has almost doubled the number of citations delivered annualy.

Line graph

#Data frame for days of the week
days_df <- balt %>%
  #select violation date
  select(ViolDate)%>%
  #format year and weekdays
  dplyr::mutate(year(mdy_hms(ViolDate)),
         dayoftheweek = weekdays(mdy_hms(ViolDate), abbreviate = TRUE))%>%
  #group by year and day of the week
  dplyr::group_by(year = balt$year, dayoftheweek)%>%
  #count the number of citations
  dplyr::summarise(n = length(ViolDate), .groups = 'keep')%>%
  data.frame()
#year as factor
days_df$year <- as.factor(days_df$year)  
#Set day order to be the order of the days of the week
day_order <- factor(days_df$dayoftheweek, level = c('Mon','Tue','Wed','Thu','Fri','Sat','Sun'))
#put the day order in the data frame
days_df$dayoftheweek<- factor(days_df$dayoftheweek, level = c('Mon','Tue','Wed','Thu','Fri','Sat','Sun'))

#line plot
ggplot(days_df, aes(x=day_order, y = n, group = year))+
  #lines
  geom_line(aes(color=year), size = 3)+
  #labels
  labs(title = "Citations by Day and by Year", x = "Days of the Week", y = "Citation Count")+
  theme_light()+
  #center title text
  theme(plot.title = element_text(hjust = .5))+
  #create points for each day of the week on the line
  geom_point(shape = 21, size = 3, color = "black", fill = "white")+
  #y axis labels
  scale_y_continuous(labels = comma)+
  #legend
  scale_color_brewer(palette = "Paired", name = "Year",guide = guide_legend(reverse = TRUE))

2018 was the highest year for citations given. There is a slight increase in citations on Fridays while the weekends have relatively low citation counts

Bar + line graph

#sum of other violations
other_sum <- sum(balt[!balt$Description %in% top_reasons, "ViolFine"])
#row bind fines df with other to include the 'other' form of violation
fines_df <- rbind(fines_df,c("Other",other_sum))
#total fines as numeric
fines_df$totalFines <- as.numeric(fines_df$totalFines)
#y labels
ylab <- seq(0, max(fines_df$totalFines)/1e6,10)
#Format the label for dollars in terms of millions
my_labs <- paste0("$",ylab, "M")
#x lables
xlab <- seq(0,23,1)
#plot
ggplot(new_df,aes(x = reorder(Description,n,sum), y = n, fill = year))+
  #bar chart + coordinate flip
  geom_bar(stat = "identity",position = position_stack(reverse = TRUE))+ coord_flip()+theme_light()+
  #labels
  labs(title = "Citation Count and Total Fines",x="",y="Citation Count", fill = "Year")+
  #center title text
  theme(plot.title = element_text(hjust = .05))+
  #legend
  scale_fill_brewer(palette = "Spectral",guide = guide_legend(reverse = TRUE))+
  #line graph
  geom_line(inherit.aes = FALSE,data = fines_df,aes(x=Description, y = totalFines/20, colour = "Total Fines",group=1),size=1)+
  #line color
  scale_color_manual(NULL, values = "black") +
  #y axis labels
  scale_y_continuous(labels = comma, 
                     sec.axis = sec_axis(~.*20,name = "Total Fines",labels = my_labs, breaks =ylab*1e6 ))+
  #place a point on the line for the sum of citations for each cartegory
  geom_point(inherit.aes = FALSE, data = fines_df,
             #axis data 
             aes(x = Description, y = totalFines/20, group = 1), size = 3, shape = 21, fill = "white",color = "black")+
  #legend clarity
  theme(legend.background = element_rect(fill="transparent"),
        legend.box.background = element_rect(fill= "transparent", colour = NA),
        legend.spacing = unit(-1,"lines"))

This chart demonstrates the citation count by type and by year while displaying the number of revenue from the fines on the upper x axis. The points on the line represent the total fines value for each type of citation. The fixed speed camera totals to just over $60 million.

Nested Pie Chart

#create a data frame for the states (Maryland, Virginia, or Other)
state_df <- balt%>%
  #select the state and the date
  select(State, ViolDate)%>%
  #format year 
  mutate(year = year(mdy_hms(ViolDate)),
         #Create three categories: MD,VA,and Other
         myState = ifelse(State=="MD", "MD", ifelse(State=="VA","VA","Other")))%>%
  #group by year and state
  group_by(year, myState)%>%
  #count number of instances for each state
  summarise(n=length(myState), .groups = 'keep')%>%
  #group by year
  group_by(year)%>%
  #calculate the percentage of the total for each observation
  mutate(percent_of_total = round(100*n/sum(n),1))%>%
  #ungroup so plot can be made
  ungroup()%>%
  data.frame()

#plotly pie chart
plot_ly(hole=.7)%>%
  #title
  layout(title= "Citations by State of Vehicle Registration (2013-2015)")%>%
  #2015 data
  add_trace(data = state_df[state_df$year==2015,],
            labels = ~myState,
            values = state_df[state_df$year==2015,"n"],
            type = "pie",
            textposition = "inside",
            hovertemplate = "Year: 2015<br>State:%{label}<br>Percent:%{percent}<br>Citation Count:%{value}<extra></extra>")%>%
  #2014 data
  add_trace(data = state_df[state_df$year==2014,],
            labels = ~myState,
            values = state_df[state_df$year==2014,"n"],
            type = "pie",
            textposition = "inside",
            hovertemplate = "Year: 2014<br>State:%{label}<br>Percent:%{percent}<br>Citation Count:%{value}<extra></extra>",
            domain = list(
              x = c(0.16,0.84),
              y = c(0.16,0.84)))%>%
  #2013 data
  add_trace(data = state_df[state_df$year==2013,],
            labels = ~myState,
            values = state_df[state_df$year==2013,"n"],
            type = "pie",
            textposition = "inside",
            hovertemplate = "Year: 2013<br>State:%{label}<br>Percent:%{percent}<br>Citation Count:%{value}<extra></extra>",
            domain = list(
              x = c(0.27,0.73),
              y = c(0.27,0.73)))

Maryland plates are the biggest offenders in Maryland… Surprise! More states could be added for further detail, this is just a simple example :) Hover over the sectors to see detail

Heatmap

#set breaks for the map
breaks <- c(seq(0,max(days_df$n),by=25000))
#plot
ggplot(days_df,aes(x=year,y=dayoftheweek, fill=n))+
  #heatmap
  geom_tile(color = "black")+
  #text lables: comma
  geom_text(aes(label=comma(n)))+
  coord_equal(ratio=1)+
  #labels
  labs(title = "Heatmap: Citations by Day of the Week",
       x = "Year",
       y = "Days of the Week",
       fill = "Citation Count")+
  theme_minimal()+
  #center title text
  theme(plot.title = element_text(hjust = .5))+
  #set y axis as days of the week
  scale_y_discrete(limits = rev(levels(days_df$dayoftheweek)))+
  #set colors and breaks
  scale_fill_continuous(low="white",high="red", breaks = breaks)+
  #legend
  guides(fill=guide_legend(reverse = TRUE, override.aes = list(colour="black")))

Similar to the line graph this heat map shows which days of the week produced the most citations and how the days’ citation counts has changed over the recent years. Starting in 2016 Wednesday became the highest ticketed day for all following years. Midweek rush?

Bonus: Lat Long Plot

#Three locations in Baltimore and what they are
Loyola <- c(39.3463882,-76.6210078,"College",4000)
Hopkins <- c(39.3297084,-76.6219169,"College",18000)
Alonsos <- c(39.3445305,-76.6308260,"Bar",1000)
#data frame for the locations
gps_df <- data.frame(rbind(Loyola,Hopkins,Alonsos))
#set col names
colnames(gps_df)<- c("Lat","Lon","Type","n")
#set Lat, Long, and n as numeric
gps_df$Lat <- as.numeric(gps_df$Lat)
gps_df$Lon<- as.numeric(gps_df$Lon)
gps_df$n <- as.numeric(gps_df$n)
#create an icon to mark the locations
icon.glyphicon <- makeAwesomeIcon(icon = 'flag', markerColor = 'red', iconColor = 'black')

#create map
leaflet()%>%
  addTiles()%>%
  #add labels
  addAwesomeMarkers(lng = gps_df$Lon,lat = gps_df$Lat,
                    icon = icon.glyphicon,
             popup = paste(row.names(gps_df),gps_df$n),
             label = row.names(gps_df))

This is a simple map plot. Could be useful for plotting regional customer locations and / or stores.

Closing

Thank you for your attention and listening to my presentation