library(ggplot2)
library(DescTools)
library(dplyr)
library(scales)
library(lubridate)
library(RColorBrewer)
library(ggthemes)
library(plyr)
library(plotly)
library(ggrepel)
library(tidyverse)
# Tally citations per violation description, most frequent first.
# data.frame() rewrites the column name to `Violation.Description`.
df_reasons <- df %>%
  dplyr::count(`Violation Description`) %>%
  data.frame()
df_reasons <- df_reasons[order(df_reasons$n, decreasing = TRUE), ]

# The ten most frequently cited descriptions
top_reasons <- df_reasons$Violation.Description[1:10]
#df$`Violation Description` %in% top_reasons
#table(df$`Violation Description` %in% top_reasons)
# Syntactic aliases for the two back-quoted source columns used below
df$ViolationDescription <- df$`Violation Description`
df$FineAmount <- df$`Fine amount`

# Total fine amount for each of the top-ten violation descriptions.
# na.rm = TRUE: a citation with no recorded fine would otherwise turn the
# whole description's total into NA (and, downstream, the axis maximum).
fines_df <- df %>%
  dplyr::filter(ViolationDescription %in% top_reasons) %>%
  dplyr::group_by(ViolationDescription) %>%
  dplyr::summarise(totfines = sum(FineAmount, na.rm = TRUE)) %>%
  data.frame()
#new_df$YEAR = as.character(new_df$YEAR)
# Tick values in whole millions, from 0 up to the largest fines total,
# and the matching "$<k>M" labels
ylab <- seq(from = 0, to = max(fines_df$totfines) / 1e6, by = 1)
my_labels <- paste0("$", ylab, "M")
#length(unique(df$`Agency`))
# Citation count per agency, busiest agency first.
# The sort key is qualified with the table name: in a data.frame/tibble `[`
# call a bare `n` is not resolved as a column, so `order(-n)` either fails or
# silently sorts by an unrelated global `n`.
top3_agency <- dplyr::count(df, Agency)
top3_agency <- top3_agency[order(-top3_agency$n), ]
#top3_agency[top3_agency$Agency %in% c("54", "51", "56", "53", "55"), "n"] / sum(top3_agency$n)
# Agency codes reported individually; every other agency is pooled as "Other"
agency3_codes <- c("54", "51", "56", "53", "55")

# Citations per year and agency bucket, plus each bucket's share of its
# year's total (percent, one decimal). Rows whose date fails to parse are
# dropped.
agency3_df <- df %>%
  dplyr::select(Agency, `Issue Date`) %>%
  dplyr::mutate(
    year = year(ymd(`Issue Date`)),
    # a missing agency code stays NA instead of being pooled into "Other"
    myagency = ifelse(
      is.na(Agency), NA,
      ifelse(Agency %in% agency3_codes, as.character(Agency), "Other")
    )
  ) %>%
  dplyr::filter(!is.na(year)) %>%
  dplyr::group_by(year, myagency) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::group_by(year) %>%
  dplyr::mutate(percent_of_total = round(100 * n / sum(n), 1)) %>%
  dplyr::ungroup() %>%
  data.frame()
#length(unique(df$`Agency`))
# Citation count per agency, busiest agency first.
# The sort key is qualified with the table name: in a data.frame/tibble `[`
# call a bare `n` is not resolved as a column, so `order(-n)` either fails or
# silently sorts by an unrelated global `n`.
top_agency <- dplyr::count(df, Agency)
top_agency <- top_agency[order(-top_agency$n), ]
#top_agency[top_agency$Agency %in% c("54", "51", "56", "53", "55"), "n"] / sum(top_agency$n)
# Agency codes reported individually, in the order used for the factor levels
agency_codes <- c("54", "51", "56", "53", "55")

# Citations per year and agency bucket, plus each bucket's share of its
# year's total (percent, one decimal). Rows whose date fails to parse are
# dropped.
agency_df <- df %>%
  dplyr::select(Agency, `Issue Date`) %>%
  dplyr::mutate(
    year = year(ymd(`Issue Date`)),
    myagency = dplyr::case_when(
      is.na(Agency) ~ NA_character_,  # missing code is kept as NA
      Agency %in% agency_codes ~ as.character(Agency),
      TRUE ~ "Other"
    )
  ) %>%
  dplyr::filter(!is.na(year)) %>%
  dplyr::group_by(year, myagency) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::group_by(year) %>%
  dplyr::mutate(percent_of_total = round(100 * n / sum(n), 1)) %>%
  dplyr::ungroup() %>%
  data.frame()

# Fixed level order: the five named agencies first, "Other" last
agency_df$myagency <- factor(
  agency_df$myagency,
  levels = c(agency_codes, "Other")
)
# Standalone one-column copy of the raw issue times
issue_time_df <- data.frame(df$`Issue time`)

# Split `Issue time` arithmetically: the remainder modulo 100 is stored as
# the minute; the rest, divided by 100, is stored as the hour.
df[["minute"]] <- df$`Issue time` %% 100
df[["hour"]] <- (df$`Issue time` - df[["minute"]]) / 100
# Number of citations per value of `hour`; rows containing NA are removed
# after counting, and `hour` is stored as numeric.
hours_df <- df %>%
  dplyr::group_by(hour) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  data.frame() %>%
  na.omit()
hours_df$hour <- as.numeric(hours_df$hour)
# Rows of hours_df holding the smallest and the largest citation count
hi_lo <- hours_df %>%
  dplyr::filter(n %in% range(n)) %>%
  data.frame()

# One value per hour from the smallest to the largest observed hour
x_axis_labels <- seq(min(hours_df$hour), max(hours_df$hour))
# Raw violation code and description columns as plain vectors
ViolCode <- df$`Violation code`
violDescip <- df$`Violation Description`

# Citations per violation description, most frequent first.
# data.frame() rewrites the column name to `Violation.Description`.
violcount_by_descrip <- df %>%
  dplyr::count(`Violation Description`) %>%
  data.frame()
violcount_by_descrip <- violcount_by_descrip[
  order(violcount_by_descrip$n, decreasing = TRUE),
]
# Raw fine-amount column and the row positions where a fine is recorded.
# (The earlier `fineRows = fineAmount` was a dead store, overwritten on the
# very next line, and has been removed.)
fineAmount <- df$`Fine amount`
fineRows <- which(!is.na(fineAmount))

# Number of citations per distinct fine amount, most frequent first.
# data.frame() rewrites the column name to `Fine.amount`.
finesamount <- data.frame(dplyr::count(df, `Fine amount`))
finesamount <- finesamount[order(finesamount$n, decreasing = TRUE), ]
# Citations per (violation description, year) for the top-ten descriptions.
# data.frame() rewrites the column name to `Violation.Description`.
new_df <- df %>%
  dplyr::filter(`Violation Description` %in% top_reasons) %>%
  dplyr::select(`Issue Date`, `Violation Description`) %>%
  dplyr::mutate(year = year(ymd(`Issue Date`))) %>%
  dplyr::count(`Violation Description`, year) %>%
  data.frame()
# Collapse the yearly counts into one total per violation description
agg_tot <- new_df %>%
  dplyr::group_by(Violation.Description) %>%
  dplyr::summarise(tot = sum(n), .groups = "drop") %>%
  data.frame()
LA is one of the largest cities in the US, with busy streets lined with a wide array of businesses and populous neighborhoods. With many people in a city come many vehicles and drivers on its streets. Law enforcement mandates rules and laws to keep the city running smoothly, which works best when citizens follow them.
This report tells the story of which parking citations in Los Angeles (LA), California are the most common, how much money they bring in, and which parking agencies issue the most citations by count and by fine amount. Additionally, this data visualization report shows which hour of the day has historically seen the most citations.
The data comes from the City of Los Angeles. Unless otherwise specified, the data includes the years 2010-2018. However, none of the plots include data from 2017 as data from 2017 was omitted from the initial data set.
Many citations are written in LA every year, as it is a highly populated city with many tourists. Parking citations are one way the city brings in money, although in an ideal world no citations would need to be written at all. This plot shows which fine amounts are ticketed most often. It is important to note that 8 of the top 10 fine amounts are under 100 dollars, which could indicate that people who knowingly park illegally are willing to lose less than $100 to a parking citation.
# Bar chart: how many citations were issued at each of the ten most common
# fine amounts.

# Ten most frequently issued fine amounts (finesamount is sorted by count)
top_fine_amounts <- head(finesamount, 10)

# Axis upper bound: 10% above the tallest plotted bar, leaving room for the
# count label drawn past the end of each bar. It is derived from the plotted
# data itself; a limit below the tallest bar would make ggplot drop that bar.
max1.1_y <- 1.1 * max(top_fine_amounts$n)

ggplot(top_fine_amounts, aes(x = reorder(Fine.amount, -n), y = n)) +
  geom_col(colour = "darkblue", fill = "red") +
  geom_text(aes(label = scales::comma(n)), hjust = -0.1, size = 3) +
  scale_y_continuous(labels = comma, limits = c(0, max1.1_y)) +
  coord_flip() +
  labs(title = "Number of Violations per Fine Amount (Top 10)",
       x = "Fine Amount", y = "Count of Number of Fines") +
  # the complete theme goes first: added later it would reset the centred title
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))
Another way to look at the top 10 parking citations is by which description is written up the most. The top two violations, No Park/Street Clean and Meter Exp., dominate the list: each has at least double the count of any of the remaining top 10 descriptions. Having Meter Exp. among the top two could indicate that drivers are negligent, or are content to park longer than permitted and willing to pay the fine. The high number of No Park/Street Clean violations could indicate that the city needs to do a better job of informing drivers that they cannot park on certain streets at certain times while those streets are being cleaned. Perhaps the city needs to add signage, change where the signs are placed, or change how they are styled.
# Bar chart: citation counts for the ten most common violation descriptions.

# Ten most frequent descriptions (violcount_by_descrip is sorted by count);
# head() avoids the all-NA filler rows that `[1:10, ]` produces when fewer
# than ten descriptions exist.
top_violations <- head(violcount_by_descrip, 10)

# Axis upper bound: tallest plotted bar rounded up to the next 200,000,
# computed from the plotted data rather than from a separate summary table.
max1.2_y <- round_any(max(top_violations$n), 200000, ceiling)

ggplot(top_violations, aes(x = reorder(Violation.Description, -n), y = n)) +
  geom_col(colour = "black", fill = "yellow") +
  geom_text(aes(label = scales::comma(n)), hjust = -0.1, size = 4) +
  scale_y_continuous(labels = comma, limits = c(0, max1.2_y)) +
  coord_flip() +
  labs(title = "Number of Violations by Description (Top 10)",
       x = "Violation Description", y = "Violation Count") +
  theme(plot.title = element_text(hjust = 0.5))
Knowing that No Park/Street Clean accounts for the most parking citations, it makes sense that it also brings in the most fines. The same goes for Meter Exp., the second most common citation. What is interesting about this display is that citation counts are not fully proportional to total fines for each description. No Park/Street Clean has a significantly higher total fine amount (a little less than double) than Meter Exp., even though their counts are relatively close. This indicates that the fine for Meter Exp. is lower than the fine for No Park/Street Clean. Also interesting is that Display of tabs has the fifth-highest count, but the total fines it brings in are closer to the tail end of the top 10. Similarly, while No Stop/Standing ranks 10th by citation count, its total fines are closer to 7th highest.
Note: Once again, please remember that not all of the years are fully reported in the data set, and that the number of citations issued increased as technology improved, which would explain the disproportion between years.
# Combined chart for the top-ten violation descriptions.
#  - Stacked bars: citation count per year (primary axis).
#  - Line and points: total fines per description, divided by 20 so they can
#    be drawn on the count axis; the secondary axis multiplies by 20 again and
#    is labelled with the pre-computed million-dollar ticks.
ggplot(
  new_df,
  aes(x = reorder(Violation.Description, n, sum), y = n, fill = factor(year))
) +
  geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
  geom_line(
    data = fines_df,
    inherit.aes = FALSE,
    aes(x = ViolationDescription, y = totfines / 20,
        colour = "Total Fines", group = 1)
  ) +
  geom_point(
    data = fines_df,
    inherit.aes = FALSE,
    aes(x = ViolationDescription, y = totfines / 20, group = 1),
    size = 3, shape = 21, fill = "white", color = "black"
  ) +
  scale_fill_brewer(palette = "Spectral",
                    guide = guide_legend(reverse = TRUE)) +
  scale_color_manual(NULL, values = "black") +
  scale_y_continuous(
    labels = comma,
    sec.axis = sec_axis(~ . * 20, name = "Total Fines",
                        labels = my_labels, breaks = ylab * 10^6)
  ) +
  coord_flip() +
  labs(title = "Citation Count and Total Fines by Citation Description",
       x = "", y = "Citation Count", fill = "Year") +
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    # negative spacing pulls the two legends together
    legend.spacing = unit(-1, "lines")
  )
Having seen which citation descriptions bring in the highest total fines, it is also helpful to know which of the agencies that issue the citations bring in the highest total fines over the years.
Note: Once again please remember that not all of the years have been fully reported in the data set and as technology improved the number of citations ticketed increased which would explain why 2017 has been omitted from the data.
# Concentric donut chart: total fine amount per agency for 2015 (outer ring),
# 2016 (middle ring) and 2018 (inner ring).
#
# The chart title and hover text report "Total Fines", so the plotted values
# are summed fine amounts rather than citation counts.

# Agency codes shown individually; all other agencies are pooled as "Other"
fines_agency_codes <- c("54", "51", "56", "53", "55")

# Summed fine amount per year and agency bucket for the three plotted years
agency_fines_df <- df %>%
  dplyr::select(Agency, `Issue Date`, `Fine amount`) %>%
  dplyr::mutate(
    year = year(ymd(`Issue Date`)),
    # a missing agency code stays NA instead of being pooled into "Other"
    myagency = ifelse(
      is.na(Agency), NA,
      ifelse(Agency %in% fines_agency_codes, as.character(Agency), "Other")
    )
  ) %>%
  dplyr::filter(year %in% c(2015, 2016, 2018)) %>%
  dplyr::group_by(year, myagency) %>%
  # na.rm = TRUE: citations without a recorded fine contribute nothing
  dplyr::summarise(total_fines = sum(`Fine amount`, na.rm = TRUE),
                   .groups = "drop") %>%
  data.frame()

# Add one year's ring to plot `p`. `extent` is the c(lower, upper) fraction
# of the plotting area used for both the x and the y domain of the ring.
add_fines_ring <- function(p, yr, extent) {
  add_trace(
    p,
    data = agency_fines_df[agency_fines_df$year == yr, ],
    labels = ~myagency,
    values = ~total_fines,
    type = "pie",
    textposition = "inside",
    hovertemplate = paste0(
      "Year: ", yr,
      "<br>Agency:%{label}<br>Percent:%{percent}",
      "<br>Total Fines: %{value}<extra></extra>"
    ),
    domain = list(x = extent, y = extent)
  )
}

plot_ly(hole = 0.7) %>%
  layout(title = "Total Fines by Agency (2015, 2016, 2018)") %>%
  add_fines_ring(2015, c(0, 1)) %>%
  add_fines_ring(2016, c(0.16, 0.84)) %>%
  add_fines_ring(2018, c(0.27, 0.73))
Knowing which parking agency brings in the highest amount of parking citation fines can be either a good or a bad indicator. It can be a good indicator that an agency is doing a good job of enforcing the laws, meaning it is diligent about writing up violations when people break them. Or it can indicate that the area an agency has jurisdiction over is more heavily populated, so more resources need to be focused on that jurisdiction to reflect the higher population. Or it can be a bad indicator that signage in the high-count jurisdictions is not effective.
Note: Once again please remember that not all of the years have been fully reported in the data set and as technology improved the number of citations ticketed increased which would explain why 2017 has been omitted from the data.
# Three side-by-side pies (2015, 2016, 2018): citation count per agency.

# Add one year's pie to plot `p` in grid column `column` (0-based).
# Rows are selected from agency_df with a mask built from agency_df itself;
# the data and the row mask must come from the same data frame.
add_agency_pie <- function(p, yr, column) {
  add_pie(
    p,
    data = agency_df[agency_df$year == yr, ],
    name = as.character(yr),
    title = as.character(yr),
    domain = list(row = 0, column = column),
    hovertemplate = "Agency:%{label}<br>Citation Count: %{value}<br>Percent:%{percent}<extra></extra>"
  )
}

plot_ly(textposition = "inside", labels = ~myagency, values = ~n) %>%
  add_agency_pie(2015, 0) %>%
  add_agency_pie(2016, 1) %>%
  add_agency_pie(2018, 2) %>%
  layout(title = "Citation Count by Year and by Agency", showlegend = TRUE,
         grid = list(rows = 1, columns = 3))
Knowing which hours have historically seen the most parking violations is important for law enforcement. At peak hours law enforcement should dispatch more officers, since more people have been recorded breaking parking laws at those hours, while fewer officers are needed at hours with lower rates. From the morning commute through early afternoon more parking enforcement officers should be dispatched, while significantly fewer are needed after the evening commute and through the night.
# Line chart of citation count per hour. Every hour gets a hollow red marker;
# the rows in hi_lo are redrawn as solid red markers, and only the hours with
# the minimum or maximum count carry a text label.
ggplot(hours_df, aes(x = hour, y = n)) +
  geom_line(color = "black", size = 1) +
  geom_point(shape = 21, size = 4, color = "red", fill = "white") +
  geom_point(data = hi_lo, shape = 21, size = 4, color = "red", fill = "red") +
  geom_label_repel(
    aes(label = ifelse(n %in% range(n), scales::comma(n), "")),
    box.padding = 1, point.padding = 1, size = 4,
    color = "Grey50", segment.color = "darkblue"
  ) +
  scale_x_continuous(breaks = x_axis_labels, labels = x_axis_labels,
                     minor_breaks = NULL) +
  scale_y_continuous(labels = comma) +
  labs(x = "Hour", y = "Citation Count", title = "Citation Count by Hour") +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5))
What can be concluded from this LA parking citation data set is that the most commonly issued parking citations carry fines of 100 dollars or less, the top 2 citations by description are Meter Exp. and No Park/Street Clean, the parking agencies which bring in the most parking citations by count and dollar value are the agencies between 51-56, and the peak hours for issuing parking citations are between 8am and 9am.