Loading libraries:
suppressMessages(library(data.table)) #faster way to read large data sets
suppressMessages(library(tidyverse)) #load dplyr, tidyr and ggplot
suppressMessages(library(bit64))
suppressMessages(library(lubridate))
suppressMessages(library(ggrepel))
# Print the current working directory; the relative file names used below
# ("parking_tickets.csv", "chi.tickets.2017.rds") are resolved against it.
getwd()
## [1] "/Users/codethedral/Google Drive/ChiHackNights/City_of_Chicago_Parking_Ticket_Data"
Clearing environment
# List the objects currently defined in the environment.
ls()
## character(0)
# Remove every object that ls() reports, i.e. wipe the whole workspace.
# NOTE(review): this would also delete anything defined before the script
# was run -- confirm that a blanket wipe is really wanted here.
rm(list = ls())
This dataset provides details on all parking and vehicle compliance tickets issued in Chicago from Jan. 1, 2007 to May 14, 2018. Glimpse at data:
# Read the full ticket CSV; only the literal string "NA" is treated as
# missing. `data.table = FALSE` makes fread() return a plain data.frame
# directly, which avoids the extra multi-gigabyte copy that wrapping the
# result in as.data.frame() creates.
chi.tickets <- fread("parking_tickets.csv", na.strings = "NA",
                     data.table = FALSE)
# Print one line per column: name, type and the first few values.
glimpse(chi.tickets)
## Observations: 28,272,580
## Variables: 23
## $ ticket_number <S3: integer64> 51551278, 51491256, 50433524, ...
## $ issue_date <chr> "2007-01-01 00:00:00", "2007-01-01 00:00...
## $ violation_location <chr> "6014 W 64TH ST", "530 N MICHIGAN", "400...
## $ license_plate_number <chr> "90ad622c3274c9bdc9d8c812b79a01d0aaf7479...
## $ license_plate_state <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL"...
## $ license_plate_type <chr> "PAS", "PAS", "PAS", "PAS", "PAS", "TMP"...
## $ zipcode <chr> "60638", "606343801", "60148", "60601", ...
## $ violation_code <chr> "0976160F", "0964150B", "0976160F", "096...
## $ violation_description <chr> "EXPIRED PLATES OR TEMPORARY REGISTRATIO...
## $ unit <chr> "8", "18", "16", "152", "2", "23", "10",...
## $ unit_description <chr> "CPD", "CPD", "CPD", "CPD", "CPD", "CPD"...
## $ vehicle_make <chr> "CHEV", "CHRY", "BUIC", "NISS", "INFI", ...
## $ fine_level1_amount <int> 50, 50, 50, 100, 25, 50, 100, 50, 120, 5...
## $ fine_level2_amount <int> 100, 100, 100, 200, 50, 100, 200, 100, 2...
## $ current_amount_due <dbl> 0, 50, 0, 0, 0, 50, 244, 0, 0, 0, 0, 0, ...
## $ total_payments <dbl> 100, 0, 50, 100, 50, 0, 0, 50, 240, 100,...
## $ ticket_queue <chr> "Paid", "Define", "Paid", "Paid", "Paid"...
## $ ticket_queue_date <chr> "2007-05-21 00:00:00", "2007-01-22 00:00...
## $ notice_level <chr> "SEIZ", "", "VIOL", "DETR", "SEIZ", "", ...
## $ hearing_disposition <chr> "", "", "", "Liable", "", "", "", "Liabl...
## $ notice_number <S3: integer64> 5048648030, 0, 5079875240, 502...
## $ officer <chr> "15227", "18320", "3207", "19410", "6639...
## $ address <chr> "6000 w 64th st, chicago, il", "500 n mi...
# Report the in-memory size of the full data set, expressed in gigabytes.
print(object.size(chi.tickets),units = "Gb")
## 6.5 Gb
Split issue_date (e.g. “2015-05-07 13:52:00”) into separate Date and Time columns:
# Split every "date time" string on the space once and reuse the pieces for
# both columns, instead of splitting the whole column twice. vapply() pins
# the result to a character vector; a missing piece comes back as NA.
issue_parts <- strsplit(
  as.character(chi.tickets$issue_date), " ",
  fixed = TRUE
)
chi.tickets$Date <- vapply(issue_parts, `[`, character(1), 1)
chi.tickets$Time <- vapply(issue_parts, `[`, character(1), 2)
# Drop the temporary list so it does not linger in the workspace.
rm(issue_parts)
Convert Date from character to the Date class:
# Replace the Date column with its as.Date() conversion.
chi.tickets[["Date"]] <- as.Date(chi.tickets[["Date"]])
For this study, we’ll choose only data for 2017.
# Keep only the tickets dated from 1 January to 31 December 2017 (inclusive).
chi.tickets.2017 <- chi.tickets %>%
  filter(Date >= as_date("2017-01-01") & Date <= as_date("2017-12-31"))
# Report the in-memory size of the 2017 subset in gigabytes.
print(object.size(chi.tickets.2017), units = "Gb")
## 0.7 Gb
Saving data set as an RDS file:
# Show the working directory, which is where the relative path below lands.
getwd()
## [1] "/Users/codethedral/Google Drive/ChiHackNights/City_of_Chicago_Parking_Ticket_Data"
# Serialise the 2017 subset to an RDS file.
saveRDS(chi.tickets.2017, "chi.tickets.2017.rds")
Clear environment:
# List what is currently held in the workspace.
ls()
## [1] "chi.tickets" "chi.tickets.2017"
# Remove only the two data sets this script created, rather than wiping the
# whole workspace with rm(list = ls()), so unrelated objects are not deleted.
rm(chi.tickets, chi.tickets.2017)
Load RDS file to continue working on analysis:
# Read the 2017 subset back from the RDS file.
chi.tickets.2017 <- readRDS("chi.tickets.2017.rds")
# Report the in-memory size of the restored object in gigabytes.
print(object.size(chi.tickets.2017), units = "Gb")
## 0.7 Gb
Separate data into Year, Month and Day:
# `Date` is already a Date vector, so lubridate's accessors can read it
# directly. The former as.POSIXlt(..., format = "%d/%m/%Y") wrapper was
# redundant, and its format string did not match the ISO dates (the format
# is ignored when the input is already a Date).
chi.tickets.2017$Year <- year(chi.tickets.2017$Date)
chi.tickets.2017$Month <- month(chi.tickets.2017$Date)
chi.tickets.2017$Day <- day(chi.tickets.2017$Date)
# Print one line per column to confirm the three new columns were added.
glimpse(chi.tickets.2017)
## Observations: 2,190,763
## Variables: 28
## $ ticket_number <S3: integer64> 69141201, 67486883, 67498563, ...
## $ issue_date <chr> "2017-01-01 00:00:00", "2017-01-01 00:00...
## $ violation_location <chr> "3959 W SHAKESPEARE", "3464 N CLARK", "4...
## $ license_plate_number <chr> "07bef58da5eabab43fb335ae30ef7fb6ee02230...
## $ license_plate_state <chr> "IL", "IL", "IL", "IL", "IL", "IL", "IL"...
## $ license_plate_type <chr> "PAS", "PAS", "PAS", "PAS", "PAS", "PAS"...
## $ zipcode <chr> "", "60473", "60625", "", "606121727", "...
## $ violation_code <chr> "0964110D", "0964080B", "0964110A", "096...
## $ violation_description <chr> "PARK OR STAND ON SIDEWALK", "NO STANDIN...
## $ unit <chr> "25", "19", "145", "3", "6", "25", "145"...
## $ unit_description <chr> "CPD", "CPD", "CPD-Other", "CPD", "CPD",...
## $ vehicle_make <chr> "PONT", "LEXU", "TOYT", "HOND", "NISS", ...
## $ fine_level1_amount <int> 60, 100, 100, 100, 60, 100, 100, 75, 60,...
## $ fine_level2_amount <int> 120, 200, 200, 200, 120, 200, 200, 150, ...
## $ current_amount_due <dbl> 60.00, 0.00, 0.00, 0.00, 0.00, 100.00, 0...
## $ total_payments <dbl> 0.00, 100.00, 100.00, 100.00, 60.00, 0.0...
## $ ticket_queue <chr> "Define", "Paid", "Paid", "Paid", "Paid"...
## $ ticket_queue_date <chr> "2018-01-09 00:00:00", "2017-02-09 00:00...
## $ notice_level <chr> "", "VIOL", "VIOL", "", "VIOL", "", "SEI...
## $ hearing_disposition <chr> "", "", "", "", "", "", "", "", "", "", ...
## $ notice_number <S3: integer64> 0, 5203356010, 5203615680, 0, ...
## $ officer <chr> "16305", "14163", "04262", "12033", "192...
## $ address <chr> "3900 w shakespeare, chicago, il", "3400...
## $ Date <date> 2017-01-01, 2017-01-01, 2017-01-01, 201...
## $ Time <chr> "00:00:00", "00:00:00", "00:03:00", "00:...
## $ Year <dbl> 2017, 2017, 2017, 2017, 2017, 2017, 2017...
## $ Month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ Day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
Converting vehicle_make to factor:
# Store vehicle_make as a factor instead of its current type.
chi.tickets.2017[["vehicle_make"]] <-
  as.factor(chi.tickets.2017[["vehicle_make"]])
Which vehicle_make gets the most tickets:
# Count tickets per vehicle make and keep the 20 makes with the highest
# counts (ties at the cut-off are kept). summarise() already returns one row
# per make, so the former distinct() step was a no-op and is dropped. Naming
# the ranking column in top_n() removes the reliance on "last column"
# guessing and the "Selecting by total" message that came with it.
group <- chi.tickets.2017 %>%
  group_by(vehicle_make) %>%
  summarise(total = n()) %>%
  top_n(20, total)
# Horizontal bar chart of the summarised makes, ordered by ticket count, with
# every bar labelled by its total.
ggplot(
  data = group,
  mapping = aes(x = reorder(vehicle_make, total), y = total)
) +
  geom_col(fill = "sky blue") +
  geom_label_repel(mapping = aes(label = total), size = 3) +
  coord_flip() +
  ggtitle("Chicago Top 20 Cars getting a ticket in 2017") +
  xlab("Vehicle Description") +
  ylab("Total")