For this assignment, we are going to investigate data on parking violations in NYC.
NYC Open Data has data on all parking violations issued in NYC since 2014. The updated dataset provided for 2021 currently includes about 10 million observations. To make the assignment manageable, I have reduced it to a subset of tickets issued in from Jan 2020 to Jan 2021 and by Manhattan precincts only, yielding about 2.2M tickets.
Two support files are also included in the parking
sub folder:
A second data source is the shape files of police precincts in NYC.
Before focusing on the spatial part of the data, let’s explore the basic patterns in the data.
df <- read.csv('./data/parking/parkingNYC_Jan2020-Jan2021.csv')
library(tibble)
## Warning: package 'tibble' was built under R version 4.0.2
glimpse(df)
## Rows: 2,244,505
## Columns: 47
## $ Summons.Number <dbl> 1474094223, 1474094600, 1474097807,…
## $ Plate.ID <chr> "KDT3875", "GTW5034", "JEF4856", "H…
## $ Registration.State <chr> "NY", "NY", "NY", "NY", "PA", "PA",…
## $ Plate.Type <chr> "PAS", "PAS", "PAS", "PAS", "PAS", …
## $ Issue.Date <chr> "06/24/2020", "06/26/2020", "06/26/…
## $ Violation.Code <int> 98, 20, 20, 14, 20, 40, 46, 40, 46,…
## $ Vehicle.Body.Type <chr> "SDN", "SDN", "SDN", "SDN", "SUBN",…
## $ Vehicle.Make <chr> "KIA", "VOLKS", "ME/BE", "TOYOT", "…
## $ Issuing.Agency <chr> "P", "P", "P", "P", "P", "P", "P", …
## $ Street.Code1 <int> 0, 18390, 35710, 15710, 35930, 3593…
## $ Street.Code2 <int> 40404, 10110, 15710, 35710, 11710, …
## $ Street.Code3 <int> 40404, 10010, 11710, 35770, 13610, …
## $ Vehicle.Expiration.Date <int> 20200805, 20201006, 20201219, 0, 0,…
## $ Violation.Location <int> 19, 19, 24, 24, 24, 24, 25, 25, 25,…
## $ Violation.Precinct <int> 19, 19, 24, 24, 24, 24, 25, 25, 25,…
## $ Issuer.Precinct <int> 19, 19, 24, 24, 24, 24, 25, 25, 25,…
## $ Issuer.Code <int> 966865, 939882, 963854, 963854, 968…
## $ Issuer.Command <chr> "0019", "0019", "0024", "0024", "00…
## $ Issuer.Squad <chr> "0000", "0000", "0000", "0000", "00…
## $ Violation.Time <chr> "0805A", "0321P", "0636A", "0628A",…
## $ Time.First.Observed <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Violation.County <chr> "NY", "NY", "NY", "NY", NA, NA, "NY…
## $ Violation.In.Front.Of.Or.Opposite <chr> "O", "F", "O", "F", "F", "F", "F", …
## $ House.Number <chr> "240", "343", "127", "784", "215-7"…
## $ Street.Name <chr> "E 75", "EAST 70 STREET", "WEST 97T…
## $ Intersecting.Street <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Date.First.Observed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Law.Section <int> 408, 408, 408, 408, 408, 408, 408, …
## $ Sub.Division <chr> "C4", "J6", "D", "C", "C", "D", "D"…
## $ Violation.Legal.Code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Days.Parking.In.Effect <chr> "BBBBBBB", "BBBBBBB", "BBBBYBB", "B…
## $ From.Hours.In.Effect <chr> "ALL", "ALL", "0600A", "ALL", "ALL"…
## $ To.Hours.In.Effect <chr> "ALL", "ALL", "1500P", "ALL", "ALL"…
## $ Vehicle.Color <chr> "BLACK", "BK", "WH", "RD", "WH", "W…
## $ Unregistered.Vehicle. <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Vehicle.Year <int> 2019, 2014, 2018, 0, 0, 0, 2014, 20…
## $ Meter.Number <chr> "-", "-", "-", "-", "-", "-", "-", …
## $ Feet.From.Curb <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,…
## $ Violation.Post.Code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Violation.Description <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ No.Standing.or.Stopping.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Hydrant.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Double.Parking.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ issue_date <chr> "2020-06-24", "2020-06-26", "2020-0…
## $ year <int> 2020, 2020, 2020, 2020, 2020, 2020,…
## $ month <int> 6, 6, 6, 6, 6, 6, 7, 6, 6, 5, 5, 5,…
## $ day <int> 24, 26, 26, 19, 22, 22, 1, 27, 27, …
length(unique(df$Violation.Code))
## [1] 87
Add the violation code descriptions and fine amounts to the data file. Provide a visual overview of the top 10 most common types of violations (feel free to group them into categories if reasonable). Compare how this ranking differs if we focus on the total amount of revenue generated.
library(readxl)
## Warning: package 'readxl' was built under R version 4.0.2
violation_labels <- read_excel('./data/parking/ParkingViolationCodes_January2020.xlsx')
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
## when loading 'dplyr'
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## Warning: package 'ggplot2' was built under R version 4.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
violation_labels <- violation_labels %>%
rename(
'Violation.Code' = 'VIOLATION CODE',
'Violation.Description' = 'VIOLATION DESCRIPTION',
'Manhattan.Below' = 'Manhattan 96th St. & below\r\n(Fine Amount $)',
'All.Other.Areas' = 'All Other Areas\r\n(Fine Amount $)'
)
glimpse(violation_labels)
## Rows: 97
## Columns: 4
## $ Violation.Code <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ Violation.Description <chr> "FAILURE TO DISPLAY BUS PERMIT", "NO OPERATOR N…
## $ Manhattan.Below <dbl> 515, 515, 515, 115, 50, 265, 50, 115, 115, 115,…
## $ All.Other.Areas <dbl> 515, 515, 515, 115, 50, 265, 50, 115, 115, 115,…
df_violations = merge(x = df, y = violation_labels, by = 'Violation.Code')
glimpse(df_violations)
## Rows: 2,244,045
## Columns: 50
## $ Violation.Code <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Summons.Number <dbl> 8773725183, 8696246354, 8773720630,…
## $ Plate.ID <chr> "OYB5181", "OYB1400", "OYB5237", "O…
## $ Registration.State <chr> "NJ", "NJ", "NJ", "NJ", "NJ", "NJ",…
## $ Plate.Type <chr> "PAS", "PAS", "PAS", "PAS", "PAS", …
## $ Issue.Date <chr> "11/24/2020", "09/25/2020", "10/21/…
## $ Vehicle.Body.Type <chr> "BUS", "BUS", "BUS", "BUS", "BUS", …
## $ Vehicle.Make <chr> "FORD", "FORD", "FORD", "FORD", "FO…
## $ Issuing.Agency <chr> "T", "T", "T", "T", "T", "T", "T", …
## $ Street.Code1 <int> 10810, 34610, 10810, 10810, 10810, …
## $ Street.Code2 <int> 0, 10810, 0, 0, 0, 10810, 0, 0, 345…
## $ Street.Code3 <int> 0, 10910, 0, 0, 0, 10910, 0, 0, 345…
## $ Vehicle.Expiration.Date <int> 88880088, 88880088, 88880088, 88880…
## $ Violation.Location <int> 14, 10, 14, 14, 14, 14, 14, 14, 14,…
## $ Violation.Precinct <int> 14, 10, 14, 14, 14, 14, 14, 14, 14,…
## $ Issuer.Precinct <int> 14, 10, 14, 14, 14, 14, 14, 14, 14,…
## $ Issuer.Code <int> 365793, 351969, 365793, 365793, 361…
## $ Issuer.Command <chr> "T106", "MTTF", "T106", "T106", "T1…
## $ Issuer.Squad <chr> "E", "A", "E", "E", "E", "E", "E", …
## $ Violation.Time <chr> "0245P", "0937A", "0844A", "0314P",…
## $ Time.First.Observed <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Violation.County <chr> "NY", "NY", "NY", "NY", "NY", "NY",…
## $ Violation.In.Front.Of.Or.Opposite <chr> "I", "F", "I", "I", "I", "F", "I", …
## $ House.Number <chr> "E", "323", "E", "E", "E", "319", "…
## $ Street.Name <chr> "8th Ave", "W 42nd St", "8th Ave", …
## $ Intersecting.Street <chr> "15ft N/of W 41st St", NA, "30ft S/…
## $ Date.First.Observed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Law.Section <int> 408, 408, 408, 408, 408, 408, 408, …
## $ Sub.Division <chr> "C1", "K7", "D7", "D7", "C1", "M5",…
## $ Violation.Legal.Code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Days.Parking.In.Effect <chr> "YYYYYYY", "YYYYYYY", "YYYYYYY", "Y…
## $ From.Hours.In.Effect <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ To.Hours.In.Effect <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Vehicle.Color <chr> "WHITE", "WHITE", "WHITE", "WHITE",…
## $ Unregistered.Vehicle. <int> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Vehicle.Year <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Meter.Number <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Feet.From.Curb <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Violation.Post.Code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Violation.Description.x <chr> "01-No Intercity Pmt Displ", NA, NA…
## $ No.Standing.or.Stopping.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Hydrant.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Double.Parking.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ issue_date <chr> "2020-11-24", "2020-09-25", "2020-1…
## $ year <int> 2020, 2020, 2020, 2020, 2020, 2020,…
## $ month <int> 11, 9, 10, 11, 8, 11, 10, 10, 9, 10…
## $ day <int> 24, 25, 21, 24, 25, 8, 26, 21, 3, 3…
## $ Violation.Description.y <chr> "FAILURE TO DISPLAY BUS PERMIT", "F…
## $ Manhattan.Below <dbl> 515, 515, 515, 515, 515, 515, 515, …
## $ All.Other.Areas <dbl> 515, 515, 515, 515, 515, 515, 515, …
length(unique(df_violations$Violation.Description.y))
## [1] 85
length(unique(df_violations$Violation.Code))
## [1] 85
#Both are 85. We are good to go.
colnames(df_violations)
## [1] "Violation.Code" "Summons.Number"
## [3] "Plate.ID" "Registration.State"
## [5] "Plate.Type" "Issue.Date"
## [7] "Vehicle.Body.Type" "Vehicle.Make"
## [9] "Issuing.Agency" "Street.Code1"
## [11] "Street.Code2" "Street.Code3"
## [13] "Vehicle.Expiration.Date" "Violation.Location"
## [15] "Violation.Precinct" "Issuer.Precinct"
## [17] "Issuer.Code" "Issuer.Command"
## [19] "Issuer.Squad" "Violation.Time"
## [21] "Time.First.Observed" "Violation.County"
## [23] "Violation.In.Front.Of.Or.Opposite" "House.Number"
## [25] "Street.Name" "Intersecting.Street"
## [27] "Date.First.Observed" "Law.Section"
## [29] "Sub.Division" "Violation.Legal.Code"
## [31] "Days.Parking.In.Effect" "From.Hours.In.Effect"
## [33] "To.Hours.In.Effect" "Vehicle.Color"
## [35] "Unregistered.Vehicle." "Vehicle.Year"
## [37] "Meter.Number" "Feet.From.Curb"
## [39] "Violation.Post.Code" "Violation.Description.x"
## [41] "No.Standing.or.Stopping.Violation" "Hydrant.Violation"
## [43] "Double.Parking.Violation" "issue_date"
## [45] "year" "month"
## [47] "day" "Violation.Description.y"
## [49] "Manhattan.Below" "All.Other.Areas"
violations_count <- as.data.frame( table(df_violations$Violation.Description.y) )
violations_top10 <- violations_count %>%
arrange (desc(Freq)) %>%
head(11)
violations_top10
## Var1 Freq
## 1 NO STANDING-DAY/TIME LIMITS 296962
## 2 NO PARKING-DAY/TIME LIMITS 271285
## 3 FAIL TO DSPLY MUNI METER RECPT 201108
## 4 NO STANDING-COMM METER ZONE 180219
## 5 NO PARKING-STREET CLEANING 161910
## 6 DOUBLE PARKING 139444
## 7 FAIL TO DISP. MUNI METER RECPT 131976
## 8 FIRE HYDRANT 122162
## 9 NO STANDING-EXC. TRUCK LOADING 100093
## 10 EXPIRED MUNI METER 70944
## 11 DOUBLE PARKING-MIDTOWN COMML 66101
violations_top10[violations_top10$Var1 == "FAIL TO DSPLY MUNI METER RECPT", ] <- violations_top10[violations_top10$Var1 == "FAIL TO DSPLY MUNI METER RECPT", ] + violations_top10[violations_top10$Var1 == "FAIL TO DISP. MUNI METER RECPT", ]
## Warning in Ops.factor(left, right): '+' not meaningful for factors
violations_top10 <- violations_top10[rownames(violations_top10) != 7, ]
violations_top10[3, 1] = "FAIL TO DSPLY MUNI METER RECPT"
violations_top10$Var1 <- factor(violations_top10$Var1, levels = violations_top10$Var1[order(violations_top10$Freq, decreasing = F)])
levels(violations_top10$Var1)
## [1] "DOUBLE PARKING-MIDTOWN COMML" "EXPIRED MUNI METER"
## [3] "NO STANDING-EXC. TRUCK LOADING" "FIRE HYDRANT"
## [5] "DOUBLE PARKING" "NO PARKING-STREET CLEANING"
## [7] "NO STANDING-COMM METER ZONE" "NO PARKING-DAY/TIME LIMITS"
## [9] "NO STANDING-DAY/TIME LIMITS" "FAIL TO DSPLY MUNI METER RECPT"
library(ggplot2)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.2
count_compare <- ggplot(violations_top10, aes(x= Var1, y = Freq)) +
geom_bar(stat = 'identity') +
coord_flip() +
scale_y_continuous(name="Frequency", labels = scales::comma) +
scale_x_discrete(name = "Type of Traffic Violation") +
labs(title = "Top 10 Traffic Violations (Frequency)") +
theme_tufte()
count_compare
violations_fine_top10 <- df_violations %>%
group_by(Violation.Description.y) %>%
summarise(total_fine = sum(Manhattan.Below)) %>%
arrange (desc(total_fine)) %>%
head(11)
## `summarise()` ungrouping output (override with `.groups` argument)
violations_fine_top10[violations_fine_top10$Violation.Description.y == "FAIL TO DSPLY MUNI METER RECPT", "total_fine"] <- violations_fine_top10[violations_fine_top10$Violation.Description.y == "FAIL TO DSPLY MUNI METER RECPT", "total_fine"] + violations_fine_top10[violations_fine_top10$Violation.Description.y == "FAIL TO DISP. MUNI METER RECPT", "total_fine"]
violations_fine_top10 <- violations_fine_top10[rownames(violations_fine_top10) != 9, ]
violations_fine_top10
## # A tibble: 10 x 2
## Violation.Description.y total_fine
## <chr> <dbl>
## 1 NO STANDING-DAY/TIME LIMITS 34150630
## 2 NO STANDING-COMM METER ZONE 20725185
## 3 NO PARKING-DAY/TIME LIMITS 17633525
## 4 DOUBLE PARKING 16036060
## 5 FIRE HYDRANT 14048630
## 6 FAIL TO DSPLY MUNI METER RECPT 21650460
## 7 NO PARKING-STREET CLEANING 10524150
## 8 NO STANDING-EXC. TRUCK LOADING 9508835
## 9 DOUBLE PARKING-MIDTOWN COMML 7601615
## 10 NO STANDING-BUS STOP 7463270
violations_fine_top10$Violation.Description.y <- factor(violations_fine_top10$Violation.Description.y, levels = violations_fine_top10$Violation.Description.y[order(violations_fine_top10$total_fine, decreasing = F)])
levels(violations_fine_top10$Violation.Description.y)
## [1] "NO STANDING-BUS STOP" "DOUBLE PARKING-MIDTOWN COMML"
## [3] "NO STANDING-EXC. TRUCK LOADING" "NO PARKING-STREET CLEANING"
## [5] "FIRE HYDRANT" "DOUBLE PARKING"
## [7] "NO PARKING-DAY/TIME LIMITS" "NO STANDING-COMM METER ZONE"
## [9] "FAIL TO DSPLY MUNI METER RECPT" "NO STANDING-DAY/TIME LIMITS"
library(ggplot2)
fine_compare <- ggplot(violations_fine_top10, aes(x= Violation.Description.y, y = total_fine)) +
geom_col() +
coord_flip() +
scale_y_continuous(name="Total Fine Amount", labels = scales::comma) +
scale_x_discrete(name = "Type of Traffic Violation") +
labs(title = "Top 10 Traffic Violations (Total Fine Amount)") +
theme_tufte()
fine_compare
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.0.2
ggarrange(count_compare, fine_compare, nrow = 2)
There are some overlaps between the frequency of top traffic violations, and their associated fine amounts.
Compare the average amount of fine by vehicle color, vehicle year, and vehicle plate type [Hint: it is sufficient to restrict your attention to commercial (COM
) and passenger (PAS
) vehicles]? Briefly describe your findings.
violations_fine_color <- df_violations %>%
group_by(Vehicle.Color) %>%
summarise(mean_fine = mean(Manhattan.Below)) %>%
arrange (desc(mean_fine))
## `summarise()` ungrouping output (override with `.groups` argument)
max(violations_fine_color$mean_fine)
## [1] 180
min(violations_fine_color$mean_fine)
## [1] 32.5
unique(violations_fine_color$mean_fine)
## [1] 180.00000 165.00000 130.00000 115.00000 114.09091 113.66667 112.16450
## [8] 112.08333 111.02273 110.22876 109.16667 108.33333 108.28571 107.71505
## [15] 106.66667 106.61290 105.90909 105.71429 105.21739 105.00000 104.78261
## [22] 104.76190 104.00000 103.50000 103.14713 102.89704 102.53846 102.50000
## [29] 102.38785 101.78212 101.66948 101.53846 100.99068 100.40052 100.31250
## [36] 100.00000 98.68421 98.33333 97.90155 97.85714 97.83333 97.55222
## [43] 97.48092 97.10145 96.12299 96.08974 95.53571 95.00000 94.92554
## [50] 94.44999 94.11501 94.11040 94.00000 93.52941 93.28571 92.97209
## [57] 92.96512 92.70717 92.50000 92.05682 91.66667 91.60920 91.25000
## [64] 91.14480 90.97594 90.96491 90.68260 90.48970 90.45764 90.28698
## [71] 90.00000 89.39566 88.92308 88.83606 88.47694 88.18328 87.75272
## [78] 87.50000 87.43902 86.66667 86.59620 86.58625 86.04244 85.98214
## [85] 85.89286 85.78890 85.42443 85.33422 85.00000 84.75856 84.52457
## [92] 84.36508 84.23469 84.08111 84.04412 83.89937 83.88541 83.33333
## [99] 81.93182 81.73469 81.66667 80.66667 80.00000 77.50000 77.17391
## [106] 69.48980 65.00000 32.50000
#rough segmentation - 32.5 to 90, 90 to 100, 100 to 180#
violations_fine_color$groups = ifelse(violations_fine_color$mean_fine <=90, 'Low Fine Amt', 0)
violations_fine_color$groups = ifelse(violations_fine_color$mean_fine >90 & violations_fine_color$mean_fine<=100, 'Medium Fine Amt', violations_fine_color$groups)
violations_fine_color$groups = ifelse(violations_fine_color$mean_fine >100, 'High Fine Amt', violations_fine_color$groups)
violations_fine_color$groups <- as.factor(violations_fine_color$groups)
violations_fine_color %>% #simple counts of each category
group_by(groups) %>%
count()
## # A tibble: 3 x 2
## # Groups: groups [3]
## groups n
## <fct> <int>
## 1 High Fine Amt 106
## 2 Low Fine Amt 85
## 3 Medium Fine Amt 57
violations_fine_color_high <- violations_fine_color %>%
filter(groups == "High Fine Amt")
as.data.frame( table(unique(violations_fine_color_high$Vehicle.Color)) )
## Var1 Freq
## 1 B L 1
## 2 BAIGE 1
## 3 BALCK 1
## 4 BBRW 1
## 5 BERG 1
## 6 BG 1
## 7 BGE 1
## 8 BIERG 1
## 9 BKJ 1
## 10 BLAK 1
## 11 BLE 1
## 12 BLK 1
## 13 BLU 1
## 14 BLY 1
## 15 BN 1
## 16 BRG 1
## 17 BRK 1
## 18 BRN 1
## 19 BRNW 1
## 20 BRO 1
## 21 BRON 1
## 22 BRONZ 1
## 23 BROW 1
## 24 BROWN 1
## 25 BRW 1
## 26 BRWN 1
## 27 BUG 1
## 28 BUL 1
## 29 BURG 1
## 30 BWN 1
## 31 BY 1
## 32 CREAM 1
## 33 DARKR 1
## 34 DKGRA 1
## 35 DKGRY 1
## 36 DKGY 1
## 37 DRKGR 1
## 38 G 1
## 39 GAY 1
## 40 GEY 1
## 41 GLD 1
## 42 GLO 1
## 43 GR/ 1
## 44 GRAAY 1
## 45 GRANG 1
## 46 GRAY 1
## 47 GREE 1
## 48 GREG 1
## 49 GREN 1
## 50 GREU 1
## 51 GRN 1
## 52 GRRY 1
## 53 GRT 1
## 54 GRU 1
## 55 GRY 1
## 56 GV 1
## 57 GW 1
## 58 HRY 1
## 59 INFIN 1
## 60 MAR 1
## 61 MARO 1
## 62 MC 1
## 63 METAL 1
## 64 MUNI 1
## 65 NYPD 1
## 66 OPTER 1
## 67 ORA 1
## 68 ORN 1
## 69 PKGRY 1
## 70 PRPL 1
## 71 PURP 1
## 72 QBLK 1
## 73 R 1
## 74 RDW 1
## 75 SILER 1
## 76 SL 1
## 77 SLIVE 1
## 78 SLV 1
## 79 SLVR 1
## 80 SR 1
## 81 SUBUR 1
## 82 SV 1
## 83 SY 1
## 84 TEAL 1
## 85 TNG 1
## 86 TX 1
## 87 W 1
## 88 WH/BL 1
## 89 WHE 1
## 90 WHI 1
## 91 WHIE 1
## 92 WHIT 1
## 93 WHITW 1
## 94 WHT 1
## 95 WHTE 1
## 96 WHTIE 1
## 97 WN 1
## 98 WT 1
## 99 WTH 1
## 100 WWT 1
## 101 YE 1
## 102 YEL 1
## 103 YLL 1
## 104 YLLW 1
## 105 YN 1
violations_fine_color_medium <- violations_fine_color %>%
filter(groups == "Medium Fine Amt")
as.data.frame( table(unique(violations_fine_color_medium$Vehicle.Color)) )
## Var1 Freq
## 1 . Y 1
## 2 B 1
## 3 BEING 1
## 4 BERGU 1
## 5 BKG 1
## 6 BLAC 1
## 7 BLACK 1
## 8 BLCK 1
## 9 BLUE 1
## 10 BRCH 1
## 11 BUR 1
## 12 BURGE 1
## 13 BURGU 1
## 14 DKGN 1
## 15 GD 1
## 16 GN 1
## 17 GOLD 1
## 18 GRA 1
## 19 GRE 1
## 20 GRE Y 1
## 21 GREEN 1
## 22 GREY 1
## 23 GRFAY 1
## 24 GY/ 1
## 25 LTB 1
## 26 LTBLU 1
## 27 LTT 1
## 28 MAROO 1
## 29 MN 1
## 30 MR 1
## 31 NVYBL 1
## 32 ORANG 1
## 33 OTHER 1
## 34 PEARL 1
## 35 PK 1
## 36 PURPL 1
## 37 RD/ 1
## 38 RDG 1
## 39 RE 1
## 40 RED 1
## 41 S 1
## 42 SIL 1
## 43 SILV 1
## 44 SILVE 1
## 45 SILVR 1
## 46 SIV 1
## 47 SIVR 1
## 48 TAN 1
## 49 WH/ 1
## 50 WHB 1
## 51 WHG 1
## 52 WHITE 1
## 53 WI 1
## 54 YELL 1
## 55 YELLO 1
## 56 YLW 1
## 57 YW 1
violations_fine_color_low <- violations_fine_color %>%
filter(groups == "Low Fine Amt")
as.data.frame( table(unique(violations_fine_color_low$Vehicle.Color)) )
## Var1 Freq
## 1 .- 1
## 2 B ACK 1
## 3 B LAC 1
## 4 BCK 1
## 5 BCL 1
## 6 BEGE 1
## 7 BEIG 1
## 8 BEIGE 1
## 9 BI 1
## 10 BIEGE 1
## 11 BK 1
## 12 BK/ 1
## 13 BKT 1
## 14 BKW 1
## 15 BL 1
## 16 BL/ 1
## 17 BLA 1
## 18 BLB 1
## 19 BLC 1
## 20 BLG 1
## 21 BLO 1
## 22 BLW 1
## 23 BR 1
## 24 BRIGE 1
## 25 BRWON 1
## 26 BU 1
## 27 BURUN 1
## 28 BW 1
## 29 DARKG 1
## 30 DEEP 1
## 31 DK/ 1
## 32 DKB 1
## 33 DKG 1
## 34 DKM 1
## 35 DKP 1
## 36 DKR 1
## 37 FAN 1
## 38 FR 1
## 39 GARY 1
## 40 GERY 1
## 41 GL 1
## 42 GR 1
## 43 GRG 1
## 44 GRW 1
## 45 GY 1
## 46 GYB 1
## 47 GYG 1
## 48 GYT 1
## 49 HITE 1
## 50 LAVEN 1
## 51 LIGHT 1
## 52 LT/ 1
## 53 LTG 1
## 54 LTP 1
## 55 M 1
## 56 MRG 1
## 57 MULTI 1
## 58 NAVY 1
## 59 NO 1
## 60 NOC 1
## 61 NVBLU 1
## 62 OLIVE 1
## 63 OR 1
## 64 ORG 1
## 65 ORGE 1
## 66 PINK 1
## 67 PR 1
## 68 RD 1
## 69 RDT 1
## 70 REG 1
## 71 RUST 1
## 72 SIVER 1
## 73 SLATE 1
## 74 SLR 1
## 75 SLVER 1
## 76 TN 1
## 77 TN/ 1
## 78 U 1
## 79 WH 1
## 80 WHO 1
## 81 WHTG 1
## 82 WTE 1
## 83 Y 1
## 84 YL 1
## 85 YU 1
No firm conclusions can be drawn about variation in average fine amounts, according to Vehicle.Color
, because the data is very unclean. However, it is noteworthy that for High Fine Amt
, there are many more color types. A preliminary hypothesis that cars with novel colors might be more likely to have committed violations with higher fine amounts. A visual inspection of the tables also show the prevalence of more colors that might not be represented for vehicles that have caused Medium Fine Amt
and Low Fine Amt
violations.
violations_fine_year <- df_violations %>%
group_by(Vehicle.Year) %>%
summarise(mean_fine = mean(Manhattan.Below)) %>%
arrange (desc(mean_fine))
## `summarise()` ungrouping output (override with `.groups` argument)
min(violations_fine_year$Vehicle.Year) #nonsense
## [1] 0
max(violations_fine_year$Vehicle.Year) #nonsense
## [1] 2069
violations_fine_year <- df_violations %>%
filter(Vehicle.Year != 0 & Vehicle.Year <=2021) %>%
group_by(Vehicle.Year) %>%
summarise(mean_fine = mean(Manhattan.Below)) %>%
arrange (desc(mean_fine))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(violations_fine_year, aes(x=Vehicle.Year, y=mean_fine)) +
geom_point() +
stat_smooth(method= "lm", color = "black", se=F, size=0.3) +
labs(
x = 'Year Vehicle was Registered',
y = 'Average Fine Amount',
title="Average Fine by Vehicle's Registration Year") +
theme_tufte()
## `geom_smooth()` using formula 'y ~ x'
The earlier a car is registered, the more likely that the mean fine amount incurred is a smaller value. The later a car is registered, the more likely that the mean fine amount incurred is a higher value.
violations_fine_plate <- df_violations %>%
group_by(Plate.Type) %>%
filter (Plate.Type == 'COM' | Plate.Type == 'PAS') %>%
summarise(mean_fine = mean(Manhattan.Below)) %>%
arrange (desc(mean_fine))
## `summarise()` ungrouping output (override with `.groups` argument)
violations_fine_plate
## # A tibble: 2 x 2
## Plate.Type mean_fine
## <chr> <dbl>
## 1 COM 91.6
## 2 PAS 89.7
violations_fine_plate_foranova <- df_violations %>%
group_by(Plate.Type) %>%
filter (Plate.Type == 'COM' | Plate.Type == 'PAS')
violations_fine_plate_foranova$Plate.Type <- as.factor(violations_fine_plate_foranova$Plate.Type)
summary(aov(Manhattan.Below ~ Plate.Type, data = violations_fine_plate_foranova)) #statistically significant difference in fine amounts between Plate.Type#
## Df Sum Sq Mean Sq F value Pr(>F)
## Plate.Type 1 1.707e+06 1706749 2795 <2e-16 ***
## Residuals 2129696 1.300e+09 611
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Commercial vehicles (M=91.6) seem to incur, on average, higher fines than passenger vehicles (M=89.7). An exploratory one-way Analysis of Variance (ANOVA) was run to compare the mean fine values of these two groups, and indeed there was a significantly significant difference in fine values (p<.01) between the two groups.
Let’s see if we can observe the effect of COVID restrictions on parking violations. Present a visualization that shows how parking violations changed after the New York statewide stay-at-home order on March 14, 2020. Make sure the visualization clearly highlights the main pattern (the COVID effect).
library(anytime)
## Warning: package 'anytime' was built under R version 4.0.2
df_violations$issue_date <- anytime::anydate(df_violations$issue_date)
glimpse(df_violations$issue_date)
## Date[1:2244045], format: "2020-11-24" "2020-09-25" "2020-10-21" "2020-11-24" "2020-08-25" ...
df_violations$Month_Yr <- format(as.Date(df_violations$issue_date), "%Y-%m")
glimpse(df_violations$Month_Yr)
## chr [1:2244045] "2020-11" "2020-09" "2020-10" "2020-11" "2020-08" ...
df_covid <- df_violations %>%
filter (issue_date >= '2020-01-01') %>%
count(Month_Yr) %>%
filter(Month_Yr <= '2021-01')
df_covid
## Month_Yr n
## 1 2020-01 56
## 2 2020-02 19
## 3 2020-03 177
## 4 2020-04 79
## 5 2020-05 143
## 6 2020-06 15053
## 7 2020-07 178563
## 8 2020-08 352658
## 9 2020-09 397106
## 10 2020-10 395206
## 11 2020-11 339235
## 12 2020-12 285292
## 13 2021-01 280408
ggplot(df_covid, aes(x = Month_Yr, y = n)) +
geom_col() +
geom_text(
aes(label = n, y = n),
position = position_dodge(0.2),
vjust = -0.5,
size=3
) +
scale_x_discrete(name="Year-Month") +
scale_y_continuous(name="Number of Traffic Violations", labels = scales::comma) +
labs(title="New York Traffic Violations in 2020", caption = "Note on COVID effects: Within the United States and New York, COVID-19 became a serious issue from 2020-03 onwards.") +
theme_tufte()
#https://www.nytimes.com/2021/01/05/nyregion/nyc-residential-parking.html #Increase in Car Ownership, post-COVID, as a potential reason for these effects#
Read in the shape files for the police precincts and remove all precincts outside of Manhattan.
library(rgdal)
## Warning: package 'rgdal' was built under R version 4.0.2
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.0.2
## rgdal: version: 1.5-23, (SVN revision 1121)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.1.4, released 2020/10/20
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: TRUE
## Loaded PROJ runtime: Rel. 6.3.1, February 10th, 2020, [PJ_VERSION: 631]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/rgdal/proj
## Linking to sp version:1.4-5
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
police_prec <- readOGR('/Users/kagenlim/Documents/Data Viz/07_parking-graded/data/police_precincts/nypp.shp')
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/kagenlim/Documents/Data Viz/07_parking-graded/data/police_precincts/nypp.shp", layer: "nypp"
## with 77 features
## It has 3 fields
manhattan_before <- subset(police_prec, Precinct<=34) #only Manhattan
#https://www1.nyc.gov/site/nypd/bureaus/patrol/precincts-landing.page
manhattan_before <- spTransform(manhattan_before, "+init=ESRI:102718")
manhattan <- fortify(manhattan_before)
## Regions defined for each Polygons
from = unique(manhattan$id)
to = unique(manhattan_before$Precinct)
map = setNames(to, from)
manhattan$id = map[manhattan$id]
unique(manhattan$id) #all good#
## [1] 1 5 6 7 9 10 13 14 17 18 19 20 22 23 24 25 26 28 30 32 33 34
manhattan <- manhattan %>%
rename(
Violation.Precinct = id,
)
manhattan$Violation.Precinct <- as.factor(manhattan$Violation.Precinct)
library(ggplot2)
library(ggthemes)
manhattan_poly <- ggplot() +
geom_polygon(aes(y=lat, x=long, group=group),
color="dark red", data=manhattan, alpha=.2) +
theme_map()
manhattan_poly
Provide three maps that show choropleth maps of:
violations_prec <- as.data.frame( table(df_violations$Violation.Precinct) )
violations_prec$Var1 <- as.integer(violations_prec$Var1)
manhattan_violations_prec <- subset(violations_prec,as.integer(Var1) <= 34)
manhattan_violations_prec <- manhattan_violations_prec %>%
rename(
Violation.Precinct = Var1,
)
manhattan_violations_prec$Violation.Precinct <- as.factor(manhattan_violations_prec$Violation.Precinct)
manhattan_violations_prec
## Violation.Precinct Freq
## 1 1 233
## 2 2 163469
## 3 3 7
## 4 4 4
## 5 5 8
## 6 6 92813
## 7 7 125203
## 8 8 61854
## 9 9 2
## 10 10 96271
## 11 11 124158
## 12 12 2
## 13 13 188542
## 14 14 197335
## 15 15 1
## 16 16 127133
## 17 17 207388
## 18 18 271678
## 19 19 101313
## 20 20 2
## 21 21 157
## 22 22 61823
## 23 23 92080
## 24 24 43054
## 25 25 28600
## 26 26 3
## 27 27 68685
## 28 28 24971
## 29 29 45624
## 30 30 51263
## 31 31 61945
## 32 32 3
## 33 33 2
## 34 34 2
df_violations_location = left_join(x = manhattan, y = manhattan_violations_prec, by = 'Violation.Precinct')
unique(df_violations_location$Freq)
## [1] 233 8 92813 125203 2 96271 188542 197335 207388 271678
## [11] 101313 61823 92080 43054 28600 3 24971 51263
manhattan_poly_n <- ggplot() +
geom_polygon(aes(fill = Freq, y=lat, x=long, group=group),
color = 'white', data=df_violations_location) +
scale_fill_gradient(name="Frequency of \nTraffic Violations", labels = scales::comma, low = "purple2", high = "blue") +
theme_map() +
theme(legend.position = 'right')
manhattan_poly_n
df_violations_group_fine <- df_violations %>%
group_by(Violation.Precinct) %>%
summarize(total_fines_byprec = sum(Manhattan.Below))
## `summarise()` ungrouping output (override with `.groups` argument)
manhattan_violations_group_fine <- subset(df_violations_group_fine,as.integer(Violation.Precinct) <= 34)
manhattan_violations_group_fine$Violation.Precinct <- as.factor(manhattan_violations_group_fine$Violation.Precinct)
manhattan_violations_group_fine
## # A tibble: 31 x 2
## Violation.Precinct total_fines_byprec
## <fct> <dbl>
## 1 0 24770
## 2 1 14698510
## 3 2 785
## 4 3 410
## 5 4 770
## 6 5 7870865
## 7 6 10193765
## 8 7 5123620
## 9 8 190
## 10 9 8284975
## # … with 21 more rows
#same as above
manhattan$Violation.Precinct <- as.factor(manhattan$Violation.Precinct)
df_violations_fine_location = left_join(x = manhattan, y = manhattan_violations_group_fine, by = 'Violation.Precinct')
unique(df_violations_fine_location$total_fines_byprec)
## [1] 14698510 7870865 10193765 5123620 8284975 11215995 16983205 19071685
## [9] 12553685 20118160 24444130 8764365 15235 5683850 8079715 3754230
## [17] 2494015 6042175 2194665 4202170 4583540 5737010
manhattan_poly_fine <- ggplot() +
geom_polygon(aes(fill = total_fines_byprec, y=lat, x=long, group=group),
color = 'white', data=df_violations_fine_location) +
scale_fill_gradient(name="Total Fine Amounts", labels = scales::comma, low = "purple2", high = "blue") +
theme_map() +
theme(legend.position = 'right')
manhattan_poly_fine
df_violations_group_fine_mean <- df_violations %>%
group_by(Violation.Precinct) %>%
summarize(mean_fines_byprec = mean(Manhattan.Below))
## `summarise()` ungrouping output (override with `.groups` argument)
manhattan_violations_group_fine_mean <- subset(df_violations_group_fine_mean,as.integer(Violation.Precinct) <= 34)
manhattan_violations_group_fine_mean$Violation.Precinct <- as.factor(manhattan_violations_group_fine_mean$Violation.Precinct)
manhattan_violations_group_fine_mean
## # A tibble: 31 x 2
## Violation.Precinct mean_fines_byprec
## <fct> <dbl>
## 1 0 106.
## 2 1 89.9
## 3 2 112.
## 4 3 102.
## 5 4 96.2
## 6 5 84.8
## 7 6 81.4
## 8 7 82.8
## 9 8 95
## 10 9 86.1
## # … with 21 more rows
#same as above
manhattan$Violation.Precinct <- as.factor(manhattan$Violation.Precinct)
df_violations_fine_location_mean = left_join(x = manhattan, y = manhattan_violations_group_fine_mean, by = 'Violation.Precinct')
unique(df_violations_fine_location_mean$mean_fines_byprec)
## [1] 89.91619 84.80348 81.41790 82.83409 86.05889 90.33647 90.07651 96.64624
## [9] 98.74450 97.00735 89.97464 86.50780 97.03822 91.93747 87.74669 87.19817
## [17] 87.20332 87.96935 87.88855 92.10437 89.41225 92.61458
manhattan_poly_avg_fine <- ggplot() +
geom_polygon(aes(fill = mean_fines_byprec, y=lat, x=long, group=group),
color = 'white', data=df_violations_fine_location_mean) +
scale_fill_gradient(name="Average Fine Amounts", labels = scales::comma, low = "purple2", high = "blue") +
theme_map() +
theme(legend.position = 'right')
manhattan_poly_avg_fine
Briefly describe what you learn from these maps in comparison.
ggarrange(manhattan_poly_n,
manhattan_poly_fine,
manhattan_poly_avg_fine, ncol=2,nrow=2)
The three maps show that the bulk of traffic violations do happen in the midtown or downtown areas. However, the areas with the largest fines, associated with traffic violations, are not necessarily downtown. The total fine amounts map shows that the Upper East Side area has very high total fine amounts. The average fine maps shows that the region slightly below the Supper East Side, Midtown East, has high average fine amounts. The total and average fines map also show that the more residential areas (i.e., away from downtown, and more toward the upper end of the island) have higher fine amounts that the midtown or downtown areas.
Group the almost 100 types of ticket violations into a smaller set of 4-6 subgroups (where other
should be the remainder of violations not included in other groups you defined). [Hint: No need to spend more than 5 minutes thinking about what the right grouping is.]. Provide choropleth maps for each of these subgroups to show where different types of violations are more or less common.
df_violations_manhattan <- subset(df_violations,as.integer(Violation.Precinct) <= 34)
df_violations_parking <- subset(df_violations_manhattan, subset = Violation.Code %in% c(4, 6, 20, 21, 23, 24, 27, 32, 33, 34, 35, 37, 38, 39, 40, 42, 43, 44, 46, 47, 59, 60, 77, 78, 86, 87))
df_violations_standing <- subset(df_violations_manhattan, subset = Violation.Code %in% c(8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 22, 25, 26, 28, 30, 31, 63, 64, 79, 81, 89))
df_violations_inmotion <- subset(df_violations_manhattan, subset = Violation.Code %in% c(5, 7, 9, 12, 36, 45, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 61, 96, 98))
df_violations_permits_signs_reg <- subset(df_violations_manhattan, subset = Violation.Code %in% c(1, 2, 29, 65, 70, 71, 72, 73, 74, 75, 76, 83))
df_violations_other <- subset(df_violations_manhattan, subset = Violation.Code %in% c(3, 41, 54, 62, 66, 67, 68, 69, 80, 82, 84, 85, 88, 90, 91, 92, 93, 97, 99))
check_rows_manhattan <- nrow(df_violations_manhattan)
check_rows_manhattan
## [1] 2235621
check_rows_grouped <- nrow(df_violations_parking) + nrow(df_violations_standing) + nrow(df_violations_inmotion) + nrow(df_violations_permits_signs_reg) +
nrow(df_violations_other)
check_rows_grouped
## [1] 2235621
check_rows_manhattan == check_rows_grouped
## [1] TRUE
#successfully grouped all observations#
parking_related <- as.data.frame( table(df_violations_parking$Violation.Precinct) )
parking_related_prec <- parking_related %>%
rename(
Violation.Precinct = Var1
)
manhattan_parking = left_join(x = manhattan, y = parking_related_prec, by = 'Violation.Precinct')
manhattan_parking_map <- ggplot() +
geom_polygon(aes(fill = Freq, y=lat, x=long, group=group),
color = 'white', data=manhattan_parking) +
scale_fill_gradient(name="Number of \nParking-Related \nTraffic Offences", labels = scales::comma, low = "purple2", high = "blue") +
theme_map() +
theme(legend.position = 'right')
manhattan_parking_map
standing_related <- as.data.frame( table(df_violations_standing$Violation.Precinct) )
standing_related_prec <- standing_related %>%
rename(
Violation.Precinct = Var1
)
manhattan_standing = left_join(x = manhattan, y = standing_related_prec, by = 'Violation.Precinct')
manhattan_standing_map <- ggplot() +
geom_polygon(aes(fill = Freq, y=lat, x=long, group=group),
color = 'white', data=manhattan_standing) +
scale_fill_gradient(name="Number of \nStanding-Related \nTraffic Offences", labels = scales::comma, low = "purple2", high = "blue") +
theme_map() +
theme(legend.position = 'right')
manhattan_standing_map
inmotion_related <- as.data.frame( table(df_violations_inmotion$Violation.Precinct) )
inmotion_related_prec <- inmotion_related %>%
rename(
Violation.Precinct = Var1
)
manhattan_inmotion= left_join(x = manhattan, y = inmotion_related_prec, by = 'Violation.Precinct')
manhattan_inmotion_map <- ggplot() +
geom_polygon(aes(fill = Freq, y=lat, x=long, group=group),
color = 'white', data=manhattan_inmotion) +
scale_fill_gradient(name="Number of \nIn-Motion-Related \nTraffic Offences", labels = scales::comma, low = "purple2", high = "blue") +
theme_map() +
theme(legend.position = 'right')
manhattan_inmotion_map
other_related <- as.data.frame( table(df_violations_other$Violation.Precinct) )
other_prec <- other_related %>%
rename(
Violation.Precinct = Var1
)
manhattan_other= left_join(x = manhattan, y = other_prec, by = 'Violation.Precinct')
manhattan_other_map <- ggplot() +
geom_polygon(aes(fill = Freq, y=lat, x=long, group=group),
color = 'white', data=manhattan_other) +
scale_fill_gradient(name="Number of \nOther/Misc \nTraffic Offences", labels = scales::comma, low = "purple2", high = "blue") +
theme_map() +
theme(legend.position = 'right')
manhattan_other_map
ggarrange(manhattan_parking_map,
manhattan_standing_map,
manhattan_inmotion_map,
manhattan_other_map, ncol=2,nrow=2)
Somewhat unsurprisingly, the number of in-motion related offences are highest in the areas at the tips of Mantattan, Washington Heights and the financial district, where one might expect the highest amount of traffic to and fro the island. The number of parking-related offences seems concentrated in the Upper East side. Standing-Related Offences and miscellaneous seem concentrated around the midtown area.
Precinct 19 identifies the Upper East Side. The data currently does not provide latitude and longitude of the violation locations (and I am not sure what these street_code
variables are for).
Restrict your data to parking violations related to fire hydrants (Violation Code = 40
). Using the variables Street Name
and House Number
as well as the knowledge that these addresses are in the Upper East Side of Manhattan, geocode at least 500 addresses. Include a data table of these addresses and the latitude and longitude of these addresses in the output.
df_fire_hydrants = subset(df_violations, Violation.Code==40)
df_fire_hydrants_uppereast = subset(df_fire_hydrants, Violation.Precinct==19)
df_fire_hydrants_uppereast$address = paste(df_fire_hydrants_uppereast$House.Number, df_fire_hydrants_uppereast$Street.Name, "New York City Manhattan Upper East Side", sep = " ")
df_fire_hydrants_uppereast$address[1:5]
## [1] "N E 88th St New York City Manhattan Upper East Side"
## [2] "137 E 76th St New York City Manhattan Upper East Side"
## [3] "888 Lexington Ave New York City Manhattan Upper East Side"
## [4] "1648 2nd Ave New York City Manhattan Upper East Side"
## [5] "155 E 77th St New York City Manhattan Upper East Side"
#looks good
nrow(df_fire_hydrants_uppereast)
## [1] 17999
#that's a lot, we will just look at the first 600#
library(ggmap)
## Warning: package 'ggmap' was built under R version 4.0.2
for (val in 1:600){
result = geocode(df_fire_hydrants_uppereast$address[val], output = "latlon" , source = "google")
df_fire_hydrants_uppereast$lon[val] <- as.numeric(result[1])
df_fire_hydrants_uppereast$lat[val] <- as.numeric(result[2])
}
df_fire_hydrants_uppereast_geocoded = head(df_fire_hydrants_uppereast, 600)
glimpse(df_fire_hydrants_uppereast_geocoded)
## Rows: 600
## Columns: 54
## $ Violation.Code <int> 40, 40, 40, 40, 40, 40, 40, 40, 40,…
## $ Summons.Number <dbl> 8838014991, 8853963256, 8738049417,…
## $ Plate.ID <chr> "33554MN", "DME7952", "E16LGH", "P8…
## $ Registration.State <chr> "NY", "NY", "NJ", "NJ", "FL", "NJ",…
## $ Plate.Type <chr> "COM", "PAS", "PAS", "PAS", "PAS", …
## $ Issue.Date <chr> "10/07/2020", "01/19/2021", "08/03/…
## $ Vehicle.Body.Type <chr> "2DSD", "SUBN", "4DSD", "PICK", "SU…
## $ Vehicle.Make <chr> "SMART", "BMW", "HYUND", "DODGE", "…
## $ Issuing.Agency <chr> "T", "T", "T", "T", "T", "T", "T", …
## $ Street.Code1 <int> 18750, 18510, 24890, 10110, 18530, …
## $ Street.Code2 <int> 0, 27790, 18290, 18690, 24890, 1011…
## $ Street.Code3 <int> 0, 24890, 18310, 18710, 10210, 1001…
## $ Vehicle.Expiration.Date <int> 20211031, 20220213, 88880088, 88880…
## $ Violation.Location <int> 19, 19, 19, 19, 19, 19, 19, 19, 19,…
## $ Violation.Precinct <int> 19, 19, 19, 19, 19, 19, 19, 19, 19,…
## $ Issuer.Precinct <int> 19, 19, 19, 19, 19, 19, 19, 19, 19,…
## $ Issuer.Code <int> 354164, 371204, 353431, 355113, 361…
## $ Issuer.Command <chr> "T103", "T103", "T103", "T103", "T1…
## $ Issuer.Squad <chr> "H", "N", "CC", "AA", "O", "BB", "P…
## $ Violation.Time <chr> "1011A", "0304P", "0825A", "0830A",…
## $ Time.First.Observed <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Violation.County <chr> "NY", "NY", "NY", "NY", "NY", "NY",…
## $ Violation.In.Front.Of.Or.Opposite <chr> "I", "F", "F", "F", "O", "F", "I", …
## $ House.Number <chr> "N", "137", "888", "1648", "155", "…
## $ Street.Name <chr> "E 88th St", "E 76th St", "Lexingto…
## $ Intersecting.Street <chr> "15ft W/of York Ave", NA, NA, NA, N…
## $ Date.First.Observed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Law.Section <int> 408, 408, 408, 408, 408, 408, 408, …
## $ Sub.Division <chr> "E2", "E2", "D", "C", "C", "E2", "e…
## $ Violation.Legal.Code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Days.Parking.In.Effect <chr> "YYYYYYY", "YYYYYYY", "YYYYYYY", "Y…
## $ From.Hours.In.Effect <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ To.Hours.In.Effect <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Vehicle.Color <chr> "BK", "BL", "BLACK", "OTHER", "WHIT…
## $ Unregistered.Vehicle. <int> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Vehicle.Year <int> 2016, 2021, 0, 0, 0, 0, 2018, 0, 20…
## $ Meter.Number <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Feet.From.Curb <int> 4, 0, 0, 0, 0, 6, 7, 2, 1, 0, 0, 0,…
## $ Violation.Post.Code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Violation.Description.x <chr> NA, "40-Fire Hydrant", NA, NA, NA, …
## $ No.Standing.or.Stopping.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Hydrant.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Double.Parking.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ issue_date <date> 2020-10-07, 2021-01-19, 2020-08-03…
## $ year <int> 2020, 2021, 2020, 2020, 2020, 2020,…
## $ month <int> 10, 1, 8, 11, 9, 8, 11, 10, 10, 8, …
## $ day <int> 7, 19, 3, 2, 4, 25, 6, 22, 20, 19, …
## $ Violation.Description.y <chr> "FIRE HYDRANT", "FIRE HYDRANT", "FI…
## $ Manhattan.Below <dbl> 115, 115, 115, 115, 115, 115, 115, …
## $ All.Other.Areas <dbl> 115, 115, 115, 115, 115, 115, 115, …
## $ Month_Yr <chr> "2020-10", "2021-01", "2020-08", "2…
## $ address <chr> "N E 88th St New York City Manhatta…
## $ lon <dbl> -73.95141, -73.96051, -73.96531, -7…
## $ lat <dbl> 40.77945, 40.77315, 40.76658, 40.77…
library(DT)
## Warning: package 'DT' was built under R version 4.0.2
df_fire_hydrants_uppereast_address <- df_fire_hydrants_uppereast_geocoded %>%
select(address, lon, lat)
pretty_headers <- colnames(df_fire_hydrants_uppereast_address) %>%
str_to_title()
interactive_dt <- df_fire_hydrants_uppereast_address %>%
datatable(
rownames = FALSE,
colnames = pretty_headers,
filter = list(position = "top"),
options = list(language = list(sSearch = "Filter:"))
)
interactive_dt
Provide an interactive map of the violations you geocoded using leaflet
. Provide at least three pieces of information on the parking ticket in a popup.
library(leaflet)
m <- leaflet(df_fire_hydrants_uppereast_geocoded) %>%
addTiles() %>% # Add OpenStreetMap map tiles
addCircles(lng = ~lon, lat = ~lat)
m
Note: The data here seems a bit unclean, with some observations categorized as falling within the Upper East Side, when they do not.
table(df_fire_hydrants_uppereast_geocoded$Plate.Type)
##
## 999 AGC APP CMB COM ITP MED MOT OMS OMT PAS RGL SRF
## 4 1 1 5 115 1 2 3 5 3 452 1 7
#there are many different kinds of vehicles, with small numbers, I will try to summarize them#
df_fire_hydrants_uppereast_geocoded$Plate.Type <- ifelse(df_fire_hydrants_uppereast_geocoded$Plate.Type == 'COM', 'Commerical Vehicle', df_fire_hydrants_uppereast_geocoded$Plate.Type)
df_fire_hydrants_uppereast_geocoded$Plate.Type <- ifelse(df_fire_hydrants_uppereast_geocoded$Plate.Type== 'PAS', 'Passenger Vehicle', df_fire_hydrants_uppereast_geocoded$Plate.Type)
for (val in c('999', 'AGC', 'APP', 'CMB', 'ITP', 'MED', 'MOT', 'OMS', 'OMT', 'RGL', 'SRF')){
df_fire_hydrants_uppereast_geocoded$Plate.Type <- ifelse(df_fire_hydrants_uppereast_geocoded$Plate.Type == val, 'Non Passenger or Commerical Vehicle', df_fire_hydrants_uppereast_geocoded$Plate.Type)
}
table(df_fire_hydrants_uppereast_geocoded$Plate.Type)
##
## Commerical Vehicle Non Passenger or Commerical Vehicle
## 115 33
## Passenger Vehicle
## 452
content <- paste("Vehicle Type:",df_fire_hydrants_uppereast_geocoded$Plate.Type,"<br/>",
"When:",df_fire_hydrants_uppereast_geocoded$issue_date,"<br/>",
"Where:",df_fire_hydrants_uppereast_geocoded$Street.Name,"<br/>")
m %>%
addCircles(popup = content)
## Assuming "lon" and "lat" are longitude and latitude, respectively
Using the vehicle Plate ID
, identify repeat offenders (in the full dataset). Create another variable called luxury_car
in which you identify luxury car brands using the Vehicle Make
variable.
Start with the previous map. Distinguish the points by whether the car is a repeat offender and/or luxury car. Add a legend informing the user about the color scheme. Also make sure that the added information about the car type and repeat offender status is now contained in the popup information. Show this map.
glimpse(df_fire_hydrants_uppereast)
## Rows: 17,999
## Columns: 54
## $ Violation.Code <int> 40, 40, 40, 40, 40, 40, 40, 40, 40,…
## $ Summons.Number <dbl> 8838014991, 8853963256, 8738049417,…
## $ Plate.ID <chr> "33554MN", "DME7952", "E16LGH", "P8…
## $ Registration.State <chr> "NY", "NY", "NJ", "NJ", "FL", "NJ",…
## $ Plate.Type <chr> "COM", "PAS", "PAS", "PAS", "PAS", …
## $ Issue.Date <chr> "10/07/2020", "01/19/2021", "08/03/…
## $ Vehicle.Body.Type <chr> "2DSD", "SUBN", "4DSD", "PICK", "SU…
## $ Vehicle.Make <chr> "SMART", "BMW", "HYUND", "DODGE", "…
## $ Issuing.Agency <chr> "T", "T", "T", "T", "T", "T", "T", …
## $ Street.Code1 <int> 18750, 18510, 24890, 10110, 18530, …
## $ Street.Code2 <int> 0, 27790, 18290, 18690, 24890, 1011…
## $ Street.Code3 <int> 0, 24890, 18310, 18710, 10210, 1001…
## $ Vehicle.Expiration.Date <int> 20211031, 20220213, 88880088, 88880…
## $ Violation.Location <int> 19, 19, 19, 19, 19, 19, 19, 19, 19,…
## $ Violation.Precinct <int> 19, 19, 19, 19, 19, 19, 19, 19, 19,…
## $ Issuer.Precinct <int> 19, 19, 19, 19, 19, 19, 19, 19, 19,…
## $ Issuer.Code <int> 354164, 371204, 353431, 355113, 361…
## $ Issuer.Command <chr> "T103", "T103", "T103", "T103", "T1…
## $ Issuer.Squad <chr> "H", "N", "CC", "AA", "O", "BB", "P…
## $ Violation.Time <chr> "1011A", "0304P", "0825A", "0830A",…
## $ Time.First.Observed <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Violation.County <chr> "NY", "NY", "NY", "NY", "NY", "NY",…
## $ Violation.In.Front.Of.Or.Opposite <chr> "I", "F", "F", "F", "O", "F", "I", …
## $ House.Number <chr> "N", "137", "888", "1648", "155", "…
## $ Street.Name <chr> "E 88th St", "E 76th St", "Lexingto…
## $ Intersecting.Street <chr> "15ft W/of York Ave", NA, NA, NA, N…
## $ Date.First.Observed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Law.Section <int> 408, 408, 408, 408, 408, 408, 408, …
## $ Sub.Division <chr> "E2", "E2", "D", "C", "C", "E2", "e…
## $ Violation.Legal.Code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Days.Parking.In.Effect <chr> "YYYYYYY", "YYYYYYY", "YYYYYYY", "Y…
## $ From.Hours.In.Effect <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ To.Hours.In.Effect <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Vehicle.Color <chr> "BK", "BL", "BLACK", "OTHER", "WHIT…
## $ Unregistered.Vehicle. <int> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Vehicle.Year <int> 2016, 2021, 0, 0, 0, 0, 2018, 0, 20…
## $ Meter.Number <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Feet.From.Curb <int> 4, 0, 0, 0, 0, 6, 7, 2, 1, 0, 0, 0,…
## $ Violation.Post.Code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Violation.Description.x <chr> NA, "40-Fire Hydrant", NA, NA, NA, …
## $ No.Standing.or.Stopping.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Hydrant.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Double.Parking.Violation <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ issue_date <date> 2020-10-07, 2021-01-19, 2020-08-03…
## $ year <int> 2020, 2021, 2020, 2020, 2020, 2020,…
## $ month <int> 10, 1, 8, 11, 9, 8, 11, 10, 10, 8, …
## $ day <int> 7, 19, 3, 2, 4, 25, 6, 22, 20, 19, …
## $ Violation.Description.y <chr> "FIRE HYDRANT", "FIRE HYDRANT", "FI…
## $ Manhattan.Below <dbl> 115, 115, 115, 115, 115, 115, 115, …
## $ All.Other.Areas <dbl> 115, 115, 115, 115, 115, 115, 115, …
## $ Month_Yr <chr> "2020-10", "2021-01", "2020-08", "2…
## $ address <chr> "N E 88th St New York City Manhatta…
## $ lon <dbl> -73.95141, -73.96051, -73.96531, -7…
## $ lat <dbl> 40.77945, 40.77315, 40.76658, 40.77…
duplicated_offences_uppereast <- df_fire_hydrants_uppereast %>%
group_by(Plate.ID) %>%
filter(n()>1)
duplicated_offences_uppereast #for whole dataset#
## # A tibble: 5,826 x 54
## # Groups: Plate.ID [2,194]
## Violation.Code Summons.Number Plate.ID Registration.St… Plate.Type Issue.Date
## <int> <dbl> <chr> <chr> <chr> <chr>
## 1 40 8853963256 DME7952 NY PAS 01/19/2021
## 2 40 8820927962 P81BJL NJ PAS 11/02/2020
## 3 40 8717499100 GPT6930 NY PAS 11/06/2020
## 4 40 8796814160 63805NA NY COM 10/20/2020
## 5 40 8796816624 HNY6738 NY PAS 11/03/2020
## 6 40 8844318350 Y13SDB FL PAS 11/25/2020
## 7 40 8736575483 FHD2188 NY PAS 08/10/2020
## 8 40 8820504601 XFUT29 NJ PAS 07/01/2020
## 9 40 8809715810 58930MM NY COM 06/29/2020
## 10 40 8809742692 EWU2149 NY PAS 10/27/2020
## # … with 5,816 more rows, and 48 more variables: Vehicle.Body.Type <chr>,
## # Vehicle.Make <chr>, Issuing.Agency <chr>, Street.Code1 <int>,
## # Street.Code2 <int>, Street.Code3 <int>, Vehicle.Expiration.Date <int>,
## # Violation.Location <int>, Violation.Precinct <int>, Issuer.Precinct <int>,
## # Issuer.Code <int>, Issuer.Command <chr>, Issuer.Squad <chr>,
## # Violation.Time <chr>, Time.First.Observed <chr>, Violation.County <chr>,
## # Violation.In.Front.Of.Or.Opposite <chr>, House.Number <chr>,
## # Street.Name <chr>, Intersecting.Street <chr>, Date.First.Observed <int>,
## # Law.Section <int>, Sub.Division <chr>, Violation.Legal.Code <lgl>,
## # Days.Parking.In.Effect <chr>, From.Hours.In.Effect <chr>,
## # To.Hours.In.Effect <chr>, Vehicle.Color <chr>, Unregistered.Vehicle. <int>,
## # Vehicle.Year <int>, Meter.Number <chr>, Feet.From.Curb <int>,
## # Violation.Post.Code <lgl>, Violation.Description.x <chr>,
## # No.Standing.or.Stopping.Violation <lgl>, Hydrant.Violation <lgl>,
## # Double.Parking.Violation <lgl>, issue_date <date>, year <int>, month <int>,
## # day <int>, Violation.Description.y <chr>, Manhattan.Below <dbl>,
## # All.Other.Areas <dbl>, Month_Yr <chr>, address <chr>, lon <dbl>, lat <dbl>
print(table(df_fire_hydrants_uppereast$Vehicle.Make))
##
## ACURA ALFAR AUDI BENTL BERTO BMW BSA BUICK CADIL CHECK CHEVR CHRYS CITRO
## 274 6 451 9 2 730 1 51 114 2 1102 168 1
## DIAMO DODGE DUCAT FERRA FIAT FISKE FONTA FORD FRUEH GEO GMC HARLE HIN
## 1 697 1 4 51 1 1 2318 377 1 400 3 272
## HONDA HUMBE HUMME HYUN HYUND INFIN INTER ISUZU JAGUA JEEP JENSE KAWAS KENWO
## 1551 1 12 1 467 185 231 426 28 884 6 4 43
## KIA LAMBO LEXUS LINCO MACK MASE MAZDA ME/BE MERCU MINI MITSU NISSA NIU
## 263 11 355 70 6 18 296 979 25 171 180 1152 18
## NS/OT OSHKO PETER PONTI PORSC ROLLS ROVER SAAB SATUR SCION SMART SPRI SUBAR
## 143 1 7 15 106 6 313 17 14 9 65 9 464
## SUZUK TESLA TOYOT TRIUM UD UTILI VANHO VESPA VOLKS VOLVO WORKH YAMAH
## 11 63 1564 1 13 10 1 2 469 209 27 2
#This is a bit arbitrary, but I will attempt to isolate some luxury cars
luxury_cars <- c("BMW", "ALFAR", "BENTL", "CADIL", "CHRYS", "DUCAT", "FERRA", "INFIN", "JAGUA", "LAMBO", "LEXUS", "LINCO", "PORSC", "ROLLS", "TESLA", "VOLVO")
#new variable#
df_fire_hydrants_uppereast$luxury_car = ifelse(df_fire_hydrants_uppereast$Vehicle.Make %in% luxury_cars, 1, 0) #for whole dataset#
df_fire_hydrants_uppereast$luxury_car[0:10] #seems good#
## [1] 0 1 0 0 1 0 0 0 0 0
df_fire_hydrants_uppereast_geocoded$duplicated = ifelse(df_fire_hydrants_uppereast_geocoded$Summons.Number %in% duplicated_offences_uppereast$Summons.Number, 'Yes', 'No')
df_fire_hydrants_uppereast_geocoded$luxury = ifelse(df_fire_hydrants_uppereast_geocoded$Vehicle.Make %in% luxury_cars, 'Yes', 'No')
content2 <- paste("Vehicle Type:",df_fire_hydrants_uppereast_geocoded$Plate.Type,"<br/>",
"When:",df_fire_hydrants_uppereast_geocoded$issue_date,"<br/>",
"Where:",df_fire_hydrants_uppereast_geocoded$Street.Name,"<br/>",
"Is this a repeat offence?", df_fire_hydrants_uppereast_geocoded$duplicated, "<br/>",
"Is this a Luxury Car?",
df_fire_hydrants_uppereast_geocoded$luxury, "<br/>")
#recodes for new variable
df_fire_hydrants_uppereast_geocoded$trouble = ifelse(df_fire_hydrants_uppereast_geocoded$duplicated == 'Yes' & df_fire_hydrants_uppereast_geocoded$luxury == 'Yes', 'Both Repeat and Luxury Car', 0)
df_fire_hydrants_uppereast_geocoded$trouble = ifelse(df_fire_hydrants_uppereast_geocoded$duplicated == 'Yes' & df_fire_hydrants_uppereast_geocoded$luxury == 'No', 'Repeat, but not Luxury Car', df_fire_hydrants_uppereast_geocoded$trouble)
df_fire_hydrants_uppereast_geocoded$trouble = ifelse(df_fire_hydrants_uppereast_geocoded$duplicated == 'No' & df_fire_hydrants_uppereast_geocoded$luxury == 'Yes', 'Not Repeat, but Luxury Car', df_fire_hydrants_uppereast_geocoded$trouble)
df_fire_hydrants_uppereast_geocoded$trouble = ifelse(df_fire_hydrants_uppereast_geocoded$duplicated == 'No' & df_fire_hydrants_uppereast_geocoded$luxury == 'No', 'Neither Repeat nor Luxury Car', df_fire_hydrants_uppereast_geocoded$trouble)
library(RColorBrewer)
pal = colorFactor("Set1", domain = df_fire_hydrants_uppereast_geocoded$trouble) # Grab a palette
color_trouble = pal(df_fire_hydrants_uppereast_geocoded$trouble)
m %>% addCircles(color = color_trouble , popup = content2) %>%
addLegend(pal = pal, values = ~df_fire_hydrants_uppereast_geocoded$trouble, title = "Are Repeat Offenders or Luxury Car Drivers Responsible? Not Exactly.")
## Assuming "lon" and "lat" are longitude and latitude, respectively
Note: The data here seems a bit unclean, with some observations categorized as falling within the Upper East Side, when they do not.
Add marker clustering, so that zooming in will reveal the individual locations but the zoomed out map only shows the clusters. Show the map with clusters.
mclust <- leaflet(df_fire_hydrants_uppereast_geocoded) %>%
addTiles() %>% # Add OpenStreetMap map tiles
addCircleMarkers(color = color_trouble,
popup = content2,
clusterOptions = markerClusterOptions())
## Assuming "lon" and "lat" are longitude and latitude, respectively
mclust