library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(httr)
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(supernova)
library(AICcmodavg)
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
##
## The following object is masked from 'package:Matrix':
##
## mean
##
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## stat
##
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
##
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
# Request up to 99,999 rows from the data.cityofnewyork.us endpoint, ordered
# by issue_date descending.
endpoint <- "https://data.cityofnewyork.us/resource/nc67-uf89.json"
resp <- GET(
  endpoint,
  query = list(
    "$limit" = 99999,
    "$order" = "issue_date DESC"
  )
)
# Stop with an informative error on a 4xx/5xx response instead of handing an
# error body to the JSON parser.
stop_for_status(resp)
# Pass the encoding explicitly so httr does not have to guess it.
camera <- fromJSON(
  content(resp, as = "text", encoding = "UTF-8"),
  flatten = TRUE
)
# Convert the dollar-amount columns to numeric.
camera <- mutate(
  camera,
  across(
    c(fine_amount, interest_amount, reduction_amount, payment_amount,
      amount_due),
    as.numeric
  )
)
# Keep only rows whose issue_date begins with a "YYYY-MM-DDT" pattern.
camera <- filter(camera, str_detect(issue_date, "^\\d{4}-\\d{2}-\\d{2}T"))
# Collapse the various county abbreviations into one full name per county.
# Values that match none of the codes (including NA) are left unchanged.
camera <- mutate(
  camera,
  county = case_match(
    county,
    c("Q", "QN", "Qns") ~ "Queens County",
    c("K", "BK", "Kings") ~ "Kings County",
    c("BX", "Bronx") ~ "Bronx County",
    c("NY", "MN") ~ "New York County",
    c("R", "ST", "RICH") ~ "Richmond County",
    .default = county
  )
)
# Rename the plate-state and issuing-agency columns in a single step.
camera <- rename(camera, plate_state = state, agency = issuing_agency)
# Subset to violations issued to NJ, NY, and CT plates.
camera_states <- filter(camera, plate_state %in% c("NJ", "NY", "CT"))
I have been hired as a data scientist by a law firm that specializes in fighting parking and camera tickets. The firm wants to improve its marketing strategy and has asked me to look for patterns in NYC violation data that can inform it. So far, I have looked at day of the week, time of day, and violation type; unfortunately, none of them showed anything of much significance. Now I will look at three more variables (the issuing agency, the state on the license plate of the vehicle that received the violation, and the county the violation was issued in) and how each relates to payment amount. Here is the dataset I am using: NYC Parking and Camera Violation
# Horizontal boxplots of payment amount for each agency.
camera %>%
  ggplot(aes(x = agency, y = payment_amount, fill = agency)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Payment Amount Across Agencies",
    x = "Agency",
    y = "Payment Amount in Dollars"
  ) +
  # The fill colour repeats the axis labels, so the legend is hidden.
  theme(legend.position = "none")
# Summary statistics per agency, sorted from highest to lowest mean.
arrange(favstats(payment_amount ~ agency, data = camera), desc(mean))
## agency min Q1 median Q3 max
## 1 HEALTH DEPARTMENT POLICE 243.81 243.81 243.81 243.8100 243.81
## 2 SEA GATE ASSOCIATION POLICE 190.00 190.00 190.00 190.0000 190.00
## 3 FIRE DEPARTMENT 180.00 180.00 180.00 180.0000 180.00
## 4 NYS OFFICE OF MENTAL HEALTH POLICE 0.00 180.00 180.00 190.0000 210.00
## 5 PORT AUTHORITY 0.00 180.00 180.00 190.0000 242.76
## 6 ROOSEVELT ISLAND SECURITY 0.00 135.00 180.00 190.0000 246.68
## 7 NYS PARKS POLICE 0.00 0.00 180.00 190.0000 242.58
## 8 POLICE DEPARTMENT 0.00 65.00 180.00 190.0000 260.00
## 9 PARKS DEPARTMENT 0.00 90.00 180.00 190.0000 245.28
## 10 TAXI AND LIMOUSINE COMMISSION 125.00 125.00 125.00 125.0000 125.00
## 11 HEALTH AND HOSPITAL CORP. POLICE 0.00 0.00 180.00 190.0000 245.64
## 12 CON RAIL 0.00 0.00 95.00 228.8875 243.87
## 13 DEPARTMENT OF TRANSPORTATION 0.00 50.00 75.00 125.0000 690.04
## 14 TRAFFIC 0.00 65.00 115.00 115.0000 245.79
## 15 TRANSIT AUTHORITY 0.00 0.00 75.00 125.0000 190.00
## 16 DEPARTMENT OF SANITATION 0.00 48.75 65.00 115.0000 115.00
## 17 LONG ISLAND RAILROAD 0.00 0.00 0.00 0.0000 0.00
## mean sd n missing
## 1 243.81000 NA 1 0
## 2 190.00000 0.00000 2 0
## 3 180.00000 NA 1 0
## 4 161.33333 65.99423 15 0
## 5 150.49319 80.53742 47 0
## 6 149.16083 90.57967 24 0
## 7 142.50970 90.27092 33 0
## 8 136.71574 82.82498 190 0
## 9 128.47736 78.92728 144 0
## 10 125.00000 NA 1 0
## 11 124.71373 98.60130 51 0
## 12 112.62000 124.87146 6 0
## 13 99.52822 82.88394 87273 0
## 14 94.59362 44.47453 12091 0
## 15 78.00000 82.05181 5 0
## 16 66.25000 45.48351 12 0
## 17 0.00000 NA 1 0
# One-way ANOVA of payment amount on issuing agency.
anova_model_agency <- aov(payment_amount ~ agency, data = camera)
summary(anova_model_agency)
## Df Sum Sq Mean Sq F value Pr(>F)
## agency 16 1063435 66465 10.59 <2e-16 ***
## Residuals 99880 627060364 6278
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Print the supernova ANOVA table for the agency model (includes PRE).
supernova(anova_model_agency)
## Analysis of Variance Table (Type III SS)
## Model: payment_amount ~ agency
##
## SS df MS F PRE p
## ----- --------------- | ------------- ----- --------- ------ ----- -----
## Model (error reduced) | 1063434.678 16 66464.667 10.587 .0017 .0000
## Error (from model) | 627060364.280 99880 6278.137
## ----- --------------- | ------------- ----- --------- ------ ----- -----
## Total (empty model) | 628123798.957 99896 6287.777
SSagency = 1063435
SSerror = 627060364
While there is a considerable amount of variability between agencies, there is much more variability within agencies.
F = 10.587
P < 2e-16
This is statistically significant (P < 0.05).
Only about 0.17% of the variance in payment amount is explained by the agency that issued the fine.
Although the differences between agencies are statistically significant, they are not practically significant. The issuing agency explains only about 0.17% of the variance in payment amount, well under 1%, so the effect is negligible in real-world terms. I would not recommend that the law firm build its marketing strategy around this variable: even if it could target each agency separately, agency would still account for less than 1% of the variance in payment amount.
# Boxplots of payment amount for each license plate state (NJ, NY, CT subset).
camera_states %>%
  ggplot(aes(x = plate_state, y = payment_amount, fill = plate_state)) +
  geom_boxplot() +
  labs(
    title = "Payment Amount Across License Plate States",
    x = "License Plate State",
    y = "Payment Amount in Dollars"
  ) +
  # The fill colour repeats the axis labels, so the legend is hidden.
  theme(legend.position = "none")
# Summary statistics per plate state, sorted from highest to lowest mean.
arrange(
  favstats(payment_amount ~ plate_state, data = camera_states),
  desc(mean)
)
## plate_state min Q1 median Q3 max mean sd n missing
## 1 NJ 0 50 75 115 682.35 101.5746 89.97170 8654 0
## 2 NY 0 50 75 125 690.04 101.0978 80.92861 79528 0
## 3 CT 0 50 75 100 276.57 80.6627 46.07849 1457 0
# One-way ANOVA of payment amount on license plate state.
anova_model_state <- aov(payment_amount ~ plate_state, data = camera_states)
summary(anova_model_state)
## Df Sum Sq Mean Sq F value Pr(>F)
## plate_state 2 603061 301530 45.5 <2e-16 ***
## Residuals 89636 593994009 6627
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Print the supernova ANOVA table for the plate-state model (includes PRE).
supernova(anova_model_state)
## Analysis of Variance Table (Type III SS)
## Model: payment_amount ~ plate_state
##
## SS df MS F PRE p
## ----- --------------- | ------------- ----- ---------- ------ ----- -----
## Model (error reduced) | 603060.721 2 301530.360 45.502 .0010 .0000
## Error (from model) | 593994008.724 89636 6626.735
## ----- --------------- | ------------- ----- ---------- ------ ----- -----
## Total (empty model) | 594597069.446 89638 6633.315
SSstate = 603061
SSerror = 593994009
While there is a considerable amount of variability between states, there is much more variability within states.
F = 45.502
P < 2e-16
This is statistically significant (P < 0.05).
Only about 0.1% of the variance in payment amount is explained by the states the drivers are from.
Again, although the differences between states are statistically significant, they are not practically significant. The state on the driver's license plate explains only about 0.1% of the variance in payment amount, which is even less than the issuing agency explains. This is, again, not a meaningful difference in the real world. I would not recommend that the law firm use this variable in its marketing strategy, because at best it would account for less than 1% of the variance in payment amount.
# Boxplots of payment amount for each county. Rows with a missing county are
# removed first so the plot covers the same rows as the ANOVA on county
# (aov drops rows with missing values) instead of adding an extra "NA" box.
camera %>%
  filter(!is.na(county)) %>%
  ggplot(aes(x = county, y = payment_amount, fill = county)) +
  geom_boxplot() +
  labs(
    title = "Payment Amount Across Counties",
    x = "County",
    y = "Payment Amount in Dollars"
  ) +
  # The fill colour repeats the axis labels, so the legend is hidden.
  theme(legend.position = "none")
# Summary statistics per county, sorted from highest to lowest mean.
favstats(payment_amount ~ county, data = camera) %>% arrange(desc(mean))
## county min Q1 median Q3 max mean sd n missing
## 1 Richmond County 0 50 125 180.0 250.00 114.53669 77.55385 1349 0
## 2 Kings County 0 50 75 115.0 690.04 110.90567 126.20960 16108 0
## 3 Bronx County 0 65 75 152.5 245.64 100.38053 67.32482 244 0
## 4 New York County 0 50 75 115.0 281.80 97.64833 62.54609 23468 0
## 5 Queens County 0 50 50 100.0 283.03 83.49201 60.07357 17357 0
# One-way ANOVA of payment amount on county.
anova_model_county <- aov(payment_amount ~ county, data = camera)
summary(anova_model_county)
## Df Sum Sq Mean Sq F value Pr(>F)
## county 4 6694742 1673685 233.1 <2e-16 ***
## Residuals 58521 420213429 7181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 41371 observations deleted due to missingness
# Print the supernova ANOVA table for the county model (includes PRE).
supernova(anova_model_county)
## Refitting to remove 41371 cases with missing value(s)
## ℹ aov(formula = payment_amount ~ county, data = listwise_delete(camera,
## c("payment_amount", "county")))
## Analysis of Variance Table (Type III SS)
## Model: payment_amount ~ county
##
## SS df MS F PRE p
## ----- --------------- | ------------- ----- ----------- ------- ----- -----
## Model (error reduced) | 6694741.856 4 1673685.464 233.086 .0157 .0000
## Error (from model) | 420213428.932 58521 7180.558
## ----- --------------- | ------------- ----- ----------- ------- ----- -----
## Total (empty model) | 426908170.788 58525 7294.458
SScounty = 6694742
SSerror = 420213429
Again, while there is a considerable amount of variability between counties, there is much more variability within counties.
F = 233.1
P < 2e-16
This is statistically significant (p < 0.05).
About 1.6% of the variance in payment amount (PRE = .0157) is explained by the county the fine was issued in.
Again, although the differences between counties are statistically significant, they are not very practically significant. The county explains only about 1.6% of the variance in payment amount. Though not a high percentage by any means, it is the highest percentage we have found thus far! It is still unlikely to be a meaningful difference in the real world. I would not recommend this as the best variable for the firm to use in its marketing strategy, since it would account for only about 1.6% of the variance in payment amount.
If the law firm had only these three variables (agency, state, and county) to choose from for its marketing strategy, I would suggest prioritizing county, since it accounts for the largest share of the variance in payment amount of the three.