# Packages this script relies on: dplyr (%>%, group_by, summarise, filter, n),
# ggplot2 (all plots) and lubridate (day()).
# NOTE(review): day() is assumed to come from lubridate -- confirm.
library(dplyr)
library(ggplot2)
library(lubridate)

# Print the working directory that the relative path "tickets.csv" is resolved against.
getwd()
# Load the citation data. stringsAsFactors = TRUE is explicit because later steps
# call levels() and relevel() on text columns, and R >= 4.0 no longer converts
# strings to factors by default.
tickets <- read.csv("tickets.csv", header = TRUE, sep = ",",
  stringsAsFactors = TRUE)
##
## BUS ZONE DBL PARK PK PHB OTD PRK PROHIB TWAWY ZN#1 TWAWY ZONE
## 2338 2876 1479 2523 5293 2295
## Violation Weekday Total.Revenue Daily.Average.Revenue
## 1 BUS ZONE Weekday 114256 57128.0
## 2 DBL PARK Weekday 131245 65622.5
## 3 PK PHB OTD Weekday 112951 56475.5
## 4 PRK PROHIB Weekday 219356 109678.0
## 5 TWAWY ZN#1 Weekday 401554 200777.0
## 6 TWAWY ZONE Weekday 167374 83687.0
## 7 BUS ZONE Weekend 52657 26328.5
## 8 DBL PARK Weekend 45969 22984.5
## 9 PK PHB OTD Weekend 35518 17759.0
## 10 PRK PROHIB Weekend 60396 30198.0
## 11 TWAWY ZN#1 Weekend 99012 49506.0
## 12 TWAWY ZONE Weekend 49369 24684.5
## Source: local data frame [12 x 3]
## Groups: Violation [?]
##
## Violation Citation.Issue.Weekday Amount_Paid
## <fctr> <fctr> <int>
## 1 BUS ZONE Weekday 114256
## 2 BUS ZONE Weekend 52657
## 3 DBL PARK Weekday 131245
## 4 DBL PARK Weekend 45969
## 5 PK PHB OTD Weekday 112951
## 6 PK PHB OTD Weekend 35518
## 7 PRK PROHIB Weekday 219356
## 8 PRK PROHIB Weekend 60396
## 9 TWAWY ZN#1 Weekday 401554
## 10 TWAWY ZN#1 Weekend 99012
## 11 TWAWY ZONE Weekday 167374
## 12 TWAWY ZONE Weekend 49369
##
## 4 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## 4 2 182 1016 394 493 530 516 383 335 730 3375 2589 82 37
## 20 21 22
## 22 1 1
# Number of citations issued in each hour of the day (one row per hour)
Tickets.per.Hour <- summarise(
  group_by(tickets, Citation.Issue.Hour),
  count = n()
)
Tickets.per.Hour
## # A tibble: 20 × 2
## Citation.Issue.Hour count
## <int> <int>
## 1 2 1
## 2 4 6
## 3 5 2
## 4 6 4
## 5 7 214
## 6 8 1142
## 7 9 619
## 8 10 754
## 9 11 812
## 10 12 837
## 11 13 673
## 12 14 575
## 13 15 966
## 14 16 6190
## 15 17 3669
## 16 18 183
## 17 19 99
## 18 20 52
## 19 21 4
## 20 22 2
## Share of each violation type within every citation hour (bars scaled to 1)
ggplot(data = tickets, mapping = aes(x = Citation.Issue.Hour, fill = Violation)) +
  geom_bar(position = "fill") +
  scale_x_continuous(breaks = seq(0, 24, 1)) +
  labs(x = "Hour of Citation", title = "Stacked Bar Chart of Violations by Citation Issue Hour ") +
  theme(plot.title = element_text(hjust = 0.5))
# NOTE: The graph shows almost entirely Double Parking and Bus Zone
# Violations during non-rush-hour and almost entirely Towaway and Prohibited
# Parking during rush hour
# Hourly citation counts, one histogram panel per violation type
ggplot(tickets, aes(x = Citation.Issue.Hour)) +
  geom_histogram(fill = "salmon", color = "black", bins = 24) +
  facet_wrap(~Violation) +
  labs(x = "Citation Issue Hour", title = "Number of Tickets by Violation and Hour of Citation") +
  theme(plot.title = element_text(hjust = 0.5))
# Violations broken down by year (dodged bars, one colour per citation year).
# Columns are named bare inside aes() so ggplot2 looks them up in `data`;
# `tickets$...` would bypass the plot data.
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Violation, fill = as.factor(Citation.Issue.Year)), position = "dodge") +
  coord_flip() +
  xlab("Violation") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Number of Tickets by Violation and Year of Citation") +
  labs(fill = "Citation Issue Year")
# NOTE(review): several of the sections below appear twice in slightly different
# forms, which looks like a merge that was never fully resolved -- confirm which
# variant should be kept.
# Violations: total citations per violation type
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Violation)) +
  coord_flip()
# Violations broken down by year
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Violation, fill = as.factor(Citation.Issue.Year)), position = "dodge") +
  coord_flip() +
  labs(title = "Violations by Year", x = "Violation", y = "Total Citations", fill = "Year")
# Fine amount: number of citations issued at each distinct fine value
table(tickets$Fine.Amount)
##
## 70 73 75 78 80 83 85 88 93 95 98 100 110 250 253
## 68 124 1280 396 882 1547 4421 573 1254 1576 947 148 1250 194 362
## 255 267 271 279
## 785 451 332 214
# Bar chart of the same counts; columns are named bare so that aes() and
# facet_wrap() both resolve them against the plot data.
fine.plot <- ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Fine.Amount)) +
  labs(x = "Fine Amount", y = "Total Citations")
fine.plot
## By Violation
fine.plot + facet_wrap(~Violation)
## It is clear that tickets have a low fine amount and high fine amount. Is
## it time of day, weekday vs weekend, year, etc.?
# Fine amount counts split by weekday/weekend (dodged), one panel per violation.
# Bare column names keep aes() and facet_wrap() tied to the plot data.
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Fine.Amount, fill = Citation.Issue.Weekday), position = "dodge") +
  facet_wrap(~Violation) +
  labs(x = "Fine Amount", y = "Total Citations", fill = "Weekday or Weekend")
# ggplot(data = tickets) + geom_bar(mapping = aes(x = tickets$Fine.Amount,
# fill = as.factor(tickets$Citation.Issue.Year)), position = 'fill') +
# facet_wrap(~tickets$Violation)
# Number of citations at each distinct fine value
table(tickets$Fine.Amount)
##
## 70 73 75 78 80 83 85 88 93 95 98 100 110 250 253
## 68 124 1280 396 882 1547 4421 573 1254 1576 947 148 1250 194 362
## 255 267 271 279
## 785 451 332 214
# Histogram of fine amount (default binning); the column is named bare so the
# layer reads it from the plot data.
fine.plot <- ggplot(data = tickets) +
  geom_histogram(mapping = aes(x = Fine.Amount), fill = "salmon")
fine.plot +
  xlab("Fine Amount") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Histogram of Fine Amount")
## By Violation
fine.plot +
  facet_wrap(~Violation) +
  xlab("Fine Amount") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Histogram of Fine Amount by Violation")
## It is clear that tickets have a low fine amount and high fine amount. Is
## it time of day, weekday vs weekend, year, etc.? Let's make a factor
## variable to check this
# Remove from workspace
rm(fine.plot)
# Fine.Level: two-level factor marking fines below 175 as "Low" and all others
# as "High". The level order is given explicitly so "Low" is the first level
# even if one of the two values happens to be absent from the data.
tickets$Fine.Level <- factor(
  ifelse(tickets$Fine.Amount < 175, "Low", "High"),
  levels = c("Low", "High")
)
## Generate table
table(tickets$Fine.Level)
##
## Low High
## 14466 2338
# Bar chart of how many citations fall in each fine level; the column is named
# bare so aes() and facet_wrap() resolve it against the plot data.
fine.plot2 <- ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Fine.Level), fill = "salmon")
fine.plot2 +
  xlab("Fine Level") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Bar Chart of Fine Level")
## Fine level by Violation
fine.plot2 +
  facet_wrap(~Violation) +
  xlab("Fine Level") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Bar Chart of Fine Level by Violation")
# Remove from workspace
rm(fine.plot2)
# Plot Fine level by Hour: citations per hour, one panel per fine level.
# The x axis is the citation hour, so it is labelled as such (it previously
# read "Fine Level", which is the facet variable, not the axis).
fine.plot3 <- ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Issue.Hour), fill = "salmon")
fine.plot3 +
  facet_wrap(~Fine.Level) +
  xlab("Hour of Citation") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Bar Chart of Fine Level by Hour of Citation")
## ******High Fine.Level seems to spike at 4 and 5 pm. Is that during Rush
## Hour? but there is a similar spike in Low Fine.Level too*******
# Remove from workspace
rm(fine.plot3)
# The four plots below each show citation counts over one time variable, with
# one panel per fine level. Columns are named bare inside aes()/facet_wrap(),
# and each x axis is labelled with the variable actually plotted on it.

# Plot Fine level by weekday vs weekend
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Issue.Weekday), fill = "salmon") +
  facet_wrap(~Fine.Level) +
  xlab("Weekday or Weekend") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Bar Chart of Fine Level by Weekday and Weekend")
## Does not seem to be a correlation
# Plot Fine level by day of week
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Issue.DayOfWeek), fill = "salmon") +
  facet_wrap(~Fine.Level) +
  xlab("Day of the Week") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Bar Chart of Fine Level by Day of the Week")
# Plot fine level by month
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Issue.Month), fill = "salmon") +
  facet_wrap(~Fine.Level) +
  xlab("Month") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Bar Chart of Fine Level by Month")
# Plot fine level by year. `fill` is a constant colour, so it is set on the
# layer rather than inside aes(), where it would be treated as a mapped
# variable and add a legend.
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = as.factor(Citation.Issue.Year)), fill = "salmon") +
  facet_wrap(~Fine.Level) +
  xlab("Citation Issue Year") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Bar Chart of Fine Level by Year")
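# It looks like they introduced "High" level fines in 2014, as it is the only year in the dataset in which a "High" fine level occurs.
# NOTE(review): other output in this file shows fines of 250 and above (i.e. "High") on 2009 citations as well -- re-check this conclusion.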
# Citation Status: count of closed vs open citations
table(tickets$Citation.Status)
##
## Closed Open
## 15110 1694
# Same counts as bar charts; columns are named bare so aes() and facet_wrap()
# resolve them against the plot data.
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Status)) +
  labs(x = "Citation Status", y = "Total Citations")
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Status), fill = "salmon") +
  xlab("Citation Status") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Bar Chart of Citation Status")
## Most are closed. Which ones are still open though?
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Status), fill = "salmon") +
  xlab("Citation Status") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Bar Chart of Citation Status by Year") +
  facet_wrap(~as.factor(Citation.Issue.Year))
## It does not make sense to look at fine level or similar breakdowns here.
# Amount due against citation status, as heavily transparent points (alpha 1/50)
# so that overplotted values remain readable. Columns are named bare in aes().
ggplot(data = tickets) +
  geom_point(mapping = aes(x = Citation.Status, y = Amount.Due), alpha = 1/50) +
  xlab("Citation Status") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Dot plot of Citation Status by Amount Due")
# Same comparison as a boxplot
ggplot(data = tickets) +
  geom_boxplot(aes(x = Citation.Status, y = Amount.Due)) +
  xlab("Citation Status") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Boxplot of Citation Status by Amount Due")
# Fine amount against amount due, coloured by citation status
ggplot(data = tickets) +
  geom_point(mapping = aes(x = Fine.Amount, y = Amount.Due, color = Citation.Status)) +
  xlab("Fine Amount") +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Dot plot of Fine Amount vs Amount Due by Citation Status") +
  ylab("Amount Due")
# Amount paid: frequency of each distinct paid amount across all citations
table(tickets$Amount.Paid)
##
## 1 2 3 4 5 7 8 9 10 11 12 13 14 15 17
## 643 665 1124 846 739 175 23 31 23 1159 78 73 1 42 1
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 102 107 72 89 1 1 13 11 129 42 63 16 55 1 26
## 33 34 35 36 37 39 40 41 42 43 44 45 46 47 48
## 272 96 4 72 1 3 22 100 3 10 107 6 1 6 18
## 49 50 51 52 53 54 55 56 57 58 59 60 62 63 64
## 9 1 5 2 4 44 1 49 29 9 4 17 5 52 16
## 65 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 20 12 1 16 9 23 1 19 5 2 25 16 2 1 14
## 81 82 83 84 86 88 89 90 92 93 94 96 97 99 100
## 3 7 2 1 1 1 90 157 392 1 255 193 12 24 116
## 101 102 103 104 105 106 107 108 110 111 112 113 114 115 116
## 42 9 4 1 24 20 16 1 1 1 13 5 25 32 64
## 117 120 121 122 123 124 125 126 127 128 129 131 132 133 134
## 1 1 44 8 33 15 1 5 4 1 7 2 10 1 1
## 136 137 138 139 140 141 142 143 144 145 147 150 151 152 153
## 3 1 10 4 3 1 1 4 1 1 3 1 2 1 39
## 154 155 156 157 158 159 161 163 165 167 168
## 71 717 251 571 867 2376 332 794 1011 605 2
# Bar chart of how often each paid amount occurs (column named bare in aes())
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Amount.Paid))
# Why are some amounts paid so low? Are these cases where status is open
# still?
# Group the citations by paid amount and status, then summarise every column
tickets_by_paid_status <- tickets %>%
  group_by(Amount.Paid, Citation.Status)
summary(tickets_by_paid_status)
## Ticket.Number Citation.Issue.Date Violation Fine.Amount
## Min. :100000476 2012-09-25: 58 BUS ZONE :2338 Min. : 70.0
## 1st Qu.:794817169 2010-07-19: 53 DBL PARK :2876 1st Qu.: 83.0
## Median :814718876 2012-08-20: 50 PK PHB OTD:1479 Median : 85.0
## Mean :805965065 2012-09-06: 50 PRK PROHIB:2523 Mean :112.3
## 3rd Qu.:831587005 2012-09-20: 50 TWAWY ZN#1:5293 3rd Qu.: 98.0
## Max. :977362606 2011-07-28: 48 TWAWY ZONE:2295 Max. :279.0
## (Other) :16495
## Citation.Status Amount.Paid Amount.Due Citation.Issue.Month
## Closed:15110 Min. : 1.00 Min. : 0.00 Jul :1865
## Open : 1694 1st Qu.: 8.00 1st Qu.: 0.00 Sep :1862
## Median : 96.00 Median : 0.00 Mar :1736
## Mean : 88.65 Mean : 19.02 Aug :1652
## 3rd Qu.:159.00 3rd Qu.: 0.00 Oct :1627
## Max. :168.00 Max. :392.00 Apr :1533
## (Other):6529
## Paid.On.Time Latitude Longitude Citation.Issue.Year
## Min. :0.0000 Min. :37.71 Min. :-122.5 Min. :2009
## 1st Qu.:1.0000 1st Qu.:37.79 1st Qu.:-122.4 1st Qu.:2011
## Median :1.0000 Median :37.79 Median :-122.4 Median :2012
## Mean :0.9015 Mean :37.79 Mean :-122.4 Mean :2012
## 3rd Qu.:1.0000 3rd Qu.:37.79 3rd Qu.:-122.4 3rd Qu.:2013
## Max. :1.0000 Max. :37.80 Max. :-122.4 Max. :2014
##
## Citation.Issue.DayOfWeek Citation.Issue.Weekday Citation.Issue.Hour
## Fri : 546 Weekday:12779 Min. : 2.00
## Mon :3558 Weekend: 4025 1st Qu.:12.00
## Sat : 809 Median :16.00
## Sun :3216 Mean :14.39
## Thurs:2978 3rd Qu.:16.00
## Tues :3077 Max. :22.00
## Wed :2620
## Citation.Issue.MonthYear Fine.Level
## Sep-2012: 528 Low :14466
## Jul-2014: 471 High: 2338
## Sep-2014: 428
## Aug-2012: 410
## Mar-2011: 402
## Apr-2013: 365
## (Other) :14200
# Paid amounts coloured by citation status. aes() uses bare column names so the
# layer reads from the data frame passed as `data`, not from the global `tickets`.
ggplot(data = tickets_by_paid_status) +
  geom_bar(mapping = aes(x = Amount.Paid, fill = Citation.Status))
# Most of these ones are still open but it still doesn't make sense. What
# types of violations are these?
tickets_paid_viol <- tickets %>%
  group_by(Amount.Paid, Violation)
# Paid amounts coloured by violation type
ggplot(data = tickets_paid_viol) +
  geom_bar(mapping = aes(x = Amount.Paid, fill = Violation))
# Looks like prices of violations vary. By time? Weekend?
# Amount Due: frequency of each distinct outstanding balance
table(tickets$Amount.Due)
##
## 0 7 25 29 30 38 39 60 68 75
## 15110 1 2 2 2 1 7 3 7 1
## 82 83 85 87 88 90 93 95 98 100
## 1 46 34 1 23 2 17 20 17 2
## 102 103 105 106 107 108 110 111 113 114
## 1 4 5 5 1 1 34 1 5 1
## 115 117 118 120 121 125 127 128 129 130
## 39 17 1 6 1 40 14 1 1 1
## 133 135 137 139 140 141 143 144 145 148
## 1 3 1 5 11 1 1 1 5 1
## 150 151 152 155 156 158 160 161 162 163
## 6 12 3 3 53 7 9 33 1 13
## 164 165 166 168 170 170.95 172 173 175 176
## 1 11 107 7 2 1 1 4 153 4
## 177 178 179 180 182 185 187 189 191 195
## 14 10 2 21 5 81 29 33 38 35
## 196 199 200 201 202 205 206 209 211 216
## 20 39 1 51 6 62 58 1 53 14
## 218 220 221 223 250 267 271 278 279 285
## 5 12 9 6 1 15 13 6 18 4
## 295 297 308 309 310 313 315 320 331 335
## 5 2 4 2 3 1 1 2 1 15
## 337 338 340 343 345 347 354 355 357 359
## 4 18 2 5 9 17 1 22 12 1
## 373 375 381 382 392
## 25 12 16 11 12
# Histogram of the amount still due, coloured by violation type (default
# binning). Columns are named bare so aes() reads them from the plot data.
ggplot(data = tickets) +
  geom_histogram(mapping = aes(x = Amount.Due, fill = Violation))
## The vast majority of tickets are fully paid off
head(table(tickets$Amount.Due))
##
## 0 7 25 29 30 38
## 15110 1 2 2 2 1
## Amount due by citation status: group the citations, then summarise every column
tickets_due_status <- group_by(tickets, Amount.Due, Citation.Status)
summary(tickets_due_status)
## Ticket.Number Citation.Issue.Date Violation Fine.Amount
## Min. :100000476 2012-09-25: 58 BUS ZONE :2338 Min. : 70.0
## 1st Qu.:794817169 2010-07-19: 53 DBL PARK :2876 1st Qu.: 83.0
## Median :814718876 2012-08-20: 50 PK PHB OTD:1479 Median : 85.0
## Mean :805965065 2012-09-06: 50 PRK PROHIB:2523 Mean :112.3
## 3rd Qu.:831587005 2012-09-20: 50 TWAWY ZN#1:5293 3rd Qu.: 98.0
## Max. :977362606 2011-07-28: 48 TWAWY ZONE:2295 Max. :279.0
## (Other) :16495
## Citation.Status Amount.Paid Amount.Due Citation.Issue.Month
## Closed:15110 Min. : 1.00 Min. : 0.00 Jul :1865
## Open : 1694 1st Qu.: 8.00 1st Qu.: 0.00 Sep :1862
## Median : 96.00 Median : 0.00 Mar :1736
## Mean : 88.65 Mean : 19.02 Aug :1652
## 3rd Qu.:159.00 3rd Qu.: 0.00 Oct :1627
## Max. :168.00 Max. :392.00 Apr :1533
## (Other):6529
## Paid.On.Time Latitude Longitude Citation.Issue.Year
## Min. :0.0000 Min. :37.71 Min. :-122.5 Min. :2009
## 1st Qu.:1.0000 1st Qu.:37.79 1st Qu.:-122.4 1st Qu.:2011
## Median :1.0000 Median :37.79 Median :-122.4 Median :2012
## Mean :0.9015 Mean :37.79 Mean :-122.4 Mean :2012
## 3rd Qu.:1.0000 3rd Qu.:37.79 3rd Qu.:-122.4 3rd Qu.:2013
## Max. :1.0000 Max. :37.80 Max. :-122.4 Max. :2014
##
## Citation.Issue.DayOfWeek Citation.Issue.Weekday Citation.Issue.Hour
## Fri : 546 Weekday:12779 Min. : 2.00
## Mon :3558 Weekend: 4025 1st Qu.:12.00
## Sat : 809 Median :16.00
## Sun :3216 Mean :14.39
## Thurs:2978 3rd Qu.:16.00
## Tues :3077 Max. :22.00
## Wed :2620
## Citation.Issue.MonthYear Fine.Level
## Sep-2012: 528 Low :14466
## Jul-2014: 471 High: 2338
## Sep-2014: 428
## Aug-2012: 410
## Mar-2011: 402
## Apr-2013: 365
## (Other) :14200
# Histogram of amount due, coloured by citation status. Bare column names make
# the layer read from the `data` argument rather than the global `tickets`.
ggplot(data = tickets_due_status) +
  geom_histogram(mapping = aes(x = Amount.Due, fill = Citation.Status))
# That seems to tell the whole story
# Paid on time: count of citations per flag value (0/1)
table(tickets$Paid.On.Time)
##
## 0 1
## 1656 15148
# Bar chart of the paid-on-time flag, treated as a category
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = as.factor(Paid.On.Time)))
# Day of week: put the levels in calendar order (Sun..Sat). The levels are
# listed by name rather than by position in the alphabetical default, so the
# result does not depend on the current level order and the line can be re-run.
tickets$Citation.Issue.DayOfWeek <- factor(
  tickets$Citation.Issue.DayOfWeek,
  levels = c("Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")
)
# Month: number of citations per calendar month
table(tickets$Citation.Issue.Month)
##
## Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
## 1533 1652 1042 1184 790 1865 1199 1736 1270 1044 1627 1862
# Citations per calendar month. geom_bar() counts rows, so the bars are totals
# over the whole dataset; the title and y label say "Total" to match.
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Issue.Month), fill = "skyblue3", color = "gray29") +
  labs(title = "Total Number of Citations per Month", x = "Month", y = "Total Citations")
# Year: number of citations per year
table(tickets$Citation.Issue.Year)
##
## 2009 2010 2011 2012 2013 2014
## 1288 2090 3058 3726 3108 3534
# Citations per year, with one axis break per year from 2009 to 2014
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Issue.Year), fill = "skyblue3", color = "gray29") +
  labs(title = "Total Citations per Year", x = "Year", y = "Total Citations") +
  scale_x_continuous(breaks = seq(2009, 2014, 1))
# Day of Week: number of citations per day of the week
table(tickets$Citation.Issue.DayOfWeek)
##
## Sun Mon Tues Wed Thurs Fri Sat
## 3216 3558 3077 2620 2978 546 809
# Bar chart of total citations for each day of the week
ggplot(data = tickets, mapping = aes(x = Citation.Issue.DayOfWeek)) +
  geom_bar(fill = "indianred2", color = "black") +
  ggtitle("Total Citations by Day of Week") +
  xlab("Day of Week") +
  ylab("Total Citations")
# ******* Let's look at how we coded the Day of Week variable. This trend should be lower on Sunday ******
# Weekday: number of citations on weekdays vs weekends
table(tickets$Citation.Issue.Weekday)
##
## Weekday Weekend
## 12779 4025
# Bar chart of weekday vs weekend citation counts (column named bare in aes())
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Issue.Weekday))
# ******* Let's look at how we coded the Day of Week variable. This trend should be lower on Sunday ******
# Hour: number of citations per hour of the day
table(tickets$Citation.Issue.Hour)
##
## 2 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 1 6 2 4 214 1142 619 754 812 837 673 575 966 6190 3669
## 18 19 20 21 22
## 183 99 52 4 2
# Citations per hour of the day (column named bare in aes())
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Citation.Issue.Hour), fill = "skyblue3", color = "gray29") +
  labs(title = "Citations By Hour", x = "Hour of Day", y = "Total Citations")
# Citations are concentrated in the evening rush hour, on the way home
# Same graph but with rush hours (7-9 and 16-18) in blue and off hours in red.
# One colour is produced per one-hour bin between the earliest and latest hour
# in the data, so the vector always has as many entries as the histogram has bins.
bar.color <- ifelse(
  seq(min(tickets$Citation.Issue.Hour, na.rm = TRUE), max(tickets$Citation.Issue.Hour, na.rm = TRUE)) %in% c(7:9, 16:18),
  "royalblue3",
  "indianred2"
)
# The bars are coloured with a fixed vector rather than a mapped aesthetic, so
# there is no fill legend to label.
ggplot(tickets, aes(x = Citation.Issue.Hour)) +
  geom_histogram(binwidth = 1, fill = bar.color, color = "black") +
  scale_x_continuous(breaks = seq(0, 24, 1)) +
  labs(title = "Citations by Hour", x = "Hour of Day (24 Hour Scale)", y = "Total Citations")
# Summary statistics for every column of the tickets data
summary(tickets)
## Ticket.Number Citation.Issue.Date Violation Fine.Amount
## Min. :100000476 2012-09-25: 58 BUS ZONE :2338 Min. : 70.0
## 1st Qu.:794817169 2010-07-19: 53 DBL PARK :2876 1st Qu.: 83.0
## Median :814718876 2012-08-20: 50 PK PHB OTD:1479 Median : 85.0
## Mean :805965065 2012-09-06: 50 PRK PROHIB:2523 Mean :112.3
## 3rd Qu.:831587005 2012-09-20: 50 TWAWY ZN#1:5293 3rd Qu.: 98.0
## Max. :977362606 2011-07-28: 48 TWAWY ZONE:2295 Max. :279.0
## (Other) :16495
## Citation.Status Amount.Paid Amount.Due Citation.Issue.Month
## Closed:15110 Min. : 1.00 Min. : 0.00 Jul :1865
## Open : 1694 1st Qu.: 8.00 1st Qu.: 0.00 Sep :1862
## Median : 96.00 Median : 0.00 Mar :1736
## Mean : 88.65 Mean : 19.02 Aug :1652
## 3rd Qu.:159.00 3rd Qu.: 0.00 Oct :1627
## Max. :168.00 Max. :392.00 Apr :1533
## (Other):6529
## Paid.On.Time Latitude Longitude Citation.Issue.Year
## Min. :0.0000 Min. :37.71 Min. :-122.5 Min. :2009
## 1st Qu.:1.0000 1st Qu.:37.79 1st Qu.:-122.4 1st Qu.:2011
## Median :1.0000 Median :37.79 Median :-122.4 Median :2012
## Mean :0.9015 Mean :37.79 Mean :-122.4 Mean :2012
## 3rd Qu.:1.0000 3rd Qu.:37.79 3rd Qu.:-122.4 3rd Qu.:2013
## Max. :1.0000 Max. :37.80 Max. :-122.4 Max. :2014
##
## Citation.Issue.DayOfWeek Citation.Issue.Weekday Citation.Issue.Hour
## Sun :3216 Weekday:12779 Min. : 2.00
## Mon :3558 Weekend: 4025 1st Qu.:12.00
## Tues :3077 Median :16.00
## Wed :2620 Mean :14.39
## Thurs:2978 3rd Qu.:16.00
## Fri : 546 Max. :22.00
## Sat : 809
## Citation.Issue.MonthYear Fine.Level
## Sep-2012: 528 Low :14466
## Jul-2014: 471 High: 2338
## Sep-2014: 428
## Aug-2012: 410
## Mar-2011: 402
## Apr-2013: 365
## (Other) :14200
# summary(tickets_to2014)
# summary(tickets)
# Summary counts and bar chart of violation type
summary(tickets$Violation)
## BUS ZONE DBL PARK PK PHB OTD PRK PROHIB TWAWY ZN#1 TWAWY ZONE
## 2338 2876 1479 2523 5293 2295
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Violation))
# Violations broken down by year (columns named bare so aes() reads the plot data)
ggplot(data = tickets) +
  geom_bar(mapping = aes(x = Violation, fill = as.factor(Citation.Issue.Year)), position = "dodge")
# Because the distribution of fine amount has several high-end outliers, we
# should be using the median as our measure of central tendency. This is
# even more clear when comparing the difference between the mean and median
# (112.3 and 85).
median(tickets$Fine.Amount)
## [1] 85
mean(tickets$Fine.Amount)
## [1] 112.3348
# Median fine amount for each violation type
fine_by_viol <- summarise(
  group_by(tickets, Violation),
  median = median(Fine.Amount)
)
fine_by_viol
## # A tibble: 6 × 2
## Violation median
## <fctr> <dbl>
## 1 BUS ZONE 255
## 2 DBL PARK 80
## 3 PK PHB OTD 85
## 4 PRK PROHIB 95
## 5 TWAWY ZN#1 85
## 6 TWAWY ZONE 83
# Maybe put these results in a table?
# Median fine amount for each citation year
fine_by_year <- summarise(
  group_by(tickets, Citation.Issue.Year),
  median = median(Fine.Amount)
)
head(fine_by_year)
## # A tibble: 6 × 2
## Citation.Issue.Year median
## <int> <dbl>
## 1 2009 78
## 2 2010 85
## 3 2011 85
## 4 2012 85
## 5 2013 95
## 6 2014 95
# Line-and-point chart of the yearly median fine, to show the price trend over time
fine.plot.year <- ggplot(data = fine_by_year, aes(x = Citation.Issue.Year, y = median)) +
  geom_line() +
  geom_point()
fine.plot.year
# *******Median fine amount per year seems to have increased significantly
# over time. Index with inflation to see if this is really the case?*******
# Median fine amount for every violation type / citation year combination
fine_by_year_viol <- summarise(
  group_by(tickets, Violation, Citation.Issue.Year),
  median = median(Fine.Amount)
)
head(fine_by_year_viol)
## Source: local data frame [6 x 3]
## Groups: Violation [1]
##
## Violation Citation.Issue.Year median
## <fctr> <int> <dbl>
## 1 BUS ZONE 2009 253
## 2 BUS ZONE 2010 255
## 3 BUS ZONE 2011 255
## 4 BUS ZONE 2012 267
## 5 BUS ZONE 2013 271
## 6 BUS ZONE 2014 279
# Yearly median fine as a line-and-point chart, one panel per violation type,
# to compare how prices changed for each violation
fine.plot.yv <- ggplot(data = fine_by_year_viol, aes(x = Citation.Issue.Year, y = median)) +
  geom_line() +
  geom_point() +
  facet_wrap(~Violation)
fine.plot.yv
# ********** Looks like we should look at violations and the years they
# occur in and maybe get more specific on type of violation*********
# Fine amount against hour, day of week, month and year in turn, as scatter
# plots with one panel per violation type
ggplot(tickets) +
  geom_point(aes(x = Citation.Issue.Hour, y = Fine.Amount)) +
  facet_wrap(~Violation)
ggplot(tickets) +
  geom_point(aes(x = Citation.Issue.DayOfWeek, y = Fine.Amount)) +
  facet_wrap(~Violation)
ggplot(tickets) +
  geom_point(aes(x = Citation.Issue.Month, y = Fine.Amount)) +
  facet_wrap(~Violation)
ggplot(tickets) +
  geom_point(aes(x = Citation.Issue.Year, y = Fine.Amount)) +
  facet_wrap(~Violation)
# There appears to be no change over Hour, Day, or Month, but a slight increase in prices each year, with a larger spike in 2012.
# Analysis of price changes over the years by violation type
# Total number of citations for each violation type
ViolationCount <- summarise(
  group_by(tickets, Violation),
  count = n()
)
ViolationCount
## # A tibble: 6 × 2
## Violation count
## <fctr> <int>
## 1 BUS ZONE 2338
## 2 DBL PARK 2876
## 3 PK PHB OTD 1479
## 4 PRK PROHIB 2523
## 5 TWAWY ZN#1 5293
## 6 TWAWY ZONE 2295
# One data frame per violation type, each holding only that violation's citations
Violation.DBLPRK <- filter(tickets, Violation == "DBL PARK")
Violation.BUSZONE <- filter(tickets, Violation == "BUS ZONE")
Violation.PKPHBOTD <- filter(tickets, Violation == "PK PHB OTD")
Violation.PRKPROHIB <- filter(tickets, Violation == "PRK PROHIB")
Violation.TOWAWAY1 <- filter(tickets, Violation == "TWAWY ZN#1")
Violation.TOWAWAYzONE <- filter(tickets, Violation == "TWAWY ZONE")
# Fine amount against citation year, as one scatter plot per violation type
ggplot(Violation.DBLPRK) +
  geom_point(aes(x = Citation.Issue.Year, y = Fine.Amount)) +
  ggtitle("Double Parking")
ggplot(Violation.BUSZONE) +
  geom_point(aes(x = Citation.Issue.Year, y = Fine.Amount)) +
  ggtitle("Bus Zone")
ggplot(Violation.PKPHBOTD) +
  geom_point(aes(x = Citation.Issue.Year, y = Fine.Amount)) +
  ggtitle("Tow-Away Zone - Outside Downtown Core")
ggplot(Violation.PRKPROHIB) +
  geom_point(aes(x = Citation.Issue.Year, y = Fine.Amount)) +
  ggtitle("Tow-Away Zone - Downtown Core")
ggplot(Violation.TOWAWAY1) +
  geom_point(aes(x = Citation.Issue.Year, y = Fine.Amount)) +
  ggtitle("Tow Away Zone 1")
ggplot(Violation.TOWAWAYzONE) +
  geom_point(aes(x = Citation.Issue.Year, y = Fine.Amount)) +
  ggtitle("Tow Away Zone 2")
# General increase in fines over the interval, with a noticeable jump in prices in 2012
# Percentage of all citations issued in each hour, rounded to 5 decimal places
ByHour <- summarise(
  group_by(tickets, Citation.Issue.Hour),
  Percentage = round(n()/nrow(tickets) * 100, digits = 5)
)
ByHour
## # A tibble: 20 × 2
## Citation.Issue.Hour Percentage
## <int> <dbl>
## 1 2 0.00595
## 2 4 0.03571
## 3 5 0.01190
## 4 6 0.02380
## 5 7 1.27351
## 6 8 6.79600
## 7 9 3.68365
## 8 10 4.48703
## 9 11 4.83218
## 10 12 4.98096
## 11 13 4.00500
## 12 14 3.42180
## 13 15 5.74863
## 14 16 36.83647
## 15 17 21.83409
## 16 18 1.08903
## 17 19 0.58915
## 18 20 0.30945
## 19 21 0.02380
## 20 22 0.01190
# Bar colours for the hourly histogram: rush hours (7-9 and 16-18) in blue, all
# other hours in red. One colour is produced per one-hour bin between the
# earliest and latest hour in the data, so the vector always has as many
# entries as the histogram has bins.
bar.color <- ifelse(
  seq(min(tickets$Citation.Issue.Hour, na.rm = TRUE), max(tickets$Citation.Issue.Hour, na.rm = TRUE)) %in% c(7:9, 16:18),
  "royalblue3",
  "indianred2"
)
# The bars are coloured with a fixed vector rather than a mapped aesthetic, so
# there is no fill legend to label.
ggplot(tickets, aes(x = Citation.Issue.Hour)) +
  geom_histogram(binwidth = 1, fill = bar.color, color = "black") +
  scale_x_continuous(breaks = seq(0, 24, 1)) +
  labs(title = "Citations by Hour", x = "Hour of Day (24 Hour Scale)", y = "Total Citations")
# Day of month taken from the citation date, for day-to-day calculations
tickets[["Citation.Issue.Day"]] <- day(tickets[["Citation.Issue.Date"]])
# Rush hour = citations issued in hours 7-9 or 16-18
RushHour <- filter(
  tickets,
  (Citation.Issue.Hour >= 7 & Citation.Issue.Hour <= 9) |
    (Citation.Issue.Hour >= 16 & Citation.Issue.Hour <= 18)
)
# Non-rush hour = every citation outside both of those windows
NonRushHour <- filter(
  tickets,
  !((Citation.Issue.Hour >= 7 & Citation.Issue.Hour <= 9) |
    (Citation.Issue.Hour >= 16 & Citation.Issue.Hour <= 18))
)
nrow(RushHour)
## [1] 12017
nrow(NonRushHour)
## [1] 4787
# Share of all citations issued during rush hour, in percent
PercentRush <- nrow(RushHour) / nrow(tickets) * 100
## [1] 71.51274
# Median fine per citation date, then the median of those daily medians
t <- summarise(group_by(tickets, Citation.Issue.Date), median = median(Fine.Amount))
median(t$median)
## [1] 88
# Median fine for each day of the week
g <- summarise(group_by(tickets, Citation.Issue.DayOfWeek), median = median(Fine.Amount))
g
## # A tibble: 7 × 2
## Citation.Issue.DayOfWeek median
## <fctr> <dbl>
## 1 Sun 85
## 2 Mon 85
## 3 Tues 85
## 4 Wed 85
## 5 Thurs 85
## 6 Fri 250
## 7 Sat 110
# EDA for rush hour vs non-rush hour: dimensions and column structure of the rush-hour subset
dim(RushHour)
## [1] 12017 18
str(RushHour)
## 'data.frame': 12017 obs. of 18 variables:
## $ Ticket.Number : int 975990164 975990175 975990186 975990190 100002550 975990223 100002551 975990271 975990282 100002557 ...
## $ Citation.Issue.Date : Factor w/ 1684 levels "2009-01-11","2009-01-19",..: 5 6 7 7 7 8 8 9 9 10 ...
## $ Violation : Factor w/ 6 levels "BUS ZONE","DBL PARK",..: 1 1 1 1 5 2 2 2 2 2 ...
## $ Fine.Amount : int 250 250 250 250 70 75 75 75 75 75 ...
## $ Citation.Status : Factor w/ 2 levels "Closed","Open": 1 2 1 1 1 1 1 1 1 1 ...
## $ Amount.Paid : int 89 3 89 89 153 155 155 3 155 155 ...
## $ Amount.Due : num 0 335 0 0 0 0 0 0 0 0 ...
## $ Citation.Issue.Month : Factor w/ 12 levels "Apr","Aug","Dec",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Paid.On.Time : int 1 0 1 1 1 1 1 1 1 1 ...
## $ Latitude : num 37.8 37.8 37.8 37.8 37.8 ...
## $ Longitude : num -122 -122 -122 -122 -122 ...
## $ Citation.Issue.Year : int 2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
## $ Citation.Issue.DayOfWeek: Factor w/ 7 levels "Sun","Mon","Tues",..: 3 1 2 2 2 3 3 4 4 5 ...
## $ Citation.Issue.Weekday : Factor w/ 2 levels "Weekday","Weekend": 1 2 1 1 1 1 1 1 1 1 ...
## $ Citation.Issue.Hour : int 17 16 18 9 8 9 16 17 17 9 ...
## $ Citation.Issue.MonthYear: Factor w/ 72 levels "Apr-2009","Apr-2010",..: 19 19 19 19 19 19 19 19 19 19 ...
## $ Fine.Level : Factor w/ 2 levels "Low","High": 2 2 2 2 1 1 1 1 1 1 ...
## $ Citation.Issue.Day : int 10 15 16 16 16 17 17 18 18 19 ...
# Row count and first rows of the rush-hour subset
nrow(RushHour)
## [1] 12017
head(RushHour)
## Ticket.Number Citation.Issue.Date Violation Fine.Amount Citation.Status
## 1 975990164 2009-02-10 BUS ZONE 250 Closed
## 2 975990175 2009-02-15 BUS ZONE 250 Open
## 3 975990186 2009-02-16 BUS ZONE 250 Closed
## 4 975990190 2009-02-16 BUS ZONE 250 Closed
## 5 100002550 2009-02-16 TWAWY ZN#1 70 Closed
## 6 975990223 2009-02-17 DBL PARK 75 Closed
## Amount.Paid Amount.Due Citation.Issue.Month Paid.On.Time Latitude
## 1 89 0 Feb 1 37.78562
## 2 3 335 Feb 0 37.78701
## 3 89 0 Feb 1 37.78665
## 4 89 0 Feb 1 37.78699
## 5 153 0 Feb 1 37.78717
## 6 155 0 Feb 1 37.78883
## Longitude Citation.Issue.Year Citation.Issue.DayOfWeek
## 1 -122.4150 2009 Tues
## 2 -122.4116 2009 Sun
## 3 -122.4294 2009 Mon
## 4 -122.4117 2009 Mon
## 5 -122.4008 2009 Mon
## 6 -122.4112 2009 Tues
## Citation.Issue.Weekday Citation.Issue.Hour Citation.Issue.MonthYear
## 1 Weekday 17 Feb-2009
## 2 Weekend 16 Feb-2009
## 3 Weekday 18 Feb-2009
## 4 Weekday 9 Feb-2009
## 5 Weekday 8 Feb-2009
## 6 Weekday 9 Feb-2009
## Fine.Level Citation.Issue.Day
## 1 High 10
## 2 High 15
## 3 High 16
## 4 High 16
## 5 Low 16
## 6 Low 17
# Rush-hour citations per calendar day (day, month and year together identify
# a date); RushTotals keeps only the daily counts
Rush <- summarise(
  group_by(RushHour, Citation.Issue.Day, Citation.Issue.Month, Citation.Issue.Year),
  count = n()
)
RushTotals <- Rush[["count"]]
# Descriptive statistics for the daily rush-hour citation totals
mean(x = RushTotals)
## [1] 8.474612
median(x = RushTotals)
## [1] 6
var(x = RushTotals)
## [1] 64.75624
sd(x = RushTotals)
## [1] 8.047126
# Deciles plus the 95th percentile and the maximum
quantile(RushTotals, probs = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 100%
## 1 1 2 3 4 6 8 11 14 20 24 52
# Shapiro-Wilk normality test on the daily totals
shapiro.test(RushTotals)
##
## Shapiro-Wilk normality test
##
## data: RushTotals
## W = 0.83909, p-value < 2.2e-16
# EDA graphs for rush hour: normal Q-Q plot with its reference line, then a boxplot of the daily totals
qqnorm(RushTotals)
qqline(RushTotals)
boxplot(RushTotals)
# Binomial(size = 30, prob = 0.25) probability mass evaluated at each daily total, drawn as vertical lines
plot(RushTotals, dbinom(RushTotals, 30, 0.25), type = "h")
# EDA of non-rush hour: column structure of the non-rush-hour subset
str(NonRushHour)
## 'data.frame': 4787 obs. of 18 variables:
## $ Ticket.Number : int 100013644 100013645 100013646 100013648 100013649 975990153 975990212 975990245 100002552 975990256 ...
## $ Citation.Issue.Date : Factor w/ 1684 levels "2009-01-11","2009-01-19",..: 1 1 1 2 3 4 7 8 8 9 ...
## $ Violation : Factor w/ 6 levels "BUS ZONE","DBL PARK",..: 2 2 2 1 5 1 2 2 1 2 ...
## $ Fine.Amount : int 75 75 75 250 70 250 75 75 250 75 ...
## $ Citation.Status : Factor w/ 2 levels "Closed","Open": 1 1 1 1 1 1 1 1 1 1 ...
## $ Amount.Paid : int 155 155 3 97 39 114 155 155 3 155 ...
## $ Amount.Due : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Citation.Issue.Month : Factor w/ 12 levels "Apr","Aug","Dec",..: 5 5 5 5 4 4 4 4 4 4 ...
## $ Paid.On.Time : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Latitude : num 37.8 37.8 37.8 37.8 37.8 ...
## $ Longitude : num -122 -122 -122 -122 -122 ...
## $ Citation.Issue.Year : int 2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
## $ Citation.Issue.DayOfWeek: Factor w/ 7 levels "Sun","Mon","Tues",..: 1 1 1 2 1 4 2 3 3 4 ...
## $ Citation.Issue.Weekday : Factor w/ 2 levels "Weekday","Weekend": 2 2 2 1 2 1 1 1 1 1 ...
## $ Citation.Issue.Hour : int 10 12 12 12 13 14 11 15 10 14 ...
## $ Citation.Issue.MonthYear: Factor w/ 72 levels "Apr-2009","Apr-2010",..: 25 25 25 25 19 19 19 19 19 19 ...
## $ Fine.Level : Factor w/ 2 levels "Low","High": 1 1 1 2 1 2 1 1 2 1 ...
## $ Citation.Issue.Day : int 11 11 11 19 1 4 16 17 17 18 ...
# Size of the non-rush-hour subset
dim(NonRushHour)
## [1] 4787 18
nrow(NonRushHour)
## [1] 4787
# Non-rush-hour citations per calendar day; NonRushTotals keeps only the daily counts
NonRush <- summarise(
  group_by(NonRushHour, Citation.Issue.Day, Citation.Issue.Month, Citation.Issue.Year),
  count = n()
)
NonRushTotals <- NonRush[["count"]]
# Descriptive statistics for the daily non-rush-hour totals
mean(x = NonRushTotals)
## [1] 3.481455
median(x = NonRushTotals)
## [1] 3
sd(x = NonRushTotals)
## [1] 2.724261
var(x = NonRushTotals)
## [1] 7.421599
# Deciles plus the 95th percentile and the maximum
quantile(NonRushTotals, probs = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 95% 100%
## 1 1 1 2 2 3 3 4 5 7 9 30
# Shapiro-Wilk normality test on the daily totals
shapiro.test(NonRushTotals)
##
## Shapiro-Wilk normality test
##
## data: NonRushTotals
## W = 0.80712, p-value < 2.2e-16
# Normal Q-Q plot with its reference line, then a boxplot of the daily non-rush-hour totals
qqnorm(NonRushTotals)
qqline(NonRushTotals)
boxplot(NonRushTotals)
# Distribution of daily citation totals: rush hour in red, non-rush hour in
# semi-transparent blue on top. The x scale is set once (adding it twice only
# makes ggplot2 replace the first copy and print a message), and there is no
# fill legend to label because both fills are fixed colours.
ggplot(Rush, aes(x = count)) +
  geom_histogram(binwidth = 1, fill = "red", color = "black") +
  geom_histogram(data = NonRush, aes(x = count), binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  scale_x_continuous(breaks = seq(0, 130, 10)) +
  labs(title = "Rush Hour vs Off Hour Citation Distribution", x = "Total Citations", y = "Number of Days")
# Binomial(size = 30, prob = 0.25) probability mass evaluated at each daily total, drawn as vertical lines
plot(NonRushTotals, dbinom(NonRushTotals, 30, 0.25), type = "h")
# Proportion of Violation Type per Hour
# Start from the number of citations issued in each hour
Tickets.per.Hour <- summarise(
  group_by(tickets, Citation.Issue.Hour),
  count = n()
)
Tickets.per.Hour
## # A tibble: 20 × 2
## Citation.Issue.Hour count
## <int> <int>
## 1 2 1
## 2 4 6
## 3 5 2
## 4 6 4
## 5 7 214
## 6 8 1142
## 7 9 619
## 8 10 754
## 9 11 812
## 10 12 837
## 11 13 673
## 12 14 575
## 13 15 966
## 14 16 6190
## 15 17 3669
## 16 18 183
## 17 19 99
## 18 20 52
## 19 21 4
## 20 22 2
# Share of each violation type within every citation hour (bars scaled to 1)
ggplot(data = tickets, mapping = aes(x = Citation.Issue.Hour, fill = Violation)) +
  geom_bar(position = "fill") +
  scale_x_continuous(breaks = seq(0, 24, 1)) +
  ggtitle("Proportion of Violation Types by Hour") +
  xlab("Citation Issue Hour") +
  ylab("Violation Type Proportion")
# Graph shows almost entirely Double Parking and Bus Zone Violations during non-rush-hour and almost entirely Towaway and Prohibited Parking during rush hour
# Number of rush-hour citations for each violation type
Violation.Rush <- summarise(
  group_by(RushHour, Violation),
  Total_During_Rush = n()
)
Violation.Rush
## # A tibble: 6 × 2
## Violation Total_During_Rush
## <fctr> <int>
## 1 BUS ZONE 717
## 2 DBL PARK 786
## 3 PK PHB OTD 1469
## 4 PRK PROHIB 2259
## 5 TWAWY ZN#1 4593
## 6 TWAWY ZONE 2193
# Number of non-rush-hour citations for each violation type
Violation.Off <- summarise(
  group_by(NonRushHour, Violation),
  Total_During_Off = n()
)
Violation.Off
## # A tibble: 6 × 2
## Violation Total_During_Off
## <fctr> <int>
## 1 BUS ZONE 1621
## 2 DBL PARK 2090
## 3 PK PHB OTD 10
## 4 PRK PROHIB 264
## 5 TWAWY ZN#1 700
## 6 TWAWY ZONE 102
# Share of each violation's citations that were issued during rush hour, one
# value per row of Violation.Rush. Off-hour counts are looked up by Violation
# instead of by row position, so the ratio stays correct if a violation type is
# missing from the off-hour summary (it then counts as 0 off-hour citations).
a <- local({
  rush <- Violation.Rush$Total_During_Rush
  off <- Violation.Off$Total_During_Off[match(Violation.Rush$Violation, Violation.Off$Violation)]
  off[is.na(off)] <- 0L
  rush / (rush + off)
})
#a <- round(a, 3)