getwd()
tickets <- data.frame(read.csv("Muni_Driver_Reported_Transit_Only_Lane_Violations.csv",
header = T, sep = ","))
# Check dimensions of dataset
dim(tickets)
# Check variable names
colnames(tickets)
# Examine structure of the data
str(tickets)
The dataset consists of San Francisco Municipal Transportation Agency (SFMTA) driver reported transit only lane violations from March 2008 to February 2015. The dataset contains 17178 observations and 18 variables. The data were obtained from the US government’s open data site at https://catalog.data.gov/dataset/muni-driver-reported-transit-only-lane-violations.
Object.ID – Unique key that corresponds to each observationTicket.Number – Unique number that corresponds to each ticket catalogued with the Superior Court of San FranciscoCitation.Issue.Date – The date (MM/DD/YY) that the traffic only lane violation citation was issuedCitation.Issue.Month – The month (MM) that the traffic only lane violation citation was issued; This column was completely blank when downloadedCitation.Issue.Time – The time of day (HH:MM) PST that the traffic only lane violation citation was issuedLocation – The street address where the traffic only lane violation citation was issuedViolation.Code – A code that corresponds to the type of traffic only lane violation that was committed to warrant issuing a citationViolation – Description of the traffic only lane violation issued in the citationFine.Amount – The dollar amount of to be paid for the traffic only lane violationCitation.Status – Status of the citation (Open, Closed, Unapplied)Amount.Paid – The dollar amount paid for the traffic only lane violationAmount.Due – Outstanding balance for the traffic only lane violationSuspend.Code – Code that represents reason for license suspension (Suspended until the driver goes to driving school, pays fine, too many points on license, etc.)Suspend.Process.Date – Date the license suspension goes into effectSuspend.Until.Date – Date the license suspension is overDisposition.Code – The city of San Francisco does not release disposition codesLast.Edited.Date – Date that represents the last date the corresponding observation was edited; This column was completely blank when downloadedGeom – The latitude and longitude (latitutde, longitude) of the traffic only lane violation# Format Date and Month columns Change citation issue date from string to
# month/day/year date variable
tickets$Citation.Issue.Date <- mdy(tickets$Citation.Issue.Date)
# Populate blank column for citation issue month with month of citation
# issue date
tickets$Citaton.Issue.Month <- NULL
tickets$Citation.Issue.Month <- month(tickets$Citation.Issue.Date, label = TRUE)
# Convert Amount.Paid from factor to numeric
tickets$Amount.Paid <- as.numeric(tickets$Amount.Paid)
# New Column 'Paid.On.Time' (1 = true, 0 = false). Based on assumption that
# late payments have to pay an extra fine and, thus, Amount.Paid >
# Fine.Amount
tickets$Paid.On.Time <- ifelse(tickets$Amount.Paid > tickets$Amount.Due, 1,
0)
tickets$Paid.On.Time <- as.factor(tickets$Paid.On.Time)
# Seperate Geom variable into 2 separate latitude and longitude numeric
# variables remove parentheses from Geom variable
tickets$Geom <- substr(tickets$Geom, 2, nchar(as.character(tickets$Geom)) -
1)
## split string at comma & create new columns
out <- strsplit(as.character(tickets$Geom), ", ")
## Drop observations with no Geom data. ***21 OBSERVATIONS DROPPED***
tickets <- tickets[!(is.na(tickets$Geom) | tickets$Geom == ""), ]
tickets <- data.frame(tickets, do.call(rbind, out))
rm(out)
## rename columns and set str to numeric
names(tickets)[names(tickets) == "X1"] <- "Latitude"
names(tickets)[names(tickets) == "X2"] <- "Longitude"
tickets$Latitude <- as.numeric(as.character(tickets$Latitude))
tickets$Longitude <- as.numeric(as.character(tickets$Longitude))
# Create column indicating the year the ticket was issued
tickets$Citation.Issue.Year <- year(tickets$Citation.Issue.Date)
# Create column indicating DayOfWeek (Sunday (1) to Saturday (7)) that the
# ticket was issued
tickets$Citation.Issue.DayOfWeek <- wday(tickets$Citation.Issue.Date)
# Create column indicating whether the ticket was issued on: weekday (1),
# weekend (0)
tickets$Citation.Issue.Weekday <- ifelse(tickets$Citation.Issue.DayOfWeek >=
2 & tickets$Citation.Issue.DayOfWeek <= 6, 1, 0)
tickets$Citation.Issue.Weekday <- as.factor(tickets$Citation.Issue.Weekday)
# Code levels of Weekday variable
levels(tickets$Citation.Issue.Weekday) <- list(Weekend = 0, Weekday = 1)
# Convert Citation.Issue.DayOfWeek from 1, 2, 3, 4, 5, 6, 7 to Mon, Tues,
# Wed, Thurs, Fri, Sat
tickets$Citation.Issue.DayOfWeek <- wday(tickets$Citation.Issue.Date, label = TRUE)
# Create column for Hour of citation issuance. Drop observations with no
# Citation.Issue.Time data. ***0 OBSERVATIONS DROPPED***
tickets <- tickets[!(is.na(tickets$Citation.Issue.Time) | tickets$Citation.Issue.Time ==
""), ]
outTime <- strsplit(as.character(tickets$Citation.Issue.Time), ":")
tickets <- data.frame(tickets, do.call(rbind, outTime))
names(tickets)[names(tickets) == "X1"] <- "Citation.Issue.Hour"
names(tickets)[names(tickets) == "X2"] <- "Citation.Issue.Minute"
tickets$Citation.Issue.Hour <- as.numeric(as.character(tickets$Citation.Issue.Hour))
tickets$Citation.Issue.Minute <- NULL
rm(outTime)
# Create column from Month-Year
tickets <- within(tickets, Citation.Issue.MonthYear <- paste(Citation.Issue.Month,
"-", Citation.Issue.Year, sep = ""))
# Drop 'Unapplied' and empty/NA observations within Citation.Status.
# Unapplied result is outside the scope of this project. ***84 OBSERVATIONS
# DROPPED***
tickets <- tickets[!(is.na(tickets$Citation.Status) | tickets$Citation.Status ==
"" | tickets$Citation.Status == "Unapplied"), ]
levels(tickets$Citation.Status) <- list(Open = "Open", Closed = "Closed")
# Drop extraneous variables: Object.ID (just row index), Location (Not need
# street address b/c we have latitude and longitude), Last.Edited.Date
# (empty column), Suspend.Code (NA), Disposition.Code (NA), Violation.Code
# (have this info in Violation variable), Citation.Issue.Time (Not needed
# for analysis), Suspend.Process.Date and Suspend.Until.Date (outside scope
# of analysis), Geom (we split this into latitude and longitude and have all
# this info already)
tickets$Object.ID <- NULL
tickets$Location <- NULL
tickets$Last.Edited.Date <- NULL
tickets$Suspend.Code <- NULL
tickets$Disposition.Code <- NULL
tickets$Violation.Code <- NULL
tickets$Citation.Issue.Time <- NULL
tickets$Suspend.Process.Date <- NULL
tickets$Suspend.Until.Date <- NULL
tickets$Geom <- NULL
# Violation Variable
table(tickets$Violation)
##
## BUS ZONE DBL PARK NO VIOL ON SIDEWLK OVR 18 " C PK FR LN
## 2417 2968 1 1 5 1
## PK PHB OTD PRK PROHIB TRNST ONLY TWAWY ZN#1 TWAWY ZONE
## 1491 2530 70 5294 2295
# Drop 'NO VIOL' in Violation variable. ***1 OBSERVATION DROPPED*** Drop 'ON
# SIDEWLK' in Violation variable. ***1 OBSERVATION DROPPED*** Drop 'OVR 18'
# C' in Violation variable. ***5 OBSERVATIONS DROPPED*** Drop 'PK FR LN' in
# Violation variable. ***1 OBSERVATION DROPPED*** Drop 'TRNST ONLY' in
# Violation variable. ***70 OBSERVATION DROPPED***
tickets <- tickets[!table(tickets$Violation)[tickets$Violation] <= 70, ]
table(tickets$Violation)
##
## BUS ZONE DBL PARK NO VIOL ON SIDEWLK OVR 18 " C PK FR LN
## 2417 2968 0 0 0 0
## PK PHB OTD PRK PROHIB TRNST ONLY TWAWY ZN#1 TWAWY ZONE
## 1491 2530 0 5294 2295
# Recode factor varibale without the Null categories
levels(tickets$Violation) <- list(`BUS ZONE` = "BUS ZONE", `DBL PARK` = "DBL PARK",
`PK PHB OTD` = "PK PHB OTD", `PRK PROHIB` = "PRK PROHIB", `TWAWY ZN#1` = "TWAWY ZN#1",
`TWAWY ZONE` = "TWAWY ZONE")
# Year Variable
table(tickets$Citation.Issue.Year)
##
## 2008 2009 2010 2011 2012 2013 2014 2015
## 154 1288 2090 3058 3726 3108 3534 37
## Drop 2008 and 2015. *** 191 OBSERVATIONS DROPPED ***
tickets <- filter(tickets, tickets$Citation.Issue.Year > 2008 & tickets$Citation.Issue.Year <
2015)
Overall, we dropped 374 of the original 17,178 observations in the dataset. This only accounts for 2.18 percent of the dataset. Thus, we are confident that we did not introduce any inherent biases to the data by dropping the aforementioned observations.
# Check dimensions of cleaned dataset
dim(tickets)
## [1] 16804 16
# Check all variable names
colnames(tickets)
## [1] "Ticket.Number" "Citation.Issue.Date"
## [3] "Violation" "Fine.Amount"
## [5] "Citation.Status" "Amount.Paid"
## [7] "Amount.Due" "Citation.Issue.Month"
## [9] "Paid.On.Time" "Latitude"
## [11] "Longitude" "Citation.Issue.Year"
## [13] "Citation.Issue.DayOfWeek" "Citation.Issue.Weekday"
## [15] "Citation.Issue.Hour" "Citation.Issue.MonthYear"
# Examine structure of cleaned dataset
str(tickets)
## 'data.frame': 16804 obs. of 16 variables:
## $ Ticket.Number : int 100013644 100013645 100013646 100013648 100013649 975990153 975990164 975990175 975990186 975990190 ...
## $ Citation.Issue.Date : Date, format: "2009-01-11" "2009-01-11" ...
## $ Violation : Factor w/ 6 levels "BUS ZONE","DBL PARK",..: 2 2 2 1 5 1 1 1 1 1 ...
## $ Fine.Amount : int 75 75 75 250 70 250 250 250 250 250 ...
## $ Citation.Status : Factor w/ 2 levels "Open","Closed": 2 2 2 2 2 2 2 1 2 2 ...
## $ Amount.Paid : num 155 155 3 97 39 114 89 3 89 89 ...
## $ Amount.Due : num 0 0 0 0 0 0 0 335 0 0 ...
## $ Citation.Issue.Month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 2 2 2 2 2 2 ...
## $ Paid.On.Time : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 2 2 ...
## $ Latitude : num 37.8 37.8 37.8 37.8 37.8 ...
## $ Longitude : num -122 -122 -122 -122 -122 ...
## $ Citation.Issue.Year : num 2009 2009 2009 2009 2009 ...
## $ Citation.Issue.DayOfWeek: Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 1 1 1 2 1 4 3 1 2 2 ...
## $ Citation.Issue.Weekday : Factor w/ 2 levels "Weekend","Weekday": 1 1 1 2 1 2 2 1 2 2 ...
## $ Citation.Issue.Hour : num 10 12 12 12 13 14 17 16 18 9 ...
## $ Citation.Issue.MonthYear: chr "Jan-2009" "Jan-2009" "Jan-2009" "Jan-2009" ...
After cleaning the dataset to begin exploratory data anlysis, there are 16804 observations and 16 variables.
# Save cleaned dataset as tickets.csv
write.csv(tickets, file = "tickets.csv", row.names = FALSE)