library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.7 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Dataset link: https://data.cityofnewyork.us/Health/DOHMH-Dog-Bite-Data/rsgh-akpg
dog_bites_df <- read.csv("dog_bite_data.csv")
head(dog_bites_df)
## ï..UniqueID DateOfBite Species Breed Age Gender SpayNeuter
## 1 1 January 01 2018 DOG UNKNOWN U FALSE
## 2 2 January 04 2018 DOG UNKNOWN U FALSE
## 3 3 January 06 2018 DOG Pit Bull U FALSE
## 4 4 January 08 2018 DOG Mixed/Other 4 M FALSE
## 5 5 January 09 2018 DOG Pit Bull U FALSE
## 6 6 January 03 2018 DOG BASENJI 4Y M FALSE
## Borough ZipCode
## 1 Brooklyn 11220
## 2 Brooklyn
## 3 Brooklyn 11224
## 4 Brooklyn 11231
## 5 Brooklyn 11224
## 6 Brooklyn 11231
Is the Species column necessary? Aren’t we working with dog bites specifically? Let’s check:
unique(dog_bites_df['Species'])
## Species
## 1 DOG
We can drop this column.
dog_bites_df <- dog_bites_df[-c(3)]
Rename Columns
colnames(dog_bites_df) <- c("Bite.Id", "Bite.Date", "Dog.Breed", "Dog.Age", "Dog.Gender", "Dog.SpayNeuter", "Bite.Borough", "Bite.ZipCode")
Change the Bite.Date datatype to date, instead of character, for easier handling later.
dog_bites_df <- dog_bites_df %>%
mutate(Bite.Date = mdy(Bite.Date))
Issue: How can we standardize the dog breed? - The breed appears to be either full breed or a cross between 2. Have 2 columns for breed, Breed1 and Breed2. 1. Split on separator: / or - 2. Remove the words “mix”, “MIX” and “X” from the breed columns. 3. Convert breeds to lowercase. 4. Missing values for Breed1 -> “unknown”. Missing values for Breed2 -> “none” 5. For Dog.Age, blank -> ‘unknown’. Values containing ‘Y’, remove the Y. Containing ‘YRS’, remove. ‘3 1/2’ -> ‘3.5’. ’ MONS’ -> ‘M’, Number 121, ‘7-8M’ -> ‘8M’ Trim whitespace.
dog_bites_df <- dog_bites_df %>%
separate(Dog.Breed, into = c("Dog.Breed1", "Dog.Breed2"), sep="[/|-]")
## Warning: Expected 2 pieces. Additional pieces discarded in 66 rows [635, 1308,
## 1532, 1823, 1889, 2231, 2808, 2844, 3476, 4042, 4544, 4747, 4777, 5107, 5236,
## 5866, 6800, 7496, 7629, 7641, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 20008 rows [1, 2,
## 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 21, 23, 24, 25, ...].
remove_rows <- c() # avector for collecting rows that will removed later
for (index in 1:nrow(dog_bites_df)) {
dog_bites_df[index, 'Dog.Breed1'] = gsub("\\sMix|\\sMIX$|\\sx|\\sX", "", dog_bites_df[index, 'Dog.Breed1'])
dog_bites_df[index, 'Dog.Breed2'] = gsub("\\sMix|\\sMIX$|\\sx|\\sX", "", dog_bites_df[index, 'Dog.Breed2'])
# make breeds lowercase
dog_bites_df[index, 'Dog.Breed1'] = tolower(dog_bites_df[index, 'Dog.Breed1'])
dog_bites_df[index, 'Dog.Breed2'] = tolower(dog_bites_df[index, 'Dog.Breed2'])
if (dog_bites_df[index, 'Dog.Breed1'] == '') {
dog_bites_df[index, 'Dog.Breed1'] <- 'unknown'
}
if (is.na(dog_bites_df[index, 'Dog.Breed2'])) {
dog_bites_df[index, 'Dog.Breed2'] <- 'none'
}
if (dog_bites_df[index, 'Dog.Age'] == '') {
dog_bites_df[index, 'Dog.Age'] <- 'unknown'
}
if(dog_bites_df[index, 'Bite.ZipCode'] == '' || dog_bites_df[index, 'Bite.ZipCode'] == '?') {
dog_bites_df[index, 'Bite.ZipCode'] <- 'unknown'
}
if(nchar(dog_bites_df[index, 'Bite.ZipCode']) != 5) {
remove_rows <- append(remove_rows, index)
}
if(dog_bites_df[index, 'Bite.ZipCode'] == 'N/A') {
dog_bites_df[index, 'Bite.ZipCode'] <- 'unknown'
}
dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*YRS|\\s*YR|\\s*yrs|\\s*y|\\s*Y', '', dog_bites_df[index, 'Dog.Age'])
dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*1/2', '.5', dog_bites_df[index, 'Dog.Age'])
dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*MN|\\s*MONS|\\s*mons|\\s*Mons|\\s*MONS[.]|\\s*MTH|\\s*MTHS|\\s*mths|\\s*MOS|\\s*MO|\\s*Mo', 'M', dog_bites_df[index, 'Dog.Age'])
dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*wks|\\s*WKS|\\s*WK', 'W', dog_bites_df[index, 'Dog.Age'])
#
#special cases
dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*', '', dog_bites_df[index, 'Dog.Age'])
dog_bites_df[index, 'Dog.Age'] <- gsub('\\d+-', '', dog_bites_df[index, 'Dog.Age'])
dog_bites_df[index, 'Dog.Age'] <- gsub('2-3MOS', '3M', dog_bites_df[index, 'Dog.Age'])
dog_bites_df[index, 'Dog.Age'] <- gsub('04M', '4M', dog_bites_df[index, 'Dog.Age'])
dog_bites_df[index, 'Dog.Age'] <- gsub('m', 'M', dog_bites_df[index, 'Dog.Age'])
}
# drop row 4661 because dog age= 3Q
# drop row 2949 because dog age = 2,4,5,& 6
# drop the rows where there are datetimes for dog age (contains 'T00')
dog_bites_df[4077, 'Dog.Age'] <- '12W'
dog_bites_df[12034, 'Dog.Age'] <- '20M'
dog_bites_df[12035, 'Dog.Age'] <- '20M'
dog_bites_df[19399, 'Dog.Age'] <- '56M'
dog_bites_df <- dog_bites_df %>% subset(str_detect(dog_bites_df$Dog.Age,'T00') == FALSE)
dog_bites_df <- dog_bites_df %>% subset(Bite.Borough != 'Other')
remove_rows <- append(remove_rows, c(924,1479,2949,3417,4270,4653,4661,5942,6376,6975,12014,12541,13499,13717,15513,16058,16194,18991,20439,20456,20589,20905,21027,21284,21392,22191))
dog_bites_df <- dog_bites_df[-remove_rows,]
rownames(dog_bites_df) <- NULL # adjust row numbers
head(dog_bites_df)
## Bite.Id Bite.Date Dog.Breed1 Dog.Breed2 Dog.Age Dog.Gender Dog.SpayNeuter
## 1 1 2018-01-01 unknown none unknown U FALSE
## 2 3 2018-01-06 pit bull none unknown U FALSE
## 3 4 2018-01-08 mixed other 4 M FALSE
## 4 5 2018-01-09 pit bull none unknown U FALSE
## 5 6 2018-01-03 basenji none 4 M FALSE
## 6 8 2018-01-03 pit bull none unknown U FALSE
## Bite.Borough Bite.ZipCode
## 1 Brooklyn 11220
## 2 Brooklyn 11224
## 3 Brooklyn 11231
## 4 Brooklyn 11224
## 5 Brooklyn 11231
## 6 Brooklyn 11233
The dataframe has been cleaned. We will export it to a new .csv file to work with.
write.csv(dog_bites_df,'dog_bites_cleaned.csv')