library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.7     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Objective: Clean the NYC Dog Bites dataset on NYC Open Data.

Dataset link: https://data.cityofnewyork.us/Health/DOHMH-Dog-Bite-Data/rsgh-akpg

dog_bites_df <- read.csv("dog_bite_data.csv")
head(dog_bites_df)
##   ï..UniqueID      DateOfBite Species       Breed Age Gender SpayNeuter
## 1           1 January 01 2018     DOG     UNKNOWN          U      FALSE
## 2           2 January 04 2018     DOG     UNKNOWN          U      FALSE
## 3           3 January 06 2018     DOG    Pit Bull          U      FALSE
## 4           4 January 08 2018     DOG Mixed/Other   4      M      FALSE
## 5           5 January 09 2018     DOG    Pit Bull          U      FALSE
## 6           6 January 03 2018     DOG     BASENJI  4Y      M      FALSE
##    Borough ZipCode
## 1 Brooklyn   11220
## 2 Brooklyn        
## 3 Brooklyn   11224
## 4 Brooklyn   11231
## 5 Brooklyn   11224
## 6 Brooklyn   11231

Is the Species column necessary? Aren’t we working with dog bites specifically? Let’s check:

unique(dog_bites_df['Species'])
##   Species
## 1     DOG

We can drop this column.

dog_bites_df <- dog_bites_df[-c(3)]

Rename Columns

colnames(dog_bites_df) <- c("Bite.Id", "Bite.Date", "Dog.Breed", "Dog.Age", "Dog.Gender", "Dog.SpayNeuter", "Bite.Borough", "Bite.ZipCode")

Change the Bite.Date datatype to date, instead of character, for easier handling later.

dog_bites_df <- dog_bites_df %>% 
                  mutate(Bite.Date = mdy(Bite.Date))

Issue: How can we standardize the dog breed? - The breed appears to be either full breed or a cross between 2. Have 2 columns for breed, Breed1 and Breed2. 1. Split on separator: / or - 2. Remove the words “mix”, “MIX” and “X” from the breed columns. 3. Convert breeds to lowercase. 4. Missing values for Breed1 -> “unknown”. Missing values for Breed2 -> “none” 5. For Dog.Age, blank -> ‘unknown’. Values containing ‘Y’, remove the Y. Containing ‘YRS’, remove. ‘3 1/2’ -> ‘3.5’. ’ MONS’ -> ‘M’, Number 121, ‘7-8M’ -> ‘8M’ Trim whitespace.

dog_bites_df <- dog_bites_df %>% 
                  separate(Dog.Breed, into = c("Dog.Breed1", "Dog.Breed2"), sep="[/|-]")
## Warning: Expected 2 pieces. Additional pieces discarded in 66 rows [635, 1308,
## 1532, 1823, 1889, 2231, 2808, 2844, 3476, 4042, 4544, 4747, 4777, 5107, 5236,
## 5866, 6800, 7496, 7629, 7641, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 20008 rows [1, 2,
## 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 18, 19, 21, 23, 24, 25, ...].
remove_rows <- c()   # avector for collecting rows that will removed later

for (index in 1:nrow(dog_bites_df)) { 
  dog_bites_df[index, 'Dog.Breed1'] = gsub("\\sMix|\\sMIX$|\\sx|\\sX", "", dog_bites_df[index, 'Dog.Breed1'])
  dog_bites_df[index, 'Dog.Breed2'] = gsub("\\sMix|\\sMIX$|\\sx|\\sX", "", dog_bites_df[index, 'Dog.Breed2'])
  # make breeds lowercase
  dog_bites_df[index, 'Dog.Breed1'] = tolower(dog_bites_df[index, 'Dog.Breed1'])
  dog_bites_df[index, 'Dog.Breed2'] = tolower(dog_bites_df[index, 'Dog.Breed2'])

  if (dog_bites_df[index, 'Dog.Breed1'] == '') {
    dog_bites_df[index, 'Dog.Breed1'] <- 'unknown'
  }

  if (is.na(dog_bites_df[index, 'Dog.Breed2'])) {
    dog_bites_df[index, 'Dog.Breed2'] <- 'none'
  }
  
  if (dog_bites_df[index, 'Dog.Age'] == '') {
    dog_bites_df[index, 'Dog.Age'] <- 'unknown'
  }
  
  if(dog_bites_df[index, 'Bite.ZipCode'] == '' || dog_bites_df[index, 'Bite.ZipCode'] == '?') {
    dog_bites_df[index, 'Bite.ZipCode'] <- 'unknown'
  }
  
  if(nchar(dog_bites_df[index, 'Bite.ZipCode']) != 5) {
    remove_rows <- append(remove_rows, index)
  }
 
  
  if(dog_bites_df[index, 'Bite.ZipCode'] == 'N/A') {
    dog_bites_df[index, 'Bite.ZipCode'] <- 'unknown'
  }
  
  dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*YRS|\\s*YR|\\s*yrs|\\s*y|\\s*Y', '', dog_bites_df[index, 'Dog.Age'])
  dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*1/2', '.5', dog_bites_df[index, 'Dog.Age'])
  dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*MN|\\s*MONS|\\s*mons|\\s*Mons|\\s*MONS[.]|\\s*MTH|\\s*MTHS|\\s*mths|\\s*MOS|\\s*MO|\\s*Mo', 'M', dog_bites_df[index, 'Dog.Age'])
  dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*wks|\\s*WKS|\\s*WK', 'W', dog_bites_df[index, 'Dog.Age'])
  # 
  #special cases
  dog_bites_df[index, 'Dog.Age'] <- gsub('\\s*', '', dog_bites_df[index, 'Dog.Age'])
  dog_bites_df[index, 'Dog.Age'] <- gsub('\\d+-', '', dog_bites_df[index, 'Dog.Age'])
  dog_bites_df[index, 'Dog.Age'] <- gsub('2-3MOS', '3M', dog_bites_df[index, 'Dog.Age'])
  dog_bites_df[index, 'Dog.Age'] <- gsub('04M', '4M', dog_bites_df[index, 'Dog.Age'])
  dog_bites_df[index, 'Dog.Age'] <- gsub('m', 'M', dog_bites_df[index, 'Dog.Age'])
  
  
} 
                  
# drop row 4661 because dog age= 3Q
# drop row 2949 because dog age = 2,4,5,& 6
# drop the rows where there are datetimes for dog age  (contains 'T00')
dog_bites_df[4077, 'Dog.Age'] <- '12W'
dog_bites_df[12034, 'Dog.Age'] <- '20M'
dog_bites_df[12035, 'Dog.Age'] <- '20M'
dog_bites_df[19399, 'Dog.Age'] <- '56M'

dog_bites_df <- dog_bites_df %>% subset(str_detect(dog_bites_df$Dog.Age,'T00') == FALSE)
dog_bites_df <- dog_bites_df %>% subset(Bite.Borough != 'Other')
remove_rows <- append(remove_rows, c(924,1479,2949,3417,4270,4653,4661,5942,6376,6975,12014,12541,13499,13717,15513,16058,16194,18991,20439,20456,20589,20905,21027,21284,21392,22191))
dog_bites_df <- dog_bites_df[-remove_rows,]
rownames(dog_bites_df) <- NULL   # adjust row numbers
head(dog_bites_df)
##   Bite.Id  Bite.Date Dog.Breed1 Dog.Breed2 Dog.Age Dog.Gender Dog.SpayNeuter
## 1       1 2018-01-01    unknown       none unknown          U          FALSE
## 2       3 2018-01-06   pit bull       none unknown          U          FALSE
## 3       4 2018-01-08      mixed      other       4          M          FALSE
## 4       5 2018-01-09   pit bull       none unknown          U          FALSE
## 5       6 2018-01-03    basenji       none       4          M          FALSE
## 6       8 2018-01-03   pit bull       none unknown          U          FALSE
##   Bite.Borough Bite.ZipCode
## 1     Brooklyn        11220
## 2     Brooklyn        11224
## 3     Brooklyn        11231
## 4     Brooklyn        11224
## 5     Brooklyn        11231
## 6     Brooklyn        11233

The dataframe has been cleaned. We will export it to a new .csv file to work with.

write.csv(dog_bites_df,'dog_bites_cleaned.csv')