library(wrapr)
library(plyr)
library(data.table)
library(ggplot2)
library(missForest)
library(Hmisc)
library(knitr)

Project Summary

Dog bites can cause pain and injury, but they can also spread germs that cause infection. Nearly 1 in 5 people bitten by a dog requires medical attention. Reporting the incident to the local animal control agency or police department helps the Health Department to determine if the person bitten should receive rabies shots.

This project explores the correlation between dog bites and features such as time, region, breed, age, sex and spay/neuter status. The results of the exploratory analysis will not only help the populace understand the dog bite situation in New York city area but also assist the health system to provide helpful tips on how to prevent dog bites. The ultimate goal of this project is to develop a machine learning mode predicting if a person bitten needs rabies shots, which will immensely reduce the cost of the health system for providing unnecessary rabies vaccines.

The Data were collected from reports received online, mail, fax or by phone to 311 or NYC DOHMH Animal Bite Unit. Each record represents a single dog bite incident. Information on breed, age, gender and spayed or neutered status have not been verified by DOHMH and is listed only as reported to DOHMH. Data Source: https://data.cityofnewyork.us/Health/DOHMH-Dog-Bite-Data/rsgh-akpg

Data preprocessing

# reading data
dog_bite <- fread("DOHMH_Dog_Bite_Data.csv")
dog_bite[dog_bite == ""] <- NA

# detecting missing values in each column
apply(dog_bite, 2, function(col) sum(is.na(col))/length(col))
##   UniqueID DateOfBite    Species      Breed        Age     Gender 
##  0.0000000  0.0000000  0.0000000  0.0000000  0.3726886  0.0000000 
## SpayNeuter    Borough    ZipCode 
##  0.0000000  0.0000000  0.2137361

Reformatting the DateOfBite column

dog_bite$DateOfBite <- as.Date(dog_bite$DateOfBite,format='%B %d %Y')

Cleaning Age column

# checking the format of the Age column
print(unique(dog_bite$Age))
##   [1] "11"                      NA                       
##   [3] "3"                       "4"                      
##   [5] "1"                       "10"                     
##   [7] "5"                       "2"                      
##   [9] "13"                      "7"                      
##  [11] "1900-01-11T00:00:00.000" "6"                      
##  [13] "7M"                      "8"                      
##  [15] "10 M"                    "8M"                     
##  [17] "4M"                      "9M"                     
##  [19] "6M"                      "9"                      
##  [21] "19"                      "15"                     
##  [23] "14"                      "11M"                    
##  [25] "12"                      "10M"                    
##  [27] "5m"                      "14M"                    
##  [29] "3M"                      "18M"                    
##  [31] "2-3M"                    "2018-02-03T00:00:00.000"
##  [33] "1900-01-02T00:00:00.000" "11 MTHS"                
##  [35] "2 YRS"                   "4 YRS"                  
##  [37] "5M"                      "2018-03-04T00:00:00.000"
##  [39] "22 MTHS"                 "1900-01-08T00:00:00.000"
##  [41] "2M"                      "17y"                    
##  [43] "7 MTHS"                  "2018-09-10T00:00:00.000"
##  [45] "1Y"                      "10Y"                    
##  [47] "2Y"                      "1900-01-01T00:00:00.000"
##  [49] "8YRS & 8 M"              "4Y"                     
##  [51] "8Y"                      "1M"                     
##  [53] "3Y"                      "9Y"                     
##  [55] "12Y"                     "6Y"                     
##  [57] "3 MTHS"                  "7m"                     
##  [59] "4y"                      "1YR"                    
##  [61] "5Y"                      "2y"                     
##  [63] "2018-04-05T00:00:00.000" "2-3 YR"                 
##  [65] "2-3 YRS"                 "10y"                    
##  [67] "3YR"                     "13Y"                    
##  [69] "11Y"                     "3y"                     
##  [71] "4 M"                     "11MTHS"                 
##  [73] "8 M"                     "7Y"                     
##  [75] "2018-08-09T00:00:00.000" "8MTHS"                  
##  [77] "11m"                     "6y"                     
##  [79] "3 M"                     "1900-01-07T00:00:00.000"
##  [81] "9 M"                     "0.6"                    
##  [83] "11 M"                    "2.5"                    
##  [85] "6.5 YRS"                 "16"                     
##  [87] "2018-08-02T00:00:00.000" "1900-01-04T00:00:00.000"
##  [89] "9WK"                     "6 MTHS"                 
##  [91] "6 YRS"                   "1 YR"                   
##  [93] "1/12M"                   "9MTHS"                  
##  [95] "1.6"                     "8WKS"                   
##  [97] "3MTH"                    "15.5"                   
##  [99] "13M"                     "5YR"                    
## [101] "21"                      "7 M"                    
## [103] "1.3"                     "17"                     
## [105] "1.5"                     "15M"                    
## [107] "3.5"                     "11 YRS"                 
## [109] "10 YRS"                  "1 1/2 YRS"              
## [111] "2018-04-06T00:00:00.000" "0.2"                    
## [113] "7W"                      "9 MTHS"                 
## [115] "4 MTHS"                  "4.5"                    
## [117] "8 MTHS"                  "1900-01-03T00:00:00.000"
## [119] "4MTHS"                   "10 MTHS"                
## [121] "8W"                      "2 MTH"                  
## [123] "2018-06-09T00:00:00.000" "9 YRS"                  
## [125] "1 & 3"                   "8 MOS"                  
## [127] "1.8"                     "11MOS"                  
## [129] "1900-01-05T00:00:00.000" "2018-06-07T00:00:00.000"
## [131] "16M"                     "4 yrs 8 mo"             
## [133] "4.6"                     "2.6"                    
## [135] "3 YR"                    "3 YRS"                  
## [137] "15 YRS"                  "8 YRS"                  
## [139] "13 YRS"                  "10 & 9"                 
## [141] "2 (2) & (1"              "3.6"                    
## [143] "10 MTHS &"               "5 MTHS"                 
## [145] "6 & 4"                   "2YRS (MALE"             
## [147] "2 MTHS"                  "1 & 8"                  
## [149] "2 & 9MTHS"               "6.5"                    
## [151] "3 & 4"                   "10.5"                   
## [153] "2018-04-02T00:00:00.000" "6MTH"                   
## [155] "18W"                     "2018-07-03T00:00:00.000"
## [157] "13WK"                    "1900-01-09T00:00:00.000"
## [159] "11W"                     "2018-05-06T00:00:00.000"
## [161] "2018-01-01T00:00:00.000"

Reformatting the Age column

# setting ambiguous age to NA
dog_bite$Age[grep(".*\\-.*\\-", dog_bite$Age, ignore.case = T)] <- NA

# converting years to months
age_year <- dog_bite$Age[grep("y|\\& ", dog_bite$Age, ignore.case = T)]
age_year <- as.numeric(str_extract(age_year, "\\-*\\d+\\.*\\d*")) * 12

# reformating months
age_month <- dog_bite$Age[grep("^[0-9]m|^[0-9] m", dog_bite$Age, ignore.case = T)]
age_month <- as.numeric(str_extract(age_month, "\\-*\\d+\\.*\\d*"))

# reformating weeks
age_week <- dog_bite$Age[grep("w", dog_bite$Age, ignore.case = T)]
age_week <- as.numeric(str_extract(age_week, "\\-*\\d+\\.*\\d*"))/4

# add a new age column and match row orders
age_index_1 <- c(grep("y|\\& ", dog_bite$Age, ignore.case = T), grep("^[0-9]m|^[0-9] m", dog_bite$Age, ignore.case = T), grep("w", dog_bite$Age, ignore.case = T))

# assuming missing units are "month"
age_num <- dog_bite$Age[seq(8707)[!(seq(8707) %in% age_index_1)]]
age_num <- str_extract(age_num, "\\-*\\d+\\.*\\d*")
age_num <- as.numeric(age_num)


age_index_2 <- seq(8707)[!(seq(8707) %in% age_index_1)]
age_index <- c(age_index_1, age_index_2)

# order the df with the order of the new age column
dog_bite <- dog_bite[age_index, ]
# add new age column
dog_bite$Age_month <- c(age_year, age_month, age_week, age_num)

Reformatting the ZipCode column

# checking the format of the Age column
print(unique(dog_bite$ZipCode))
##   [1] "11201"    "11206"    "11234"    "11229"    "11233"    "11226"   
##   [7] "11249"    "11212"    "11208"    "11224"    "11221"    "11236"   
##  [13] NA         "11225"    "11211"    "11238"    "11222"    "11228"   
##  [19] "11215"    "11216"    "11214"    "11217"    "11213"    "11218"   
##  [25] "11207"    "11231"    "11237"    "11209"    "11223"    "11220"   
##  [31] "11230"    "11219"    "11204"    "11235"    "11239"    "10467"   
##  [37] "10453"    "10459"    "10462"    "10456"    "10463"    "10458"   
##  [43] "10473"    "10474"    "10468"    "10128"    "10028"    "10027"   
##  [49] "10025"    "10029"    "10034"    "10032"    "10285"    "10701"   
##  [55] "10541"    "11365"    "11003"    "11694"    "11420"    "11426"   
##  [61] "11434"    "11372"    "11378"    "11691"    "11106"    "11369"   
##  [67] "11422"    "11433"    "11432"    "11368"    "11101"    "11423"   
##  [73] "11354"    "11355"    "11435"    "11358"    "10306"    "10304"   
##  [79] "10314"    "11205"    "11232"    "1122O"    "10452"    "10472"   
##  [85] "10451"    "10465"    "10457"    "10466"    "10065"    "10026"   
##  [91] "10035"    "10016"    "10003"    "10002"    "10024"    "10038"   
##  [97] "10019"    "10017"    "10001"    "10030"    "10039"    "10004"   
## [103] "10022"    "10010"    "10021"    "10011"    "10023"    "10014"   
## [109] "11580"    "7930"     "10710"    "8402"     "10607"    "12589"   
## [115] "19426"    "11421"    "11385"    "11377"    "11692"    "11373"   
## [121] "11418"    "11103"    "11416"    "11370"    "11367"    "11374"   
## [127] "11414"    "11361"    "11412"    "11375"    "11429"    "11364"   
## [133] "11379"    "10302"    "10305"    "10309"    "10312"    "10301"   
## [139] "11252"    "11210"    "11027"    "11203"    "1228"     "BROOKLYN"
## [145] "11202"    "11227"    "11243"    "112O7"    "48064"    "10461"   
## [151] "10469"    "33172"    "10464"    "10470"    "10460"    "10455"   
## [157] "10475"    "10454"    "10471"    "10476"    "10407"    "10007"   
## [163] "104"      "N/A"      "10072"    "10432"    "10033"    "10044"   
## [169] "10009"    "10013"    "10031"    "10037"    "10075"    "10036"   
## [175] "10012"    "1009"     "10006"    "100128"   "10040"    "10282"   
## [181] "10005"    "10280"    "10062"    "10069"    "10018"    "10927"   
## [187] "11010"    "11967"    "20151"    "10954"    "6484"     "11542"   
## [193] "10580"    "11801"    "10605"    "10514"    "10919"    "11743"   
## [199] "7827"     "10705"    "11576"    "6840"     "8210"     "15650"   
## [205] "12550"    "11714"    "10803"    "7065"     "10805"    "12754"   
## [211] "21401"    "10579"    "7307"     "7869"     "11530"    "7020"    
## [217] "11758"    "7055"     "23464"    "13783"    "12304"    "19002"   
## [223] "10990"    "7024"     "20832"    "8106"     "8501"     "7828"    
## [229] "12601"    "12435"    "21043"    "11563"    "10704"    "11793"   
## [235] "6831"     "10549"    "10566"    "2631"     "10708"    "21113"   
## [241] "8247"     "7751"     "7723"     "14068"    "2301"     "23188"   
## [247] "12455"    "11510"    "11875"    "11951"    "7950"     "21842"   
## [253] "11033"    "7830"     "18073"    "7405"     "6896"     "11581"   
## [259] "23221"    "11784"    "15005"    "7652"     "8221"     "11040"   
## [265] "19125"    "8831"     "18360"    "6880"     "11710"    "1013"    
## [271] "6518"     "7030"     "8098"     "10530"    "13740"    "12446"   
## [277] "94128"    "10548"    "12582"    "11706"    "7403"     "12564"   
## [283] "14850"    "11772"    "6460"     "2633"     "18301"    "7036"    
## [289] "13903"    "10512"    "6877"     "12581"    "7726"     "18103"   
## [295] "7871"     "13731"    "11362"    "13865"    "12734"    "18466"   
## [301] "8816"     "11050"    "1720"     "10533"    "28468"    "11020"   
## [307] "13045"    "3106"     "10921"    "10940"    "11756"    "11693"   
## [313] "11357"    "11419"    "11102"    "11105"    "11417"    "11411"   
## [319] "11109"    "11363"    "11436"    "11356"    "11366"    "11104"   
## [325] "11415"    "11004"    "11413"    "11427"    "11371"    "11430"   
## [331] "11428"    "11360"    "11001"    "1143"     "?"        "11697"   
## [337] "10310"    "10303"    "10308"    "10307"    "11306"
# substitute ambiguous zipcode with NA
dog_bite$ZipCode <- as.factor(as.character(as.numeric(dog_bite$ZipCode)))
## Warning in is.factor(x): NAs introduced by coercion

Imputing the missing values in the Age and ZipCode column

# for zipcode
dog_bite_sub <- dog_bite[, qc(Gender, SpayNeuter, Borough, ZipCode), with = F]
dog_bite_sub$Borough <- as.factor(dog_bite_sub$Borough)
dog_bite_sub$Breed <- as.factor(dog_bite_sub$Breed)
dog_bite_sub$Gender <- as.factor(dog_bite_sub$Gender)
#impute zipcode
dog_bite_imp <- missForest(dog_bite_sub)
##   removed variable(s) 5 due to the missingness of all entries
##   missForest iteration 1 in progress...done!
##   missForest iteration 2 in progress...done!
##   missForest iteration 3 in progress...done!
##   missForest iteration 4 in progress...done!
dog_bite$ZipCode <- dog_bite_imp$ximp$ZipCode

# for age
dog_bite$Age_month <- impute(dog_bite$Age_month, "random")

Explorotary Data Analysis (EDA)

What time do dogs bite most?

# add year, month, day column to dog_bite
dog_bite$year <- strftime(dog_bite$DateOfBite, format="%Y")
dog_bite$month <- strftime(dog_bite$DateOfBite, format="%b")
dog_bite$day <- strftime(dog_bite$DateOfBite, format="%d")

# calculating dog bite freq for each month in each year
dog_bite_time <- count(dog_bite, c("year", "month"))
dog_bite_time$year <- gsub(".*\\-", "", dog_bite_time$year)
dog_bite_time$month <- factor(dog_bite_time$month, levels = qc(Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec))

# bar plot for year and month
plot1 <- ggplot(dog_bite_time, aes(month, freq)) + geom_bar(aes(fill = year), width = 0.75, position = position_dodge(width=0.75), stat="identity") + labs(title = "Dog bite incidents in each month", x = "Month in the Year", y = "Dog Bite Incidents")

plot1

# line plot for month and day
dog_bite_time_2 <- count(dog_bite, c("month", "day"))
# line plot
plot2 <- ggplot(dog_bite_time_2, aes(day, freq, group = month)) + geom_line(aes(color = month)) + labs(title = "Dog bite incidents on each day of the month", x = "Day in the Month", y = "Dog Bite Incidents")

plot2

Which dog breed bites the most?

# calculating dog bite freq for each breed
dog_bite_breed <- as.data.frame(table(dog_bite$Breed))

# ordering dog breed by bite incidents
dog_bite_breed <- dog_bite_breed[order(-dog_bite_breed$Freq),]

# calculating the dog bite proportion for each breed
dog_bite_breed$percentage <- dog_bite_breed$Freq/sum(dog_bite_breed$Freq)
dog_bite_breed_top10 <- head(dog_bite_breed, 10)
dog_bite_breed_top10$Var1 <- factor(dog_bite_breed_top10$Var1, levels = dog_bite_breed_top10$Var1)

# plot top 10 breed
plot3 <- ggplot(dog_bite_breed_top10, aes(Var1, Freq)) + geom_bar(width = 0.75, position = position_dodge(width=0.75), stat="identity") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(title = "Top 10 dog breeds in the dog-bite incidents", x = "Dog Breed", y = "Dog Bite Frequency")

plot3

Does spaying/neutering help?

dog_bite_sex <- count(dog_bite, c("Gender", "SpayNeuter"))

plot4 <- ggplot(dog_bite_sex, aes(Gender, freq, fill = SpayNeuter)) + geom_bar(width = 0.75, stat="identity") + labs(title = "Does spraying/neutering help?", x = "Dog Sex", y = "Dog Bite Incidents")

plot4

Which borough is the most notorious for dog bites?

dog_bite_borough <- count(dog_bite, "Borough")

# pie chart
plot5 <- ggplot(dog_bite_borough, aes(x = "", y=freq, fill = factor(Borough))) + geom_bar(width = 1, stat = "identity") + coord_polar(theta = "y", start=0) + labs(title = "Which borough is the most notorious for dog bites?", x = "", y = "")

plot5

Dogs of what age and sex bite most?

# bin the Age_month
dog_bite$bin_age <- as.factor(.bincode(dog_bite$Age_month, c(0, 2, 4, 6, 8, 250), TRUE, TRUE))

plot6 <- ggplot(dog_bite, aes(bin_age, fill = Gender)) + geom_density(alpha=0.6) + labs(title = "Dogs of what age and sex bite most?", x = "Dog Age", y = "Dog Bite Incidents") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_x_discrete(labels = c("1" = "0~2 months", "2" = "2~4 months","3" = "4~6 months", "4" = "6~8 months", "5" = "8~250 months"))

plot6