This dataset is from Gregg Maloy’s Week 5 discussion on NYC dog bites data. “A possible analysis using this data would be bites per zip code, bites per breed and whether the dog was spayed or not.” Although the data is in wide format, I can perform the analysis just by manipulating the data frame, without converting it to long format.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.0
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the dog bite CSV from GitHub ----
# Empty cells and the literal string "N/A" are both read in as NA.
df <- read.csv(
  file = "https://raw.githubusercontent.com/LeJQC/MSDS/main/DATA%20607/Project%202/DOHMH_Dog_Bite_Data.csv",
  na.strings = c("", "N/A")
)
# Preview the first rows to check the import
head(df)
## UniqueID DateOfBite Species Breed Age Gender SpayNeuter Borough
## 1 1 January 01 2018 DOG UNKNOWN <NA> U false Brooklyn
## 2 2 January 04 2018 DOG UNKNOWN <NA> U false Brooklyn
## 3 3 January 06 2018 DOG Pit Bull <NA> U false Brooklyn
## 4 4 January 08 2018 DOG Mixed/Other 4 M false Brooklyn
## 5 5 January 09 2018 DOG Pit Bull <NA> U false Brooklyn
## 6 6 January 03 2018 DOG BASENJI 4Y M false Brooklyn
## ZipCode
## 1 11220
## 2 <NA>
## 3 11224
## 4 11231
## 5 11224
## 6 11231
# Count dog bites per zip code ----
# ZipCode is a character column, so records without a zip code are removed
# with an explicit NA test. Comparing the strings against the number 0 only
# dropped them as a side effect of NA propagation in a locale-dependent
# string comparison, and could also discard unusual non-missing values.
zipcode_filtered <- df %>%
  filter(!is.na(ZipCode)) %>%
  group_by(ZipCode) %>%
  summarise(count = n())
head(zipcode_filtered)
## # A tibble: 6 × 2
## ZipCode count
## <chr> <int>
## 1 01013 1
## 2 01720 1
## 3 01852 1
## 4 02301 1
## 5 02631 1
## 6 02633 1
# Rank the zip codes from the most to the fewest recorded dog bites
arrange(zipcode_filtered, desc(count))
## # A tibble: 518 × 2
## ZipCode count
## <chr> <int>
## 1 10029 369
## 2 11208 261
## 3 11368 226
## 4 10065 221
## 5 10128 211
## 6 10314 207
## 7 10467 198
## 8 11234 196
## 9 10456 188
## 10 11377 188
## # … with 508 more rows
#Oddly enough...I work in this neighborhood
# Number of dog bite records in each borough
table(df[["Borough"]])
##
## Bronx Brooklyn Manhattan Other Queens
## 3782 4985 5270 981 5773
## Staten Island
## 1872
# Count dog bites per breed, most frequent breed first ----
# Rows containing NA (records with no breed) are dropped before sorting.
breed_filtered <- df %>%
  count(Breed, name = "count") %>%
  na.omit() %>%
  arrange(desc(count))
breed_filtered
## # A tibble: 1,651 × 2
## Breed count
## <chr> <int>
## 1 Pit Bull 4004
## 2 UNKNOWN 2349
## 3 Shih Tzu 731
## 4 Chihuahua 646
## 5 German Shepherd 622
## 6 Mixed/Other 559
## 7 American Pit Bull Mix / Pit Bull Mix 520
## 8 American Pit Bull Terrier/Pit Bull 511
## 9 Yorkshire Terrier 480
## 10 MIXED BREED 388
## # … with 1,641 more rows
# Data cleaning ----
# Drop the placeholder "UNKNOWN" breed, then keep only the first 20 rows
# (the table is already sorted by descending count) because there are too
# many distinct breeds to show them all.
is_known_breed <- breed_filtered$Breed != "UNKNOWN"
breed_filtered <- head(breed_filtered[is_known_breed, ], n = 20)
breed_filtered
## # A tibble: 20 × 2
## Breed count
## <chr> <int>
## 1 Pit Bull 4004
## 2 Shih Tzu 731
## 3 Chihuahua 646
## 4 German Shepherd 622
## 5 Mixed/Other 559
## 6 American Pit Bull Mix / Pit Bull Mix 520
## 7 American Pit Bull Terrier/Pit Bull 511
## 8 Yorkshire Terrier 480
## 9 MIXED BREED 388
## 10 Maltese 371
## 11 MIXED 351
## 12 Rottweiler 316
## 13 Siberian Husky 303
## 14 Labrador Retriever 282
## 15 Poodle, Standard 264
## 16 Bull dog 220
## 17 Jack Russ 190
## 18 Cocker Spaniel 172
## 19 Labrador Retriever Crossbreed 160
## 20 TERRIER 158
# Horizontal bar chart of bite counts for the selected breeds
ggplot(breed_filtered, aes(x = Breed, y = count)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Dog Bites by Breed", y = "Amount of Dog Bites") +
  theme_bw()
3) Bites by Spay/Neuter Status
# Count dog bites by spay/neuter status, largest group first
spray_filtered <- df %>%
  count(SpayNeuter, name = "count") %>%
  arrange(desc(count))
spray_filtered
## # A tibble: 2 × 2
## SpayNeuter count
## <chr> <int>
## 1 false 16787
## 2 true 5876
# Bar chart comparing bite counts by spay/neuter status
ggplot(spray_filtered, aes(x = SpayNeuter, y = count)) +
  geom_col(position = "dodge") +
  labs(title = "Bites by Neuter") +
  theme_bw()
Key point: Stay away from unspayed or unneutered pit bulls in East Harlem.