I will be focusing on 2 point in this project. They are as follows:
1. The location having the highest shooting rates.
2. Analyze the crime rate in each location.
I have 2 data sources. Both are from NYCOpenData - https://opendata.cityofnewyork.us/.
I will read the .tsv file using read_tsv() and the .csv file using read.csv().
shoot_current <- read.csv("https://raw.githubusercontent.com/irene908/DATA-607/master/NYPD_Shooting_Incident_Data__Year_To_Date_.csv")
datatable(shoot_current)
shoot_historic <- read_tsv("https://raw.githubusercontent.com/irene908/DATA-607/master/NYPD_Shooting_Incident_Data__Historic_.tsv")
##
## -- Column specification --------------------------------------------------------
## cols(
## INCIDENT_KEY = col_double(),
## OCCUR_DATE = col_character(),
## OCCUR_TIME = col_time(format = ""),
## BORO = col_character(),
## PRECINCT = col_double(),
## JURISDICTION_CODE = col_double(),
## LOCATION_DESC = col_character(),
## STATISTICAL_MURDER_FLAG = col_logical(),
## PERP_AGE_GROUP = col_character(),
## PERP_SEX = col_character(),
## PERP_RACE = col_character(),
## VIC_AGE_GROUP = col_character(),
## VIC_SEX = col_character(),
## VIC_RACE = col_character(),
## X_COORD_CD = col_number(),
## Y_COORD_CD = col_number(),
## Latitude = col_double(),
## Longitude = col_double(),
## Lon_Lat = col_character()
## )
datatable(shoot_historic)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
class(shoot_historic$OCCUR_TIME) = "Character"
Clean the datasets by selecting the columns required for the analysis and removing the NA values.
dim(shoot_current)
## [1] 1501 19
dim(shoot_historic)
## [1] 21626 19
colnames(shoot_current)
## [1] "INCIDENT_KEY" "OCCUR_DATE"
## [3] "OCCUR_TIME" "BORO"
## [5] "PRECINCT" "JURISDICTION_CODE"
## [7] "LOCATION_DESC" "STATISTICAL_MURDER_FLAG"
## [9] "PERP_AGE_GROUP" "PERP_SEX"
## [11] "PERP_RACE" "VIC_AGE_GROUP"
## [13] "VIC_SEX" "VIC_RACE"
## [15] "X_COORD_CD" "Y_COORD_CD"
## [17] "Latitude" "Longitude"
## [19] "New.Georeferenced.Column"
colnames(shoot_historic)
## [1] "INCIDENT_KEY" "OCCUR_DATE"
## [3] "OCCUR_TIME" "BORO"
## [5] "PRECINCT" "JURISDICTION_CODE"
## [7] "LOCATION_DESC" "STATISTICAL_MURDER_FLAG"
## [9] "PERP_AGE_GROUP" "PERP_SEX"
## [11] "PERP_RACE" "VIC_AGE_GROUP"
## [13] "VIC_SEX" "VIC_RACE"
## [15] "X_COORD_CD" "Y_COORD_CD"
## [17] "Latitude" "Longitude"
## [19] "Lon_Lat"
# Remove NAs from the data.
sapply(shoot_current, function(x) sum(is.na(x)))
## INCIDENT_KEY OCCUR_DATE OCCUR_TIME
## 0 0 0
## BORO PRECINCT JURISDICTION_CODE
## 0 0 0
## LOCATION_DESC STATISTICAL_MURDER_FLAG PERP_AGE_GROUP
## 0 0 0
## PERP_SEX PERP_RACE VIC_AGE_GROUP
## 0 0 0
## VIC_SEX VIC_RACE X_COORD_CD
## 0 0 0
## Y_COORD_CD Latitude Longitude
## 0 0 0
## New.Georeferenced.Column
## 0
sapply(shoot_historic, function(x) sum(is.na(x)))
## INCIDENT_KEY OCCUR_DATE OCCUR_TIME
## 0 0 0
## BORO PRECINCT JURISDICTION_CODE
## 0 0 2
## LOCATION_DESC STATISTICAL_MURDER_FLAG PERP_AGE_GROUP
## 12361 0 7305
## PERP_SEX PERP_RACE VIC_AGE_GROUP
## 7271 7271 0
## VIC_SEX VIC_RACE X_COORD_CD
## 0 0 0
## Y_COORD_CD Latitude Longitude
## 0 0 0
## Lon_Lat
## 0
Count the occurrences of a borough name to give an idea of the shooting rate in the borough.
current_borough <- table(shoot_current$BORO)
current_borough <- as.data.frame(current_borough)
current_borough$Percent <- round((current_borough$Freq / sum(current_borough$Freq)*100),2)
kable(current_borough)
| Var1 | Freq | Percent |
|---|---|---|
| BRONX | 388 | 25.85 |
| BROOKLYN | 636 | 42.37 |
| MANHATTAN | 205 | 13.66 |
| QUEENS | 229 | 15.26 |
| STATEN ISLAND | 43 | 2.86 |
ggplot(current_borough, aes(x=Var1, y=Freq, fill=Var1)) + geom_bar(stat="identity") + geom_text_repel(data=current_borough, aes(label=Var1))
paste("The total number of shootings in the current year is:", sum(current_borough$Freq))
## [1] "The total number of shootings in the current year is: 1501"
#Date
current_date <- shoot_current
current_date$OCCUR_DATE <- as.Date(current_date$OCCUR_DATE,format = "%m/%d/%Y")
current_date$DAY<- wday(current_date$OCCUR_DATE, label=TRUE)
#Hour
h <- function(x) {
return (as.numeric(strsplit(x,":")[[1]][1]))
}
current_hour <- current_date %>%
mutate(HOUR = sapply(OCCUR_TIME, h)) %>%
group_by(DAY, HOUR) %>%
summarize(count = n())
## `summarise()` regrouping output by 'DAY' (override with `.groups` argument)
day <- c("Sun","Mon","Tue","Wed","Thu","Fri","Sat")
hr <- c(paste(c(12,1:11),"AM"), paste(c(12,1:11),"PM"))
current_hour$DAY <- factor(current_hour$DAY, level = rev(day))
current_hour$HOUR <- factor(current_hour$HOUR, level = 0:23, label = hr)
#Heatmap
ggplot(current_hour, aes(x = HOUR, y = DAY, fill = count)) +geom_tile() + theme(axis.text.x = element_text(angle = 90, vjust = 0.6), legend.title = element_blank(), legend.position="top", legend.direction="horizontal", legend.key.width=unit(2, "cm"), legend.key.height=unit(0.25, "cm"), legend.margin=unit(-0.5,"cm"), panel.margin=element_blank()) + labs(x = "HOUR", y = "DAY", title = "SHOOTING RATE") + scale_fill_gradient(low = "white", high = "BLACK")
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: `legend.margin` must be specified using `margin()`. For the old
## behavior use legend.spacing
Count the occurrences of a borough name to give an idea of the shooting rate in the borough.
historic_borough <- table(shoot_historic$BORO)
historic_borough <- as.data.frame(historic_borough)
historic_borough$Percent <- round((historic_borough$Freq / sum(historic_borough$Freq)*100),2)
kable(historic_borough)
| Var1 | Freq | Percent |
|---|---|---|
| BRONX | 6195 | 28.65 |
| BROOKLYN | 8913 | 41.21 |
| MANHATTAN | 2647 | 12.24 |
| QUEENS | 3225 | 14.91 |
| STATEN ISLAND | 646 | 2.99 |
ggplot(historic_borough, aes(x=Var1, y=Freq, fill=Var1)) + geom_bar(stat="identity") + geom_text_repel(data=historic_borough, aes(label=Var1))
paste("The total number of shootings in the historic data", sum(historic_borough$Freq))
## [1] "The total number of shootings in the historic data 21626"
Count the occurrences of shooting in each year in BRONX.
B <-subset(shoot_historic, BORO=='BRONX', select=c(BORO, OCCUR_DATE))
n <- 4 # Specify number of characters to extract
B$YEAR <- substr(B$OCCUR_DATE, nchar(B$OCCUR_DATE) - n + 1, nchar(B$OCCUR_DATE)) # Extract last three characters
B <- subset(B, select = -c(OCCUR_DATE))
BRONX <- table(B$YEAR)
BRONX <- as.data.frame(BRONX)
ggplot(data=BRONX, aes(x=Var1, y=Freq, group=1)) +
geom_line()+
geom_point()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Count the occurrences of shooting in each year in BROOKLYN.
Br <-subset(shoot_historic, BORO=='BROOKLYN', select=c(BORO, OCCUR_DATE))
n <- 4 # Specify number of characters to extract
Br$YEAR <- substr(Br$OCCUR_DATE, nchar(Br$OCCUR_DATE) - n + 1, nchar(Br$OCCUR_DATE)) # Extract last three characters
Br <- subset(Br, select = -c(OCCUR_DATE))
BROOLYN <- table(Br$YEAR)
BROOLYN <- as.data.frame(BROOLYN)
ggplot(data=BROOLYN, aes(x=Var1, y=Freq, group=1)) +
geom_line()+
geom_point()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Count the occurrences of shooting in each year in MANHATTAN.
M <-subset(shoot_historic, BORO=='MANHATTAN', select=c(BORO, OCCUR_DATE))
n <- 4 # Specify number of characters to extract
M$YEAR <- substr(M$OCCUR_DATE, nchar(M$OCCUR_DATE) - n + 1, nchar(M$OCCUR_DATE)) # Extract last three characters
M <- subset(M, select = -c(OCCUR_DATE))
MANHATTAN <- table(M$YEAR)
MANHATTAN <- as.data.frame(MANHATTAN)
ggplot(data=MANHATTAN, aes(x=Var1, y=Freq, group=1)) +
geom_line()+
geom_point()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Count the occurrences of shooting in each year in QUEENS.
Q <-subset(shoot_historic, BORO=='QUEENS', select=c(BORO, OCCUR_DATE))
n <- 4 # Specify number of characters to extract
Q$YEAR <- substr(Q$OCCUR_DATE, nchar(Q$OCCUR_DATE) - n + 1, nchar(Q$OCCUR_DATE)) # Extract last three characters
Q <- subset(Q, select = -c(OCCUR_DATE))
QUEENS <- table(Q$YEAR)
QUEENS <- as.data.frame(QUEENS)
ggplot(data=QUEENS, aes(x=Var1, y=Freq, group=1)) +
geom_line()+
geom_point()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Count the occurrences of shooting in each year in STATEN ISLAND.
S <-subset(shoot_historic, BORO=='STATEN ISLAND', select=c(BORO, OCCUR_DATE))
n <- 4 # Specify number of characters to extract
S$YEAR <- substr(S$OCCUR_DATE, nchar(S$OCCUR_DATE) - n + 1, nchar(S$OCCUR_DATE)) # Extract last three characters
S <- subset(S, select = -c(OCCUR_DATE))
STATEN_ISLAND <- table(S$YEAR)
STATEN_ISLAND <- as.data.frame(STATEN_ISLAND)
ggplot(data=STATEN_ISLAND, aes(x=Var1, y=Freq, group=1)) +
geom_line()+
geom_point()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))
current_borough$type <- c("Current")
historic_borough$type <- c("Historic")
borough <- rbind(historic_borough,current_borough)
ggplot(borough ,aes(x= Var1, y=Freq, fill=type)) +
geom_bar(stat="identity", position=position_dodge())
ggplot(borough ,aes(x= Var1, y=Percent, fill=type)) +
geom_bar(stat="identity", position=position_dodge())
To conclude, I have answered both my questions from the analysis done above. The answers are as follows:
1. The location having the highest shooting rates.
When the current and historic shooting rates were analyzed I found out that BROOKLYN had the highest shooting rates always.
But in the current year the percentage of shooting rate in BROOKYLN has reduced to 41.21% from the 42.37% that was present in the historic data.
From the above barplots it is understood that percentage is the best option to represent the data. So from the percentage barplot it can be seen that in ‘Bronx’ and ‘Staten Island’ there is an increase in the number of shootings in the current year compared to the historic data whereas all the other boroughs saw a decrease.
2. Analyze the crime rate in each location.
Bronx
The highest number of shootings were reported in 2006 and the lowest in 2019.
Brooklyn
The highest number of shootings were reported in 2006 and the lowest in 2017.
Manhattan
The highest number of shootings were reported in 2006 and the lowest in 2018.
Queens
The highest number of shootings were reported in 2008 and the lowest in 2017.
Staten Island
The highest number of shootings were reported in 2008 and the lowest in 2018.
From these it can be seen that the shootings were more high between 2006-2008 and have reduced from 2017-2019