In this project, I analyze shooting incidents across all 50 US states using data from 2013 to 2015. The objective is to find the states with the highest number of shooting incidents and of people killed and wounded, and to see whether comparing incident counts reveals any trend: have incidents risen or fallen over the three years, and does any particular month have a high incident rate in any state? This would help to understand whether measures have already been taken, or still need to be taken, to lower the number of incidents. It would also help to identify states that are highly prone to this kind of crime.
# Install any missing package, then attach every package the analysis uses.
# `if (!require(pkg)) install.packages(pkg)` installs a missing package but
# never attaches it, so a first run on a fresh machine would fail later with
# "could not find function" errors. The packages are attached in the same
# order as before, so name masking between them is unchanged.
required_pkgs <- c(
  "dplyr", "stringr", "ggplot2", "lubridate", "tidyr",
  "doBy", "knitr", "reshape", "lattice"
)
for (pkg in required_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # library() stops with an error if the package still cannot be loaded.
  library(pkg, character.only = TRUE)
}
# Load the three yearly incident files, one CSV per year.
# read.csv() splits on commas by default, so no separator is passed.
yr_2013 <- read.csv(
  "https://raw.githubusercontent.com/raghu74us/606/master/2013_shooter.csv"
)
yr_2014 <- read.csv(
  "https://raw.githubusercontent.com/BuzzFeedNews/2015-12-mass-shooting-intervals/master/data/2014MASTER.csv"
)
yr_2015 <- read.csv(
  "https://raw.githubusercontent.com/BuzzFeedNews/2015-12-mass-shooting-intervals/master/data/2015CURRENT.csv"
)
Only 3 years of data are available, totalling 1050 observations and 9 variables. The data set contains the year, month, city, state and incident date, along with the total number of people killed and wounded.
The data is published by BuzzFeed News. There is one data file for each year from 2013 to 2015.
This study is observational.
The data is available at the following link: https://github.com/BuzzFeedNews/2015-12-mass-shooting-intervals/tree/master/data
The response variable is the number of incidents in a month in a given city and state. It is numerical.
Number of incidents each month: did incidents happen in each month, and if so, how many? This needs to be done for at least 3 years to decide which city has the highest number of incidents and what the chances of recurrence are each month.
# Keep only the four fields needed for the analysis. The 2013 file uses
# different column names, so they are renamed to match the 2014/2015 files.
yr_2013 <- yr_2013[c("date", "killed", "wounded", "location")]
names(yr_2013) <- c("Date", "Dead", "Injured", "Location")
yr_2013[["Year"]] <- "2013"

yr_2014 <- yr_2014[c("Date", "Dead", "Injured", "Location")]
yr_2014[["Year"]] <- "2014"

yr_2015 <- yr_2015[c("Date", "Dead", "Injured", "Location")]
yr_2015[["Year"]] <- "2015"

# Stack the three years into a single data frame.
sh <- rbind(yr_2013, yr_2014, yr_2015)

# Inspect the combined data before Location is split into city and state.
str(sh)
## 'data.frame': 1050 obs. of 5 variables:
## $ Date : Factor w/ 613 levels "1/1/13","1/10/13",..: 1 1 1 1 10 11 11 11 2 3 ...
## $ Dead : int 4 1 0 1 4 5 2 3 3 1 ...
## $ Injured : int 0 3 4 4 0 0 2 1 2 4 ...
## $ Location: Factor w/ 557 levels "Aguas Buenas, PR",..: 199 87 135 122 9 228 76 50 154 229 ...
## $ Year : chr "2013" "2013" "2013" "2013" ...
# Preview the first five combined rows.
head(sh, n = 5)
## Date Dead Injured Location Year
## 1 1/1/13 4 0 Sacramento, CA 2013
## 2 1/1/13 1 3 Hawthorne, CA 2013
## 3 1/1/13 0 4 McKeesport, PA 2013
## 4 1/1/13 1 4 Lorain, OH 2013
## 5 1/5/13 4 0 Aurora, CO 2013
# Split "City, State" at the first comma into a two-column character matrix
# and append those two columns to the data. The state part still carries its
# leading space at this point.
cit_st <- str_split_fixed(sh$Location, pattern = ",", n = 2)
sh <- cbind(sh, cit_st)
names(sh) <- c(
  "date", "killed", "wounded", "location", "Year", "City", "State"
)

# Year and State are treated as categorical variables.
sh[["Year"]] <- as.factor(sh[["Year"]])
sh[["State"]] <- as.factor(sh[["State"]])

# Parse the month/day/year strings into Date values.
sh[["date"]] <- mdy(sh[["date"]])
str(sh)
## 'data.frame': 1050 obs. of 7 variables:
## $ date : Date, format: "2013-01-01" "2013-01-01" ...
## $ killed : int 4 1 0 1 4 5 2 3 3 1 ...
## $ wounded : int 0 3 4 4 0 0 2 1 2 4 ...
## $ location: Factor w/ 557 levels "Aguas Buenas, PR",..: 199 87 135 122 9 228 76 50 154 229 ...
## $ Year : Factor w/ 3 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ City : Factor w/ 522 levels " Fort Bend County",..: 401 194 272 251 25 474 178 123 306 475 ...
## $ State : Factor w/ 69 levels ""," AK"," AL",..: 8 8 53 48 9 51 42 64 29 3 ...
#y1<-year(sh$date)
# Derive the abbreviated month name and the two-digit day from the parsed
# date. format() is the documented way to format a Date; passing `format` to
# as.character() is reported to warn in recent R releases.
# NOTE(review): month is stored as a plain label, so downstream tables and
# plots order months alphabetically rather than by calendar.
m1 <- format(sh$date, "%b")
d1 <- format(sh$date, "%d")
df1 <- cbind(d1, m1)
colnames(df1) <- c("day", "month")
sh <- cbind(sh, df1)

# Normalise the state labels: strip surrounding whitespace, upper-case, then
# replace spelled-out names with their two-letter codes.
sh$State <- trimws(toupper(sh$State))
sh$State <- replace(sh$State, sh$State == "ILLINOIS", "IL")
sh$State <- replace(sh$State, sh$State == "TENNESSEE", "TN")
sh$State <- replace(sh$State, sh$State == "KANSAS", "KS")
# Louisiana is "LA"; it was previously mapped to "MO" (Missouri), which moved
# these incidents into another state's totals.
sh$State <- replace(sh$State, sh$State == "LOUISIANA", "LA")
sh$State <- replace(sh$State, sh$State == "OHIO", "OH")
sh$State <- replace(sh$State, sh$State == "D.C.", "DC")
# Spelled-out Puerto Rico and rows with no state part are labelled with the
# literal string "NA" (a real label, not a missing value).
sh$State <- replace(sh$State, sh$State == "PUERTO RICO", "NA")
sh$State <- replace(sh$State, sh$State == "", "NA")
# Count incidents for every Year/State pair. summarise() removes the State
# grouping and leaves the result grouped by Year, so top_n() keeps the top 5
# rows within each year, ranked by the last column (total_incidents).
sh_us_sum <- sh %>%
  group_by(Year, State) %>%
  summarise(total_incidents = n()) %>%
  top_n(5)
## Selecting by total_incidents
# One line per year, drawn across the selected states.
ggplot(
  sh_us_sum,
  aes(x = State, y = total_incidents, colour = Year, group = Year)
) +
  geom_line() +
  labs(title = "Shooting in Top 5 States from 2013 to 2015")
# Restrict the data to California.
sh1 <- sh[trimws(sh$State) == "CA", ]
#summarize by total killed and wounded in California
# Sum killed and wounded per Year/State/month, then add their combined total.
# The sums are written out explicitly because summarise_each() and funs() are
# deprecated in dplyr.
sh_kw_ca <- sh1 %>%
  group_by(Year, State, month) %>%
  summarise(killed = sum(killed), wounded = sum(wounded)) %>%
  arrange(Year, State, month) %>%
  mutate(Tot_kw = killed + wounded)
#summarize by total incidents in California
# Count rows (one row per incident) for each Year/State/month.
# The former `select(..., killed+wounded)` step is removed: select() expects
# column selections, not arithmetic, and the count never used that value.
sh_inc_ca <- sh1 %>%
  group_by(Year, State, month) %>%
  summarise(total_incidents = n())
knitr::opts_chunk$set(echo = TRUE)
#Incidents from 2013 to 2015 in California Monthwise.
# Columns are referenced by bare name inside aes() so they are looked up in
# the data passed to ggplot(); `df$col` inside aes() bypasses that lookup and
# leaks the raw expression into the default axis label.
ggplot(
  sh_inc_ca,
  aes(x = month, y = total_incidents, group = Year, colour = Year)
) +
  geom_line() +
  geom_point() +
  labs(x = "Month", y = "Total Incidents", colour = "Year") +
  theme_classic() +
  ggtitle("Incidents from 2013 to 2015 in California Monthwise")
#Total Killed and Wounded from 2013 to 2015 in California Monthwise.
# Columns are referenced by bare name inside aes() so they are looked up in
# the data passed to ggplot(); `df$col` inside aes() bypasses that lookup and
# leaks the raw expression into the default axis label.
ggplot(
  sh_kw_ca,
  aes(x = month, y = Tot_kw, group = Year, colour = Year)
) +
  geom_line() +
  geom_point() +
  labs(x = "Month", y = "Total Killed and Wounded", colour = "Year") +
  theme_classic() +
  ggtitle("Total killed and Wounded from 2013 to 2015 in California Monthwise")
knitr::opts_chunk$set(echo = TRUE)
$H_0: \mu_{Jan} = \mu_{Feb} = \dots = \mu_{Dec}$. The average number of monthly shooting incidents does not vary across the months of the year.
\(H_A:\) The average number of monthly shooting incidents does vary across some or all months.
#box plot from Jan to Dec in CA from 2013-15
# Incidents by month, one panel per year; the formula variables are looked up
# in sh_inc_ca.
bwplot(
  total_incidents ~ month | Year,
  data = sh_inc_ca,
  main = "Incidents by Month from 2013-15",
  xlab = "month",
  ylab = "Total Incidents"
)
#wide format
# One column per month holding that month's incident count; a month with no
# incidents in a given year is left as NA (the spread() default).
sh_inc_w <- spread(sh_inc_ca, key = month, value = total_incidents)
summary(sh_inc_w)
## Year State Apr Aug Dec
## 2013:1 Length:3 Min. :1.0 Min. :3.000 Min. :1.0
## 2014:1 Class :character 1st Qu.:1.0 1st Qu.:3.500 1st Qu.:1.5
## 2015:1 Mode :character Median :1.0 Median :4.000 Median :2.0
## Mean :2.0 Mean :4.333 Mean :2.0
## 3rd Qu.:2.5 3rd Qu.:5.000 3rd Qu.:2.5
## Max. :4.0 Max. :6.000 Max. :3.0
## NA's :1
## Feb Jan Jul Jun
## Min. :1.0 Min. :2.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.5 1st Qu.:2.500 1st Qu.:2.500 1st Qu.:2.500
## Median :4.0 Median :3.000 Median :4.000 Median :4.000
## Mean :4.0 Mean :2.667 Mean :3.333 Mean :3.667
## 3rd Qu.:5.5 3rd Qu.:3.000 3rd Qu.:4.500 3rd Qu.:5.000
## Max. :7.0 Max. :3.000 Max. :5.000 Max. :6.000
##
## Mar May Nov Oct
## Min. :3.000 Min. :2.0 Min. :4.000 Min. :2.00
## 1st Qu.:4.000 1st Qu.:3.5 1st Qu.:4.000 1st Qu.:2.75
## Median :5.000 Median :5.0 Median :4.000 Median :3.50
## Mean :4.667 Mean :5.0 Mean :4.333 Mean :3.50
## 3rd Qu.:5.500 3rd Qu.:6.5 3rd Qu.:4.500 3rd Qu.:4.25
## Max. :6.000 Max. :8.0 Max. :5.000 Max. :5.00
## NA's :1
## Sep
## Min. :2.000
## 1st Qu.:2.500
## Median :3.000
## Mean :3.333
## 3rd Qu.:4.000
## Max. :5.000
##
# Box plot of every month column. The columns are picked by name (everything
# except the Year and State identifiers) instead of the fixed positions 3:14,
# which would fail if any month were missing from the wide table.
month_cols <- setdiff(names(sh_inc_w), c("Year", "State"))
boxplot(
  sh_inc_w[month_cols],
  main = "Incidents by Month from 2013-15",
  xlab = "month",
  ylab = "Total Incidents"
)
# Per-month total, mean and standard deviation of the incident counts.
sh_inc_ca_msc <- data.frame(
  State = sh_inc_ca$State,
  Month = sh_inc_ca$month,
  Incidents = sh_inc_ca$total_incidents
)
sh_inc_ca_TAS <- aggregate(
  Incidents ~ Month,
  data = sh_inc_ca_msc,
  FUN = function(v) c(Total = sum(v), Average = mean(v), SD = sd(v))
)
# aggregate() returns the three statistics as a single matrix column; bind
# that matrix next to Month so each statistic becomes its own column.
sh_inc_ca_TAS <- cbind(sh_inc_ca_TAS["Month"], sh_inc_ca_TAS[["Incidents"]])
sh_inc_ca_TAS
## Month Total Average SD
## 1 Apr 6 2.000000 1.7320508
## 2 Aug 13 4.333333 1.5275252
## 3 Dec 4 2.000000 1.4142136
## 4 Feb 12 4.000000 3.0000000
## 5 Jan 8 2.666667 0.5773503
## 6 Jul 10 3.333333 2.0816660
## 7 Jun 11 3.666667 2.5166115
## 8 Mar 14 4.666667 1.5275252
## 9 May 15 5.000000 3.0000000
## 10 Nov 13 4.333333 0.5773503
## 11 Oct 7 3.500000 2.1213203
## 12 Sep 10 3.333333 1.5275252
# Monthly average plotted against the monthly total, with +/- one SD bars.
# Columns are referenced by bare name so ggplot looks them up in the data.
p <- ggplot(
  sh_inc_ca_TAS,
  aes(x = Total, y = Average, group = Month, colour = Month)
) +
  geom_line() +
  geom_point() +
  # The bars are centred on Average, the value drawn on the y axis; they were
  # previously centred on Total, which put them away from the points.
  geom_errorbar(
    aes(ymin = Average - SD, ymax = Average + SD),
    width = .2,
    position = position_dodge(0.05)
  )
#print(p)
# Finished line plot
p +
  labs(
    title = "Shooting Incidents by Month in California",
    x = "Incidents Total",
    y = "Average by Month"
  ) +
  theme_classic()
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
# Set the knitr chunk option `echo` to TRUE for the chunks that follow.
knitr::opts_chunk$set(echo = TRUE)
We believe that the shooting data from each month are independent and vary across all months.
Based on the data analysis, I took the top 5 states where the shooting rate is high and identified that it is highest in California. I then analyzed California in detail, month by month, from 2013 to 2015. The study concludes that shooting incidents have been decreasing from 2013 to 2015. The number of incidents occurring each month is not related to, or dependent on, previous months or years. Also, based on the data, the shooting rate in a given state or month cannot be predicted.