School Shooting in the United States

Gun related violence in the United States has been characterized as an epidemic and a public health crisis with a substantial financial burden. According to the Washington Post, more than 215,000 students have experienced gun violence at 217 school since the Columbine High School massacre that occurred on April 20, 1999. This study attempts to understand the correlation of different variables hehind the school shooting incidence and what variables contribute most to the number of casualties using univariate and multivariate analyses on dataset features.

library(RCurl)
## Loading required package: bitops
library(wrapr)
library(data.table)
library(DataExplorer)

Functions

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

Read in data

school_shoot_raw <- fread(getURL("https://raw.githubusercontent.com/washingtonpost/data-school-shootings/master/school-shootings-data.csv"))

Check missing values in the data

plot_missing(school_shoot_raw)

#remove columns with missing value over 20%
school_shoot_raw <- school_shoot_raw[, -qc(two_or_more, hawaiian_native_pacific_islander, age_shooter2, shooter_deceased2), with = F]

Data Distribution Analysis

#School shooting incidence by state
freq_by_state_table <- as.data.frame(table(school_shoot_raw$state))
freq_by_state_plot <- ggplot(data = freq_by_state_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("States") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_state_plot

#School shooting incidence by year
freq_by_year_table <- as.data.frame(table(school_shoot_raw$year))
freq_by_year_plot <- ggplot(data = freq_by_year_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Years") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_year_plot

#School shooting incidence by day_of_week
freq_by_day_of_week_table <- as.data.frame(table(school_shoot_raw$day_of_week))
freq_by_day_of_week_plot <- ggplot(data = freq_by_day_of_week_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Day of Weeks") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_day_of_week_plot

#School shooting incidence by time
freq_by_time_table <- as.data.frame(table(school_shoot_raw$time))
levels(freq_by_time_table$Var1)[1] <- "Unknown"
freq_by_time_plot <- ggplot(data = freq_by_time_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Times") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_time_plot

#School shooting incidence by shooting_type
freq_by_shooting_type_table <- as.data.frame(table(school_shoot_raw$shooting_type))
freq_by_shooting_type_plot <- ggplot(data = freq_by_shooting_type_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooting Types") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_shooting_type_plot

#School shooting incidence by age_shooter1
freq_by_age_shooter1_table <- as.data.frame(table(school_shoot_raw$age_shooter1))
freq_by_age_shooter1_plot <- ggplot(data = freq_by_age_shooter1_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooter's Age") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_age_shooter1_plot

#School shooting incidence by gender_shooter1
freq_by_gender_shooter1_table <- as.data.frame(table(school_shoot_raw$gender_shooter1))
levels(freq_by_gender_shooter1_table$Var1)[1] <- "Unknown"
freq_by_gender_shooter1_plot <- ggplot(data = freq_by_gender_shooter1_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooter's Gender") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_gender_shooter1_plot

#School shooting incidence by race_ethnicity_shooter1
freq_by_race_ethnicity_shooter1_table <- as.data.frame(table(school_shoot_raw$race_ethnicity_shooter1))
levels(freq_by_race_ethnicity_shooter1_table$Var1)[1] <- "Unknown"
freq_by_race_ethnicity_shooter1_plot <- ggplot(data = freq_by_race_ethnicity_shooter1_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooter's Race") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_race_ethnicity_shooter1_plot

#School shooting incidence by shooter_relationship1
freq_by_shooter_relationship1_table <- as.data.frame(table(school_shoot_raw$shooter_relationship1))
levels(freq_by_shooter_relationship1_table$Var1)[1] <- "Unknown"
freq_by_shooter_relationship1_plot <- ggplot(data = freq_by_shooter_relationship1_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooter's Relationship") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_shooter_relationship1_plot

#School shooting incidence by weapon
freq_by_weapon_table <- as.data.frame(table(school_shoot_raw$weapon))
levels(freq_by_weapon_table$Var1)[1] <- "Unknown"
freq_by_weapon_plot <- ggplot(data = freq_by_weapon_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooter's Weapon") + ylab("Number of School Shooting Incidence") + coord_flip() + theme(axis.text=element_text(size=4))

freq_by_weapon_plot

#School shooting incidence by weapon_source
freq_by_weapon_source_table <- as.data.frame(table(school_shoot_raw$weapon_source))
levels(freq_by_weapon_source_table$Var1)[1] <- "Unknown"
freq_by_weapon_source_plot <- ggplot(data = freq_by_weapon_source_table, aes(x = reorder(Var1, Freq), y = Freq)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooter's Weapon Source") + ylab("Number of School Shooting Incidence") + coord_flip()

freq_by_weapon_source_plot

Categorical Variable by Categorical Variable

#shooting type vs shooter's gender
type_gender_table <- as.data.frame(table(school_shoot_raw$shooting_type, school_shoot_raw$gender_shooter1))

levels(type_gender_table$Var1)[1] <- "Unknown"
levels(type_gender_table$Var2)[1] <- "Unknown"

type_gender_table_plot <- ggplot(type_gender_table, aes(Var2, Var1 )) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(low = "white", high = "pink") + geom_text(data = type_gender_table, aes(label = Freq)) + guides(fill=guide_legend(title="Shooting Incidence")) + xlab("Shooter's Gender") + ylab("Shooting Type")

type_gender_table_plot

#race vs shooter's gender
race_gender_table <- as.data.frame(table(school_shoot_raw$race_ethnicity_shooter1, school_shoot_raw$gender_shooter1))

levels(race_gender_table$Var1)[1] <- "Unknown"
levels(race_gender_table$Var2)[1] <- "Unknown"

race_gender_table_plot <- ggplot(race_gender_table, aes(Var2, Var1 )) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(low = "white", high = "pink") + geom_text(data = race_gender_table, aes(label = Freq)) + guides(fill=guide_legend(title="Shooting Incidence")) + xlab("Shooter's Gender") + ylab("Shooter's Race")

race_gender_table_plot

#race vs shooter_relationship1
race_relationship_table <- as.data.frame(table(school_shoot_raw$race_ethnicity_shooter1, school_shoot_raw$shooter_relationship1))

levels(race_relationship_table$Var1)[1] <- "Unknown"
levels(race_relationship_table$Var2)[1] <- "Unknown"

race_relationship_table_plot <- ggplot(race_relationship_table, aes(Var1, Var2 )) + geom_tile(aes(fill = Freq)) + scale_fill_gradient(low = "white", high = "pink") + geom_text(data = race_relationship_table, aes(label = Freq)) + guides(fill=guide_legend(title="Shooting Incidence")) + xlab("Shooter's Relationship to School") + ylab("Shooter's Race")

race_relationship_table_plot

Categorical Variable by Numerical Variable

#weapon fatality
weapon_killed <- ggplot(data = school_shoot_raw[killed != 0], aes(x = reorder(weapon, killed), y = killed)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooter's Weapon") + ylab("Number of Killed") + coord_flip() + theme(axis.text=element_text(size=4))

weapon_killed

#weapon injured
weapon_injured <- ggplot(data = school_shoot_raw[injured != 0], aes(x = reorder(weapon, injured), y = injured)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooter's Weapon") + ylab("Number of Injured") + coord_flip() + theme(axis.text=element_text(size=4))

weapon_injured

#weanpon casualties
weapon_casualties <- ggplot(data = school_shoot_raw[casualties != 0], aes(x = reorder(weapon, casualties), y = casualties)) + geom_bar(stat="identity", fill="steelblue") + xlab("Shooter's Weapon") + ylab("Number of Casualties") + coord_flip() + theme(axis.text=element_text(size=4))

weapon_casualties

Numerical Variable by Numerical Variable

#shooter age vs fatality
weapon_killed <- ggplot(data = na.omit(school_shoot_raw[killed != 0], cols = "age_shooter1"), aes(x = age_shooter1, y = killed)) + geom_point() + xlab("Shooter's Age") + ylab("Number of Killed")

weapon_killed

#shooter age vs injured
weapon_injured <- ggplot(data = na.omit(school_shoot_raw[injured != 0], cols = "age_shooter1"), aes(x = age_shooter1, y = injured)) + geom_point() + xlab("Shooter's Age") + ylab("Number of Injured")

weapon_injured

#shooter age vs casualties
weapon_casualties <- ggplot(data = na.omit(school_shoot_raw[casualties != 0], cols = "age_shooter1"), aes(x = age_shooter1, y = casualties)) + geom_point() + xlab("Shooter's Age") + ylab("Number of Casualties")

weapon_casualties

#race trend in student enrollment
race_trend_df <- na.omit(school_shoot_raw[, c("year", "white", "black", "hispanic", "asian", "american_indian_alaska_native"), with = FALSE])
race_trend_df$white <- as.integer(gsub(",", "", race_trend_df$white))

race_trend_df_mt <- melt(race_trend_df, id.vars = 1, measure.vars = 2:ncol(race_trend_df))

race_trend_df_mt_plot <- ggplot(data=na.omit(race_trend_df_mt), aes(x=year, y=value, colour=variable)) + geom_line()

race_trend_df_mt_plot

#enrollment-casualties correlation
enrollment_mx <- na.omit(school_shoot_raw[killed != 0])[, c("enrollment", "casualties", "white", "black", "hispanic", "asian", "american_indian_alaska_native"), with = F]
enrollment_mx <- apply(apply(enrollment_mx, 2, function(x) gsub(",", "", x)), 2, as.numeric)
plot_correlation(enrollment_mx)