Executive Summary

Data blah blah and … These data were imported as xxx and yyy, respectively.

Required packages

#The following packages were used whilst completing the report (with annotation)
library(readr) # Used to import .csv files
library(dplyr) # Section Y
library(stringr) # Section Z
library(knitr) # Section X
library(tidyr)
#xlsx #readxl #foreign #gdata #rvest #dplyr  #deductive #validate #Hmisc #stringr #lubridate #outliers
#MVN #infotheo #MASS #caret #MLR #ggplot #base R functions

Importing the Data

The base R function read.csv was used to import the data as shown below. The data were imported as gvd for Gun Violence Data and acs2015 for American (US) Census Data.

# Load both raw CSV files from the working directory into data frames.
gvd <- read.csv(file = "gun-violence-data.csv") # gun violence incidents
acs2015 <- read.csv(file = "acs2015_county_data.csv") # county-level census data
# gvdNEW <- write.csv(gvd, "test.csv")

Data Summary

Three data sets from two sources were used during this analysis.

Gun Violence Data a Comprehensive record of over 260k US gun violence incidents from 2013-2018 (italics), was compiled by James Ko and made available on kaggle.com (footnote). These data contain all recorded gun violence incidents in the US between January 2013 and March 2018, inclusive. These data were imported as gvd.

https://www.kaggle.com/jameslko/gun-violence-data/data = gun_violence_data

US Census Demographic Data Demographic and Economic Data for Tracts and Counties (italics), was collected by the US Census Bureau and made available on kaggle.com (footnote). These data contain all census data from the 2015 federal census. These data were split into two data sets, the first being county data, and the second being tract data (The “Census Tract” is an area roughly equivalent to a neighborhood established by the Bureau of Census for analyzing populations. They generally encompass a population between 2,500 to 8,000 people).

https://www.kaggle.com/zimeiyang/2015-us-census-demographic-data/data = acs_2015

Understanding the Imported Data

Summarise the types of variables and data structures, check the attributes in the data. In addition to the R codes and outputs, explain briefly the steps that you have taken. In this section, show that you have fulfilled minimum requirements 2-4.

# Size of each data frame as (rows, columns).
dim(gvd)
## [1] 239677     29
dim(acs2015)
## [1] 3220   37
# head(gvd, n = 8)
# head(acs2015, n = 8)
names(gvd) # Column names of the gun violence data
##  [1] "incident_id"                 "date"                       
##  [3] "state"                       "city_or_county"             
##  [5] "address"                     "n_killed"                   
##  [7] "n_injured"                   "incident_url"               
##  [9] "source_url"                  "incident_url_fields_missing"
## [11] "congressional_district"      "gun_stolen"                 
## [13] "gun_type"                    "incident_characteristics"   
## [15] "latitude"                    "location_description"       
## [17] "longitude"                   "n_guns_involved"            
## [19] "notes"                       "participant_age"            
## [21] "participant_age_group"       "participant_gender"         
## [23] "participant_name"            "participant_relationship"   
## [25] "participant_status"          "participant_type"           
## [27] "sources"                     "state_house_district"       
## [29] "state_senate_district"
names(acs2015) # Column names of the county-level census data
##  [1] "CensusId"        "State"           "County"         
##  [4] "TotalPop"        "Men"             "Women"          
##  [7] "Hispanic"        "White"           "Black"          
## [10] "Native"          "Asian"           "Pacific"        
## [13] "Citizen"         "Income"          "IncomeErr"      
## [16] "IncomePerCap"    "IncomePerCapErr" "Poverty"        
## [19] "ChildPoverty"    "Professional"    "Service"        
## [22] "Office"          "Construction"    "Production"     
## [25] "Drive"           "Carpool"         "Transit"        
## [28] "Walk"            "OtherTransp"     "WorkAtHome"     
## [31] "MeanCommute"     "Employed"        "PrivateWork"    
## [34] "PublicWork"      "SelfEmployed"    "FamilyWork"     
## [37] "Unemployment"
vapply(gvd, class, character(1)) # Class of every column in 'gvd'
##                 incident_id                        date 
##                   "integer"                    "factor" 
##                       state              city_or_county 
##                    "factor"                    "factor" 
##                     address                    n_killed 
##                    "factor"                   "integer" 
##                   n_injured                incident_url 
##                   "integer"                    "factor" 
##                  source_url incident_url_fields_missing 
##                    "factor"                    "factor" 
##      congressional_district                  gun_stolen 
##                   "integer"                    "factor" 
##                    gun_type    incident_characteristics 
##                    "factor"                    "factor" 
##                    latitude        location_description 
##                   "numeric"                    "factor" 
##                   longitude             n_guns_involved 
##                   "numeric"                   "integer" 
##                       notes             participant_age 
##                    "factor"                    "factor" 
##       participant_age_group          participant_gender 
##                    "factor"                    "factor" 
##            participant_name    participant_relationship 
##                    "factor"                    "factor" 
##          participant_status            participant_type 
##                    "factor"                    "factor" 
##                     sources        state_house_district 
##                    "factor"                   "integer" 
##       state_senate_district 
##                   "integer"
vapply(acs2015, class, character(1)) # Class of every column in 'acs2015'
##        CensusId           State          County        TotalPop 
##       "integer"        "factor"        "factor"       "integer" 
##             Men           Women        Hispanic           White 
##       "integer"       "integer"       "numeric"       "numeric" 
##           Black          Native           Asian         Pacific 
##       "numeric"       "numeric"       "numeric"       "numeric" 
##         Citizen          Income       IncomeErr    IncomePerCap 
##       "integer"       "numeric"       "numeric"       "integer" 
## IncomePerCapErr         Poverty    ChildPoverty    Professional 
##       "integer"       "numeric"       "numeric"       "numeric" 
##         Service          Office    Construction      Production 
##       "numeric"       "numeric"       "numeric"       "numeric" 
##           Drive         Carpool         Transit            Walk 
##       "numeric"       "numeric"       "numeric"       "numeric" 
##     OtherTransp      WorkAtHome     MeanCommute        Employed 
##       "numeric"       "numeric"       "numeric"       "integer" 
##     PrivateWork      PublicWork    SelfEmployed      FamilyWork 
##       "numeric"       "numeric"       "numeric"       "numeric" 
##    Unemployment 
##       "numeric"

The Gun Violence Dataset contains data from January 2013 to March 2018, while the American Census Data is from 2015. We will therefore create a data subset of the Gun Violence Data from the year 2015 only.

str(gvd$date)
##  Factor w/ 1725 levels "2013-01-01","2013-01-05",..: 1 1 1 2 3 3 4 5 5 6 ...
# Reduce each YYYY-MM-DD incident date to its four-digit year, stored as
# character. The column is parsed explicitly with as.Date() and formatted via
# the format() generic, rather than calling the format.Date() method directly
# on a non-Date column; as.character() makes this work whether read.csv()
# produced a factor or a character vector.
gvd$date <- format(as.Date(as.character(gvd$date), format = "%Y-%m-%d"), "%Y")
str(gvd$date)
##  chr [1:239677] "2013" "2013" "2013" "2013" "2013" "2013" "2013" ...
gvd2015 <- filter(gvd, date == "2015") # Filtering Gun Violence Data for 2015 only
str(gvd2015$date) # Confirming only 2015 data is present
##  chr [1:53579] "2015" "2015" "2015" "2015" "2015" "2015" "2015" "2015" ...
Comment - drop columns first then join then clean… OR clean then join…

For the following analysis, we are only interested in economic indicators.

dim(gvd2015)
## [1] 53579    29
# Retain the location columns plus the population and economic indicators.
acs2015_keep <- acs2015 %>%
  select(State, County, TotalPop, Income, IncomePerCap, Poverty, Unemployment)

Checking the data types

vapply(acs2015_keep, class, character(1)) # Class of every retained column
##        State       County     TotalPop       Income IncomePerCap 
##     "factor"     "factor"    "integer"    "numeric"    "integer" 
##      Poverty Unemployment 
##    "numeric"    "numeric"
# Per-column summary of the retained census columns; the printed output below
# reports one missing value (NA) in Income.
summary(acs2015_keep)
##       State             County        TotalPop            Income      
##  Texas   : 254   Washington:  31   Min.   :      85   Min.   : 10499  
##  Georgia : 159   Jefferson :  26   1st Qu.:   11218   1st Qu.: 38192  
##  Virginia: 133   Franklin  :  25   Median :   26035   Median : 44749  
##  Kentucky: 120   Jackson   :  24   Mean   :   99409   Mean   : 46130  
##  Missouri: 115   Lincoln   :  24   3rd Qu.:   66430   3rd Qu.: 52074  
##  Kansas  : 105   Madison   :  20   Max.   :10038388   Max.   :123453  
##  (Other) :2334   (Other)   :3070                      NA's   :1       
##   IncomePerCap      Poverty       Unemployment   
##  Min.   : 5878   Min.   : 1.40   Min.   : 0.000  
##  1st Qu.:20238   1st Qu.:12.10   1st Qu.: 5.500  
##  Median :23460   Median :16.15   Median : 7.600  
##  Mean   :23982   Mean   :17.49   Mean   : 8.094  
##  3rd Qu.:27053   3rd Qu.:20.70   3rd Qu.: 9.900  
##  Max.   :65600   Max.   :64.20   Max.   :36.500  
## 
# Build a "State_County" key column to join on.
acs2015_keep$SC_concat <- str_c(
  acs2015_keep$State, acs2015_keep$County,
  sep = "_"
)

Will use concatenated value to join…

2015 First!!!

# Keep only the incident identifiers, location fields and count columns.
gvd2015_keep <- gvd2015 %>%
  select(
    incident_id, date, state, city_or_county, n_killed, n_injured,
    congressional_district, latitude, longitude, n_guns_involved,
    state_house_district, state_senate_district
  )

Merge …

# Build the matching key on the incident data from state and city_or_county.
# NOTE(review): city_or_county presumably holds city names as well as county
# names, so keys such as "California_Oakland" will have no census counterpart
# -- confirm against the data.
gvd2015_keep$SC_concat <- str_c(
  gvd2015_keep$state, gvd2015_keep$city_or_county,
  sep = "_"
)
# Keep every incident; census columns are NA where no key matches.
join1 <- right_join(acs2015_keep, gvd2015_keep, by = "SC_concat")
# Spot-check the join for Oakland, California. The incident-side column is
# used because the census-side County is NA on unmatched rows, so filtering on
# County == "Oakland" could never return the rows being inspected.
check <- filter(join1, state == "California")
check2 <- filter(check, city_or_county == "Oakland")

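How could census data be missing? Justifiably, there may not be gun incidents in every county, but there is more census information than gun violence information. A likely cause is that the city_or_county field in the gun violence data often holds a city name (e.g. Oakland), which never matches the County field in the county-level census data.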

# Read the tract-level census data. The county-level table `acs2015` was
# already loaded during the initial import and has not been modified since,
# so it is not read from disk a second time.
acsB2015 <- read.csv("acs2015_census_tract_data.csv")
colnames(acsB2015)
##  [1] "CensusTract"     "State"           "County"         
##  [4] "TotalPop"        "Men"             "Women"          
##  [7] "Hispanic"        "White"           "Black"          
## [10] "Native"          "Asian"           "Pacific"        
## [13] "Citizen"         "Income"          "IncomeErr"      
## [16] "IncomePerCap"    "IncomePerCapErr" "Poverty"        
## [19] "ChildPoverty"    "Professional"    "Service"        
## [22] "Office"          "Construction"    "Production"     
## [25] "Drive"           "Carpool"         "Transit"        
## [28] "Walk"            "OtherTransp"     "WorkAtHome"     
## [31] "MeanCommute"     "Employed"        "PrivateWork"    
## [34] "PublicWork"      "SelfEmployed"    "FamilyWork"     
## [37] "Unemployment"
names(acs2015) # County-level column names, for comparison with the tract data
##  [1] "CensusId"        "State"           "County"         
##  [4] "TotalPop"        "Men"             "Women"          
##  [7] "Hispanic"        "White"           "Black"          
## [10] "Native"          "Asian"           "Pacific"        
## [13] "Citizen"         "Income"          "IncomeErr"      
## [16] "IncomePerCap"    "IncomePerCapErr" "Poverty"        
## [19] "ChildPoverty"    "Professional"    "Service"        
## [22] "Office"          "Construction"    "Production"     
## [25] "Drive"           "Carpool"         "Transit"        
## [28] "Walk"            "OtherTransp"     "WorkAtHome"     
## [31] "MeanCommute"     "Employed"        "PrivateWork"    
## [34] "PublicWork"      "SelfEmployed"    "FamilyWork"     
## [37] "Unemployment"
# The two tables differ only in the name of their first (identifier) column.
# Rename that single column by name instead of overwriting all 37 names by
# position, which would silently mislabel columns if the order ever differed.
names(acsB2015)[names(acsB2015) == "CensusTract"] <- "CensusId"
colnames(acsB2015)
##  [1] "CensusId"        "State"           "County"         
##  [4] "TotalPop"        "Men"             "Women"          
##  [7] "Hispanic"        "White"           "Black"          
## [10] "Native"          "Asian"           "Pacific"        
## [13] "Citizen"         "Income"          "IncomeErr"      
## [16] "IncomePerCap"    "IncomePerCapErr" "Poverty"        
## [19] "ChildPoverty"    "Professional"    "Service"        
## [22] "Office"          "Construction"    "Production"     
## [25] "Drive"           "Carpool"         "Transit"        
## [28] "Walk"            "OtherTransp"     "WorkAtHome"     
## [31] "MeanCommute"     "Employed"        "PrivateWork"    
## [34] "PublicWork"      "SelfEmployed"    "FamilyWork"     
## [37] "Unemployment"
# Stack the county-level and tract-level rows into one table. bind_rows() is
# the row-stacking verb; union() is a set operation that also de-duplicates
# rows, which is not the intent here.
acs_all <- bind_rows(acs2015, acsB2015)
acs_all_keep <- acs_all %>%
  select(State, County, TotalPop, Income, IncomePerCap, Poverty, Unemployment)
# Same "State_County" key as used on the incident data.
acs_all_keep$SC_concat <- str_c(
  acs_all_keep$State, acs_all_keep$County,
  sep = "_"
)
# NOTE(review): acs_all_keep holds one county-level row plus (presumably many)
# tract-level rows per State/County pair, so SC_concat is not unique on the
# census side and join3/join4 will repeat each matched incident once per
# census row -- confirm this fan-out is intended.
# Incidents whose key exists in the census table (no census columns added).
join2 <- semi_join(gvd2015_keep, acs_all_keep, by = "SC_concat")
# All census rows, with incident columns attached where the key matches.
join3 <- left_join(acs_all_keep, gvd2015_keep, by = "SC_concat")
# Drop census rows that matched no incident.
check_join3 <- join3 %>% drop_na(incident_id)
# Only key values present in both tables.
join4 <- inner_join(acs_all_keep, gvd2015_keep, by = "SC_concat")