Gun violence incident data and US census demographic data were used in this report. These data were imported as gvd and acs2015, respectively.
#The following packages were used whilst completing the report (with annotation)
library(readr) # Used to import .csv files
library(dplyr) # Section Y
library(stringr) # Section Z
library(knitr) # Section X
library(tidyr)
#xlsx #readxl #foreign #gdata #rvest #dplyr #deductive #validate #Hmisc #stringr #lubridate #outliers
#MVN #infotheo #MASS #caret #MLR #ggplot #base R functions
Although the readr package is loaded, base R's read.csv() function was used to import the data, as shown below. The data were imported as gvd for Gun Violence Data and acs2015 for American (US) Census Data.
# Read the two source CSV files from the working directory.
# gvd     : gun violence incident records
# acs2015 : county-level 2015 census data
gvd <- read.csv(file = "gun-violence-data.csv")
acs2015 <- read.csv(file = "acs2015_county_data.csv")
# Disabled: export of gvd to a scratch CSV
#gvdNEW <- write.csv(gvd, "test.csv")
Three data sets from two sources were used during this analysis.
Gun Violence Data: a comprehensive record of over 260k US gun violence incidents from 2013-2018 (italics) was compiled by James Ko and made available on kaggle.com (footnote). These data contain all recorded gun violence incidents in the US between January 2013 and March 2018, inclusive. These data were imported as gvd.
https://www.kaggle.com/jameslko/gun-violence-data/data = gun_violence_data
US Census Demographic Data: Demographic and Economic Data for Tracts and Counties (italics) was collected by the US Census Bureau and made available on kaggle.com (footnote). These data contain all census data from the 2015 federal census. These data were split into two data sets, the first being county data, and the second being tract data (the “Census Tract” is an area roughly equivalent to a neighborhood, established by the Bureau of Census for analyzing populations; tracts generally encompass a population of between 2,500 and 8,000 people).
https://www.kaggle.com/zimeiyang/2015-us-census-demographic-data/data = acs_2015
Summarise the types of variables and data structures, check the attributes in the data. In addition to the R codes and outputs, explain briefly the steps that you have taken. In this section, show that you have fulfilled minimum requirements 2-4.
# Size of each imported data frame: rows, then columns
dim(gvd)
## [1] 239677 29
dim(acs2015)
## [1] 3220 37
# Optional preview of the first eight rows (disabled)
#head(gvd, n=8)
#head(acs2015, n=8)
names(gvd) # Column names of the gun violence data
## [1] "incident_id" "date"
## [3] "state" "city_or_county"
## [5] "address" "n_killed"
## [7] "n_injured" "incident_url"
## [9] "source_url" "incident_url_fields_missing"
## [11] "congressional_district" "gun_stolen"
## [13] "gun_type" "incident_characteristics"
## [15] "latitude" "location_description"
## [17] "longitude" "n_guns_involved"
## [19] "notes" "participant_age"
## [21] "participant_age_group" "participant_gender"
## [23] "participant_name" "participant_relationship"
## [25] "participant_status" "participant_type"
## [27] "sources" "state_house_district"
## [29] "state_senate_district"
names(acs2015) # Column names of the county-level census data
## [1] "CensusId" "State" "County"
## [4] "TotalPop" "Men" "Women"
## [7] "Hispanic" "White" "Black"
## [10] "Native" "Asian" "Pacific"
## [13] "Citizen" "Income" "IncomeErr"
## [16] "IncomePerCap" "IncomePerCapErr" "Poverty"
## [19] "ChildPoverty" "Professional" "Service"
## [22] "Office" "Construction" "Production"
## [25] "Drive" "Carpool" "Transit"
## [28] "Walk" "OtherTransp" "WorkAtHome"
## [31] "MeanCommute" "Employed" "PrivateWork"
## [34] "PublicWork" "SelfEmployed" "FamilyWork"
## [37] "Unemployment"
# Checking the data types for 'gvd'.
# vapply() always returns a named character vector; sapply() would fall back
# to a list if any column carried more than one class. Multiple classes are
# collapsed into a single "a/b" string so no information is lost.
vapply(gvd, function(col) paste(class(col), collapse = "/"), character(1))
## incident_id date
## "integer" "factor"
## state city_or_county
## "factor" "factor"
## address n_killed
## "factor" "integer"
## n_injured incident_url
## "integer" "factor"
## source_url incident_url_fields_missing
## "factor" "factor"
## congressional_district gun_stolen
## "integer" "factor"
## gun_type incident_characteristics
## "factor" "factor"
## latitude location_description
## "numeric" "factor"
## longitude n_guns_involved
## "numeric" "integer"
## notes participant_age
## "factor" "factor"
## participant_age_group participant_gender
## "factor" "factor"
## participant_name participant_relationship
## "factor" "factor"
## participant_status participant_type
## "factor" "factor"
## sources state_house_district
## "factor" "integer"
## state_senate_district
## "integer"
# Checking the data types for 'acs2015'.
# vapply() always returns a named character vector; sapply() would fall back
# to a list if any column carried more than one class. Multiple classes are
# collapsed into a single "a/b" string so no information is lost.
vapply(acs2015, function(col) paste(class(col), collapse = "/"), character(1))
## CensusId State County TotalPop
## "integer" "factor" "factor" "integer"
## Men Women Hispanic White
## "integer" "integer" "numeric" "numeric"
## Black Native Asian Pacific
## "numeric" "numeric" "numeric" "numeric"
## Citizen Income IncomeErr IncomePerCap
## "integer" "numeric" "numeric" "integer"
## IncomePerCapErr Poverty ChildPoverty Professional
## "integer" "numeric" "numeric" "numeric"
## Service Office Construction Production
## "numeric" "numeric" "numeric" "numeric"
## Drive Carpool Transit Walk
## "numeric" "numeric" "numeric" "numeric"
## OtherTransp WorkAtHome MeanCommute Employed
## "numeric" "numeric" "numeric" "integer"
## PrivateWork PublicWork SelfEmployed FamilyWork
## "numeric" "numeric" "numeric" "numeric"
## Unemployment
## "numeric"
The Gun Violence Dataset contains data from January 2013 to March 2018, while the American Census Data is from 2015. We will therefore create a subset of the Gun Violence Data containing incidents from the year 2015 only.
# Reduce the incident dates to their year and keep only the 2015 incidents.
str(gvd$date)
## Factor w/ 1725 levels "2013-01-01","2013-01-05",..: 1 1 1 2 3 3 4 5 5 6 ...
# Converting YYYY-MM-DD to YYYY. The values are parsed to Date explicitly and
# formatted through the format() generic; calling the format.Date() method
# directly on a factor/character column relies on implicit coercion.
gvd$date <- format(as.Date(gvd$date, format = "%Y-%m-%d"), "%Y")
str(gvd$date)
## chr [1:239677] "2013" "2013" "2013" "2013" "2013" "2013" "2013" ...
gvd2015 <- filter(gvd, date == "2015") # Filtering Gun Violence Data for 2015 only
str(gvd2015$date) # Confirming only 2015 data is present
## chr [1:53579] "2015" "2015" "2015" "2015" "2015" "2015" "2015" "2015" ...
# Keep only the incident fields needed for the comparison with census data.
gvd2015_keep <- select(
  gvd2015, incident_id, date, state, city_or_county, n_killed, n_injured,
  congressional_district, latitude, longitude, n_guns_involved,
  state_house_district, state_senate_district
) #Dplyr
# Join key of the form "<state>_<city_or_county>".
gvd2015_keep$SC_concat <- str_c(
  gvd2015_keep$state, gvd2015_keep$city_or_county,
  sep = "_"
)
# `acs2015_keep` was used by the join below without ever being created.
# Build it from the county table with the economic indicator columns and a
# "<State>_<County>" key in the same format as the incident key.
acs2015_keep <- select(
  acs2015, State, County, TotalPop, Income, IncomePerCap, Poverty, Unemployment
)
acs2015_keep$SC_concat <- str_c(
  acs2015_keep$State, acs2015_keep$County,
  sep = "_"
)
# right_join keeps every incident; census columns are NA where no key matches.
join1 <- right_join(acs2015_keep, gvd2015_keep, by = "SC_concat")
check <- filter(join1, state == "California")
# NOTE(review): `County` comes from the census side of the join. Incidents
# whose `city_or_county` holds a city name will presumably have no matching
# census key, so this filter may return no rows - confirm whether
# `city_or_county == "Oakland"` was intended.
check2 <- filter(check, County == "Oakland")
# Read the tract-level census file, and re-read the county-level file so both
# tables are freshly loaded before they are compared.
acsB2015 <- read.csv(file = "acs2015_census_tract_data.csv")
acs2015 <- read.csv(file = "acs2015_county_data.csv")
names(acsB2015) # Column names of the tract-level data
## [1] "CensusTract" "State" "County"
## [4] "TotalPop" "Men" "Women"
## [7] "Hispanic" "White" "Black"
## [10] "Native" "Asian" "Pacific"
## [13] "Citizen" "Income" "IncomeErr"
## [16] "IncomePerCap" "IncomePerCapErr" "Poverty"
## [19] "ChildPoverty" "Professional" "Service"
## [22] "Office" "Construction" "Production"
## [25] "Drive" "Carpool" "Transit"
## [28] "Walk" "OtherTransp" "WorkAtHome"
## [31] "MeanCommute" "Employed" "PrivateWork"
## [34] "PublicWork" "SelfEmployed" "FamilyWork"
## [37] "Unemployment"
names(acs2015) # Column names of the county-level data, for comparison
## [1] "CensusId" "State" "County"
## [4] "TotalPop" "Men" "Women"
## [7] "Hispanic" "White" "Black"
## [10] "Native" "Asian" "Pacific"
## [13] "Citizen" "Income" "IncomeErr"
## [16] "IncomePerCap" "IncomePerCapErr" "Poverty"
## [19] "ChildPoverty" "Professional" "Service"
## [22] "Office" "Construction" "Production"
## [25] "Drive" "Carpool" "Transit"
## [28] "Walk" "OtherTransp" "WorkAtHome"
## [31] "MeanCommute" "Employed" "PrivateWork"
## [34] "PublicWork" "SelfEmployed" "FamilyWork"
## [37] "Unemployment"
# Give the tract table the same identifier column name as the county table so
# the two can be stacked. Only the differing column is renamed, by name;
# overwriting all names by position would silently mislabel columns if the
# column order of the two files ever differed.
names(acsB2015)[names(acsB2015) == "CensusTract"] <- "CensusId"
# Fail loudly if the two tables still do not line up column-for-column.
stopifnot(identical(names(acsB2015), names(acs2015)))
colnames(acsB2015)
## [1] "CensusId" "State" "County"
## [4] "TotalPop" "Men" "Women"
## [7] "Hispanic" "White" "Black"
## [10] "Native" "Asian" "Pacific"
## [13] "Citizen" "Income" "IncomeErr"
## [16] "IncomePerCap" "IncomePerCapErr" "Poverty"
## [19] "ChildPoverty" "Professional" "Service"
## [22] "Office" "Construction" "Production"
## [25] "Drive" "Carpool" "Transit"
## [28] "Walk" "OtherTransp" "WorkAtHome"
## [31] "MeanCommute" "Employed" "PrivateWork"
## [34] "PublicWork" "SelfEmployed" "FamilyWork"
## [37] "Unemployment"
# Combine the county-level and tract-level census tables into one table.
acs_all <- union(acs2015, acsB2015)
# Keep the economic indicators plus the State/County fields used for the key.
acs_all_keep <- acs_all %>%
  select(State, County, TotalPop, Income, IncomePerCap, Poverty, Unemployment) #Dplyr
# Join key of the form "<State>_<County>".
# NOTE(review): tract rows and county rows share State/County values, so this
# key is presumably not unique in acs_all_keep and the left/inner joins below
# may repeat each incident once per matching census row - confirm.
acs_all_keep$SC_concat <- str_c(
  acs_all_keep$State, acs_all_keep$County,
  sep = "_"
)
# Incidents whose key appears in the census table (incident columns only).
join2 <- gvd2015_keep %>% semi_join(acs_all_keep, by = "SC_concat")
# All census rows, with incident columns attached where the key matches.
join3 <- acs_all_keep %>% left_join(gvd2015_keep, by = "SC_concat")
# Census rows from join3 that actually matched an incident.
check_join3 <- drop_na(join3, incident_id)
# Only the rows whose key is present in both tables.
join4 <- acs_all_keep %>% inner_join(gvd2015_keep, by = "SC_concat")
Comment - drop columns first then join then clean… OR clean then join…
For the following analysis, we are only interested in economic indicators.
Checking the data types
Will use concatenated value to join…