# Attach the packages this script relies on: dplyr supplies the pipe and the
# counting/summarising verbs, ggplot2 supplies the plotting functions.
library(dplyr)
library(ggplot2)

# Folder that holds the csv file. The path is built explicitly instead of
# calling setwd(), so the script does not change the session's working
# directory as a side effect.
data_dir <- "C:/Users/12403/Desktop/MC 2020/MC Fall 2020/DATA110"
# Read csv file into an object in R
df0 <- read.csv(file.path(data_dir, "mappingPoliceViolenceFull.csv"))
# read.csv() already returns a data.frame, so no conversion is needed;
# df1 is the working copy that gets renamed below.
df1 <- df0
# Check structure of dataframe
str(df1)
## 'data.frame': 8427 obs. of 68 variables:
## $ Victim.s.name : Factor w/ 8129 levels "A'Donte Washington",..: 5817 5817 5817 5817 7282 4317 5558 7556 5817 2027 ...
## $ Victim.s.age : Factor w/ 87 levels "1","10","107",..: 87 87 56 14 25 30 41 47 87 10 ...
## $ Victim.s.gender : Factor w/ 5 levels "","Female","Male",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Victim.s.race : Factor w/ 7 levels "Asian","Black",..: 6 6 6 6 6 7 7 7 6 2 ...
## $ URL.of.image.of.victim : Factor w/ 4532 levels "","\"from Mexico\" http://www.elpasotimes.com/news/ci_28023272/el-paso-police-officer-shot-and-killed-burglary",..: 1 1 1 1 1 3898 3900 1 1 3895 ...
## $ Date.of.Incident..month.day.year. : Factor w/ 2641 levels "1/1/2013","1/1/2014",..: 2620 2612 2612 2612 2604 2604 2591 2591 2591 2519 ...
## $ Street.Address.of.Incident : Factor w/ 8240 levels ""," U.S. 287 and Oxford Road ",..: 3320 4186 2195 2540 1947 3590 4537 548 6102 1959 ...
## $ City : Factor w/ 3084 levels "","Abbeville",..: 2866 471 2411 162 2702 1595 1434 221 138 2907 ...
## $ State : Factor w/ 51 levels "AK","AL","AR",..: 46 15 44 23 35 5 48 13 5 8 ...
## $ Zipcode : int 23452 60638 78201 49037 13203 92356 98503 52722 93519 20032 ...
## $ County : Factor w/ 1145 levels "","Acadia","Acadiana",..: 1070 244 88 147 763 902 1021 923 545 306 ...
## $ Agency.responsible.for.death : Factor w/ 3076 levels "","Aberdeen Police Department",..: 2879 480 2385 173 2663 2389 2797 226 1382 684 ...
## $ ORI.Agency.Identifier..if.available. : Factor w/ 3108 levels "","AK0010100",..: 2899 1064 2774 1372 1990 392 2776 942 266 599 ...
## $ Cause.of.death : Factor w/ 31 levels "Asphyxiated",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ A.brief.description.of.the.circumstances.surrounding.the.death : Factor w/ 8347 levels "","\"As the officers approached the vehicle, the suspect exited the vehicle with a weapon drawn and pointed it at "| __truncated__,..: 6647 314 372 3238 4961 3785 4255 1751 1682 787 ...
## $ Official.disposition.of.death..justified.or.other. : Factor w/ 134 levels "","Accidental",..: 119 119 119 119 119 119 119 119 119 119 ...
## $ Criminal.Charges. : Factor w/ 31 levels "Charged with a crime",..: 31 31 31 31 31 31 31 31 31 31 ...
## $ Link.to.news.article.or.photo.of.official.document : Factor w/ 8327 levels "","facebook.com/KilledByPolice/posts/1034785363216267",..: 8252 6798 6278 6742 5939 8003 6110 7409 5886 8279 ...
## $ Symptoms.of.mental.illness. : Factor w/ 8 levels "","Drug or alcohol use",..: 3 3 3 2 3 3 3 3 3 3 ...
## $ Unarmed.Did.Not.Have.an.Actual.Weapon : Factor w/ 4 levels "Allegedly Armed",..: 1 1 1 4 1 1 1 1 1 1 ...
## $ Alleged.Weapon..Source..WaPo.and.Review.of.Cases.Not.Included.in.WaPo.Database. : Factor w/ 170 levels "air conditioner and glass bottle",..: 84 84 84 160 62 62 62 84 62 62 ...
## $ Alleged.Threat.Level..Source..WaPo. : Factor w/ 4 levels "","attack","other",..: 2 2 2 2 1 2 1 2 2 1 ...
## $ Fleeing..Source..WaPo. : Factor w/ 8 levels "","car","Car",..: 7 7 7 3 1 1 1 7 1 5 ...
## $ Body.Camera..Source..WaPo. : Factor w/ 6 levels "","Bystander Video",..: 1 1 1 1 1 1 1 1 1 6 ...
## $ WaPo.ID..If.included.in.WaPo.database. : int NA NA NA NA NA NA NA NA NA NA ...
## $ Off.Duty.Killing. : Factor w/ 4 levels "","off-duty",..: 1 1 2 1 1 1 1 1 1 1 ...
## $ Geography..via.Trulia.methodology.based.on.zipcode.population.density..http...jedkolko.com.wp.content.uploads.2015.05.full.ZCTA.urban.suburban.rural.classification.xlsx..: Factor w/ 5 levels "","Rural","Suburban",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ MPV.ID : int 8448 8446 8447 8449 8444 8445 8441 8442 8443 8440 ...
## $ Fatal.Encounters.ID : int 28737 28734 28735 28736 28726 28732 28723 28724 28725 28719 ...
## $ X : logi NA NA NA NA NA NA ...
## $ X.1 : logi NA NA NA NA NA NA ...
## $ X.2 : logi NA NA NA NA NA NA ...
## $ X.3 : logi NA NA NA NA NA NA ...
## $ X.4 : logi NA NA NA NA NA NA ...
## $ X.5 : logi NA NA NA NA NA NA ...
## $ X.6 : logi NA NA NA NA NA NA ...
## $ X.7 : logi NA NA NA NA NA NA ...
## $ X.8 : logi NA NA NA NA NA NA ...
## $ X.9 : logi NA NA NA NA NA NA ...
## $ X.10 : logi NA NA NA NA NA NA ...
## $ X.11 : logi NA NA NA NA NA NA ...
## $ X.12 : logi NA NA NA NA NA NA ...
## $ X.13 : logi NA NA NA NA NA NA ...
## $ X.14 : logi NA NA NA NA NA NA ...
## $ X.15 : logi NA NA NA NA NA NA ...
## $ X.16 : logi NA NA NA NA NA NA ...
## $ X.17 : logi NA NA NA NA NA NA ...
## $ X.18 : logi NA NA NA NA NA NA ...
## $ X.19 : logi NA NA NA NA NA NA ...
## $ X.20 : logi NA NA NA NA NA NA ...
## $ X.21 : logi NA NA NA NA NA NA ...
## $ X.22 : logi NA NA NA NA NA NA ...
## $ X.23 : logi NA NA NA NA NA NA ...
## $ X.24 : logi NA NA NA NA NA NA ...
## $ X.25 : logi NA NA NA NA NA NA ...
## $ X.26 : logi NA NA NA NA NA NA ...
## $ X.27 : logi NA NA NA NA NA NA ...
## $ X.28 : logi NA NA NA NA NA NA ...
## $ X.29 : logi NA NA NA NA NA NA ...
## $ X.30 : logi NA NA NA NA NA NA ...
## $ X.31 : logi NA NA NA NA NA NA ...
## $ X.32 : logi NA NA NA NA NA NA ...
## $ X.33 : logi NA NA NA NA NA NA ...
## $ X.34 : logi NA NA NA NA NA NA ...
## $ X.35 : logi NA NA NA NA NA NA ...
## $ X.36 : logi NA NA NA NA NA NA ...
## $ X.37 : logi NA NA NA NA NA NA ...
## $ X.38 : logi NA NA NA NA NA NA ...
# Change column names
# Only the first 26 columns are used in this analysis, so only those are
# renamed. All names are syntactic (no spaces or hyphens) so they can be used
# in dplyr/ggplot2 calls without backticks.
new_names <- c(
  "Victim_Name", "Victim_Age", "Victim_Gender", "Victim_Race",
  "URL_Victim_Image", "Date_of_Incident", "Street_Address_of_Incident",
  "City", "State", "Zipcode", "County", "Agency_Responsible_for_Death",
  "ORI_Agency_Identifier", "Cause_of_Death",
  "Brief_description_of_circumstance", "Official_disposition_of_death",
  "Criminal_Charges", "Link_to_Article_or_doc", "Symptoms_of_mental_illness",
  "Unarmed", "Alleged_Weapon_Source", "Alleged_Threat_Level", "Fleeing",
  "Body_Camera", "WaPo_ID", "Off_Duty"
)
# Fail early if the file has fewer columns than expected.
stopifnot(ncol(df1) >= length(new_names))
kept_cols <- seq_along(new_names)
# Rename just the leading columns; assigning 26 names to the whole data frame
# would leave every remaining column with an NA name.
colnames(df1)[kept_cols] <- new_names
# Keep the renamed columns only. This drops the trailing all-NA X* columns and
# also the Geography, MPV.ID and Fatal.Encounters.ID columns, which are not
# used in this analysis.
df2 <- df1[kept_cols]
# Count occurrences for each combination of victim race and armed status.
# Counting directly on the two variables of interest gives the same totals as
# counting on a finer breakdown and summing it back up.
# .groups = "drop_last" keeps the result grouped by Victim_Race (the grouping
# summarise() would otherwise choose implicitly) without the regrouping message.
df3 <- df2 %>%
  group_by(Victim_Race, Unarmed) %>%
  summarise(total = n(), .groups = "drop_last")
df3
## # A tibble: 28 x 3
## # Groups: Victim_Race [7]
## Victim_Race Unarmed total
## <fct> <fct> <int>
## 1 Asian Allegedly Armed 104
## 2 Asian Unarmed/Did Not Have an Actual Weapon 13
## 3 Asian Unclear 9
## 4 Asian Vehicle 4
## 5 Black Allegedly Armed 1437
## 6 Black Unarmed/Did Not Have an Actual Weapon 361
## 7 Black Unclear 159
## 8 Black Vehicle 164
## 9 Hispanic Allegedly Armed 994
## 10 Hispanic Unarmed/Did Not Have an Actual Weapon 213
## # ... with 18 more rows
# Omit NA values: na.omit() drops every row of df3 that has an NA in any
# column. Empty strings ("") are not NA, so they are not removed by this step.
df4 <- na.omit(df3)
# Print the cleaned summary for inspection.
df4
## # A tibble: 28 x 3
## # Groups: Victim_Race [7]
## Victim_Race Unarmed total
## <fct> <fct> <int>
## 1 Asian Allegedly Armed 104
## 2 Asian Unarmed/Did Not Have an Actual Weapon 13
## 3 Asian Unclear 9
## 4 Asian Vehicle 4
## 5 Black Allegedly Armed 1437
## 6 Black Unarmed/Did Not Have an Actual Weapon 361
## 7 Black Unclear 159
## 8 Black Vehicle 164
## 9 Hispanic Allegedly Armed 994
## 10 Hispanic Unarmed/Did Not Have an Actual Weapon 213
## # ... with 18 more rows
# Explore the data with basic bar charts, comparing df1, df2 and df3 to check
# whether the cleaning steps changed the unexpected pattern in the counts.
# df1 and df2 hold one row per incident, so geom_bar() (which counts rows) is
# the right geometry for them.
ggplot(df2) +
  geom_bar(aes(x = Victim_Race, fill = Victim_Race))
# df3 is already summarised (one row per race / armed-status combination), so
# its `total` column is plotted with geom_col(); geom_bar() would count the
# summary rows instead of the victims.
ggplot(df3) +
  geom_col(aes(x = Victim_Race, y = total, fill = Victim_Race))
ggplot(df1) +
  geom_bar(aes(x = Victim_Race, fill = Victim_Race))
ggplot(df1) +
  geom_bar(aes(x = Victim_Gender, fill = Victim_Gender))
# Found something quite interesting that could explain the data
ggplot(df2) +
  geom_bar(aes(x = Body_Camera, fill = Body_Camera))
ggplot(df2) +
  geom_bar(aes(x = Alleged_Threat_Level, fill = Alleged_Threat_Level))
# Final chart: one cluster of bars per armed-status category, with a
# side-by-side (dodged) bar for each victim race.
p1 <- ggplot(df4) +
  geom_col(
    mapping = aes(x = Unarmed, y = total, fill = Victim_Race),
    position = "dodge"
  ) +
  labs(
    title = "Unarmed Cases by Race",
    y = "Count",
    fill = "Victim Race"
  )
p1
#For Project 1, I decided to use the mappingPoliceViolence dataset from the class drive.
#At first I intended to use the Montgomery County dataset, but as advised, this proved to be an extremely large dataset.
#I chose this dataset due to its ease of cleaning, and moderately small size.
#I found difficulty deciding what variables to use and the questions I wanted to find answers to.
#Once I decided to visualize race and gender, I understood how I needed to clean the data.
#First I changed the column names to make the data more concise and legible.
#Then I removed columns 27 through 68, most of which contained only NA values; this removed 42 of the 68 columns in the dataset.
#This did concern me, but as the columns/variables had no other information to display, I could not investigate any further.
#I would however, start to get a feeling for why this was.
#I opted to focus on the variables concerning the victims, their gender, race, armed status(unarmed), and alleged threat level.
#I began by using "count", to give me information about the occurrences of each categorical variable I was interested in.
#This was paired with bar plots to visually aid my analysis.
#I then found issues with the dataset. The data seemed to tell a story completely different from what I expected.
#Throughout the variables, the data appeared to show that white males were overwhelmingly the demographic most affected at the hands of police, and by no small margin.
#Graph after graph, I found the same thing, even when visually inspecting the data.
#This makes me believe that the data is either corrupted, missing, or tampered with. So I then took a look at the data for "Body Camera" and "Alleged_Threat_Level".
#The Body Camera data tells us that in the overwhelming majority of cases there is no use of cameras by the police department, and the rest of the data is not usable, except for the small percentage (<10%) of instances reporting "Yes" for Body camera use.
#The Alleged Threat Level graph tells us that in most instances, the threat level was "Attack", while the other variables are vague (other, undetermined, "NA")
#These last two variables make me further suspicious of underreporting and data tampering. All-in-all, I started out with a dataset that unfortunately does not tell me much.
#I certainly struggled with this particular project, as most of the datasets were primarily categorical data, and I find it easier to work with quantitative data.
#I hope to work with more quantitative data, especially regarding our ongoing topic of policing. I am eager to begin deeper analytical methods.