ArrestMarij <- read.csv("https://raw.githubusercontent.com/SieSiongWong/DATA-607/master/Arrests.csv", header=TRUE, sep=",")
library(dplyr)
library(tidyr)
library(ggplot2)
library(reshape2)
str(ArrestMarij)
## 'data.frame': 5226 obs. of 9 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ released: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 2 2 2 ...
## $ colour : Factor w/ 2 levels "Black","White": 2 1 2 1 1 1 2 2 1 2 ...
## $ year : int 2002 1999 2000 2000 1999 1998 1999 1998 2000 2001 ...
## $ age : int 21 17 24 46 27 16 40 34 23 30 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 2 1 2 2 ...
## $ employed: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
## $ citizen : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ checks : int 3 3 3 1 1 0 0 1 4 3 ...
summary(ArrestMarij)
## X released colour year age
## Min. : 1 No : 892 Black:1288 Min. :1997 Min. :12.00
## 1st Qu.:1307 Yes:4334 White:3938 1st Qu.:1998 1st Qu.:18.00
## Median :2614 Median :2000 Median :21.00
## Mean :2614 Mean :2000 Mean :23.85
## 3rd Qu.:3920 3rd Qu.:2001 3rd Qu.:27.00
## Max. :5226 Max. :2002 Max. :66.00
## sex employed citizen checks
## Female: 443 No :1115 No : 771 Min. :0.000
## Male :4783 Yes:4111 Yes:4455 1st Qu.:0.000
## Median :1.000
## Mean :1.636
## 3rd Qu.:3.000
## Max. :6.000
## Histogram of age:- Figure 1
hist(ArrestMarij$age, main="Total Marijuana Possession Arrests at Each Age", xlab="Age", ylab="Total Arrests", ylim=c(0,2500), col=rainbow(30))
## Create age group.
labs <- c(paste(seq(0, 95, by=5), seq(0+5-1, 100-1, by=5), sep="-"), paste(100, "+", sep=""))
ArrestMarij$AgeGroup <- cut(ArrestMarij$age, breaks = c(seq(0, 100, by = 5), Inf), labels = labs, right = FALSE)
## Group by age group and then count the arrests.
AgeGroup_tbl <- ArrestMarij %>% group_by(AgeGroup) %>% summarize(
TotalMarijuanaArrests=length(colour), TotalPreviousArrests=sum(checks), TotalArrests=TotalMarijuanaArrests+TotalPreviousArrests)
## Display the top 5 Age Group for Marijuana Possession Arrests:- Table 1
head(arrange(AgeGroup_tbl, desc(TotalMarijuanaArrests)), n=5)
## # A tibble: 5 x 4
## AgeGroup TotalMarijuanaArrests TotalPreviousArrests TotalArrests
## <fct> <int> <int> <int>
## 1 15-19 1901 2713 4614
## 2 20-24 1526 2542 4068
## 3 25-29 615 1060 1675
## 4 30-34 411 798 1209
## 5 35-39 320 679 999
## GGPlot Histogram for Age Group vs. TotalMarijuanaArrests:- Figure 2
ggplot(AgeGroup_tbl, aes(x=AgeGroup, y=TotalMarijuanaArrests)) + geom_histogram(stat="identity", fill="hotpink") + xlab("Age Group") + ylab("Total Arrests") + ggtitle("Age Group vs. Total Marijuana Possession Arrests") + theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=11, face="bold"),
axis.title.y = element_text(size=11, face="bold"), axis.text.x = element_text(face="bold"), axis.text.y = element_text(face="bold")) + ylim(0,2000)
## GGPlot Histogram for Age Group vs. Previous Arrests:- Figure 3
ggplot(AgeGroup_tbl, aes(x=AgeGroup, y=TotalPreviousArrests)) + geom_histogram(stat="identity", fill="hotpink") + xlab("Age Group") + ylab("Previous Arrests") + ggtitle("Age Group vs. Previous Arrests") + theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=11, face="bold"),
axis.title.y = element_text(size=11, face="bold"), axis.text.x = element_text(face="bold"), axis.text.y = element_text(face="bold")) + ylim(0,3000)
## Scatter Plot of marijuana arrests vs. previous arrests: - Figure 4
AgeMarijArrests_tbl <- ArrestMarij %>% group_by(age) %>% summarize(MarijArrests=length(colour))
AgePreviousArrests_tbl <- ArrestMarij %>% group_by(age) %>% summarize(PreviousArrests=sum(checks))
ggplot(NULL) + geom_point(data=AgeMarijArrests_tbl, aes(x=age, y=MarijArrests, color="Marijuana Arrests")) + ggtitle("Marijuana Possession Arrests vs. Previous Arrests") + xlab("Age") + ylab("Total Arrests") + geom_line(data=AgePreviousArrests_tbl, aes(x=age, y=PreviousArrests, color="Previous Arrests")) + xlab("Age") + ylab("Total Arrests") + theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=11, face="bold"),
axis.title.y = element_text(size=11, face="bold"), axis.text.x = element_text(face="bold"), axis.text.y = element_text(face="bold")) + scale_fill_manual(values=c("Marijuana Arrests"="red", "Previous Arrests"="darkblue")) + theme(legend.title=element_blank())
## Bar plot of white vs. black:- Table 2 & Figure 5
WhiteCount <- length(which(ArrestMarij$colour=="White"))
BlackCount <- length(which(ArrestMarij$colour=="Black"))
BlackWhiteTable <- matrix(c(WhiteCount,BlackCount),ncol=2,nrow=1,byrow=TRUE)
dimnames(BlackWhiteTable) = list(c("Total of Arrests"), c("White", "Black"))
barplot(BlackWhiteTable, beside=T,las=0.5, cex.names=1.2,cex.axis=1.0, ylab="Total of Arrests", axes=FALSE, ylim=c(0,4500), cex.lab=1.1, main="White vs. Black", col=c("blue","red"))
axis(2, at = seq(0, 4500, 500), las = 0.9)
BlackWhiteTable
## White Black
## Total of Arrests 3938 1288
## Scatter Plot of year vs. black & white:- Table 3 & Figure 6
YearWhiteBlackCount_tbl <- ArrestMarij %>% group_by(year) %>% summarize(WhiteArrests=length(which(colour=="White")), BlackArrests=length(which(colour=="Black")))
names(YearWhiteBlackCount_tbl) <- sub("^year$", "Year", names(YearWhiteBlackCount_tbl))
YearWhiteBlackCount_tbl
## # A tibble: 6 x 3
## Year WhiteArrests BlackArrests
## <int> <int> <int>
## 1 1997 369 123
## 2 1998 637 240
## 3 1999 849 250
## 4 2000 991 279
## 5 2001 900 311
## 6 2002 192 85
YearWhiteBlackCount_tbl <- transform(YearWhiteBlackCount_tbl, Year = as.Date(as.character(Year), "%Y"))
plot(YearWhiteBlackCount_tbl$WhiteArrests~as.Date(YearWhiteBlackCount_tbl$Year, "%y"),type="b", col="red", lty=2, main="Year Vs White & Black Arrests", xlab="Year", ylab="Arrests", ylim=c(0,1200), pch=19)
lines(YearWhiteBlackCount_tbl$BlackArrests~as.Date(YearWhiteBlackCount_tbl$Year, "%y"), type="b", col="green", pch=18)
legend("topright", lty=c(2,1), legend=c("White", "Black"), col=c("red", "green"), cex=0.8, text.font=4)
## Scatter plot of released vs. previous records:- Figure 7
ReleasedCustodyRecord_tbl <- ArrestMarij %>% group_by(checks) %>% summarize("Released"=length(which(released=="Yes")), "Custody"=length(which(released=="No")))
names(ReleasedCustodyRecord_tbl) <- sub("^checks$", "PreviousArrests", names(ReleasedCustodyRecord_tbl))
ggplot(NULL) + geom_line(data=ReleasedCustodyRecord_tbl, aes(x=PreviousArrests, y=Released, color="red")) + ggtitle("Released vs Previous Records") + xlab("Previous Records") + ylab("Total Released") + theme(plot.title = element_text(hjust=0.5), axis.title.x = element_text(size=11, face="bold"),
axis.title.y = element_text(size=11, face="bold"), axis.text.x = element_text(face="bold"), axis.text.y = element_text(face="bold")) + theme(legend.position = "none")
## Group by released and then count the white and black arrests.
ReleasedSummonsRace_tbl <- ArrestMarij %>% group_by(released) %>% summarize(White=length(which(colour=="White")), Black=length(which(colour=="Black")))
## Create into a table.
ReleasedSummonsRace_tbl2 <- matrix(c(ReleasedSummonsRace_tbl$White, ReleasedSummonsRace_tbl$Black), ncol=2)
colnames(ReleasedSummonsRace_tbl2) <- c("White", "Black")
rownames(ReleasedSummonsRace_tbl2) <- c("Released", "Custody")
ReleasedSummonsRace_tbl2 <- as.table(ReleasedSummonsRace_tbl2)
## Conditional Distribution: Table 4 and Figure 8
round(prop.table(ReleasedSummonsRace_tbl2,2)*100, digits=2)
## White Black
## Released 14.20 25.85
## Custody 85.80 74.15
barplot(prop.table(ReleasedSummonsRace_tbl2,2)*100, beside=T, ylab="%", ylim=c(0,100), main=" Released or Custody by Race", col=c("mediumspringgreen", "lightslateblue"))
legend("center", legend = c("Released", "Custody"), fill = c("mediumspringgreen", "lightslateblue"),cex=0.85)
## Group by released and then count the gender.
ReleasedSummonsGender_tbl <- ArrestMarij %>% group_by(released) %>% summarize(Male=length(which(sex=="Male")), Female=length(which(sex=="Female")))
## Create into a table.
ReleasedSummonsGender_tbl2 <- matrix(c(ReleasedSummonsGender_tbl$Male, ReleasedSummonsGender_tbl$Female), ncol=2)
colnames(ReleasedSummonsGender_tbl2) <- c("Male", "Female")
rownames(ReleasedSummonsGender_tbl2) <- c("Custody", "Released")
ReleasedSummonsGender_tbl2 <- as.table(ReleasedSummonsGender_tbl2)
## Conditional Distribution: Table 5 and Figure 9
round(prop.table(ReleasedSummonsGender_tbl2,2)*100, digits=2)
## Male Female
## Custody 17.33 14.22
## Released 82.67 85.78
barplot(prop.table(ReleasedSummonsGender_tbl2,2)*100, beside=T, ylab="%", main=" Released or Custody by Gender", ylim=c(0,100), col=c("hotpink", "lightgreen"))
legend("center", legend = c("Custody", "Released"), fill = c("hotpink", "lightgreen"),cex=0.85)
## Group by released and then count the employment status.
ReleasedSummonsEmploy_tbl <- ArrestMarij %>% group_by(released) %>% summarize(Employed=length(which(employed=="Yes")), Unemployed=length(which(employed=="No")))
## Create into a table.
ReleasedSummonsEmploy_tbl2 <- matrix(c(ReleasedSummonsEmploy_tbl$Employed, ReleasedSummonsEmploy_tbl$Unemployed), ncol=2)
colnames(ReleasedSummonsEmploy_tbl2) <- c("Employed", "Unemployed")
rownames(ReleasedSummonsEmploy_tbl2) <- c("Custody", "Released")
ReleasedSummonsEmploy_tbl2 <- as.table(ReleasedSummonsEmploy_tbl2)
## Conditional Distribution: Table 6 and Figure 10
round(prop.table(ReleasedSummonsEmploy_tbl2,2)*100, digits=2)
## Employed Unemployed
## Custody 13.21 31.30
## Released 86.79 68.70
barplot(prop.table(ReleasedSummonsEmploy_tbl2,2)*100, beside=T, ylab="%", main=" Released or Custody by Employment Status", ylim=c(0,100), col=c("deepskyblue", "deeppink"))
legend("center", legend = c("Custody", "Released"), fill = c("deepskyblue", "deeppink"),cex=0.85)
## Group by released and then count the citizenship status.
ReleasedSummonsCitiz_tbl <- ArrestMarij %>% group_by(released) %>% summarize(Citizen=length(which(citizen=="Yes")), NonCitizen=length(which(citizen=="No")))
## Create into a table.
ReleasedSummonsCitiz_tbl2 <- matrix(c(ReleasedSummonsCitiz_tbl$Citizen, ReleasedSummonsCitiz_tbl$NonCitizen), ncol=2)
colnames(ReleasedSummonsCitiz_tbl2) <- c("Citizen", "Non Citizen")
rownames(ReleasedSummonsCitiz_tbl2) <- c("Custody", "Released")
ReleasedSummonsCitiz_tbl2 <- as.table(ReleasedSummonsCitiz_tbl2)
## Conditional Distribution: - Table 7 and Figure 11
round(prop.table(ReleasedSummonsCitiz_tbl2,2)*100, digits=2)
## Citizen Non Citizen
## Custody 15.26 27.50
## Released 84.74 72.50
barplot(prop.table(ReleasedSummonsCitiz_tbl2,2)*100, beside=T, main=" Released or Custody by Citizenship Status", ylab="%", ylim=c(0,100), col=c("deepskyblue", "darkorchid1"))
legend("center", legend = c("Custody", "Released"), fill = c("deepskyblue", "darkorchid1"),cex=0.85)
## Summary for all variables group by age group:- Table 8
Age_tbl <- ArrestMarij %>% group_by(AgeGroup) %>% summarize("Released"=length(which(released=="Yes")), "Custody"=length(which(released=="No")), "White"=length(which(colour=="White")), "Black"=length(which(colour=="Black")), "Male"=length(which(sex=="Male")), "Female"=length(which(sex=="Female")), "Employed"=length(which(employed=="Yes")), "Unemployed"=length(which(employed=="No")), "Citizen"=length(which(citizen=="Yes")), "Non Citizen"=length(which(citizen=="No")))
head(arrange(Age_tbl, desc(Released)))
## # A tibble: 6 x 11
## AgeGroup Released Custody White Black Male Female Employed Unemployed
## <fct> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 15-19 1590 311 1532 369 1710 191 1599 302
## 2 20-24 1285 241 1097 429 1430 96 1165 361
## 3 25-29 498 117 430 185 562 53 472 143
## 4 30-34 344 67 304 107 378 33 299 112
## 5 35-39 258 62 234 86 291 29 239 81
## 6 40-44 145 47 144 48 173 19 127 65
## # ... with 2 more variables: Citizen <int>, `Non Citizen` <int>
## Box plot for each variables: Figure 12
Age_tbl2 <- melt(Age_tbl, id.vars=c("AgeGroup"))
ggplot(Age_tbl2, aes(x=reorder(factor(variable),value, fun=median),y=value,fill=factor(variable))) + geom_boxplot() + labs(title="Marijuana Possession Arrests by Different Variables") + ylab("Total Arrests") + theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x=element_text(angle=45)) + theme(plot.title = element_text(hjust=0.5)) + theme(axis.text.x = element_text(margin = margin(t = 20, r = 20, b = 0, l = 0)))