## install.packages("scatterplot3d")
library(scatterplot3d)
## install.packages("ggplot2")
library(ggplot2)
## install.packages("plotrix")
library(plotrix)
## Source data: arrests for marijuana possession (the carData "Arrests" table,
## served as CSV by the Rdatasets repository)
url <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/carData/Arrests.csv"
## Link to the dataset's documentation page (kept for reference; not read by the script)
doc <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/doc/carData/Arrests.html"
## Text columns are kept as character (not factor) so they can be recoded later
maparrests <- read.csv(
  file = url,
  header = TRUE,
  stringsAsFactors = FALSE
)
## Meaningful question for analysis:
## What is the correlation between arrests/violations and features like: Age, Employment and Race?
## How does the number of arrests/violations vary by age group (teenagers, adults, elders) and by gender?
## Data Exploration & Basic Visualizations & Graphics
## Per-column overview of the raw data. The text columns (released, colour,
## sex, employed, citizen) were read as character, so they only report
## Length/Class/Mode below; the numeric columns (X, year, age, checks) report
## their quartiles and mean.
summary(maparrests)
## X released colour year
## Min. : 1 Length:5226 Length:5226 Min. :1997
## 1st Qu.:1307 Class :character Class :character 1st Qu.:1998
## Median :2614 Mode :character Mode :character Median :2000
## Mean :2614 Mean :2000
## 3rd Qu.:3920 3rd Qu.:2001
## Max. :5226 Max. :2002
## age sex employed citizen
## Min. :12.00 Length:5226 Length:5226 Length:5226
## 1st Qu.:18.00 Class :character Class :character Class :character
## Median :21.00 Mode :character Mode :character Mode :character
## Mean :23.85
## 3rd Qu.:27.00
## Max. :66.00
## checks
## Min. :0.000
## 1st Qu.:0.000
## Median :1.000
## Mean :1.636
## 3rd Qu.:3.000
## Max. :6.000
## First rows of the raw data: shows the Yes/No, White/Black and Male/Female
## text encodings, and that X is a running row number.
head(maparrests)
## X released colour year age sex employed citizen checks
## 1 1 Yes White 2002 21 Male Yes Yes 3
## 2 2 No Black 1999 17 Male Yes Yes 3
## 3 3 Yes White 2000 24 Male Yes Yes 3
## 4 4 No Black 2000 46 Male Yes Yes 1
## 5 5 Yes Black 1999 27 Female Yes Yes 1
## 6 6 Yes Black 1998 16 Female Yes Yes 0
## Number of observations (rows) in the dataset
nrow(maparrests)
## [1] 5226
## Distribution of each numeric feature: one histogram per column
## (age in green, year in blue, checks in red)
hist(maparrests$age, col= "green")

hist(maparrests$year, col= "blue")

hist(maparrests$checks, col= "red")

## Cross-tabulation of record counts: rows are the `checks` values (0-6),
## columns are the years (1997-2002)
table(maparrests$checks, maparrests$year)
##
## 1997 1998 1999 2000 2001 2002
## 0 164 311 408 445 424 99
## 1 60 129 170 232 218 45
## 2 86 132 158 190 181 42
## 3 77 180 182 220 243 51
## 4 84 98 155 151 120 35
## 5 21 27 22 29 23 5
## 6 0 0 4 3 2 0
## Cross-tabulation of record counts: rows are `colour`, columns are `sex`
table(maparrests$colour, maparrests$sex)
##
## Female Male
## Black 72 1216
## White 371 3567
## Conclusions:
## Total number of observations - 5226
## Total number of features - 9
## Majority of the data set features are categorical: Released (Y/N), Colour, Sex, Employed (Y/N), Citizen (Y/N)
## Only four are numeric (continuous): RowId, Year of Arrest, Age, Number of checks (records on other police DBs) (previous arrests, convictions, paroles, etc.)
## Number of checks spread more or less evenly over the years
## Most of the observations fall into the 0 and 1 checks (police records)
## Majority of the dataset records correspond to gender=Male and colour= White groups
## Mean age = 23.85 and Median Age = 21
## Data wrangling and transformations
## Make known categories numeric for easier data manipulation and visualization.
## Every recode targets one named column through an explicit lookup, so no other
## column can be altered by accident. A value that is not listed in `from`
## becomes NA instead of being silently carried along.
recode_column <- function(values, from, to) {
  # match() compares by value, so this works for character and factor input
  to[match(values, from)]
}
## Gender: from Male/Female to 1/2
maparrests$sex <- recode_column(maparrests$sex, c("Male", "Female"), c(1, 2))
## Released, Employed & Citizen: from Yes/No to 1/0
for (flag_col in c("released", "employed", "citizen")) {
  maparrests[[flag_col]] <- recode_column(
    maparrests[[flag_col]], c("Yes", "No"), c(1, 0)
  )
}
## Change column name from "colour" to "race"
names(maparrests)[names(maparrests) == "colour"] <- "race"
## race: from White/Black to 11/22
## The numeric codes are written back into `race` itself (lower case).
## Assigning to `Race` would add a second column and leave `race` as text.
maparrests$race <- recode_column(maparrests$race, c("White", "Black"), c(11, 22))
## Eliminate irrelevant features (first column was a unique record identifier with no relevance)
maparrests$X <- NULL
## More advanced Visualizations and Data Wrangling
## Plotting the entire dataset features to visualize correlations of all features simultaneously
plot(maparrests, col = "darkblue")

## Create a new feature "Age Group" by binning/grouping observations by "Age"
boxplot(maparrests$age) ## to identify the potential Age Group limits

maparrests_bin <- maparrests
## Age bins are right-closed: (10,19] Teenager, (19,40] Young Adult,
## (40,65] Mature Adult, (65,75] Elder
age_breaks <- c(10, 19, 40, 65, 75)
age_labels <- c("Teenager", "Young Adult", "Mature Adult", "Elder")
agegrp <- as.character(
  cut(maparrests_bin$age, breaks = age_breaks, labels = age_labels, right = TRUE)
)
## Ages outside (10, 75] (or missing) keep the "blank" placeholder
agegrp[is.na(agegrp)] <- "blank"
maparrests_bin$agegrp <- agegrp
summary(maparrests_bin)
## released race year age
## Min. :0.0000 Min. :11.00 Min. :1997 Min. :12.00
## 1st Qu.:1.0000 1st Qu.:11.00 1st Qu.:1998 1st Qu.:18.00
## Median :1.0000 Median :11.00 Median :2000 Median :21.00
## Mean :0.8293 Mean :13.71 Mean :2000 Mean :23.85
## 3rd Qu.:1.0000 3rd Qu.:11.00 3rd Qu.:2001 3rd Qu.:27.00
## Max. :1.0000 Max. :22.00 Max. :2002 Max. :66.00
## sex employed citizen checks
## Min. :1.000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:0.000
## Median :1.000 Median :1.0000 Median :1.0000 Median :1.000
## Mean :1.085 Mean :0.7866 Mean :0.8525 Mean :1.636
## 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.000 Max. :1.0000 Max. :1.0000 Max. :6.000
## agegrp
## Length:5226
## Class :character
## Mode :character
## Box plots of the `checks` count, first split by age group and then by the
## combination of age group and gender, to look for differences between groups
boxplot(
  checks ~ agegrp,
  data = maparrests_bin,
  main = "Arrests for Marijuana Possession",
  xlab = "Age Group",
  ylab = "Arrests/Violations",
  col = "orange"
)

boxplot(
  checks ~ agegrp + sex,
  data = maparrests_bin,
  horizontal = FALSE,
  main = "Arrests for Marijuana Possession",
  xlab = "Age Group and Gender (1=Male, 2=Female)",
  ylab = "Arrests/Violations",
  col = "lightblue"
)

## Table-ing and Pie-charting Checks, Age Group and Race Correlations
## Record counts: rows are the `checks` values, columns are the age groups
table(maparrests_bin$checks, maparrests_bin$agegrp)
##
## Elder Mature Adult Teenager Young Adult
## 0 0 82 857 912
## 1 0 44 350 460
## 2 2 44 271 472
## 3 0 81 263 609
## 4 0 33 209 401
## 5 0 7 57 63
## 6 0 0 1 8
## Pie chart of records per age group; slice labels are the table's own names
tab_agegrp <- table(maparrests_bin$agegrp)
lbls <- names(tab_agegrp)
pie3D(tab_agegrp, labels = lbls, main = "Arrests/Violation by Age Group")

## Record counts: rows are the `checks` values, columns are the race codes
table(maparrests_bin$checks, maparrests_bin$race)
##
## 11 22
## 0 1575 276
## 1 635 219
## 2 571 218
## 3 641 312
## 4 433 210
## 5 82 45
## 6 1 8
## Pie chart of records per race. Labels are looked up from the codes actually
## present in the table (11 = White, 22 = Black) rather than assumed by
## position, so they stay correct if the order or set of codes changes.
tab_race <- table(maparrests_bin$race)
race_names <- c("11" = "White", "22" = "Black")
lbls <- unname(race_names[names(tab_race)])
## Any code without a known name is shown as the raw code
lbls[is.na(lbls)] <- names(tab_race)[is.na(lbls)]
pie3D(tab_race, labels = lbls, main = "Arrests/Violation by Race")

## Scatterplot of other dataset features to understand other correlations to Arrests/Violations
## with() resolves employed/age/checks inside maparrests_bin for this one call
## only. Unlike attach(), it leaves the search path unchanged, so nothing is
## masked and there is nothing to detach afterwards.
with(
  maparrests_bin,
  scatterplot3d(
    employed, age, checks,
    pch = 16,
    type = "h",
    col.axis = "blue",
    main = "3D Scatterplot"
  )
)

## Conclusion:
## The highest concentration of arrests happens in the "Young Adult (19-40)" age group, followed closely by "Teenagers (10-19)"
## The median for "Adults" (both Young and Mature) is 2 arrests
## While the median for "Teenagers" is 1 arrest, the maximum goes up to 6, which is interesting considering their shorter lifespan compared to the "Adults" groups
## The highest concentration of arrests is in the "White" race group - 3938
## The correlation of Employment and Age with arrests shows similar trends for the employed and unemployed groups up to 40 years of age,
## and then shows a slightly higher trend for the employed group from 40 to 60 years of age
## BONUS - a copy of the original .csv placed in a GitHub repository and read
## by R straight from its raw link (note: this reuses the `url` variable)
url <- "https://raw.githubusercontent.com/humbertohpgit/MSDSSummerBridge2018/master/Arrests"
maparrests_bonus <- read.csv(
  file = url,
  header = TRUE,
  stringsAsFactors = FALSE
)
## Row count of the GitHub-hosted copy
nrow(maparrests_bonus)
## [1] 5226