## MSDS_Bridge_Math_Week3 - Final Project

## install.packages("scatterplot3d")
library(scatterplot3d)
## install.packages("ggplot2")
library(ggplot2)
## install.packages("plotrix")
library(plotrix)

## Arrests for marijuana possession ("Arrests" dataset from carData, served via Rdatasets)
url <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/carData/Arrests.csv"
## Companion documentation page for the same dataset
doc <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/doc/carData/Arrests.html"

## Read the CSV straight from the URL; text columns stay character (no factors)
maparrests <- read.csv(file = url, header = TRUE, stringsAsFactors = FALSE)

## Meaningful question for analysis:
## What is the correlation between arrests/violations and features such as Age, Employment and Race?
## How does the number of arrests/violations vary by age group (teenagers, adults, elders) and gender?


## Data Exploration & Basic Visualizations & Graphics

## Per-column summary of the raw data frame (quartiles for numeric columns,
## length/class/mode for character columns)
summary(object = maparrests)

##        X          released            colour               year     
##  Min.   :   1   Length:5226        Length:5226        Min.   :1997  
##  1st Qu.:1307   Class :character   Class :character   1st Qu.:1998  
##  Median :2614   Mode  :character   Mode  :character   Median :2000  
##  Mean   :2614                                         Mean   :2000  
##  3rd Qu.:3920                                         3rd Qu.:2001  
##  Max.   :5226                                         Max.   :2002  
##       age            sex              employed           citizen         
##  Min.   :12.00   Length:5226        Length:5226        Length:5226       
##  1st Qu.:18.00   Class :character   Class :character   Class :character  
##  Median :21.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :23.85                                                           
##  3rd Qu.:27.00                                                           
##  Max.   :66.00                                                           
##      checks     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :1.000  
##  Mean   :1.636  
##  3rd Qu.:3.000  
##  Max.   :6.000

## Peek at the first six records
head(maparrests, n = 6L)

##   X released colour year age    sex employed citizen checks
## 1 1      Yes  White 2002  21   Male      Yes     Yes      3
## 2 2       No  Black 1999  17   Male      Yes     Yes      3
## 3 3      Yes  White 2000  24   Male      Yes     Yes      3
## 4 4       No  Black 2000  46   Male      Yes     Yes      1
## 5 5      Yes  Black 1999  27 Female      Yes     Yes      1
## 6 6      Yes  Black 1998  16 Female      Yes     Yes      0

## Number of observations (first element of the data frame dimensions)
dim(maparrests)[1L]

## [1] 5226

## Histograms of the numeric features, one fill colour per feature.
## The title and x-axis label are spelled out so they match what hist()
## generates by default when called on `maparrests$<feature>` directly.
hist_colours <- c(age = "green", year = "blue", checks = "red")
for (feature in names(hist_colours)) {
  feature_label <- paste0("maparrests$", feature)
  hist(
    maparrests[[feature]],
    col = hist_colours[[feature]],
    main = paste("Histogram of", feature_label),
    xlab = feature_label
  )
}

## Number of checks (rows) per year of arrest (columns)
table(maparrests[["checks"]], maparrests[["year"]])

##    
##     1997 1998 1999 2000 2001 2002
##   0  164  311  408  445  424   99
##   1   60  129  170  232  218   45
##   2   86  132  158  190  181   42
##   3   77  180  182  220  243   51
##   4   84   98  155  151  120   35
##   5   21   27   22   29   23    5
##   6    0    0    4    3    2    0

## Colour (rows) against sex (columns)
table(maparrests[["colour"]], maparrests[["sex"]])

##        
##         Female Male
##   Black     72 1216
##   White    371 3567

## Conclusions:
## Total number of observations - 5226
## Total number of features - 9
## The majority of the dataset features are categorical: Released (Y/N), Colour, Sex, Employed (Y/N), Citizen (Y/N)
## Only four are numeric (continuous): RowId, Year of Arrest, Age, Number of checks (records in other police databases: previous arrests, convictions, paroles, etc.)
## Number of checks spread more or less evenly over the years
## Most of the observations fall into the 0 and 1 checks (police records)
## Majority of the dataset records correspond to gender=Male and colour= White groups
## Mean age = 23.85 and Median Age = 21


## Data wrangling and transformations

## Make known categories numeric for easier data manipulation and visualization.
## Each column is recoded through its own lookup table rather than by replacing
## matching strings across the whole data frame, so only the intended column is
## ever modified. A value that is absent from its lookup table becomes NA.

## Translate a vector of labels into numeric codes.
##   values: vector of category labels (coerced to character for the lookup)
##   codes:  named numeric vector; names are the labels, values are the codes
## Returns an unnamed numeric vector of the same length as `values`.
recode_labels <- function(values, codes) {
  unname(codes[as.character(values)])
}

## Gender: from Male/Female to 1/2
maparrests$sex <- recode_labels(maparrests$sex, c(Male = 1, Female = 2))

## Released, Employed & Citizen: from Yes/No to 1/0
yes_no_columns <- c("released", "employed", "citizen")
maparrests[yes_no_columns] <- lapply(
  maparrests[yes_no_columns],
  recode_labels,
  codes = c(Yes = 1, No = 0)
)

## Change column name from "colour" to "race"
names(maparrests)[names(maparrests) == "colour"] <- "race"

## race: from White/Black to 11/22
## The converted values are written back to `race` itself. Assigning them to
## `Race` (capital R) would append a second column and leave `race` as
## character, which is what summaries recorded before this fix show.
maparrests$race <- recode_labels(maparrests$race, c(White = 11, Black = 22))

## Eliminate irrelevant features (first column was a unique record identifier with no relevance)
maparrests <- subset(maparrests, select = -c(X))

## More advanced Visualizations and Data Wrangling

## Pairwise scatterplot matrix of every feature against every other feature
plot(x = maparrests, col = "darkblue")

## Create a new feature "Age Group" by binning/grouping observations by "Age"

## Boxplot of age, used to pick sensible Age Group limits
boxplot(maparrests[["age"]])

## Placeholder vector: every record starts as "blank" until it is binned
agegrp <- rep("blank", times = nrow(maparrests))

## Work on a copy so the cleaned data frame stays untouched; the placeholder is
## appended as the last column and stored as character
maparrests_bin <- maparrests
maparrests_bin[["agegrp"]] <- as.character(agegrp)

summary(object = maparrests_bin)

##     released          race                year           age       
##  Min.   :0.0000   Length:5226        Min.   :1997   Min.   :12.00  
##  1st Qu.:1.0000   Class :character   1st Qu.:1998   1st Qu.:18.00  
##  Median :1.0000   Mode  :character   Median :2000   Median :21.00  
##  Mean   :0.8293                      Mean   :2000   Mean   :23.85  
##  3rd Qu.:1.0000                      3rd Qu.:2001   3rd Qu.:27.00  
##  Max.   :1.0000                      Max.   :2002   Max.   :66.00  
##       sex           employed         citizen           checks     
##  Min.   :1.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:1.000   1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:0.000  
##  Median :1.000   Median :1.0000   Median :1.0000   Median :1.000  
##  Mean   :1.085   Mean   :0.7866   Mean   :0.8525   Mean   :1.636  
##  3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.000   Max.   :1.0000   Max.   :1.0000   Max.   :6.000  
##       Race          agegrp         
##  Min.   :11.00   Length:5226       
##  1st Qu.:11.00   Class :character  
##  Median :11.00   Mode  :character  
##  Mean   :13.71                     
##  3rd Qu.:11.00                     
##  Max.   :22.00

## Bin ages into right-closed intervals: (10, 19] Teenager, (19, 40] Young Adult,
## (40, 65] Mature Adult, (65, 75] Elder.
age_bins <- cut(
  maparrests_bin$age,
  breaks = c(10, 19, 40, 65, 75),
  labels = c("Teenager", "Young Adult", "Mature Adult", "Elder"),
  right = TRUE
)
## Ages outside every interval (or missing) yield NA from cut(); those rows are
## skipped so they keep whatever agegrp value they already had.
in_range <- !is.na(age_bins)
maparrests_bin$agegrp[in_range] <- as.character(age_bins[in_range])

## Boxplots of Arrests/Violations by Age Group, then by Age Group and Gender,
## to look for correlations. Both plots share the same title and y-axis label.
boxplot_title <- "Arrests for Marijuana Possession"
boxplot_ylab <- "Arrests/Violations"

boxplot(
  checks ~ agegrp,
  data = maparrests_bin,
  main = boxplot_title,
  xlab = "Age Group",
  ylab = boxplot_ylab,
  col = "orange"
)

boxplot(
  checks ~ agegrp + sex,
  data = maparrests_bin,
  main = boxplot_title,
  xlab = "Age Group and Gender (1=Male, 2=Female)",
  ylab = boxplot_ylab,
  col = "lightblue",
  horizontal = FALSE
)

## Cross-tabulations and 3D pie charts relating Checks to Age Group and Race

## Checks (rows) against age group (columns)
table(maparrests_bin[["checks"]], maparrests_bin[["agegrp"]])

##    
##     Elder Mature Adult Teenager Young Adult
##   0     0           82      857         912
##   1     0           44      350         460
##   2     2           44      271         472
##   3     0           81      263         609
##   4     0           33      209         401
##   5     0            7       57          63
##   6     0            0        1           8

## Record counts per age group; slice labels are the group names themselves
tab_agegrp <- table(maparrests_bin[["agegrp"]])
lbls <- dimnames(tab_agegrp)[[1L]]
pie3D(tab_agegrp, main = "Arrests/Violation by Age Group", labels = lbls)

## Checks (rows) against race code (columns: 11 = White, 22 = Black)
table(maparrests_bin[["checks"]], maparrests_bin[["race"]])

##    
##       11   22
##   0 1575  276
##   1  635  219
##   2  571  218
##   3  641  312
##   4  433  210
##   5   82   45
##   6    1    8

## Record counts per race code; labels are given in the same order as the
## table's codes (11 first, then 22)
tab_race <- table(maparrests_bin[["race"]])
lbls <- c("White", "Black")
pie3D(tab_race, main = "Arrests/Violation by Race", labels = lbls)

## Scatterplot of other dataset features to understand other correlations to Arrests/Violations
## with() evaluates the call against the columns of maparrests_bin without
## attaching the data frame to the search path. attach() would leave every
## column visible globally and, for `agegrp`, be masked by the global vector of
## the same name.
with(
  maparrests_bin,
  scatterplot3d(
    employed, age, checks,
    pch = 16, type = "h", col.axis = "blue", main = "3D Scatterplot"
  )
)

## Conclusion:
## The highest concentration of arrests occurs in the "Young Adult (19-40)" age group, followed closely by "Teenagers (10-19)"
## The median for "Adults" (both Young and Mature) is 2 arrests
## While the median for "Teenagers" is 1 arrest, the maximum reaches 6, which is interesting considering their shorter lifespan compared to the "Adults" groups
## The highest concentration of arrests is in the "White" race group - 3938
## The correlation between Employment and Age to arrests shows similar trends between employed and unemployed up to 40 years of age,
## and then shows a slightly higher trend for the employed group from 40 to 60 years of age

## BONUS - place the original .csv in a github file and have R read from the link

## Same dataset, this time read from a copy hosted in a GitHub repository
## (note: this reuses and overwrites the `url` variable)
url <- "https://raw.githubusercontent.com/humbertohpgit/MSDSSummerBridge2018/master/Arrests"
maparrests_bonus <- read.csv(file = url, header = TRUE, stringsAsFactors = FALSE)

## Row count of the copy, for comparison with the original load
dim(maparrests_bonus)[1L]

## [1] 5226

## MSDS_Bridge_Math_Week3 - Final Project
##
## humbertohp
##
## July 26, 2018