For this project, I analyzed a data set of people who have been arrested due to marijuana possession.

The question I want to answer is: Which attributes have the greatest impact on an arrestee being released? The attributes being assessed are race, age, sex, employment status, citizenship status, and previous arrests.

Here we clean up the data:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)

# call the csv file from the github link
arrests <- read.csv("https://raw.githubusercontent.com/kristinlussi/RBridgeFinalProject/main/Arrests.csv")

# clean up the data

# released column
# replace "Yes" with 1 and "No" with 0
arrests$released <- str_replace_all(arrests$released, "Yes", '1')
arrests$released <- str_replace_all(arrests$released, "No", '0')
arrests$released <- as.integer(arrests$released)

# citizen column
# replace "Yes" with 1 and "No" with 0
arrests$citizen <- str_replace_all(arrests$citizen, "Yes", '1')
arrests$citizen <- str_replace_all(arrests$citizen, "No", '0')
arrests$citizen <- as.integer(arrests$citizen)

# sex column
# replace "Male" with 0 and "Female with 1
arrests$sex <- str_replace_all(arrests$sex, "Female", '1')
arrests$sex <- str_replace_all(arrests$sex, "Male", '0') 
arrests$sex <- as.integer(arrests$sex)

# employed column
# replace "Yes" with 1 and "No" with 0
arrests$employed <- str_replace_all(arrests$employed, "Yes", '1')
arrests$employed <- str_replace_all(arrests$employed, "No", '0') 
arrests$employed <- as.integer(arrests$sex)

head(arrests, 20)
##     X released colour year age sex employed citizen checks
## 1   1        1  White 2002  21   0        0       1      3
## 2   2        0  Black 1999  17   0        0       1      3
## 3   3        1  White 2000  24   0        0       1      3
## 4   4        0  Black 2000  46   0        0       1      1
## 5   5        1  Black 1999  27   1        1       1      1
## 6   6        1  Black 1998  16   1        1       1      0
## 7   7        1  White 1999  40   0        0       1      0
## 8   8        1  White 1998  34   1        1       1      1
## 9   9        1  Black 2000  23   0        0       1      4
## 10 10        1  White 2001  30   0        0       1      3
## 11 11        1  White 1999  18   0        0       0      0
## 12 12        1  White 2000  18   0        0       1      3
## 13 13        1  White 2000  17   0        0       1      1
## 14 14        1  Black 1997  42   0        0       1      0
## 15 15        1  Black 1999  26   0        0       1      2
## 16 16        1  White 2001  25   0        0       1      3
## 17 17        0  White 2001  45   0        0       1      4
## 18 18        0  White 2002  20   0        0       1      5
## 19 19        1  White 2001  32   1        1       1      3
## 20 20        0  White 1998  14   0        0       1      0

Here, we will summarize the data:

# summarize the data
summary(arrests)
##        X           released         colour               year     
##  Min.   :   1   Min.   :0.0000   Length:5226        Min.   :1997  
##  1st Qu.:1307   1st Qu.:1.0000   Class :character   1st Qu.:1998  
##  Median :2614   Median :1.0000   Mode  :character   Median :2000  
##  Mean   :2614   Mean   :0.8293                      Mean   :2000  
##  3rd Qu.:3920   3rd Qu.:1.0000                      3rd Qu.:2001  
##  Max.   :5226   Max.   :1.0000                      Max.   :2002  
##       age             sex             employed          citizen      
##  Min.   :12.00   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:18.00   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:1.0000  
##  Median :21.00   Median :0.00000   Median :0.00000   Median :1.0000  
##  Mean   :23.85   Mean   :0.08477   Mean   :0.08477   Mean   :0.8525  
##  3rd Qu.:27.00   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :66.00   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##      checks     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :1.000  
##  Mean   :1.636  
##  3rd Qu.:3.000  
##  Max.   :6.000

First, I wanted to know what the average and median ages are for people who are released/not released:

## Subset the data to include only the rows where "released" is "yes"
released_data <- subset(arrests, released == 1)

# what is the average age of people of people who were released and not released
avgAge <- aggregate(age ~ released, data = arrests, FUN = mean)
colnames(avgAge) <- c("Released", "AverageAge")
avgAge$Released <- ifelse(avgAge$Released == 1, "Yes", avgAge$Released)
avgAge$Released <- ifelse(avgAge$Released == 0, "No", avgAge$Released)
print(avgAge)
##   Released AverageAge
## 1       No   24.63565
## 2      Yes   23.68413
# what is the median age of people of people who were released and not released
medAge <- aggregate(age ~ released, data = arrests, FUN = median)
colnames(medAge) <- c("Released", "MedianAge")
medAge$Released <- ifelse(medAge$Released == 1, "Yes", medAge$Released)
medAge$Released <- ifelse(medAge$Released == 0, "No", medAge$Released)
print(medAge)
##   Released MedianAge
## 1       No        22
## 2      Yes        21

From my findings, I could see that the majority of people arrested for this crime are in their early twenties.

I wanted to know how many arrests there were per year. Here is a scatter plot showing the relationship between the number of arrests and the year:

require(ggplot2)
## Loading required package: ggplot2
# Group the data by year and summarize the counts
arrests_per_year <- data.frame(arrests %>%
  group_by(year) %>%
  summarize(Number.of.Arrests.per.Year = n()))

# create a scatterplot showing the relationship between number of arrests and the year
g <- ggplot(arrests_per_year, aes(x = year, y = Number.of.Arrests.per.Year)) 
g <- g + geom_point() 
g <- g + ggtitle("Number of Arrests per Year between 1997-2002")
show(g)

Next, I wanted to know what percentage of people are released based on their race:

# percentage of white people who are released
sumWhite <- sum(arrests$colour == "White")
sumWhiteReleased <- sum(released_data$colour == "White")
percentageWhiteReleased <- sumWhiteReleased / sumWhite

# percentage of black people who are released
sumBlack <- sum(arrests$colour == "Black")
sumBlackReleased <- sum(released_data$colour == "Black")
percentageBlackReleased <- sumBlackReleased / sumBlack

# create a data frame of race and percentage released
raceData <- data.frame("Color" = c("Black", "White"), "Number of People" = c(sumBlack, sumWhite), "Number Released" = c(sumBlackReleased, sumWhiteReleased), "Percentage Released" = c(percentageBlackReleased * 100, percentageWhiteReleased * 100))
print(raceData)
##   Color Number.of.People Number.Released Percentage.Released
## 1 Black             1288             955            74.14596
## 2 White             3938            3379            85.80498

I learned that in general, white people are more likely to be released for this crime than people of color.

Next, I wanted to see if women were more likely to be released for this crime:

## find the percentage of arrestees that were women vs men
# sum of all rows
sumData <- nrow(arrests)
# sum of women arrests
sumWomen <- sum(arrests$sex == 1)
# percentage of women arrestees
percentageWomen <- sumWomen / sumData
# sum of women released
sumWomenReleased <- sum(released_data$sex == "1")
# sum of men released
sumMenReleased <- nrow(released_data) - sumWomenReleased

# percentage of women released
percentageWomenReleased <- sumWomenReleased / sumWomen
percentageMenReleased <- sumMenReleased / (sumData-sumWomen)

# create data frame of sex and number of released
sexData <- data.frame("Sex" = c("Male","Female"), "Number of Arrestees" = c(sumData-sumWomen, sumWomen), "Percentage of Arrestees" = c((1-percentageWomen)*100, percentageWomen*100), "Number of People Released" = c(sumMenReleased, sumWomenReleased), "Percentage of Sex Released" = c(percentageMenReleased * 100, percentageWomenReleased * 100))
print(sexData)
##      Sex Number.of.Arrestees Percentage.of.Arrestees Number.of.People.Released
## 1   Male                4783               91.523153                      3954
## 2 Female                 443                8.476847                       380
##   Percentage.of.Sex.Released
## 1                   82.66778
## 2                   85.77878

From my analysis, I found that far more men are arrested for this crime. However, comparing the amount of people released for each sex, the percentages are not far off. So, I can conclude that sex does not have a significant impact on whether or not an arrestee is released.

Next, I wanted to see if there was a relationship between the number of previous arrests and whether or not a person is released.

# Convert 'released' column to a factor
arrests$released <- factor(arrests$released, levels = c("0", "1"), labels = c("No", "Yes"))

# Create the boxplot
ggplot(arrests, aes(y = checks, x = released)) + geom_boxplot(fill = 'orange') + labs(x = "Released?", y = "Number of Previous Arrests") + ggtitle("Relationship Between the Number of Previous Arrests and Released Status")

From the above graph, I could infer that there is a relationship between a lower amount of arrests and being released. The greater amount of previous arrests, the less likely the person will be released for the crime.

Here is a histogram showing the distribution of arrests for each age:

g <- ggplot(data = arrests) + geom_histogram(aes(x=age), binwidth = 1, fill = 'blue') + labs(x = "Age", y = "Count of Arrestees") + ggtitle("Distribution of Arrests for Each Age")
show(g)

Next, I wanted to see if age had an impact on whether or not a person is released:

ggplot(arrests, aes(x = released, y = age)) + geom_violin(fill = 'green') + labs(x= "Released?", y = "Age") + ggtitle("Relationship Between Age and Being Released")

From the above graph, I could infer that there was a slight relationship between age and whether or not a person is released. On the right side, there is a wider distribution of data between the ages 15 and 20. This tells us that there is likely a stronger likelihood that a person will be released when they are between these ages. However, based on the previous observation of number of arrests having a factor in whether or not a person is released, saying that age has an impact on being released may not be the most accurate statement. So, next let’s look at a smaller sample.

Here, we will compare the age of people who have had no previous arrests and whether or not they were released.

lowArrests <- subset(arrests, checks == 0)

ggplot(lowArrests, aes(x = released, y = age)) + geom_violin(fill = 'lightblue') + labs(x = "Released?", y = "Age") + ggtitle("Relationship Between Age and Being Released for People with No Previous Arrests")

We can tell from the above graph that there is still a relationship between age and whether or not a person is released. There is a wider distribution of data between the ages 15 and 20 for people who are released, which implies that the younger a person is, the more likely they will be released for the crime.

Next, I wanted to see if there is a relationship between a person being employed and whether or not they are released.

## find the percentage of arrestees that were citizens vs non-citizens
# sum of all rows
sumData <- nrow(arrests)
# sum of citizen arrests
sumCitizen <- sum(arrests$citizen == 1)
# percentage of citizen arrestees
percentageCitizen <- sumCitizen / sumData
# sum of citizens released
sumCitizenReleased <- sum(released_data$citizen == "1")
# sum of non citizens released
sumNonCitizenReleased <- nrow(released_data) - sumCitizenReleased

# percentage of citizen and non citizens released
percentageCitizenReleased <- sumCitizenReleased / sumCitizen
percentageNonCitizenReleased <- sumNonCitizenReleased / (sumData-sumCitizen)

# create data frame of citizens and number of released
citizenData <- data.frame("Citizen" = c("Non Citizen","Citizen"), "Number of Arrestees" = c(sumData-sumCitizen, sumCitizen), "Percentage of Arrestees" = c((1-percentageCitizen)*100, percentageCitizen*100), "Number of People Released" = c(sumNonCitizenReleased, sumCitizenReleased), "Percentage of Citizen Status Released" = c(percentageNonCitizenReleased * 100, percentageCitizenReleased * 100))
print(citizenData)
##       Citizen Number.of.Arrestees Percentage.of.Arrestees
## 1 Non Citizen                 771                14.75316
## 2     Citizen                4455                85.24684
##   Number.of.People.Released Percentage.of.Citizen.Status.Released
## 1                       559                              72.50324
## 2                      3775                              84.73625

From the above data frame, we can infer that citizen status does have an impact on whether or not a person is released. The percentage of non-citizens that are released for the crime is about 72.5%, while the percentage of citizens that are released for the crime is about 84.7%.

Next, I looked at whether or not employment status has an affect on if a person is released.

## find the percentage of arrestees that were employed vs. unemployed
# sum of all rows
sumData <- nrow(arrests)
# sum of employed arrests
sumEmployed <- sum(arrests$employed == 1)
# percentage of employed arrestees
percentageEmployed <- sumEmployed / sumData
# sum of employed released
sumEmployedReleased <- sum(released_data$employed == "1")
# sum of unemployed released
sumUnemployedReleased <- nrow(released_data) - sumEmployedReleased

# percentage of employed and unemployed released
percentageEmployedReleased <- sumEmployedReleased / sumEmployed
percentageUnemployedReleased <- sumUnemployedReleased / (sumData-sumEmployed)

# create data frame of employment status and number of released
employedData <- data.frame("Employment Status" = c("Unemployed","Employed"), "Number of Arrestees" = c(sumData-sumEmployed, sumEmployed), "Percentage of Arrestees" = c((1-percentageEmployed)*100, percentageEmployed*100), "Number of People Released" = c(sumUnemployedReleased, sumEmployedReleased), "Percentage of Employment Status Released" = c(percentageUnemployedReleased * 100, percentageEmployedReleased * 100))
print(employedData)
##   Employment.Status Number.of.Arrestees Percentage.of.Arrestees
## 1        Unemployed                4783               91.523153
## 2          Employed                 443                8.476847
##   Number.of.People.Released Percentage.of.Employment.Status.Released
## 1                      3954                                 82.66778
## 2                       380                                 85.77878

From the above data frame, we can infer that employment status does not have a significant impact on whether or not a person is released for the crime. The percentage of unemployed arrestees that are released is about 83%, while the percentage of employed arrestees that are released is about 86%. This isn’t a significant difference.

In conclusion, the attributes with the greatest impact on whether or not a person is released for marijuana possession are race, age, citizenship, and amount of previous arrests.