# Load the arrest dataset from CSV into the data frame `arr`.
# NOTE(review): this is an absolute, machine-specific path; consider a
# project-relative path so the script can run on other machines.
arr <- read.csv(
  "C:/Users/Momo/Desktop/R - Learning/Dataset 4-2022/Arrest dataset.csv"
)
# List the column names of the data frame.
names(arr)
## [1] "id" "age" "finance" "week" "arrest" "race"
## [7] "work.exp" "married" "parole" "prior" "educ" "employ1"
# Show the first six rows.
head(arr, n = 6L)
## id age finance week arrest race work.exp married parole prior educ
## 1 1 27 no 20 1 black no not married yes 3 3
## 2 2 18 no 17 1 black no not married yes 8 4
## 3 3 19 no 25 1 other yes not married yes 13 3
## 4 4 23 yes 52 0 black yes married yes 1 5
## 5 5 19 no 52 0 other yes not married yes 3 3
## 6 6 24 no 52 0 black yes not married no 2 4
## employ1
## 1 no
## 2 no
## 3 no
## 4 no
## 5 no
## 6 no
# Show the last six rows.
tail(arr, n = 6L)
## id age finance week arrest race work.exp married parole prior educ
## 427 427 22 yes 12 1 black yes married yes 2 4
## 428 428 31 yes 52 0 other yes not married yes 3 3
## 429 429 20 no 52 0 black no not married yes 1 4
## 430 430 20 yes 52 0 black yes married yes 1 3
## 431 431 29 no 52 0 black yes not married yes 3 4
## 432 432 24 yes 52 0 black yes not married yes 1 4
## employ1
## 427 no
## 428 no
## 429 no
## 430 no
## 431 yes
## 432 yes
# Per-column summary of the data frame.
summary(object = arr)
## id age finance week
## Min. : 1.0 Min. :17.0 Length:432 Min. : 1.00
## 1st Qu.:108.8 1st Qu.:20.0 Class :character 1st Qu.:50.00
## Median :216.5 Median :23.0 Mode :character Median :52.00
## Mean :216.5 Mean :24.6 Mean :45.85
## 3rd Qu.:324.2 3rd Qu.:27.0 3rd Qu.:52.00
## Max. :432.0 Max. :44.0 Max. :52.00
## arrest race work.exp married
## Min. :0.0000 Length:432 Length:432 Length:432
## 1st Qu.:0.0000 Class :character Class :character Class :character
## Median :0.0000 Mode :character Mode :character Mode :character
## Mean :0.2639
## 3rd Qu.:1.0000
## Max. :1.0000
## parole prior educ employ1
## Length:432 Min. : 0.000 Min. :2.000 Length:432
## Class :character 1st Qu.: 1.000 1st Qu.:3.000 Class :character
## Mode :character Median : 2.000 Median :3.000 Mode :character
## Mean : 2.984 Mean :3.477
## 3rd Qu.: 4.000 3rd Qu.:4.000
## Max. :18.000 Max. :6.000
# Base-graphics histogram of `week` with default styling.
hist(x = arr$week)

# Same histogram with custom colours, title, and axis labels.
hist(
  x = arr$week,
  main = "Distribution of time to arrest (week)",
  xlab = "Week",
  ylab = "Number of Participants",
  col = "blue",
  border = "white"
)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive summary table of every covariate (all columns except `id`).
table1(
  ~ age + finance + week + arrest + race + work.exp + married + parole +
    prior + educ + employ1,
  data = arr
)
| | Overall (N=432) |
|---|---|
| age | |
| Mean (SD) | 24.6 (6.11) |
| Median [Min, Max] | 23.0 [17.0, 44.0] |
| finance | |
| no | 216 (50.0%) |
| yes | 216 (50.0%) |
| week | |
| Mean (SD) | 45.9 (12.7) |
| Median [Min, Max] | 52.0 [1.00, 52.0] |
| arrest | |
| Mean (SD) | 0.264 (0.441) |
| Median [Min, Max] | 0 [0, 1.00] |
| race | |
| black | 379 (87.7%) |
| other | 53 (12.3%) |
| work.exp | |
| no | 185 (42.8%) |
| yes | 247 (57.2%) |
| married | |
| married | 53 (12.3%) |
| not married | 379 (87.7%) |
| parole | |
| no | 165 (38.2%) |
| yes | 267 (61.8%) |
| prior | |
| Mean (SD) | 2.98 (2.90) |
| Median [Min, Max] | 2.00 [0, 18.0] |
| educ | |
| Mean (SD) | 3.48 (0.834) |
| Median [Min, Max] | 3.00 [2.00, 6.00] |
| employ1 | |
| no | 372 (86.1%) |
| yes | 60 (13.9%) |
# Descriptive summary table without the employment-related columns
# (`work.exp` and `employ1` are left out of the formula).
table1(
  ~ age + finance + week + arrest + race + married + parole + prior + educ,
  data = arr
)
| | Overall (N=432) |
|---|---|
| age | |
| Mean (SD) | 24.6 (6.11) |
| Median [Min, Max] | 23.0 [17.0, 44.0] |
| finance | |
| no | 216 (50.0%) |
| yes | 216 (50.0%) |
| week | |
| Mean (SD) | 45.9 (12.7) |
| Median [Min, Max] | 52.0 [1.00, 52.0] |
| arrest | |
| Mean (SD) | 0.264 (0.441) |
| Median [Min, Max] | 0 [0, 1.00] |
| race | |
| black | 379 (87.7%) |
| other | 53 (12.3%) |
| married | |
| married | 53 (12.3%) |
| not married | 379 (87.7%) |
| parole | |
| no | 165 (38.2%) |
| yes | 267 (61.8%) |
| prior | |
| Mean (SD) | 2.98 (2.90) |
| Median [Min, Max] | 2.00 [0, 18.0] |
| educ | |
| Mean (SD) | 3.48 (0.834) |
| Median [Min, Max] | 3.00 [2.00, 6.00] |
library(ggplot2)
# ggplot2 histogram of `week` with default settings.
ggplot(arr, mapping = aes(x = week)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of `week` with blue bars, white outlines, and descriptive labels.
ggplot(data = arr, mapping = aes(x = week)) +
  geom_histogram(fill = "blue", colour = "white") +
  labs(
    title = "Distribution of Time to Arrest",
    x = "Time to arrest (Week)",
    y = "Number of Participants"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of `age` on the density scale with a kernel density curve overlaid.
# after_stat(density) replaces the `..density..` dot-dot notation, which is
# deprecated as of ggplot2 3.4.0; the computed variable is the same.
ggplot(arr, aes(x = age)) +
  geom_histogram(
    aes(y = after_stat(density)),
    fill = "blue",
    col = "white"
  ) +
  geom_density(col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Bar chart of counts per `educ` value, default styling.
ggplot(data = arr, mapping = aes(x = educ)) +
  geom_bar()

# Same bar chart with blue bars, a title, and axis labels.
ggplot(data = arr, mapping = aes(x = educ)) +
  geom_bar(fill = "blue") +
  labs(
    title = "Distribution of Education",
    x = "Education",
    y = "Number of Participants"
  )

# Bars filled by `arrest`, converted to a factor so it maps to discrete colours.
ggplot(data = arr, mapping = aes(x = educ, fill = factor(arrest))) +
  geom_bar()