This file documents the code for the day 1 exercise, written in R Markdown.

# Reading data into R
# Path to the arrest dataset. Stored as `csv_path` rather than `t` so the
# variable does not mask base::t() (matrix transpose) in the workspace.
csv_path <- "D:\\Analysis\\Arrest data for cox model.csv"
arr <- read.csv(csv_path)
# Number of rows and columns
dim(arr)
## [1] 432  11

First 6 rows

# Preview the first six observations (n = 6 is also the default)
head(arr, n = 6L)
##   id week arrest finance age  race work     married parole prior educ
## 1  1   20      1      no  27 black   no not married    yes     3    3
## 2  2   17      1      no  18 black   no not married    yes     8    4
## 3  3   25      1      no  19 other  yes not married    yes    13    3
## 4  4   52      0     yes  23 black  yes     married    yes     1    5
## 5  5   52      0      no  19 other  yes not married    yes     3    3
## 6  6   52      0      no  24 black  yes not married     no     2    4

Last 6 rows

# Preview the last six observations (n = 6 is also the default)
tail(arr, n = 6L)
##      id week arrest finance age  race work     married parole prior educ
## 427 427   12      1     yes  22 black  yes     married    yes     2    4
## 428 428   52      0     yes  31 other  yes not married    yes     3    3
## 429 429   52      0      no  20 black   no not married    yes     1    4
## 430 430   52      0     yes  20 black  yes     married    yes     1    3
## 431 431   52      0      no  29 black  yes not married    yes     3    4
## 432 432   52      0     yes  24 black  yes not married    yes     1    4

# Text label for the arrest indicator: 1 -> "Yes", 0 -> "No".
# match() returns NA for any other value (including NA), so those rows
# stay NA in the new column.
arr$arrest1 <- c("No", "Yes")[match(arr$arrest, c(0, 1))]

Descriptive analysis using table1

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table of age, arrest status, race, parole and education level
# (educ treated as categorical), with one column per level of `finance`.
table1(
  ~ age + arrest1 + race + parole + factor(educ) | finance,
  data = arr
)
|                   | no (N=216)        | yes (N=216)       | Overall (N=432)   |
|-------------------|-------------------|-------------------|-------------------|
| **age**           |                   |                   |                   |
| Mean (SD)         | 24.2 (5.73)       | 25.0 (6.47)       | 24.6 (6.11)       |
| Median [Min, Max] | 23.0 [17.0, 44.0] | 23.0 [17.0, 44.0] | 23.0 [17.0, 44.0] |
| **arrest1**       |                   |                   |                   |
| No                | 150 (69.4%)       | 168 (77.8%)       | 318 (73.6%)       |
| Yes               | 66 (30.6%)        | 48 (22.2%)        | 114 (26.4%)       |
| **race**          |                   |                   |                   |
| black             | 185 (85.6%)       | 194 (89.8%)       | 379 (87.7%)       |
| other             | 31 (14.4%)        | 22 (10.2%)        | 53 (12.3%)        |
| **parole**        |                   |                   |                   |
| no                | 81 (37.5%)        | 84 (38.9%)        | 165 (38.2%)       |
| yes               | 135 (62.5%)       | 132 (61.1%)       | 267 (61.8%)       |
| **factor(educ)**  |                   |                   |                   |
| 2                 | 17 (7.9%)         | 7 (3.2%)          | 24 (5.6%)         |
| 3                 | 117 (54.2%)       | 122 (56.5%)       | 239 (55.3%)       |
| 4                 | 57 (26.4%)        | 62 (28.7%)        | 119 (27.5%)       |
| 5                 | 21 (9.7%)         | 18 (8.3%)         | 39 (9.0%)         |
| 6                 | 4 (1.9%)          | 7 (3.2%)          | 11 (2.5%)         |

Descriptive analysis using compareGroups

library(compareGroups)
# Compare age, race, prior and parole between the two `finance` groups.
# The result is stored as `cmp_groups` rather than `t` so it does not mask
# base::t() in the workspace.
cmp_groups <- compareGroups(finance ~ age + race + prior + parole, data = arr)
# Print the summary table with per-group descriptives and overall p-values
createTable(cmp_groups)
## 
## --------Summary descriptives table by 'finance'---------
## 
## ___________________________________________ 
##               no          yes     p.overall 
##              N=216       N=216              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age       24.2 (5.73) 25.0 (6.47)   0.203   
## race:                               0.241   
##     black 185 (85.6%) 194 (89.8%)           
##     other 31 (14.4%)  22 (10.2%)            
## prior     2.99 (2.92) 2.98 (2.88)   0.987   
## parole:                             0.843   
##     no    81 (37.5%)  84 (38.9%)            
##     yes   135 (62.5%) 132 (61.1%)           
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Simple data visualization

# Base-graphics histogram of `week`: blue bars with white outlines,
# Vietnamese title and axis labels.
hist(
  arr$week,
  col = "blue", border = "white",
  main = "Phân bố tuần bị bắt",
  xlab = "Tuần", ylab = "Số đối tượng"
)

# Using ggplot2

library(ggplot2)
# Same histogram of `week` drawn with ggplot2, one layer per line.
# The number of bins is left at the stat_bin() default.
ggplot(arr, aes(x = week)) +
  geom_histogram(fill = "blue", colour = "white") +
  labs(
    title = "Phân bố tuần bị bắt",
    x = "Số tuần",
    y = "Số đối tượng"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.