Day 1: Giới thiệu R

(1.1) Việc 1: Tải R và RStudio về máy

(1.2) Việc 2: Cài đặt các gói packages cần thiết

#install.packages(c("readxl", "tidyverse", "ggplot2", "GGally", "table1", "compareGroups", "DescTools", "simpleboot", "boot", "epiDisplay", "caret", "rms", "pROC", “GGally”, “carData”, “Publish”))

library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.8     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(table1)
## 
## Attaching package: 'table1'
## 
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(compareGroups)

(1.3) Việc 3: Đọc dữ liệu arrest

arr = read.csv("C:\\VN trips\\VN trip 4 (Dec 2022)\\VLU\\Regression analysis\\Datasets\\Arrest data.csv", header = TRUE)

(1.4) Việc 4: Thông tin về dữ liệu arr

dim(arr)
## [1] 432  11
head(arr)
##   id week arrest finance age  race work     married parole prior educ
## 1  1   20      1      no  27 black   no not married    yes     3    3
## 2  2   17      1      no  18 black   no not married    yes     8    4
## 3  3   25      1      no  19 other  yes not married    yes    13    3
## 4  4   52      0     yes  23 black  yes     married    yes     1    5
## 5  5   52      0      no  19 other  yes not married    yes     3    3
## 6  6   52      0      no  24 black  yes not married     no     2    4
tail(arr)
##      id week arrest finance age  race work     married parole prior educ
## 427 427   12      1     yes  22 black  yes     married    yes     2    4
## 428 428   52      0     yes  31 other  yes not married    yes     3    3
## 429 429   52      0      no  20 black   no not married    yes     1    4
## 430 430   52      0     yes  20 black  yes     married    yes     1    3
## 431 431   52      0      no  29 black  yes not married    yes     3    4
## 432 432   52      0     yes  24 black  yes not married    yes     1    4

(1.5) Việc 5: Tạo biến mới arrest1 và fin

arr$arrest1[arr$arrest == 1] = "Yes"
arr$arrest1[arr$arrest == 0] = "No"

arr$fin[arr$finance == "yes"] = 1
arr$fin[arr$finance == "no"] = 0

(1.6) Việc 6: Tóm tắt dữ liệu

summary(arr)
##        id             week           arrest         finance         
##  Min.   :  1.0   Min.   : 1.00   Min.   :0.0000   Length:432        
##  1st Qu.:108.8   1st Qu.:50.00   1st Qu.:0.0000   Class :character  
##  Median :216.5   Median :52.00   Median :0.0000   Mode  :character  
##  Mean   :216.5   Mean   :45.85   Mean   :0.2639                     
##  3rd Qu.:324.2   3rd Qu.:52.00   3rd Qu.:1.0000                     
##  Max.   :432.0   Max.   :52.00   Max.   :1.0000                     
##       age           race               work             married         
##  Min.   :17.0   Length:432         Length:432         Length:432        
##  1st Qu.:20.0   Class :character   Class :character   Class :character  
##  Median :23.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :24.6                                                           
##  3rd Qu.:27.0                                                           
##  Max.   :44.0                                                           
##     parole              prior             educ         arrest1         
##  Length:432         Min.   : 0.000   Min.   :2.000   Length:432        
##  Class :character   1st Qu.: 1.000   1st Qu.:3.000   Class :character  
##  Mode  :character   Median : 2.000   Median :3.000   Mode  :character  
##                     Mean   : 2.984   Mean   :3.477                     
##                     3rd Qu.: 4.000   3rd Qu.:4.000                     
##                     Max.   :18.000   Max.   :6.000                     
##      fin           
##  Length:432        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

(1.7) Việc 7: Tóm tắt dữ liệu qua hàm table1

table1(~ age + finance + fin + arrest + arrest1 + race + parole + educ, data=arr)
Overall
(N=432)
age
Mean (SD) 24.6 (6.11)
Median [Min, Max] 23.0 [17.0, 44.0]
finance
no 216 (50.0%)
yes 216 (50.0%)
fin
0 216 (50.0%)
1 216 (50.0%)
arrest
Mean (SD) 0.264 (0.441)
Median [Min, Max] 0 [0, 1.00]
arrest1
No 318 (73.6%)
Yes 114 (26.4%)
race
black 379 (87.7%)
other 53 (12.3%)
parole
no 165 (38.2%)
yes 267 (61.8%)
educ
Mean (SD) 3.48 (0.834)
Median [Min, Max] 3.00 [2.00, 6.00]
table1(~ age + arrest + arrest1 + race + parole + educ | finance, data=arr)
no
(N=216)
yes
(N=216)
Overall
(N=432)
age
Mean (SD) 24.2 (5.73) 25.0 (6.47) 24.6 (6.11)
Median [Min, Max] 23.0 [17.0, 44.0] 23.0 [17.0, 44.0] 23.0 [17.0, 44.0]
arrest
Mean (SD) 0.306 (0.462) 0.222 (0.417) 0.264 (0.441)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
arrest1
No 150 (69.4%) 168 (77.8%) 318 (73.6%)
Yes 66 (30.6%) 48 (22.2%) 114 (26.4%)
race
black 185 (85.6%) 194 (89.8%) 379 (87.7%)
other 31 (14.4%) 22 (10.2%) 53 (12.3%)
parole
no 81 (37.5%) 84 (38.9%) 165 (38.2%)
yes 135 (62.5%) 132 (61.1%) 267 (61.8%)
educ
Mean (SD) 3.44 (0.844) 3.52 (0.824) 3.48 (0.834)
Median [Min, Max] 3.00 [2.00, 6.00] 3.00 [2.00, 6.00] 3.00 [2.00, 6.00]

(1.8) Việc 8: So sánh hai nhóm qua hàm createTable và compareGroups trong package compareGroups

# Cách 1:
t = compareGroups(finance ~ age + race + prior + educ + prior + work + married + parole + arrest, data = arr)
createTable(t)
## 
## --------Summary descriptives table by 'finance'---------
## 
## _________________________________________________ 
##                     no          yes     p.overall 
##                    N=216       N=216              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age             24.2 (5.73) 25.0 (6.47)   0.203   
## race:                                     0.241   
##     black       185 (85.6%) 194 (89.8%)           
##     other       31 (14.4%)  22 (10.2%)            
## prior           2.99 (2.92) 2.98 (2.88)   0.987   
## educ            3.44 (0.84) 3.52 (0.82)   0.300   
## prior           2.99 (2.92) 2.98 (2.88)   0.987   
## work:                                     1.000   
##     no          93 (43.1%)  92 (42.6%)            
##     yes         123 (56.9%) 124 (57.4%)           
## married:                                  0.557   
##     married     29 (13.4%)  24 (11.1%)            
##     not married 187 (86.6%) 192 (88.9%)           
## parole:                                   0.843   
##     no          81 (37.5%)  84 (38.9%)            
##     yes         135 (62.5%) 132 (61.1%)           
## arrest          0.31 (0.46) 0.22 (0.42)   0.050   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Cách 2:
createTable(compareGroups(finance ~ age + race + prior + educ + prior + work + married + parole + arrest, data = arr))
## 
## --------Summary descriptives table by 'finance'---------
## 
## _________________________________________________ 
##                     no          yes     p.overall 
##                    N=216       N=216              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age             24.2 (5.73) 25.0 (6.47)   0.203   
## race:                                     0.241   
##     black       185 (85.6%) 194 (89.8%)           
##     other       31 (14.4%)  22 (10.2%)            
## prior           2.99 (2.92) 2.98 (2.88)   0.987   
## educ            3.44 (0.84) 3.52 (0.82)   0.300   
## prior           2.99 (2.92) 2.98 (2.88)   0.987   
## work:                                     1.000   
##     no          93 (43.1%)  92 (42.6%)            
##     yes         123 (56.9%) 124 (57.4%)           
## married:                                  0.557   
##     married     29 (13.4%)  24 (11.1%)            
##     not married 187 (86.6%) 192 (88.9%)           
## parole:                                   0.843   
##     no          81 (37.5%)  84 (38.9%)            
##     yes         135 (62.5%) 132 (61.1%)           
## arrest          0.31 (0.46) 0.22 (0.42)   0.050   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Nếu biến số prior không theo phân bố chuẩn (normal distribution)
createTable(compareGroups(finance ~ age + race + prior + educ + prior + work + married + parole + arrest, data = arr, method = c(prior = 2)))
## 
## --------Summary descriptives table by 'finance'---------
## 
## ___________________________________________________________ 
##                        no              yes        p.overall 
##                      N=216            N=216                 
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age               24.2 (5.73)      25.0 (6.47)      0.203   
## race:                                               0.241   
##     black         185 (85.6%)      194 (89.8%)              
##     other          31 (14.4%)       22 (10.2%)              
## prior           2.00 [1.00;4.00] 2.00 [1.00;4.00]   0.773   
## educ              3.44 (0.84)      3.52 (0.82)      0.300   
## prior             2.99 (2.92)      2.98 (2.88)      0.987   
## work:                                               1.000   
##     no             93 (43.1%)       92 (42.6%)              
##     yes           123 (56.9%)      124 (57.4%)              
## married:                                            0.557   
##     married        29 (13.4%)       24 (11.1%)              
##     not married   187 (86.6%)      192 (88.9%)              
## parole:                                             0.843   
##     no             81 (37.5%)       84 (38.9%)              
##     yes           135 (62.5%)      132 (61.1%)              
## arrest            0.31 (0.46)      0.22 (0.42)      0.050   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

(1.9) Mối liên quan giữa số lần bị bắt trước đây và khả năng bị bắt lại

(1.9.1) Student’s t-test

t.test(arr$prior ~ arr$arrest)
## 
##  Welch Two Sample t-test
## 
## data:  arr$prior by arr$arrest
## t = -2.9319, df = 155.9, p-value = 0.003877
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.7920133 -0.3493306
## sample estimates:
## mean in group 0 mean in group 1 
##        2.701258        3.771930

(1.9.2) Số lần bị bắt trước đây có phân bố chuẩn?

hist(arr$prior)

(1.9.3) Bootstrap

library(simpleboot)
## Simple Bootstrap Routines (1.1-7)
library(tidyverse)

arrest = arr %>% filter(arrest == 1)
non.arrest = arr %>% filter(arrest == 0)

set.seed(1234)

b.means = two.boot(arrest$prior, non.arrest$prior, mean, R = 1000)
hist (b.means$t, breaks = 20)

quantile(b.means$t, probs=c(0.025, 0.50, 0.975))
##      2.5%       50%     97.5% 
## 0.3638572 1.0390599 1.8544729

(1.9) Việc 9: Ghi lại vào tài khoản rpubs.com (https://rpubs.com/ThachTran/981507)