R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Print per-column summary statistics (min, quartiles, mean, max) for `cars`.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(readxl);
library(tidyverse);
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2);
library(dplyr);
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(gapminder)
library(ggthemes)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(base)
library(compareGroups)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-

# arr = Arrest dataset, ins = Insurance dataset

# Load the Arrest dataset from CSV and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; the document will
# only knit on a machine where this exact location exists.
arr <- read.csv(
  file = "C:\\Users\\hntn\\OneDrive - Sun Hydraulics\\Hoa\\Ftu\\DATA ANALYSIS\\Dataset for TDTU workshop 4-2022\\Arrest dataset.csv"
)
head(arr)
##   id age finance week arrest  race work.exp     married parole prior educ
## 1  1  27      no   20      1 black       no not married    yes     3    3
## 2  2  18      no   17      1 black       no not married    yes     8    4
## 3  3  19      no   25      1 other      yes not married    yes    13    3
## 4  4  23     yes   52      0 black      yes     married    yes     1    5
## 5  5  19      no   52      0 other      yes not married    yes     3    3
## 6  6  24      no   52      0 black      yes not married     no     2    4
##   employ1
## 1      no
## 2      no
## 3      no
## 4      no
## 5      no
## 6      no
# Load the Insurance dataset from the Excel workbook and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; the document will
# only knit on a machine where this exact location exists.
ins <- read_excel(
  path = "C:\\Users\\hntn\\OneDrive - Sun Hydraulics\\Hoa\\Ftu\\DATA ANALYSIS\\Dataset for TDTU workshop 4-2022\\Insurance dataset.xlsx"
)
head(ins)
## # A tibble: 6 x 7
##     age sex      bmi children smoker region    charge
##   <dbl> <chr>  <dbl>    <dbl> <chr>  <chr>      <dbl>
## 1    19 female  27.9        0 yes    southwest 16885.
## 2    18 male    33.8        1 no     southeast  1726.
## 3    28 male    33          3 no     southeast  4449.
## 4    33 male    22.7        0 no     northwest 21984.
## 5    32 male    28.9        0 no     northwest  3867.
## 6    31 female  25.7        0 no     southeast  3757.
# Column-wise summary of the Insurance data. Character columns (sex, smoker,
# region) are reported only by length/class/mode, not tabulated.
ins %>%
  summary()
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region              charge     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
# Descriptive table of the Insurance variables, stratified by region
# (table1 also adds an "Overall" column).
table1(
  ~ age + sex + bmi + children + smoker + charge | region,
  data = ins
)
northeast
(N=324)
northwest
(N=325)
southeast
(N=364)
southwest
(N=325)
Overall
(N=1338)
age
Mean (SD) 39.3 (14.1) 39.2 (14.1) 38.9 (14.2) 39.5 (14.0) 39.2 (14.0)
Median [Min, Max] 39.5 [18.0, 64.0] 39.0 [19.0, 64.0] 39.0 [18.0, 64.0] 39.0 [19.0, 64.0] 39.0 [18.0, 64.0]
sex
female 161 (49.7%) 164 (50.5%) 175 (48.1%) 162 (49.8%) 662 (49.5%)
male 163 (50.3%) 161 (49.5%) 189 (51.9%) 163 (50.2%) 676 (50.5%)
bmi
Mean (SD) 29.2 (5.94) 29.2 (5.14) 33.4 (6.48) 30.6 (5.69) 30.7 (6.10)
Median [Min, Max] 28.9 [16.0, 48.1] 28.9 [17.4, 42.9] 33.3 [19.8, 53.1] 30.3 [17.4, 47.6] 30.4 [16.0, 53.1]
children
Mean (SD) 1.05 (1.20) 1.15 (1.17) 1.05 (1.18) 1.14 (1.28) 1.09 (1.21)
Median [Min, Max] 1.00 [0, 5.00] 1.00 [0, 5.00] 1.00 [0, 5.00] 1.00 [0, 5.00] 1.00 [0, 5.00]
smoker
no 257 (79.3%) 267 (82.2%) 273 (75.0%) 267 (82.2%) 1064 (79.5%)
yes 67 (20.7%) 58 (17.8%) 91 (25.0%) 58 (17.8%) 274 (20.5%)
charge
Mean (SD) 13400 (11300) 12400 (11100) 14700 (14000) 12300 (11600) 13300 (12100)
Median [Min, Max] 10100 [1690, 58600] 8970 [1620, 60000] 9290 [1120, 63800] 8800 [1240, 52600] 9380 [1120, 63800]

#DÙNG COMPAREGROUPS ĐỂ TÓM TẮT DỮ LIỆU arr

# Describe age, race, prior and parole within each `finance` group, with an
# overall p-value per variable.
compareGroups(finance ~ age + race + prior + parole, data = arr) %>%
  createTable()
## 
## --------Summary descriptives table by 'finance'---------
## 
## ___________________________________________ 
##               no          yes     p.overall 
##              N=216       N=216              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age       24.2 (5.73) 25.0 (6.47)   0.203   
## race:                               0.241   
##     black 185 (85.6%) 194 (89.8%)           
##     other 31 (14.4%)  22 (10.2%)            
## prior     2.99 (2.92) 2.98 (2.88)   0.987   
## parole:                             0.843   
##     no    81 (37.5%)  84 (38.9%)            
##     yes   135 (62.5%) 132 (61.1%)           
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

#KIỂM ĐỊNH SỰ KHÁC BIỆT VỀ TỈ LỆ TÁI PHẠM GIỮA NGƯỜI DA ĐEN VÀ NGƯỜI KHÔNG DA ĐEN

# Race distribution within each arrest group (0 / 1), with the overall p-value.
compareGroups(arrest ~ race, data = arr) %>%
  createTable()
## 
## --------Summary descriptives table by 'arrest'---------
## 
## ___________________________________________ 
##                0           1      p.overall 
##              N=318       N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## race:                               0.621   
##     black 277 (87.1%) 102 (89.5%)           
##     other 41 (12.9%)  12 (10.5%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Pearson chi-squared test of independence between race and arrest.
# The arguments are kept as `arr$...` expressions because the printed
# "data:" line echoes them verbatim.
chisq.test(arr$race, arr$arrest)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  arr$race and arr$arrest
## X-squared = 0.24452, df = 1, p-value = 0.621

BIỂU DIỄN DƯỚI DẠNG BIỂU ĐỒ THANH (BAR CHART)

# Treat arrest as a categorical variable from here on, then tabulate arrest
# status within each race group.
arr[["arrest"]] <- as.factor(arr[["arrest"]])
compareGroups(race ~ arrest, data = arr) %>%
  createTable()
## 
## --------Summary descriptives table by 'race'---------
## 
## ________________________________________ 
##            black      other    p.overall 
##            N=379       N=53              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## arrest:                          0.621   
##     0   277 (73.1%) 41 (77.4%)           
##     1   102 (26.9%) 12 (22.6%)           
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Stacked bar chart of the race composition (in %) within each arrest group.
# Percentages are computed per `arrest` level, so every bar sums to 100%.
arr %>%
  count(arrest, race) %>%
  group_by(arrest) %>%
  mutate(percent = n / sum(n) * 100) %>%
  ungroup() %>%
  ggplot(aes(x = arrest, y = percent, fill = race)) +
  geom_col() +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", percent), "%")),
    position = position_stack(vjust = 0.5)
  ) +
  # Keep the legend: without it the fill colours cannot be matched to a race.
  theme(legend.position = "bottom") +
  labs(x = "Tái phạm", y = "Phần trăm (%)", fill = "Chủng tộc")

# VIỆC 5 - KIỂM ĐỊNH GIẢ THIẾT VÔ HIỆU RẰNG KHÔNG CÓ SỰ KHÁC BIỆT VỀ TỈ LỆ TÁI PHẠM GIỮA NHÓM ĐƯỢC HỖ TRỢ TÀI CHÁNH VÀ NHÓM KHÔNG ĐƯỢC HỖ TRỢ

# Financial-aid status within each arrest group, with the overall p-value.
compareGroups(arrest ~ finance, data = arr) %>%
  createTable()
## 
## --------Summary descriptives table by 'arrest'---------
## 
## _________________________________________ 
##               0          1      p.overall 
##             N=318      N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## finance:                          0.063   
##     no   150 (47.2%) 66 (57.9%)           
##     yes  168 (52.8%) 48 (42.1%)           
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Pearson chi-squared test of independence between financial aid and arrest.
# The arguments are kept as `arr$...` expressions because the printed
# "data:" line echoes them verbatim.
chisq.test(arr$finance, arr$arrest)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  arr$finance and arr$arrest
## X-squared = 3.4439, df = 1, p-value = 0.06349

#TRÌNH ĐỘ HỌC VẤN CÓ LIÊN QUAN ĐẾN TỈ LỆ TÁI PHẠM

# Store education level as a factor so it maps to a discrete fill scale.
arr$edu <- as.factor(arr$educ)

# Stacked bar chart of the education-level composition (in %) within each
# arrest group. Percentages are computed per `arrest` level, so every bar
# sums to 100%.
arr %>%
  count(arrest, edu) %>%
  group_by(arrest) %>%
  mutate(percent = n / sum(n) * 100) %>%
  ungroup() %>%
  ggplot(aes(x = arrest, y = percent, fill = edu)) +
  geom_col() +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", percent), "%")),
    position = position_stack(vjust = 0.5)
  ) +
  # Keep the legend: without it the fill colours cannot be matched to an
  # education level.
  theme(legend.position = "bottom") +
  labs(x = "Tái phạm", y = "Phần trăm (%)", fill = "Học vấn")

VIỆC 6 - KIỂM ĐỊNH SỰ KHÁC BIỆT VỀ ĐỘ TUỔI VÀ TIỀN SỬ TỘI PHẠM GIỮA NGƯỜI TÁI PHẠM VÀ NGƯỜI KHÔNG TÁI PHẠM

Người tái phạm trẻ hơn người không tái phạm?

# Mean (SD) of age within each arrest group, with the overall p-value.
compareGroups(arrest ~ age, data = arr) %>%
  createTable()
## 
## --------Summary descriptives table by 'arrest'---------
## 
## _____________________________________ 
##          0           1      p.overall 
##        N=318       N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age 25.3 (6.31) 22.8 (5.12)  <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Welch two-sample t-test comparing mean age between the two arrest groups.
# The formula is kept in `arr$...` form because the printed "data:" line
# echoes it verbatim.
t.test (arr$age ~ arr$arrest)
## 
##  Welch Two Sample t-test
## 
## data:  arr$age by arr$arrest
## t = 4.1789, df = 243.6, p-value = 4.086e-05
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  1.317143 3.665975
## sample estimates:
## mean in group 0 mean in group 1 
##        25.25472        22.76316
# Mean (SD) of prior within each arrest group, with the overall p-value.
compareGroups(arrest ~ prior, data = arr) %>%
  createTable()
## 
## --------Summary descriptives table by 'arrest'---------
## 
## _______________________________________ 
##            0           1      p.overall 
##          N=318       N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## prior 2.70 (2.55) 3.77 (3.59)   0.004   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# t.test(arr$prior ~ arr$arrest)  # formula form (`~`, not `,`) compares prior between the arrest groups