This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Per-column summary statistics of the built-in `cars` data set.
summary(object = cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library(readxl);
library(tidyverse);
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2);
library(dplyr);
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(gapminder)
library(ggthemes)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(base)
library(compareGroups)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# arr = Arrest dataset, ins = Insurance dataset.
# The data folder defaults to the original author's location but can be
# overridden with the WORKSHOP_DATA_DIR environment variable, so the document
# can be knitted on another machine without editing the code.
data_dir <- Sys.getenv(
  "WORKSHOP_DATA_DIR",
  unset = "C:\\Users\\hntn\\OneDrive - Sun Hydraulics\\Hoa\\Ftu\\DATA ANALYSIS\\Dataset for TDTU workshop 4-2022"
)
arr_path <- file.path(data_dir, "Arrest dataset.csv")
# Fail early with a clear message instead of read.csv()'s generic error.
if (!file.exists(arr_path)) {
  stop("Arrest dataset not found at: ", arr_path, call. = FALSE)
}
arr <- read.csv(arr_path)
# Show the first rows as a quick sanity check of the import.
head(arr)
## id age finance week arrest race work.exp married parole prior educ
## 1 1 27 no 20 1 black no not married yes 3 3
## 2 2 18 no 17 1 black no not married yes 8 4
## 3 3 19 no 25 1 other yes not married yes 13 3
## 4 4 23 yes 52 0 black yes married yes 1 5
## 5 5 19 no 52 0 other yes not married yes 3 3
## 6 6 24 no 52 0 black yes not married no 2 4
## employ1
## 1 no
## 2 no
## 3 no
## 4 no
## 5 no
## 6 no
# Load the Insurance dataset from the workshop data folder.
# The folder defaults to the original author's location but can be overridden
# with the WORKSHOP_DATA_DIR environment variable, so the document can be
# knitted on another machine without editing the code.
data_dir <- Sys.getenv(
  "WORKSHOP_DATA_DIR",
  unset = "C:\\Users\\hntn\\OneDrive - Sun Hydraulics\\Hoa\\Ftu\\DATA ANALYSIS\\Dataset for TDTU workshop 4-2022"
)
ins_path <- file.path(data_dir, "Insurance dataset.xlsx")
# Fail early with a clear message if the workbook is missing.
if (!file.exists(ins_path)) {
  stop("Insurance dataset not found at: ", ins_path, call. = FALSE)
}
ins <- read_excel(ins_path)
# Show the first rows as a quick sanity check of the import.
head(ins)
## # A tibble: 6 x 7
## age sex bmi children smoker region charge
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 19 female 27.9 0 yes southwest 16885.
## 2 18 male 33.8 1 no southeast 1726.
## 3 28 male 33 3 no southeast 4449.
## 4 33 male 22.7 0 no northwest 21984.
## 5 32 male 28.9 0 no northwest 3867.
## 6 31 female 25.7 0 no southeast 3757.
# Per-column summary of the insurance data.
summary(object = ins)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charge
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
# Descriptive table of the insurance variables, stratified by `region`
# (the variable after `|`).
table1(
  ~ age + sex + bmi + children + smoker + charge | region,
  data = ins
)
| | northeast (N=324) | northwest (N=325) | southeast (N=364) | southwest (N=325) | Overall (N=1338) |
|---|---|---|---|---|---|
| age | |||||
| Mean (SD) | 39.3 (14.1) | 39.2 (14.1) | 38.9 (14.2) | 39.5 (14.0) | 39.2 (14.0) |
| Median [Min, Max] | 39.5 [18.0, 64.0] | 39.0 [19.0, 64.0] | 39.0 [18.0, 64.0] | 39.0 [19.0, 64.0] | 39.0 [18.0, 64.0] |
| sex | |||||
| female | 161 (49.7%) | 164 (50.5%) | 175 (48.1%) | 162 (49.8%) | 662 (49.5%) |
| male | 163 (50.3%) | 161 (49.5%) | 189 (51.9%) | 163 (50.2%) | 676 (50.5%) |
| bmi | |||||
| Mean (SD) | 29.2 (5.94) | 29.2 (5.14) | 33.4 (6.48) | 30.6 (5.69) | 30.7 (6.10) |
| Median [Min, Max] | 28.9 [16.0, 48.1] | 28.9 [17.4, 42.9] | 33.3 [19.8, 53.1] | 30.3 [17.4, 47.6] | 30.4 [16.0, 53.1] |
| children | |||||
| Mean (SD) | 1.05 (1.20) | 1.15 (1.17) | 1.05 (1.18) | 1.14 (1.28) | 1.09 (1.21) |
| Median [Min, Max] | 1.00 [0, 5.00] | 1.00 [0, 5.00] | 1.00 [0, 5.00] | 1.00 [0, 5.00] | 1.00 [0, 5.00] |
| smoker | |||||
| no | 257 (79.3%) | 267 (82.2%) | 273 (75.0%) | 267 (82.2%) | 1064 (79.5%) |
| yes | 67 (20.7%) | 58 (17.8%) | 91 (25.0%) | 58 (17.8%) | 274 (20.5%) |
| charge | |||||
| Mean (SD) | 13400 (11300) | 12400 (11100) | 14700 (14000) | 12300 (11600) | 13300 (12100) |
| Median [Min, Max] | 10100 [1690, 58600] | 8970 [1620, 60000] | 9290 [1120, 63800] | 8800 [1240, 52600] | 9380 [1120, 63800] |
# Use compareGroups to summarise the `arr` data: describe age, race, prior and
# parole within each level of `finance`.
createTable(
  compareGroups(finance ~ age + race + prior + parole, data = arr)
)
##
## --------Summary descriptives table by 'finance'---------
##
## ___________________________________________
## no yes p.overall
## N=216 N=216
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 24.2 (5.73) 25.0 (6.47) 0.203
## race: 0.241
## black 185 (85.6%) 194 (89.8%)
## other 31 (14.4%) 22 (10.2%)
## prior 2.99 (2.92) 2.98 (2.88) 0.987
## parole: 0.843
## no 81 (37.5%) 84 (38.9%)
## yes 135 (62.5%) 132 (61.1%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Test the difference in the recidivism rate between black people and
# non-black people.
createTable(
  compareGroups(arrest ~ race, data = arr)
)
##
## --------Summary descriptives table by 'arrest'---------
##
## ___________________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## race: 0.621
## black 277 (87.1%) 102 (89.5%)
## other 41 (12.9%) 12 (10.5%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Chi-squared test of independence between race and arrest.
chisq.test(x = arr$race, y = arr$arrest)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: arr$race and arr$arrest
## X-squared = 0.24452, df = 1, p-value = 0.621
# Store arrest as a factor so it is tabulated as a categorical variable,
# then describe arrest within each race group.
arr[["arrest"]] <- as.factor(arr[["arrest"]])
createTable(
  compareGroups(race ~ arrest, data = arr)
)
##
## --------Summary descriptives table by 'race'---------
##
## ________________________________________
## black other p.overall
## N=379 N=53
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## arrest: 0.621
## 0 277 (73.1%) 41 (77.4%)
## 1 102 (26.9%) 12 (22.6%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Stacked bar chart of race within each arrest group.
# Percentages are computed within each arrest level, so every bar sums to 100%.
arr %>%
  count(arrest, race) %>%
  group_by(arrest) %>%
  mutate(percent = n / sum(n) * 100) %>%
  # Drop the grouping once the percentages are computed.
  ungroup() %>%
  ggplot(aes(x = arrest, y = percent, fill = race)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", percent), "%")),
    position = position_stack(vjust = 0.5)
  ) +
  # The legend is kept (the original hid it): it is the only key to which
  # fill colour corresponds to which race.
  labs(x = "Tái phạm", y = "Phần trăm (%)", fill = "Chủng tộc")
# TASK 5 - Test the null hypothesis that there is no difference in the
# recidivism rate between the group that received financial aid and the group
# that did not.
createTable(
  compareGroups(arrest ~ finance, data = arr)
)
##
## --------Summary descriptives table by 'arrest'---------
##
## _________________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## finance: 0.063
## no 150 (47.2%) 66 (57.9%)
## yes 168 (52.8%) 48 (42.1%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Chi-squared test of independence between finance and arrest.
chisq.test(x = arr$finance, y = arr$arrest)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: arr$finance and arr$arrest
## X-squared = 3.4439, df = 1, p-value = 0.06349
# Is education level related to the recidivism rate?
# Store education as a factor so ggplot treats it as discrete fill categories.
arr$edu <- as.factor(arr$educ)
# Stacked bar chart of education level within each arrest group.
# Percentages are computed within each arrest level, so every bar sums to 100%.
arr %>%
  count(arrest, edu) %>%
  group_by(arrest) %>%
  mutate(percent = n / sum(n) * 100) %>%
  # Drop the grouping once the percentages are computed.
  ungroup() %>%
  ggplot(aes(x = arrest, y = percent, fill = edu)) +
  geom_bar(stat = "identity") +
  geom_text(
    aes(label = paste0(sprintf("%1.1f", percent), "%")),
    position = position_stack(vjust = 0.5)
  ) +
  # The legend is kept (the original hid it): it is the only key to which
  # fill colour corresponds to which education level.
  labs(x = "Tái phạm", y = "Phần trăm (%)", fill = "Trình độ học vấn")
# Describe age within each arrest group.
createTable(
  compareGroups(arrest ~ age, data = arr)
)
##
## --------Summary descriptives table by 'arrest'---------
##
## _____________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 25.3 (6.31) 22.8 (5.12) <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Two-sample t-test of age between the arrest groups.
t.test(arr$age ~ arr$arrest)
##
## Welch Two Sample t-test
##
## data: arr$age by arr$arrest
## t = 4.1789, df = 243.6, p-value = 4.086e-05
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 1.317143 3.665975
## sample estimates:
## mean in group 0 mean in group 1
## 25.25472 22.76316
# Describe `prior` within each arrest group.
createTable(
  compareGroups(arrest ~ prior, data = arr)
)
##
## --------Summary descriptives table by 'arrest'---------
##
## _______________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## prior 2.70 (2.55) 3.77 (3.59) 0.004
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Two-sample t-test of `prior` between the arrest groups.
# The earlier attempt, t.test(arr$prior, arr$arrest), was disabled: it passes
# arrest (a factor at this point) as a second numeric sample. arrest is the
# grouping variable, so it belongs on the right-hand side of a formula.
t.test(arr$prior ~ arr$arrest)