Để cài đặt gói phần mềm, chúng ta sử dụng hàm install.packages với cú pháp như sau: install.packages('tên gói phần mềm', dependencies = TRUE). Đối số dependencies = TRUE có ý nghĩa yêu cầu cài đặt cùng lúc các gói phụ trợ. Hãy làm quen với những package sau đây: table1, sjPlot, sjmisc.
# Read the crime records CSV into a data frame. The path is an absolute
# Windows path; adjust it when running on another machine.
crime <- read.csv(
  file = "C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\Crime data 2003to2018.csv"
)
# Column names of the data set.
names(crime)
## [1] "ID" "Category" "Description" "Day" "Date"
## [6] "Time" "District" "Resolution"
# Number of rows and columns.
dim(crime)
## [1] 1048575 8
# Preview the first six rows.
head(crime)
## ID Category Description Day
## 1 180362289 VEHICLE THEFT STOLEN MOTORCYCLE Tuesday
## 2 180360948 NON-CRIMINAL AIDED CASE, MENTAL DISTURBED Tuesday
## 3 180360879 OTHER OFFENSES PAROLE VIOLATION Tuesday
## 4 180360879 OTHER OFFENSES TRAFFIC VIOLATION ARREST Tuesday
## 5 180360879 OTHER OFFENSES TRAFFIC VIOLATION Tuesday
## 6 180360829 OTHER OFFENSES DRIVERS LICENSE, SUSPENDED OR REVOKED Tuesday
## Date Time District Resolution
## 1 05/15/2018 10:30 SOUTHERN NONE
## 2 05/15/2018 4:14 SOUTHERN NONE
## 3 05/15/2018 2:01 MISSION ARREST, BOOKED
## 4 05/15/2018 2:01 MISSION ARREST, BOOKED
## 5 05/15/2018 2:01 MISSION ARREST, BOOKED
## 6 05/15/2018 1:27 MISSION NONE
# Load sjPlot, which provides plot_frq() for plotting frequencies.
library(sjPlot)
## #refugeeswelcome
# Frequency of each crime category.
plot_frq(crime$Category)
# Flip the chart to horizontal bars and sort from largest to smallest.
# NOTE(review): the direction is described as read top-to-bottom on the
# flipped chart -- confirm against the rendered plot.
plot_frq(crime$Category, sort.frq = "asc", coord.flip = TRUE)
# Flip the chart to horizontal bars and sort from smallest to largest.
plot_frq(crime$Category, sort.frq = "desc", coord.flip = TRUE)
# On which day of the week do crimes usually occur?
plot_frq(crime$Day)
# Order the factor levels Monday..Sunday so the bars follow the calendar week
# rather than the default ordering. Values not listed in `levels` become NA.
crime$Day <- factor(
  crime$Day,
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)
plot_frq(crime$Day)
# Read the obesity data set; the classification below uses its `bmi` column.
ob <- read.csv("C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\Obesity data.csv")

# Classify BMI into four categories using contiguous half-open intervals
# [lower, upper):
#   Underweight: bmi < 18.5
#   Normal:      18.5 <= bmi < 25
#   Overweight:  25   <= bmi < 30
#   Obese:       bmi >= 30
# The boundaries must touch: upper limits of 24.9 / 29.9 would leave values
# such as 24.95 or 29.95 unclassified (NA). A missing bmi stays NA.
# `right = FALSE` makes each interval closed on the left, open on the right.
ob$OB <- cut(
  ob$bmi,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("Underweight", "Normal", "Overweight", "Obese"),
  right = FALSE
)

# Plot how many records fall in each BMI category.
library(sjPlot)
plot_frq(ob$OB)
# Read the two PISA VN 2015 files: one with student records, one with schools.
students <- read.csv(
  file = "C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\PISA VN 2015.csv"
)
schools <- read.csv(
  file = "C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\PISA VN SCHOOLS 2015.csv"
)
# Find out which (and how many) variables students and schools contain.
names(students)
## [1] "CNTSCHID" "AGE" "Gender" "PARED" "HEDRES" "MISCED"
## [7] "FISCED" "HISCED" "WEALTH" "ESCS" "INSTSCIE" "SCIEEFF"
## [13] "JOYSCIE" "ICTRES" "HOMEPOS" "HEDRES.1" "CULTPOSS" "PV1MATH"
## [19] "PV1READ" "PV1SCIE"
names(schools)
## [1] "STRATUM" "CNTSCHID" "SCHSIZE" "CLSIZE" "STRATIO" "SCHLTYPE"
## [7] "Region" "Area"
# Merge the two data sets on the shared key column CNTSCHID.
pisa <- merge(x = students, y = schools, by = "CNTSCHID")
names(pisa)
## [1] "CNTSCHID" "AGE" "Gender" "PARED" "HEDRES" "MISCED"
## [7] "FISCED" "HISCED" "WEALTH" "ESCS" "INSTSCIE" "SCIEEFF"
## [13] "JOYSCIE" "ICTRES" "HOMEPOS" "HEDRES.1" "CULTPOSS" "PV1MATH"
## [19] "PV1READ" "PV1SCIE" "STRATUM" "SCHSIZE" "CLSIZE" "STRATIO"
## [25] "SCHLTYPE" "Region" "Area"
# Attach display labels to the three score variables.
library(DescTools)
Label(pisa$PV1MATH) <- "Math"
Label(pisa$PV1SCIE) <- "Science"
Label(pisa$PV1READ) <- "Reading"
# Alternatively, rename the variables themselves:
# names(pisa)[names(pisa) == "PV1MATH"] <- "Math"
# names(pisa)[names(pisa) == "PV1SCIE"] <- "Science"
# names(pisa)[names(pisa) == "PV1READ"] <- "Reading"
# Per-column summary of the merged data set.
summary(pisa)
## CNTSCHID AGE Gender PARED
## Min. :70400001 Min. :15.33 Boys :2786 Min. : 3.000
## 1st Qu.:70400052 1st Qu.:15.50 Girls:3040 1st Qu.: 9.000
## Median :70400096 Median :15.75 Median : 9.000
## Mean :70400097 Mean :15.78 Mean : 9.374
## 3rd Qu.:70400143 3rd Qu.:16.00 3rd Qu.:12.000
## Max. :70400188 Max. :16.25 Max. :17.000
## NA's :14
## HEDRES MISCED FISCED HISCED
## Min. :-4.3706 Min. :0.000 Min. :0.000 Min. :0.00
## 1st Qu.:-1.5169 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.00
## Median :-1.1119 Median :2.000 Median :2.000 Median :2.00
## Mean :-1.0470 Mean :2.069 Mean :2.296 Mean :2.58
## 3rd Qu.:-0.7026 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:4.00
## Max. : 1.1767 Max. :6.000 Max. :6.000 Max. :6.00
## NA's :20 NA's :34 NA's :88 NA's :14
## WEALTH ESCS INSTSCIE SCIEEFF
## Min. :-7.635 Min. :-5.657 Min. :-1.9301 Min. :-3.7565
## 1st Qu.:-2.829 1st Qu.:-2.539 1st Qu.: 0.0125 1st Qu.:-0.8585
## Median :-2.163 Median :-1.982 Median : 0.3708 Median :-0.3473
## Mean :-2.219 Mean :-1.822 Mean : 0.4835 Mean :-0.2662
## 3rd Qu.:-1.504 3rd Qu.:-1.221 3rd Qu.: 1.0218 3rd Qu.: 0.3155
## Max. : 3.211 Max. : 1.950 Max. : 1.7359 Max. : 3.2775
## NA's :15 NA's :1 NA's :17 NA's :19
## JOYSCIE ICTRES HOMEPOS HEDRES.1
## Min. :-2.1154 Min. :-3.508 Min. :-8.955 Min. :-4.3706
## 1st Qu.: 0.5094 1st Qu.:-2.587 1st Qu.:-2.669 1st Qu.:-1.5169
## Median : 0.5094 Median :-1.855 Median :-2.047 Median :-1.1119
## Mean : 0.6448 Mean :-1.795 Mean :-2.042 Mean :-1.0470
## 3rd Qu.: 1.1049 3rd Qu.:-1.117 3rd Qu.:-1.354 3rd Qu.:-0.7026
## Max. : 2.1635 Max. : 3.497 Max. : 2.770 Max. : 1.1767
## NA's :19 NA's :34 NA's :2 NA's :20
## CULTPOSS PV1MATH PV1READ PV1SCIE
## Min. :-1.8413 Min. :201.7 Min. :107.1 Min. :292.7
## 1st Qu.:-0.8310 1st Qu.:440.0 1st Qu.:442.5 1st Qu.:470.9
## Median :-0.4113 Median :493.4 Median :489.5 Median :523.9
## Mean :-0.4444 Mean :496.1 Mean :489.9 Mean :524.8
## 3rd Qu.: 0.1157 3rd Qu.:551.5 3rd Qu.:537.6 3rd Qu.:574.8
## Max. : 2.1655 Max. :820.1 Max. :744.1 Max. :807.3
## NA's :52
## STRATUM SCHSIZE CLSIZE STRATIO
## VNM0313:989 Min. : 113 Min. :13.00 Min. : 4.314
## VNM0208:884 1st Qu.: 650 1st Qu.:38.00 1st Qu.:14.024
## VNM0101:806 Median :1090 Median :38.00 Median :16.627
## VNM0207:790 Mean :1082 Mean :40.57 Mean :16.497
## VNM0102:764 3rd Qu.:1419 3rd Qu.:43.00 3rd Qu.:18.983
## VNM0314:679 Max. :4016 Max. :53.00 Max. :38.651
## (Other):914 NA's :34
## SCHLTYPE Region Area
## Min. :1.000 CENTRAL:2006 REMOTE: 410
## 1st Qu.:3.000 NORTH :1958 RURAL :2368
## Median :3.000 SOUTH :1862 URBAN :3048
## Mean :2.849
## 3rd Qu.:3.000
## Max. :3.000
## NA's :35
# Frequency charts of the Region and Area values in the merged data set.
library(sjPlot)
plot_frq(pisa$Region)
plot_frq(pisa$Area)
# table1 ----
# Load table1 for the descriptive tables below. The section heading "table1"
# had been fused onto this call, producing the invalid `table1library(...)`.
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive table of the six selected variables over the whole sample.
table1(
  ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ,
  data = pisa
)
| | Overall (n=5826) |
|---|---|
| WEALTH | |
| Mean (SD) | -2.22 (1.16) |
| Median [Min, Max] | -2.16 [-7.64, 3.21] |
| Missing | 15 (0.3%) |
| PARED | |
| Mean (SD) | 9.37 (3.51) |
| Median [Min, Max] | 9.00 [3.00, 17.0] |
| Missing | 14 (0.2%) |
| HEDRES | |
| Mean (SD) | -1.05 (0.942) |
| Median [Min, Max] | -1.11 [-4.37, 1.18] |
| Missing | 20 (0.3%) |
| Math | |
| Mean (SD) | 496 (81.5) |
| Median [Min, Max] | 493 [202, 820] |
| Science | |
| Mean (SD) | 525 (75.0) |
| Median [Min, Max] | 524 [293, 807] |
| Reading | |
| Mean (SD) | 490 (70.6) |
| Median [Min, Max] | 489 [107, 744] |
# Same descriptive table, stratified by Region.
table1(
  ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ | Region,
  data = pisa
)
| | CENTRAL (n=2006) | NORTH (n=1958) | SOUTH (n=1862) | Overall (n=5826) |
|---|---|---|---|---|
| WEALTH | ||||
| Mean (SD) | -2.40 (1.12) | -2.18 (1.18) | -2.06 (1.14) | -2.22 (1.16) |
| Median [Min, Max] | -2.33 [-7.64, 1.41] | -2.14 [-7.64, 2.63] | -2.03 [-7.64, 3.21] | -2.16 [-7.64, 3.21] |
| Missing | 6 (0.3%) | 8 (0.4%) | 1 (0.1%) | 15 (0.3%) |
| PARED | ||||
| Mean (SD) | 9.49 (3.44) | 9.76 (3.51) | 8.85 (3.54) | 9.37 (3.51) |
| Median [Min, Max] | 9.00 [3.00, 17.0] | 9.00 [3.00, 17.0] | 9.00 [3.00, 17.0] | 9.00 [3.00, 17.0] |
| Missing | 3 (0.1%) | 9 (0.5%) | 2 (0.1%) | 14 (0.2%) |
| HEDRES | ||||
| Mean (SD) | -1.04 (0.955) | -1.06 (0.930) | -1.04 (0.941) | -1.05 (0.942) |
| Median [Min, Max] | -1.11 [-4.37, 1.18] | -1.11 [-4.37, 1.16] | -1.11 [-4.37, 1.16] | -1.11 [-4.37, 1.18] |
| Missing | 7 (0.3%) | 7 (0.4%) | 6 (0.3%) | 20 (0.3%) |
| Math | ||||
| Mean (SD) | 492 (86.5) | 501 (84.4) | 496 (72.2) | 496 (81.5) |
| Median [Min, Max] | 488 [202, 818] | 500 [251, 820] | 494 [241, 719] | 493 [202, 820] |
| Science | ||||
| Mean (SD) | 524 (79.8) | 523 (76.6) | 528 (67.3) | 525 (75.0) |
| Median [Min, Max] | 520 [307, 807] | 522 [293, 775] | 528 [337, 761] | 524 [293, 807] |
| Reading | ||||
| Mean (SD) | 488 (74.3) | 489 (72.4) | 493 (64.4) | 490 (70.6) |
| Median [Min, Max] | 486 [233, 744] | 489 [107, 718] | 493 [272, 698] | 489 [107, 744] |
# Same descriptive table, stratified by Area.
table1(
  ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ | Area,
  data = pisa
)
| | REMOTE (n=410) | RURAL (n=2368) | URBAN (n=3048) | Overall (n=5826) |
|---|---|---|---|---|
| WEALTH | ||||
| Mean (SD) | -3.00 (1.25) | -2.22 (1.08) | -2.12 (1.16) | -2.22 (1.16) |
| Median [Min, Max] | -2.83 [-7.64, -0.0430] | -2.16 [-7.64, 1.43] | -2.10 [-7.64, 3.21] | -2.16 [-7.64, 3.21] |
| Missing | 6 (1.5%) | 7 (0.3%) | 2 (0.1%) | 15 (0.3%) |
| PARED | ||||
| Mean (SD) | 7.90 (3.69) | 9.38 (3.47) | 9.56 (3.48) | 9.37 (3.51) |
| Median [Min, Max] | 9.00 [3.00, 17.0] | 9.00 [3.00, 17.0] | 9.00 [3.00, 17.0] | 9.00 [3.00, 17.0] |
| Missing | 8 (2.0%) | 5 (0.2%) | 1 (0.0%) | 14 (0.2%) |
| HEDRES | ||||
| Mean (SD) | -1.66 (1.12) | -0.985 (0.946) | -1.01 (0.883) | -1.05 (0.942) |
| Median [Min, Max] | -1.44 [-4.37, 1.16] | -1.11 [-4.37, 1.18] | -1.11 [-4.37, 1.16] | -1.11 [-4.37, 1.18] |
| Missing | 6 (1.5%) | 9 (0.4%) | 5 (0.2%) | 20 (0.3%) |
| Math | ||||
| Mean (SD) | 450 (82.0) | 500 (81.9) | 499 (79.3) | 496 (81.5) |
| Median [Min, Max] | 446 [216, 696] | 498 [273, 818] | 497 [202, 820] | 493 [202, 820] |
| Science | ||||
| Mean (SD) | 482 (74.4) | 529 (75.5) | 527 (72.8) | 525 (75.0) |
| Median [Min, Max] | 475 [307, 698] | 529 [335, 807] | 525 [293, 799] | 524 [293, 807] |
| Reading | ||||
| Mean (SD) | 440 (76.0) | 491 (67.6) | 496 (69.6) | 490 (70.6) |
| Median [Min, Max] | 439 [233, 643] | 490 [292, 744] | 495 [107, 718] | 489 [107, 744] |
# Load compareGroups to build by-group descriptive tables.
library(compareGroups)
## Loading required package: SNPassoc
## Loading required package: haplo.stats
## Loading required package: survival
## Loading required package: mvtnorm
## Loading required package: parallel
## Registered S3 method overwritten by 'SNPassoc':
## method from
## summary.haplo.glm haplo.stats
# Compare the six variables across Area (t1) and across Region (t2).
t1 <- compareGroups(
  Area ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ,
  data = pisa
)
t2 <- compareGroups(
  Region ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ,
  data = pisa
)
# Print the summary table grouped by Area.
createTable(t1)
##
## --------Summary descriptives table by 'Area'---------
##
## ________________________________________________________
## REMOTE RURAL URBAN p.overall
## N=410 N=2368 N=3048
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## WEALTH -3.00 (1.25) -2.22 (1.08) -2.12 (1.16) <0.001
## PARED 7.90 (3.69) 9.38 (3.47) 9.56 (3.48) <0.001
## HEDRES -1.66 (1.12) -0.98 (0.95) -1.01 (0.88) <0.001
## Math 450 (82.0) 500 (81.9) 499 (79.3) <0.001
## Science 482 (74.4) 529 (75.5) 527 (72.8) <0.001
## Reading 440 (76.0) 491 (67.6) 496 (69.6) <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Print the summary table grouped by Region.
createTable(x = t2)
##
## --------Summary descriptives table by 'Region'---------
##
## ________________________________________________________
## CENTRAL NORTH SOUTH p.overall
## N=2006 N=1958 N=1862
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## WEALTH -2.40 (1.12) -2.18 (1.18) -2.06 (1.14) <0.001
## PARED 9.49 (3.44) 9.76 (3.51) 8.85 (3.54) <0.001
## HEDRES -1.04 (0.95) -1.06 (0.93) -1.04 (0.94) 0.655
## Math 492 (86.5) 501 (84.4) 496 (72.2) 0.002
## Science 524 (79.8) 523 (76.6) 528 (67.3) 0.075
## Reading 488 (74.3) 489 (72.4) 493 (64.4) 0.042
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯