Để cài đặt gói phần mềm, chúng ta sử dụng hàm install.packages với cú pháp như sau: install.packages("tên gói phần mềm", dependencies = TRUE). Đối số dependencies = TRUE có ý nghĩa yêu cầu cài cùng lúc các gói phụ trợ. Làm quen với những package sau đây: table1, sjPlot, sjmisc.

TASK 2

Đọc dữ liệu vào R

# Load the crime records from the workshop's CSV file.
# NOTE(review): absolute, machine-specific path -- adjust it to where the
# dataset lives on your machine.
crime <- read.csv(
  file = "C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\Crime data 2003to2018.csv"
)

Tìm hiểu dữ liệu

# List the column names of the crime data frame.
colnames(crime)
## [1] "ID"          "Category"    "Description" "Day"         "Date"       
## [6] "Time"        "District"    "Resolution"
# Size of the data set as (rows, columns).
# NOTE(review): 1048575 rows is exactly Excel's 1,048,576-row sheet limit
# minus the header row, so the CSV may have been truncated when it was
# exported -- confirm against the original source.
dim(crime)
## [1] 1048575       8
# Preview the first six records.
head(crime, n = 6L)
##          ID       Category                           Description     Day
## 1 180362289  VEHICLE THEFT                     STOLEN MOTORCYCLE Tuesday
## 2 180360948   NON-CRIMINAL          AIDED CASE, MENTAL DISTURBED Tuesday
## 3 180360879 OTHER OFFENSES                      PAROLE VIOLATION Tuesday
## 4 180360879 OTHER OFFENSES              TRAFFIC VIOLATION ARREST Tuesday
## 5 180360879 OTHER OFFENSES                     TRAFFIC VIOLATION Tuesday
## 6 180360829 OTHER OFFENSES DRIVERS LICENSE, SUSPENDED OR REVOKED Tuesday
##         Date  Time District     Resolution
## 1 05/15/2018 10:30 SOUTHERN           NONE
## 2 05/15/2018  4:14 SOUTHERN           NONE
## 3 05/15/2018  2:01  MISSION ARREST, BOOKED
## 4 05/15/2018  2:01  MISSION ARREST, BOOKED
## 5 05/15/2018  2:01  MISSION ARREST, BOOKED
## 6 05/15/2018  1:27  MISSION           NONE

Tìm hiểu có bao nhiêu loại tội phạm và tội phạm thường xảy ra vào ngày nào

library(sjPlot)
## #refugeeswelcome
# Frequency plot of the crime categories with the default settings.
plot_frq(crime$Category)

# Horizontal bars, sorted by frequency with sort.frq = "asc".
# The original note describes this as "largest to smallest"; presumably that
# is the order as read from top to bottom once the axes are flipped --
# TODO confirm on the rendered plot.
plot_frq(crime$Category, sort.frq = "asc", coord.flip = TRUE)

# Horizontal bars, sorted by frequency with sort.frq = "desc".
# The original note describes this as "smallest to largest" (same caveat
# about reading direction after the flip -- TODO confirm).
plot_frq(crime$Category, sort.frq = "desc", coord.flip = TRUE)

# On which day of the week do crimes occur most often?
plot_frq(crime$Day)

# Fix the factor levels so the bars follow calendar order (Monday first).
# Any value of Day that is not one of these seven names becomes NA.
crime$Day <- factor(
  crime$Day,
  levels = c(
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday"
  )
)
plot_frq(crime$Day)

TASK 3

Tạo biến mới

# Load the obesity data and derive a categorical BMI variable `OB`.
ob <- read.csv(
  file = "C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\Obesity data.csv"
)

# Classify BMI into four contiguous, left-closed intervals:
#   [-Inf, 18.5) Underweight, [18.5, 25) Normal,
#   [25, 30) Overweight,      [30, Inf) Obese.
# The previous version used upper bounds of 24.9 and 29.9, which left any BMI
# in [24.9, 25) or [29.9, 30) unclassified (NA). cut() guarantees there are no
# gaps between categories and returns a factor whose levels are already in
# this order; a missing bmi stays NA.
ob$OB <- cut(
  ob$bmi,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("Underweight", "Normal", "Overweight", "Obese"),
  right = FALSE
)

# Bar chart of the number of subjects in each BMI category.
library(sjPlot)
plot_frq(ob$OB)

TASK 4

Hợp nhất dữ liệu

# Load the two PISA files: one read into `students`, one into `schools`.
# NOTE(review): absolute, machine-specific paths -- adjust to your machine.
students <- read.csv(
  file = "C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\PISA VN 2015.csv"
)
schools <- read.csv(
  file = "C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\PISA VN SCHOOLS 2015.csv"
)

# Check which variables each data set contains, starting with `students`.
colnames(students)
##  [1] "CNTSCHID" "AGE"      "Gender"   "PARED"    "HEDRES"   "MISCED"  
##  [7] "FISCED"   "HISCED"   "WEALTH"   "ESCS"     "INSTSCIE" "SCIEEFF" 
## [13] "JOYSCIE"  "ICTRES"   "HOMEPOS"  "HEDRES.1" "CULTPOSS" "PV1MATH" 
## [19] "PV1READ"  "PV1SCIE"
# Variables available in the `schools` data set.
colnames(schools)
## [1] "STRATUM"  "CNTSCHID" "SCHSIZE"  "CLSIZE"   "STRATIO"  "SCHLTYPE"
## [7] "Region"   "Area"
# Merge the two data sets on the shared school identifier CNTSCHID.
# merge() performs an inner join by default, so only rows whose CNTSCHID
# appears in both data frames are kept.
pisa <- merge(x = students, y = schools, by = "CNTSCHID")
colnames(pisa)
##  [1] "CNTSCHID" "AGE"      "Gender"   "PARED"    "HEDRES"   "MISCED"  
##  [7] "FISCED"   "HISCED"   "WEALTH"   "ESCS"     "INSTSCIE" "SCIEEFF" 
## [13] "JOYSCIE"  "ICTRES"   "HOMEPOS"  "HEDRES.1" "CULTPOSS" "PV1MATH" 
## [19] "PV1READ"  "PV1SCIE"  "STRATUM"  "SCHSIZE"  "CLSIZE"   "STRATIO" 
## [25] "SCHLTYPE" "Region"   "Area"
# Attach display labels to the three score variables; the column names
# themselves are left unchanged.
library(DescTools)
Label(pisa$PV1MATH) <- "Math"
Label(pisa$PV1SCIE) <- "Science"
Label(pisa$PV1READ) <- "Reading"
# Alternative: rename the columns instead of labelling them. Note that the
# later table1()/compareGroups() formulas refer to the PV1* names and would
# have to be updated if this variant is used.
# names(pisa)[names(pisa) == "PV1MATH"] <- "Math"
# names(pisa)[names(pisa) == "PV1SCIE"] <- "Science"
# names(pisa)[names(pisa) == "PV1READ"] <- "Reading"

Tóm tắt dữ liệu

# Per-column summary of the merged data (quartiles/mean for numeric columns,
# counts for factors, plus the number of NA values).
# NOTE(review): in the output below HEDRES.1 has exactly the same statistics
# as HEDRES, so it is probably a duplicated column in the students file --
# confirm and drop it if so.
summary(object = pisa)
##     CNTSCHID             AGE          Gender         PARED       
##  Min.   :70400001   Min.   :15.33   Boys :2786   Min.   : 3.000  
##  1st Qu.:70400052   1st Qu.:15.50   Girls:3040   1st Qu.: 9.000  
##  Median :70400096   Median :15.75                Median : 9.000  
##  Mean   :70400097   Mean   :15.78                Mean   : 9.374  
##  3rd Qu.:70400143   3rd Qu.:16.00                3rd Qu.:12.000  
##  Max.   :70400188   Max.   :16.25                Max.   :17.000  
##                                                  NA's   :14      
##      HEDRES            MISCED          FISCED          HISCED    
##  Min.   :-4.3706   Min.   :0.000   Min.   :0.000   Min.   :0.00  
##  1st Qu.:-1.5169   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.00  
##  Median :-1.1119   Median :2.000   Median :2.000   Median :2.00  
##  Mean   :-1.0470   Mean   :2.069   Mean   :2.296   Mean   :2.58  
##  3rd Qu.:-0.7026   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.00  
##  Max.   : 1.1767   Max.   :6.000   Max.   :6.000   Max.   :6.00  
##  NA's   :20        NA's   :34      NA's   :88      NA's   :14    
##      WEALTH            ESCS           INSTSCIE          SCIEEFF       
##  Min.   :-7.635   Min.   :-5.657   Min.   :-1.9301   Min.   :-3.7565  
##  1st Qu.:-2.829   1st Qu.:-2.539   1st Qu.: 0.0125   1st Qu.:-0.8585  
##  Median :-2.163   Median :-1.982   Median : 0.3708   Median :-0.3473  
##  Mean   :-2.219   Mean   :-1.822   Mean   : 0.4835   Mean   :-0.2662  
##  3rd Qu.:-1.504   3rd Qu.:-1.221   3rd Qu.: 1.0218   3rd Qu.: 0.3155  
##  Max.   : 3.211   Max.   : 1.950   Max.   : 1.7359   Max.   : 3.2775  
##  NA's   :15       NA's   :1        NA's   :17        NA's   :19       
##     JOYSCIE            ICTRES          HOMEPOS          HEDRES.1      
##  Min.   :-2.1154   Min.   :-3.508   Min.   :-8.955   Min.   :-4.3706  
##  1st Qu.: 0.5094   1st Qu.:-2.587   1st Qu.:-2.669   1st Qu.:-1.5169  
##  Median : 0.5094   Median :-1.855   Median :-2.047   Median :-1.1119  
##  Mean   : 0.6448   Mean   :-1.795   Mean   :-2.042   Mean   :-1.0470  
##  3rd Qu.: 1.1049   3rd Qu.:-1.117   3rd Qu.:-1.354   3rd Qu.:-0.7026  
##  Max.   : 2.1635   Max.   : 3.497   Max.   : 2.770   Max.   : 1.1767  
##  NA's   :19        NA's   :34       NA's   :2        NA's   :20       
##     CULTPOSS          PV1MATH         PV1READ         PV1SCIE     
##  Min.   :-1.8413   Min.   :201.7   Min.   :107.1   Min.   :292.7  
##  1st Qu.:-0.8310   1st Qu.:440.0   1st Qu.:442.5   1st Qu.:470.9  
##  Median :-0.4113   Median :493.4   Median :489.5   Median :523.9  
##  Mean   :-0.4444   Mean   :496.1   Mean   :489.9   Mean   :524.8  
##  3rd Qu.: 0.1157   3rd Qu.:551.5   3rd Qu.:537.6   3rd Qu.:574.8  
##  Max.   : 2.1655   Max.   :820.1   Max.   :744.1   Max.   :807.3  
##  NA's   :52                                                       
##     STRATUM       SCHSIZE         CLSIZE         STRATIO      
##  VNM0313:989   Min.   : 113   Min.   :13.00   Min.   : 4.314  
##  VNM0208:884   1st Qu.: 650   1st Qu.:38.00   1st Qu.:14.024  
##  VNM0101:806   Median :1090   Median :38.00   Median :16.627  
##  VNM0207:790   Mean   :1082   Mean   :40.57   Mean   :16.497  
##  VNM0102:764   3rd Qu.:1419   3rd Qu.:43.00   3rd Qu.:18.983  
##  VNM0314:679   Max.   :4016   Max.   :53.00   Max.   :38.651  
##  (Other):914                  NA's   :34                      
##     SCHLTYPE         Region         Area     
##  Min.   :1.000   CENTRAL:2006   REMOTE: 410  
##  1st Qu.:3.000   NORTH  :1958   RURAL :2368  
##  Median :3.000   SOUTH  :1862   URBAN :3048  
##  Mean   :2.849                               
##  3rd Qu.:3.000                               
##  Max.   :3.000                               
##  NA's   :35
# sjPlot is already attached earlier in this document; loading it again is
# harmless and lets this section be run on its own.
library(sjPlot)
# Number of students per region (CENTRAL / NORTH / SOUTH).
plot_frq(pisa$Region)

# Number of students per area type (REMOTE / RURAL / URBAN).
plot_frq(pisa$Area)

TASK 5

Phân tích mô tả pisa với package table1

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table for the whole sample. The three score variables are shown
# under their labels (Math, Science, Reading) in the output below.
table1(
  ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ,
  data = pisa
)
Overall
(n=5826)
WEALTH
Mean (SD) -2.22 (1.16)
Median [Min, Max] -2.16 [-7.64, 3.21]
Missing 15 (0.3%)
PARED
Mean (SD) 9.37 (3.51)
Median [Min, Max] 9.00 [3.00, 17.0]
Missing 14 (0.2%)
HEDRES
Mean (SD) -1.05 (0.942)
Median [Min, Max] -1.11 [-4.37, 1.18]
Missing 20 (0.3%)
Math
Mean (SD) 496 (81.5)
Median [Min, Max] 493 [202, 820]
Science
Mean (SD) 525 (75.0)
Median [Min, Max] 524 [293, 807]
Reading
Mean (SD) 490 (70.6)
Median [Min, Max] 489 [107, 744]
# Same descriptive table, stratified by Region (one column per region plus
# an Overall column).
table1(
  ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ | Region,
  data = pisa
)
CENTRAL
(n=2006)
NORTH
(n=1958)
SOUTH
(n=1862)
Overall
(n=5826)
WEALTH
Mean (SD) -2.40 (1.12) -2.18 (1.18) -2.06 (1.14) -2.22 (1.16)
Median [Min, Max] -2.33 [-7.64, 1.41] -2.14 [-7.64, 2.63] -2.03 [-7.64, 3.21] -2.16 [-7.64, 3.21]
Missing 6 (0.3%) 8 (0.4%) 1 (0.1%) 15 (0.3%)
PARED
Mean (SD) 9.49 (3.44) 9.76 (3.51) 8.85 (3.54) 9.37 (3.51)
Median [Min, Max] 9.00 [3.00, 17.0] 9.00 [3.00, 17.0] 9.00 [3.00, 17.0] 9.00 [3.00, 17.0]
Missing 3 (0.1%) 9 (0.5%) 2 (0.1%) 14 (0.2%)
HEDRES
Mean (SD) -1.04 (0.955) -1.06 (0.930) -1.04 (0.941) -1.05 (0.942)
Median [Min, Max] -1.11 [-4.37, 1.18] -1.11 [-4.37, 1.16] -1.11 [-4.37, 1.16] -1.11 [-4.37, 1.18]
Missing 7 (0.3%) 7 (0.4%) 6 (0.3%) 20 (0.3%)
Math
Mean (SD) 492 (86.5) 501 (84.4) 496 (72.2) 496 (81.5)
Median [Min, Max] 488 [202, 818] 500 [251, 820] 494 [241, 719] 493 [202, 820]
Science
Mean (SD) 524 (79.8) 523 (76.6) 528 (67.3) 525 (75.0)
Median [Min, Max] 520 [307, 807] 522 [293, 775] 528 [337, 761] 524 [293, 807]
Reading
Mean (SD) 488 (74.3) 489 (72.4) 493 (64.4) 490 (70.6)
Median [Min, Max] 486 [233, 744] 489 [107, 718] 493 [272, 698] 489 [107, 744]
# Same descriptive table, stratified by Area (one column per area type plus
# an Overall column).
table1(
  ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ | Area,
  data = pisa
)
REMOTE
(n=410)
RURAL
(n=2368)
URBAN
(n=3048)
Overall
(n=5826)
WEALTH
Mean (SD) -3.00 (1.25) -2.22 (1.08) -2.12 (1.16) -2.22 (1.16)
Median [Min, Max] -2.83 [-7.64, -0.0430] -2.16 [-7.64, 1.43] -2.10 [-7.64, 3.21] -2.16 [-7.64, 3.21]
Missing 6 (1.5%) 7 (0.3%) 2 (0.1%) 15 (0.3%)
PARED
Mean (SD) 7.90 (3.69) 9.38 (3.47) 9.56 (3.48) 9.37 (3.51)
Median [Min, Max] 9.00 [3.00, 17.0] 9.00 [3.00, 17.0] 9.00 [3.00, 17.0] 9.00 [3.00, 17.0]
Missing 8 (2.0%) 5 (0.2%) 1 (0.0%) 14 (0.2%)
HEDRES
Mean (SD) -1.66 (1.12) -0.985 (0.946) -1.01 (0.883) -1.05 (0.942)
Median [Min, Max] -1.44 [-4.37, 1.16] -1.11 [-4.37, 1.18] -1.11 [-4.37, 1.16] -1.11 [-4.37, 1.18]
Missing 6 (1.5%) 9 (0.4%) 5 (0.2%) 20 (0.3%)
Math
Mean (SD) 450 (82.0) 500 (81.9) 499 (79.3) 496 (81.5)
Median [Min, Max] 446 [216, 696] 498 [273, 818] 497 [202, 820] 493 [202, 820]
Science
Mean (SD) 482 (74.4) 529 (75.5) 527 (72.8) 525 (75.0)
Median [Min, Max] 475 [307, 698] 529 [335, 807] 525 [293, 799] 524 [293, 807]
Reading
Mean (SD) 440 (76.0) 491 (67.6) 496 (69.6) 490 (70.6)
Median [Min, Max] 439 [233, 643] 490 [292, 744] 495 [107, 718] 489 [107, 744]

TASK 6

Phân tích mô tả pisa với package “compareGroups”

library(compareGroups)
## Loading required package: SNPassoc
## Loading required package: haplo.stats
## Loading required package: survival
## Loading required package: mvtnorm
## Loading required package: parallel
## Registered S3 method overwritten by 'SNPassoc':
##   method            from       
##   summary.haplo.glm haplo.stats
# Compare the same six variables across groups: t1 by Area, t2 by Region.
t1 <- compareGroups(
  Area ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ,
  data = pisa
)
t2 <- compareGroups(
  Region ~ WEALTH + PARED + HEDRES + PV1MATH + PV1SCIE + PV1READ,
  data = pisa
)
# Print the by-Area table; the output below has one column per group and an
# overall p-value column.
createTable(x = t1)
## 
## --------Summary descriptives table by 'Area'---------
## 
## ________________________________________________________ 
##            REMOTE       RURAL        URBAN     p.overall 
##            N=410        N=2368       N=3048              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## WEALTH  -3.00 (1.25) -2.22 (1.08) -2.12 (1.16)  <0.001   
## PARED   7.90 (3.69)  9.38 (3.47)  9.56 (3.48)   <0.001   
## HEDRES  -1.66 (1.12) -0.98 (0.95) -1.01 (0.88)  <0.001   
## Math     450 (82.0)   500 (81.9)   499 (79.3)   <0.001   
## Science  482 (74.4)   529 (75.5)   527 (72.8)   <0.001   
## Reading  440 (76.0)   491 (67.6)   496 (69.6)   <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Print the by-Region comparison table built from t2.
createTable(x = t2)
## 
## --------Summary descriptives table by 'Region'---------
## 
## ________________________________________________________ 
##           CENTRAL       NORTH        SOUTH     p.overall 
##            N=2006       N=1958       N=1862              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## WEALTH  -2.40 (1.12) -2.18 (1.18) -2.06 (1.14)  <0.001   
## PARED   9.49 (3.44)  9.76 (3.51)  8.85 (3.54)   <0.001   
## HEDRES  -1.04 (0.95) -1.06 (0.93) -1.04 (0.94)   0.655   
## Math     492 (86.5)   501 (84.4)   496 (72.2)    0.002   
## Science  524 (79.8)   523 (76.6)   528 (67.3)    0.075   
## Reading  488 (74.3)   489 (72.4)   493 (64.4)    0.042   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯