Task 1: Đọc dữ liệu PISA

# Location of the PISA Vietnam 2015 CSV file. This is a machine-specific absolute
# path; adjust it before running on another computer.
# NOTE(review): the variable name `t` is the same as base R's t() (transpose);
# it is kept unchanged in case later code refers to it.
t = "file:///E:/Dropbox/Phuong Nam's research/Research documents/GS Nguyen Van Tuan/GS Tuan - Khoa hoc phan tich 2020-01-05/Textbook/TDTU Datasets for 2020 Workshop/PISA Data Vietnam 2015.csv"
# header = TRUE: the first row of the file holds the column names. TRUE is spelled
# out because `T` is an ordinary variable that can be reassigned.
# stringsAsFactors = TRUE: read text columns (Area, Region, Gender) as factors.
# This was the default before R 4.0.0; stating it keeps summary() printing the
# per-category counts on newer R versions as well.
pisa = read.csv(t, header = TRUE, stringsAsFactors = TRUE)
# Check how many cases (rows) and how many variables (columns) the data has: dim() (dimension)
dim(pisa)
## [1] 5826   18
# List the first rows of the data: head() (shows the first 6 rows by default)
head(pisa)
##     School SchoolSize ClassSize STratio SchoolType  Area Region   Age Gender
## 1 70400001        883        18  22.075          3 URBAN  SOUTH 15.58   Boys
## 2 70400001        883        18  22.075          3 URBAN  SOUTH 15.92   Boys
## 3 70400001        883        18  22.075          3 URBAN  SOUTH 15.42  Girls
## 4 70400001        883        18  22.075          3 URBAN  SOUTH 15.58  Girls
## 5 70400001        883        18  22.075          3 URBAN  SOUTH 15.92  Girls
## 6 70400001        883        18  22.075          3 URBAN  SOUTH 16.25  Girls
##   PARED HISCED  WEALTH INSTSCIE JOYSCIE  ICTRES    Math    Read Science
## 1     9      2 -2.0697   0.9798  2.1635 -1.5244 439.923 412.290 475.612
## 2    12      4 -1.7903   1.7359  2.1635 -1.9305 406.251 409.598 450.320
## 3     9      2 -2.1942  -0.2063 -0.1808 -1.6093 414.369 384.307 405.787
## 4     5      1 -2.0301  -0.3115 -0.4318 -1.6250 468.801 459.104 462.968
## 5     9      2 -1.0522   0.7648  1.3031 -0.5305 355.432 402.435 453.736
## 6     5      1 -3.0570   0.3708  0.5094 -2.5873 458.955 483.885 529.866
# Summarise every column of the data: summary()
summary(pisa)
##      School           SchoolSize     ClassSize        STratio      
##  Min.   :70400001   Min.   : 113   Min.   :13.00   Min.   : 4.314  
##  1st Qu.:70400052   1st Qu.: 650   1st Qu.:38.00   1st Qu.:14.024  
##  Median :70400096   Median :1090   Median :38.00   Median :16.627  
##  Mean   :70400097   Mean   :1082   Mean   :40.57   Mean   :16.497  
##  3rd Qu.:70400143   3rd Qu.:1419   3rd Qu.:43.00   3rd Qu.:18.983  
##  Max.   :70400188   Max.   :4016   Max.   :53.00   Max.   :38.651  
##                                    NA's   :34                      
##    SchoolType        Area          Region          Age          Gender    
##  Min.   :1.000   REMOTE: 410   CENTRAL:2006   Min.   :15.33   Boys :2786  
##  1st Qu.:3.000   RURAL :2368   NORTH  :1958   1st Qu.:15.50   Girls:3040  
##  Median :3.000   URBAN :3048   SOUTH  :1862   Median :15.75               
##  Mean   :2.849                                Mean   :15.78               
##  3rd Qu.:3.000                                3rd Qu.:16.00               
##  Max.   :3.000                                Max.   :16.25               
##  NA's   :35                                                               
##      PARED            HISCED         WEALTH          INSTSCIE      
##  Min.   : 3.000   Min.   :0.00   Min.   :-7.635   Min.   :-1.9301  
##  1st Qu.: 9.000   1st Qu.:2.00   1st Qu.:-2.829   1st Qu.: 0.0125  
##  Median : 9.000   Median :2.00   Median :-2.163   Median : 0.3708  
##  Mean   : 9.374   Mean   :2.58   Mean   :-2.219   Mean   : 0.4835  
##  3rd Qu.:12.000   3rd Qu.:4.00   3rd Qu.:-1.504   3rd Qu.: 1.0218  
##  Max.   :17.000   Max.   :6.00   Max.   : 3.211   Max.   : 1.7359  
##  NA's   :14       NA's   :14     NA's   :15       NA's   :17       
##     JOYSCIE            ICTRES            Math            Read      
##  Min.   :-2.1154   Min.   :-3.508   Min.   :201.7   Min.   :107.1  
##  1st Qu.: 0.5094   1st Qu.:-2.587   1st Qu.:440.0   1st Qu.:442.5  
##  Median : 0.5094   Median :-1.855   Median :493.4   Median :489.5  
##  Mean   : 0.6448   Mean   :-1.795   Mean   :496.1   Mean   :489.9  
##  3rd Qu.: 1.1049   3rd Qu.:-1.117   3rd Qu.:551.5   3rd Qu.:537.6  
##  Max.   : 2.1635   Max.   : 3.497   Max.   :820.1   Max.   :744.1  
##  NA's   :19        NA's   :34                                      
##     Science     
##  Min.   :292.7  
##  1st Qu.:470.9  
##  Median :523.9  
##  Mean   :524.8  
##  3rd Qu.:574.8  
##  Max.   :807.3  
## 
# Reorder the categories of Area with factor(); first look at the current order
table(pisa$Area)
## 
## REMOTE  RURAL  URBAN 
##    410   2368   3048
# By default table() lists the categories in alphabetical order (A, B, C, ...).
# factor() with an explicit `levels` argument fixes the order we want instead:
# URBAN, then RURAL, then REMOTE.
pisa$Area <- factor(
  pisa$Area,
  levels = c("URBAN", "RURAL", "REMOTE")
)
# Tabulate again to confirm the new order.
table(pisa$Area)
## 
##  URBAN  RURAL REMOTE 
##   3048   2368    410
# Look at the first rows again after Area has been re-created with factor()
head(pisa)
##     School SchoolSize ClassSize STratio SchoolType  Area Region   Age Gender
## 1 70400001        883        18  22.075          3 URBAN  SOUTH 15.58   Boys
## 2 70400001        883        18  22.075          3 URBAN  SOUTH 15.92   Boys
## 3 70400001        883        18  22.075          3 URBAN  SOUTH 15.42  Girls
## 4 70400001        883        18  22.075          3 URBAN  SOUTH 15.58  Girls
## 5 70400001        883        18  22.075          3 URBAN  SOUTH 15.92  Girls
## 6 70400001        883        18  22.075          3 URBAN  SOUTH 16.25  Girls
##   PARED HISCED  WEALTH INSTSCIE JOYSCIE  ICTRES    Math    Read Science
## 1     9      2 -2.0697   0.9798  2.1635 -1.5244 439.923 412.290 475.612
## 2    12      4 -1.7903   1.7359  2.1635 -1.9305 406.251 409.598 450.320
## 3     9      2 -2.1942  -0.2063 -0.1808 -1.6093 414.369 384.307 405.787
## 4     5      1 -2.0301  -0.3115 -0.4318 -1.6250 468.801 459.104 462.968
## 5     9      2 -1.0522   0.7648  1.3031 -0.5305 355.432 402.435 453.736
## 6     5      1 -3.0570   0.3708  0.5094 -2.5873 458.955 483.885 529.866
# Summarise again: Area is now listed in the level order assigned with factor()
summary(pisa)
##      School           SchoolSize     ClassSize        STratio      
##  Min.   :70400001   Min.   : 113   Min.   :13.00   Min.   : 4.314  
##  1st Qu.:70400052   1st Qu.: 650   1st Qu.:38.00   1st Qu.:14.024  
##  Median :70400096   Median :1090   Median :38.00   Median :16.627  
##  Mean   :70400097   Mean   :1082   Mean   :40.57   Mean   :16.497  
##  3rd Qu.:70400143   3rd Qu.:1419   3rd Qu.:43.00   3rd Qu.:18.983  
##  Max.   :70400188   Max.   :4016   Max.   :53.00   Max.   :38.651  
##                                    NA's   :34                      
##    SchoolType        Area          Region          Age          Gender    
##  Min.   :1.000   URBAN :3048   CENTRAL:2006   Min.   :15.33   Boys :2786  
##  1st Qu.:3.000   RURAL :2368   NORTH  :1958   1st Qu.:15.50   Girls:3040  
##  Median :3.000   REMOTE: 410   SOUTH  :1862   Median :15.75               
##  Mean   :2.849                                Mean   :15.78               
##  3rd Qu.:3.000                                3rd Qu.:16.00               
##  Max.   :3.000                                Max.   :16.25               
##  NA's   :35                                                               
##      PARED            HISCED         WEALTH          INSTSCIE      
##  Min.   : 3.000   Min.   :0.00   Min.   :-7.635   Min.   :-1.9301  
##  1st Qu.: 9.000   1st Qu.:2.00   1st Qu.:-2.829   1st Qu.: 0.0125  
##  Median : 9.000   Median :2.00   Median :-2.163   Median : 0.3708  
##  Mean   : 9.374   Mean   :2.58   Mean   :-2.219   Mean   : 0.4835  
##  3rd Qu.:12.000   3rd Qu.:4.00   3rd Qu.:-1.504   3rd Qu.: 1.0218  
##  Max.   :17.000   Max.   :6.00   Max.   : 3.211   Max.   : 1.7359  
##  NA's   :14       NA's   :14     NA's   :15       NA's   :17       
##     JOYSCIE            ICTRES            Math            Read      
##  Min.   :-2.1154   Min.   :-3.508   Min.   :201.7   Min.   :107.1  
##  1st Qu.: 0.5094   1st Qu.:-2.587   1st Qu.:440.0   1st Qu.:442.5  
##  Median : 0.5094   Median :-1.855   Median :493.4   Median :489.5  
##  Mean   : 0.6448   Mean   :-1.795   Mean   :496.1   Mean   :489.9  
##  3rd Qu.: 1.1049   3rd Qu.:-1.117   3rd Qu.:551.5   3rd Qu.:537.6  
##  Max.   : 2.1635   Max.   : 3.497   Max.   :820.1   Max.   :744.1  
##  NA's   :19        NA's   :34                                      
##     Science     
##  Min.   :292.7  
##  1st Qu.:470.9  
##  Median :523.9  
##  Mean   :524.8  
##  3rd Qu.:574.8  
##  Max.   :807.3  
## 

Task 2: Mã hóa

# Inspect the values present in SchoolType
table(pisa$SchoolType) 
## 
##    1    3 
##  436 5355
# Create a new labelled variable Type from the numeric SchoolType codes
# (1 -> "Private", 3 -> "Public").
# Start from an explicit all-NA character column: rows whose SchoolType is
# missing (or is any other code) stay NA, and re-running this chunk can never
# keep labels left over from an earlier run.
pisa$Type = NA_character_
# %in% yields FALSE rather than NA for missing SchoolType values, so the logical
# index used for the assignment contains no NA.
pisa$Type[pisa$SchoolType %in% 1] = "Private"
pisa$Type[pisa$SchoolType %in% 3] = "Public"
# Check the recode: the counts should match those of table(pisa$SchoolType).
table(pisa$Type)
## 
## Private  Public 
##     436    5355

Task 3: Phân tích mô tả với package table1

# Load the "table1" package (it must already be installed, e.g. with install.packages("table1"))
library(table1)
## Warning: package 'table1' was built under R version 3.6.2
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table of SchoolSize, ClassSize and Gender, with one column per Region (the variable after "|") plus an overall column
table1(~ SchoolSize + ClassSize + Gender|Region, data =pisa)

##                      CENTRAL            NORTH              SOUTH              Overall
##                      (n=2006)           (n=1958)           (n=1862)           (n=5826)
## SchoolSize
##   Mean (SD)          1110 (590)         1030 (515)         1110 (630)         1080 (581)
##   Median [Min, Max]  1140 [165, 4020]   936 [134, 2470]    1090 [113, 3370]   1090 [113, 4020]
## ClassSize
##   Mean (SD)          40.7 (7.69)        40.5 (7.59)        40.5 (8.41)        40.6 (7.90)
##   Median [Min, Max]  43.0 [13.0, 53.0]  38.0 [13.0, 53.0]  43.0 [13.0, 53.0]  38.0 [13.0, 53.0]
##   Missing            0 (0%)             34 (1.7%)          0 (0%)             34 (0.6%)
## Gender
##   Boys               953 (47.5%)        951 (48.6%)        882 (47.4%)        2786 (47.8%)
##   Girls              1053 (52.5%)       1007 (51.4%)       980 (52.6%)        3040 (52.2%)

Task 4: Phân tích mô tả với package compareGroups

# Load the "compareGroups" package (it must already be installed, e.g. with install.packages("compareGroups"))
# compareGroups is preferred over table1 here because it reports an overall P value, which tells us that at least one pair of groups differs significantly, but not which pair
library(compareGroups)
## Warning: package 'compareGroups' was built under R version 3.6.2
## Loading required package: SNPassoc
## Warning: package 'SNPassoc' was built under R version 3.6.2
## Loading required package: haplo.stats
## Warning: package 'haplo.stats' was built under R version 3.6.2
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: mvtnorm
## Loading required package: parallel
## Registered S3 method overwritten by 'SNPassoc':
##   method            from       
##   summary.haplo.glm haplo.stats
# Describe WEALTH, PARED, Math, Read and Science within each Area group.
a <- compareGroups(
  Area ~ WEALTH + PARED + Math + Read + Science,
  data = pisa
)
# Print the descriptive table: one column per group plus the p.overall column.
createTable(a)
## 
## --------Summary descriptives table by 'Area'---------
## 
## ________________________________________________________ 
##            URBAN        RURAL        REMOTE    p.overall 
##            N=3048       N=2368       N=410               
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## WEALTH  -2.12 (1.16) -2.22 (1.08) -3.00 (1.25)  <0.001   
## PARED   9.56 (3.48)  9.38 (3.47)  7.90 (3.69)   <0.001   
## Math     499 (79.3)   500 (81.9)   450 (82.0)   <0.001   
## Read     496 (69.6)   491 (67.6)   440 (76.0)   <0.001   
## Science  527 (72.8)   529 (75.5)   482 (74.4)   <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Same comparison, now grouped by Gender; the object a is overwritten.
a <- compareGroups(
  Gender ~ WEALTH + PARED + Math + Read + Science,
  data = pisa
)
# Print the descriptive table: one column per group plus the p.overall column.
createTable(a)
## 
## --------Summary descriptives table by 'Gender'---------
## 
## ___________________________________________ 
##             Boys        Girls     p.overall 
##            N=2786       N=3040              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## WEALTH  -2.20 (1.14) -2.24 (1.17)   0.250   
## PARED   9.52 (3.48)  9.24 (3.54)    0.002   
## Math     498 (84.1)   495 (79.1)    0.152   
## Read     479 (72.8)   500 (67.1)   <0.001   
## Science  526 (77.1)   524 (72.9)    0.343   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Combined command: nest compareGroups() inside createTable() so the
# intermediate object a does not need to be created. Area is added as a
# further row variable in this table.
createTable(
  compareGroups(
    Gender ~ WEALTH + PARED + Math + Read + Science + Area,
    data = pisa
  )
)
## 
## --------Summary descriptives table by 'Gender'---------
## 
## ______________________________________________ 
##                Boys        Girls     p.overall 
##               N=2786       N=3040              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## WEALTH     -2.20 (1.14) -2.24 (1.17)   0.250   
## PARED      9.52 (3.48)  9.24 (3.54)    0.002   
## Math        498 (84.1)   495 (79.1)    0.152   
## Read        479 (72.8)   500 (67.1)   <0.001   
## Science     526 (77.1)   524 (72.9)    0.343   
## Area:                                  0.034   
##     URBAN  1460 (52.4%) 1588 (52.2%)           
##     RURAL  1106 (39.7%) 1262 (41.5%)           
##     REMOTE 220 (7.90%)  190 (6.25%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯