# Location of the PISA 2015 Vietnam data set, given as a local file:// URL.
# NOTE(review): this absolute path points at the original author's machine;
# change it to your own copy of the CSV before running.
t <- "file:///E:/Dropbox/Phuong Nam's research/Research documents/GS Nguyen Van Tuan/GS Tuan - Khoa hoc phan tich 2020-01-05/Textbook/TDTU Datasets for 2020 Workshop/PISA Data Vietnam 2015.csv"
# Read the CSV, treating the first line as column names.
# stringsAsFactors = TRUE keeps the text columns (Area, Region, Gender) as
# factors on R >= 4.0 too, so summary() reports per-level counts as in the
# recorded output below. TRUE is spelled out because T can be reassigned.
pisa <- read.csv(t, header = TRUE, stringsAsFactors = TRUE)
# Find out how many cases (rows) and variables (columns) the data set has,
# using dim() ("dimension").
dim(pisa)
## [1] 5826 18
# List the first 6 rows with head(); 6 is the function's default, written
# out here so the row count is visible in the call.
head(pisa, n = 6L)
## School SchoolSize ClassSize STratio SchoolType Area Region Age Gender
## 1 70400001 883 18 22.075 3 URBAN SOUTH 15.58 Boys
## 2 70400001 883 18 22.075 3 URBAN SOUTH 15.92 Boys
## 3 70400001 883 18 22.075 3 URBAN SOUTH 15.42 Girls
## 4 70400001 883 18 22.075 3 URBAN SOUTH 15.58 Girls
## 5 70400001 883 18 22.075 3 URBAN SOUTH 15.92 Girls
## 6 70400001 883 18 22.075 3 URBAN SOUTH 16.25 Girls
## PARED HISCED WEALTH INSTSCIE JOYSCIE ICTRES Math Read Science
## 1 9 2 -2.0697 0.9798 2.1635 -1.5244 439.923 412.290 475.612
## 2 12 4 -1.7903 1.7359 2.1635 -1.9305 406.251 409.598 450.320
## 3 9 2 -2.1942 -0.2063 -0.1808 -1.6093 414.369 384.307 405.787
## 4 5 1 -2.0301 -0.3115 -0.4318 -1.6250 468.801 459.104 462.968
## 5 9 2 -1.0522 0.7648 1.3031 -0.5305 355.432 402.435 453.736
## 6 5 1 -3.0570 0.3708 0.5094 -2.5873 458.955 483.885 529.866
# Summarise every column of the data set with summary().
summary(pisa)
## School SchoolSize ClassSize STratio
## Min. :70400001 Min. : 113 Min. :13.00 Min. : 4.314
## 1st Qu.:70400052 1st Qu.: 650 1st Qu.:38.00 1st Qu.:14.024
## Median :70400096 Median :1090 Median :38.00 Median :16.627
## Mean :70400097 Mean :1082 Mean :40.57 Mean :16.497
## 3rd Qu.:70400143 3rd Qu.:1419 3rd Qu.:43.00 3rd Qu.:18.983
## Max. :70400188 Max. :4016 Max. :53.00 Max. :38.651
## NA's :34
## SchoolType Area Region Age Gender
## Min. :1.000 REMOTE: 410 CENTRAL:2006 Min. :15.33 Boys :2786
## 1st Qu.:3.000 RURAL :2368 NORTH :1958 1st Qu.:15.50 Girls:3040
## Median :3.000 URBAN :3048 SOUTH :1862 Median :15.75
## Mean :2.849 Mean :15.78
## 3rd Qu.:3.000 3rd Qu.:16.00
## Max. :3.000 Max. :16.25
## NA's :35
## PARED HISCED WEALTH INSTSCIE
## Min. : 3.000 Min. :0.00 Min. :-7.635 Min. :-1.9301
## 1st Qu.: 9.000 1st Qu.:2.00 1st Qu.:-2.829 1st Qu.: 0.0125
## Median : 9.000 Median :2.00 Median :-2.163 Median : 0.3708
## Mean : 9.374 Mean :2.58 Mean :-2.219 Mean : 0.4835
## 3rd Qu.:12.000 3rd Qu.:4.00 3rd Qu.:-1.504 3rd Qu.: 1.0218
## Max. :17.000 Max. :6.00 Max. : 3.211 Max. : 1.7359
## NA's :14 NA's :14 NA's :15 NA's :17
## JOYSCIE ICTRES Math Read
## Min. :-2.1154 Min. :-3.508 Min. :201.7 Min. :107.1
## 1st Qu.: 0.5094 1st Qu.:-2.587 1st Qu.:440.0 1st Qu.:442.5
## Median : 0.5094 Median :-1.855 Median :493.4 Median :489.5
## Mean : 0.6448 Mean :-1.795 Mean :496.1 Mean :489.9
## 3rd Qu.: 1.1049 3rd Qu.:-1.117 3rd Qu.:551.5 3rd Qu.:537.6
## Max. : 2.1635 Max. : 3.497 Max. :820.1 Max. :744.1
## NA's :19 NA's :34
## Science
## Min. :292.7
## 1st Qu.:470.9
## Median :523.9
## Mean :524.8
## 3rd Qu.:574.8
## Max. :807.3
##
# Reordering the levels of Area with factor(): first tabulate Area as it
# currently stands.
table(pisa[["Area"]])
##
## REMOTE RURAL URBAN
## 410 2368 3048
# By default table() lists the categories in alphabetical order (A, B, C, ...).
# factor() with an explicit `levels` argument fixes the order we want instead:
# URBAN, then RURAL, then REMOTE.
pisa[["Area"]] <- factor(
  pisa[["Area"]],
  levels = c("URBAN", "RURAL", "REMOTE")
)
# Tabulate again to confirm the new level order.
table(pisa[["Area"]])
##
## URBAN RURAL REMOTE
## 3048 2368 410
# Show the first 6 rows again after Area has been re-leveled.
head(pisa, n = 6L)
## School SchoolSize ClassSize STratio SchoolType Area Region Age Gender
## 1 70400001 883 18 22.075 3 URBAN SOUTH 15.58 Boys
## 2 70400001 883 18 22.075 3 URBAN SOUTH 15.92 Boys
## 3 70400001 883 18 22.075 3 URBAN SOUTH 15.42 Girls
## 4 70400001 883 18 22.075 3 URBAN SOUTH 15.58 Girls
## 5 70400001 883 18 22.075 3 URBAN SOUTH 15.92 Girls
## 6 70400001 883 18 22.075 3 URBAN SOUTH 16.25 Girls
## PARED HISCED WEALTH INSTSCIE JOYSCIE ICTRES Math Read Science
## 1 9 2 -2.0697 0.9798 2.1635 -1.5244 439.923 412.290 475.612
## 2 12 4 -1.7903 1.7359 2.1635 -1.9305 406.251 409.598 450.320
## 3 9 2 -2.1942 -0.2063 -0.1808 -1.6093 414.369 384.307 405.787
## 4 5 1 -2.0301 -0.3115 -0.4318 -1.6250 468.801 459.104 462.968
## 5 9 2 -1.0522 0.7648 1.3031 -0.5305 355.432 402.435 453.736
## 6 5 1 -3.0570 0.3708 0.5094 -2.5873 458.955 483.885 529.866
# Summarise again; the Area column is now listed in its factor level order
# (URBAN, RURAL, REMOTE).
summary(pisa)
## School SchoolSize ClassSize STratio
## Min. :70400001 Min. : 113 Min. :13.00 Min. : 4.314
## 1st Qu.:70400052 1st Qu.: 650 1st Qu.:38.00 1st Qu.:14.024
## Median :70400096 Median :1090 Median :38.00 Median :16.627
## Mean :70400097 Mean :1082 Mean :40.57 Mean :16.497
## 3rd Qu.:70400143 3rd Qu.:1419 3rd Qu.:43.00 3rd Qu.:18.983
## Max. :70400188 Max. :4016 Max. :53.00 Max. :38.651
## NA's :34
## SchoolType Area Region Age Gender
## Min. :1.000 URBAN :3048 CENTRAL:2006 Min. :15.33 Boys :2786
## 1st Qu.:3.000 RURAL :2368 NORTH :1958 1st Qu.:15.50 Girls:3040
## Median :3.000 REMOTE: 410 SOUTH :1862 Median :15.75
## Mean :2.849 Mean :15.78
## 3rd Qu.:3.000 3rd Qu.:16.00
## Max. :3.000 Max. :16.25
## NA's :35
## PARED HISCED WEALTH INSTSCIE
## Min. : 3.000 Min. :0.00 Min. :-7.635 Min. :-1.9301
## 1st Qu.: 9.000 1st Qu.:2.00 1st Qu.:-2.829 1st Qu.: 0.0125
## Median : 9.000 Median :2.00 Median :-2.163 Median : 0.3708
## Mean : 9.374 Mean :2.58 Mean :-2.219 Mean : 0.4835
## 3rd Qu.:12.000 3rd Qu.:4.00 3rd Qu.:-1.504 3rd Qu.: 1.0218
## Max. :17.000 Max. :6.00 Max. : 3.211 Max. : 1.7359
## NA's :14 NA's :14 NA's :15 NA's :17
## JOYSCIE ICTRES Math Read
## Min. :-2.1154 Min. :-3.508 Min. :201.7 Min. :107.1
## 1st Qu.: 0.5094 1st Qu.:-2.587 1st Qu.:440.0 1st Qu.:442.5
## Median : 0.5094 Median :-1.855 Median :493.4 Median :489.5
## Mean : 0.6448 Mean :-1.795 Mean :496.1 Mean :489.9
## 3rd Qu.: 1.1049 3rd Qu.:-1.117 3rd Qu.:551.5 3rd Qu.:537.6
## Max. : 2.1635 Max. : 3.497 Max. :820.1 Max. :744.1
## NA's :19 NA's :34
## Science
## Min. :292.7
## 1st Qu.:470.9
## Median :523.9
## Mean :524.8
## 3rd Qu.:574.8
## Max. :807.3
##
# Inspect which values occur in SchoolType and how often.
table(pisa[["SchoolType"]])
##
## 1 3
## 436 5355
# Create a new text column Type from the SchoolType codes:
# 1 becomes "Private", 3 becomes "Public"; any other value (including NA)
# is left as NA.
pisa$Type <- as.character(
  factor(pisa$SchoolType, levels = c(1, 3), labels = c("Private", "Public"))
)
# Tabulate the new column to check the recoding.
table(pisa$Type)
##
## Private Public
## 436 5355
# Load the "table1" package. It must already be installed, e.g. with
# install.packages("table1"); library() itself does not install anything.
library(table1)
## Warning: package 'table1' was built under R version 3.6.2
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive table of SchoolSize, ClassSize and Gender, split into one
# column per Region.
table1(
  ~ SchoolSize + ClassSize + Gender | Region,
  data = pisa
)
| | CENTRAL (n=2006) | NORTH (n=1958) | SOUTH (n=1862) | Overall (n=5826) |
|---|---|---|---|---|
| SchoolSize | ||||
| Mean (SD) | 1110 (590) | 1030 (515) | 1110 (630) | 1080 (581) |
| Median [Min, Max] | 1140 [165, 4020] | 936 [134, 2470] | 1090 [113, 3370] | 1090 [113, 4020] |
| ClassSize | ||||
| Mean (SD) | 40.7 (7.69) | 40.5 (7.59) | 40.5 (8.41) | 40.6 (7.90) |
| Median [Min, Max] | 43.0 [13.0, 53.0] | 38.0 [13.0, 53.0] | 43.0 [13.0, 53.0] | 38.0 [13.0, 53.0] |
| Missing | 0 (0%) | 34 (1.7%) | 0 (0%) | 34 (0.6%) |
| Gender | ||||
| Boys | 953 (47.5%) | 951 (48.6%) | 882 (47.4%) | 2786 (47.8%) |
| Girls | 1053 (52.5%) | 1007 (51.4%) | 980 (52.6%) | 3040 (52.2%) |
# Load the "compareGroups" package (it must already be installed).
# compareGroups is preferred over table1 here because it also reports an
# overall p-value. That value tells us that at least one pair of groups
# differs significantly, but not which pair.
library(compareGroups)
## Warning: package 'compareGroups' was built under R version 3.6.2
## Loading required package: SNPassoc
## Warning: package 'SNPassoc' was built under R version 3.6.2
## Loading required package: haplo.stats
## Warning: package 'haplo.stats' was built under R version 3.6.2
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: mvtnorm
## Loading required package: parallel
## Registered S3 method overwritten by 'SNPassoc':
## method from
## summary.haplo.glm haplo.stats
# Compare WEALTH, PARED and the Math, Read and Science scores across the Area
# groups, store the result in `a`, then print the descriptive table.
a <- compareGroups(
  Area ~ WEALTH + PARED + Math + Read + Science,
  data = pisa
)
createTable(a)
##
## --------Summary descriptives table by 'Area'---------
##
## ________________________________________________________
## URBAN RURAL REMOTE p.overall
## N=3048 N=2368 N=410
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## WEALTH -2.12 (1.16) -2.22 (1.08) -3.00 (1.25) <0.001
## PARED 9.56 (3.48) 9.38 (3.47) 7.90 (3.69) <0.001
## Math 499 (79.3) 500 (81.9) 450 (82.0) <0.001
## Read 496 (69.6) 491 (67.6) 440 (76.0) <0.001
## Science 527 (72.8) 529 (75.5) 482 (74.4) <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Repeat the comparison with Gender as the grouping variable; `a` is reused
# for the new result before the table is printed.
a <- compareGroups(
  Gender ~ WEALTH + PARED + Math + Read + Science,
  data = pisa
)
createTable(a)
##
## --------Summary descriptives table by 'Gender'---------
##
## ___________________________________________
## Boys Girls p.overall
## N=2786 N=3040
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## WEALTH -2.20 (1.14) -2.24 (1.17) 0.250
## PARED 9.52 (3.48) 9.24 (3.54) 0.002
## Math 498 (84.1) 495 (79.1) 0.152
## Read 479 (72.8) 500 (67.1) <0.001
## Science 526 (77.1) 524 (72.9) 0.343
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
# Combined form: nest the two calls so the intermediate object `a` is not
# needed. Area is added here as a further row variable.
createTable(
  compareGroups(
    Gender ~ WEALTH + PARED + Math + Read + Science + Area,
    data = pisa
  )
)
##
## --------Summary descriptives table by 'Gender'---------
##
## ______________________________________________
## Boys Girls p.overall
## N=2786 N=3040
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## WEALTH -2.20 (1.14) -2.24 (1.17) 0.250
## PARED 9.52 (3.48) 9.24 (3.54) 0.002
## Math 498 (84.1) 495 (79.1) 0.152
## Read 479 (72.8) 500 (67.1) <0.001
## Science 526 (77.1) 524 (72.9) 0.343
## Area: 0.034
## URBAN 1460 (52.4%) 1588 (52.2%)
## RURAL 1106 (39.7%) 1262 (41.5%)
## REMOTE 220 (7.90%) 190 (6.25%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯