Task 2: Đọc dữ liệu “Crime data 2003-2018”

# Load the "Crime data 2003-2018" CSV that the user picks interactively.
# The working directory is only switched when the training folder actually
# exists, so the script no longer aborts on machines without that folder;
# read.csv() below uses the path returned by file.choose(), not the
# working directory.
training_dir <- "C:/Users\\HP\\Documents\\R\\Thuc hanh\\Training CR"
if (dir.exists(training_dir)) {
  setwd(training_dir)
}
crime_path <- file.choose()
tp <- read.csv(crime_path, header = TRUE)
# Number of rows and columns that were read.
# NOTE(review): 1048575 rows is one less than Excel's worksheet row limit;
# the CSV may have been truncated when exported - confirm against the source.
dim(tp)
## [1] 1048575       8
# First six records, as a quick look at the columns.
head(tp)
##          ID       Category                           Description     Day
## 1 180362289  VEHICLE THEFT                     STOLEN MOTORCYCLE Tuesday
## 2 180360948   NON-CRIMINAL          AIDED CASE, MENTAL DISTURBED Tuesday
## 3 180360879 OTHER OFFENSES                      PAROLE VIOLATION Tuesday
## 4 180360879 OTHER OFFENSES              TRAFFIC VIOLATION ARREST Tuesday
## 5 180360879 OTHER OFFENSES                     TRAFFIC VIOLATION Tuesday
## 6 180360829 OTHER OFFENSES DRIVERS LICENSE, SUSPENDED OR REVOKED Tuesday
##         Date  Time District     Resolution
## 1 05/15/2018 10:30 SOUTHERN           NONE
## 2 05/15/2018  4:14 SOUTHERN           NONE
## 3 05/15/2018  2:01  MISSION ARREST, BOOKED
## 4 05/15/2018  2:01  MISSION ARREST, BOOKED
## 5 05/15/2018  2:01  MISSION ARREST, BOOKED
## 6 05/15/2018  1:27  MISSION           NONE

Tính số tội phạm theo ngày, sắp xếp thứ tự ngày

# Attach sjPlot, which the plot_frq() calls below rely on. library() stops
# with an error when the package is missing, whereas require() would only
# return FALSE and let the script fail later.
library(sjPlot)
## Loading required package: sjPlot
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.17
## Current Matrix version is 1.2.15
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
# Store Day as a factor whose levels run Monday -> Sunday, so tables and
# plots follow calendar order instead of alphabetical order.
weekday_levels <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)
tp$Day <- factor(tp$Day, levels = weekday_levels)
# Number of records per weekday.
table(tp$Day)
## 
##    Monday   Tuesday Wednesday  Thursday    Friday  Saturday    Sunday 
##    144651    146200    150981    148679    161398    154325    142341

Vẽ biểu đồ

# Frequency plot of the weekday factor.
sjPlot::plot_frq(tp$Day)

Task 3: Mã hóa (Coding)

# Load the data set containing the bmi column from a CSV picked
# interactively. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
obesity_path <- file.choose()
ob <- read.csv(obesity_path, header = TRUE)

Tạo biến mới “OB”

# Derive the weight-status variable OB from bmi.
# Classes are left-closed intervals on the conventional adult cut-offs:
#   bmi < 18.5  -> "Underweight"     [18.5, 25) -> "Normal"
#   [25, 30)    -> "Overweight"      bmi >= 30  -> "Obese"
# The earlier thresholds (24.9 / 29.9) placed a BMI of exactly 24.9 in
# "Overweight" and exactly 29.9 in "Obese". Rows with a missing bmi stay NA.
# NOTE(review): the frequency table printed below was produced with the old
# thresholds; re-run to refresh the counts for borderline values.
bmi_breaks <- c(18.5, 25, 30)
bmi_labels <- c("Underweight", "Normal", "Overweight", "Obese")
# findInterval() counts how many break points are <= bmi (0 to 3); adding 1
# turns that count into an index into bmi_labels.
ob$OB <- bmi_labels[findInterval(ob$bmi, bmi_breaks) + 1L]

Đếm số trường hợp trong biến OB

# Store OB as a factor ordered from the lightest to the heaviest class so
# the frequency table and plot list the categories in that order.
weight_classes <- c("Underweight", "Normal", "Overweight", "Obese")
ob$OB <- factor(ob$OB, levels = weight_classes)
# Count the cases in each class.
table(ob$OB)
## 
## Underweight      Normal  Overweight       Obese 
##         107         857         238          15

Vẽ biểu đồ biến OB

# Frequency plot of the OB weight-status factor.
sjPlot::plot_frq(ob$OB)

Task 4: Merge dữ liệu

# Read the students and schools CSV files (each picked interactively) and
# join them on the shared key CNTSCHID.
students_path <- file.choose()
students <- read.csv(students_path, header = TRUE)
schools_path <- file.choose()
schools <- read.csv(schools_path, header = TRUE)
# merge() performs an inner join by default: rows whose CNTSCHID has no
# match in the other table are dropped from pisa.
pisa <- merge(students, schools, by = "CNTSCHID")

Đổi tên biến thành “Math”, “Science”, “Reading”

# Rename the score columns PV1MATH, PV1READ and PV1SCIE to Math, Reading
# and Science. Columns that are not present are left untouched.
score_labels <- c(PV1MATH = "Math", PV1READ = "Reading", PV1SCIE = "Science")
is_score <- names(pisa) %in% names(score_labels)
names(pisa)[is_score] <- unname(score_labels[names(pisa)[is_score]])

Tóm tắt dữ liệu pisa

# Per-column summary of the merged data (quartiles and NA counts for numeric
# columns, level counts for factors).
summary(object = pisa)
##     CNTSCHID             AGE          Gender         PARED       
##  Min.   :70400001   Min.   :15.33   Boys :2786   Min.   : 3.000  
##  1st Qu.:70400052   1st Qu.:15.50   Girls:3040   1st Qu.: 9.000  
##  Median :70400096   Median :15.75                Median : 9.000  
##  Mean   :70400097   Mean   :15.78                Mean   : 9.374  
##  3rd Qu.:70400143   3rd Qu.:16.00                3rd Qu.:12.000  
##  Max.   :70400188   Max.   :16.25                Max.   :17.000  
##                                                  NA's   :14      
##      HEDRES            MISCED          FISCED          HISCED    
##  Min.   :-4.3706   Min.   :0.000   Min.   :0.000   Min.   :0.00  
##  1st Qu.:-1.5169   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.00  
##  Median :-1.1119   Median :2.000   Median :2.000   Median :2.00  
##  Mean   :-1.0470   Mean   :2.069   Mean   :2.296   Mean   :2.58  
##  3rd Qu.:-0.7026   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:4.00  
##  Max.   : 1.1767   Max.   :6.000   Max.   :6.000   Max.   :6.00  
##  NA's   :20        NA's   :34      NA's   :88      NA's   :14    
##      WEALTH            ESCS           INSTSCIE          SCIEEFF       
##  Min.   :-7.635   Min.   :-5.657   Min.   :-1.9301   Min.   :-3.7565  
##  1st Qu.:-2.829   1st Qu.:-2.539   1st Qu.: 0.0125   1st Qu.:-0.8585  
##  Median :-2.163   Median :-1.982   Median : 0.3708   Median :-0.3473  
##  Mean   :-2.219   Mean   :-1.822   Mean   : 0.4835   Mean   :-0.2662  
##  3rd Qu.:-1.504   3rd Qu.:-1.221   3rd Qu.: 1.0218   3rd Qu.: 0.3155  
##  Max.   : 3.211   Max.   : 1.950   Max.   : 1.7359   Max.   : 3.2775  
##  NA's   :15       NA's   :1        NA's   :17        NA's   :19       
##     JOYSCIE            ICTRES          HOMEPOS          HEDRES.1      
##  Min.   :-2.1154   Min.   :-3.508   Min.   :-8.955   Min.   :-4.3706  
##  1st Qu.: 0.5094   1st Qu.:-2.587   1st Qu.:-2.669   1st Qu.:-1.5169  
##  Median : 0.5094   Median :-1.855   Median :-2.047   Median :-1.1119  
##  Mean   : 0.6448   Mean   :-1.795   Mean   :-2.042   Mean   :-1.0470  
##  3rd Qu.: 1.1049   3rd Qu.:-1.117   3rd Qu.:-1.354   3rd Qu.:-0.7026  
##  Max.   : 2.1635   Max.   : 3.497   Max.   : 2.770   Max.   : 1.1767  
##  NA's   :19        NA's   :34       NA's   :2        NA's   :20       
##     CULTPOSS            Math          Reading         Science     
##  Min.   :-1.8413   Min.   :201.7   Min.   :107.1   Min.   :292.7  
##  1st Qu.:-0.8310   1st Qu.:440.0   1st Qu.:442.5   1st Qu.:470.9  
##  Median :-0.4113   Median :493.4   Median :489.5   Median :523.9  
##  Mean   :-0.4444   Mean   :496.1   Mean   :489.9   Mean   :524.8  
##  3rd Qu.: 0.1157   3rd Qu.:551.5   3rd Qu.:537.6   3rd Qu.:574.8  
##  Max.   : 2.1655   Max.   :820.1   Max.   :744.1   Max.   :807.3  
##  NA's   :52                                                       
##     STRATUM       SCHSIZE         CLSIZE         STRATIO      
##  VNM0313:989   Min.   : 113   Min.   :13.00   Min.   : 4.314  
##  VNM0208:884   1st Qu.: 650   1st Qu.:38.00   1st Qu.:14.024  
##  VNM0101:806   Median :1090   Median :38.00   Median :16.627  
##  VNM0207:790   Mean   :1082   Mean   :40.57   Mean   :16.497  
##  VNM0102:764   3rd Qu.:1419   3rd Qu.:43.00   3rd Qu.:18.983  
##  VNM0314:679   Max.   :4016   Max.   :53.00   Max.   :38.651  
##  (Other):914                  NA's   :34                      
##     SCHLTYPE         Region         Area     
##  Min.   :1.000   CENTRAL:2006   REMOTE: 410  
##  1st Qu.:3.000   NORTH  :1958   RURAL :2368  
##  Median :3.000   SOUTH  :1862   URBAN :3048  
##  Mean   :2.849                               
##  3rd Qu.:3.000                               
##  Max.   :3.000                               
##  NA's   :35

Vẽ biểu đồ theo Region

# Frequency plot of students per Region.
sjPlot::plot_frq(pisa$Region)

Vẽ biểu đồ theo Area

# Frequency plot of students per Area.
sjPlot::plot_frq(pisa$Area)

Task 5: Phân tích mô tả với package “table1”

# Attach table1 for the descriptive tables below. library() raises an error
# if the package is not installed, whereas require() would only return FALSE.
library(table1)
## Loading required package: table1
## Warning: package 'table1' was built under R version 3.5.3
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive statistics of WEALTH, PARED, HEDRES and the three test scores,
# stratified by Region.
table1(
  ~ WEALTH + PARED + HEDRES + Math + Science + Reading | Region,
  data = pisa
)
## |                   | CENTRAL (n=2006)    | NORTH (n=1958)      | SOUTH (n=1862)      | Overall (n=5826)    |
## |-------------------|---------------------|---------------------|---------------------|---------------------|
## | WEALTH            |                     |                     |                     |                     |
## | Mean (SD)         | -2.40 (1.12)        | -2.18 (1.18)        | -2.06 (1.14)        | -2.22 (1.16)        |
## | Median [Min, Max] | -2.33 [-7.64, 1.41] | -2.14 [-7.64, 2.63] | -2.03 [-7.64, 3.21] | -2.16 [-7.64, 3.21] |
## | Missing           | 6 (0.3%)            | 8 (0.4%)            | 1 (0.1%)            | 15 (0.3%)           |
## | PARED             |                     |                     |                     |                     |
## | Mean (SD)         | 9.49 (3.44)         | 9.76 (3.51)         | 8.85 (3.54)         | 9.37 (3.51)         |
## | Median [Min, Max] | 9.00 [3.00, 17.0]   | 9.00 [3.00, 17.0]   | 9.00 [3.00, 17.0]   | 9.00 [3.00, 17.0]   |
## | Missing           | 3 (0.1%)            | 9 (0.5%)            | 2 (0.1%)            | 14 (0.2%)           |
## | HEDRES            |                     |                     |                     |                     |
## | Mean (SD)         | -1.04 (0.955)       | -1.06 (0.930)       | -1.04 (0.941)       | -1.05 (0.942)       |
## | Median [Min, Max] | -1.11 [-4.37, 1.18] | -1.11 [-4.37, 1.16] | -1.11 [-4.37, 1.16] | -1.11 [-4.37, 1.18] |
## | Missing           | 7 (0.3%)            | 7 (0.4%)            | 6 (0.3%)            | 20 (0.3%)           |
## | Math              |                     |                     |                     |                     |
## | Mean (SD)         | 492 (86.5)          | 501 (84.4)          | 496 (72.2)          | 496 (81.5)          |
## | Median [Min, Max] | 488 [202, 818]      | 500 [251, 820]      | 494 [241, 719]      | 493 [202, 820]      |
## | Science           |                     |                     |                     |                     |
## | Mean (SD)         | 524 (79.8)          | 523 (76.6)          | 528 (67.3)          | 525 (75.0)          |
## | Median [Min, Max] | 520 [307, 807]      | 522 [293, 775]      | 528 [337, 761]      | 524 [293, 807]      |
## | Reading           |                     |                     |                     |                     |
## | Mean (SD)         | 488 (74.3)          | 489 (72.4)          | 493 (64.4)          | 490 (70.6)          |
## | Median [Min, Max] | 486 [233, 744]      | 489 [107, 718]      | 493 [272, 698]      | 489 [107, 744]      |
# Descriptive statistics of WEALTH, PARED, HEDRES and the three test scores,
# stratified by Area.
table1(
  ~ WEALTH + PARED + HEDRES + Math + Science + Reading | Area,
  data = pisa
)

## |                   | REMOTE (n=410)         | RURAL (n=2368)      | URBAN (n=3048)      | Overall (n=5826)    |
## |-------------------|------------------------|---------------------|---------------------|---------------------|
## | WEALTH            |                        |                     |                     |                     |
## | Mean (SD)         | -3.00 (1.25)           | -2.22 (1.08)        | -2.12 (1.16)        | -2.22 (1.16)        |
## | Median [Min, Max] | -2.83 [-7.64, -0.0430] | -2.16 [-7.64, 1.43] | -2.10 [-7.64, 3.21] | -2.16 [-7.64, 3.21] |
## | Missing           | 6 (1.5%)               | 7 (0.3%)            | 2 (0.1%)            | 15 (0.3%)           |
## | PARED             |                        |                     |                     |                     |
## | Mean (SD)         | 7.90 (3.69)            | 9.38 (3.47)         | 9.56 (3.48)         | 9.37 (3.51)         |
## | Median [Min, Max] | 9.00 [3.00, 17.0]      | 9.00 [3.00, 17.0]   | 9.00 [3.00, 17.0]   | 9.00 [3.00, 17.0]   |
## | Missing           | 8 (2.0%)               | 5 (0.2%)            | 1 (0.0%)            | 14 (0.2%)           |
## | HEDRES            |                        |                     |                     |                     |
## | Mean (SD)         | -1.66 (1.12)           | -0.985 (0.946)      | -1.01 (0.883)       | -1.05 (0.942)       |
## | Median [Min, Max] | -1.44 [-4.37, 1.16]    | -1.11 [-4.37, 1.18] | -1.11 [-4.37, 1.16] | -1.11 [-4.37, 1.18] |
## | Missing           | 6 (1.5%)               | 9 (0.4%)            | 5 (0.2%)            | 20 (0.3%)           |
## | Math              |                        |                     |                     |                     |
## | Mean (SD)         | 450 (82.0)             | 500 (81.9)          | 499 (79.3)          | 496 (81.5)          |
## | Median [Min, Max] | 446 [216, 696]         | 498 [273, 818]      | 497 [202, 820]      | 493 [202, 820]      |
## | Science           |                        |                     |                     |                     |
## | Mean (SD)         | 482 (74.4)             | 529 (75.5)          | 527 (72.8)          | 525 (75.0)          |
## | Median [Min, Max] | 475 [307, 698]         | 529 [335, 807]      | 525 [293, 799]      | 524 [293, 807]      |
## | Reading           |                        |                     |                     |                     |
## | Mean (SD)         | 440 (76.0)             | 491 (67.6)          | 496 (69.6)          | 490 (70.6)          |
## | Median [Min, Max] | 439 [233, 643]         | 490 [292, 744]      | 495 [107, 718]      | 489 [107, 744]      |
Task 6: Phân tích mô tả với package “compareGroups”

# Attach compareGroups for the group-comparison table below. library() fails
# immediately when the package is missing; require() would only return FALSE.
library(compareGroups)
## Loading required package: compareGroups
## Warning: package 'compareGroups' was built under R version 3.5.3
## Loading required package: SNPassoc
## Warning: package 'SNPassoc' was built under R version 3.5.3
## Loading required package: haplo.stats
## Warning: package 'haplo.stats' was built under R version 3.5.3
## Loading required package: survival
## Loading required package: mvtnorm
## Loading required package: parallel
# Compare WEALTH, PARED, HEDRES, the three test scores and Region across the
# Area groups, then print the resulting descriptive table.
area_comparison <- compareGroups(
  Area ~ WEALTH + PARED + HEDRES + Math + Science + Reading + Region,
  data = pisa
)
createTable(area_comparison)
## 
## --------Summary descriptives table by 'Area'---------
## 
## ____________________________________________________________ 
##                REMOTE       RURAL        URBAN     p.overall 
##                N=410        N=2368       N=3048              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## WEALTH      -3.00 (1.25) -2.22 (1.08) -2.12 (1.16)  <0.001   
## PARED       7.90 (3.69)  9.38 (3.47)  9.56 (3.48)   <0.001   
## HEDRES      -1.66 (1.12) -0.98 (0.95) -1.01 (0.88)  <0.001   
## Math         450 (82.0)   500 (81.9)   499 (79.3)   <0.001   
## Science      482 (74.4)   529 (75.5)   527 (72.8)   <0.001   
## Reading      440 (76.0)   491 (67.6)   496 (69.6)   <0.001   
## Region:                                             <0.001   
##     CENTRAL 198 (48.3%)  857 (36.2%)  951 (31.2%)            
##     NORTH   148 (36.1%)  764 (32.3%)  1046 (34.3%)           
##     SOUTH    64 (15.6%)  747 (31.5%)  1051 (34.5%)           
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯