Nhập dữ liệu “pisa”

setwd("C:/Users\\HP\\Documents\\R\\Thuc hanh\\Training CR")
t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
schools=read.csv(t4b,header =T)

Merge dữ liệu

pisa=merge(students,schools,by="CNTSCHID")
head(pisa)

##   CNTSCHID   AGE Gender PARED  HEDRES MISCED FISCED HISCED  WEALTH    ESCS
## 1 70400001 15.58   Boys     9 -1.0418      1      2      2 -2.0697 -1.7899
## 2 70400001 15.92   Boys    12 -2.3041      1      4      4 -1.7903 -1.5423
## 3 70400001 15.42  Girls     9 -0.7218      1      2      2 -2.1942 -2.0475
## 4 70400001 15.58  Girls     5 -1.4595      1      0      1 -2.0301 -2.6136
## 5 70400001 15.92  Girls     9 -0.6172      2      2      2 -1.0522 -1.2179
## 6 70400001 16.25  Girls     5 -1.3738      1      1      1 -3.0570 -2.8451
##   INSTSCIE SCIEEFF JOYSCIE  ICTRES HOMEPOS HEDRES.1 CULTPOSS PV1MATH
## 1   0.9798  0.1421  2.1635 -1.5244 -2.0537  -1.0418  -0.7273 439.923
## 2   1.7359 -0.8432  2.1635 -1.9305 -2.2627  -2.3041  -0.2031 406.251
## 3  -0.2063 -0.1824 -0.1808 -1.6093 -1.9675  -0.7218  -0.2220 414.369
## 4  -0.3115 -1.0555 -0.4318 -1.6250 -2.0686  -1.4595  -0.7039 468.801
## 5   0.7648 -0.1954  1.3031 -0.5305 -0.9471  -0.6172  -0.0971 355.432
## 6   0.3708 -0.5652  0.5094 -2.5873 -2.6986  -1.3738  -0.2031 458.955
##   PV1READ PV1SCIE STRATUM SCHSIZE CLSIZE STRATIO SCHLTYPE Region  Area
## 1 412.290 475.612 VNM0313     883     18  22.075        3  SOUTH URBAN
## 2 409.598 450.320 VNM0313     883     18  22.075        3  SOUTH URBAN
## 3 384.307 405.787 VNM0313     883     18  22.075        3  SOUTH URBAN
## 4 459.104 462.968 VNM0313     883     18  22.075        3  SOUTH URBAN
## 5 402.435 453.736 VNM0313     883     18  22.075        3  SOUTH URBAN
## 6 483.885 529.866 VNM0313     883     18  22.075        3  SOUTH URBAN

dim(pisa)

## [1] 5826   27

Vẽ biểu đồ Histogram

hist(pisa$PV1MATH,col="blue")

hist(pisa$PV1MATH,col="blue",border="white")

hist(pisa$PV1SCIE,col="blue",border="white",xlab="Điểm khoa học",ylab="Tần suất",main="Biểu đồ phân bố điểm khoa học")

Vẽ theo mật độ

hist(pisa$PV1SCIE,col="blue",border="white",prop=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")

## Warning in plot.window(xlim, ylim, "", ...): "prop" is not a graphical
## parameter

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "prop" is not a graphical parameter

## Warning in axis(1, ...): "prop" is not a graphical parameter

## Warning in axis(2, ...): "prop" is not a graphical parameter

hist(x=pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")

hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học",breaks=20)
lines(density(pisa$PV1SCIE),col="red",lwd=3)

hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học",breaks=20)
lines(density(pisa$PV1SCIE),col="coral",lwd=2)

# Vẽ hai phân bố cho nam và nữ

p1=hist(pisa$PV1SCIE[pisa$Gender=="Boys"],plot=F)
p2=hist(pisa$PV1SCIE[pisa$Gender=="Girls"],plot=F)
plot(p1,col="coral",border = "white")
plot(p2,add=T,col=scales::alpha("green",0.5),border="white")

Task 2: Biểu đồ phân bố với lattice

require(lattice)

## Loading required package: lattice

densityplot(~pisa$PV1SCIE,groups=pisa$Gender,data=pisa)

densityplot(~pisa$PV1SCIE,groups=pisa$Area,data=pisa)

densityplot(~pisa$PV1SCIE,groups=pisa$Area,data=pisa,auto.key = list(space="top"))

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.5.3

p1=densityplot(~pisa$PV1SCIE,groups=pisa$Area,data=pisa,auto.key = list(space="top"))
p2=densityplot(~pisa$PV1MATH,groups=pisa$Area,data=pisa)
p3=densityplot(~pisa$PV1READ,groups=pisa$Area,data=pisa)
grid.arrange(p1,p2,p3,ncol=3)

# Task 3: Biểu đồ thanh với sjPlot

require(sjPlot)

## Loading required package: sjPlot

## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.17
## Current Matrix version is 1.2.15
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package

plot_frq(pisa$Area)

plot_frq(pisa$Region)

# Phân tích theo nhóm

sjp.xtab(pisa$Region,pisa$Area,margin="row",bar.pos = "stack",show.summary = T)

sjp.xtab(pisa$Region,pisa$Area,margin="row",bar.pos = "stack",show.summary = T,coord.flip = T)

sjp.xtab(pisa$Area,pisa$Region,coord.flip = T)

# Điểm môn khoa học theo vùng

sjp.grpfrq(pisa$PV1SCIE,pisa$Region,type="box")

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

Thang Likert

big=file.choose()
big5=read.csv(big,header=T)
test=big5[,c("gender","E1","E2","E3","E4","E5")]
test$E1=as.factor(test$E1)
test$E2=as.factor(test$E2)
test$E3=as.factor(test$E3)
test$E4=as.factor(test$E4)
test$E5=as.factor(test$E5)
library(sjPlot)
plot_likert(test)

## Warning: Detected uneven category count in items. Dropping last category.

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

#task 4

par(mfrow=c(1,3))
boxplot(pisa$PV1SCIE,col="purple")
boxplot(pisa$PV1SCIE~pisa$Gender,col=c("coral","blue"))
boxplot(pisa$PV1SCIE~pisa$Region,col=c("green","brown2"))

#task 7

o=file.choose()
ob=read.csv(o,header=T)
names(ob)

##  [1] "id"     "gender" "height" "weight" "bmi"    "age"    "bmc"   
##  [8] "bmd"    "fat"    "lean"   "pcfat"

ob$OB[ob$bmi<18.5]="Underweight"
ob$OB[ob$bmi>=18.5&ob$bmi<24.9]="Normal"
ob$OB[ob$bmi>=25&ob$bmi<29.9]="Overweight"
ob$OB[ob$bmi>=30]="Obese"

Thực hành với ggplot2

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.5.3

p=ggplot(data=ob,aes(x=weight,y=pcfat))
p=ggplot(data=ob,aes(x=weight,y=pcfat,fill=gender,col=gender))
p=p+geom_point()+geom_smooth()
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

require(ggthemes)

## Loading required package: ggthemes

## Warning: package 'ggthemes' was built under R version 3.5.3

p=p+geom_point()+geom_smooth()+theme_dark()
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p=p+geom_point()+geom_smooth()+theme_economist()
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p=p+geom_point()+geom_smooth()+theme_calc()
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p=p+xlab("Cân nặng")+ylab("Tỉ trọng mỡ")
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

vars=ob[,c("gender","weight","age","bmi","pcfat")]
library(GGally)

## Warning: package 'GGally' was built under R version 3.5.3

ggpairs(data=vars,mapping=aes(col=gender))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

require(ggExtra)

## Loading required package: ggExtra

## Warning: package 'ggExtra' was built under R version 3.5.3

p=ggplot(data=ob,aes(x=bmi,y=pcfat,fill=gender,col=gender))
p=p+geom_point()+geom_smooth()
ggMarginal(p,type="histogram",groupColour = T,groupFill = T)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggMarginal(p,type="density",groupColour = T,groupFill = T)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Task 9

library(gridExtra)
t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
schools=read.csv(t4b,header = T)
pisa=merge(students,schools,by="CNTSCHID")
require(ggplot2)
p=ggplot(data=pisa,aes(x=PV1SCIE))
p1=p+geom_histogram(color="white",fill="blue")
p=ggplot(data=pisa,aes(x=PV1SCIE))
p=p+geom_histogram(aes(y=..density..),colour="white",fill="blue")
p2=p+geom_density(col="red")
grid.arrange(p1,p2,nrow=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p=ggplot(data=pisa,aes(x=PV1SCIE,fill=Area))
p1=p+geom_histogram(position = "dodge")
p2=ggplot(data=pisa,aes(x=PV1SCIE,fill=Area,color=Area))+geom_density(alpha=0.1)
grid.arrange(p1,p2,ncol=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

require(ggplot2)
require(ggExtra)
t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
schools=read.csv(t4b,header = T)
pisa=merge(students,schools,by="CNTSCHID")
p=ggplot(data=pisa,aes(x=HISCED))
p=p+stat_ecdf(color="red",lwd=2)
p=p+geom_bar(aes(y=(..count..)/sum(..count..)),fill="blue",colour="yellow")
p

## Warning: Removed 14 rows containing non-finite values (stat_ecdf).

## Warning: Removed 14 rows containing non-finite values (stat_count).

p=ggplot(data=pisa,aes(x=PARED,y=PV1SCIE,fill=PARED))
p1=p+geom_boxplot(col="black")+geom_jitter(alpha=0.02)
p=ggplot(data=na.omit(pisa),aes(x=factor(PARED),y=PV1SCIE,fill=factor(PARED),color=factor(PARED)))
p2=p+geom_boxplot(col="black")+geom_jitter(alpha=0.05)
grid.arrange(p1,p2,ncol=2)

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

## Warning: Removed 14 rows containing missing values (stat_boxplot).

## Warning: Removed 14 rows containing missing values (geom_point).

t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
setwd("C:/Users\\HP\\Documents\\R\\Thuc hanh\\Training CR")
t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
schools=read.csv(t4b,header =T)
pisa=merge(students,schools,by="CNTSCHID")
head(pisa)

##   CNTSCHID   AGE Gender PARED  HEDRES MISCED FISCED HISCED  WEALTH    ESCS
## 1 70400001 15.58   Boys     9 -1.0418      1      2      2 -2.0697 -1.7899
## 2 70400001 15.92   Boys    12 -2.3041      1      4      4 -1.7903 -1.5423
## 3 70400001 15.42  Girls     9 -0.7218      1      2      2 -2.1942 -2.0475
## 4 70400001 15.58  Girls     5 -1.4595      1      0      1 -2.0301 -2.6136
## 5 70400001 15.92  Girls     9 -0.6172      2      2      2 -1.0522 -1.2179
## 6 70400001 16.25  Girls     5 -1.3738      1      1      1 -3.0570 -2.8451
##   INSTSCIE SCIEEFF JOYSCIE  ICTRES HOMEPOS HEDRES.1 CULTPOSS PV1MATH
## 1   0.9798  0.1421  2.1635 -1.5244 -2.0537  -1.0418  -0.7273 439.923
## 2   1.7359 -0.8432  2.1635 -1.9305 -2.2627  -2.3041  -0.2031 406.251
## 3  -0.2063 -0.1824 -0.1808 -1.6093 -1.9675  -0.7218  -0.2220 414.369
## 4  -0.3115 -1.0555 -0.4318 -1.6250 -2.0686  -1.4595  -0.7039 468.801
## 5   0.7648 -0.1954  1.3031 -0.5305 -0.9471  -0.6172  -0.0971 355.432
## 6   0.3708 -0.5652  0.5094 -2.5873 -2.6986  -1.3738  -0.2031 458.955
##   PV1READ PV1SCIE STRATUM SCHSIZE CLSIZE STRATIO SCHLTYPE Region  Area
## 1 412.290 475.612 VNM0313     883     18  22.075        3  SOUTH URBAN
## 2 409.598 450.320 VNM0313     883     18  22.075        3  SOUTH URBAN
## 3 384.307 405.787 VNM0313     883     18  22.075        3  SOUTH URBAN
## 4 459.104 462.968 VNM0313     883     18  22.075        3  SOUTH URBAN
## 5 402.435 453.736 VNM0313     883     18  22.075        3  SOUTH URBAN
## 6 483.885 529.866 VNM0313     883     18  22.075        3  SOUTH URBAN

dim(pisa)

## [1] 5826   27

hist(pisa$PV1MATH,col="blue")

hist(pisa$PV1MATH,col="blue",border="white")

hist(pisa$PV1SCIE,col="blue",border="white",xlab="Điểm khoa học",ylab="Tần suất",main="Biểu đồ phân bố điểm khoa học")

hist(pisa$PV1SCIE,col="blue",border="white",prop=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")

## Warning in plot.window(xlim, ylim, "", ...): "prop" is not a graphical
## parameter

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "prop" is not a graphical parameter

## Warning in axis(1, ...): "prop" is not a graphical parameter

## Warning in axis(2, ...): "prop" is not a graphical parameter

hist(pisa$PV1SCIE,col="blue",border="white",prop=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")

## Warning in plot.window(xlim, ylim, "", ...): "prop" is not a graphical
## parameter

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "prop" is not a graphical parameter

## Warning in axis(1, ...): "prop" is not a graphical parameter

## Warning in axis(2, ...): "prop" is not a graphical parameter

hist(x=pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")

hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học",breaks=20)
lines(density(pisa$PV1SCIE),col="red",lwd=3)

hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học",breaks=20)
lines(density(pisa$PV1SCIE),col="coral",lwd=2)

p1=hist(pisa$PV1SCIE[pisa$Gender=="Boys"],plot=F)
p2=hist(pisa$PV1SCIE[pisa$Gender=="Girls"],plot=F)
plot(p1,col="coral",border = "white")

o=file.choose()
ob=read.csv(o,header=T)
names(ob)

##  [1] "id"     "gender" "height" "weight" "bmi"    "age"    "bmc"   
##  [8] "bmd"    "fat"    "lean"   "pcfat"

ob$OB[ob$bmi<18.5]="Underweight"
ob$OB[ob$bmi>=18.5&ob$bmi<24.9]="Normal"
ob$OB[ob$bmi>=25&ob$bmi<29.9]="Overweight"
ob$OB[ob$bmi>=30]="Obese"
library(ggplot2)
p=ggplot(data=ob,aes(x=weight,y=pcfat))
p=ggplot(data=ob,aes(x=weight,y=pcfat,fill=gender,col=gender))
p=p+geom_point()+geom_smooth()
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

require(ggthemes)
p=p+geom_point()+geom_smooth()+theme_dark()
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p=p+geom_point()+geom_smooth()+theme_economist()
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p=p+geom_point()+geom_smooth()+theme_calc()
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

p=p+xlab("Cân nặng")+ylab("Tỉ trọng mỡ")
p

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

vars=ob[,c("gender","weight","age","bmi","pcfat")]
library(GGally)
ggpairs(data=vars,mapping=aes(col=gender))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

require(ggExtra)
p=ggplot(data=ob,aes(x=bmi,y=pcfat,fill=gender,col=gender))
p=p+geom_point()+geom_smooth()
ggMarginal(p,type="density",groupColour = T,groupFill = T)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Thuc hanh ngay 2

Nguyen Ngoc Tuyen

8/5/2019

Thực hành với ggplot2