Nhập dữ liệu “pisa”
setwd("C:/Users\\HP\\Documents\\R\\Thuc hanh\\Training CR")
t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
schools=read.csv(t4b,header =T)
Merge dữ liệu
pisa=merge(students,schools,by="CNTSCHID")
head(pisa)
## CNTSCHID AGE Gender PARED HEDRES MISCED FISCED HISCED WEALTH ESCS
## 1 70400001 15.58 Boys 9 -1.0418 1 2 2 -2.0697 -1.7899
## 2 70400001 15.92 Boys 12 -2.3041 1 4 4 -1.7903 -1.5423
## 3 70400001 15.42 Girls 9 -0.7218 1 2 2 -2.1942 -2.0475
## 4 70400001 15.58 Girls 5 -1.4595 1 0 1 -2.0301 -2.6136
## 5 70400001 15.92 Girls 9 -0.6172 2 2 2 -1.0522 -1.2179
## 6 70400001 16.25 Girls 5 -1.3738 1 1 1 -3.0570 -2.8451
## INSTSCIE SCIEEFF JOYSCIE ICTRES HOMEPOS HEDRES.1 CULTPOSS PV1MATH
## 1 0.9798 0.1421 2.1635 -1.5244 -2.0537 -1.0418 -0.7273 439.923
## 2 1.7359 -0.8432 2.1635 -1.9305 -2.2627 -2.3041 -0.2031 406.251
## 3 -0.2063 -0.1824 -0.1808 -1.6093 -1.9675 -0.7218 -0.2220 414.369
## 4 -0.3115 -1.0555 -0.4318 -1.6250 -2.0686 -1.4595 -0.7039 468.801
## 5 0.7648 -0.1954 1.3031 -0.5305 -0.9471 -0.6172 -0.0971 355.432
## 6 0.3708 -0.5652 0.5094 -2.5873 -2.6986 -1.3738 -0.2031 458.955
## PV1READ PV1SCIE STRATUM SCHSIZE CLSIZE STRATIO SCHLTYPE Region Area
## 1 412.290 475.612 VNM0313 883 18 22.075 3 SOUTH URBAN
## 2 409.598 450.320 VNM0313 883 18 22.075 3 SOUTH URBAN
## 3 384.307 405.787 VNM0313 883 18 22.075 3 SOUTH URBAN
## 4 459.104 462.968 VNM0313 883 18 22.075 3 SOUTH URBAN
## 5 402.435 453.736 VNM0313 883 18 22.075 3 SOUTH URBAN
## 6 483.885 529.866 VNM0313 883 18 22.075 3 SOUTH URBAN
dim(pisa)
## [1] 5826 27
Vẽ biểu đồ Histogram
hist(pisa$PV1MATH,col="blue")
hist(pisa$PV1MATH,col="blue",border="white")
hist(pisa$PV1SCIE,col="blue",border="white",xlab="Điểm khoa học",ylab="Tần suất",main="Biểu đồ phân bố điểm khoa học")
Vẽ theo mật độ
hist(pisa$PV1SCIE,col="blue",border="white",prop=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
## Warning in plot.window(xlim, ylim, "", ...): "prop" is not a graphical
## parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "prop" is not a graphical parameter
## Warning in axis(1, ...): "prop" is not a graphical parameter
## Warning in axis(2, ...): "prop" is not a graphical parameter
hist(x=pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học",breaks=20)
lines(density(pisa$PV1SCIE),col="red",lwd=3)
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học",breaks=20)
lines(density(pisa$PV1SCIE),col="coral",lwd=2)
# Vẽ hai phân bố cho nam và nữ
p1=hist(pisa$PV1SCIE[pisa$Gender=="Boys"],plot=F)
p2=hist(pisa$PV1SCIE[pisa$Gender=="Girls"],plot=F)
plot(p1,col="coral",border = "white")
plot(p2,add=T,col=scales::alpha("green",0.5),border="white")
Task 2: Biểu đồ phân bố với lattice
require(lattice)
## Loading required package: lattice
densityplot(~pisa$PV1SCIE,groups=pisa$Gender,data=pisa)
densityplot(~pisa$PV1SCIE,groups=pisa$Area,data=pisa)
densityplot(~pisa$PV1SCIE,groups=pisa$Area,data=pisa,auto.key = list(space="top"))
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.3
p1=densityplot(~pisa$PV1SCIE,groups=pisa$Area,data=pisa,auto.key = list(space="top"))
p2=densityplot(~pisa$PV1MATH,groups=pisa$Area,data=pisa)
p3=densityplot(~pisa$PV1READ,groups=pisa$Area,data=pisa)
grid.arrange(p1,p2,p3,ncol=3)
# Task 3: Biểu đồ thanh với sjPlot
require(sjPlot)
## Loading required package: sjPlot
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.17
## Current Matrix version is 1.2.15
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
plot_frq(pisa$Area)
plot_frq(pisa$Region)
# Phân tích theo nhóm
sjp.xtab(pisa$Region,pisa$Area,margin="row",bar.pos = "stack",show.summary = T)
sjp.xtab(pisa$Region,pisa$Area,margin="row",bar.pos = "stack",show.summary = T,coord.flip = T)
sjp.xtab(pisa$Area,pisa$Region,coord.flip = T)
# Điểm môn khoa học theo vùng
sjp.grpfrq(pisa$PV1SCIE,pisa$Region,type="box")
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
Thang Likert
big=file.choose()
big5=read.csv(big,header=T)
test=big5[,c("gender","E1","E2","E3","E4","E5")]
test$E1=as.factor(test$E1)
test$E2=as.factor(test$E2)
test$E3=as.factor(test$E3)
test$E4=as.factor(test$E4)
test$E5=as.factor(test$E5)
library(sjPlot)
plot_likert(test)
## Warning: Detected uneven category count in items. Dropping last category.
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
#task 4
par(mfrow=c(1,3))
boxplot(pisa$PV1SCIE,col="purple")
boxplot(pisa$PV1SCIE~pisa$Gender,col=c("coral","blue"))
boxplot(pisa$PV1SCIE~pisa$Region,col=c("green","brown2"))
#task 7
o=file.choose()
ob=read.csv(o,header=T)
names(ob)
## [1] "id" "gender" "height" "weight" "bmi" "age" "bmc"
## [8] "bmd" "fat" "lean" "pcfat"
ob$OB[ob$bmi<18.5]="Underweight"
ob$OB[ob$bmi>=18.5&ob$bmi<24.9]="Normal"
ob$OB[ob$bmi>=25&ob$bmi<29.9]="Overweight"
ob$OB[ob$bmi>=30]="Obese"
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
p=ggplot(data=ob,aes(x=weight,y=pcfat))
p=ggplot(data=ob,aes(x=weight,y=pcfat,fill=gender,col=gender))
p=p+geom_point()+geom_smooth()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
require(ggthemes)
## Loading required package: ggthemes
## Warning: package 'ggthemes' was built under R version 3.5.3
p=p+geom_point()+geom_smooth()+theme_dark()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p=p+geom_point()+geom_smooth()+theme_economist()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p=p+geom_point()+geom_smooth()+theme_calc()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p=p+xlab("Cân nặng")+ylab("Tỉ trọng mỡ")
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
vars=ob[,c("gender","weight","age","bmi","pcfat")]
library(GGally)
## Warning: package 'GGally' was built under R version 3.5.3
ggpairs(data=vars,mapping=aes(col=gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
require(ggExtra)
## Loading required package: ggExtra
## Warning: package 'ggExtra' was built under R version 3.5.3
p=ggplot(data=ob,aes(x=bmi,y=pcfat,fill=gender,col=gender))
p=p+geom_point()+geom_smooth()
ggMarginal(p,type="histogram",groupColour = T,groupFill = T)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggMarginal(p,type="density",groupColour = T,groupFill = T)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Task 9
library(gridExtra)
t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
schools=read.csv(t4b,header = T)
pisa=merge(students,schools,by="CNTSCHID")
require(ggplot2)
p=ggplot(data=pisa,aes(x=PV1SCIE))
p1=p+geom_histogram(color="white",fill="blue")
p=ggplot(data=pisa,aes(x=PV1SCIE))
p=p+geom_histogram(aes(y=..density..),colour="white",fill="blue")
p2=p+geom_density(col="red")
grid.arrange(p1,p2,nrow=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p=ggplot(data=pisa,aes(x=PV1SCIE,fill=Area))
p1=p+geom_histogram(position = "dodge")
p2=ggplot(data=pisa,aes(x=PV1SCIE,fill=Area,color=Area))+geom_density(alpha=0.1)
grid.arrange(p1,p2,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
require(ggplot2)
require(ggExtra)
t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
schools=read.csv(t4b,header = T)
pisa=merge(students,schools,by="CNTSCHID")
p=ggplot(data=pisa,aes(x=HISCED))
p=p+stat_ecdf(color="red",lwd=2)
p=p+geom_bar(aes(y=(..count..)/sum(..count..)),fill="blue",colour="yellow")
p
## Warning: Removed 14 rows containing non-finite values (stat_ecdf).
## Warning: Removed 14 rows containing non-finite values (stat_count).
p=ggplot(data=pisa,aes(x=PARED,y=PV1SCIE,fill=PARED))
p1=p+geom_boxplot(col="black")+geom_jitter(alpha=0.02)
p=ggplot(data=na.omit(pisa),aes(x=factor(PARED),y=PV1SCIE,fill=factor(PARED),color=factor(PARED)))
p2=p+geom_boxplot(col="black")+geom_jitter(alpha=0.05)
grid.arrange(p1,p2,ncol=2)
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 14 rows containing missing values (stat_boxplot).
## Warning: Removed 14 rows containing missing values (geom_point).
t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
setwd("C:/Users\\HP\\Documents\\R\\Thuc hanh\\Training CR")
t4a=file.choose()
students=read.csv(t4a,header = T)
t4b=file.choose()
schools=read.csv(t4b,header =T)
pisa=merge(students,schools,by="CNTSCHID")
head(pisa)
## CNTSCHID AGE Gender PARED HEDRES MISCED FISCED HISCED WEALTH ESCS
## 1 70400001 15.58 Boys 9 -1.0418 1 2 2 -2.0697 -1.7899
## 2 70400001 15.92 Boys 12 -2.3041 1 4 4 -1.7903 -1.5423
## 3 70400001 15.42 Girls 9 -0.7218 1 2 2 -2.1942 -2.0475
## 4 70400001 15.58 Girls 5 -1.4595 1 0 1 -2.0301 -2.6136
## 5 70400001 15.92 Girls 9 -0.6172 2 2 2 -1.0522 -1.2179
## 6 70400001 16.25 Girls 5 -1.3738 1 1 1 -3.0570 -2.8451
## INSTSCIE SCIEEFF JOYSCIE ICTRES HOMEPOS HEDRES.1 CULTPOSS PV1MATH
## 1 0.9798 0.1421 2.1635 -1.5244 -2.0537 -1.0418 -0.7273 439.923
## 2 1.7359 -0.8432 2.1635 -1.9305 -2.2627 -2.3041 -0.2031 406.251
## 3 -0.2063 -0.1824 -0.1808 -1.6093 -1.9675 -0.7218 -0.2220 414.369
## 4 -0.3115 -1.0555 -0.4318 -1.6250 -2.0686 -1.4595 -0.7039 468.801
## 5 0.7648 -0.1954 1.3031 -0.5305 -0.9471 -0.6172 -0.0971 355.432
## 6 0.3708 -0.5652 0.5094 -2.5873 -2.6986 -1.3738 -0.2031 458.955
## PV1READ PV1SCIE STRATUM SCHSIZE CLSIZE STRATIO SCHLTYPE Region Area
## 1 412.290 475.612 VNM0313 883 18 22.075 3 SOUTH URBAN
## 2 409.598 450.320 VNM0313 883 18 22.075 3 SOUTH URBAN
## 3 384.307 405.787 VNM0313 883 18 22.075 3 SOUTH URBAN
## 4 459.104 462.968 VNM0313 883 18 22.075 3 SOUTH URBAN
## 5 402.435 453.736 VNM0313 883 18 22.075 3 SOUTH URBAN
## 6 483.885 529.866 VNM0313 883 18 22.075 3 SOUTH URBAN
dim(pisa)
## [1] 5826 27
hist(pisa$PV1MATH,col="blue")
hist(pisa$PV1MATH,col="blue",border="white")
hist(pisa$PV1SCIE,col="blue",border="white",xlab="Điểm khoa học",ylab="Tần suất",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white",prop=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
## Warning in plot.window(xlim, ylim, "", ...): "prop" is not a graphical
## parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "prop" is not a graphical parameter
## Warning in axis(1, ...): "prop" is not a graphical parameter
## Warning in axis(2, ...): "prop" is not a graphical parameter
hist(pisa$PV1SCIE,col="blue",border="white",prop=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
## Warning in plot.window(xlim, ylim, "", ...): "prop" is not a graphical
## parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "prop" is not a graphical parameter
## Warning in axis(1, ...): "prop" is not a graphical parameter
## Warning in axis(2, ...): "prop" is not a graphical parameter
hist(x=pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học")
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học",breaks=20)
lines(density(pisa$PV1SCIE),col="red",lwd=3)
hist(pisa$PV1SCIE,col="blue",border="white", prob=T,xlab="Điểm khoa học",main="Biểu đồ phân bố điểm khoa học",breaks=20)
lines(density(pisa$PV1SCIE),col="coral",lwd=2)
p1=hist(pisa$PV1SCIE[pisa$Gender=="Boys"],plot=F)
p2=hist(pisa$PV1SCIE[pisa$Gender=="Girls"],plot=F)
plot(p1,col="coral",border = "white")
o=file.choose()
ob=read.csv(o,header=T)
names(ob)
## [1] "id" "gender" "height" "weight" "bmi" "age" "bmc"
## [8] "bmd" "fat" "lean" "pcfat"
ob$OB[ob$bmi<18.5]="Underweight"
ob$OB[ob$bmi>=18.5&ob$bmi<24.9]="Normal"
ob$OB[ob$bmi>=25&ob$bmi<29.9]="Overweight"
ob$OB[ob$bmi>=30]="Obese"
library(ggplot2)
p=ggplot(data=ob,aes(x=weight,y=pcfat))
p=ggplot(data=ob,aes(x=weight,y=pcfat,fill=gender,col=gender))
p=p+geom_point()+geom_smooth()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
require(ggthemes)
p=p+geom_point()+geom_smooth()+theme_dark()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p=p+geom_point()+geom_smooth()+theme_economist()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p=p+geom_point()+geom_smooth()+theme_calc()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p=p+xlab("Cân nặng")+ylab("Tỉ trọng mỡ")
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
vars=ob[,c("gender","weight","age","bmi","pcfat")]
library(GGally)
ggpairs(data=vars,mapping=aes(col=gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
require(ggExtra)
p=ggplot(data=ob,aes(x=bmi,y=pcfat,fill=gender,col=gender))
p=p+geom_point()+geom_smooth()
ggMarginal(p,type="density",groupColour = T,groupFill = T)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'