Các thư viện lattice, ggplot2, gridExtra
#t=choose.files()
#t
students=read.csv("D:\\20190803 Hoc tap R\\Dataset\\PISA VN 2015.csv")
schools= read.csv("D:\\20190803 Hoc tap R\\Dataset\\PISA VN SCHOOLS 2015.csv")
names(schools)
## [1] "STRATUM" "CNTSCHID" "SCHSIZE" "CLSIZE" "STRATIO" "SCHLTYPE"
## [7] "Region" "Area"
pisa = merge(students,schools, by ="CNTSCHID")
head(pisa)
## CNTSCHID AGE Gender PARED HEDRES MISCED FISCED HISCED WEALTH ESCS
## 1 70400001 15.58 Boys 9 -1.0418 1 2 2 -2.0697 -1.7899
## 2 70400001 15.92 Boys 12 -2.3041 1 4 4 -1.7903 -1.5423
## 3 70400001 15.42 Girls 9 -0.7218 1 2 2 -2.1942 -2.0475
## 4 70400001 15.58 Girls 5 -1.4595 1 0 1 -2.0301 -2.6136
## 5 70400001 15.92 Girls 9 -0.6172 2 2 2 -1.0522 -1.2179
## 6 70400001 16.25 Girls 5 -1.3738 1 1 1 -3.0570 -2.8451
## INSTSCIE SCIEEFF JOYSCIE ICTRES HOMEPOS HEDRES.1 CULTPOSS PV1MATH
## 1 0.9798 0.1421 2.1635 -1.5244 -2.0537 -1.0418 -0.7273 439.923
## 2 1.7359 -0.8432 2.1635 -1.9305 -2.2627 -2.3041 -0.2031 406.251
## 3 -0.2063 -0.1824 -0.1808 -1.6093 -1.9675 -0.7218 -0.2220 414.369
## 4 -0.3115 -1.0555 -0.4318 -1.6250 -2.0686 -1.4595 -0.7039 468.801
## 5 0.7648 -0.1954 1.3031 -0.5305 -0.9471 -0.6172 -0.0971 355.432
## 6 0.3708 -0.5652 0.5094 -2.5873 -2.6986 -1.3738 -0.2031 458.955
## PV1READ PV1SCIE STRATUM SCHSIZE CLSIZE STRATIO SCHLTYPE Region Area
## 1 412.290 475.612 VNM0313 883 18 22.075 3 SOUTH URBAN
## 2 409.598 450.320 VNM0313 883 18 22.075 3 SOUTH URBAN
## 3 384.307 405.787 VNM0313 883 18 22.075 3 SOUTH URBAN
## 4 459.104 462.968 VNM0313 883 18 22.075 3 SOUTH URBAN
## 5 402.435 453.736 VNM0313 883 18 22.075 3 SOUTH URBAN
## 6 483.885 529.866 VNM0313 883 18 22.075 3 SOUTH URBAN
par(mfrow=c(2,2))
hist(x=pisa$PV1SCIE, col="blue")
hist(x=pisa$PV1SCIE, col ="blue", border = "white")
hist(x=pisa$PV1SCIE, col ="blue", border = "white", xlab = "Diem Khoa hoc", ylab = "Tan suat", main = "Phan bo de diem cua khoa hoc")
hist(x=pisa$PV1SCIE, col ="blue", border = "white", xlab = "Diem Khoa hoc", ylab = "Tan suat", main = "Phan bo de diem cua khoa hoc", probability = T)
#prob tinh theo phan tram cua phan bo
lines(density(pisa$PV1SCIE),col= "blue")
p1=hist(pisa$PV1SCIE[pisa$Gender=="Boys"],plot = F)
p2=hist(pisa$PV1SCIE[pisa$Gender=="Girls"],plot = F)
plot(p1, col = "Skyblue", border = "white",main = "Bieu do phan bo cho nam va nu", xlab = "")
plot(p2, add = T, col = scales::alpha("green",0.5), border = "white")
Task 2: Biểu đồ với lattice()
par(mfrow=c(2,2))
#Vẽ density cho nhóm nữa với biến x là điểm pv1Scie
densityplot(~pisa$PV1SCIE, groups = pisa$Gender, data = pisa)
densityplot(~pisa$PV1SCIE, groups = pisa$Area, data = pisa)
densityplot(~pisa$PV1SCIE, groups = pisa$Area, data = pisa, auto.key = list(space="top"))
#Chia 3 cửa sổ
library(gridExtra)
p1=densityplot(~pisa$PV1SCIE, groups = pisa$Area, data = pisa, auto.key = list(space="top"))
p2=densityplot(~pisa$PV1MATH,groups = pisa$Area, data = pisa)
p3=densityplot(~pisa$PV1READ,groups = pisa$Area, data = pisa)
grid.arrange(p1,p2,p3, nrow=3)
#Task3: Biểu đồ thanh với sjPlot
#gọi thư viện sjplot
library(sjPlot)
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
#tìm hiểu bao nhiêu học sinh từng vùng và miền
plot_frq(pisa$Area)
plot_frq(pisa$Region)
#Phân tích theo nhóm
sjp.xtab(pisa$Region, pisa$Area, margin = "row", bar.pos = "stack", show.summary = T)
sjp.xtab(pisa$Region, pisa$Area, margin = "row", bar.pos = "stack", show.summary = T, coord.flip = T)
sjp.grpfrq(pisa$PV1SCIE, pisa$Region, type="box" )
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
Task 6:
#t=choose.files()
#t
big5=read.csv("D:\\20190803 Hoc tap R\\Dataset\\Big Five Personality Data.csv")
head(big5)
## race age engnat gender hand source country E1 E2 E3 E4 E5 E6 E7 E8 E9
## 1 3 53 1 1 1 1 US 4 2 5 2 5 1 4 3 5
## 2 13 46 1 2 1 1 US 2 2 3 3 3 3 1 5 1
## 3 1 14 2 2 1 1 PK 5 1 1 4 5 1 1 5 5
## 4 3 19 2 2 1 1 RO 2 5 2 4 3 4 3 4 4
## 5 11 25 2 2 1 2 US 3 1 3 3 3 1 3 1 3
## 6 13 31 1 2 1 2 US 1 5 2 4 1 3 2 4 1
## E10 N1 N2 N3 N4 N5 N6 N7 N8 N9 N10 A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 C1 C2
## 1 1 1 5 2 5 1 1 1 1 1 1 1 5 1 5 2 3 1 5 4 5 4 1
## 2 5 2 3 4 2 3 4 3 2 2 4 1 3 3 4 4 4 2 3 4 3 4 1
## 3 1 5 1 5 5 5 5 5 5 5 5 5 1 5 5 1 5 1 5 5 5 4 1
## 4 5 5 4 4 2 4 5 5 5 4 5 2 5 4 4 3 5 3 4 4 3 3 3
## 5 5 3 3 3 4 3 3 3 3 3 4 5 5 3 5 1 5 1 5 5 5 3 1
## 6 5 1 5 4 5 1 4 4 1 5 2 2 2 3 4 3 4 3 5 5 3 2 5
## C3 C4 C5 C6 C7 C8 C9 C10 O1 O2 O3 O4 O5 O6 O7 O8 O9 O10
## 1 5 1 5 1 4 1 4 5 4 1 3 1 5 1 4 2 5 5
## 2 3 2 3 1 5 1 4 4 3 3 3 3 2 3 3 1 3 2
## 3 5 1 5 1 5 1 5 5 4 5 5 1 5 1 5 5 5 5
## 4 4 5 1 4 5 4 2 3 4 3 5 2 4 2 5 2 5 5
## 5 5 3 3 1 1 3 3 3 3 1 1 1 3 1 3 1 5 3
## 6 4 3 3 4 5 3 5 3 4 2 1 3 3 5 5 4 5 3
names(big5)
## [1] "race" "age" "engnat" "gender" "hand" "source" "country"
## [8] "E1" "E2" "E3" "E4" "E5" "E6" "E7"
## [15] "E8" "E9" "E10" "N1" "N2" "N3" "N4"
## [22] "N5" "N6" "N7" "N8" "N9" "N10" "A1"
## [29] "A2" "A3" "A4" "A5" "A6" "A7" "A8"
## [36] "A9" "A10" "C1" "C2" "C3" "C4" "C5"
## [43] "C6" "C7" "C8" "C9" "C10" "O1" "O2"
## [50] "O3" "O4" "O5" "O6" "O7" "O8" "O9"
## [57] "O10"
test= big5[, c("gender","E1","E2","E3","E4","E5")]
test$E1 = as.factor(test$E1)
test$E2 = as.factor(test$E2)
test$E3 = as.factor(test$E3)
test$E4 = as.factor(test$E4)
test$E5 = as.factor(test$E5)
library(sjPlot)
plot_likert(test)
## Warning: Detected uneven category count in items. Dropping last category.
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length
Ứng dụng học tập GGPlot2
#task 7 ngày 2:
#Doc du lieu
ob=read.csv("D:\\20190803 Hoc tap R\\Dataset\\Obesity data.csv")
#Tao bien OB
ob$OB[ob$bmi<18.5] = "Underweight"
ob$OB[ob$bmi>=18.5 & ob$bmi< 25] ="Normal"
ob$OB[ob$bmi >= 25 & ob$bmi < 30] = "Overweight"
ob$OB[ob$bmi>=30.0] <- "Obese"
ob$OB= factor(ob$OB, levels = c("Underweight","Normal","Overweight","Obese"))
#tim hieu phan bo cua obs
library(ggplot2)
p=ggplot(data=ob, aes(OB, fill= OB)) + geom_bar()
p= p + xlab("Nhom map")+ ylab ("Tan Suat")
p+ theme(legend.position = "none")
p
Vẽ biểu đồ tương qua giữa cân nặng và lượng mỡ dùng ggplot2
library(ggplot2)
library(ggthemes)
library(gridExtra)
#dua du lieu vao ggplot
p=ggplot (data = ob, aes(x=ob$weight, y=ob$pcfat, fill= ob$gender, color=ob$gender))
p=p+geom_point()+geom_smooth()
p=p+theme_economist()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# ve bieu do diem va bieu do smooth
Biểu đồ tương quan và biểu đồ phân bô dùng package ggExtra
library(ggExtra)
p=ggplot(data=ob, aes(x=ob$bmi, y=ob$pcfat, fill=ob$gender, col=ob$gender))
p=p + geom_point() +geom_smooth()
ggMarginal(p, type = "histogram", groupColour = T, groupFill = T)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#thu nghiem voi density
ggMarginal(p, type = "density", groupColour = T, groupFill = T)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Biểu đồ tương quan đa biến (package GGally)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
dat=ob[,c("gender","age","bmi","weight","height","pcfat")]
ggpairs(dat)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Them mau theo nhom
C1=ggpairs(data = dat, mapping = aes(color=gender))
C2=ggpairs(data = ob, mapping = aes(color =gender), columns = c("age","weight","bmi","pcfat"))
Bai tập về biểu đô phân bố môn khoa học
library(gridExtra)
p=ggplot (data=pisa, aes(x=pisa$PV1SCIE))
p1= p + geom_histogram(color="white", fill="blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p=ggplot(data=pisa, aes(x=pisa$PV1SCIE))
p = p + geom_histogram(aes (y=..density..),color="white", fill="blue")
p2= p + geom_density(col="red")
grid.arrange(p1,p2, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p=ggplot(data=pisa, aes(x=pisa$PV1SCIE, fill=pisa$Area))
p1=p +geom_histogram(position = "dodge")
p2= ggplot(data=pisa, aes(x=pisa$PV1SCIE,fill=pisa$Area, color=pisa$Area))
p2= p2+geom_density(alpha=0.1)
p2
grid.arrange(p1,p2, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram và xác suất tích luỹ
p = ggplot(data=pisa, aes(x=pisa$HISCED))
p = p+ stat_ecdf(lwd=1, color = "red")
p = p + geom_bar(aes(y=(..count..)/sum(..count..)), fill="blue", colour="yellow")
p
## Warning: Removed 14 rows containing non-finite values (stat_ecdf).
## Warning: Removed 14 rows containing non-finite values (stat_count).
Task 10 vẽ biểu đồ hộp
p = ggplot(data=pisa, aes(x=pisa$Area,y=pisa$PV1SCIE,col=Area, fill = Area))
p1 = p + geom_boxplot(col="black")
p2 = p + geom_boxplot(col="black")+ geom_jitter(alpha=0.1)
grid.arrange(p1,p2,ncol=2)
Biểu đô hộp theo kinh tế
p= ggplot(data=pisa, aes(x=pisa$PARED,y=pisa$PV1SCIE, fill=pisa$PARED))
p1= p + geom_boxplot(col="black") + geom_jitter(alpha=0.02)
p1
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 14 rows containing missing values (stat_boxplot).
## Warning: Removed 14 rows containing missing values (geom_point).
p= ggplot(data =pisa, aes(x=pisa$PARED, y=pisa$PV1SCIE, fill=pisa$PARED, col=pisa$PARED))
p2= p + geom_boxplot(col="black") + geom_jitter(alpha = 0.05)
p2
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 14 rows containing missing values (stat_boxplot).
## Warning: Removed 14 rows containing missing values (geom_point).
grid.arrange(p1, p2, ncol=2)
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 14 rows containing missing values (stat_boxplot).
## Warning: Removed 14 rows containing missing values (geom_point).
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 14 rows containing missing values (stat_boxplot).
## Warning: Removed 14 rows containing missing values (geom_point).