R Markdown

Làm quen với phân tích biểu đồ:

Các thư viện lattice, ggplot2, gridExtra

#t=choose.files()
#t
students=read.csv("D:\\20190803 Hoc tap R\\Dataset\\PISA VN 2015.csv")
schools= read.csv("D:\\20190803 Hoc tap R\\Dataset\\PISA VN SCHOOLS 2015.csv")
names(schools)
## [1] "STRATUM"  "CNTSCHID" "SCHSIZE"  "CLSIZE"   "STRATIO"  "SCHLTYPE"
## [7] "Region"   "Area"
pisa  = merge(students,schools, by ="CNTSCHID")

head(pisa)
##   CNTSCHID   AGE Gender PARED  HEDRES MISCED FISCED HISCED  WEALTH    ESCS
## 1 70400001 15.58   Boys     9 -1.0418      1      2      2 -2.0697 -1.7899
## 2 70400001 15.92   Boys    12 -2.3041      1      4      4 -1.7903 -1.5423
## 3 70400001 15.42  Girls     9 -0.7218      1      2      2 -2.1942 -2.0475
## 4 70400001 15.58  Girls     5 -1.4595      1      0      1 -2.0301 -2.6136
## 5 70400001 15.92  Girls     9 -0.6172      2      2      2 -1.0522 -1.2179
## 6 70400001 16.25  Girls     5 -1.3738      1      1      1 -3.0570 -2.8451
##   INSTSCIE SCIEEFF JOYSCIE  ICTRES HOMEPOS HEDRES.1 CULTPOSS PV1MATH
## 1   0.9798  0.1421  2.1635 -1.5244 -2.0537  -1.0418  -0.7273 439.923
## 2   1.7359 -0.8432  2.1635 -1.9305 -2.2627  -2.3041  -0.2031 406.251
## 3  -0.2063 -0.1824 -0.1808 -1.6093 -1.9675  -0.7218  -0.2220 414.369
## 4  -0.3115 -1.0555 -0.4318 -1.6250 -2.0686  -1.4595  -0.7039 468.801
## 5   0.7648 -0.1954  1.3031 -0.5305 -0.9471  -0.6172  -0.0971 355.432
## 6   0.3708 -0.5652  0.5094 -2.5873 -2.6986  -1.3738  -0.2031 458.955
##   PV1READ PV1SCIE STRATUM SCHSIZE CLSIZE STRATIO SCHLTYPE Region  Area
## 1 412.290 475.612 VNM0313     883     18  22.075        3  SOUTH URBAN
## 2 409.598 450.320 VNM0313     883     18  22.075        3  SOUTH URBAN
## 3 384.307 405.787 VNM0313     883     18  22.075        3  SOUTH URBAN
## 4 459.104 462.968 VNM0313     883     18  22.075        3  SOUTH URBAN
## 5 402.435 453.736 VNM0313     883     18  22.075        3  SOUTH URBAN
## 6 483.885 529.866 VNM0313     883     18  22.075        3  SOUTH URBAN
par(mfrow=c(2,2))

hist(x=pisa$PV1SCIE, col="blue")
hist(x=pisa$PV1SCIE, col ="blue", border = "white")
hist(x=pisa$PV1SCIE, col ="blue", border = "white", xlab = "Diem Khoa hoc", ylab = "Tan suat", main = "Phan bo de diem cua khoa hoc")
hist(x=pisa$PV1SCIE, col ="blue", border = "white", xlab = "Diem Khoa hoc", ylab = "Tan suat", main = "Phan bo de diem cua khoa hoc", probability = T)
#prob tinh theo phan tram cua phan bo
lines(density(pisa$PV1SCIE),col= "blue")

p1=hist(pisa$PV1SCIE[pisa$Gender=="Boys"],plot = F)
p2=hist(pisa$PV1SCIE[pisa$Gender=="Girls"],plot = F)

plot(p1, col = "Skyblue", border = "white",main = "Bieu do phan bo cho nam va nu", xlab = "")
plot(p2, add = T, col = scales::alpha("green",0.5), border = "white")

Task 2: Biểu đồ với lattice()

par(mfrow=c(2,2))

#Vẽ density cho nhóm nữa với biến x là điểm pv1Scie

densityplot(~pisa$PV1SCIE, groups = pisa$Gender, data = pisa)

densityplot(~pisa$PV1SCIE, groups = pisa$Area, data = pisa)

densityplot(~pisa$PV1SCIE, groups = pisa$Area, data = pisa, auto.key = list(space="top"))

#Chia 3 cửa sổ 
library(gridExtra)
p1=densityplot(~pisa$PV1SCIE, groups = pisa$Area, data = pisa, auto.key = list(space="top"))
p2=densityplot(~pisa$PV1MATH,groups = pisa$Area, data = pisa)
p3=densityplot(~pisa$PV1READ,groups = pisa$Area, data = pisa)
grid.arrange(p1,p2,p3, nrow=3)

#Task3: Biểu đồ thanh với sjPlot

#gọi thư viện sjplot
library(sjPlot)
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
#tìm hiểu bao nhiêu học sinh từng vùng và miền
plot_frq(pisa$Area)

plot_frq(pisa$Region)

#Phân tích theo nhóm
sjp.xtab(pisa$Region, pisa$Area, margin = "row", bar.pos = "stack", show.summary = T)

sjp.xtab(pisa$Region, pisa$Area, margin = "row", bar.pos = "stack", show.summary = T, coord.flip = T)

sjp.grpfrq(pisa$PV1SCIE, pisa$Region, type="box" )
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

Task 6:

#t=choose.files()
#t
big5=read.csv("D:\\20190803 Hoc tap R\\Dataset\\Big Five Personality Data.csv")
head(big5)
##   race age engnat gender hand source country E1 E2 E3 E4 E5 E6 E7 E8 E9
## 1    3  53      1      1    1      1      US  4  2  5  2  5  1  4  3  5
## 2   13  46      1      2    1      1      US  2  2  3  3  3  3  1  5  1
## 3    1  14      2      2    1      1      PK  5  1  1  4  5  1  1  5  5
## 4    3  19      2      2    1      1      RO  2  5  2  4  3  4  3  4  4
## 5   11  25      2      2    1      2      US  3  1  3  3  3  1  3  1  3
## 6   13  31      1      2    1      2      US  1  5  2  4  1  3  2  4  1
##   E10 N1 N2 N3 N4 N5 N6 N7 N8 N9 N10 A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 C1 C2
## 1   1  1  5  2  5  1  1  1  1  1   1  1  5  1  5  2  3  1  5  4   5  4  1
## 2   5  2  3  4  2  3  4  3  2  2   4  1  3  3  4  4  4  2  3  4   3  4  1
## 3   1  5  1  5  5  5  5  5  5  5   5  5  1  5  5  1  5  1  5  5   5  4  1
## 4   5  5  4  4  2  4  5  5  5  4   5  2  5  4  4  3  5  3  4  4   3  3  3
## 5   5  3  3  3  4  3  3  3  3  3   4  5  5  3  5  1  5  1  5  5   5  3  1
## 6   5  1  5  4  5  1  4  4  1  5   2  2  2  3  4  3  4  3  5  5   3  2  5
##   C3 C4 C5 C6 C7 C8 C9 C10 O1 O2 O3 O4 O5 O6 O7 O8 O9 O10
## 1  5  1  5  1  4  1  4   5  4  1  3  1  5  1  4  2  5   5
## 2  3  2  3  1  5  1  4   4  3  3  3  3  2  3  3  1  3   2
## 3  5  1  5  1  5  1  5   5  4  5  5  1  5  1  5  5  5   5
## 4  4  5  1  4  5  4  2   3  4  3  5  2  4  2  5  2  5   5
## 5  5  3  3  1  1  3  3   3  3  1  1  1  3  1  3  1  5   3
## 6  4  3  3  4  5  3  5   3  4  2  1  3  3  5  5  4  5   3
names(big5)
##  [1] "race"    "age"     "engnat"  "gender"  "hand"    "source"  "country"
##  [8] "E1"      "E2"      "E3"      "E4"      "E5"      "E6"      "E7"     
## [15] "E8"      "E9"      "E10"     "N1"      "N2"      "N3"      "N4"     
## [22] "N5"      "N6"      "N7"      "N8"      "N9"      "N10"     "A1"     
## [29] "A2"      "A3"      "A4"      "A5"      "A6"      "A7"      "A8"     
## [36] "A9"      "A10"     "C1"      "C2"      "C3"      "C4"      "C5"     
## [43] "C6"      "C7"      "C8"      "C9"      "C10"     "O1"      "O2"     
## [50] "O3"      "O4"      "O5"      "O6"      "O7"      "O8"      "O9"     
## [57] "O10"
test= big5[, c("gender","E1","E2","E3","E4","E5")]
test$E1 = as.factor(test$E1)
test$E2 = as.factor(test$E2)
test$E3 = as.factor(test$E3)
test$E4 = as.factor(test$E4)
test$E5 = as.factor(test$E5)

library(sjPlot)

plot_likert(test)
## Warning: Detected uneven category count in items. Dropping last category.
## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

## Warning in freq[valid] <- counts: number of items to replace is not a
## multiple of replacement length

Ứng dụng học tập GGPlot2

#task 7 ngày 2:

#Doc du lieu
ob=read.csv("D:\\20190803 Hoc tap R\\Dataset\\Obesity data.csv")
#Tao bien OB
ob$OB[ob$bmi<18.5] = "Underweight"
ob$OB[ob$bmi>=18.5 & ob$bmi< 25] ="Normal"
ob$OB[ob$bmi >= 25 & ob$bmi < 30] = "Overweight"
ob$OB[ob$bmi>=30.0] <- "Obese"
ob$OB= factor(ob$OB, levels = c("Underweight","Normal","Overweight","Obese"))

#tim hieu phan bo cua obs
library(ggplot2)
p=ggplot(data=ob, aes(OB, fill= OB)) + geom_bar()
p= p + xlab("Nhom map")+ ylab ("Tan Suat")
p+ theme(legend.position = "none")

p

Vẽ biểu đồ tương qua giữa cân nặng và lượng mỡ dùng ggplot2

library(ggplot2)
library(ggthemes)
library(gridExtra)
#dua du lieu vao ggplot
p=ggplot (data = ob, aes(x=ob$weight, y=ob$pcfat, fill= ob$gender, color=ob$gender))
p=p+geom_point()+geom_smooth()
p=p+theme_economist()
p
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# ve bieu do diem va bieu do smooth

Biểu đồ tương quan và biểu đồ phân bô dùng package ggExtra

library(ggExtra)

p=ggplot(data=ob, aes(x=ob$bmi, y=ob$pcfat, fill=ob$gender, col=ob$gender))
p=p + geom_point() +geom_smooth()

ggMarginal(p, type = "histogram", groupColour = T, groupFill = T)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#thu nghiem voi density

ggMarginal(p, type = "density", groupColour = T, groupFill = T)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Biểu đồ tương quan đa biến (package GGally)

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
dat=ob[,c("gender","age","bmi","weight","height","pcfat")]
ggpairs(dat)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Them mau theo nhom
C1=ggpairs(data = dat, mapping = aes(color=gender))

C2=ggpairs(data = ob, mapping = aes(color =gender), columns = c("age","weight","bmi","pcfat"))

Bai tập về biểu đô phân bố môn khoa học

library(gridExtra)
p=ggplot (data=pisa, aes(x=pisa$PV1SCIE))
p1= p + geom_histogram(color="white", fill="blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p=ggplot(data=pisa, aes(x=pisa$PV1SCIE))
p = p + geom_histogram(aes (y=..density..),color="white", fill="blue")
p2= p + geom_density(col="red")

grid.arrange(p1,p2, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p=ggplot(data=pisa, aes(x=pisa$PV1SCIE, fill=pisa$Area))

p1=p +geom_histogram(position = "dodge")
p2= ggplot(data=pisa, aes(x=pisa$PV1SCIE,fill=pisa$Area, color=pisa$Area))
p2= p2+geom_density(alpha=0.1)

p2

grid.arrange(p1,p2, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Histogram và xác suất tích luỹ

p = ggplot(data=pisa, aes(x=pisa$HISCED))
p = p+ stat_ecdf(lwd=1, color = "red")
p = p + geom_bar(aes(y=(..count..)/sum(..count..)), fill="blue", colour="yellow")
p
## Warning: Removed 14 rows containing non-finite values (stat_ecdf).
## Warning: Removed 14 rows containing non-finite values (stat_count).

Task 10 vẽ biểu đồ hộp

p = ggplot(data=pisa, aes(x=pisa$Area,y=pisa$PV1SCIE,col=Area, fill = Area))
p1 = p + geom_boxplot(col="black")
p2 = p + geom_boxplot(col="black")+ geom_jitter(alpha=0.1)

grid.arrange(p1,p2,ncol=2)

Biểu đô hộp theo kinh tế

p= ggplot(data=pisa, aes(x=pisa$PARED,y=pisa$PV1SCIE, fill=pisa$PARED))
p1= p + geom_boxplot(col="black") + geom_jitter(alpha=0.02)
p1
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 14 rows containing missing values (stat_boxplot).
## Warning: Removed 14 rows containing missing values (geom_point).

p= ggplot(data =pisa, aes(x=pisa$PARED, y=pisa$PV1SCIE, fill=pisa$PARED, col=pisa$PARED))
p2= p + geom_boxplot(col="black") + geom_jitter(alpha = 0.05)
p2
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 14 rows containing missing values (stat_boxplot).
## Warning: Removed 14 rows containing missing values (geom_point).

grid.arrange(p1, p2, ncol=2)
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 14 rows containing missing values (stat_boxplot).
## Warning: Removed 14 rows containing missing values (geom_point).
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 14 rows containing missing values (stat_boxplot).
## Warning: Removed 14 rows containing missing values (geom_point).

Including Plots