Đọc dữ liệu 2019

library(ggplot2); library(table1); library(dplyr)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggridges); library(gridExtra); library(viridis)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Loading required package: viridisLite
# Load the 2019 national exam results.
vn <- read.csv(
  "~/Dropbox/Bao chi va Khoa hoc/Pho diem 2019/THPT 2019 Quoc gia.csv",
  header = TRUE  # spelled out: `T` is an ordinary variable that can be reassigned
)

# Foreign-language exam code -> language name. Rows whose Code is not one of
# N1..N6 are left untouched (NA when the column is created here).
language_by_code <- c(
  N1 = "English", N2 = "Russian", N3 = "French",
  N4 = "Chinese", N5 = "German", N6 = "Japanese"
)
for (code in names(language_by_code)) {
  vn$Language[vn$Code == code] <- language_by_code[[code]]
}

# Candidates whose number (SoBD) is below 10,000,000: the province code is the
# first digit of the number (presumably an 8-digit id whose leading zero was
# lost when the column was read as a number - confirm against the raw file).
d1 <- subset(vn, SoBD < 10000000)

# One-digit province code -> province name.
# Fix: code 8 is Lao Cai and code 9 is Tuyen Quang. The previous mapping
# labelled 8 as "Tuyen Quang" and 9 as "Lang Son" (which is code 10), so Lao Cai
# never appeared and Tuyen Quang's candidates were merged into Lang Son.
province_by_code <- c(
  "1" = "Ha Noi", "2" = "TPHCM", "3" = "Hai Phong",
  "4" = "Da Nang", "5" = "Ha Giang", "6" = "Cao Bang",
  "7" = "Lai Chau", "8" = "Lao Cai", "9" = "Tuyen Quang"
)

# Vectorised lookup; a code that is not in the table yields NA.
d1$Province <- unname(province_by_code[substr(as.character(d1$SoBD), 1, 1)])

# Candidates whose number (SoBD) has eight digits: the first two digits are the
# province code.
# `>=` rather than `>`: the one-digit group is selected with `SoBD < 10000000`,
# so a strict `>` here would silently drop a candidate numbered 10000000.
d2 <- subset(vn, SoBD >= 10000000)

# Two-digit province code -> province name (there is no code 20 in the table).
province_by_code <- c(
  "10" = "Lang Son", "11" = "Bac Kan", "12" = "Thai Nguyen",
  "13" = "Yen Bai", "14" = "Son La", "15" = "Phu Tho",
  "16" = "Vinh Phuc", "17" = "Quang Ninh", "18" = "Bac Giang",
  "19" = "Bac Ninh", "21" = "Hai Duong", "22" = "Hung Yen",
  "23" = "Hoa Binh", "24" = "Ha Nam", "25" = "Nam Dinh",
  "26" = "Thai Binh", "27" = "Ninh Binh", "28" = "Thanh Hoa",
  "29" = "Nghe An", "30" = "Ha Tinh", "31" = "Quang Binh",
  "32" = "Quang Tri", "33" = "Hue-TT", "34" = "Quang Nam",
  "35" = "Quang Ngai", "36" = "Kon Tum", "37" = "Binh Dinh",
  "38" = "Gia Lai", "39" = "Phu Yen", "40" = "Dak Lak",
  "41" = "Khanh Hoa", "42" = "Lam Dong", "43" = "Binh Phuoc",
  "44" = "Binh Duong", "45" = "Ninh Thuan", "46" = "Tay Ninh",
  "47" = "Binh Thuan", "48" = "Dong Nai", "49" = "Long An",
  "50" = "Dong Thap", "51" = "An Giang", "52" = "BR-VT",
  "53" = "Tien Giang", "54" = "Kien Giang", "55" = "Can Tho",
  "56" = "Ben Tre", "57" = "Vinh Long", "58" = "Tra Vinh",
  "59" = "Soc Trang", "60" = "Bac Lieu", "61" = "Ca Mau",
  "62" = "Dien Bien", "63" = "Dak Nong", "64" = "Hau Giang",
  "65" = "Quoc Phong"
)

# Vectorised lookup; a code that is not in the table yields NA.
d2$Province <- unname(province_by_code[substr(as.character(d2$SoBD), 1, 2)])

# Keep only the analysis columns of both candidate groups, stack them and tag
# the rows with the exam year.
score_cols <- c(
  "ID", "SoBD", "Province", "Math", "Viet", "ForeignLanguage",
  "Physics", "Chemistry", "Biology", "History", "Geography"
)
d1 <- d1[, score_cols]
d2 <- d2[, score_cols]

vn <- rbind(d1, d2)
vn$Year <- 2019
# Expose the foreign-language score under the name "English", the column name
# selected from the 2018 data further down.
vn$English <- vn$ForeignLanguage

# Final 2019 table: same column order, with English in place of
# ForeignLanguage and Year appended.
vn19 <- vn[, c(replace(score_cols, score_cols == "ForeignLanguage", "English"), "Year")]

Đọc dữ liệu 2018

# Load the 2018 national exam results; empty fields are read as NA.
vn <- read.csv(
  "~/Dropbox/Bao chi va Khoa hoc/Pho diem 2018/THPT 2018 Quoc gia.csv",
  na.strings = ""
)

# Candidates whose number (SoBD) is below 10,000,000: the province code is the
# first digit of the number (presumably an 8-digit id whose leading zero was
# lost when the column was read as a number - confirm against the raw file).
d1 <- subset(vn, SoBD < 10000000)

# One-digit province code -> province name.
# Fix: code 8 is Lao Cai and code 9 is Tuyen Quang. The previous mapping
# labelled 8 as "Tuyen Quang" and 9 as "Lang Son" (which is code 10), so Lao Cai
# never appeared and Tuyen Quang's candidates were merged into Lang Son.
province_by_code <- c(
  "1" = "Ha Noi", "2" = "TPHCM", "3" = "Hai Phong",
  "4" = "Da Nang", "5" = "Ha Giang", "6" = "Cao Bang",
  "7" = "Lai Chau", "8" = "Lao Cai", "9" = "Tuyen Quang"
)

# Vectorised lookup; a code that is not in the table yields NA.
d1$Province <- unname(province_by_code[substr(as.character(d1$SoBD), 1, 1)])

# Candidates whose number (SoBD) has eight digits: the first two digits are the
# province code.
# `>=` rather than `>`: the one-digit group is selected with `SoBD < 10000000`,
# so a strict `>` here would silently drop a candidate numbered 10000000.
d2 <- subset(vn, SoBD >= 10000000)

# Two-digit province code -> province name (there is no code 20 in the table).
province_by_code <- c(
  "10" = "Lang Son", "11" = "Bac Kan", "12" = "Thai Nguyen",
  "13" = "Yen Bai", "14" = "Son La", "15" = "Phu Tho",
  "16" = "Vinh Phuc", "17" = "Quang Ninh", "18" = "Bac Giang",
  "19" = "Bac Ninh", "21" = "Hai Duong", "22" = "Hung Yen",
  "23" = "Hoa Binh", "24" = "Ha Nam", "25" = "Nam Dinh",
  "26" = "Thai Binh", "27" = "Ninh Binh", "28" = "Thanh Hoa",
  "29" = "Nghe An", "30" = "Ha Tinh", "31" = "Quang Binh",
  "32" = "Quang Tri", "33" = "Hue-TT", "34" = "Quang Nam",
  "35" = "Quang Ngai", "36" = "Kon Tum", "37" = "Binh Dinh",
  "38" = "Gia Lai", "39" = "Phu Yen", "40" = "Dak Lak",
  "41" = "Khanh Hoa", "42" = "Lam Dong", "43" = "Binh Phuoc",
  "44" = "Binh Duong", "45" = "Ninh Thuan", "46" = "Tay Ninh",
  "47" = "Binh Thuan", "48" = "Dong Nai", "49" = "Long An",
  "50" = "Dong Thap", "51" = "An Giang", "52" = "BR-VT",
  "53" = "Tien Giang", "54" = "Kien Giang", "55" = "Can Tho",
  "56" = "Ben Tre", "57" = "Vinh Long", "58" = "Tra Vinh",
  "59" = "Soc Trang", "60" = "Bac Lieu", "61" = "Ca Mau",
  "62" = "Dien Bien", "63" = "Dak Nong", "64" = "Hau Giang",
  "65" = "Quoc Phong"
)

# Vectorised lookup; a code that is not in the table yields NA.
d2$Province <- unname(province_by_code[substr(as.character(d2$SoBD), 1, 2)])

# Keep only the analysis columns of both 2018 candidate groups.
score_cols <- c(
  "ID", "SoBD", "Province", "Math", "Viet", "English",
  "Physics", "Chemistry", "Biology", "History", "Geography"
)
d1 <- d1[, score_cols]
d2 <- d2[, score_cols]

# Son La's 2018 results come from a separate file. It is given the two columns
# the national table has: the row ID doubles as the candidate number and the
# province is fixed. (The former `sonla$ID = sonla$ID` was a no-op and is gone.)
sonla <- read.csv("~/Dropbox/Bao chi va Khoa hoc/Pho diem 2018/Son La.csv", na.strings = "")
sonla$SoBD <- sonla$ID
sonla$Province <- "Son La"

# Select the same columns from sonla so rbind() does not depend on the extra
# file having exactly these columns and nothing else.
vn18 <- rbind(d1, d2, sonla[, score_cols])
vn18$Year <- 2018

Hợp nhất 2 dữ liệu

# Stack both years; Year becomes a factor so it can serve as a grouping
# variable in the plots and tables below.
vn <- rbind(vn19, vn18)
vn[["Year"]] <- as.factor(vn[["Year"]])
head(vn)
##   ID    SoBD Province Math Viet English Physics Chemistry Biology History
## 1  1 1000029   Ha Noi  7.2 5.00     6.8       6       4.5    4.00      NA
## 2  2 1000030   Ha Noi  6.2 6.25     8.0       5       4.5    3.75      NA
## 3  3 1000031   Ha Noi  6.8 5.75     7.2      NA        NA      NA    4.25
## 4  4 1000032   Ha Noi  5.8 4.50     9.0      NA        NA      NA    7.00
## 5  5 1000033   Ha Noi  7.0 5.50     3.6      NA        NA      NA    5.25
## 6  6 1000034   Ha Noi  7.0 6.00     7.2      NA        NA      NA    3.75
##   Geography Year
## 1        NA 2019
## 2        NA 2019
## 3       5.5 2019
## 4       6.5 2019
## 5       7.5 2019
## 6       6.5 2019

Điểm trung bình năm 2019 cho từng tỉnh

# Mean 2019 score per province, drawn as one horizontal bar chart per subject.
#
# plot_province_means(): builds the chart for a single score column.
#   data       data frame holding a "Province" column and the score column
#   subject    name of the score column (character scalar)
#   axis_label title of the score axis
# Returns the ggplot object; it is drawn when the call is made at top level.
plot_province_means <- function(data, subject, axis_label) {
  # Rows with a missing province or score do not contribute to the mean.
  scores <- na.omit(data[, c("Province", subject)])
  means <- aggregate(scores[[subject]], by = list(scores$Province), FUN = mean)
  colnames(means) <- c("Province", "Mean")
  # Order the bars by mean score instead of alphabetically.
  means$Province <- reorder(means$Province, means$Mean)

  ggplot(means, aes(x = Province, y = Mean, fill = Province)) +
    geom_bar(stat = "identity", width = 1, color = "white",
             position = position_dodge()) +
    # geom_text() has no `ymax` aesthetic and `size` used to be passed twice
    # (3 and 1); both only produced warnings. A single size = 3 is kept.
    geom_text(aes(label = round(Mean, 2)),
              position = position_dodge(width = 1),
              size = 3, vjust = 0.5, hjust = 1, color = "black") +
    theme(legend.position = "none") +
    xlab("") +
    ylab(axis_label) +
    coord_flip()
}

# Math
plot_province_means(vn19, "Math", "Điểm môn toán ")

# Physics
plot_province_means(vn19, "Physics", "Điểm môn vật lí ")

# Chemistry
plot_province_means(vn19, "Chemistry", "Điểm môn hóa ")

# Biology
plot_province_means(vn19, "Biology", "Điểm môn sinh học")

# Foreign language
plot_province_means(vn19, "English", "Điểm môn ngoại ngữ")

# History
plot_province_means(vn19, "History", "Điểm môn sử")

# Geography
plot_province_means(vn19, "Geography", "Điểm môn địa lí")

So sánh phân bố 2018 và 2019

# Math, physics, chemistry, biology: 2018 and 2019 score densities overlaid.
# Layers shared by the four panels; only the mapped column and the x label vary.
density_layers <- list(
  geom_density(alpha = 0.1),
  ylab("Xác suất"),
  theme(legend.position = "top")
)

p1 <- ggplot(vn, aes(x = Math, fill = Year, col = Year)) +
  density_layers + xlab("Môn toán")
p2 <- ggplot(vn, aes(x = Physics, fill = Year, col = Year)) +
  density_layers + xlab("Môn lí")
p3 <- ggplot(vn, aes(x = Chemistry, fill = Year, col = Year)) +
  density_layers + xlab("Môn hoá")
p4 <- ggplot(vn, aes(x = Biology, fill = Year, col = Year)) +
  density_layers + xlab("Môn sinh")

grid.arrange(p1, p2, p3, p4, ncol = 2)
## Warning: Removed 7912 rows containing non-finite values (stat_density).
## Warning: Removed 977737 rows containing non-finite values (stat_density).
## Warning: Removed 970296 rows containing non-finite values (stat_density).
## Warning: Removed 982621 rows containing non-finite values (stat_density).

# Literature, foreign language, history, geography: 2018 and 2019 score
# densities overlaid.
# Layers shared by the four panels; only the mapped column and the x label vary.
density_layers <- list(
  geom_density(alpha = 0.1),
  ylab("Xác suất"),
  theme(legend.position = "top")
)

p1 <- ggplot(vn, aes(x = Viet, fill = Year, col = Year)) +
  density_layers + xlab("Môn văn")
p2 <- ggplot(vn, aes(x = English, fill = Year, col = Year)) +
  density_layers + xlab("Môn ngoại ngữ")
p3 <- ggplot(vn, aes(x = History, fill = Year, col = Year)) +
  density_layers + xlab("Môn sử")
p4 <- ggplot(vn, aes(x = Geography, fill = Year, col = Year)) +
  density_layers + xlab("Môn địa")

grid.arrange(p1, p2, p3, p4, ncol = 2)
## Warning: Removed 30775 rows containing non-finite values (stat_density).
## Warning: Removed 177288 rows containing non-finite values (stat_density).
## Warning: Removed 612001 rows containing non-finite values (stat_density).
## Warning: Removed 632588 rows containing non-finite values (stat_density).

So sánh 2019 và 2018 cho top 10 và bottom 10

# Rows (both years) of the ten provinces listed as the top group.
top_provinces <- c(
  "Nam Dinh", "TPHCM", "Ha Nam", "Binh Duong", "Thai Binh",
  "BR-VT", "Hai Phong", "Bac Ninh", "Ninh Binh", "Ha Noi"
)
top10 <- vn[vn$Province %in% top_provinces, ]

# Rows (both years) of the ten provinces listed as the bottom group.
bottom_provinces <- c(
  "Son La", "Ha Giang", "Hoa Binh", "Cao Bang", "Bac Kan",
  "Lang Son", "Yen Bai", "Dien Bien", "Lai Chau", "Quang Ninh"
)
bot10 <- vn[vn$Province %in% bottom_provinces, ]

# Math: per-province ridge-line densities, top group (left) and bottom group
# (right). The same plot recipe is applied to both subsets.
panels <- lapply(list(top10, bot10), function(group) {
  ggplot(data = group, aes(x = Math, y = Province, col = Province, fill = ..x..)) +
    geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
    labs(x = "Môn toán", y = "") +
    theme(legend.position = "none") +
    scale_fill_viridis()
})

grid.arrange(panels[[1]], panels[[2]], ncol = 2)
## Picking joint bandwidth of 0.179
## Picking joint bandwidth of 0.198

# Physics: per-province ridge-line densities, top group (left) and bottom
# group (right). The same plot recipe is applied to both subsets.
panels <- lapply(list(top10, bot10), function(group) {
  ggplot(data = group, aes(x = Physics, y = Province, col = Province, fill = ..x..)) +
    geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
    labs(x = "Môn lí", y = "") +
    theme(legend.position = "none") +
    scale_fill_viridis()
})

grid.arrange(panels[[1]], panels[[2]], ncol = 2)
## Picking joint bandwidth of 0.201
## Picking joint bandwidth of 0.329

# Chemistry: per-province ridge-line densities, top group (left) and bottom
# group (right).
# Fix: both panels used to map x to Physics, so the figure labelled as
# chemistry was a copy of the physics figure.
p <- ggplot(data = top10, aes(x = Chemistry, y = Province, col = Province, fill = ..x..))
p <- p + geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01)
p1 <- p + ylab("") + xlab("Môn hoá") + theme(legend.position = "none") + scale_fill_viridis()

p <- ggplot(data = bot10, aes(x = Chemistry, y = Province, col = Province, fill = ..x..))
p <- p + geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01)
p2 <- p + ylab("") + xlab("Môn hoá") + theme(legend.position = "none") + scale_fill_viridis()

grid.arrange(p1, p2, ncol = 2)
# The joint bandwidths recorded before the fix (0.201 and 0.329) were computed
# from the Physics scores; the Chemistry values will differ.

# Foreign language: per-province ridge-line densities, top group (left) and
# bottom group (right). The same plot recipe is applied to both subsets.
panels <- lapply(list(top10, bot10), function(group) {
  ggplot(data = group, aes(x = English, y = Province, col = Province, fill = ..x..)) +
    geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
    labs(x = "Môn ngoại ngữ", y = "") +
    theme(legend.position = "none") +
    scale_fill_viridis()
})

grid.arrange(panels[[1]], panels[[2]], ncol = 2)
## Picking joint bandwidth of 0.191
## Picking joint bandwidth of 0.14

So sánh 2018 và 2019 vài tỉnh

# Ha Giang: 2018 vs 2019 score distributions, one ridge-line panel per subject.
ha_giang <- subset(vn, Province == "Ha Giang")

# Layers shared by the four panels; the mapped column, x label and colour
# palette differ per panel.
ridge_by_year <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(ha_giang, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn toán (Hà Giang)") + scale_fill_viridis(option = "C")
p2 <- ggplot(ha_giang, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn lí (Hà Giang)") + scale_fill_viridis(option = "D")
p3 <- ggplot(ha_giang, aes(x = Chemistry, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn hoá (Hà Giang)") + scale_fill_viridis(option = "E")
# Palette "F" did not exist in the viridisLite version this was run with: it
# warned and fell back to the default, duplicating the physics palette.
# "B" (inferno) is valid and keeps the four panels distinct.
p4 <- ggplot(ha_giang, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn ngoại ngữ (Hà Giang)") + scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)
## Picking joint bandwidth of 0.217
## Picking joint bandwidth of 0.435
## Picking joint bandwidth of 0.438
## Picking joint bandwidth of 0.13

# Son La: 2018 vs 2019 score distributions, one ridge-line panel per subject.
son_la <- subset(vn, Province == "Son La")

# Layers shared by the four panels; the mapped column, x label and colour
# palette differ per panel.
ridge_by_year <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(son_la, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn toán (Sơn La) ") + scale_fill_viridis(option = "C")
p2 <- ggplot(son_la, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn lí (Sơn La)") + scale_fill_viridis(option = "D")
p3 <- ggplot(son_la, aes(x = Chemistry, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn hoá (Sơn La)") + scale_fill_viridis(option = "E")
# Palette "F" did not exist in the viridisLite version this was run with: it
# warned and fell back to the default, duplicating the physics palette.
# "B" (inferno) is valid and keeps the four panels distinct.
p4 <- ggplot(son_la, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn ngoại ngữ (Sơn La)") + scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)
## Picking joint bandwidth of 0.169
## Picking joint bandwidth of 0.364
## Picking joint bandwidth of 0.355
## Picking joint bandwidth of 0.11

# Bac Kan: 2018 vs 2019 score distributions, one ridge-line panel per subject.
bac_kan <- subset(vn, Province == "Bac Kan")

# Layers shared by the four panels; the mapped column, x label and colour
# palette differ per panel.
ridge_by_year <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(bac_kan, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn toán (Bắc Kạn) ") + scale_fill_viridis(option = "C")
p2 <- ggplot(bac_kan, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn lí (Bắc Kạn)") + scale_fill_viridis(option = "D")
p3 <- ggplot(bac_kan, aes(x = Chemistry, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn hoá (Bắc Kạn)") + scale_fill_viridis(option = "E")
# Palette "F" did not exist in the viridisLite version this was run with: it
# warned and fell back to the default, duplicating the physics palette.
# "B" (inferno) is valid and keeps the four panels distinct.
p4 <- ggplot(bac_kan, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn ngoại ngữ (Bắc Kạn)") + scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)
## Picking joint bandwidth of 0.255
## Picking joint bandwidth of 0.416
## Picking joint bandwidth of 0.433
## Picking joint bandwidth of 0.197

# Hoa Binh: 2018 vs 2019 score distributions, one ridge-line panel per subject.
hoa_binh <- subset(vn, Province == "Hoa Binh")

# Layers shared by the four panels; the mapped column, x label and colour
# palette differ per panel.
ridge_by_year <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(hoa_binh, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn toán (Hoà Bình) ") + scale_fill_viridis(option = "C")
p2 <- ggplot(hoa_binh, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn lí (Hoà Bình)") + scale_fill_viridis(option = "D")
p3 <- ggplot(hoa_binh, aes(x = Chemistry, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn hoá (Hoà Bình)") + scale_fill_viridis(option = "E")
# Palette "F" did not exist in the viridisLite version this was run with: it
# warned and fell back to the default, duplicating the physics palette.
# "B" (inferno) is valid and keeps the four panels distinct.
p4 <- ggplot(hoa_binh, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn ngoại ngữ (Hoà Bình)") + scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)
## Picking joint bandwidth of 0.236
## Picking joint bandwidth of 0.353
## Picking joint bandwidth of 0.356
## Picking joint bandwidth of 0.123

# TPHCM: 2018 vs 2019 score distributions, one ridge-line panel per subject.
tphcm <- subset(vn, Province == "TPHCM")

# Layers shared by the four panels; the mapped column, x label and colour
# palette differ per panel.
ridge_by_year <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(tphcm, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn toán (TPHCM) ") + scale_fill_viridis(option = "C")
p2 <- ggplot(tphcm, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn lí (TPHCM)") + scale_fill_viridis(option = "D")
p3 <- ggplot(tphcm, aes(x = Chemistry, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn hoá (TPHCM)") + scale_fill_viridis(option = "E")
# Palette "F" did not exist in the viridisLite version this was run with: it
# warned and fell back to the default, duplicating the physics palette.
# "B" (inferno) is valid and keeps the four panels distinct.
p4 <- ggplot(tphcm, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  ridge_by_year + xlab("Môn ngoại ngữ (TPHCM)") + scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)
## Picking joint bandwidth of 0.114
## Picking joint bandwidth of 0.143
## Picking joint bandwidth of 0.14
## Picking joint bandwidth of 0.181

Distribution of scores for 2019 by province

# Math score distribution per province (2019), drawn as ridge lines.
# The y-axis title is blanked, as in the other per-province ridge plots of this
# script: the province names already label the rows.
p <- ggplot(data = vn19, aes(x = Math, y = Province, col = Province, fill = ..x..))
p <- p + geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01)
p + xlab("Môn toán") + ylab("") + theme(legend.position = "none") + scale_fill_viridis()
## Picking joint bandwidth of 0.223

# Score distribution per province (2019) for the remaining subjects, drawn as
# ridge lines. The layers below are common to all six plots; each plot only
# changes the mapped score column and the x-axis label.
province_ridges <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none"),
  scale_fill_viridis()
)

# Physics
ggplot(vn19, aes(x = Physics, y = Province, col = Province, fill = ..x..)) +
  province_ridges + xlab("Môn vật lí")
## Picking joint bandwidth of 0.27

# Chemistry
ggplot(vn19, aes(x = Chemistry, y = Province, col = Province, fill = ..x..)) +
  province_ridges + xlab("Môn hoá học")
## Picking joint bandwidth of 0.269

# Biology
ggplot(vn19, aes(x = Biology, y = Province, col = Province, fill = ..x..)) +
  province_ridges + xlab("Môn sinh")
## Picking joint bandwidth of 0.197

# Foreign language
ggplot(vn19, aes(x = English, y = Province, col = Province, fill = ..x..)) +
  province_ridges + xlab("Môn ngoại ngữ")
## Picking joint bandwidth of 0.201

# History
ggplot(vn19, aes(x = History, y = Province, col = Province, fill = ..x..)) +
  province_ridges + xlab("Môn sử")
## Picking joint bandwidth of 0.204

# Geography
ggplot(vn19, aes(x = Geography, y = Province, col = Province, fill = ..x..)) +
  province_ridges + xlab("Môn địa lí")
## Picking joint bandwidth of 0.178

Phân bố tích lũy 2018 và 2019

# Math, physics, chemistry, biology: empirical cumulative distribution of the
# scores, one curve per year.
ecdf_layers <- list(
  stat_ecdf(),
  ylab("Xác suất"),
  theme(legend.position = "top")
)

p1 <- ggplot(vn, aes(x = Math, col = Year)) + ecdf_layers + xlab("Môn toán")
p2 <- ggplot(vn, aes(x = Physics, col = Year)) + ecdf_layers + xlab("Môn lí")
p3 <- ggplot(vn, aes(x = Chemistry, col = Year)) + ecdf_layers + xlab("Môn hoá")
p4 <- ggplot(vn, aes(x = Biology, col = Year)) + ecdf_layers + xlab("Môn sinh")

grid.arrange(p1, p2, p3, p4, ncol = 2)
## Warning: Removed 7912 rows containing non-finite values (stat_ecdf).
## Warning: Removed 977737 rows containing non-finite values (stat_ecdf).
## Warning: Removed 970296 rows containing non-finite values (stat_ecdf).
## Warning: Removed 982621 rows containing non-finite values (stat_ecdf).

# Literature, foreign language, history, geography: empirical cumulative
# distribution of the scores, one curve per year.
ecdf_layers <- list(
  stat_ecdf(),
  ylab("Xác suất"),
  theme(legend.position = "top")
)

p1 <- ggplot(vn, aes(x = Viet, col = Year)) + ecdf_layers + xlab("Môn văn")
p2 <- ggplot(vn, aes(x = English, col = Year)) + ecdf_layers + xlab("Môn ngoại ngữ")
p3 <- ggplot(vn, aes(x = History, col = Year)) + ecdf_layers + xlab("Môn sử")
p4 <- ggplot(vn, aes(x = Geography, col = Year)) + ecdf_layers + xlab("Môn địa lí")

grid.arrange(p1, p2, p3, p4, ncol = 2)
## Warning: Removed 30775 rows containing non-finite values (stat_ecdf).
## Warning: Removed 177288 rows containing non-finite values (stat_ecdf).
## Warning: Removed 612001 rows containing non-finite values (stat_ecdf).
## Warning: Removed 632588 rows containing non-finite values (stat_ecdf).

Comparison of results between 2018 and 2019

# Descriptive statistics of every subject, one column per year and no pooled
# "Overall" column. FALSE is spelled out because `F` can be reassigned.
table1(
  ~ Math + Physics + Chemistry + Biology + Viet + English + History + Geography | Year,
  overall = FALSE,
  data = vn
)
2018
(n=754783)
2019
(n=882657)
Math
Mean (SD) 4.86 (1.45) 5.64 (1.74)
Median [Min, Max] 5.00 [0.00, 10.0] 5.80 [0.00, 10.0]
Missing 3507 (0.5%) 4405 (0.5%)
Physics
Mean (SD) 4.96 (1.53) 5.57 (1.59)
Median [Min, Max] 5.00 [0.00, 10.0] 5.75 [0.00, 10.0]
Missing 430091 (57.0%) 547646 (62.0%)
Chemistry
Mean (SD) 4.86 (1.52) 5.35 (1.57)
Median [Min, Max] 4.75 [0.00, 10.0] 5.50 [0.00, 10.0]
Missing 426435 (56.5%) 543861 (61.6%)
Biology
Mean (SD) 4.56 (1.24) 4.68 (1.24)
Median [Min, Max] 4.50 [0.00, 10.0] 4.50 [0.00, 10.0]
Missing 433847 (57.5%) 548774 (62.2%)
Viet
Mean (SD) 5.43 (1.45) 5.48 (1.34)
Median [Min, Max] 5.50 [0.00, 9.75] 5.50 [0.00, 9.50]
Missing 16243 (2.2%) 14532 (1.6%)
English
Mean (SD) 3.92 (1.57) 4.36 (1.81)
Median [Min, Max] 3.60 [0.00, 10.0] 4.00 [0.00, 10.0]
Missing 87059 (11.5%) 90229 (10.2%)
History
Mean (SD) 3.80 (1.24) 4.30 (1.44)
Median [Min, Max] 3.75 [0.00, 10.0] 4.00 [0.00, 10.0]
Missing 299415 (39.7%) 312586 (35.4%)
Geography
Mean (SD) 5.47 (1.27) 6.00 (1.23)
Median [Min, Max] 5.50 [0.00, 10.0] 6.00 [0.00, 10.0]
Missing 312718 (41.4%) 319870 (36.2%)

Thay đổi điểm trung bình 2018 và 2019 cho mỗi tỉnh

library(dplyr)
# Per-province subject means for each year. Both pipelines are kept commented
# out, as in the original script; nothing below uses t19 or t18.
#
# t19 = vn19 %>%
#   select(Province, Math, Physics, Chemistry, Biology, Viet, English, History, Geography) %>%
#   group_by(Province) %>%
#   summarise(Math19=mean(na.omit(Math)), Physics19=mean(na.omit(Physics)),
#             Chemistry19=mean(na.omit(Chemistry)), Biology19=mean(na.omit(Biology)),
#             Viet19=mean(na.omit(Viet)), English19=mean(na.omit(English)),
#             History19=mean(na.omit(History)), Geography19=mean(na.omit(Geography)))
#
# t18 = vn18 %>%
#   select(Province, Math, Physics, Chemistry, Biology, Viet, English, History, Geography) %>%
#   group_by(Province) %>%
#   summarise(Math18=mean(na.omit(Math)), Physics18=mean(na.omit(Physics)),
#             Chemistry18=mean(na.omit(Chemistry)), Biology18=mean(na.omit(Biology)),
#             Viet18=mean(na.omit(Viet)), English18=mean(na.omit(English)),
#             History18=mean(na.omit(History)), Geography18=mean(na.omit(Geography)))

# The plots below read their values from this pre-computed file instead.
# NOTE(review): presumably one row per province with the 2019-minus-2018 change
# in mean score per subject, as the axis labels of those plots state - confirm
# against the CSV.
# NOTE(review): `diff` shadows base::diff(); the name is kept because the
# plotting code that follows refers to it.
diff <- read.csv(file = "~/Dropbox/Bao chi va Khoa hoc/Pho diem 2019/Mean score 2018-2019.csv")

# Math, physics, chemistry, biology: per-province values from `diff`, one
# horizontal bar chart per subject with provinces ordered by the plotted value.
change_layers <- list(
  geom_bar(stat = "identity"),
  coord_flip(),
  xlab(""),
  theme(legend.position = "none", axis.text = element_text(size = 7))
)

p1 <- ggplot(diff, aes(x = reorder(Province, -Math), y = Math, fill = Province)) +
  change_layers +
  ylab("Thay đổi (2019-2018) điểm trung bình môn toán")

p2 <- ggplot(diff, aes(x = reorder(Province, -Physics), y = Physics, fill = Province)) +
  change_layers +
  ylab("Thay đổi (2019-2018) điểm trung bình môn lí")

p3 <- ggplot(diff, aes(x = reorder(Province, -Chemistry), y = Chemistry, fill = Province)) +
  change_layers +
  ylab("Thay đổi (2019-2018) điểm trung bình môn hoá")

p4 <- ggplot(diff, aes(x = reorder(Province, -Biology), y = Biology, fill = Province)) +
  change_layers +
  ylab("Thay đổi (2019-2018) điểm trung bình môn sinh")

grid.arrange(p1, p2, p3, p4, ncol = 2)

# Literature, foreign language, history, geography: per-province values from
# `diff`, one horizontal bar chart per subject with provinces ordered by the
# plotted value.
change_layers <- list(
  geom_bar(stat = "identity"),
  coord_flip(),
  xlab(""),
  theme(legend.position = "none", axis.text = element_text(size = 7))
)

p1 <- ggplot(diff, aes(x = reorder(Province, -Viet), y = Viet, fill = Province)) +
  change_layers +
  ylab("Thay đổi (2019-2018) điểm trung bình môn văn")

p2 <- ggplot(diff, aes(x = reorder(Province, -English), y = English, fill = Province)) +
  change_layers +
  ylab("Thay đổi (2019-2018) điểm trung bình môn ngoại ngữ")

p3 <- ggplot(diff, aes(x = reorder(Province, -History), y = History, fill = Province)) +
  change_layers +
  ylab("Thay đổi (2019-2018) điểm trung bình môn sử")

p4 <- ggplot(diff, aes(x = reorder(Province, -Geography), y = Geography, fill = Province)) +
  change_layers +
  ylab("Thay đổi (2019-2018) điểm trung bình môn địa lí")

grid.arrange(p1, p2, p3, p4, ncol = 2)