Đọc dữ liệu 2019

library(ggplot2); library(table1); library(dplyr)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggridges); library(gridExtra); library(viridis)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

## Loading required package: viridisLite

# ---- Load the 2019 national exam scores and label each row's province ----
vn <- read.csv(
  "~/Dropbox/Bao chi va Khoa hoc/Pho diem 2019/THPT 2019 Quoc gia.csv",
  header = TRUE
)

# Spell out the foreign-language paper behind each code (N1..N6). Only rows
# whose Code matches one of these entries are written to.
language_by_code <- c(
  N1 = "English", N2 = "Russian", N3 = "French",
  N4 = "Chinese", N5 = "German", N6 = "Japanese"
)
for (language_code in names(language_by_code)) {
  vn$Language[vn$Code == language_code] <- language_by_code[[language_code]]
}

# Province name for each leading province code of the candidate number.
# NOTE(review): the previous table labelled code 8 "Tuyen Quang" and code 9
# "Lang Son", so Lao Cai never appeared and Lang Son was assigned twice
# (9 and 10). Corrected to 8 = Lao Cai, 9 = Tuyen Quang, 10 = Lang Son;
# confirm against the official exam-council code list. Code 20 has no entry.
province_by_code <- c(
  "1" = "Ha Noi", "2" = "TPHCM", "3" = "Hai Phong", "4" = "Da Nang",
  "5" = "Ha Giang", "6" = "Cao Bang", "7" = "Lai Chau", "8" = "Lao Cai",
  "9" = "Tuyen Quang", "10" = "Lang Son", "11" = "Bac Kan",
  "12" = "Thai Nguyen", "13" = "Yen Bai", "14" = "Son La",
  "15" = "Phu Tho", "16" = "Vinh Phuc", "17" = "Quang Ninh",
  "18" = "Bac Giang", "19" = "Bac Ninh", "21" = "Hai Duong",
  "22" = "Hung Yen", "23" = "Hoa Binh", "24" = "Ha Nam",
  "25" = "Nam Dinh", "26" = "Thai Binh", "27" = "Ninh Binh",
  "28" = "Thanh Hoa", "29" = "Nghe An", "30" = "Ha Tinh",
  "31" = "Quang Binh", "32" = "Quang Tri", "33" = "Hue-TT",
  "34" = "Quang Nam", "35" = "Quang Ngai", "36" = "Kon Tum",
  "37" = "Binh Dinh", "38" = "Gia Lai", "39" = "Phu Yen",
  "40" = "Dak Lak", "41" = "Khanh Hoa", "42" = "Lam Dong",
  "43" = "Binh Phuoc", "44" = "Binh Duong", "45" = "Ninh Thuan",
  "46" = "Tay Ninh", "47" = "Binh Thuan", "48" = "Dong Nai",
  "49" = "Long An", "50" = "Dong Thap", "51" = "An Giang",
  "52" = "BR-VT", "53" = "Tien Giang", "54" = "Kien Giang",
  "55" = "Can Tho", "56" = "Ben Tre", "57" = "Vinh Long",
  "58" = "Tra Vinh", "59" = "Soc Trang", "60" = "Bac Lieu",
  "61" = "Ca Mau", "62" = "Dien Bien", "63" = "Dak Nong",
  "64" = "Hau Giang", "65" = "Quoc Phong"
)

# Add a Province column looked up from the candidate number (SoBD).
#
# scores: data frame with a numeric SoBD column.
# lookup: named character vector, province code -> province name.
# Returns `scores` without rows whose SoBD is missing, with Province added
# (NA when the code has no entry), one-digit codes listed before two-digit
# ones and the original row order kept within each group.
add_province <- function(scores, lookup) {
  scores <- scores[!is.na(scores$SoBD), , drop = FALSE]
  # The province code is whatever precedes the last six digits. Integer
  # division avoids string slicing and also keeps SoBD == 10000000, which the
  # former `< 10000000` / `> 10000000` split silently dropped.
  code <- scores$SoBD %/% 1000000
  scores$Province <- unname(lookup[as.character(code)])
  scores[order(code >= 10), , drop = FALSE]
}

vn <- add_province(vn, province_by_code)
vn <- vn[, c("ID", "SoBD", "Province", "Math", "Viet", "ForeignLanguage",
             "Physics", "Chemistry", "Biology", "History", "Geography")]
vn$Year <- 2019
# Every foreign-language score is stored under English so the column name
# matches the 2018 table.
vn$English <- vn$ForeignLanguage

vn19 <- vn[, c("ID", "SoBD", "Province", "Math", "Viet", "English",
               "Physics", "Chemistry", "Biology", "History", "Geography",
               "Year")]

Đọc dữ liệu 2018

# ---- Load the 2018 national exam scores and label each row's province ----
vn <- read.csv(
  "~/Dropbox/Bao chi va Khoa hoc/Pho diem 2018/THPT 2018 Quoc gia.csv",
  na.strings = ""
)

# Province name for each leading province code of the candidate number.
# NOTE(review): the previous table labelled code 8 "Tuyen Quang" and code 9
# "Lang Son", so Lao Cai never appeared and Lang Son was assigned twice
# (9 and 10). Corrected to 8 = Lao Cai, 9 = Tuyen Quang, 10 = Lang Son;
# confirm against the official exam-council code list. Code 20 has no entry.
province_by_code <- c(
  "1" = "Ha Noi", "2" = "TPHCM", "3" = "Hai Phong", "4" = "Da Nang",
  "5" = "Ha Giang", "6" = "Cao Bang", "7" = "Lai Chau", "8" = "Lao Cai",
  "9" = "Tuyen Quang", "10" = "Lang Son", "11" = "Bac Kan",
  "12" = "Thai Nguyen", "13" = "Yen Bai", "14" = "Son La",
  "15" = "Phu Tho", "16" = "Vinh Phuc", "17" = "Quang Ninh",
  "18" = "Bac Giang", "19" = "Bac Ninh", "21" = "Hai Duong",
  "22" = "Hung Yen", "23" = "Hoa Binh", "24" = "Ha Nam",
  "25" = "Nam Dinh", "26" = "Thai Binh", "27" = "Ninh Binh",
  "28" = "Thanh Hoa", "29" = "Nghe An", "30" = "Ha Tinh",
  "31" = "Quang Binh", "32" = "Quang Tri", "33" = "Hue-TT",
  "34" = "Quang Nam", "35" = "Quang Ngai", "36" = "Kon Tum",
  "37" = "Binh Dinh", "38" = "Gia Lai", "39" = "Phu Yen",
  "40" = "Dak Lak", "41" = "Khanh Hoa", "42" = "Lam Dong",
  "43" = "Binh Phuoc", "44" = "Binh Duong", "45" = "Ninh Thuan",
  "46" = "Tay Ninh", "47" = "Binh Thuan", "48" = "Dong Nai",
  "49" = "Long An", "50" = "Dong Thap", "51" = "An Giang",
  "52" = "BR-VT", "53" = "Tien Giang", "54" = "Kien Giang",
  "55" = "Can Tho", "56" = "Ben Tre", "57" = "Vinh Long",
  "58" = "Tra Vinh", "59" = "Soc Trang", "60" = "Bac Lieu",
  "61" = "Ca Mau", "62" = "Dien Bien", "63" = "Dak Nong",
  "64" = "Hau Giang", "65" = "Quoc Phong"
)

# Add a Province column looked up from the candidate number (SoBD).
#
# scores: data frame with a numeric SoBD column.
# lookup: named character vector, province code -> province name.
# Returns `scores` without rows whose SoBD is missing, with Province added
# (NA when the code has no entry), one-digit codes listed before two-digit
# ones and the original row order kept within each group.
add_province <- function(scores, lookup) {
  scores <- scores[!is.na(scores$SoBD), , drop = FALSE]
  # The province code is whatever precedes the last six digits. Integer
  # division avoids string slicing and also keeps SoBD == 10000000, which the
  # former `< 10000000` / `> 10000000` split silently dropped.
  code <- scores$SoBD %/% 1000000
  scores$Province <- unname(lookup[as.character(code)])
  scores[order(code >= 10), , drop = FALSE]
}

score_columns <- c("ID", "SoBD", "Province", "Math", "Viet", "English",
                   "Physics", "Chemistry", "Biology", "History", "Geography")
national18 <- add_province(vn, province_by_code)[, score_columns]

# Son La's 2018 scores come from their own file; the row ID stands in for the
# candidate number there.
# NOTE(review): if the national file also holds code-14 rows, Son La would be
# counted twice after the rbind below — verify.
sonla <- read.csv(
  "~/Dropbox/Bao chi va Khoa hoc/Pho diem 2018/Son La.csv",
  na.strings = ""
)
sonla$SoBD <- sonla$ID
sonla$Province <- "Son La"

vn18 <- rbind(national18, sonla[, score_columns])
vn18$Year <- 2018

Hợp nhất 2 dữ liệu

# Stack the 2019 and 2018 tables and turn Year into a factor so that the
# plots below can group and colour by it; then preview the first rows.
vn <- transform(rbind(vn19, vn18), Year = as.factor(Year))
head(vn)

##   ID    SoBD Province Math Viet English Physics Chemistry Biology History
## 1  1 1000029   Ha Noi  7.2 5.00     6.8       6       4.5    4.00      NA
## 2  2 1000030   Ha Noi  6.2 6.25     8.0       5       4.5    3.75      NA
## 3  3 1000031   Ha Noi  6.8 5.75     7.2      NA        NA      NA    4.25
## 4  4 1000032   Ha Noi  5.8 4.50     9.0      NA        NA      NA    7.00
## 5  5 1000033   Ha Noi  7.0 5.50     3.6      NA        NA      NA    5.25
## 6  6 1000034   Ha Noi  7.0 6.00     7.2      NA        NA      NA    3.75
##   Geography Year
## 1        NA 2019
## 2        NA 2019
## 3       5.5 2019
## 4       6.5 2019
## 5       7.5 2019
## 6       6.5 2019

Điểm trung bình năm 2019 cho từng tỉnh

# Math: mean 2019 score per province, drawn as horizontal bars sorted by mean.
# The formula interface drops rows with a missing score or province.
means <- aggregate(Math ~ Province, data = vn19, FUN = mean)
names(means) <- c("Province", "Mean")
means$Province <- reorder(means$Province, means$Mean)

ggplot(means, aes(x = Province, y = Mean, fill = Province)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  # geom_text has no `ymax` aesthetic and `size` used to be passed twice
  # (3, then 1); both triggered warnings. Only size = 3 is kept.
  geom_text(aes(label = round(Mean, 2)), size = 3, vjust = 0.5, hjust = 1,
            color = "black") +
  theme(legend.position = "none") +
  xlab("") +
  ylab("Điểm môn toán ") +
  coord_flip()

# Physics: mean 2019 score per province, drawn as horizontal bars sorted by
# mean. The formula interface drops rows with a missing score or province.
means <- aggregate(Physics ~ Province, data = vn19, FUN = mean)
names(means) <- c("Province", "Mean")
means$Province <- reorder(means$Province, means$Mean)

ggplot(means, aes(x = Province, y = Mean, fill = Province)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  # geom_text has no `ymax` aesthetic and `size` used to be passed twice
  # (3, then 1); both triggered warnings. Only size = 3 is kept.
  geom_text(aes(label = round(Mean, 2)), size = 3, vjust = 0.5, hjust = 1,
            color = "black") +
  theme(legend.position = "none") +
  xlab("") +
  ylab("Điểm môn vật lí ") +
  coord_flip()

# Chemistry: mean 2019 score per province, drawn as horizontal bars sorted by
# mean. The formula interface drops rows with a missing score or province.
means <- aggregate(Chemistry ~ Province, data = vn19, FUN = mean)
names(means) <- c("Province", "Mean")
means$Province <- reorder(means$Province, means$Mean)

ggplot(means, aes(x = Province, y = Mean, fill = Province)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  # geom_text has no `ymax` aesthetic and `size` used to be passed twice
  # (3, then 1); both triggered warnings. Only size = 3 is kept.
  geom_text(aes(label = round(Mean, 2)), size = 3, vjust = 0.5, hjust = 1,
            color = "black") +
  theme(legend.position = "none") +
  xlab("") +
  ylab("Điểm môn hóa ") +
  coord_flip()

# Biology: mean 2019 score per province, drawn as horizontal bars sorted by
# mean. The formula interface drops rows with a missing score or province.
means <- aggregate(Biology ~ Province, data = vn19, FUN = mean)
names(means) <- c("Province", "Mean")
means$Province <- reorder(means$Province, means$Mean)

ggplot(means, aes(x = Province, y = Mean, fill = Province)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  # geom_text has no `ymax` aesthetic and `size` used to be passed twice
  # (3, then 1); both triggered warnings. Only size = 3 is kept.
  geom_text(aes(label = round(Mean, 2)), size = 3, vjust = 0.5, hjust = 1,
            color = "black") +
  theme(legend.position = "none") +
  xlab("") +
  ylab("Điểm môn sinh học") +
  coord_flip()

# Foreign language: mean 2019 score per province, drawn as horizontal bars
# sorted by mean. The formula interface drops rows with a missing score or
# province.
means <- aggregate(English ~ Province, data = vn19, FUN = mean)
names(means) <- c("Province", "Mean")
means$Province <- reorder(means$Province, means$Mean)

ggplot(means, aes(x = Province, y = Mean, fill = Province)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  # geom_text has no `ymax` aesthetic and `size` used to be passed twice
  # (3, then 1); both triggered warnings. Only size = 3 is kept.
  geom_text(aes(label = round(Mean, 2)), size = 3, vjust = 0.5, hjust = 1,
            color = "black") +
  theme(legend.position = "none") +
  xlab("") +
  ylab("Điểm môn ngoại ngữ") +
  coord_flip()

# History: mean 2019 score per province, drawn as horizontal bars sorted by
# mean. The formula interface drops rows with a missing score or province.
means <- aggregate(History ~ Province, data = vn19, FUN = mean)
names(means) <- c("Province", "Mean")
means$Province <- reorder(means$Province, means$Mean)

ggplot(means, aes(x = Province, y = Mean, fill = Province)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  # geom_text has no `ymax` aesthetic and `size` used to be passed twice
  # (3, then 1); both triggered warnings. Only size = 3 is kept.
  geom_text(aes(label = round(Mean, 2)), size = 3, vjust = 0.5, hjust = 1,
            color = "black") +
  theme(legend.position = "none") +
  xlab("") +
  ylab("Điểm môn sử") +
  coord_flip()

# Geography: mean 2019 score per province, drawn as horizontal bars sorted by
# mean. The formula interface drops rows with a missing score or province.
means <- aggregate(Geography ~ Province, data = vn19, FUN = mean)
names(means) <- c("Province", "Mean")
means$Province <- reorder(means$Province, means$Mean)

ggplot(means, aes(x = Province, y = Mean, fill = Province)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  # geom_text has no `ymax` aesthetic and `size` used to be passed twice
  # (3, then 1); both triggered warnings. Only size = 3 is kept.
  geom_text(aes(label = round(Mean, 2)), size = 3, vjust = 0.5, hjust = 1,
            color = "black") +
  theme(legend.position = "none") +
  xlab("") +
  ylab("Điểm môn địa lí") +
  coord_flip()

So sánh phân bố 2018 và 2019

# Math, physics, chemistry, biology: overlaid score densities for the two
# years, arranged in a 2 x 2 grid.
# Pieces shared by all four panels.
density_layers <- list(
  geom_density(alpha = 0.1),
  ylab("Xác suất"),
  theme(legend.position = "top")
)

p1 <- ggplot(vn, aes(x = Math, fill = Year, col = Year)) +
  density_layers + xlab("Môn toán")

p2 <- ggplot(vn, aes(x = Physics, fill = Year, col = Year)) +
  density_layers + xlab("Môn lí")

p3 <- ggplot(vn, aes(x = Chemistry, fill = Year, col = Year)) +
  density_layers + xlab("Môn hoá")

p4 <- ggplot(vn, aes(x = Biology, fill = Year, col = Year)) +
  density_layers + xlab("Môn sinh")

grid.arrange(p1, p2, p3, p4, ncol = 2)

## Warning: Removed 7912 rows containing non-finite values (stat_density).

## Warning: Removed 977737 rows containing non-finite values (stat_density).

## Warning: Removed 970296 rows containing non-finite values (stat_density).

## Warning: Removed 982621 rows containing non-finite values (stat_density).

# Literature, foreign language, history, geography: overlaid score densities
# for the two years, arranged in a 2 x 2 grid.
# Pieces shared by all four panels.
density_layers <- list(
  geom_density(alpha = 0.1),
  ylab("Xác suất"),
  theme(legend.position = "top")
)

p1 <- ggplot(vn, aes(x = Viet, fill = Year, col = Year)) +
  density_layers + xlab("Môn văn")

p2 <- ggplot(vn, aes(x = English, fill = Year, col = Year)) +
  density_layers + xlab("Môn ngoại ngữ")

p3 <- ggplot(vn, aes(x = History, fill = Year, col = Year)) +
  density_layers + xlab("Môn sử")

p4 <- ggplot(vn, aes(x = Geography, fill = Year, col = Year)) +
  density_layers + xlab("Môn địa")

grid.arrange(p1, p2, p3, p4, ncol = 2)

## Warning: Removed 30775 rows containing non-finite values (stat_density).

## Warning: Removed 177288 rows containing non-finite values (stat_density).

## Warning: Removed 612001 rows containing non-finite values (stat_density).

## Warning: Removed 632588 rows containing non-finite values (stat_density).

So sánh 2019 và 2018 cho top 10 và bottom 10

# Rows (both years) for the two hand-picked groups of ten provinces.
top10_provinces <- c("Nam Dinh", "TPHCM", "Ha Nam", "Binh Duong", "Thai Binh",
                     "BR-VT", "Hai Phong", "Bac Ninh", "Ninh Binh", "Ha Noi")
bot10_provinces <- c("Son La", "Ha Giang", "Hoa Binh", "Cao Bang", "Bac Kan",
                     "Lang Son", "Yen Bai", "Dien Bien", "Lai Chau",
                     "Quang Ninh")

top10 <- vn[vn$Province %in% top10_provinces, ]

bot10 <- vn[vn$Province %in% bot10_provinces, ]

# Math: ridge plot of the score distribution per province, top group on the
# left and bottom group on the right.
ridge_layers <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none"),
  scale_fill_viridis()
)

p1 <- ggplot(top10, aes(x = Math, y = Province, col = Province,
                        fill = ..x..)) +
  ridge_layers + xlab("Môn toán")

p2 <- ggplot(bot10, aes(x = Math, y = Province, col = Province,
                        fill = ..x..)) +
  ridge_layers + xlab("Môn toán")

grid.arrange(p1, p2, ncol = 2)

## Picking joint bandwidth of 0.179

## Picking joint bandwidth of 0.198

# Physics: ridge plot of the score distribution per province, top group on
# the left and bottom group on the right.
ridge_layers <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none"),
  scale_fill_viridis()
)

p1 <- ggplot(top10, aes(x = Physics, y = Province, col = Province,
                        fill = ..x..)) +
  ridge_layers + xlab("Môn lí")

p2 <- ggplot(bot10, aes(x = Physics, y = Province, col = Province,
                        fill = ..x..)) +
  ridge_layers + xlab("Môn lí")

grid.arrange(p1, p2, ncol = 2)

## Picking joint bandwidth of 0.201

## Picking joint bandwidth of 0.329

# Chemistry: ridge plot of the score distribution per province, top group on
# the left and bottom group on the right.
# Both panels used to map x = Physics (copied from the physics section), so
# the "Môn hoá" figure actually showed physics scores; they now plot Chemistry.
ridge_layers <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none"),
  scale_fill_viridis()
)

p1 <- ggplot(top10, aes(x = Chemistry, y = Province, col = Province,
                        fill = ..x..)) +
  ridge_layers + xlab("Môn hoá")

p2 <- ggplot(bot10, aes(x = Chemistry, y = Province, col = Province,
                        fill = ..x..)) +
  ridge_layers + xlab("Môn hoá")

grid.arrange(p1, p2, ncol = 2)

## Picking joint bandwidth of 0.201
## Picking joint bandwidth of 0.329

# Foreign language: ridge plot of the score distribution per province, top
# group on the left and bottom group on the right.
ridge_layers <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none"),
  scale_fill_viridis()
)

p1 <- ggplot(top10, aes(x = English, y = Province, col = Province,
                        fill = ..x..)) +
  ridge_layers + xlab("Môn ngoại ngữ")

p2 <- ggplot(bot10, aes(x = English, y = Province, col = Province,
                        fill = ..x..)) +
  ridge_layers + xlab("Môn ngoại ngữ")

grid.arrange(p1, p2, ncol = 2)

## Picking joint bandwidth of 0.191

## Picking joint bandwidth of 0.14

So sánh 2018 và 2019 vài tỉnh

# Ha Giang: 2018 vs 2019 score distributions for four subjects, one ridge
# panel per subject, each with its own viridis palette.
ha_giang <- subset(vn, Province == "Ha Giang")

# Pieces shared by all four panels.
year_ridges <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(ha_giang, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn toán (Hà Giang)") +
  scale_fill_viridis(option = "C")

p2 <- ggplot(ha_giang, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn lí (Hà Giang)") +
  scale_fill_viridis(option = "D")

p3 <- ggplot(ha_giang, aes(x = Chemistry, y = Year, col = Year,
                           fill = ..x..)) +
  year_ridges + xlab("Môn hoá (Hà Giang)") +
  scale_fill_viridis(option = "E")

# Option "F" was unknown to the viridisLite used for this report, which fell
# back to the default palette with a warning; "B" gives this panel a palette
# of its own.
p4 <- ggplot(ha_giang, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn ngoại ngữ (Hà Giang)") +
  scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)

## Picking joint bandwidth of 0.217

## Picking joint bandwidth of 0.435

## Picking joint bandwidth of 0.438

## Picking joint bandwidth of 0.13

# Son La: 2018 vs 2019 score distributions for four subjects, one ridge panel
# per subject, each with its own viridis palette.
son_la <- subset(vn, Province == "Son La")

# Pieces shared by all four panels.
year_ridges <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(son_la, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn toán (Sơn La) ") +
  scale_fill_viridis(option = "C")

p2 <- ggplot(son_la, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn lí (Sơn La)") +
  scale_fill_viridis(option = "D")

p3 <- ggplot(son_la, aes(x = Chemistry, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn hoá (Sơn La)") +
  scale_fill_viridis(option = "E")

# Option "F" was unknown to the viridisLite used for this report, which fell
# back to the default palette with a warning; "B" gives this panel a palette
# of its own.
p4 <- ggplot(son_la, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn ngoại ngữ (Sơn La)") +
  scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)

## Picking joint bandwidth of 0.169

## Picking joint bandwidth of 0.364

## Picking joint bandwidth of 0.355

## Picking joint bandwidth of 0.11

# Bac Kan: 2018 vs 2019 score distributions for four subjects, one ridge
# panel per subject, each with its own viridis palette.
bac_kan <- subset(vn, Province == "Bac Kan")

# Pieces shared by all four panels.
year_ridges <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(bac_kan, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn toán (Bắc Kạn) ") +
  scale_fill_viridis(option = "C")

p2 <- ggplot(bac_kan, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn lí (Bắc Kạn)") +
  scale_fill_viridis(option = "D")

p3 <- ggplot(bac_kan, aes(x = Chemistry, y = Year, col = Year,
                          fill = ..x..)) +
  year_ridges + xlab("Môn hoá (Bắc Kạn)") +
  scale_fill_viridis(option = "E")

# Option "F" was unknown to the viridisLite used for this report, which fell
# back to the default palette with a warning; "B" gives this panel a palette
# of its own.
p4 <- ggplot(bac_kan, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn ngoại ngữ (Bắc Kạn)") +
  scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)

## Picking joint bandwidth of 0.255

## Picking joint bandwidth of 0.416

## Picking joint bandwidth of 0.433

## Picking joint bandwidth of 0.197

# Hoa Binh: 2018 vs 2019 score distributions for four subjects, one ridge
# panel per subject, each with its own viridis palette.
hoa_binh <- subset(vn, Province == "Hoa Binh")

# Pieces shared by all four panels.
year_ridges <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(hoa_binh, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn toán (Hoà Bình) ") +
  scale_fill_viridis(option = "C")

p2 <- ggplot(hoa_binh, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn lí (Hoà Bình)") +
  scale_fill_viridis(option = "D")

p3 <- ggplot(hoa_binh, aes(x = Chemistry, y = Year, col = Year,
                           fill = ..x..)) +
  year_ridges + xlab("Môn hoá (Hoà Bình)") +
  scale_fill_viridis(option = "E")

# Option "F" was unknown to the viridisLite used for this report, which fell
# back to the default palette with a warning; "B" gives this panel a palette
# of its own.
p4 <- ggplot(hoa_binh, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn ngoại ngữ (Hoà Bình)") +
  scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)

## Picking joint bandwidth of 0.236

## Picking joint bandwidth of 0.353

## Picking joint bandwidth of 0.356

## Picking joint bandwidth of 0.123

# TPHCM: 2018 vs 2019 score distributions for four subjects, one ridge panel
# per subject, each with its own viridis palette.
tphcm <- subset(vn, Province == "TPHCM")

# Pieces shared by all four panels.
year_ridges <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none")
)

p1 <- ggplot(tphcm, aes(x = Math, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn toán (TPHCM) ") +
  scale_fill_viridis(option = "C")

p2 <- ggplot(tphcm, aes(x = Physics, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn lí (TPHCM)") +
  scale_fill_viridis(option = "D")

p3 <- ggplot(tphcm, aes(x = Chemistry, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn hoá (TPHCM)") +
  scale_fill_viridis(option = "E")

# Option "F" was unknown to the viridisLite used for this report, which fell
# back to the default palette with a warning; "B" gives this panel a palette
# of its own.
p4 <- ggplot(tphcm, aes(x = English, y = Year, col = Year, fill = ..x..)) +
  year_ridges + xlab("Môn ngoại ngữ (TPHCM)") +
  scale_fill_viridis(option = "B")

grid.arrange(p1, p2, p3, p4, ncol = 2)

## Picking joint bandwidth of 0.114

## Picking joint bandwidth of 0.143

## Picking joint bandwidth of 0.14

## Picking joint bandwidth of 0.181

Distribution of scores for 2019 by province

# One ridge plot per subject: 2019 score distribution for every province.
# The shared pieces blank the y-axis title for all seven figures; the math
# figure used to be the only one that still showed it.
ridge_layers <- list(
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01),
  ylab(""),
  theme(legend.position = "none"),
  scale_fill_viridis()
)

ggplot(vn19, aes(x = Math, y = Province, col = Province, fill = ..x..)) +
  ridge_layers + xlab("Môn toán")

## Picking joint bandwidth of 0.223

ggplot(vn19, aes(x = Physics, y = Province, col = Province, fill = ..x..)) +
  ridge_layers + xlab("Môn vật lí")

## Picking joint bandwidth of 0.27

ggplot(vn19, aes(x = Chemistry, y = Province, col = Province, fill = ..x..)) +
  ridge_layers + xlab("Môn hoá học")

## Picking joint bandwidth of 0.269

ggplot(vn19, aes(x = Biology, y = Province, col = Province, fill = ..x..)) +
  ridge_layers + xlab("Môn sinh")

## Picking joint bandwidth of 0.197

ggplot(vn19, aes(x = English, y = Province, col = Province, fill = ..x..)) +
  ridge_layers + xlab("Môn ngoại ngữ")

## Picking joint bandwidth of 0.201

ggplot(vn19, aes(x = History, y = Province, col = Province, fill = ..x..)) +
  ridge_layers + xlab("Môn sử")

## Picking joint bandwidth of 0.204

ggplot(vn19, aes(x = Geography, y = Province, col = Province, fill = ..x..)) +
  ridge_layers + xlab("Môn địa lí")

## Picking joint bandwidth of 0.178

Phân bố tích lũy 2018 và 2019

# Math, physics, chemistry, biology: empirical cumulative distribution of the
# scores, one curve per year, arranged in a 2 x 2 grid.
# Pieces shared by all four panels.
ecdf_layers <- list(
  stat_ecdf(),
  ylab("Xác suất"),
  theme(legend.position = "top")
)

p1 <- ggplot(vn, aes(x = Math, col = Year)) + ecdf_layers + xlab("Môn toán")

p2 <- ggplot(vn, aes(x = Physics, col = Year)) + ecdf_layers + xlab("Môn lí")

p3 <- ggplot(vn, aes(x = Chemistry, col = Year)) + ecdf_layers +
  xlab("Môn hoá")

p4 <- ggplot(vn, aes(x = Biology, col = Year)) + ecdf_layers +
  xlab("Môn sinh")

grid.arrange(p1, p2, p3, p4, ncol = 2)

## Warning: Removed 7912 rows containing non-finite values (stat_ecdf).

## Warning: Removed 977737 rows containing non-finite values (stat_ecdf).

## Warning: Removed 970296 rows containing non-finite values (stat_ecdf).

## Warning: Removed 982621 rows containing non-finite values (stat_ecdf).

# Literature, foreign language, history, geography: empirical cumulative
# distribution of the scores, one curve per year, arranged in a 2 x 2 grid.
# Pieces shared by all four panels.
ecdf_layers <- list(
  stat_ecdf(),
  ylab("Xác suất"),
  theme(legend.position = "top")
)

p1 <- ggplot(vn, aes(x = Viet, col = Year)) + ecdf_layers + xlab("Môn văn")

p2 <- ggplot(vn, aes(x = English, col = Year)) + ecdf_layers +
  xlab("Môn ngoại ngữ")

p3 <- ggplot(vn, aes(x = History, col = Year)) + ecdf_layers + xlab("Môn sử")

p4 <- ggplot(vn, aes(x = Geography, col = Year)) + ecdf_layers +
  xlab("Môn địa lí")

grid.arrange(p1, p2, p3, p4, ncol = 2)

## Warning: Removed 30775 rows containing non-finite values (stat_ecdf).

## Warning: Removed 177288 rows containing non-finite values (stat_ecdf).

## Warning: Removed 612001 rows containing non-finite values (stat_ecdf).

## Warning: Removed 632588 rows containing non-finite values (stat_ecdf).

Comparison of results between 2018 and 2019

# Descriptive statistics of every subject, one column per year and no pooled
# column. FALSE is spelled out because `F` is an ordinary variable that can
# be reassigned.
table1(
  ~ Math + Physics + Chemistry + Biology + Viet + English + History +
    Geography | Year,
  overall = FALSE,
  data = vn
)

	2018 (n=754783)	2019 (n=882657)
Math
Mean (SD)	4.86 (1.45)	5.64 (1.74)
Median [Min, Max]	5.00 [0.00, 10.0]	5.80 [0.00, 10.0]
Missing	3507 (0.5%)	4405 (0.5%)
Physics
Mean (SD)	4.96 (1.53)	5.57 (1.59)
Median [Min, Max]	5.00 [0.00, 10.0]	5.75 [0.00, 10.0]
Missing	430091 (57.0%)	547646 (62.0%)
Chemistry
Mean (SD)	4.86 (1.52)	5.35 (1.57)
Median [Min, Max]	4.75 [0.00, 10.0]	5.50 [0.00, 10.0]
Missing	426435 (56.5%)	543861 (61.6%)
Biology
Mean (SD)	4.56 (1.24)	4.68 (1.24)
Median [Min, Max]	4.50 [0.00, 10.0]	4.50 [0.00, 10.0]
Missing	433847 (57.5%)	548774 (62.2%)
Viet
Mean (SD)	5.43 (1.45)	5.48 (1.34)
Median [Min, Max]	5.50 [0.00, 9.75]	5.50 [0.00, 9.50]
Missing	16243 (2.2%)	14532 (1.6%)
English
Mean (SD)	3.92 (1.57)	4.36 (1.81)
Median [Min, Max]	3.60 [0.00, 10.0]	4.00 [0.00, 10.0]
Missing	87059 (11.5%)	90229 (10.2%)
History
Mean (SD)	3.80 (1.24)	4.30 (1.44)
Median [Min, Max]	3.75 [0.00, 10.0]	4.00 [0.00, 10.0]
Missing	299415 (39.7%)	312586 (35.4%)
Geography
Mean (SD)	5.47 (1.27)	6.00 (1.23)
Median [Min, Max]	5.50 [0.00, 10.0]	6.00 [0.00, 10.0]
Missing	312718 (41.4%)	319870 (36.2%)

Thay đổi điểm trung bình 2018 và 2019 cho mỗi tỉnh

library(dplyr)
# Per-province subject means for each year, kept for reference but not run.
# NOTE(review): presumably these produced the CSV read below — confirm.
# t19 = vn19 %>% select(Province, Math, Physics, Chemistry, Biology, Viet, English, History, Geography) %>% group_by(Province) %>% summarise(Math19=mean(na.omit(Math)), Physics19=mean(na.omit(Physics)), Chemistry19=mean(na.omit(Chemistry)), Biology19=mean(na.omit(Biology)), Viet19=mean(na.omit(Viet)), English19=mean(na.omit(English)), History19=mean(na.omit(History)), Geography19=mean(na.omit(Geography)))

# t18 = vn18 %>% select(Province, Math, Physics, Chemistry, Biology, Viet, English, History, Geography) %>% group_by(Province) %>% summarise(Math18=mean(na.omit(Math)), Physics18=mean(na.omit(Physics)),Chemistry18=mean(na.omit(Chemistry)), Biology18=mean(na.omit(Biology)), Viet18=mean(na.omit(Viet)), English18=mean(na.omit(English)), History18=mean(na.omit(History)), Geography18=mean(na.omit(Geography)))

# One row per province with a column per subject (as used by the plots
# below). The name `diff` hides base::diff as a variable; function calls to
# diff() are unaffected.
diff <- read.csv(
  "~/Dropbox/Bao chi va Khoa hoc/Pho diem 2019/Mean score 2018-2019.csv"
)

# Theme settings shared by all eight bar charts.
change_theme <- theme(legend.position = "none",
                      axis.text = element_text(size = 7))

# Math, physics, chemistry, biology

p1 <- ggplot(diff, aes(x = reorder(Province, -Math), y = Math,
                       fill = Province)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("") +
  ylab("Thay đổi (2019-2018) điểm trung bình môn toán") +
  change_theme

p2 <- ggplot(diff, aes(x = reorder(Province, -Physics), y = Physics,
                       fill = Province)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("") +
  ylab("Thay đổi (2019-2018) điểm trung bình môn lí") +
  change_theme

p3 <- ggplot(diff, aes(x = reorder(Province, -Chemistry), y = Chemistry,
                       fill = Province)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("") +
  ylab("Thay đổi (2019-2018) điểm trung bình môn hoá") +
  change_theme

p4 <- ggplot(diff, aes(x = reorder(Province, -Biology), y = Biology,
                       fill = Province)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("") +
  ylab("Thay đổi (2019-2018) điểm trung bình môn sinh") +
  change_theme

grid.arrange(p1, p2, p3, p4, ncol = 2)

# Literature, foreign language, history, geography

p1 <- ggplot(diff, aes(x = reorder(Province, -Viet), y = Viet,
                       fill = Province)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("") +
  ylab("Thay đổi (2019-2018) điểm trung bình môn văn") +
  change_theme

p2 <- ggplot(diff, aes(x = reorder(Province, -English), y = English,
                       fill = Province)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("") +
  ylab("Thay đổi (2019-2018) điểm trung bình môn ngoại ngữ") +
  change_theme

p3 <- ggplot(diff, aes(x = reorder(Province, -History), y = History,
                       fill = Province)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("") +
  ylab("Thay đổi (2019-2018) điểm trung bình môn sử") +
  change_theme

p4 <- ggplot(diff, aes(x = reorder(Province, -Geography), y = Geography,
                       fill = Province)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  xlab("") +
  ylab("Thay đổi (2019-2018) điểm trung bình môn địa lí") +
  change_theme

grid.arrange(p1, p2, p3, p4, ncol = 2)

Analysis of HSC 2019 data

TVN

14/07/2019

Đọc dữ liệu 2019

Đọc dữ liệu 2018

Hợp nhất 2 dữ liệu

Điểm trung bình năm 2019 cho từng tỉnh

So sánh phân bố 2018 và 2019

So sánh 2019 và 2018 cho top 10 và bottom 10

So sánh 2018 và 2019 vài tỉnh

Distribution of scores for 2019 by province

Phân bố tích lũy 2018 và 2019

Comparison of results between 2018 and 2019

Thay đổi điểm trung bình 2018 và 2019 cho mỗi tỉnh