Loading required packages
library(dplyr)
library(tidyverse)
library(stringr)
library(stringi)
library(rvest)
library(readxl)
library(ggplot2)
Loading THPT 2018 Quoc gia.csv into R under the name raw_data
raw_data <- read.table( file = "D:/2020/R course/data_day12/THPT_2018.csv", fill= TRUE, header = TRUE, sep = ",")
raw_data %>%
mutate(SoBD = as.character(SoBD)) -> raw_data
raw_data %>%
mutate(soluongkt = str_count(SoBD)) -> raw_data_1
See the dataset with the new column
head(raw_data_1)
## ID SoBD Math Viet English Physics Chemistry Biology History Geography
## 1 1 18010226 3.0 3.75 3.0 NA NA NA 3.0 6.50
## 2 2 18010229 8.8 7.50 9.0 NA NA NA 6.0 9.00
## 3 3 18010232 6.0 5.50 4.0 5.75 5.5 5.00 NA NA
## 4 4 18010242 3.4 5.75 2.6 NA NA NA 3.5 4.75
## 5 5 18010247 3.8 6.75 3.0 NA NA NA 3.5 6.25
## 6 6 18010252 5.0 6.50 2.2 2.00 3.5 4.25 NA NA
## GDCD KhoiA KhoiB KhoiC KhoiD KhoiA1 soluongkt
## 1 8.25 NA NA 13.25 9.75 NA 8
## 2 8.25 NA NA 22.50 25.30 NA 8
## 3 NA 17.25 16.50 NA 15.50 15.75 8
## 4 7.25 NA NA 14.00 11.75 NA 8
## 5 8.00 NA NA 16.50 13.55 NA 8
## 6 NA 10.50 12.75 NA 13.70 9.20 8
raw_data_1 %>%
filter(soluongkt == 8) %>%
count()
## n
## 1 592039
raw_data_1 %>%
mutate(SoBD_new = case_when(soluongkt == 7 ~ paste0("0", SoBD),
TRUE ~ SoBD)) -> raw_data
province_code <- "https://thuvienphapluat.vn/cong-van/Giao-duc/Cong-van-417-BGDDT-KTKDCLGD-huong-dan-thuc-hien-Quy-che-thi-trung-hoc-pho-thong-quoc-gia-2017-339311.aspx"
code_path <- '//*[@id="divContentDoc"]/div/div/div/table[7]'
df_code <- province_code %>%
read_html() %>%
html_nodes(xpath = code_path) %>%
html_table %>%
.[[1]]
head(df_code)
## X1 X2
## 1 Mã\r\n s<U+1EDF> Tên\r\n s<U+1EDF>
## 2 01 S<U+1EDF> GDÐT Hà N<U+1ED9>i
## 3 02 S<U+1EDF> GDÐT TP. H<U+1ED3> Chí Minh
## 4 03 S<U+1EDF> GDÐT H<U+1EA3>i Phòng
## 5 04 S<U+1EDF> GDÐT Ðà N<U+1EB5>ng
## 6 05 S<U+1EDF> GDÐT Hà Giang
## X3
## 1 Mã\r\n c<U+1EE5>m (H<U+1ED9>i d<U+1ED3>ng) thi
## 2 01
## 3 02
## 4 03
## 5 04
## 6 05
## X4
## 1 Tên\r\n H<U+1ED9>i d<U+1ED3>ng thi
## 2 S<U+1EDF> GDÐT Hà N<U+1ED9>i
## 3 S<U+1EDF> GDÐT TP. H<U+1ED3> Chí Minh
## 4 S<U+1EDF> GDÐT H<U+1EA3>i Phòng
## 5 S<U+1EDF> GDÐT Ðà N<U+1EB5>ng
## 6 S<U+1EDF> GDÐT Hà Giang
df_code %>%
mutate_all(function(x) {stri_trans_general(x, "Latin-ASCII")}) %>%
slice(-1) -> df_code_latin
head(df_code_latin)
## X1 X2 X3 X4
## 1 01 So GDDT Ha Noi 01 So GDDT Ha Noi
## 2 02 So GDDT TP. Ho Chi Minh 02 So GDDT TP. Ho Chi Minh
## 3 03 So GDDT Hai Phong 03 So GDDT Hai Phong
## 4 04 So GDDT Da Nang 04 So GDDT Da Nang
## 5 05 So GDDT Ha Giang 05 So GDDT Ha Giang
## 6 06 So GDDT Cao Bang 06 So GDDT Cao Bang
df_code_latin %>%
select(code = X3, province = X2) %>%
mutate(province = str_replace_all(province, "So GDDT ", "")) -> df_code_complete
head(df_code_complete)
## code province
## 1 01 Ha Noi
## 2 02 TP. Ho Chi Minh
## 3 03 Hai Phong
## 4 04 Da Nang
## 5 05 Ha Giang
## 6 06 Cao Bang
raw_data %>%
mutate(code = str_sub(SoBD_new, start = 1, end = 2 )) -> raw_data
head(raw_data)
## ID SoBD Math Viet English Physics Chemistry Biology History Geography
## 1 1 18010226 3.0 3.75 3.0 NA NA NA 3.0 6.50
## 2 2 18010229 8.8 7.50 9.0 NA NA NA 6.0 9.00
## 3 3 18010232 6.0 5.50 4.0 5.75 5.5 5.00 NA NA
## 4 4 18010242 3.4 5.75 2.6 NA NA NA 3.5 4.75
## 5 5 18010247 3.8 6.75 3.0 NA NA NA 3.5 6.25
## 6 6 18010252 5.0 6.50 2.2 2.00 3.5 4.25 NA NA
## GDCD KhoiA KhoiB KhoiC KhoiD KhoiA1 soluongkt SoBD_new code
## 1 8.25 NA NA 13.25 9.75 NA 8 18010226 18
## 2 8.25 NA NA 22.50 25.30 NA 8 18010229 18
## 3 NA 17.25 16.50 NA 15.50 15.75 8 18010232 18
## 4 7.25 NA NA 14.00 11.75 NA 8 18010242 18
## 5 8.00 NA NA 16.50 13.55 NA 8 18010247 18
## 6 NA 10.50 12.75 NA 13.70 9.20 8 18010252 18
df_exam <- full_join(raw_data, df_code_complete, by = "code")
head(df_exam)
## ID SoBD Math Viet English Physics Chemistry Biology History Geography
## 1 1 18010226 3.0 3.75 3.0 NA NA NA 3.0 6.50
## 2 2 18010229 8.8 7.50 9.0 NA NA NA 6.0 9.00
## 3 3 18010232 6.0 5.50 4.0 5.75 5.5 5.00 NA NA
## 4 4 18010242 3.4 5.75 2.6 NA NA NA 3.5 4.75
## 5 5 18010247 3.8 6.75 3.0 NA NA NA 3.5 6.25
## 6 6 18010252 5.0 6.50 2.2 2.00 3.5 4.25 NA NA
## GDCD KhoiA KhoiB KhoiC KhoiD KhoiA1 soluongkt SoBD_new code province
## 1 8.25 NA NA 13.25 9.75 NA 8 18010226 18 Bac Giang
## 2 8.25 NA NA 22.50 25.30 NA 8 18010229 18 Bac Giang
## 3 NA 17.25 16.50 NA 15.50 15.75 8 18010232 18 Bac Giang
## 4 7.25 NA NA 14.00 11.75 NA 8 18010242 18 Bac Giang
## 5 8.00 NA NA 16.50 13.55 NA 8 18010247 18 Bac Giang
## 6 NA 10.50 12.75 NA 13.70 9.20 8 18010252 18 Bac Giang
df_exam %>%
count(province, sort=TRUE)-> df_exam_ordered
df_exam_ordered
## province n
## 1 TP. Ho Chi Minh 78321
## 2 Ha Noi 38099
## 3 Dong Nai 28651
## 4 Dak Lak 22035
## 5 Thai Binh 21435
## 6 Hai Duong 19973
## 7 Bac Giang 19612
## 8 Binh Dinh 17785
## 9 Quang Nam 17532
## 10 Ha Tinh 16330
## 11 Lam Dong 14985
## 12 Bac Ninh 14815
## 13 Dong Thap 14367
## 14 Tien Giang 14106
## 15 Thanh Hoa 14099
## 16 Long An 14065
## 17 Phu Tho 13699
## 18 Khanh Hoa 13500
## 19 Kien Giang 13482
## 20 Can Tho 13068
## 21 Hung Yen 12888
## 22 Gia Lai 12825
## 23 Quang Ngai 12631
## 24 Vinh Phuc 12602
## 25 Thua Thien -Hue 12387
## 26 Ba Ria-Vung Tau 11846
## 27 Ben Tre 11740
## 28 Binh Thuan 11716
## 29 Binh Duong 11313
## 30 Phu Yen 10731
## 31 Vinh Long 10571
## 32 Binh Phuoc 10219
## 33 Quang Binh 9623
## 34 Ninh Binh 9612
## 35 Soc Trang 9334
## 36 Ca Mau 9275
## 37 Lang Son 9013
## 38 Thai Nguyen 8999
## 39 Hoa Binh 8952
## 40 Tay Ninh 8858
## 41 Ha Nam 8685
## 42 Tra Vinh 8183
## 43 Quang Tri 7889
## 44 Tuyen Quang 7638
## 45 Yen Bai 7022
## 46 Dak Nong 6361
## 47 Lao Cai 6199
## 48 Hau Giang 6190
## 49 Da Nang 6099
## 50 Ninh Thuan 5774
## 51 Dien Bien 5481
## 52 Bac Lieu 5370
## 53 Hai Phong 5099
## 54 Nam Dinh 5099
## 55 Cao Bang 4571
## 56 Kon Tum 4445
## 57 Lai Chau 3232
## 58 Ha Giang 3099
## 59 Bac Kan 2866
## 60 An Giang 1
## 61 Cuc Nha truong - Bo Quoc phong 1
## 62 Nghe An 1
## 63 Quang Ninh 1
## 64 Son La 1
df_exam %>%
select(province, KhoiA) %>%
na.omit() %>%
group_by(province) %>%
summarise(tot = length(KhoiA)) -> df_exam_tot # count the total number of students who attended the KhoiA's exam for each province
df_exam_tot
## # A tibble: 59 x 2
## province tot
## <chr> <int>
## 1 Ba Ria-Vung Tau 6717
## 2 Bac Giang 5064
## 3 Bac Kan 345
## 4 Bac Lieu 2208
## 5 Bac Ninh 5221
## 6 Ben Tre 5408
## 7 Binh Dinh 9884
## 8 Binh Duong 6383
## 9 Binh Phuoc 5367
## 10 Binh Thuan 4636
## # ... with 49 more rows
df_exam %>%
select(province, KhoiA) %>%
na.omit() %>%
filter(KhoiA >=25) %>%
group_by(province) %>%
count() -> df_exam_count # count the number of student who gained >=25 scores in KhoiA for each province
df_exam_count
## # A tibble: 58 x 2
## # Groups: province [58]
## province n
## <chr> <int>
## 1 Ba Ria-Vung Tau 5
## 2 Bac Giang 33
## 3 Bac Lieu 1
## 4 Bac Ninh 47
## 5 Ben Tre 5
## 6 Binh Dinh 6
## 7 Binh Duong 6
## 8 Binh Phuoc 5
## 9 Binh Thuan 1
## 10 Ca Mau 1
## # ... with 48 more rows
full_join(df_exam_count, df_exam_tot, by = "province" ) -> df_final
df_rate <- df_final %>%
mutate(rate = (n/tot)*100) %>%
arrange(-rate) # calculate the proportion of student who achieved >=25 score for KhoiA in each province
df_rate
## # A tibble: 59 x 4
## # Groups: province [59]
## province n tot rate
## <chr> <int> <int> <dbl>
## 1 Ha Giang 27 581 4.65
## 2 Bac Ninh 47 5221 0.900
## 3 Thai Binh 91 10149 0.897
## 4 Ninh Binh 31 3514 0.882
## 5 Hoa Binh 20 2356 0.849
## 6 Thanh Hoa 44 5497 0.800
## 7 Vinh Phuc 30 4070 0.737
## 8 Hai Phong 14 1961 0.714
## 9 Nam Dinh 18 2715 0.663
## 10 Bac Giang 33 5064 0.652
## # ... with 49 more rows
The bar plot is adequate in terms of both aesthetic and the content. It displays sufficient information so that the audiences could get the whole picture at the first glance. A minor point is that the PCI scale used to categorize provinces into groups might not be efficient to clearly distinguish provinces by PCI.
pci_data <- read_excel("D:/2020/R course/data_day12/du-lieu-pci-2018.xlsx", n_max = 64)
pci_data$Rank <- as.character(pci_data$"Xếp hạng")
names(pci_data)[3] <- "PCI"
names(pci_data)[1] <- "Province"
pci_data %>%
mutate_at("Province", function(x) {stri_trans_general(x, "Latin-ASCII")}) %>%
unite("TP.Rank", c("Province", Rank), sep=" ") %>%
mutate(category = case_when(PCI >= 70 ~ "Very good",
PCI < 70 & PCI >= 65 ~ "Good",
PCI < 65 & PCI >= 63 ~ "Fairly good",
PCI < 63 & PCI >= 60 ~ "Average",
TRUE ~ "Fairly low")) %>%
mutate(PCI.2 = format(round(PCI, 2), nsmall = 2))-> pci_data_final
pci_data_final$category_factored <- factor(pci_data_final$category,
levels = c("Very good",
"Good",
"Fairly good",
"Average",
"Fairly low"))
pci_data_final %>%
ggplot(aes(x = reorder(TP.Rank, PCI), y = PCI, fill= category_factored )) +
geom_col(width = 0.7)+
theme_bw()+
xlab(NULL)+
ylab("PCI")+
scale_y_continuous(expand = c(0,0))+
ggtitle("Provincial Competitiveness Index (PCI) 2018")+
theme(legend.title = element_blank())+
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())+
geom_text(aes(label = PCI.2), size= 2.5, hjust = 1)+
coord_flip()
Generate a function to group variable x according to the following conditions:
x >= 9 -> “GroupA”
8 <= x < 9 -> “GroupB”
7 <= x < 8 -> “GroupC”
6 <= x < 7 -> “GroupD”
5 <= x < 6 -> “GroupE”
x < 5 -> “GroupF”
in which x is a real number, only receive the value within [0, 10]
group <- function (x) {
sapply(x, function (x)
if (x < 0 | x > 10) stop("x must be within 0 and 10")
else if (x >= 9) return("GroupA")
else if (x >= 8 & x < 9) return("GroupB")
else if (x >= 7 & x < 8) return("GroupC")
else if (x >= 6 & x < 7) return("GroupD")
else if (x >= 5 & x < 6) return("GroupE")
else return("GroupF"))
}
Using this function to:
KhoiA_tot <- df_exam %>%
filter(!is.na(KhoiA)) %>%
filter(!is.na(Math)) %>%
mutate(category = group(Math)) %>%
group_by(province) %>%
count(category) %>%
summarise(tot = sum(n)) # total KhoiA student who took Math exam
KhoiA_groupA <- df_exam %>%
filter(!is.na(KhoiA)) %>%
filter(!is.na(Math)) %>%
mutate(category = group(Math)) %>%
filter (category == "GroupA") %>%
group_by(province) %>%
count(category) # the number of KhoiA students who belong to GroupA in Math
KhoiA <- full_join(KhoiA_tot, KhoiA_groupA, by = "province") %>%
select(-category) %>%
mutate(n = replace(n,is.na(n),0))
KhoiA %>%
mutate(groupA_rate = (n/tot)*100) %>%
select(province, groupA_rate) %>%
arrange(-groupA_rate) -> KhoiA_rate
head(KhoiA_rate)
## # A tibble: 6 x 2
## province groupA_rate
## <chr> <dbl>
## 1 Ha Giang 4.48
## 2 Hoa Binh 0.849
## 3 Phu Tho 0.696
## 4 Ha Noi 0.397
## 5 Hai Phong 0.306
## 6 Thanh Hoa 0.291
KhoiB_tot <- df_exam %>%
filter(!is.na(KhoiB)) %>%
filter(!is.na(Biology)) %>%
mutate(category = group(Biology)) %>%
group_by(province) %>%
count(category) %>%
summarise(tot = sum(n))
KhoiB_groupA <- df_exam %>%
filter(!is.na(KhoiB)) %>%
filter(!is.na(Biology)) %>%
mutate(category = group(Biology)) %>%
filter (category == "GroupA") %>%
group_by(province) %>%
count(category)
KhoiB <- full_join(KhoiB_tot, KhoiB_groupA, by = "province") %>%
select(-category) %>%
mutate(n = replace(n,is.na(n),0))
KhoiB %>%
mutate(groupA_rate = (n/tot)*100) %>%
select(province, groupA_rate) %>%
arrange(-groupA_rate) -> KhoiB_rate
head(KhoiB_rate)
## # A tibble: 6 x 2
## province groupA_rate
## <chr> <dbl>
## 1 Ha Giang 0.526
## 2 Dien Bien 0.510
## 3 Kon Tum 0.375
## 4 Ha Tinh 0.296
## 5 Bac Kan 0.288
## 6 Hai Duong 0.190
reg_area <- function (a, b) {
area <- a * b
return(area)
}
Check the function
reg_area( a = 7, b = 3)
## [1] 21
current_regulation <- function(x){
sapply(x, function (x)
if (x < 0) stop("x must be larger than 0")
else if(x >= 0 & x <= 50) return(x * 1549)
else if(x >= 51 & x <= 100) return(x * 1600)
else if(x >= 101 & x <= 200) return(x * 1858)
else if(x >= 201 & x <= 300) return(x * 2340)
else if(x >= 301 & x <= 400) return(x * 2615)
else return(x * 2701))
}
# Check the function
x <- c(50, 78, 145, 211, 316, 500)
cost <- current_regulation(x)
cost
## [1] 77450 124800 269410 493740 826340 1350500
cost/x
## [1] 1549 1600 1858 2340 2615 2701
proposed_regulation <- function(x){
sapply(x, function (x)
if (x < 0) stop("x must be larger than 0")
else if(x >= 0 & x <= 100) return(x * 1549)
else if(x >= 101 & x <= 200) return(x * 1858)
else if(x >= 201 & x <= 400) return(x * 2340)
else if(x >= 400 & x <= 701) return(x * 2701)
else return(x * 3105))
}
# Check the function
x <- c(50, 78, 145, 211, 316, 500, 800)
cost <- proposed_regulation(x)
cost
## [1] 77450 120822 269410 493740 739440 1350500 2484000
cost/x
## [1] 1549 1549 1858 2340 2340 2701 3105