Excersice 1

Loading required packages

library(dplyr)
library(tidyverse)
library(stringr)
library(stringi)
library(rvest)
library(readxl)
library(ggplot2)

Loading THPT 2018 Quoc gia.csv into R under the name raw_data

raw_data <- read.table( file = "D:/2020/R course/data_day12/THPT_2018.csv", fill= TRUE, header = TRUE, sep = ",")
  1. Convert SoBD to character variable
raw_data %>% 
  mutate(SoBD = as.character(SoBD)) -> raw_data
  1. Using str_count() function to investigate the number of characters in SoBD column and add a new column soluongkt displaying these information
raw_data %>%
  mutate(soluongkt = str_count(SoBD)) -> raw_data_1

See the dataset with the new column

head(raw_data_1)
##   ID     SoBD Math Viet English Physics Chemistry Biology History Geography
## 1  1 18010226  3.0 3.75     3.0      NA        NA      NA     3.0      6.50
## 2  2 18010229  8.8 7.50     9.0      NA        NA      NA     6.0      9.00
## 3  3 18010232  6.0 5.50     4.0    5.75       5.5    5.00      NA        NA
## 4  4 18010242  3.4 5.75     2.6      NA        NA      NA     3.5      4.75
## 5  5 18010247  3.8 6.75     3.0      NA        NA      NA     3.5      6.25
## 6  6 18010252  5.0 6.50     2.2    2.00       3.5    4.25      NA        NA
##   GDCD KhoiA KhoiB KhoiC KhoiD KhoiA1 soluongkt
## 1 8.25    NA    NA 13.25  9.75     NA         8
## 2 8.25    NA    NA 22.50 25.30     NA         8
## 3   NA 17.25 16.50    NA 15.50  15.75         8
## 4 7.25    NA    NA 14.00 11.75     NA         8
## 5 8.00    NA    NA 16.50 13.55     NA         8
## 6   NA 10.50 12.75    NA 13.70   9.20         8
  1. Count the number of SoBD that have 8 characters
raw_data_1 %>% 
  filter(soluongkt == 8) %>% 
  count()
##        n
## 1 592039
  1. Add 0 to 7-characters SoBD and create a new column SoBD_new
raw_data_1 %>% 
  mutate(SoBD_new = case_when(soluongkt == 7 ~ paste0("0", SoBD), 
                              TRUE ~ SoBD)) -> raw_data 
  1. Extract province code from the website provided
province_code <- "https://thuvienphapluat.vn/cong-van/Giao-duc/Cong-van-417-BGDDT-KTKDCLGD-huong-dan-thuc-hien-Quy-che-thi-trung-hoc-pho-thong-quoc-gia-2017-339311.aspx" 

code_path <- '//*[@id="divContentDoc"]/div/div/div/table[7]'

df_code <- province_code %>% 
  read_html() %>% 
  html_nodes(xpath = code_path) %>% 
  html_table %>% 
  .[[1]]

head(df_code)
##                  X1                                    X2
## 1 Mã\r\n  s<U+1EDF>                    Tên\r\n  s<U+1EDF>
## 2                01          S<U+1EDF> GDÐT Hà N<U+1ED9>i
## 3                02 S<U+1EDF> GDÐT TP. H<U+1ED3> Chí Minh
## 4                03       S<U+1EDF> GDÐT H<U+1EA3>i Phòng
## 5                04         S<U+1EDF> GDÐT Ðà N<U+1EB5>ng
## 6                05               S<U+1EDF> GDÐT Hà Giang
##                                                X3
## 1 Mã\r\n  c<U+1EE5>m (H<U+1ED9>i d<U+1ED3>ng) thi
## 2                                              01
## 3                                              02
## 4                                              03
## 5                                              04
## 6                                              05
##                                      X4
## 1   Tên\r\n  H<U+1ED9>i d<U+1ED3>ng thi
## 2          S<U+1EDF> GDÐT Hà N<U+1ED9>i
## 3 S<U+1EDF> GDÐT TP. H<U+1ED3> Chí Minh
## 4       S<U+1EDF> GDÐT H<U+1EA3>i Phòng
## 5         S<U+1EDF> GDÐT Ðà N<U+1EB5>ng
## 6               S<U+1EDF> GDÐT Hà Giang
df_code %>% 
  mutate_all(function(x) {stri_trans_general(x, "Latin-ASCII")}) %>% 
  slice(-1) -> df_code_latin

head(df_code_latin)
##   X1                      X2 X3                      X4
## 1 01          So GDDT Ha Noi 01          So GDDT Ha Noi
## 2 02 So GDDT TP. Ho Chi Minh 02 So GDDT TP. Ho Chi Minh
## 3 03       So GDDT Hai Phong 03       So GDDT Hai Phong
## 4 04         So GDDT Da Nang 04         So GDDT Da Nang
## 5 05        So GDDT Ha Giang 05        So GDDT Ha Giang
## 6 06        So GDDT Cao Bang 06        So GDDT Cao Bang
df_code_latin %>% 
  select(code = X3, province = X2) %>% 
  mutate(province = str_replace_all(province, "So GDDT ", "")) -> df_code_complete

head(df_code_complete)
##   code        province
## 1   01          Ha Noi
## 2   02 TP. Ho Chi Minh
## 3   03       Hai Phong
## 4   04         Da Nang
## 5   05        Ha Giang
## 6   06        Cao Bang
  1. Create a new column province for raw_data dataset and arrange the number of students in decreasing order
raw_data %>% 
  mutate(code = str_sub(SoBD_new, start = 1, end = 2 )) -> raw_data

head(raw_data)
##   ID     SoBD Math Viet English Physics Chemistry Biology History Geography
## 1  1 18010226  3.0 3.75     3.0      NA        NA      NA     3.0      6.50
## 2  2 18010229  8.8 7.50     9.0      NA        NA      NA     6.0      9.00
## 3  3 18010232  6.0 5.50     4.0    5.75       5.5    5.00      NA        NA
## 4  4 18010242  3.4 5.75     2.6      NA        NA      NA     3.5      4.75
## 5  5 18010247  3.8 6.75     3.0      NA        NA      NA     3.5      6.25
## 6  6 18010252  5.0 6.50     2.2    2.00       3.5    4.25      NA        NA
##   GDCD KhoiA KhoiB KhoiC KhoiD KhoiA1 soluongkt SoBD_new code
## 1 8.25    NA    NA 13.25  9.75     NA         8 18010226   18
## 2 8.25    NA    NA 22.50 25.30     NA         8 18010229   18
## 3   NA 17.25 16.50    NA 15.50  15.75         8 18010232   18
## 4 7.25    NA    NA 14.00 11.75     NA         8 18010242   18
## 5 8.00    NA    NA 16.50 13.55     NA         8 18010247   18
## 6   NA 10.50 12.75    NA 13.70   9.20         8 18010252   18
df_exam <- full_join(raw_data, df_code_complete, by = "code") 

head(df_exam)
##   ID     SoBD Math Viet English Physics Chemistry Biology History Geography
## 1  1 18010226  3.0 3.75     3.0      NA        NA      NA     3.0      6.50
## 2  2 18010229  8.8 7.50     9.0      NA        NA      NA     6.0      9.00
## 3  3 18010232  6.0 5.50     4.0    5.75       5.5    5.00      NA        NA
## 4  4 18010242  3.4 5.75     2.6      NA        NA      NA     3.5      4.75
## 5  5 18010247  3.8 6.75     3.0      NA        NA      NA     3.5      6.25
## 6  6 18010252  5.0 6.50     2.2    2.00       3.5    4.25      NA        NA
##   GDCD KhoiA KhoiB KhoiC KhoiD KhoiA1 soluongkt SoBD_new code  province
## 1 8.25    NA    NA 13.25  9.75     NA         8 18010226   18 Bac Giang
## 2 8.25    NA    NA 22.50 25.30     NA         8 18010229   18 Bac Giang
## 3   NA 17.25 16.50    NA 15.50  15.75         8 18010232   18 Bac Giang
## 4 7.25    NA    NA 14.00 11.75     NA         8 18010242   18 Bac Giang
## 5 8.00    NA    NA 16.50 13.55     NA         8 18010247   18 Bac Giang
## 6   NA 10.50 12.75    NA 13.70   9.20         8 18010252   18 Bac Giang
df_exam %>% 
  count(province, sort=TRUE)-> df_exam_ordered

df_exam_ordered
##                          province     n
## 1                 TP. Ho Chi Minh 78321
## 2                          Ha Noi 38099
## 3                        Dong Nai 28651
## 4                         Dak Lak 22035
## 5                       Thai Binh 21435
## 6                       Hai Duong 19973
## 7                       Bac Giang 19612
## 8                       Binh Dinh 17785
## 9                       Quang Nam 17532
## 10                        Ha Tinh 16330
## 11                       Lam Dong 14985
## 12                       Bac Ninh 14815
## 13                      Dong Thap 14367
## 14                     Tien Giang 14106
## 15                      Thanh Hoa 14099
## 16                        Long An 14065
## 17                        Phu Tho 13699
## 18                      Khanh Hoa 13500
## 19                     Kien Giang 13482
## 20                        Can Tho 13068
## 21                       Hung Yen 12888
## 22                        Gia Lai 12825
## 23                     Quang Ngai 12631
## 24                      Vinh Phuc 12602
## 25                Thua Thien -Hue 12387
## 26                Ba Ria-Vung Tau 11846
## 27                        Ben Tre 11740
## 28                     Binh Thuan 11716
## 29                     Binh Duong 11313
## 30                        Phu Yen 10731
## 31                      Vinh Long 10571
## 32                     Binh Phuoc 10219
## 33                     Quang Binh  9623
## 34                      Ninh Binh  9612
## 35                      Soc Trang  9334
## 36                         Ca Mau  9275
## 37                       Lang Son  9013
## 38                    Thai Nguyen  8999
## 39                       Hoa Binh  8952
## 40                       Tay Ninh  8858
## 41                         Ha Nam  8685
## 42                       Tra Vinh  8183
## 43                      Quang Tri  7889
## 44                    Tuyen Quang  7638
## 45                        Yen Bai  7022
## 46                       Dak Nong  6361
## 47                        Lao Cai  6199
## 48                      Hau Giang  6190
## 49                        Da Nang  6099
## 50                     Ninh Thuan  5774
## 51                      Dien Bien  5481
## 52                       Bac Lieu  5370
## 53                      Hai Phong  5099
## 54                       Nam Dinh  5099
## 55                       Cao Bang  4571
## 56                        Kon Tum  4445
## 57                       Lai Chau  3232
## 58                       Ha Giang  3099
## 59                        Bac Kan  2866
## 60                       An Giang     1
## 61 Cuc Nha truong - Bo Quoc phong     1
## 62                        Nghe An     1
## 63                     Quang Ninh     1
## 64                         Son La     1
  1. Calculate the proportion of the students who gained >=25 score for KhoiA
df_exam %>% 
  select(province, KhoiA) %>% 
  na.omit() %>% 
  group_by(province) %>% 
  summarise(tot = length(KhoiA)) -> df_exam_tot # count the total number of students who attended the KhoiA's exam for each province

df_exam_tot
## # A tibble: 59 x 2
##    province          tot
##    <chr>           <int>
##  1 Ba Ria-Vung Tau  6717
##  2 Bac Giang        5064
##  3 Bac Kan           345
##  4 Bac Lieu         2208
##  5 Bac Ninh         5221
##  6 Ben Tre          5408
##  7 Binh Dinh        9884
##  8 Binh Duong       6383
##  9 Binh Phuoc       5367
## 10 Binh Thuan       4636
## # ... with 49 more rows
df_exam  %>% 
  select(province, KhoiA) %>% 
  na.omit() %>% 
  filter(KhoiA >=25) %>% 
  group_by(province) %>% 
  count() -> df_exam_count # count the number of student who gained >=25 scores in KhoiA for each province
df_exam_count
## # A tibble: 58 x 2
## # Groups:   province [58]
##    province            n
##    <chr>           <int>
##  1 Ba Ria-Vung Tau     5
##  2 Bac Giang          33
##  3 Bac Lieu            1
##  4 Bac Ninh           47
##  5 Ben Tre             5
##  6 Binh Dinh           6
##  7 Binh Duong          6
##  8 Binh Phuoc          5
##  9 Binh Thuan          1
## 10 Ca Mau              1
## # ... with 48 more rows
full_join(df_exam_count, df_exam_tot, by = "province" ) -> df_final

df_rate <- df_final %>% 
  mutate(rate = (n/tot)*100) %>% 
  arrange(-rate) # calculate the proportion of student who achieved >=25 score for KhoiA in each province
df_rate
## # A tibble: 59 x 4
## # Groups:   province [59]
##    province      n   tot  rate
##    <chr>     <int> <int> <dbl>
##  1 Ha Giang     27   581 4.65 
##  2 Bac Ninh     47  5221 0.900
##  3 Thai Binh    91 10149 0.897
##  4 Ninh Binh    31  3514 0.882
##  5 Hoa Binh     20  2356 0.849
##  6 Thanh Hoa    44  5497 0.800
##  7 Vinh Phuc    30  4070 0.737
##  8 Hai Phong    14  1961 0.714
##  9 Nam Dinh     18  2715 0.663
## 10 Bac Giang    33  5064 0.652
## # ... with 49 more rows

Excersice 2

  1. Find any flaw of the following bar plot

The bar plot is adequate in terms of both aesthetic and the content. It displays sufficient information so that the audiences could get the whole picture at the first glance. A minor point is that the PCI scale used to categorize provinces into groups might not be efficient to clearly distinguish provinces by PCI.

  1. Produce a bar plot with the same format using pci dataset
pci_data <- read_excel("D:/2020/R course/data_day12/du-lieu-pci-2018.xlsx", n_max = 64)

pci_data$Rank <- as.character(pci_data$"Xếp hạng")
names(pci_data)[3] <- "PCI" 
names(pci_data)[1] <- "Province"

pci_data %>% 
  mutate_at("Province", function(x) {stri_trans_general(x, "Latin-ASCII")}) %>% 
  unite("TP.Rank", c("Province", Rank), sep=" ") %>% 
  mutate(category = case_when(PCI >= 70 ~ "Very good",
                              PCI < 70 & PCI >= 65 ~ "Good",
                              PCI < 65 & PCI >= 63 ~ "Fairly good",
                              PCI < 63 & PCI >= 60 ~ "Average",
                              TRUE ~ "Fairly low")) %>% 
  mutate(PCI.2 = format(round(PCI, 2), nsmall = 2))-> pci_data_final

pci_data_final$category_factored <- factor(pci_data_final$category, 
                                           levels = c("Very good", 
                                                      "Good", 
                                                      "Fairly good",
                                                      "Average", 
                                                      "Fairly low"))
pci_data_final %>% 
  ggplot(aes(x = reorder(TP.Rank, PCI), y = PCI, fill= category_factored )) +
  geom_col(width = 0.7)+
  theme_bw()+
  xlab(NULL)+
  ylab("PCI")+
  scale_y_continuous(expand = c(0,0))+
  ggtitle("Provincial Competitiveness Index (PCI) 2018")+
  theme(legend.title = element_blank())+
  theme(axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())+ 
  geom_text(aes(label = PCI.2), size= 2.5, hjust = 1)+
  coord_flip()

Excercise 3

Generate a function to group variable x according to the following conditions:

x >= 9 -> “GroupA”

8 <= x < 9 -> “GroupB”

7 <= x < 8 -> “GroupC”

6 <= x < 7 -> “GroupD”

5 <= x < 6 -> “GroupE”

x < 5 -> “GroupF”

in which x is a real number, only receive the value within [0, 10]

group <- function (x) {
  
  sapply(x, function (x)
  if (x < 0 | x > 10) stop("x must be within 0 and 10")
  else if (x >= 9) return("GroupA")
  else if (x >= 8 & x < 9) return("GroupB")
  else if (x >= 7 & x < 8) return("GroupC")
  else if (x >= 6 & x < 7) return("GroupD")
  else if (x >= 5 & x < 6) return("GroupE")
  else return("GroupF"))

}

Using this function to:

  1. calculate the rate of GroupA student in Math among KhoiA taking students
KhoiA_tot <- df_exam %>% 
   filter(!is.na(KhoiA)) %>% 
   filter(!is.na(Math)) %>% 
   mutate(category = group(Math)) %>% 
   group_by(province) %>% 
   count(category) %>% 
   summarise(tot = sum(n)) # total KhoiA student who took Math exam 

KhoiA_groupA <- df_exam %>% 
   filter(!is.na(KhoiA)) %>% 
   filter(!is.na(Math)) %>% 
   mutate(category = group(Math)) %>% 
   filter (category == "GroupA") %>% 
   group_by(province) %>% 
   count(category) # the number of KhoiA students who belong to GroupA in Math

KhoiA <- full_join(KhoiA_tot, KhoiA_groupA, by = "province") %>% 
  select(-category) %>% 
  mutate(n = replace(n,is.na(n),0))

KhoiA %>% 
  mutate(groupA_rate = (n/tot)*100) %>% 
  select(province, groupA_rate) %>% 
  arrange(-groupA_rate) -> KhoiA_rate

head(KhoiA_rate)
## # A tibble: 6 x 2
##   province  groupA_rate
##   <chr>           <dbl>
## 1 Ha Giang        4.48 
## 2 Hoa Binh        0.849
## 3 Phu Tho         0.696
## 4 Ha Noi          0.397
## 5 Hai Phong       0.306
## 6 Thanh Hoa       0.291
  1. Given only the number of students in KhoiB, calculate the rate of GroupA students in Biology for each province
KhoiB_tot <- df_exam %>% 
   filter(!is.na(KhoiB)) %>% 
   filter(!is.na(Biology)) %>% 
   mutate(category = group(Biology)) %>% 
   group_by(province) %>% 
   count(category) %>% 
   summarise(tot = sum(n))

KhoiB_groupA <- df_exam %>% 
   filter(!is.na(KhoiB)) %>% 
   filter(!is.na(Biology)) %>% 
   mutate(category = group(Biology)) %>% 
   filter (category == "GroupA") %>% 
   group_by(province) %>% 
   count(category) 

KhoiB <- full_join(KhoiB_tot, KhoiB_groupA, by = "province")  %>% 
  select(-category) %>% 
  mutate(n = replace(n,is.na(n),0))

KhoiB %>% 
  mutate(groupA_rate = (n/tot)*100) %>% 
  select(province, groupA_rate) %>% 
  arrange(-groupA_rate) -> KhoiB_rate

head(KhoiB_rate)
## # A tibble: 6 x 2
##   province  groupA_rate
##   <chr>           <dbl>
## 1 Ha Giang        0.526
## 2 Dien Bien       0.510
## 3 Kon Tum         0.375
## 4 Ha Tinh         0.296
## 5 Bac Kan         0.288
## 6 Hai Duong       0.190

Addtional excercises

  1. Function to calculate the area of a regtangle with
reg_area <- function (a, b) {
   
  area <- a * b
  
  return(area)
}

Check the function

 reg_area( a = 7, b = 3)
## [1] 21
  1. Function to calculate households’ electricity cost following the current regulation
current_regulation <- function(x){
  sapply(x, function (x)
    if (x < 0) stop("x must be larger than 0")
    else if(x >= 0 & x <= 50) return(x * 1549)
    else if(x >= 51 & x <= 100) return(x * 1600)
    else if(x >= 101 & x <= 200) return(x * 1858)
    else if(x >= 201 & x <= 300) return(x * 2340)
    else if(x >= 301 & x <= 400) return(x * 2615)
    else return(x * 2701))
  
}

# Check the function

x <- c(50, 78, 145, 211, 316, 500)

cost <- current_regulation(x)

cost
## [1]   77450  124800  269410  493740  826340 1350500
cost/x
## [1] 1549 1600 1858 2340 2615 2701
  1. Function to calculate household’s electricity cost following the proposed regulation
proposed_regulation <- function(x){
  sapply(x, function (x)
  if (x < 0) stop("x must be larger than 0")
  else if(x >= 0 & x <= 100) return(x * 1549)
  else if(x >= 101 & x <= 200) return(x * 1858)
  else if(x >= 201 & x <= 400) return(x * 2340)
  else if(x >= 400 & x <= 701) return(x * 2701)
  else return(x * 3105))
}
 # Check the function

x <- c(50, 78, 145, 211, 316, 500, 800)

cost <- proposed_regulation(x)

cost
## [1]   77450  120822  269410  493740  739440 1350500 2484000
cost/x
## [1] 1549 1549 1858 2340 2340 2701 3105