Excersice 1

Loading required packages

library(dplyr)
library(tidyverse)
library(stringr)
library(stringi)
library(rvest)
library(readxl)
library(ggplot2)

Loading THPT 2018 Quoc gia.csv into R under the name raw_data

raw_data <- read.table( file = "D:/2020/R course/data_day12/THPT_2018.csv", fill= TRUE, header = TRUE, sep = ",")

Convert SoBD to character variable

raw_data %>% 
  mutate(SoBD = as.character(SoBD)) -> raw_data

Using str_count() function to investigate the number of characters in SoBD column and add a new column soluongkt displaying these information

raw_data %>%
  mutate(soluongkt = str_count(SoBD)) -> raw_data_1

See the dataset with the new column

head(raw_data_1)

##   ID     SoBD Math Viet English Physics Chemistry Biology History Geography
## 1  1 18010226  3.0 3.75     3.0      NA        NA      NA     3.0      6.50
## 2  2 18010229  8.8 7.50     9.0      NA        NA      NA     6.0      9.00
## 3  3 18010232  6.0 5.50     4.0    5.75       5.5    5.00      NA        NA
## 4  4 18010242  3.4 5.75     2.6      NA        NA      NA     3.5      4.75
## 5  5 18010247  3.8 6.75     3.0      NA        NA      NA     3.5      6.25
## 6  6 18010252  5.0 6.50     2.2    2.00       3.5    4.25      NA        NA
##   GDCD KhoiA KhoiB KhoiC KhoiD KhoiA1 soluongkt
## 1 8.25    NA    NA 13.25  9.75     NA         8
## 2 8.25    NA    NA 22.50 25.30     NA         8
## 3   NA 17.25 16.50    NA 15.50  15.75         8
## 4 7.25    NA    NA 14.00 11.75     NA         8
## 5 8.00    NA    NA 16.50 13.55     NA         8
## 6   NA 10.50 12.75    NA 13.70   9.20         8

Count the number of SoBD that have 8 characters

raw_data_1 %>% 
  filter(soluongkt == 8) %>% 
  count()

##        n
## 1 592039

Add 0 to 7-characters SoBD and create a new column SoBD_new

raw_data_1 %>% 
  mutate(SoBD_new = case_when(soluongkt == 7 ~ paste0("0", SoBD), 
                              TRUE ~ SoBD)) -> raw_data

Extract province code from the website provided

Get the table

province_code <- "https://thuvienphapluat.vn/cong-van/Giao-duc/Cong-van-417-BGDDT-KTKDCLGD-huong-dan-thuc-hien-Quy-che-thi-trung-hoc-pho-thong-quoc-gia-2017-339311.aspx" 

code_path <- '//*[@id="divContentDoc"]/div/div/div/table[7]'

df_code <- province_code %>% 
  read_html() %>% 
  html_nodes(xpath = code_path) %>% 
  html_table %>% 
  .[[1]]

head(df_code)

##                  X1                                    X2
## 1 Mã\r\n  s<U+1EDF>                    Tên\r\n  s<U+1EDF>
## 2                01          S<U+1EDF> GDÐT Hà N<U+1ED9>i
## 3                02 S<U+1EDF> GDÐT TP. H<U+1ED3> Chí Minh
## 4                03       S<U+1EDF> GDÐT H<U+1EA3>i Phòng
## 5                04         S<U+1EDF> GDÐT Ðà N<U+1EB5>ng
## 6                05               S<U+1EDF> GDÐT Hà Giang
##                                                X3
## 1 Mã\r\n  c<U+1EE5>m (H<U+1ED9>i d<U+1ED3>ng) thi
## 2                                              01
## 3                                              02
## 4                                              03
## 5                                              04
## 6                                              05
##                                      X4
## 1   Tên\r\n  H<U+1ED9>i d<U+1ED3>ng thi
## 2          S<U+1EDF> GDÐT Hà N<U+1ED9>i
## 3 S<U+1EDF> GDÐT TP. H<U+1ED3> Chí Minh
## 4       S<U+1EDF> GDÐT H<U+1EA3>i Phòng
## 5         S<U+1EDF> GDÐT Ðà N<U+1EB5>ng
## 6               S<U+1EDF> GDÐT Hà Giang

Handle with the font error

df_code %>% 
  mutate_all(function(x) {stri_trans_general(x, "Latin-ASCII")}) %>% 
  slice(-1) -> df_code_latin

head(df_code_latin)

##   X1                      X2 X3                      X4
## 1 01          So GDDT Ha Noi 01          So GDDT Ha Noi
## 2 02 So GDDT TP. Ho Chi Minh 02 So GDDT TP. Ho Chi Minh
## 3 03       So GDDT Hai Phong 03       So GDDT Hai Phong
## 4 04         So GDDT Da Nang 04         So GDDT Da Nang
## 5 05        So GDDT Ha Giang 05        So GDDT Ha Giang
## 6 06        So GDDT Cao Bang 06        So GDDT Cao Bang

Separate strings to get the names of provinces

df_code_latin %>% 
  select(code = X3, province = X2) %>% 
  mutate(province = str_replace_all(province, "So GDDT ", "")) -> df_code_complete

head(df_code_complete)

##   code        province
## 1   01          Ha Noi
## 2   02 TP. Ho Chi Minh
## 3   03       Hai Phong
## 4   04         Da Nang
## 5   05        Ha Giang
## 6   06        Cao Bang

Create a new column province for raw_data dataset and arrange the number of students in decreasing order

Extract the first two characters of SoBD in raw_data dataset and create a column code for province code

raw_data %>% 
  mutate(code = str_sub(SoBD_new, start = 1, end = 2 )) -> raw_data

head(raw_data)

##   ID     SoBD Math Viet English Physics Chemistry Biology History Geography
## 1  1 18010226  3.0 3.75     3.0      NA        NA      NA     3.0      6.50
## 2  2 18010229  8.8 7.50     9.0      NA        NA      NA     6.0      9.00
## 3  3 18010232  6.0 5.50     4.0    5.75       5.5    5.00      NA        NA
## 4  4 18010242  3.4 5.75     2.6      NA        NA      NA     3.5      4.75
## 5  5 18010247  3.8 6.75     3.0      NA        NA      NA     3.5      6.25
## 6  6 18010252  5.0 6.50     2.2    2.00       3.5    4.25      NA        NA
##   GDCD KhoiA KhoiB KhoiC KhoiD KhoiA1 soluongkt SoBD_new code
## 1 8.25    NA    NA 13.25  9.75     NA         8 18010226   18
## 2 8.25    NA    NA 22.50 25.30     NA         8 18010229   18
## 3   NA 17.25 16.50    NA 15.50  15.75         8 18010232   18
## 4 7.25    NA    NA 14.00 11.75     NA         8 18010242   18
## 5 8.00    NA    NA 16.50 13.55     NA         8 18010247   18
## 6   NA 10.50 12.75    NA 13.70   9.20         8 18010252   18

Create province column for the raw_data dataset by joining the two datasets df_code_complete and raw_data by the common code column

df_exam <- full_join(raw_data, df_code_complete, by = "code") 

head(df_exam)

##   ID     SoBD Math Viet English Physics Chemistry Biology History Geography
## 1  1 18010226  3.0 3.75     3.0      NA        NA      NA     3.0      6.50
## 2  2 18010229  8.8 7.50     9.0      NA        NA      NA     6.0      9.00
## 3  3 18010232  6.0 5.50     4.0    5.75       5.5    5.00      NA        NA
## 4  4 18010242  3.4 5.75     2.6      NA        NA      NA     3.5      4.75
## 5  5 18010247  3.8 6.75     3.0      NA        NA      NA     3.5      6.25
## 6  6 18010252  5.0 6.50     2.2    2.00       3.5    4.25      NA        NA
##   GDCD KhoiA KhoiB KhoiC KhoiD KhoiA1 soluongkt SoBD_new code  province
## 1 8.25    NA    NA 13.25  9.75     NA         8 18010226   18 Bac Giang
## 2 8.25    NA    NA 22.50 25.30     NA         8 18010229   18 Bac Giang
## 3   NA 17.25 16.50    NA 15.50  15.75         8 18010232   18 Bac Giang
## 4 7.25    NA    NA 14.00 11.75     NA         8 18010242   18 Bac Giang
## 5 8.00    NA    NA 16.50 13.55     NA         8 18010247   18 Bac Giang
## 6   NA 10.50 12.75    NA 13.70   9.20         8 18010252   18 Bac Giang

Count the number of students in each province and arrange in decreasing order

df_exam %>% 
  count(province, sort=TRUE)-> df_exam_ordered

df_exam_ordered

##                          province     n
## 1                 TP. Ho Chi Minh 78321
## 2                          Ha Noi 38099
## 3                        Dong Nai 28651
## 4                         Dak Lak 22035
## 5                       Thai Binh 21435
## 6                       Hai Duong 19973
## 7                       Bac Giang 19612
## 8                       Binh Dinh 17785
## 9                       Quang Nam 17532
## 10                        Ha Tinh 16330
## 11                       Lam Dong 14985
## 12                       Bac Ninh 14815
## 13                      Dong Thap 14367
## 14                     Tien Giang 14106
## 15                      Thanh Hoa 14099
## 16                        Long An 14065
## 17                        Phu Tho 13699
## 18                      Khanh Hoa 13500
## 19                     Kien Giang 13482
## 20                        Can Tho 13068
## 21                       Hung Yen 12888
## 22                        Gia Lai 12825
## 23                     Quang Ngai 12631
## 24                      Vinh Phuc 12602
## 25                Thua Thien -Hue 12387
## 26                Ba Ria-Vung Tau 11846
## 27                        Ben Tre 11740
## 28                     Binh Thuan 11716
## 29                     Binh Duong 11313
## 30                        Phu Yen 10731
## 31                      Vinh Long 10571
## 32                     Binh Phuoc 10219
## 33                     Quang Binh  9623
## 34                      Ninh Binh  9612
## 35                      Soc Trang  9334
## 36                         Ca Mau  9275
## 37                       Lang Son  9013
## 38                    Thai Nguyen  8999
## 39                       Hoa Binh  8952
## 40                       Tay Ninh  8858
## 41                         Ha Nam  8685
## 42                       Tra Vinh  8183
## 43                      Quang Tri  7889
## 44                    Tuyen Quang  7638
## 45                        Yen Bai  7022
## 46                       Dak Nong  6361
## 47                        Lao Cai  6199
## 48                      Hau Giang  6190
## 49                        Da Nang  6099
## 50                     Ninh Thuan  5774
## 51                      Dien Bien  5481
## 52                       Bac Lieu  5370
## 53                      Hai Phong  5099
## 54                       Nam Dinh  5099
## 55                       Cao Bang  4571
## 56                        Kon Tum  4445
## 57                       Lai Chau  3232
## 58                       Ha Giang  3099
## 59                        Bac Kan  2866
## 60                       An Giang     1
## 61 Cuc Nha truong - Bo Quoc phong     1
## 62                        Nghe An     1
## 63                     Quang Ninh     1
## 64                         Son La     1

Calculate the proportion of the students who gained >=25 score for KhoiA

df_exam %>% 
  select(province, KhoiA) %>% 
  na.omit() %>% 
  group_by(province) %>% 
  summarise(tot = length(KhoiA)) -> df_exam_tot # count the total number of students who attended the KhoiA's exam for each province

df_exam_tot

## # A tibble: 59 x 2
##    province          tot
##    <chr>           <int>
##  1 Ba Ria-Vung Tau  6717
##  2 Bac Giang        5064
##  3 Bac Kan           345
##  4 Bac Lieu         2208
##  5 Bac Ninh         5221
##  6 Ben Tre          5408
##  7 Binh Dinh        9884
##  8 Binh Duong       6383
##  9 Binh Phuoc       5367
## 10 Binh Thuan       4636
## # ... with 49 more rows

df_exam  %>% 
  select(province, KhoiA) %>% 
  na.omit() %>% 
  filter(KhoiA >=25) %>% 
  group_by(province) %>% 
  count() -> df_exam_count # count the number of student who gained >=25 scores in KhoiA for each province
df_exam_count

## # A tibble: 58 x 2
## # Groups:   province [58]
##    province            n
##    <chr>           <int>
##  1 Ba Ria-Vung Tau     5
##  2 Bac Giang          33
##  3 Bac Lieu            1
##  4 Bac Ninh           47
##  5 Ben Tre             5
##  6 Binh Dinh           6
##  7 Binh Duong          6
##  8 Binh Phuoc          5
##  9 Binh Thuan          1
## 10 Ca Mau              1
## # ... with 48 more rows

full_join(df_exam_count, df_exam_tot, by = "province" ) -> df_final

df_rate <- df_final %>% 
  mutate(rate = (n/tot)*100) %>% 
  arrange(-rate) # calculate the proportion of student who achieved >=25 score for KhoiA in each province
df_rate

## # A tibble: 59 x 4
## # Groups:   province [59]
##    province      n   tot  rate
##    <chr>     <int> <int> <dbl>
##  1 Ha Giang     27   581 4.65 
##  2 Bac Ninh     47  5221 0.900
##  3 Thai Binh    91 10149 0.897
##  4 Ninh Binh    31  3514 0.882
##  5 Hoa Binh     20  2356 0.849
##  6 Thanh Hoa    44  5497 0.800
##  7 Vinh Phuc    30  4070 0.737
##  8 Hai Phong    14  1961 0.714
##  9 Nam Dinh     18  2715 0.663
## 10 Bac Giang    33  5064 0.652
## # ... with 49 more rows

Excersice 2

Find any flaw of the following bar plot

The bar plot is adequate in terms of both aesthetic and the content. It displays sufficient information so that the audiences could get the whole picture at the first glance. A minor point is that the PCI scale used to categorize provinces into groups might not be efficient to clearly distinguish provinces by PCI.

Produce a bar plot with the same format using pci dataset

pci_data <- read_excel("D:/2020/R course/data_day12/du-lieu-pci-2018.xlsx", n_max = 64)

pci_data$Rank <- as.character(pci_data$"Xếp hạng")
names(pci_data)[3] <- "PCI" 
names(pci_data)[1] <- "Province"

pci_data %>% 
  mutate_at("Province", function(x) {stri_trans_general(x, "Latin-ASCII")}) %>% 
  unite("TP.Rank", c("Province", Rank), sep=" ") %>% 
  mutate(category = case_when(PCI >= 70 ~ "Very good",
                              PCI < 70 & PCI >= 65 ~ "Good",
                              PCI < 65 & PCI >= 63 ~ "Fairly good",
                              PCI < 63 & PCI >= 60 ~ "Average",
                              TRUE ~ "Fairly low")) %>% 
  mutate(PCI.2 = format(round(PCI, 2), nsmall = 2))-> pci_data_final

pci_data_final$category_factored <- factor(pci_data_final$category, 
                                           levels = c("Very good", 
                                                      "Good", 
                                                      "Fairly good",
                                                      "Average", 
                                                      "Fairly low"))
pci_data_final %>% 
  ggplot(aes(x = reorder(TP.Rank, PCI), y = PCI, fill= category_factored )) +
  geom_col(width = 0.7)+
  theme_bw()+
  xlab(NULL)+
  ylab("PCI")+
  scale_y_continuous(expand = c(0,0))+
  ggtitle("Provincial Competitiveness Index (PCI) 2018")+
  theme(legend.title = element_blank())+
  theme(axis.line = element_line(colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank())+ 
  geom_text(aes(label = PCI.2), size= 2.5, hjust = 1)+
  coord_flip()

Excercise 3

Generate a function to group variable x according to the following conditions:

x >= 9 -> “GroupA”

8 <= x < 9 -> “GroupB”

7 <= x < 8 -> “GroupC”

6 <= x < 7 -> “GroupD”

5 <= x < 6 -> “GroupE”

x < 5 -> “GroupF”

in which x is a real number, only receive the value within [0, 10]

group <- function (x) {
  
  sapply(x, function (x)
  if (x < 0 | x > 10) stop("x must be within 0 and 10")
  else if (x >= 9) return("GroupA")
  else if (x >= 8 & x < 9) return("GroupB")
  else if (x >= 7 & x < 8) return("GroupC")
  else if (x >= 6 & x < 7) return("GroupD")
  else if (x >= 5 & x < 6) return("GroupE")
  else return("GroupF"))

}

Using this function to:

calculate the rate of GroupA student in Math among KhoiA taking students

KhoiA_tot <- df_exam %>% 
   filter(!is.na(KhoiA)) %>% 
   filter(!is.na(Math)) %>% 
   mutate(category = group(Math)) %>% 
   group_by(province) %>% 
   count(category) %>% 
   summarise(tot = sum(n)) # total KhoiA student who took Math exam 

KhoiA_groupA <- df_exam %>% 
   filter(!is.na(KhoiA)) %>% 
   filter(!is.na(Math)) %>% 
   mutate(category = group(Math)) %>% 
   filter (category == "GroupA") %>% 
   group_by(province) %>% 
   count(category) # the number of KhoiA students who belong to GroupA in Math

KhoiA <- full_join(KhoiA_tot, KhoiA_groupA, by = "province") %>% 
  select(-category) %>% 
  mutate(n = replace(n,is.na(n),0))

KhoiA %>% 
  mutate(groupA_rate = (n/tot)*100) %>% 
  select(province, groupA_rate) %>% 
  arrange(-groupA_rate) -> KhoiA_rate

head(KhoiA_rate)

## # A tibble: 6 x 2
##   province  groupA_rate
##   <chr>           <dbl>
## 1 Ha Giang        4.48 
## 2 Hoa Binh        0.849
## 3 Phu Tho         0.696
## 4 Ha Noi          0.397
## 5 Hai Phong       0.306
## 6 Thanh Hoa       0.291

Given only the number of students in KhoiB, calculate the rate of GroupA students in Biology for each province

KhoiB_tot <- df_exam %>% 
   filter(!is.na(KhoiB)) %>% 
   filter(!is.na(Biology)) %>% 
   mutate(category = group(Biology)) %>% 
   group_by(province) %>% 
   count(category) %>% 
   summarise(tot = sum(n))

KhoiB_groupA <- df_exam %>% 
   filter(!is.na(KhoiB)) %>% 
   filter(!is.na(Biology)) %>% 
   mutate(category = group(Biology)) %>% 
   filter (category == "GroupA") %>% 
   group_by(province) %>% 
   count(category) 

KhoiB <- full_join(KhoiB_tot, KhoiB_groupA, by = "province")  %>% 
  select(-category) %>% 
  mutate(n = replace(n,is.na(n),0))

KhoiB %>% 
  mutate(groupA_rate = (n/tot)*100) %>% 
  select(province, groupA_rate) %>% 
  arrange(-groupA_rate) -> KhoiB_rate

head(KhoiB_rate)

## # A tibble: 6 x 2
##   province  groupA_rate
##   <chr>           <dbl>
## 1 Ha Giang        0.526
## 2 Dien Bien       0.510
## 3 Kon Tum         0.375
## 4 Ha Tinh         0.296
## 5 Bac Kan         0.288
## 6 Hai Duong       0.190

Addtional excercises

Function to calculate the area of a regtangle with

reg_area <- function (a, b) {
   
  area <- a * b
  
  return(area)
}

Check the function

 reg_area( a = 7, b = 3)

## [1] 21

Function to calculate households’ electricity cost following the current regulation

current_regulation <- function(x){
  sapply(x, function (x)
    if (x < 0) stop("x must be larger than 0")
    else if(x >= 0 & x <= 50) return(x * 1549)
    else if(x >= 51 & x <= 100) return(x * 1600)
    else if(x >= 101 & x <= 200) return(x * 1858)
    else if(x >= 201 & x <= 300) return(x * 2340)
    else if(x >= 301 & x <= 400) return(x * 2615)
    else return(x * 2701))
  
}

# Check the function

x <- c(50, 78, 145, 211, 316, 500)

cost <- current_regulation(x)

cost

## [1]   77450  124800  269410  493740  826340 1350500

cost/x

## [1] 1549 1600 1858 2340 2615 2701

Function to calculate household’s electricity cost following the proposed regulation

proposed_regulation <- function(x){
  sapply(x, function (x)
  if (x < 0) stop("x must be larger than 0")
  else if(x >= 0 & x <= 100) return(x * 1549)
  else if(x >= 101 & x <= 200) return(x * 1858)
  else if(x >= 201 & x <= 400) return(x * 2340)
  else if(x >= 400 & x <= 701) return(x * 2701)
  else return(x * 3105))
}
 # Check the function

x <- c(50, 78, 145, 211, 316, 500, 800)

cost <- proposed_regulation(x)

cost

## [1]   77450  120822  269410  493740  739440 1350500 2484000

cost/x

## [1] 1549 1549 1858 2340 2340 2701 3105

Excercise day 2

Hoa Do

2021-01-08

Excersice 1

Excersice 2

Excercise 3

Addtional excercises