I. Activate packages

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(patchwork)

II. Set directory

1. Set the working directory:

# Make the folder that holds the exam data files the working directory, so
# the files can be read below by bare file name.
# NOTE(review): absolute, machine-specific Windows path — change it (or switch
# to a project-relative path) before running this on another computer.
setwd("c:\\users\\laptop\\desktop\\data\\DiemThiTHPT2018")

2. Inspect directory

Find out the current directory:

# Print the current working directory to confirm that setwd() took effect.
getwd()
## [1] "C:/Users/LapTop/Desktop/data/DiemThiTHPT2018"

Check folders and files in the data directory:

# List the files and folders found in the working directory
# (dir() is simply an alias of list.files()).
list.files()
## [1] "Bài về nhà tuần 3_R.R"                
## [2] "điểm TB tỉnh Hà Giang với tỉnh khác.R"
## [3] "diem-thi-thpt-2018.csv"               
## [4] "Diem_Thi_2018.html"                   
## [5] "Diem_Thi_2018.rmd"                    
## [6] "prov-code.csv"                        
## [7] "Rplot.png"                            
## [8] "Rplot01.png"                          
## [9] "TL_GS_PGS.png"

3. Import Data

Data files: diem-thi-thpt-2018.csv (exam scores) and prov-code.csv (province codes)

# Load the exam-score table from the working directory; readr guesses the
# column types and prints the specification it settled on.
DT <- read_csv(file = "diem-thi-thpt-2018.csv")
## Parsed with column specification:
## cols(
##   ID = col_integer(),
##   SoBD = col_integer(),
##   Math = col_double(),
##   Viet = col_double(),
##   English = col_double(),
##   Physics = col_double(),
##   Chemistry = col_double(),
##   Biology = col_double(),
##   History = col_double(),
##   Geography = col_double(),
##   GDCD = col_double(),
##   KhoiA = col_double(),
##   KhoiB = col_double(),
##   KhoiC = col_double(),
##   KhoiD = col_double(),
##   KhoiA1 = col_double(),
##   MaTinh = col_integer()
## )
# Load the lookup table that pairs each province code with its label.
Pro <- read_csv(file = "prov-code.csv")
## Parsed with column specification:
## cols(
##   ProvID = col_integer(),
##   ProvLabel = col_character()
## )

Check data structure:

# Show every column of the score table with its type and first few values;
# give.attr = FALSE leaves the object's attributes out of the listing.
str(DT, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame':    478333 obs. of  17 variables:
##  $ ID       : int  478337 478338 478339 478340 478341 478342 478343 478344 478345 478346 ...
##  $ SoBD     : int  2058543 2058560 2058561 2058565 2058534 2058542 2058544 2058546 2058548 2058550 ...
##  $ Math     : num  4.6 5.4 5.6 6 4.4 4.6 5.8 6.2 7.6 3.6 ...
##  $ Viet     : num  4 5.5 6 7.25 5 6.5 4.5 5.5 5.25 4 ...
##  $ English  : num  2.2 3.4 2.6 5.8 2.4 4.2 3.6 3.2 3.6 4.6 ...
##  $ Physics  : num  5.75 NA NA 2 NA NA 4 3.75 NA 3.5 ...
##  $ Chemistry: num  5.5 NA NA 3.5 NA NA 5 6.25 NA 4 ...
##  $ Biology  : num  5 NA NA 4.25 NA NA 4.25 5.75 NA 5 ...
##  $ History  : num  2.5 3.25 3.25 6.5 3.5 2.5 5.5 3.5 3.75 4 ...
##  $ Geography: num  4 6.25 5.5 8.25 5.5 4 5.75 5 6.75 5.75 ...
##  $ GDCD     : num  4.25 8 8.5 8.75 8.5 6.5 6.75 6.75 9.5 8.5 ...
##  $ KhoiA    : num  17.2 NA NA 10.5 NA ...
##  $ KhoiB    : num  16.5 NA NA 12.8 NA ...
##  $ KhoiC    : num  10.5 15 14.8 22 14 ...
##  $ KhoiD    : num  10.8 14.3 14.2 19.1 11.8 ...
##  $ KhoiA1   : num  15.8 NA NA 9.2 NA ...
##  $ MaTinh   : int  2 2 2 2 2 2 2 2 2 2 ...
# Same structural overview for the province lookup table, again without the
# object's attributes.
str(Pro, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame':    63 obs. of  2 variables:
##  $ ProvID   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ProvLabel: chr  "HA NOI" "HO CHI MINH" "HAI PHONG" "DA NANG" ...

III. Data processing

1. Create the summary table

# Attach the province label to every exam record (MaTinh is matched to ProvID).
DT <- DT %>%
  left_join(Pro, by = c("MaTinh" = "ProvID"))

# Build one row per province holding the lowest, highest and average GDCD
# score (missing scores ignored), then add the average over all records and a
# logical flag `mean` telling whether the province's average exceeds it.
DT_2018 <- DT %>%
  group_by(MaTinh, ProvLabel) %>%
  summarise(
    min_gdcd = min(GDCD, na.rm = TRUE),
    max_gdcd = max(GDCD, na.rm = TRUE),
    mean_gdcd = mean(GDCD, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(
    mean_score_gdcd = mean(DT$GDCD, na.rm = TRUE),
    mean = mean_gdcd > mean_score_gdcd
  )

2. Draw the chart

# GDCD scores by province: a bar spanning the lowest to the highest score, a
# point at the provincial average, and a reference line at the average over
# all records. Axes are flipped so province names read horizontally.
ggplot(DT_2018, aes(x = ProvLabel, y = mean_gdcd)) +
  # x is inherited from the plot-level mapping; only the range is mapped here.
  geom_linerange(aes(ymin = min_gdcd, ymax = max_gdcd), color = "yellow") +
  # Take the overall mean from the summary table (it holds the same value in
  # every row) instead of repeating it as a hand-typed constant, so the line
  # stays correct whenever the data change.
  geom_hline(yintercept = DT_2018$mean_score_gdcd[1], color = "black") +
  # A fixed colour must be set outside aes(): inside aes() the string "red" is
  # treated as data and the points get the default palette colour instead.
  geom_point(color = "red") +
  coord_flip() +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank()) +
  scale_y_continuous(
    name = "Average scores in the provinces",
    breaks = seq(0, 10, 1)
  ) +
  scale_x_discrete(name = "Province")