library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(patchwork)
# Point the session at the folder that holds the data files; the relative
# file names passed to read_csv() further down are resolved against it.
# NOTE(review): this is a machine-specific absolute Windows path, so the
# script will fail on any other computer — consider an RStudio project or
# paths relative to the script instead of setwd().
setwd("c:\\users\\laptop\\desktop\\data\\DiemThiTHPT2018")
Find out the current directory:
# Print the working directory to confirm that the setwd() call took effect.
getwd()
## [1] "C:/Users/LapTop/Desktop/data/DiemThiTHPT2018"
Check folders and files in the data directory:
# dir() is an alias of list.files(); called without arguments it lists the
# entries of the current working directory.
dir()
## [1] "Bài về nhà tuần 3_R.R"
## [2] "điểm TB tỉnh Hà Giang với tỉnh khác.R"
## [3] "diem-thi-thpt-2018.csv"
## [4] "Diem_Thi_2018.html"
## [5] "Diem_Thi_2018.rmd"
## [6] "prov-code.csv"
## [7] "Rplot.png"
## [8] "Rplot01.png"
## [9] "TL_GS_PGS.png"
Load the exam-score data from diem-thi-thpt-2018.csv:
# Exam-score table; the file name is relative to the working directory and the
# column types are guessed by readr.
DT <- read_csv(file = "diem-thi-thpt-2018.csv")
## Parsed with column specification:
## cols(
## ID = col_integer(),
## SoBD = col_integer(),
## Math = col_double(),
## Viet = col_double(),
## English = col_double(),
## Physics = col_double(),
## Chemistry = col_double(),
## Biology = col_double(),
## History = col_double(),
## Geography = col_double(),
## GDCD = col_double(),
## KhoiA = col_double(),
## KhoiB = col_double(),
## KhoiC = col_double(),
## KhoiD = col_double(),
## KhoiA1 = col_double(),
## MaTinh = col_integer()
## )
# Lookup table with one row per province: numeric code (ProvID) and name
# (ProvLabel). The file name is relative to the working directory.
Pro <- read_csv(file = "prov-code.csv")
## Parsed with column specification:
## cols(
## ProvID = col_integer(),
## ProvLabel = col_character()
## )
Check data structure:
# Compact overview of DT's columns and types; give.attr = FALSE tells str()
# not to print the object's attributes as sub-structures.
str(DT, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 478333 obs. of 17 variables:
## $ ID : int 478337 478338 478339 478340 478341 478342 478343 478344 478345 478346 ...
## $ SoBD : int 2058543 2058560 2058561 2058565 2058534 2058542 2058544 2058546 2058548 2058550 ...
## $ Math : num 4.6 5.4 5.6 6 4.4 4.6 5.8 6.2 7.6 3.6 ...
## $ Viet : num 4 5.5 6 7.25 5 6.5 4.5 5.5 5.25 4 ...
## $ English : num 2.2 3.4 2.6 5.8 2.4 4.2 3.6 3.2 3.6 4.6 ...
## $ Physics : num 5.75 NA NA 2 NA NA 4 3.75 NA 3.5 ...
## $ Chemistry: num 5.5 NA NA 3.5 NA NA 5 6.25 NA 4 ...
## $ Biology : num 5 NA NA 4.25 NA NA 4.25 5.75 NA 5 ...
## $ History : num 2.5 3.25 3.25 6.5 3.5 2.5 5.5 3.5 3.75 4 ...
## $ Geography: num 4 6.25 5.5 8.25 5.5 4 5.75 5 6.75 5.75 ...
## $ GDCD : num 4.25 8 8.5 8.75 8.5 6.5 6.75 6.75 9.5 8.5 ...
## $ KhoiA : num 17.2 NA NA 10.5 NA ...
## $ KhoiB : num 16.5 NA NA 12.8 NA ...
## $ KhoiC : num 10.5 15 14.8 22 14 ...
## $ KhoiD : num 10.8 14.3 14.2 19.1 11.8 ...
## $ KhoiA1 : num 15.8 NA NA 9.2 NA ...
## $ MaTinh : int 2 2 2 2 2 2 2 2 2 2 ...
# Compact overview of Pro's columns and types; give.attr = FALSE tells str()
# not to print the object's attributes as sub-structures.
str(Pro, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 63 obs. of 2 variables:
## $ ProvID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ProvLabel: chr "HA NOI" "HO CHI MINH" "HAI PHONG" "DA NANG" ...
# Attach the province name to every exam record by matching the record's
# province code (MaTinh) against the lookup table's ProvID.
DT <- DT %>%
  left_join(Pro, by = c("MaTinh" = "ProvID"))

# One row per province holding the lowest, highest and mean GDCD score, plus
#   mean_score_gdcd: the GDCD mean over all records (same value on every row)
#   mean:            TRUE when the province mean is above that overall mean
DT_2018 <- DT %>%
  group_by(MaTinh, ProvLabel) %>%
  summarise(
    min_gdcd = min(GDCD, na.rm = TRUE),
    max_gdcd = max(GDCD, na.rm = TRUE),
    mean_gdcd = mean(GDCD, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(
    mean_score_gdcd = mean(DT$GDCD, na.rm = TRUE),
    mean = mean_gdcd > mean_score_gdcd
  )
# GDCD scores by province: a yellow bar spans each province's lowest to highest
# score, a red point marks the province mean, and a black reference line marks
# the mean over all records. coord_flip() puts the provinces on the vertical
# axis so their names stay readable.
ggplot(DT_2018, aes(ProvLabel, mean_gdcd)) +
  # x is inherited from the plot-level aes(); only the range limits are added.
  geom_linerange(aes(ymin = min_gdcd, ymax = max_gdcd), color = "yellow") +
  # Take the reference value from the summary table instead of a hard-coded
  # number (previously 7.157761, presumably a pasted copy of this value), so
  # the line stays correct when the data changes. The column is constant, so
  # the first element is enough.
  geom_hline(yintercept = DT_2018$mean_score_gdcd[1], color = "black") +
  # Colour is set outside aes(): inside aes() the string "red" is treated as a
  # data value and the points get the default palette colour, not red.
  geom_point(color = "red") +
  coord_flip() +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank()) +
  scale_y_continuous(
    name = "Average scores in the provinces",
    breaks = seq(0, 10, 1)
  ) +
  scale_x_discrete(name = "Province")