library(tidyverse); library(ggplot2); library(gridExtra); library(readxl)
## Warning: package 'tidyverse' was built under R version 4.0.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.2
## Warning: package 'tibble' was built under R version 4.0.2
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'readr' was built under R version 4.0.2
## Warning: package 'dplyr' was built under R version 4.0.2
## Warning: package 'stringr' was built under R version 4.0.2
## Warning: package 'forcats' was built under R version 4.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Reading data

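The data files can be downloaded from https://github.com/tuanvnguyen/General-datasets/tree/main. The code below reads them from a local folder, so adjust the file paths to wherever you saved them.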

# Load the workbook named for Vietnamese authors' career data and multiply
# `selfcites` by 100 (presumably proportion -> percent, since later code
# compares it against 12 -- confirm against the workbook).
df <- read_excel("~/Dropbox/Science Evaluation/Most Cited Scientists 2019 and 2020/Updated 2021/2021 Fin VNese authors career for R.xlsx") %>%
  mutate(selfcites = selfcites * 100)

# Same treatment for the full author list; the result is converted to a
# base data.frame instead of staying a tibble.
cit <- read_excel("~/Dropbox/Science Evaluation/Most Cited Scientists 2019 and 2020/Updated 2021/Authors career 2021 Full List.xlsx") %>%
  mutate(selfcites = selfcites * 100) %>%
  as.data.frame()

Some descriptive analyses

# Tally the rows of `df` (one per scientist) by research field.
count(df, field)
## # A tibble: 16 x 2
##    field                                        n
##    <chr>                                    <int>
##  1 Agriculture, Fisheries & Forestry            5
##  2 Biology                                      3
##  3 Biomedical Research                          4
##  4 Built Environment & Design                   1
##  5 Chemistry                                    8
##  6 Clinical Medicine                           22
##  7 Earth & Environmental Sciences               1
##  8 Economics & Business                         3
##  9 Enabling & Strategic Technologies           19
## 10 Engineering                                 28
## 11 Information & Communication Technologies    33
## 12 Mathematics & Statistics                     2
## 13 Physics & Astronomy                         14
## 14 Psychology & Cognitive Sciences              1
## 15 Public Health & Health Services              2
## 16 Social Sciences                              2
# Among rows whose country code is "vnm", count how many have a
# self-citation value above 12 versus at or below it.
df %>%
  filter(country == "vnm") %>%
  count(selfcites > 12)
## # A tibble: 2 x 2
##   `selfcites > 12`     n
##   <lgl>            <int>
## 1 FALSE                3
## 2 TRUE                25
# Median and mean of the self-citation value within each country group.
df %>%
  group_by(country) %>%
  summarise(
    med = median(selfcites),
    mean = mean(selfcites)
  )
## # A tibble: 18 x 3
##    country   med  mean
##    <chr>   <dbl> <dbl>
##  1 aus      15.1  16.3
##  2 aut       6.2   6.2
##  3 can      12.9  11.6
##  4 deu      13.9  13.9
##  5 fra      12.8  13.0
##  6 gbr      14.3  15.3
##  7 hkg      16.5  16.5
##  8 jpn      24.6  24.6
##  9 kor       5     5  
## 10 nor      13.7  13.7
## 11 pol      37.6  37.6
## 12 sau      20.7  20.7
## 13 sgp      13.4  12.5
## 14 tha      20.9  20.9
## 15 twn      26.8  34.2
## 16 usa      11.5  13.9
## 17 vnm      24.1  25.6
## 18 <NA>     17.6  15.4

Visualizing data

# Bar chart of the number of scientists per research field. The legend is
# hidden because the x axis already names each field; axis labels are
# rotated 45 degrees so the long field names do not overlap.
df %>%
  count(field) %>%
  ggplot(aes(x = field, y = n, fill = field)) +
  geom_col() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(x = "Lãnh vực", y = "Số nhà khoa học")

# Box plot of the self-citation value by country, with the individual
# scientists overlaid as semi-transparent points.
df %>%
  # Drop rows with a missing or empty country code. `country != ""` alone
  # already discards NA (the comparison yields NA), but say so explicitly.
  filter(!is.na(country), country != "") %>%
  ggplot(aes(x = country, y = selfcites, col = country)) +
  # Every observation is drawn by geom_jitter() below, so suppress the
  # box plot's own outlier markers to avoid plotting those points twice.
  geom_boxplot(outlier.shape = NA) +
  # Jitter horizontally only: vertical noise would move points away from
  # their true self-citation value.
  geom_jitter(height = 0, alpha = 0.5) +
  theme(legend.position = "none") +
  labs(x = "Nước", y = "Tỉ lệ tự trích dẫn")

# Histogram of the `h.index` column across all rows of `df`, using
# ggplot2's default binning (blue bars with white outlines).
ggplot(df, aes(x = h.index)) +
  geom_histogram(fill = "blue", col = "white") +
  theme(legend.position = "none") +
  labs(x = "Chỉ số H", y = "Số nhà khoa học")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Full dataset

# Box plot of the self-citation value for a selected set of countries in the
# full author list, with a dashed red reference line at 12.
cit %>%
  filter(country %in% c("ind", "irn", "vnm", "usa", "aus", "can", "fra")) %>%
  ggplot(aes(x = country, y = selfcites, col = country)) +
  geom_boxplot() +
  # Pass the constant as a parameter, not inside aes(): as an aesthetic it is
  # evaluated against the layer data and one line is drawn per data row.
  geom_hline(yintercept = 12, linetype = "dashed", color = "red") +
  theme(legend.position = "none") +
  labs(x = "Quốc gia", y = "Tỉ lệ tự trích dẫn")