library(tidyverse); library(ggplot2); library(gridExtra); library(readxl)
## Warning: package 'tidyverse' was built under R version 4.0.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.2
## Warning: package 'tibble' was built under R version 4.0.2
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'readr' was built under R version 4.0.2
## Warning: package 'dplyr' was built under R version 4.0.2
## Warning: package 'stringr' was built under R version 4.0.2
## Warning: package 'forcats' was built under R version 4.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Data can be downloaded from https://github.com/tuanvnguyen/General-datasets/tree/main
# Author-level citation data read from two Excel workbooks.
# df:  the "Fin VNese authors" workbook (by its file name, presumably the
#      Vietnam-related subset -- confirm against the data source).
# cit: the "Full List" workbook.
# In both tables selfcites is multiplied by 100 right after loading
# (presumably converting a proportion to a percentage -- confirm).
df <- read_excel(
  "~/Dropbox/Science Evaluation/Most Cited Scientists 2019 and 2020/Updated 2021/2021 Fin VNese authors career for R.xlsx"
)
df$selfcites <- 100 * df$selfcites

cit <- read_excel(
  "~/Dropbox/Science Evaluation/Most Cited Scientists 2019 and 2020/Updated 2021/Authors career 2021 Full List.xlsx"
)
cit$selfcites <- 100 * cit$selfcites
# cit is converted to a plain data.frame; df stays as returned by read_excel()
cit <- as.data.frame(cit)
# Tally the rows of df per research field (one row per field, count in n)
count(df, field)
## # A tibble: 16 x 2
## field n
## <chr> <int>
## 1 Agriculture, Fisheries & Forestry 5
## 2 Biology 3
## 3 Biomedical Research 4
## 4 Built Environment & Design 1
## 5 Chemistry 8
## 6 Clinical Medicine 22
## 7 Earth & Environmental Sciences 1
## 8 Economics & Business 3
## 9 Enabling & Strategic Technologies 19
## 10 Engineering 28
## 11 Information & Communication Technologies 33
## 12 Mathematics & Statistics 2
## 13 Physics & Astronomy 14
## 14 Psychology & Cognitive Sciences 1
## 15 Public Health & Health Services 2
## 16 Social Sciences 2
# Among rows with country code "vnm", count how many have a self-citation
# rate above 12 (selfcites was scaled by 100 when the data was loaded)
df %>%
  filter(country == "vnm") %>%
  count(selfcites > 12)
## # A tibble: 2 x 2
## `selfcites > 12` n
## <lgl> <int>
## 1 FALSE 3
## 2 TRUE 25
# Per-country median (med) and arithmetic mean (mean) of selfcites
df %>%
  group_by(country) %>%
  summarise(
    med = median(selfcites),
    mean = mean(selfcites)
  )
## # A tibble: 18 x 3
## country med mean
## <chr> <dbl> <dbl>
## 1 aus 15.1 16.3
## 2 aut 6.2 6.2
## 3 can 12.9 11.6
## 4 deu 13.9 13.9
## 5 fra 12.8 13.0
## 6 gbr 14.3 15.3
## 7 hkg 16.5 16.5
## 8 jpn 24.6 24.6
## 9 kor 5 5
## 10 nor 13.7 13.7
## 11 pol 37.6 37.6
## 12 sau 20.7 20.7
## 13 sgp 13.4 12.5
## 14 tha 20.9 20.9
## 15 twn 26.8 34.2
## 16 usa 11.5 13.9
## 17 vnm 24.1 25.6
## 18 <NA> 17.6 15.4
# Bar chart: number of scientists per research field, one coloured bar per
# field, legend hidden, x-axis labels rotated 45 degrees.
df %>%
  count(field) %>%
  ggplot(aes(x = field, y = n, fill = field)) +
  geom_col() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(x = "Lãnh vực", y = "Số nhà khoa học")
# Boxplot of the self-citation rate per country with the individual
# scientists overlaid as semi-transparent jittered points; legend hidden.
df %>%
  # Rows with a missing or empty country code are excluded. The original
  # `country != ""` already dropped NA rows implicitly (filter() discards rows
  # where the condition is NA); this spells it out.
  filter(!is.na(country), country != "") %>%
  ggplot(aes(x = country, y = selfcites, colour = country)) +
  # Every observation is drawn by the jitter layer below, so the boxplot's own
  # outlier points are suppressed to avoid plotting those scientists twice.
  geom_boxplot(outlier.shape = NA) +
  # height = 0: spread points horizontally only. The default also jitters
  # vertically, which would shift the plotted self-citation values.
  geom_jitter(height = 0, alpha = 0.5) +
  theme(legend.position = "none") +
  labs(x = "Nước", y = "Tỉ lệ tự trích dẫn")
# Histogram of the h-index (column h.index) over the scientists in df.
df %>%
  ggplot(aes(x = h.index)) +
  # bins is given explicitly: 30 is the value stat_bin() fell back to before,
  # so the plot is unchanged but the "Pick better value with `binwidth`"
  # message is no longer emitted on every run.
  geom_histogram(bins = 30, colour = "white", fill = "blue") +
  labs(x = "Chỉ số H", y = "Số nhà khoa học")
# Boxplot of the self-citation rate for a fixed set of country codes taken
# from the full author list (cit), with a dashed red reference line at 12;
# legend hidden.
cit %>%
  filter(country %in% c("ind", "irn", "vnm", "usa", "aus", "can", "fra")) %>%
  ggplot(aes(x = country, y = selfcites, colour = country)) +
  geom_boxplot() +
  # The reference level is a constant, so it is passed as a parameter. Mapped
  # inside aes() it is evaluated against the layer data and one identical
  # line is drawn per row.
  geom_hline(yintercept = 12, linetype = "dashed", colour = "red") +
  theme(legend.position = "none") +
  labs(x = "Quốc gia", y = "Tỉ lệ tự trích dẫn")