Background

This is the good thing about open databases. Although Google Scholar (GS) is not a completely open database, the people at Google let us do the following things using R and the scholar package. This post was my first attempt at using the scholar package to harvest GS data.

Getting your GS ID

Your GS ID appears in the URL of your profile as the value of the `user=` parameter (everything between `user=` and the next `&`), for instance:

  1. Erwin: url https://scholar.google.co.id/citations?user=Myvc78MAAAAJ&hl=en&authuser=1; GS ID “Myvc78MAAAAJ”
  2. Prof. Edy Soewono: url https://scholar.google.co.id/citations?user=6U_YtvMAAAAJ&hl=en&authuser=1; GS ID “6U_YtvMAAAAJ”
  3. Prof. Khairurrijal: url https://scholar.google.co.id/citations?user=4_asJ0MAAAAJ&hl=en&authuser=1; GS ID “4_asJ0MAAAAJ”
  4. Dr. Jon Tennant: url https://scholar.google.co.id/citations?user=P7FvGMEAAAAJ&hl=en; GS ID “P7FvGMEAAAAJ”

Installing packages

In my trial, I used the following packages:

  1. scholar
  2. tidyverse
  3. gridExtra
# Remove the leading '#' from the install lines if you haven't installed the packages yet
#install.packages("scholar")
library(scholar)
#install.packages("tidyverse")
library(tidyverse)
#install.packages("gridExtra")
library(gridExtra)

Storing GS ID

# Google Scholar IDs: the value of the `user=` query parameter of each profile
# URL. The ID ends before the next "&"; a trailing "&hl" is the beginning of
# the following URL parameter (`hl=en`), not part of the ID, so it is dropped.
E <- "Myvc78MAAAAJ"   # Erwin (hydrogeology)
ES <- "6U_YtvMAAAAJ"  # Prof. Edy Soewono (math)
Kh <- "4_asJ0MAAAAJ"  # Prof. Khairurrijal (physics)
JT <- "P7FvGMEAAAAJ"  # Dr. Jon Tennant (paleontology)

Getting GS profiles information

# Fetch each scholar's profile and show it right away; the bare variable name
# on the second line of each pair relies on top-level auto-printing.
E.profile <- get_profile(id = E)
E.profile

ES.profile <- get_profile(id = ES)
ES.profile

Kh.profile <- get_profile(id = Kh)
Kh.profile

JT.profile <- get_profile(id = JT)
JT.profile

How many papers have they published?

# Number of articles listed on each scholar's profile.
E.num <- get_num_articles(id = E)
ES.num <- get_num_articles(id = ES)
Kh.num <- get_num_articles(id = Kh)
JT.num <- get_num_articles(id = JT)

# One bar per scholar, labelled with the scholar's short name.
num <- c(E.num, ES.num, Kh.num, JT.num)
barplot(height = num, names.arg = c("E", "ES", "Kh", "JT"))

Comparing careers

# IDs of the scholars to compare; note that E is not part of this vector.
IDs <- c(ES, Kh, JT)

# Compare the selected scholars; the result is auto-printed at top level.
compare_scholar_careers(ids = IDs, career = TRUE)

Getting citation history

# Citation history for each scholar. All three calls below succeeded.
ES.cite.year <- get_citation_history(id = ES)
Kh.cite.year <- get_citation_history(id = Kh)
JT.cite.year <- get_citation_history(id = JT)

# The same call for E raised an unexplained error, so it is left disabled:
# E.cite.year <- get_citation_history(E)

Plotting citation history

# Build a bar chart of citations per year from a citation-history data frame.
#
# cite_year: data frame with `year` and `cites` columns (the two columns the
#            aesthetics below map to x and y).
# fill:      fill colour of the bars.
# breaks:    x-axis tick positions; defaults to the years 2009-2017 shared by
#            the three plots below.
#
# Returns the ggplot object without drawing it.
plot_cite_history <- function(cite_year, fill, breaks = 2009:2017) {
  ggplot(cite_year, aes(year, cites)) +
    # geom_col() is the idiomatic form of geom_bar(stat = "identity").
    geom_col(fill = fill) +
    scale_x_continuous(breaks = breaks)
}

ES.hist <- plot_cite_history(ES.cite.year, fill = colors()[128])
Kh.hist <- plot_cite_history(Kh.cite.year, fill = colors()[120])
JT.hist <- plot_cite_history(JT.cite.year, fill = colors()[118])

# Render the three charts (top-level auto-printing draws each plot).
ES.hist
Kh.hist
JT.hist

## My workaround
# get_citation_history(E) failed, so this section plots the citation history
# of ONE article instead: the publication whose title contains "Ultrafast".
# E.cite.year therefore holds a single article's citations per year, not the
# citation history of the whole profile.
E.pub.list <- get_publications(E)

# Turn factor columns into plain character vectors. stringsAsFactors = FALSE
# keeps as.data.frame() from converting them straight back to factors on
# R versions older than 4.0.
E.pub.list <- as.data.frame(
  lapply(E.pub.list, function(x) if (is.factor(x)) as.character(x) else x),
  stringsAsFactors = FALSE
)

# Locate the article by title and show what was matched.
w <- grep("Ultrafast", E.pub.list$title)
E.pub.list$title[w]
E.pub.list$pubid[w]

# The lookup below is for a single article, so fail with a clear message when
# the title search matched nothing or more than one publication.
if (length(w) != 1L) {
  stop("Expected exactly one publication matching 'Ultrafast', found ",
       length(w), call. = FALSE)
}
E.cite.year <- get_article_cite_history(E, E.pub.list$pubid[w])

E.hist <- ggplot(E.cite.year, aes(year, cites)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(fill = colors()[110]) +
  scale_x_continuous(breaks = 2000:2017) +
  # Rotate the year labels so the 18 ticks do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Render the chart, as is done for ES.hist, Kh.hist and JT.hist.
E.hist

Getting number of unique journals

# Number of distinct journals each scholar has published in; every result is
# auto-printed at top level.
get_num_distinct_journals(id = E)
get_num_distinct_journals(id = ES)
get_num_distinct_journals(id = Kh)
get_num_distinct_journals(id = JT)

Getting number of top journals

# Number of each scholar's papers in top journals, using the package's default
# journal list (no `journals` argument is supplied); results are auto-printed.
get_num_top_journals(id = E)
get_num_top_journals(id = ES)
get_num_top_journals(id = Kh)
get_num_top_journals(id = JT)

When did they become scientists?

# Oldest article on each profile, used here as a proxy for when each scholar
# started publishing; results are auto-printed.
get_oldest_article(id = E)
get_oldest_article(id = ES)
get_oldest_article(id = Kh)
get_oldest_article(id = JT)

Getting publications and writing them to a CSV file

# Download E's publication list and save it as "citations.csv" in the current
# working directory.
E.pubs <- get_publications(E)
# Write the data frame that was just fetched. The earlier code passed `pubs`,
# which is not defined anywhere in the visible script, so the call would fail
# with "object 'pubs' not found".
write.csv(E.pubs, file = "citations.csv")

Predicting their h-index

# Predicted h-index for each scholar. The results are only stored here; type a
# variable name (e.g. E.x) to inspect one.
E.x <- predict_h_index(id = E)
ES.x <- predict_h_index(id = ES)
Kh.x <- predict_h_index(id = Kh)
JT.x <- predict_h_index(id = JT)