Interesting text data?

In my search for interesting text data to work on, I had the idea to look at TED talks over recent years that have included the topic of AI (artificial intelligence). In further posts I will be analysing this data, but this post is about the code I used to collect the data in the first place.

# Some packages

library(tidyverse)
library(data.table)
library(rvest)

# Some handy dandy functions

patientWaiter<- function(minutes_wait){
  # Ask R to wait a number of minutes before making another request to the server
  for (minute in 1:minutes_wait){
    cat("\n\n",minute, " :")
    for (second in 1:60) {
      Sys.sleep(1)
      cat("\t",second)
    }
  }
} 

get_talk_links<- function(pg){
  # Extract TED Talk links from the TED query result page
  alllinks<-  pg %>% html_nodes("a") %>% html_attr("href")
  goodlinks<- alllinks[alllinks %like% "/talks/"] %>% unique()
  return(goodlinks)
}

extract_meta_table<- function(pg){
  # Extract some information contained in the webpage under the meta tags
  all_meta_attrs <- unique(unlist(lapply(lapply(pg %>% html_nodes("meta"), html_attrs), names)))
  dat <- data.frame(lapply(all_meta_attrs, function(x) {
    pg %>% html_nodes("meta") %>% html_attr(x)
  }),stringsAsFactors = FALSE)
  colnames(dat) <- all_meta_attrs
  N<- nrow(dat)
  dat$RowNum<- 1:N
  return(dat)
}

newmatrix<- function(nr,nc,thing=NA) {
  # Create an empty data frame
  newmat<- matrix(rep(thing,nr*nc),nrow=nr,ncol=nc)
  newmat<- data.frame(newmat)
  return(newmat)
  
}

Let’s begin

Initially I did a manual search for AI-related talks from within the TED talks website, and made a note of the urls that I could see…


# Initial search query for AI-related TED Talks

STEM<-      "https://www.ted.com"
searchurl<- "https://www.ted.com/talks?sort=relevance&topics%5B%5D=Technology&language=en&q=AI"

By downloading the first page of search results (search_page1), I can see that this page contains links to the search results (individual talks) as well as links to further pages of search results (otherpages).


search_page1<- read_html(searchurl)

alllinks<-     search_page1 %>% html_nodes("a") %>% html_attr("href")
goodlinks<-    alllinks[alllinks %like% "/talks/"] %>% unique()

otherpages<-   alllinks[alllinks %like% "/talks\\?"] %>% print()
[1] "/talks?language=en&page=2&q=AI&sort=relevance&topics%5B%5D=Technology"
[2] "/talks?language=en&page=3&q=AI&sort=relevance&topics%5B%5D=Technology"
[3] "/talks?language=en&page=2&q=AI&sort=relevance&topics%5B%5D=Technology"
# [1] "/talks?language=en&page=2&q=AI&sort=relevance&topics%5B%5D=Technology"
# [2] "/talks?language=en&page=3&q=AI&sort=relevance&topics%5B%5D=Technology"
# [3] "/talks?language=en&page=2&q=AI&sort=relevance&topics%5B%5D=Technology"

Compile a list of sites to visit

From here I would like to put together a list of links for talks that I might want to download. I have used the get_talk_links function that I defined above, and then collected these into a single list of links. I could have done this part with the map function.

search_page2<- read_html(paste0(STEM, otherpages[1]))
search_page3<- read_html(paste0(STEM, otherpages[2]))
search_page4<- read_html(paste0(STEM, otherpages[3]))

link_list<- list()

link_list[[1]]<- get_talk_links(search_page1)
link_list[[2]]<- get_talk_links(search_page2)
link_list[[3]]<- get_talk_links(search_page3)
link_list[[4]]<- get_talk_links(search_page4)


talk_links<- unlist(link_list)
saveRDS(talk_links, "talk_links.rds")

Get ready to Scrape!

I can see that the ordinary links we have collected take us to the main TED video page. In order to visit the webpage that contains the text transcript, we have to tweak the links slightly…

# Convert ordinary links into links that will lead to the talk transcript
transcript_links<- map_chr(talk_links,~paste0(STEM,str_replace(.x, "\\?language=en","/transcript\\?language=en")))

Set up some empty objects into which to collect the data. I know preference varies quite widely on this point, but this time the data is quite well-behaved so a dataframe should be OK. I also like to save the raw html pages in a list in case I want to go back and extract any additional information later.

# Set up an empty data frame to hold the results
outdf<- newmatrix(length(transcript_links),9)
names(outdf)<- c("ID","date","author","long_title","keywords","description","title","author2","full_text")

# Set up a one-row version of the dataframe that can be easily collapsed
res<- newmatrix(1,9)
names(res)<- names(outdf)

# Set up an empty list to hold the raw html pages in case our parsing goes awry
page_list<- list()

Visit and Scrape

Now we’re ready to visit each of the talk transcript web pages in turn and collect the information we’re interested in.

for  (i in 1:length(transcript_links)){

    random_minutes<-  sample(2:6,1)  # Choose a random waiting time
    link<-            transcript_links[i]
    pg<-              read_html(link)
    page_list[[i]]<-  pg
    pnodes<-          pg %>% html_nodes("p") %>% html_text() %>% stripWhitespace() %>% trimws()
    res$full_text<-   str_c(pnodes, collapse=" ")

    meta<- extract_meta_table(pg)
    meta$itemprop[is.na(meta$itemprop)]<- ""
    meta$name[is.na(meta$name)]<- ""

    res$ID<-          i
    res$date<-        meta$content[meta$itemprop== "uploadDate"]
    res$author<-      meta$content[meta$name== "author"]
    res$long_title<-  meta$content[meta$name== "title"]
    res$keywords<-    meta$content[meta$name== "keywords"]
    res$description<- meta$content[meta$itemprop== "description"]
    res$title<-       meta$content[meta$itemprop== "name"][1]
    res$author2<-     meta$content[meta$itemprop== "name"][2]

    outdf[i,]<- res

    # I like to "save as I go" in case the scrape is interrupted part-way through
    saveRDS(outdf,"outdf.rds")
    outdf$full_text<- str_sub(outdf$full_text,1,32000) # Truncate for the Excel version
    writexl::write_xlsx(outdf,"TED_output.xlsx")
    saveRDS(page_list,"page_list.rds")

    # Some console output
    cat("\n\nWorking on talk number",i,": ",res$title, "-----------------------------------\n")
    patientWaiter(random_minutes)

}

Tidying up the output

We have now completed the main scraping task, and the data now exists in a local data frame and several saved copies. We finish up by doing some basic tidying…

# Remove duplicate talks
df<- df %>% filter(duplicated(title))  # 88 talks
 
# All transcripts carry the following sentence. This can be removed.

endTag<- "TED.com translations are made possible by volunteer translators. Learn more about the Open Translation Project. © TED Conferences, LLC. All rights reserved."

df$full_text<- str_remove_all(df$full_text, endTag)

# Convert to a date object
df$date<- lubridate::ymd(str_sub(df$date,1,10))

saveRDS(df,"TED_data.rds")

What have we collected?

Further analyses will be conducted in future posts, but just as a summary, let’s see how the talks we have collected are spread across recent years. We can see that while TED Talks about AI have been appearing for more than 13 years, there has been a dramatic increase in AI as a topic if interest in the past 3 to 4 years. We can see fewer talks from 2020 as we are only half-way through the year, and there may well be fewer TED talks this year due to the COVID-19 pandemic.

df %>% mutate(Year= year(date)) %>% count(Year) %>%
  ggplot(aes(x= Year, y=n)) +
  geom_col(fill= "lightblue", color= "grey50", size= 0.2) +
  geom_text(aes(label=n), vjust=-0.5, hjust= 0.4) +
  scale_y_continuous(limits = c(0,21), expand = c(0, 0)) +
  scale_x_continuous(breaks= 2007:2020) +
  theme_bw() + 
  labs(title= "TED Talks about AI: Count by Year", x= "\nYear", y=NULL) +
  theme(axis.text.x = element_text(angle= 90, hjust= 1,size=9),
        panel.grid = element_blank(),
        axis.text.y= element_blank(),
        axis.line.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line.x = element_line(color= "grey50", size= 0.4),
        panel.border = element_blank())

NA
NA
---
title: "TED Talks in AI: Scraping the Data"
output: html_notebook
date: "2020-07-01"
author: "Cel McCracken"
---

### Interesting text data?

In my search for interesting text data to work on, I had the idea to look at TED talks over recent years that have included the topic of AI (artificial intelligence). In further posts I will be analysing this data, but this post is about the code I used to collect the data in the first place.

![from https://www.ted.com/talks/grady_booch_don_t_fear_superintelligent_ai/transcript?language=en](/Users/Celeste/Desktop/TED_screenshot.png)

```{r }
# Some packages

library(tidyverse)
library(data.table)
library(rvest)

# Some handy dandy functions

patientWaiter<- function(minutes_wait){
  # Ask R to wait a number of minutes before making another request to the server
  for (minute in 1:minutes_wait){
    cat("\n\n",minute, " :")
    for (second in 1:60) {
      Sys.sleep(1)
      cat("\t",second)
    }
  }
} 

get_talk_links<- function(pg){
  # Extract TED Talk links from the TED query result page
  alllinks<-  pg %>% html_nodes("a") %>% html_attr("href")
  goodlinks<- alllinks[alllinks %like% "/talks/"] %>% unique()
  return(goodlinks)
}

extract_meta_table<- function(pg){
  # Extract some information contained in the webpage under the meta tags
  all_meta_attrs <- unique(unlist(lapply(lapply(pg %>% html_nodes("meta"), html_attrs), names)))
  dat <- data.frame(lapply(all_meta_attrs, function(x) {
    pg %>% html_nodes("meta") %>% html_attr(x)
  }),stringsAsFactors = FALSE)
  colnames(dat) <- all_meta_attrs
  N<- nrow(dat)
  dat$RowNum<- 1:N
  return(dat)
}

newmatrix<- function(nr,nc,thing=NA) {
  # Create an empty data frame
  newmat<- matrix(rep(thing,nr*nc),nrow=nr,ncol=nc)
  newmat<- data.frame(newmat)
  return(newmat)
  
}

```



### Let's begin

Initially I did a manual search for AI-related talks from within the TED talks website, and made a note of the urls that I could see...


```{r}
# Initial search query for AI-related TED Talks

STEM<-      "https://www.ted.com"
searchurl<- "https://www.ted.com/talks?sort=relevance&topics%5B%5D=Technology&language=en&q=AI"

```

By downloading the first page of search results (`search_page1`), I can see that this page contains links to the search results (individual talks) as well as links to further pages of search results (`otherpages`).


```{r}
search_page1<- read_html(searchurl)

alllinks<-     search_page1 %>% html_nodes("a") %>% html_attr("href")
goodlinks<-    alllinks[alllinks %like% "/talks/"] %>% unique()

otherpages<-   alllinks[alllinks %like% "/talks\\?"] %>% print()
```

### Compile a list of sites to visit

From here I would like to put together a list of links for talks that I might want to download.  I have used the `get_talk_links` function that I defined above, and then collected these into a single list of links.  I could have done this part with the `map` function.

```{r}
search_page2<- read_html(paste0(STEM, otherpages[1]))
search_page3<- read_html(paste0(STEM, otherpages[2]))
search_page4<- read_html(paste0(STEM, otherpages[3]))

link_list<- list()

link_list[[1]]<- get_talk_links(search_page1)
link_list[[2]]<- get_talk_links(search_page2)
link_list[[3]]<- get_talk_links(search_page3)
link_list[[4]]<- get_talk_links(search_page4)


talk_links<- unlist(link_list)
saveRDS(talk_links, "talk_links.rds")
```


### Get ready to Scrape!

I can see that the ordinary links we have collected take us to the main TED video page. In order to visit the webpage that contains the text transcript, we have to tweak the links slightly...

```{r}
# Convert ordinary links into links that will lead to the talk transcript
transcript_links<- map_chr(talk_links,~paste0(STEM,str_replace(.x, "\\?language=en","/transcript\\?language=en")))

```

Set up some empty objects into which to collect the data. I know preference varies quite widely on this point, but this time the data is quite well-behaved so a dataframe should be OK.  I also like to save the raw html pages in a list in case I want to go back and extract any additional information later.

```{r}
# Set up an empty data frame to hold the results
outdf<- newmatrix(length(transcript_links),9)
names(outdf)<- c("ID","date","author","long_title","keywords","description","title","author2","full_text")

# Set up a one-row version of the dataframe that can be easily collapsed
res<- newmatrix(1,9)
names(res)<- names(outdf)

# Set up an empty list to hold the raw html pages in case our parsing goes awry
page_list<- list()
```

### Visit and Scrape
Now we're ready to visit each of the talk transcript web pages in turn and collect the information we're interested in.

```{r}
for  (i in 1:length(transcript_links)){

    random_minutes<-  sample(2:6,1)  # Choose a random waiting time
    link<-            transcript_links[i]
    pg<-              read_html(link)
    page_list[[i]]<-  pg
    pnodes<-          pg %>% html_nodes("p") %>% html_text() %>% stripWhitespace() %>% trimws()
    res$full_text<-   str_c(pnodes, collapse=" ")

    meta<- extract_meta_table(pg)
    meta$itemprop[is.na(meta$itemprop)]<- ""
    meta$name[is.na(meta$name)]<- ""

    res$ID<-          i
    res$date<-        meta$content[meta$itemprop== "uploadDate"]
    res$author<-      meta$content[meta$name== "author"]
    res$long_title<-  meta$content[meta$name== "title"]
    res$keywords<-    meta$content[meta$name== "keywords"]
    res$description<- meta$content[meta$itemprop== "description"]
    res$title<-       meta$content[meta$itemprop== "name"][1]
    res$author2<-     meta$content[meta$itemprop== "name"][2]

    outdf[i,]<- res

    # I like to "save as I go" in case the scrape is interrupted part-way through
    saveRDS(outdf,"outdf.rds")
    outdf$full_text<- str_sub(outdf$full_text,1,32000) # Truncate for the Excel version
    writexl::write_xlsx(outdf,"TED_output.xlsx")
    saveRDS(page_list,"page_list.rds")

    # Some console output
    cat("\n\nWorking on talk number",i,": ",res$title, "-----------------------------------\n")
    patientWaiter(random_minutes)

}

```

### Tidying up the output
We have now completed the main scraping task, and the data now exists in a local data frame and several saved copies.  We finish up by doing some basic tidying...

```{r}
# Remove duplicate talks
df<- df %>% filter(duplicated(title))  # 88 talks
 
# All transcripts carry the following sentence. This can be removed.

endTag<- "TED.com translations are made possible by volunteer translators. Learn more about the Open Translation Project. © TED Conferences, LLC. All rights reserved."

df$full_text<- str_remove_all(df$full_text, endTag)

# Convert to a date object
df$date<- lubridate::ymd(str_sub(df$date,1,10))

saveRDS(df,"TED_data.rds")
```

### What have we collected?

Further analyses will be conducted in future posts, but just as a summary, let's see how the talks we have collected are spread across recent years.  We can see that while TED Talks about AI have been appearing for more than 13 years, there has been a dramatic increase in AI as a topic if interest in the past 3 to 4 years.  We can see fewer talks from 2020 as we are only half-way through the year, and there may well be fewer TED talks this year due to the COVID-19 pandemic.

```{r}
# Barplot of counts by year

df %>% mutate(Year= year(date)) %>% count(Year) %>%
  ggplot(aes(x= Year, y=n)) +
  geom_col(fill= "lightblue", color= "grey50", size= 0.2) +
  geom_text(aes(label=n), vjust= -0.5, hjust= 0.4) +
  scale_y_continuous(limits = c(0, 21), expand = c(0, 0)) +
  scale_x_continuous(breaks= 2007:2020) +
  theme_bw() + 
  labs(title= "TED Talks about AI: Count by Year", x= "\nYear", y=NULL) +
  theme(axis.text.x = element_text(angle= 90, hjust= 1, size=9),
        panel.grid = element_blank(),
        axis.text.y= element_blank(),
        axis.line.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line.x = element_line(color= "grey50", size= 0.4),
        panel.border = element_blank())
  
  
```