Mhmm, scraping data, everyone loves it.
library(rvest)
library(dplyr)
library(stringr)
library(tidytext)
library(ggplot2)
library(ggthemes)
library(tm)
library(tidyr)
library(wordcloud)
library(knitr)

out <- NULL
for (i in 1:6) {  # archive pages
  # page 1 of the archive has no /page/ suffix
  page_url <- if (i == 1) {
    "http://www.analyticshour.io/all-podcast-episodes/"
  } else {
    paste0("http://www.analyticshour.io/all-podcast-episodes/page/", i)
  }
  x <- read_html(page_url)  # fetch each page once instead of once per article
  for (j in 1:6) {  # articles per page
    selector <- paste0("body > div.super-container.light-icons > div.main-content.page.archive-page > div > div > div > div > article:nth-child(", j, ") > div > footer > ul > li.title.not-truncate")
    htmlNode <- x %>% html_nodes(selector) %>% html_children()
    # episode number from the link text, e.g. "#054: ..." -> 54
    check <- x %>% html_nodes(paste0(selector, " > a")) %>% html_text() %>% substr(2, 4) %>% as.numeric()
    # the episode URL sits between the first pair of quotes in the raw node
    url <- substr(htmlNode, gregexpr('"', htmlNode)[[1]][1] + 1, gregexpr('"', htmlNode)[[1]][2] - 1)
    if (check > 53 & check < 79) {  # 2017 episodes
      content <- read_html(url)
      # the post id is the only run of digits in the body's class attribute
      post_id <- content %>% html_nodes("body") %>% html_attr("class") %>% {gsub("\\D", "", .)}
      test_text <- content %>%
        html_nodes(paste0("#post-", post_id, " > div")) %>%
        html_text() %>% strsplit(" ") %>% unlist()
      # keep only the words after the "Transcript" marker
      transcript_start <- which(grepl("Transcript", test_text))[1]
      test_text <- test_text[(transcript_start + 1):length(test_text)]
      out <- rbind(out, data.frame(ep = check, test_text))
    } else if (i > 1) {
      break  # pages after the first run in reverse order, so the rest are pre-2017
    }
  }
}
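Before any cleaning, a quick sanity check that the scrape actually caught every 2017 episode doesn't hurt (a minimal check, assuming the site's markup hasn't changed since this was written):
# expect 25 episodes: 54 through 78
length(unique(out$ep))
# raw token counts per episode, whitespace splits and all
out %>% count(ep) %>% head()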
Is data cleaning worse than cleaning house?
# strip punctuation and curly quotes, then newlines, then any non-ASCII leftovers
out$word <- str_replace_all(out$test_text, "‘|“|\\.|\\,|\\?|\\:|\"|\\!|\\`|/", "")
out$word <- str_replace_all(out$word, "\n", " ")
out$word <- iconv(out$word, "latin1", "ASCII", sub = "")
# keep ep and word, split into one token per row, lowercase, drop NAs
out <- out[c(1, 3)] %>%
  mutate(word = strsplit(as.character(word), " ")) %>%
  unnest(word)
out$word <- tolower(out$word)
out <- out[complete.cases(out), ]
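A quick look at the raw frequencies hints at the stop-word problem that gets dealt with further down:
# top tokens before stop-word removal -- dominated by "the", "and", etc.
out %>% count(word, sort = TRUE) %>% head(10)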
Some weak data viz
out %>% filter(word == '[laughter]') %>% group_by(ep) %>% summarise(n = n()) %>%
  ggplot(aes(x = ep, y = n)) + geom_col(fill = "#EF4A62") + theme_hc() +
  xlab('Episode') + ylab('Number of laughs') + ggtitle("The funniest episode?")

out %>% filter(grepl("fuck", word)) %>% group_by(ep) %>% summarise(n = n()) %>%
  ggplot(aes(x = ep, y = n)) + geom_col(fill = "#3FA0D9") + theme_hc() +
  xlab('Episode') + ylab('Number of #$%^') + ggtitle("Not for children!!")

out %>% filter(grepl("machine|^ai$|artificial", word)) %>% group_by(ep) %>% summarise(n = n()) %>%
  ggplot(aes(x = ep, y = n)) + geom_col(fill = "#39308A") + theme_hc() +
  xlab('Episode') + ylab('Machine learning / AI mentions') + ggtitle("Machines taking over the world")

out %>% filter(grepl("mobile", word)) %>% group_by(ep) %>% summarise(n = n()) %>%
  ggplot(aes(x = ep, y = n)) + geom_col(fill = "#E2E87A") + theme_hc() +
  xlab('Episode') + ylab('Mobile mentions') + ggtitle("Was 2017 the year of mobile?")

out %>% filter(grepl("^adobe$|^google$", word)) %>% group_by(ep, word) %>% summarise(n = n()) %>%
  ggplot(aes(x = ep, y = n, fill = word)) + geom_col() + theme_hc() +
  xlab('Episode') + ylab('Adobe vs Google mentions') + ggtitle("Google vs Adobe") +
  scale_fill_manual(values = c("#2B2047", "#FFC519"))

The most popular word by episode (and more data cleaning)
# drop standard stop words plus speaker initials and transcription artifacts
out <- out %>%
  filter(!word %in% stop_words$word &
         !word %in% c('mh','jn','sa','tw','mk','cb','im','ar','yeah','youre','[chuckle]','[laughter]','gonna','dont','ive','[music]','ss','isnt','tim','moe','helbling','jd','youve','bit','lot','whos','ago','hes','shes','doesnt','michael','wilson','theyre','wanna','mg'))
out %>%
  group_by(ep, word) %>% summarise(n = n()) %>%
  ungroup() %>% group_by(ep) %>%
  filter(n == max(n)) %>%
  print(n = 26)
## # A tibble: 26 x 3
## # Groups:   ep [25]
##       ep      word     n
##    <dbl>     <chr> <int>
##  1    54 analytics    64
##  2    55  thinking    33
##  3    56  question    41
##  4    57      data   226
##  5    58      data    61
##  6    59      data    35
##  7    60    people    63
##  8    61    people    78
##  9    62   analyst    36
## 10    63      tool    72
## 11    64     store    85
## 12    65    people    48
## 13    66      data   166
## 14    67    python    56
## 15    68    people   108
## 16    69      bias    51
## 17    70 customers    83
## 18    71  learning    85
## 19    72      data   152
## 20    73    people    43
## 21    74     cloud    61
## 22    74    google    61
## 23    75      data    35
## 24    76  insights    38
## 25    77      data   111
## 26    78 analytics    44
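Note the 26 rows for 25 episodes: episode 74 is a tie between "cloud" and "google". Since knitr is already loaded, the same table also renders more nicely in the knitted output via kable():
out %>%
  group_by(ep, word) %>% summarise(n = n()) %>%
  ungroup() %>% group_by(ep) %>%
  filter(n == max(n)) %>%
  kable()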
Oh yeah, the main event and people's favourite: the word cloud!
forCloud <- out %>% group_by(word) %>% summarise(n = n())
wordcloud(forCloud$word, forCloud$n, min.freq = 60, colors = brewer.pal(6, "Dark2"))
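One small note: wordcloud() places words in random order by default, so the layout shifts on every knit. Seeding the RNG first keeps it stable (brewer.pal comes from RColorBrewer, which the wordcloud package attaches for you):
set.seed(2017)  # fix the random layout so the cloud is reproducible
wordcloud(forCloud$word, forCloud$n, min.freq = 60, colors = brewer.pal(6, "Dark2"))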