Mhmm, scraping data, everyone loves it.
library(rvest)
library(dplyr)
library(stringr)
library(tidytext)
library(ggplot2)
library(ggthemes)
library(tm)
library(tidyr)
library(wordcloud)
library(knitr)
out <- NULL
base_url <- "http://www.analyticshour.io/all-podcast-episodes/"
item_css <- "body > div.super-container.light-icons > div.main-content.page.archive-page > div > div > div > div > article:nth-child("

for (i in 1:6) {  # archive pages
  # Page 1 lives at the base URL; later pages get a /page/i suffix
  x <- read_html(if (i == 1) base_url else paste0(base_url, "page/", i))
  for (j in 1:6) {  # episode entries per page
    htmlNode <- x %>%
      html_nodes(paste0(item_css, j, ") > div > footer > ul > li.title.not-truncate")) %>%
      html_children()
    # Episode number: characters 2-4 of the link text (e.g. "#078: ...")
    check <- x %>%
      html_nodes(paste0(item_css, j, ") > div > footer > ul > li.title.not-truncate > a")) %>%
      html_text() %>%
      substr(2, 4) %>%
      as.numeric()
    if (length(check) == 0) break  # fewer than six entries on this page
    # The episode URL sits between the first pair of double quotes in the anchor
    url <- substr(htmlNode, gregexpr('"', htmlNode)[[1]][1] + 1,
                  gregexpr('"', htmlNode)[[1]][2] - 1)
    if (check > 53 & check < 79) {  # 2017 episodes (#54-#78)
      content <- read_html(url)
      # The numeric post id hides in the body tag's class attribute
      post_id <- content %>% html_nodes("body") %>% html_attr("class") %>% {gsub("\\D", "", .)}
      test_text <- content %>%
        html_nodes(paste0("#post-", post_id, " > div")) %>%
        html_text() %>%
        strsplit(" ") %>%
        unlist()
      # Keep only the tokens after the "Transcript" marker; the parentheses
      # matter, since ":" binds tighter than "+" in R
      test_text <- test_text[(which(grepl("Transcript", test_text))[1] + 1):length(test_text)]
      out <- rbind(out, data.frame(ep = check, test_text))
    } else if (i > 1) {
      break  # past page 1 the archive only gets older, so skip the rest of the page
    }
  }
}
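With any luck, out now holds one row per whitespace-separated token, tagged with its episode number. A quick base R sanity check (not part of the original flow, just a suggested look at what came back):
head(out)      # ep + raw token columns
table(out$ep)  # rough token count per 2017 episode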
Is data cleaning worse than cleaning house?
# Strip punctuation and smart quotes, flatten newlines, drop non-ASCII leftovers
out$word <- str_replace_all(out$test_text, "‘|“|\\.|\\,|\\?|\\:|\"|\\!|\\`|/", "")
out$word <- str_replace_all(out$word, "\n", " ")
out$word <- iconv(out$word, "latin1", "ASCII", sub = "")
# Keep ep + word only, then split any remaining multi-word strings one word per row
out <- out[c(1, 3)] %>%
  mutate(word = strsplit(as.character(word), " ")) %>%
  unnest(word)
out$word <- tolower(out$word)
out <- out[complete.cases(out), ]  # drop rows left incomplete by the cleaning
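An aside: since tidytext is loaded anyway, most of this cleaning collapses into a single unnest_tokens() call, which tokenises, lowercases and strips punctuation in one go. A sketch, where out_raw is a hypothetical copy of the data frame as it left the scraping loop (not a variable in the script above); beware that it also strips the brackets from tokens like [laughter], which the charts below rely on, so it is not a drop-in replacement:
# Hypothetical alternative: out_raw = the scraped data frame before cleaning
out_tidy <- out_raw %>%
  mutate(test_text = as.character(test_text)) %>%
  unnest_tokens(word, test_text)  # tokenise + lowercase + strip punctuation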
Some weak data viz
out %>%
  filter(word == '[laughter]') %>%
  group_by(ep) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = ep, y = n)) +
  geom_col(fill = "#EF4A62") +
  theme_hc() +
  xlab('Episode') + ylab('Number of laughs') +
  ggtitle("The funniest episode?")

out %>% filter(grepl("fuck", word)) %>% group_by(ep) %>% summarise(n = n()) %>% ggplot(aes(x = ep, y = n)) + geom_col(fill = "#3FA0D9") + theme_hc() + xlab('Episode') + ylab('Number of #$%^') + ggtitle("Not for children!!")

out %>% filter(grepl("machine|^ai$|artificial", word)) %>% group_by(ep) %>% summarise(n = n()) %>% ggplot(aes(x = ep, y = n)) + geom_col(fill = "#39308A") + theme_hc() + xlab('Episode') + ylab('Machine learnings / ai mentions') + ggtitle("Machines taking over the world")

out %>% filter(grepl("mobile", word)) %>% group_by(ep) %>% summarise(n = n()) %>% ggplot(aes(x = ep, y = n)) + geom_col(fill = "#E2E87A") + theme_hc() + xlab('Episode') + ylab('Mobile mentions') + ggtitle("Was 2017 year of mobile?")

out %>% filter(grepl("^adobe$|^google$", word)) %>% group_by(ep,word) %>% summarise(n = n()) %>% ggplot(aes(x = ep, y = n, fill = word)) + geom_col() + theme_hc() + xlab('Episode') + ylab('Adobe vs google mentions') + ggtitle("Google vs adobe") + scale_fill_manual(values = c("#2B2047","#FFC519"))

The most popular word by episode (and more data cleaning)
out <- out %>%
  filter(!word %in% stop_words$word &
         !word %in% c('mh','jn','sa','tw','mk','cb','im','ar','yeah','youre',
                      '[chuckle]','[laughter]','gonna','dont','ive','[music]','ss',
                      'isnt','tim','moe','helbling','jd','youve','bit','lot','whos',
                      'ago','hes','shes','doesnt','michael','wilson','theyre','wanna','mg'))
out %>%
  group_by(ep, word) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(ep) %>%
  filter(n == max(n)) %>%  # keeps ties, hence two rows for episode 74
  print(n = 26)
## # A tibble: 26 x 3
## # Groups: ep [25]
## ep word n
## <dbl> <chr> <int>
## 1 54 analytics 64
## 2 55 thinking 33
## 3 56 question 41
## 4 57 data 226
## 5 58 data 61
## 6 59 data 35
## 7 60 people 63
## 8 61 people 78
## 9 62 analyst 36
## 10 63 tool 72
## 11 64 store 85
## 12 65 people 48
## 13 66 data 166
## 14 67 python 56
## 15 68 people 108
## 16 69 bias 51
## 17 70 customers 83
## 18 71 learning 85
## 19 72 data 152
## 20 73 people 43
## 21 74 cloud 61
## 22 74 google 61
## 23 75 data 35
## 24 76 insights 38
## 25 77 data 111
## 26 78 analytics 44
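That filter(n == max(n)) keeps ties, which is why episode 74 shows up twice. For what it's worth, dplyr's count() plus top_n() would get the same table a little more directly:
# Same top-word-per-episode table, ties included
out %>%
  count(ep, word) %>%
  group_by(ep) %>%
  top_n(1, n) %>%
  print(n = 26)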
Oh yeah, our main event and people's favourite: the word cloud!
forCloud <- out %>% group_by(word) %>% summarise(n = n())
wordcloud(forCloud$word, forCloud$n, min.freq = 60, colors = brewer.pal(6, "Dark2"))
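One quirk worth knowing: wordcloud() places words randomly, so the cloud shifts on every run. Fixing the seed first makes the layout reproducible:
set.seed(123)  # any fixed seed does; makes word placement reproducible
wordcloud(forCloud$word, forCloud$n, min.freq = 60, colors = brewer.pal(6, "Dark2"))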
