library(rvest)
library(dplyr)
Project Fundamentals: Scraping Title, Publication Date, and Author
For each blog entry on the first page of search results, pull out the title, the publication date, and the author, and store them in an R data frame.
search.results <- read_html("http://www.r-bloggers.com/search/web%20scraping")
titles <- search.results %>%
html_nodes("#leftcontent h2") %>%
html_text()
dates <- search.results %>%
html_nodes(".date") %>%
html_text()
authors <- search.results %>%
html_nodes("div.meta > a") %>%
html_text()
rBloggers <- as.data.frame(cbind(titles, dates, authors))
str(rBloggers)
## 'data.frame': 10 obs. of 3 variables:
## $ titles : Factor w/ 10 levels "A Little Web Scraping Exercise with XML-Package",..: 4 2 10 9 6 1 3 7 5 8
## $ dates : Factor w/ 10 levels "April 5, 2012",..: 9 10 5 6 3 1 4 2 8 7
## $ authors: Factor w/ 6 levels "axiomOfChoice",..: 3 2 5 5 5 4 6 1 6 6
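One check worth doing before trusting the combined data frame: cbind() silently recycles shorter vectors, so if one of the selectors matched a different number of nodes the titles, dates, and authors would end up misaligned. A minimal sanity check on the vectors just created (assuming the page layout has not changed):
stopifnot(length(titles) == length(dates), length(dates) == length(authors))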
Next Step: Scraping All 17 Result Pages
scrape.results <- function(URL){
search.results <- read_html(URL)
titles <- search.results %>%
html_nodes("#leftcontent h2") %>%
html_text()
dates <- search.results %>%
html_nodes(".date") %>%
html_text()
authors <- search.results %>%
html_nodes("div.meta > a") %>%
html_text()
rBloggers <- as.data.frame(cbind(titles, dates, authors))
Sys.sleep(1) # pause briefly between requests to be polite to the server
return(rBloggers)
}
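Before looping over all 17 pages, a quick test on the first results page confirms the helper returns a data frame with the expected columns (this check is illustrative and not part of the original workflow):
first.page <- scrape.results("http://www.r-bloggers.com/search/web%20scraping")
stopifnot(identical(names(first.page), c("titles", "dates", "authors")))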
construct.URL <- function(page){
url.part <- "http://www.r-bloggers.com/search/web%20scraping/page/" # the page number gets appended, e.g. .../page/5
URL <- paste(url.part, page, sep = "")
return(URL)
}
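As a quick usage check, the page number is simply pasted onto the base search URL:
construct.URL(5)
## [1] "http://www.r-bloggers.com/search/web%20scraping/page/5"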
pages <- 1:17
URLs <- sapply(pages, FUN = construct.URL)
scrape.all <- lapply(URLs, FUN = scrape.results)
scrape.all <- do.call("rbind", scrape.all)
str(scrape.all)
## 'data.frame': 165 obs. of 3 variables:
## $ titles : Factor w/ 161 levels "A Little Web Scraping Exercise with XML-Package",..: 4 2 10 9 6 1 3 7 5 8 ...
## $ dates : Factor w/ 149 levels "April 5, 2012",..: 9 10 5 6 3 1 4 2 8 7 ...
## $ authors: Factor w/ 97 levels "axiomOfChoice",..: 3 2 5 5 5 4 6 1 6 6 ...
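The 165 rows contain only 161 distinct titles (see the factor levels above), so a few posts appear on more than one results page. An optional deduplication step keeps repeated posts from inflating the word counts in the next section; a small sketch using dplyr, assuming an exact title match is a sufficient key:
scrape.unique <- distinct(scrape.all, titles, .keep_all = TRUE)
nrow(scrape.unique) # 161, one row per distinct title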
Final Step: Wordcloud
library(tm)
library(wordcloud)
words <- Corpus(VectorSource(scrape.all$titles))
# wordcloud(words) # would plot the raw, unprocessed corpus; the cleaned-up version is built below
# based on: https://georeferenced.wordpress.com/2013/01/15/rwordcloud/
library(SnowballC)
words.proc <- words %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(stemDocument)
wordcloud(words.proc)
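wordcloud() also accepts explicit word frequencies, which makes the plot easier to tune. A hedged variant that builds a TermDocumentMatrix from the processed corpus and passes frequencies plus a colour palette (the parameter values here are illustrative, not from the original post):
library(RColorBrewer)
tdm <- TermDocumentMatrix(words.proc)
word.freqs <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
wordcloud(names(word.freqs), word.freqs, min.freq = 2, random.order = FALSE, colors = brewer.pal(8, "Dark2"))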
