library(rvest)
library(dplyr)
Project Fundamentals: Scraping Title, Publication Date, and Author
For each blog entry on the first page of search results, pull out the title, the publication date, and the author, and store them in an R data frame.
search.results <- read_html("http://www.r-bloggers.com/search/web%20scraping")
titles <- search.results %>%
html_nodes("#leftcontent h2") %>%
html_text()
dates <- search.results %>%
html_nodes(".date") %>%
html_text()
authors <- search.results %>%
html_nodes("div.meta > a") %>%
html_text()
rBloggers <- as.data.frame(cbind(titles, dates, authors))
str(rBloggers)
## 'data.frame': 10 obs. of 3 variables:
## $ titles : Factor w/ 10 levels "A Little Web Scraping Exercise with XML-Package",..: 4 2 10 9 6 1 3 7 5 8
## $ dates : Factor w/ 10 levels "April 5, 2012",..: 9 10 5 6 3 1 4 2 8 7
## $ authors: Factor w/ 6 levels "axiomOfChoice",..: 3 2 5 5 5 4 6 1 6 6
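One check worth doing before trusting the combined data frame: cbind() silently recycles shorter vectors, so if one of the selectors matched a different number of nodes the titles, dates, and authors would end up misaligned. A minimal sanity check on the vectors just created (assuming the page layout has not changed):
stopifnot(length(titles) == length(dates), length(dates) == length(authors))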
Next Step: Scraping All 17 Result Pages
scrape.results <- function(URL){
search.results <- read_html(URL)
titles <- search.results %>%
html_nodes("#leftcontent h2") %>%
html_text()
dates <- search.results %>%
html_nodes(".date") %>%
html_text()
authors <- search.results %>%
html_nodes("div.meta > a") %>%
html_text()
rBloggers <- as.data.frame(cbind(titles, dates, authors))
Sys.sleep(1) # pause briefly between requests to be polite to the server
return(rBloggers)
}
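Before looping over all 17 pages, a quick test on the first results page confirms the helper returns a data frame with the expected columns (this check is illustrative and not part of the original workflow):
first.page <- scrape.results("http://www.r-bloggers.com/search/web%20scraping")
stopifnot(identical(names(first.page), c("titles", "dates", "authors")))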
construct.URL <- function(page){
url.part <- "http://www.r-bloggers.com/search/web%20scraping/page/" # the page number gets appended, e.g. .../page/5
URL <- paste(url.part, page, sep = "")
return(URL)
}
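As a quick usage check, the page number is simply pasted onto the base search URL:
construct.URL(5)
## [1] "http://www.r-bloggers.com/search/web%20scraping/page/5"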
pages <- 1:17
URLs <- sapply(pages, FUN = construct.URL)
scrape.all <- lapply(URLs, FUN = scrape.results)
scrape.all <- do.call("rbind", scrape.all)
str(scrape.all)
## 'data.frame': 165 obs. of 3 variables:
## $ titles : Factor w/ 161 levels "A Little Web Scraping Exercise with XML-Package",..: 4 2 10 9 6 1 3 7 5 8 ...
## $ dates : Factor w/ 149 levels "April 5, 2012",..: 9 10 5 6 3 1 4 2 8 7 ...
## $ authors: Factor w/ 97 levels "axiomOfChoice",..: 3 2 5 5 5 4 6 1 6 6 ...
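The 165 rows contain only 161 distinct titles (see the factor levels above), so a few posts appear on more than one results page. An optional deduplication step keeps repeated posts from inflating the word counts in the next section; a small sketch using dplyr, assuming an exact title match is a sufficient key:
scrape.unique <- distinct(scrape.all, titles, .keep_all = TRUE)
nrow(scrape.unique) # 161, one row per distinct title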
Final Step: Wordcloud
library(tm)
library(wordcloud)
words <- Corpus(VectorSource(scrape.all$titles))
# wordcloud(words) # would plot the raw, unprocessed corpus; the cleaned-up version is built below
# based on: https://georeferenced.wordpress.com/2013/01/15/rwordcloud/
library(SnowballC)
words.proc <- words %>%
tm_map(stripWhitespace) %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(stemDocument)
wordcloud(words.proc)
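wordcloud() also accepts explicit word frequencies, which makes the plot easier to tune. A hedged variant that builds a TermDocumentMatrix from the processed corpus and passes frequencies plus a colour palette (the parameter values here are illustrative, not from the original post):
library(RColorBrewer)
tdm <- TermDocumentMatrix(words.proc)
word.freqs <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
wordcloud(names(word.freqs), word.freqs, min.freq = 2, random.order = FALSE, colors = brewer.pal(8, "Dark2"))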
