Extracting content from diversified web resources, cleaning up the raw data, preparing it for statistical analysis, and actually performing the analysis is far from a simple task. The New York Times website provides a rich set of APIs, as described here: https://developer.nytimes.com/api. The task is to choose one of the New York Times APIs, construct an interface in R to read in the JSON data, and transform it into an R data frame.
Preparing Data
Reproducibility
Workflow
library(DT)
library(jsonlite)
library(tidyjson)
library(dplyr)
library(tidyr)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
library(tm)
library(textclean)
library(lares)
# Connection settings used by get_data(): both values are read from the
# `nyt.api` entry returned by get_creds(), so the API key is not hard-coded
# in this file.
baseurl <- get_creds()$`nyt.api`$baseurl
apikey <- get_creds()$`nyt.api`$apikey
#' Fetch the stories for one section and return them as a data frame.
#'
#' Builds the request URL from the global `baseurl`, the section name and the
#' global `apikey`, downloads the JSON document and keeps a fixed set of
#' fields from its `results` element.
#'
#' @param section A single section name, e.g. "books".
#' @return A data frame with one row per story and the columns `Subsection`,
#'   `Title`, `Abstract`, `Byline`, `Created` (a `Date`) and `Short.URL`.
get_data <- function(section) {
  stopifnot(is.character(section), length(section) == 1L)

  url <- paste0(baseurl, section, ".json?api-key=")
  request <- fromJSON(URLencode(paste0(url, apikey)))
  stories <- request$results

  # data.frame() passes column names through make.names() by default
  # (check.names = TRUE), so 'Short URL' is exposed as `Short.URL`.
  newdata <- data.frame(
    Subsection = stories$subsection,
    Title = stories$title,
    Abstract = stories$abstract,
    Byline = stories$byline,
    Created = as.Date(stories$created_date),
    'Short URL' = stories$short_url,
    stringsAsFactors = FALSE
  )
  newdata
}
#' Draw a word cloud from the `Abstract` column of a stories data frame.
#'
#' The abstracts are turned into a corpus, cleaned, lower-cased and stripped
#' of English stop words; term frequencies are then taken from a
#' term-document matrix and plotted with wordcloud().
#'
#' @param dataframe A data frame with a character column `Abstract`, such as
#'   the one returned by get_data().
#' @return The value of the wordcloud() call; the function is called for the
#'   plot it draws. Note that it resets the global RNG seed to 1234.
get_wordcloud <- function(dataframe) {
  abstract <- dataframe$Abstract
  corpus <- Corpus(VectorSource(abstract))

  # NOTE(review): punctuation is removed before contractions are expanded, so
  # apostrophes may already be gone when replace_contraction() runs - confirm
  # that this order is intended.
  corpus <- corpus %>%
    tm_map(removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(replace_contraction) %>%
    tm_map(replace_curly_quote) %>%
    tm_map(stripWhitespace)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))

  # Total count of each term across all abstracts, most frequent first.
  dtm <- TermDocumentMatrix(corpus)
  term_counts <- sort(rowSums(as.matrix(dtm)), decreasing = TRUE)
  word_freq <- data.frame(word = names(term_counts), freq = term_counts)

  # Drops the first row, i.e. the most frequent term after sorting
  # (presumably an uninformative token - TODO confirm).
  word_freq <- word_freq[-1, ]

  set.seed(1234) # for reproducibility
  wordcloud(
    words = word_freq$word, freq = word_freq$freq, min.freq = 1,
    max.words = 150, random.order = FALSE, rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
}
These are the sections available to select in the Top Stories API: Automobiles, Books, Business, Health, Movies, Politics, Science, Sports, Technology, Travel, US and World.
The API doesn't allow fetching top stories for all sections, so only selected sections are used for the data analysis.
# For each selected section: fetch the stories, show them as an interactive
# table, and draw a word cloud of their abstracts.

# Automobiles
df <- get_data("automobiles")
datatable(df)
get_wordcloud(df)

# Books
df <- get_data("books")
datatable(df)
get_wordcloud(df)

# Sports
df <- get_data("sports")
datatable(df)
get_wordcloud(df)

# Health
df <- get_data("health")
datatable(df)
get_wordcloud(df)