Step 1. We will load all the libraries used in this project.

#install.packages("jsonlite")

library(jsonlite)

library(stringr)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

library(rvest)
## Loading required package: xml2

Step 2. We will assign the query parameters accepted by the NYT Article Search API to variables.

# Build one "&name=value" query-string fragment.
#
# name  : parameter name as it should appear in the URL (character scalar).
# value : parameter value; may be character, logical or numeric.
# Returns "" when `value` is NULL, NA or an empty string, so optional
# parameters can simply be left unset. Otherwise the value is percent-encoded
# (spaces, "&", ":" ...) so it cannot break the surrounding query string.
nyt_param <- function(name, value) {
  if (length(value) != 1L || is.na(value) || !nzchar(value)) {
    return("")
  }
  paste0("&", name, "=", utils::URLencode(as.character(value), reserved = TRUE))
}

# NOTE(review): the API key is hard-coded in the source; consider reading it
# from an environment variable (e.g. Sys.getenv()) before sharing this file.
myKey <- "c8177b4948ca478093562d83205f9ba4"

myURL <- "https://api.nytimes.com/svc/search/v2/articlesearch.json"

# q: search terms. This is the first parameter, so it is introduced by "?";
# every later parameter is joined with a plain "&" (not "&?", which would make
# the parameter name "?fq", "?begin_date", ... and the filter would be ignored).
mySearch <- "Siberian winter"

mySearch1 <- paste0("?q=", utils::URLencode(mySearch, reserved = TRUE))

# fq
mySearchD <- "coldest"

mySearchD1 <- nyt_param("fq", mySearchD)

# begin_date / end_date, written as YYYYMMDD
myBday <- "20080101"

myBday1 <- nyt_param("begin_date", myBday)

end_date <- "20181027"

end_date <- nyt_param("end_date", end_date)

# sort: needs to be "newest" or "oldest"; leave as "" to omit the parameter
sort <- ""

sort <- nyt_param("sort", sort)

# fl: leave as "" to omit the parameter
fl <- ""

fl <- nyt_param("fl", fl)

# hl: sent in lower case ("true"/"false") rather than R's "TRUE"/"FALSE"
hl <- TRUE

hl <- nyt_param("hl", tolower(hl))

# Optional parameters: NA means "not set" and contributes nothing to the URL
page <- NA

page <- nyt_param("page", page)

facet_field <- NA

facet_field <- nyt_param("facet_field", facet_field)

facet_filter <- NA

facet_filter <- nyt_param("facet_filter", facet_filter)

Step 3. We will pull JSON data from the NYT API.

# Assemble the request URL: the endpoint, then each pre-built parameter
# fragment in order, and finally the API key.
myQuery <- str_c(
  myURL,
  mySearch1, mySearchD1, myBday1, end_date,
  sort, fl, hl, page, facet_field, facet_filter,
  "&api-key=", myKey
)

# Echo the assembled URL; the "##" line below is the output captured when this
# document was rendered.
myQuery
## [1] "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=Siberian+winter&?fq=coldest&?begin_date=20080101&?end_date=20181027&?sort=&?hl=TRUE&api-key=c8177b4948ca478093562d83205f9ba4"
# Download and parse the JSON response (nested records flattened), then
# coerce the parsed list to a single data frame.
NYT <- data.frame(fromJSON(myQuery, flatten = TRUE))

Step 4. We will keep only the columns that appear useful and rename them.

# Keep only the article fields of interest, giving each a short name as it is
# selected (new_name = original_column).
NYT1 <- NYT %>%
  select(
    url = response.docs.web_url,
    snippet = response.docs.snippet,
    pages = response.docs.print_page,
    keywords = response.docs.keywords,
    pub_date = response.docs.pub_date,
    doc_type = response.docs.document_type,
    word_count = response.docs.word_count,
    headline = response.docs.headline.main,
    headline1 = response.docs.headline.print_headline,
    byline = response.docs.byline.original
  )

Step 5. We will unnest the keywords column, which currently holds one data frame per article and is therefore hard to read. We will also convert the publication date to Date format. Note that one of the keywords values is NULL, which unnest cannot handle directly.

# Expand the nested keywords into one column per keyword type.
NYT2 <- NYT1 %>%
  # Articles whose keywords entry is NULL or empty have nothing to unnest;
  # drop them first so unnest() only ever sees non-empty data frames.
  filter(vapply(
    keywords,
    function(k) is.data.frame(k) && nrow(k) > 0,
    logical(1)
  )) %>%
  unnest(keywords) %>%
  select(-c(rank, major)) %>%
  distinct() %>%
  group_by(url, name) %>%
  # Keep a single value per (article, keyword type). The ordering column is
  # named explicitly: top_n() otherwise uses whichever column happens to be
  # last, which is not guaranteed to be `value`.
  top_n(1, value) %>%
  # Drop the grouping so later steps operate on the whole table.
  ungroup() %>%
  # One column per keyword type (name), filled with its value.
  spread(name, value)
# pub_date arrives as text; convert it to Date.
NYT2$pub_date <- as.Date(NYT2$pub_date)