Step 1. Load all the libraries used in this project.
#install.packages("jsonlite")
library(jsonlite)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(rvest)
## Loading required package: xml2
Step 2. Assign the query parameters supported by the NYT API to variables.
# ---- NYT Article Search request parameters ----
# Each parameter is stored first as a raw value and then overwritten (or
# copied to a *1 variable) as a ready-to-concatenate query-string fragment.

# NOTE(security): an API key should not live in source control. The key is
# taken from the NYT_API_KEY environment variable when that is set; the
# literal is kept only as a fallback so the script keeps running and should
# be rotated and removed.
myKey <- Sys.getenv("NYT_API_KEY")
if (!nzchar(myKey)) {
  myKey <- "c8177b4948ca478093562d83205f9ba4"
}
# HTTPS so the key is not sent in clear text.
myURL <- "https://api.nytimes.com/svc/search/v2/articlesearch.json"

# Build one query-string fragment: "?name=value" when `first` is TRUE,
# otherwise "&name=value".
#   name  - parameter name as the API expects it.
#   value - length-one value; NULL, NA and "" mean "not set".
#   first - TRUE only for the first parameter after the base URL.
# Returns "" for an unset value so the parameter is omitted entirely.
# Logical values are sent lower-case ("true"/"false") and the value is
# percent-encoded, so every space or reserved character is URL-safe.
nyt_param <- function(name, value, first = FALSE) {
  if (is.null(value) || length(value) == 0L) {
    return("")
  }
  stopifnot(length(value) == 1L)
  if (is.na(value)) {
    return("")
  }
  if (is.logical(value)) {
    value <- tolower(as.character(value))
  }
  value <- as.character(value)
  if (!nzchar(value)) {
    return("")
  }
  paste0(
    if (first) "?" else "&",
    name, "=",
    utils::URLencode(value, reserved = TRUE)
  )
}

# q
mySearch <- "Siberian winter"
mySearch1 <- nyt_param("q", mySearch, first = TRUE)
# fq
mySearchD <- "coldest"
mySearchD1 <- nyt_param("fq", mySearchD)
# begin_date / end_date, written as YYYYMMDD
myBday <- "20080101"
myBday1 <- nyt_param("begin_date", myBday)
end_date <- "20181027"
end_date <- nyt_param("end_date", end_date)
# sort: "newest" or "oldest"; leave as "" (or NA) to omit it from the request
sort <- ""
sort <- nyt_param("sort", sort)
fl <- ""
fl <- nyt_param("fl", fl)
hl <- TRUE
hl <- nyt_param("hl", hl)
page <- NA
page <- nyt_param("page", page)
facet_field <- NA
facet_field <- nyt_param("facet_field", facet_field)
facet_filter <- NA
facet_filter <- nyt_param("facet_filter", facet_filter)
Step 3. Pull the JSON data from the NYT API.
# Assemble the request URL: the base URL followed by each pre-built
# query-string fragment (optional parameters that are not set are empty
# strings), and finally the API key.
query_without_key <- str_c(
  myURL, mySearch1, mySearchD1, myBday1, end_date, sort, fl, hl, page,
  facet_field, facet_filter
)
myQuery <- str_c(query_without_key, "&api-key=", myKey)
# Show the request for inspection with the key masked, so the credential does
# not end up in console or rendered output.
str_c(query_without_key, "&api-key=<redacted>")
## [1] "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=Siberian+winter&?fq=coldest&?begin_date=20080101&?end_date=20181027&?sort=&?hl=TRUE&api-key=c8177b4948ca478093562d83205f9ba4"
# Fetch the search results, flattening nested JSON objects into dotted column
# names, and coerce the parsed response into a single data frame.
NYT <- data.frame(fromJSON(myQuery, flatten = TRUE))
Step 4. Keep only the columns that appear useful, and rename them.
# Keep only the article fields of interest, giving each one a short name as it
# is selected (new_name = original_name).
NYT1 <- NYT %>%
  select(
    url = response.docs.web_url,
    snippet = response.docs.snippet,
    pages = response.docs.print_page,
    keywords = response.docs.keywords,
    pub_date = response.docs.pub_date,
    doc_type = response.docs.document_type,
    word_count = response.docs.word_count,
    headline = response.docs.headline.main,
    headline1 = response.docs.headline.print_headline,
    byline = response.docs.byline.original
  )
Step 5. Unnest the keywords column, which currently holds nested data frames and is basically unreadable, and convert the publishing date to Date format. One known problem: one of the keywords values is NULL, and unnest cannot handle NULL.
# Expand the nested keywords so each keyword gets its own row, then turn the
# keyword names into columns.
NYT2 <- NYT1 %>%
  unnest(keywords) %>%
  # rank and major are not needed for the wide layout
  select(-c(rank, major)) %>%
  distinct() %>%
  # spread() needs a single value per (url, name) pair, so keep the top one.
  # The weight is named explicitly instead of relying on top_n()'s
  # "last column" default, which depends on column order and prints
  # "Selecting by value".
  group_by(url, name) %>%
  top_n(1, value) %>%
  # Drop the grouping so NYT2 does not stay grouped by url after spread().
  ungroup() %>%
  spread(name, value)
## Selecting by value
# Convert the publication timestamp strings to Date values.
NYT2[["pub_date"]] <- as.Date(NYT2[["pub_date"]])