library(jsonlite)
library(lubridate)
library(dplyr)
library(stringr)

# Quarterly query windows: row i pairs a window start with the start of the
# following quarter (first window 2010-01-01..2010-04-01, last window
# 2017-04-01..2017-07-01).
# as.Date() parses these ISO-8601 literals to the same Date values as ymd().
fromDate <- seq(as.Date("2010-01-01"), as.Date("2017-06-01"), by = "3 month")
toDate <- seq(as.Date("2010-04-01"), as.Date("2017-09-01"), by = "3 month")
dates <- data.frame(fromDate = fromDate, toDate = toDate)

# The API key can be supplied through the GUARDIAN_API_KEY environment
# variable; the literal fallback keeps the script runnable exactly as before.
# NOTE(review): a key committed to source control should be considered leaked
# and rotated.
api_key <- Sys.getenv("GUARDIAN_API_KEY")
if (!nzchar(api_key)) {
  api_key <- "bd5ba340-ad14-420e-8e36-7704f5ac4c6a"
}

# One search URL per window. paste0() is vectorised over the two date columns,
# so no apply() is needed. The earlier apply(dates, 2, ...) ran once per
# *column* and ignored its argument, which emitted the complete URL list twice
# and doubled the number of API requests.
urls <- data.frame(
  url = paste0(
    "https://content.guardianapis.com/data?page-size=200&from-date=",
    dates$fromDate,
    "&to-date=",
    dates$toDate,
    "&order-by=newest&show-fields=all&api-key=",
    api_key
  ),
  stringsAsFactors = FALSE
)
row.names(urls) <- NULL
urlss <- urls$url

# Download every search page listed in `urlss` and stack the per-article
# metadata into one data frame (one row per result). Pages whose response
# carries no results contribute nothing.
iguardian <- do.call(rbind, lapply(seq_along(urlss), function(i) {
  print(i)  # progress: index of the page being fetched
  document <- fromJSON(urlss[[i]])

  results <- document$response$results
  type <- results$type
  if (is.null(type)) {
    return(NULL)
  }
  n_results <- length(type)

  # Return the field as-is, or a full-length placeholder column when the
  # response lacks it entirely. data.frame() silently drops NULL arguments,
  # which would give this page fewer columns than the others and make rbind()
  # fail.
  field_or <- function(value, default) {
    if (is.null(value)) rep(default, n_results) else value
  }

  fields <- results$fields

  data.frame(
    type = type,
    sectionId = results$sectionId,
    webPublicationDate = results$webPublicationDate,
    webUrl = results$webUrl,
    wordcount = field_or(fields$wordcount, NA_character_),
    # These two used ifelse(is.null(x), "", x). With a scalar test, ifelse()
    # returns only the first element, so every row of a page received the
    # first article's date. The whole column is kept now; "" remains the
    # placeholder when the field is absent.
    commentCloseDate = field_or(fields$commentCloseDate, ""),
    commentable = field_or(fields$commentable, NA_character_),
    firstPublicationDate = field_or(fields$firstPublicationDate, ""),
    shortUrl = field_or(fields$shortUrl, NA_character_),
    stringsAsFactors = FALSE
  )
}))

# Keep only the first occurrence of rows that repeat across every column.
is_first_occurrence <- !duplicated(iguardian)
iguardian <- iguardian[is_first_occurrence, ]

# From here on `urlss` holds the articles' short URLs instead of search URLs.
urlss <- iguardian[["shortUrl"]]

# Look up the comment count of each article through the discussion API and
# attach it to `iguardian` as the `comments` column.

# Query each short URL once: repeated keys in `gcomments` would multiply the
# matching rows in the join below.
short_urls <- unique(urlss)

gcomments <- do.call(rbind, lapply(seq_along(short_urls), function(i) {
  print(i)  # progress: index of the short URL being queried
  url <- short_urls[[i]]

  # Reduce the short URL to the "/p/..." part expected by the endpoint.
  # NOTE(review): the character class also admits "," and " ", and the match
  # stops after five such characters - confirm this covers every short-URL
  # format in the data.
  sUrl <- str_extract(url, '(/p/)(.*?)([a-z, 0-9]{5})')
  b <- paste0('https://api.nextgen.guardianapps.co.uk/discussion/comment-counts.json?shortUrls=', sUrl)

  # Best effort: a failed lookup is reported and leaves the count empty rather
  # than aborting the whole run.
  counts <- tryCatch(
    fromJSON(b)[[1]][[2]],
    error = function(e) {
      message("Comment count lookup failed for ", url, ": ", conditionMessage(e))
      NULL
    }
  )

  # "" stays the placeholder for "no count available". A zero-length result
  # previously reached data.frame() and failed on mismatched column lengths.
  comments <- if (length(counts) == 0L) "" else counts

  data.frame(comments = comments, url = url, stringsAsFactors = FALSE)
}))

iguardian <- left_join(iguardian, gcomments, by = c("shortUrl" = "url"))
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Restore the objects saved in guardiani.rda.
# NOTE(review): the code below relies on this file providing `guardiani` -
# confirm.
load("guardiani.rda")
# Convert the publication timestamps to Date so they can be binned and
# compared as dates.
guardiani$webPublicationDate <- as.Date(guardiani$webPublicationDate)

# Article counts binned by year and by month. freq is spelled TRUE because T
# is an ordinary variable that can be reassigned.
# NOTE(review): "%y" / "%m" are passed positionally; they appear to fall into
# `...` rather than the `format` argument. If they are meant as the axis label
# format they may need to be written as `format = "%y"` - confirm against
# ?hist.Date.
hist(guardiani$webPublicationDate, "year", "%y", freq = TRUE)

hist(guardiani$webPublicationDate, "month", "%m", freq = TRUE)

# Date range covered by the data.
min(guardiani$webPublicationDate)
## [1] "2010-05-11"
max(guardiani$webPublicationDate)
## [1] "2017-07-01"
# Share of articles per section in whole percent; only sections whose rounded
# share exceeds 1% are charted.
topics <- as.data.frame(round(prop.table(table(guardiani$sectionId)) * 100))
df <- topics %>% filter(Freq > 1)
#df<-as.data.frame(table(guardiani$sectionId))
library(ggplot2)
# Horizontal bars ordered by share. guides(fill = "none") replaces the
# deprecated guides(fill = FALSE) spelling.
# NOTE(review): the N in the title is hard-coded - confirm it still matches
# the loaded data.
ggplot(df, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  guides(fill = "none") +
  ggtitle("N=1745")

library(scales)
# Layers shared by both timelines: histogram bars of width 30 on the date
# axis with white outlines, and x-axis labels showing the year.
timeline_layers <- list(
  geom_histogram(binwidth = 30, colour = "white"),
  scale_x_date(labels = date_format("%Y"))
)

# Timeline of all articles.
ggplot(guardiani, aes(x = webPublicationDate)) + timeline_layers

# Timeline restricted to articles whose `commentable` field is "true".
df <- filter(guardiani, commentable == "true")
ggplot(df, aes(x = webPublicationDate)) + timeline_layers

# hist(df$webPublicationDate, "year", "%y", freq = T)
# hist(df$webPublicationDate, "year", "%y", freq = T)
# hist(df$webPublicationDate, "month", "%m", freq = T)
# Commentable articles published during 2015. The lower bound is inclusive so
# that articles dated 2015-01-01 are kept; the previous strict ">" dropped
# them from the subset.
guardian2015a <- guardiani %>%
  filter(
    commentable == "true",
    webPublicationDate >= as.Date("2015-01-01"),
    webPublicationDate < as.Date("2016-01-01")
  )
# N=114
# summary(as.numeric(guardian2015a$comments))
# topics<-as.data.frame(round(prop.table(table(guardian2015a$sectionId))*100))
# topics%>%filter(Freq>1)

# Share of the 2015 commentable articles per section in whole percent; only
# sections whose rounded share exceeds 1% are charted.
topics <- as.data.frame(round(prop.table(table(guardian2015a$sectionId)) * 100))
df <- topics %>% filter(Freq > 1)
#df<-as.data.frame(table(guardiani$sectionId))
library(ggplot2)
# Horizontal bars ordered by share. guides(fill = "none") replaces the
# deprecated guides(fill = FALSE) spelling.
# The title is computed from the subset: the literal "N=123" disagreed with
# the "N=114" noted where guardian2015a is built, so a fixed number cannot be
# trusted here.
ggplot(df, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  guides(fill = "none") +
  ggtitle(paste0("N=", nrow(guardian2015a)))