library(jsonlite)
library(lubridate)
library(dplyr)
library(stringr)
# Build one Guardian Content API request URL per three-month window.
# Window starts run quarterly from 2010-01-01; each window ends three months
# after it starts. as.Date() is used instead of lubridate::ymd() for these
# ISO-formatted literals; both yield Date objects.
fromDate <- seq(as.Date("2010-01-01"), as.Date("2017-06-01"), by = "3 month")
toDate <- seq(as.Date("2010-04-01"), as.Date("2017-09-01"), by = "3 month")
stopifnot(length(fromDate) == length(toDate))
dates <- data.frame(fromDate = fromDate, toDate = toDate)

# NOTE(review): the API key was committed in plain text. It can now be
# overridden through the GUARDIAN_API_KEY environment variable; the literal is
# kept only as a fallback so existing runs behave as before. Consider rotating
# the key and removing the fallback.
api_key <- Sys.getenv(
  "GUARDIAN_API_KEY",
  unset = "bd5ba340-ad14-420e-8e36-7704f5ac4c6a"
)

# paste0() is vectorised, so a single call yields one URL per row of `dates`.
# The previous apply(dates, 2, ...) ran once per *column* while pasting the
# whole columns each time, which produced every URL twice and doubled the
# number of downloads.
urls <- data.frame(
  url = paste0(
    "https://content.guardianapis.com/data?page-size=200&from-date=",
    dates$fromDate,
    "&to-date=",
    dates$toDate,
    "&order-by=newest&show-fields=all&api-key=",
    api_key
  ),
  stringsAsFactors = FALSE
)
row.names(urls) <- NULL
urlss <- urls$url
# Download every URL in `urlss` and stack the selected article fields into one
# data frame (`iguardian`), one row per returned result.
# Pages whose response carries no `type` values contribute no rows.
iguardian <- do.call(rbind, lapply(seq_along(urlss), function(i) {
  # Progress indicator: position of the URL being fetched.
  print(i)
  document <- fromJSON(urlss[[i]])
  results <- document$response$results
  type <- results$type
  if (is.null(type)) {
    return(NULL)
  }
  n_rows <- length(type)
  fields <- results$fields

  # A field absent from the whole page comes back as NULL. data.frame()
  # silently drops NULL columns, which would make rbind() fail on mismatched
  # columns, so substitute a blank string per row instead.
  # The previous ifelse(is.null(v), "", v) had a length-1 test and therefore
  # returned only the FIRST element of v, which was then recycled over every
  # row of the page.
  or_blank <- function(values) {
    if (is.null(values)) rep("", n_rows) else values
  }

  data.frame(
    type = type,
    sectionId = results$sectionId,
    webPublicationDate = results$webPublicationDate,
    webUrl = results$webUrl,
    wordcount = or_blank(fields$wordcount),
    commentCloseDate = or_blank(fields$commentCloseDate),
    commentable = or_blank(fields$commentable),
    firstPublicationDate = or_blank(fields$firstPublicationDate),
    shortUrl = or_blank(fields$shortUrl),
    stringsAsFactors = FALSE
  )
}))
# Drop exact duplicate article rows, look up a comment count for each short
# URL, and attach the counts to `iguardian` as a `comments` column.
iguardian <- iguardian[!duplicated(iguardian), ]
urlss <- iguardian$shortUrl

# Query each distinct short URL once: repeated values would be fetched again
# and would multiply rows in the join below.
lookup_urls <- unique(urlss)
gcomments <- do.call(rbind, lapply(seq_along(lookup_urls), function(i) {
  # Progress indicator: position of the short URL being processed.
  print(i)
  url <- lookup_urls[[i]]
  # Extract the "/p/<id>" part of the short URL (NA when there is no match).
  sUrl <- str_extract(url, '(/p/)(.*?)([a-z, 0-9]{5})')
  comments <- ""
  if (!is.na(sUrl)) {
    b <- paste0('https://api.nextgen.guardianapps.co.uk/discussion/comment-counts.json?shortUrls=', sUrl)
    # Best effort by design: any request/parsing failure leaves the count
    # blank instead of aborting the whole run.
    # presumably the second element of the first response entry is the count;
    # verify against the endpoint's actual payload.
    comments <- tryCatch(fromJSON(b)[[1]][[2]], error = function(e) "")
    # A zero-length result would make data.frame() fail on unequal lengths.
    if (length(comments) == 0L) {
      comments <- ""
    }
  }
  data.frame(comments = comments, url = url, stringsAsFactors = FALSE)
}))
iguardian <- left_join(iguardian, gcomments, by = c("shortUrl" = "url"))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Restore the saved workspace objects; the file is expected to define
# `guardiani`, which the rest of the script analyses.
load("guardiani.rda")
guardiani$webPublicationDate <- as.Date(guardiani$webPublicationDate)

# Publication counts per year and per month.
# `format` comes after `...` in the Date method of hist(), so it only takes
# effect when passed by name; as a bare third argument it was not used as the
# axis label format. TRUE replaces the reassignable shorthand T.
hist(guardiani$webPublicationDate, breaks = "year", format = "%y", freq = TRUE)

hist(guardiani$webPublicationDate, breaks = "month", format = "%m", freq = TRUE)

# Date range covered by the data (recorded console output kept below).
min(guardiani$webPublicationDate)
## [1] "2010-05-11"
max(guardiani$webPublicationDate)
## [1] "2017-07-01"
# Share of articles per section, as rounded percentages; only sections whose
# rounded share exceeds 1% are plotted.
topics <- as.data.frame(round(prop.table(table(guardiani$sectionId)) * 100))
df <- topics %>% filter(Freq > 1)
#df<-as.data.frame(table(guardiani$sectionId))
library(ggplot2)
# Horizontal bar chart ordered by share. guides(fill = "none") replaces the
# deprecated guides(fill = FALSE) form.
# NOTE(review): the N in the title is hard-coded; confirm it still matches the
# loaded data.
ggplot(df, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  guides(fill = "none") +
  ggtitle("N=1745")

library(scales)
# Histogram of publication dates for all articles: bin width 30 on the date
# axis, x axis labelled with the four-digit year.
ggplot(data = guardiani, mapping = aes(x = webPublicationDate)) +
  geom_histogram(binwidth = 30, colour = "white") +
  scale_x_date(labels = date_format("%Y"))

# Same histogram restricted to rows whose `commentable` value is "true".
df <- filter(guardiani, commentable == "true")
ggplot(data = df, mapping = aes(x = webPublicationDate)) +
  geom_histogram(binwidth = 30, colour = "white") +
  scale_x_date(labels = date_format("%Y"))

# Base-graphics alternatives, left disabled:
# hist(df$webPublicationDate, "year", "%y", freq = T)
# hist(df$webPublicationDate, "year", "%y", freq = T)
# hist(df$webPublicationDate, "month", "%m", freq = T)
# Commentable articles published during calendar year 2015.
# The lower bound is inclusive (>=): the previous strict ">" silently dropped
# articles dated 2015-01-01. Bounds are explicit Date values rather than
# strings coerced during comparison.
guardian2015a <- guardiani %>%
  filter(
    commentable == "true",
    webPublicationDate >= as.Date("2015-01-01"),
    webPublicationDate < as.Date("2016-01-01")
  )
# Earlier notes recorded N=114 here while the plot title said N=123; the title
# below is now computed from the subset so the two cannot drift apart.
# summary(as.numeric(guardian2015a$comments))
# topics<-as.data.frame(round(prop.table(table(guardian2015a$sectionId))*100))
# topics%>%filter(Freq>1)

# Share of the 2015 subset per section, as rounded percentages; only sections
# whose rounded share exceeds 1% are plotted.
topics <- as.data.frame(round(prop.table(table(guardian2015a$sectionId)) * 100))
df <- topics %>% filter(Freq > 1)
#df<-as.data.frame(table(guardiani$sectionId))
library(ggplot2)
# guides(fill = "none") replaces the deprecated guides(fill = FALSE) form.
ggplot(df, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  guides(fill = "none") +
  ggtitle(paste0("N=", nrow(guardian2015a)))
