# Start from a clean slate: remove every object that ls() reports in the
# calling environment. ls() skips names beginning with "." by default, and
# this call does not detach packages or reset options or the working directory.
# NOTE(review): this also wipes the workspace of anyone who source()s the
# script interactively; restarting the R session is the usual alternative.
rm(list=ls())

library(rvest)
library(plyr)
library(pdftools)
library(glue)
library(tidyverse)
library(tm)
library(textcat)
library(tidytext)
library(stringr)
library(dplyr)
library(purrr)
library(textmineR)
library(quanteda)
library(stringr)

# ---- Load the policy-mention export; assign one PDF name per unique URL ----
# NOTE(review): the working directory is hard-coded to one machine's layout;
# the relative paths used further down are resolved against it.
setwd("~/Desktop/R/Introductory Data Science/Group Project")

# Read the export and keep only the listed columns.
data_frame <- read.csv("Covid-19\ Policy\ Mentions\ -\ 2020-04-06.csv") %>%
  select(Outlet.or.Author, Mention.Title, Country, Mention.Date,
         Mention.URL, Research.Output.Title, Publication.Date)

# One row per distinct mention URL (the first occurrence is kept).
df_uniq_url <- data_frame[!duplicated(data_frame[["Mention.URL"]]), ]

# Number the documents 1..n. seq_len() + sprintf() return a zero-length result
# for an empty table, where seq(1, 0, by = 1) would raise an error.
df_uniq_url$document <- sprintf("document_%d.pdf", seq_len(nrow(df_uniq_url)))

# Take the URLs and the file names from the same rows, so URL i is always
# paired with the document name stored next to it in df_uniq_url.
urls <- as.character(df_uniq_url$Mention.URL)
pdf_names <- df_uniq_url$document
# ---- Download every PDF and keep only the English-language documents ----
# NOTE(review): this changes the working directory for the rest of the script;
# every file name below is relative to "PDF downloads".
setwd("PDF\ downloads")

# Fetch urls[i] into pdf_names[i]. mode = "wb" writes the bytes untouched,
# which binary files such as PDFs require on Windows.
mapply(download.file, urls, pdf_names, MoreArgs = list(mode = "wb"))

# First pass: parse everything in the folder so a language can be detected
# for each document.
file_vector <- list.files(path = getwd())
corpus <- Corpus(URISource(file_vector),
                 readerControl = list(reader = readPDF))
languages <- textcat(corpus)

# Delete the files whose detected language is not English. which() drops the
# positions where the comparison is NA (no language detected), so unlink()
# never receives an NA path; those documents are left in place.
non_english <- names(languages)[which(languages != "english")]
unlink(non_english)

# Second pass: rebuild the corpus from the files that are left on disk.
file_vector <- list.files(path = getwd())
corpus <- Corpus(URISource(file_vector),
                 readerControl = list(reader = readPDF))

# Keep only the metadata rows whose PDF is still present. Exact matching with
# %in% (rather than a regex joined from all file names) does not treat "." as
# a wildcard and keeps no rows when file_vector is empty.
df_uniq_url <- df_uniq_url[df_uniq_url$document %in% file_vector, ]

# ---- Attach the extracted text to the per-document metadata ----
# Collapse each parsed document into one string, keyed by its document id.
# A document's content can hold several strings; joining them here keeps
# `text` a plain character column. Calling as.character() on a list column
# instead would deparse such entries into literal "c(...)" text with escaped
# newlines, which then leaks into the tokens.
# NOTE(review): assumes the corpus element names are the PDF file names
# (e.g. "document_1.pdf"), as the join on `document` below requires.
doc_text <- vapply(
  corpus,
  function(doc) paste(as.character(doc), collapse = "\n"),
  character(1)
)
data_poli <- data.frame(document = as.character(names(doc_text)),
                        text = unname(doc_text),
                        stringsAsFactors = FALSE)

# Inner join on the document name, then keep and rename the columns used by
# the corpus construction that follows.
merged_df <- merge(df_uniq_url, data_poli, by = "document") %>%
  select(document, Mention.Title, Mention.Date, text, Outlet.or.Author) %>%
  rename(date = Mention.Date, source = Outlet.or.Author)
# ---- Build the corpus and the document-feature matrix ----
# `corpus` is also the name of the tm object created earlier in this script;
# in call position R looks for a function, so this still resolves to the
# corpus() constructor (presumably quanteda's -- confirm no other attached
# package exports one).
corpus_poli <- corpus(merged_df, docid_field = "document",
                      text_field = "text")

# Preprocess on the tokens, then tabulate: lower-case, drop stopwords, stem.
# The tolower/stem/remove arguments of dfm() are deprecated since quanteda
# 3.0, so the steps are spelled out with the tokens_*() functions instead.
# tolower = FALSE because the tokens are already lower-cased above.
dtm_poli <- corpus_poli %>%
  tokens() %>%
  tokens_tolower() %>%
  tokens_remove(stopwords()) %>%
  tokens_wordstem() %>%
  dfm(tolower = FALSE)