# Remove every non-hidden object from the current (global) environment so the
# script starts from an empty workspace. Loaded packages, options and the
# working directory are not affected by this call.
# NOTE(review): wiping the caller's workspace is usually discouraged in shared
# scripts; running in a fresh R session achieves the same thing - consider
# removing this line.
rm(list=ls())
library(rvest)
library(plyr)
library(pdftools)
library(glue)
library(tidyverse)
library(tm)
library(textcat)
library(tidytext)
library(stringr)
library(dplyr)
library(purrr)
library(textmineR)
library(quanteda)
library(stringr)
# ---- Load the mention metadata and download the cited documents -------------

# NOTE(review): a hard-coded working directory ties the script to one machine.
# It is kept because the following steps use paths relative to it.
setwd("~/Desktop/R/Introductory Data Science/Group Project")

# Read the export and keep only the columns used further down. The file name
# is written with plain spaces: "\ " is not a valid escape sequence inside an
# R string and makes the whole script fail to parse.
data_frame <- read.csv("Covid-19 Policy Mentions - 2020-04-06.csv") %>%
  select(
    Outlet.or.Author, Mention.Title, Country, Mention.Date,
    Mention.URL, Research.Output.Title, Publication.Date
  )

# Distinct mention URLs, in order of first appearance.
urls <- unique(as.character(data_frame$Mention.URL))

# First row seen for each URL, so row i describes urls[i].
df_uniq_url <- data_frame[!duplicated(data_frame[, "Mention.URL"]), ]

# Local file name for each URL. seq_along()/seq_len() are used instead of
# seq(1, n, by = 1), which raises an error when there are no rows at all.
pdf_names <- paste0("document_", seq_along(urls), ".pdf")
df_uniq_url$document <- paste0(
  "document_", seq_len(nrow(df_uniq_url)), ".pdf"
)

# Download every URL into the "PDF downloads" sub-directory. mode = "wb"
# writes the files as binary so PDFs are not corrupted on Windows.
setwd("PDF downloads")
mapply(download.file, urls, pdf_names, MoreArgs = list(mode = "wb"))
# ---- Keep only the English-language documents ------------------------------

# Read every file found in the working directory into a tm corpus, using the
# PDF reader.
file_vector <- list.files(path = getwd())
corpus <- Corpus(
  URISource(file_vector),
  readerControl = list(reader = readPDF)
)

# Guess the language of each document and delete the files whose guess is not
# "english".
# NOTE(review): this assumes names(languages) are the file names relative to
# the working directory - confirm against textcat's return value.
languages <- textcat(corpus)
unlink(names(languages)[languages != "english"])

# Re-list the directory and rebuild the corpus from the files that remain.
file_vector <- list.files(path = getwd())
corpus <- Corpus(
  URISource(file_vector),
  readerControl = list(reader = readPDF)
)
# Keep only the metadata rows whose document file is still present.
# Exact matching with %in% is used instead of pasting the file names into one
# regular expression: with no files left that pattern is "" and matches every
# row, and the "." in each file name would act as a wildcard.
df_uniq_url <- df_uniq_url[df_uniq_url$document %in% file_vector, ]
# ---- Turn the tm corpus into a long (document, text) table -----------------

# sapply() over the corpus is simplified into a matrix with one column per
# document; data.frame() prefixes each column name with "text.".
# NOTE(review): presumably row 1 holds each document's content and row 2 its
# metadata - confirm against the tm document structure.
data_poli <- data.frame(
  text = sapply(corpus, identity),
  stringsAsFactors = FALSE
)

# Drop the second row so only the first row (the content) remains.
data_poli <- data_poli[-2, ]

# One row per document: the column name goes to `document`, the cell to `text`.
data_poli <- pivot_longer(
  data_poli,
  cols = starts_with("text"),
  names_to = "document",
  values_to = "text"
)

# Strip only the leading "text." prefix added by data.frame(). The pattern is
# anchored and the dot escaped so that "text" followed by any character inside
# a file name is left untouched.
data_poli$document <- sub("^text\\.", "", data_poli$document)
# ---- Attach the extracted text to the mention metadata ---------------------

# Inner join on the generated file name; rows without a matching document on
# either side are dropped.
merged_df <- merge(df_uniq_url, data_poli, by = "document")

# Collapse each document's text into a single string. as.character() on a
# list column deparses multi-element entries into 'c("...", "...")' with
# literal "\n" escapes, which would pollute the tokens later on. Entries that
# already are a single string come out unchanged.
merged_df$text <- vapply(
  merged_df$text,
  function(doc_text) paste(as.character(doc_text), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE
)

# Keep the analysis columns and give two of them shorter names.
merged_df <- merged_df %>%
  select(document, Mention.Title, Mention.Date, text, Outlet.or.Author) %>%
  rename(date = Mention.Date, source = Outlet.or.Author)
# ---- Build the quanteda corpus and document-feature matrix -----------------

# corpus() here is the quanteda constructor; the tm object named `corpus` does
# not interfere because R skips non-function objects when resolving a call.
corpus_poli <- corpus(
  merged_df,
  docid_field = "document",
  text_field = "text"
)

# Tokenise explicitly instead of passing stem=/remove= to dfm() on a corpus:
# that form is deprecated in quanteda 3 and removed in later releases. The
# steps keep the same order as before: lower-case, drop stopwords, stem.
dtm_poli <- corpus_poli %>%
  tokens() %>%
  tokens_tolower() %>%
  tokens_remove(stopwords()) %>%
  tokens_wordstem() %>%
  dfm()