#install.packages("udpipe")
library(dplyr)
## Warning: Installed Rcpp (0.12.13) different from Rcpp used to build dplyr (0.12.11).
## Please reinstall dplyr to avoid random crashes or undefined behavior.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(udpipe)
## Warning: package 'udpipe' was built under R version 3.4.4
# Load the ABC News headlines CSV. Logical arguments are spelled out as
# TRUE/FALSE because T and F are ordinary variables that can be reassigned.
# NOTE(review): the path is machine-specific; adjust it to where the CSV lives.
news <- read.csv(
  "C:\\Users\\r631758\\Desktop\\r631758\\R codes\\NLPExample\\data\\abcnews-date-text.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Number of headlines per publish date, busiest dates first. count() groups,
# tallies and sorts in one step and does not leave the result grouped.
news %>%
  count(publish_date, sort = TRUE)
library(stringr)
# Slice publish_date by character position into year (1-4), month (5-6) and
# day (7-8) strings; the day is stored in a column named `date`.
# Assumes publish_date is formatted as yyyymmdd - TODO confirm against the CSV.
news_more <- mutate(
  news,
  year = str_sub(publish_date, 1, 4),
  month = str_sub(publish_date, 5, 6),
  date = str_sub(publish_date, 7, 8)
)

# Bar chart of the number of headlines per year.
ggplot(count(group_by(news_more, year))) +
  geom_bar(aes(year, n), stat = "identity")

library(udpipe)
# Download the English udpipe model into the working directory, then load it.
# The model is loaded from the path reported by the download (file_model)
# rather than from a hard-coded file name, so the script keeps working when
# the default model release (and therefore the file name) changes.
# NOTE(review): this contacts the model repository on every run; if the model
# file is already on disk you can skip the download and pass its path to
# udpipe_load_model() directly.
model <- udpipe_download_model(language = "english")
udmodel_english <- udpipe_load_model(file = model$file_model)
# Keep only the headlines published in October 2008. `year` and `month` are
# character columns (built with str_sub), so compare them against strings
# instead of relying on implicit number-to-string coercion, which would
# silently match nothing for zero-padded months such as "08".
news_more_2008 <- news_more %>%
  filter(year == "2008", month == "10")

# Run the udpipe annotation over the selected headlines and convert the result
# to a data frame; later steps read its upos and token columns.
s <- udpipe_annotate(udmodel_english, news_more_2008$headline_text)
x <- as.data.frame(s)
library(lattice)
# Frequency table of the universal part-of-speech tags found in the annotation.
# The key column becomes a factor whose levels are the keys in reversed row
# order, which fixes the order in which the bars are laid out.
stats <- within(txt_freq(x$upos), {
  key <- factor(key, levels = rev(key))
})
barchart(
  key ~ freq,
  data = stats,
  col = "yellow",
  main = "UPOS (Universal Parts of Speech)\n frequency of occurrence",
  xlab = "Freq"
)

# Most occurring nouns
## NOUNS
# Count how often each token tagged NOUN occurs, then chart the first 20 rows
# of the frequency table.
stats <- txt_freq(x$token[x$upos %in% "NOUN"])
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(
  key ~ freq,
  data = head(stats, 20),
  col = "cadetblue",
  main = "Most occurring nouns",
  xlab = "Freq"
)

# Most occurring adjectives
# It would be hard to find a news agency that does not like to exaggerate, and
# in English you exaggerate an object with adjectives. So, let's explore the
# most frequently occurring adjectives.
## ADJECTIVES
# Count how often each token tagged ADJ occurs, then chart the first 20 rows
# of the frequency table.
stats <- txt_freq(with(x, token[upos %in% "ADJ"]))
stats <- within(stats, {
  key <- factor(key, levels = rev(key))
})
barchart(
  key ~ freq,
  data = head(stats, 20),
  col = "purple",
  main = "Most occurring adjectives",
  xlab = "Freq"
)

# Top noun-verb pairs as keyword pairs
# In English (and probably in many other languages), a noun and a verb alone
# can form a simple phrase. For example, in "Dog barked" the noun "dog" and the
# verb "barked" are enough to understand the context of the sentence.
# Reverse-engineering the same idea on this headlines data, let us bring out
# the top phrases, which are effectively keywords/topics. (Note: the pattern
# used below matches simple noun phrases.)
## Keyword extraction from part-of-speech sequences (noun / verb phrases)
# Recode the UPOS tags with as_phrasemachine() so that the tag sequence can be
# matched against the phrase pattern below.
x[["phrase_tag"]] <- as_phrasemachine(x[["upos"]], type = "upos")
stats <- keywords_phrases(
  x = x[["phrase_tag"]],
  term = tolower(x[["token"]]),
  pattern = "(A|N)*N(P+D*(A|N)*N)*",
  is_regex = TRUE,
  detailed = FALSE
)
# Keep only multi-word phrases that occur more than three times.
stats <- subset(stats, ngram > 1 & freq > 3)
# Factor levels are the keywords in reversed row order, fixing the bar order.
stats <- within(stats, {
  key <- factor(keyword, levels = rev(keyword))
})
barchart(
  key ~ freq,
  data = head(stats, 20),
  col = "magenta",
  main = "Keywords - simple noun phrases",
  xlab = "Frequency"
)
