#install.packages("udpipe")
library(dplyr)
library(ggplot2)
library(udpipe)
news <- read.csv('C:\\Users\\r631758\\Desktop\\r631758\\R codes\\NLPExample\\data\\abcnews-date-text.csv', header = T, stringsAsFactors = F)
news %>% group_by(publish_date) %>% count() %>% arrange(desc(n))
library(stringr)
news_more <- news %>% mutate(year = str_sub(publish_date, 1, 4),
                             month = str_sub(publish_date, 5, 6),
                             date = str_sub(publish_date, 7, 8))

news_more %>% group_by(year) %>% count() %>% ggplot() + geom_bar(aes(year, n), stat = 'identity')

# The English model only needs to be downloaded once;
# udpipe_load_model() below expects the downloaded .udpipe file to be present.
model <- udpipe_download_model(language = "english")
## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.0/master/inst/udpipe-ud-2.0-170801/english-ud-2.0-170801.udpipe to C:/Users/r631758/Desktop/r631758/R codes/NLPExample/english-ud-2.0-170801.udpipe
udmodel_english <- udpipe_load_model(file = 'english-ud-2.0-170801.udpipe')
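
# A re-run friendly variant (a sketch, assuming the data.frame returned by
# udpipe_download_model() records the local path of the model in its
# file_model column): download only when the file is missing, then load it
# without hard-coding the filename.
if (!file.exists('english-ud-2.0-170801.udpipe')) {
  model <- udpipe_download_model(language = "english")
  udmodel_english <- udpipe_load_model(file = model$file_model)
} else {
  udmodel_english <- udpipe_load_model(file = 'english-ud-2.0-170801.udpipe')
}
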
news_more_2008 <- news_more %>% filter(year == 2008 & month == 10)
s <- udpipe_annotate(udmodel_english, news_more_2008$headline_text)
x <- data.frame(s)
library(lattice)
stats <- txt_freq(x$upos)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = stats, col = "yellow", 
         main = "UPOS (Universal Parts of Speech)\n frequency of occurrence", 
         xlab = "Freq")

Most Occurring Nouns

## NOUNS
stats <- subset(x, upos %in% c("NOUN")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue", 
         main = "Most occurring nouns", xlab = "Freq")

Most Occurring Adjectives

It’d be very hard to find a news agency that doesn’t like exaggerating, and in English you exaggerate an object with adjectives. So, let’s explore the most occurring adjectives.

## ADJECTIVES
stats <- subset(x, upos %in% c("ADJ")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "purple", 
         main = "Most occurring adjectives", xlab = "Freq")

Most Occurring Verbs

The reporting style of media outlets can be understood very well from the kinds of verbs they use. Do they bring any sign of optimism, or do they just infuse pessimism? The verbs in use can answer that.

## VERBS
stats <- subset(x, upos %in% c("VERB")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "gold", 
         main = "Most occurring Verbs", xlab = "Freq")

Automated Keyword Extraction with RAKE
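
RAKE (Rapid Automatic Keyword Extraction) scores candidate keywords by combining the frequency and co-occurrence of their member words. The chunk below is a minimal sketch of running udpipe's keywords_rake() on the annotated data frame x built above, treating only nouns and adjectives as relevant words; it follows the udpipe package vignette rather than code from the original analysis.

## RAKE
stats <- keywords_rake(x = x, term = "lemma", group = "doc_id",
                       relevant = x$upos %in% c("NOUN", "ADJ"))
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
barchart(key ~ rake, data = head(subset(stats, freq > 3), 20), col = "red",
         main = "Keywords identified by RAKE", xlab = "Rake")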

Top Noun-Verb Pairs as Keyword Pairs

In English (and probably in many languages), a simple noun and a verb can form a phrase. Take "Dog barked": with the noun "Dog" and the verb "barked", we can understand the context of the sentence. Reverse-engineering the same idea on this headlines data, let us bring out the top phrases, which are just keywords/topics.

## Using a sequence of POS tags (noun phrases / verb phrases)
x$phrase_tag <- as_phrasemachine(x$upos, type = "upos")
stats <- keywords_phrases(x = x$phrase_tag, term = tolower(x$token), 
                          pattern = "(A|N)*N(P+D*(A|N)*N)*", 
                          is_regex = TRUE, detailed = FALSE)
stats <- subset(stats, ngram > 1 & freq > 3)
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
barchart(key ~ freq, data = head(stats, 20), col = "magenta", 
         main = "Keywords - simple noun phrases", xlab = "Frequency")