#install.packages("udpipe")
library(dplyr)
library(ggplot2)
library(udpipe)
news <- read.csv('C:\\Users\\r631758\\Desktop\\r631758\\R codes\\NLPExample\\data\\abcnews-date-text.csv', header = T, stringsAsFactors = F)
news %>% group_by(publish_date) %>% count() %>% arrange(desc(n))
library(stringr)
news_more <- news %>% mutate(year = str_sub(publish_date, 1, 4),
                             month = str_sub(publish_date, 5, 6),
                             date = str_sub(publish_date, 7, 8))

news_more %>% group_by(year) %>% count() %>% ggplot() + geom_bar(aes(year, n), stat = 'identity')

# The English model only needs to be downloaded once;
# udpipe_load_model() below expects the downloaded .udpipe file to be present.
model <- udpipe_download_model(language = "english")
## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.0/master/inst/udpipe-ud-2.0-170801/english-ud-2.0-170801.udpipe to C:/Users/r631758/Desktop/r631758/R codes/NLPExample/english-ud-2.0-170801.udpipe
udmodel_english <- udpipe_load_model(file = 'english-ud-2.0-170801.udpipe')
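
# A re-run friendly variant (a sketch, assuming the data.frame returned by
# udpipe_download_model() records the local path of the model in its
# file_model column): download only when the file is missing, then load it
# without hard-coding the filename.
if (!file.exists('english-ud-2.0-170801.udpipe')) {
  model <- udpipe_download_model(language = "english")
  udmodel_english <- udpipe_load_model(file = model$file_model)
} else {
  udmodel_english <- udpipe_load_model(file = 'english-ud-2.0-170801.udpipe')
}
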
news_more_2008 <- news_more %>% filter(year == 2008 & month == 10)
s <- udpipe_annotate(udmodel_english, news_more_2008$headline_text)
x <- data.frame(s)
library(lattice)
stats <- txt_freq(x$upos)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = stats, col = "yellow", 
         main = "UPOS (Universal Parts of Speech)\n frequency of occurrence", 
         xlab = "Freq")

Most Occurring Nouns

## NOUNS
stats <- subset(x, upos %in% c("NOUN")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "cadetblue", 
         main = "Most occurring nouns", xlab = "Freq")

Most Occurring Adjectives

It’d be very hard to find a news agency that doesn’t like exaggerating, and in English you exaggerate an object with adjectives. So, let’s explore the most occurring adjectives.

## ADJECTIVES
stats <- subset(x, upos %in% c("ADJ")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "purple", 
         main = "Most occurring adjectives", xlab = "Freq")

Most Occurring Verbs

The reporting style of media outlets can be understood very well from the kinds of verbs they use. Do they bring any sign of optimism, or do they just infuse pessimism? The verbs in use can answer that.

## VERBS
stats <- subset(x, upos %in% c("VERB")) 
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "gold", 
         main = "Most occurring Verbs", xlab = "Freq")

Automated Keyword Extraction with RAKE
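
RAKE (Rapid Automatic Keyword Extraction) scores candidate keywords by combining the frequency and co-occurrence of their member words. The chunk below is a minimal sketch of running udpipe's keywords_rake() on the annotated data frame x built above, treating only nouns and adjectives as relevant words; it follows the udpipe package vignette rather than code from the original analysis.

## RAKE
stats <- keywords_rake(x = x, term = "lemma", group = "doc_id",
                       relevant = x$upos %in% c("NOUN", "ADJ"))
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
barchart(key ~ rake, data = head(subset(stats, freq > 3), 20), col = "red",
         main = "Keywords identified by RAKE", xlab = "Rake")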

Top Noun-Verb Pairs as Keyword Pairs

In English (and probably in many languages), a simple noun and a verb can form a phrase. Take "Dog barked": with the noun "Dog" and the verb "barked", we can understand the context of the sentence. Reverse-engineering the same idea on this headlines data, let us bring out the top phrases, which are just keywords/topics.

## Using a sequence of POS tags (noun phrases / verb phrases)
x$phrase_tag <- as_phrasemachine(x$upos, type = "upos")
stats <- keywords_phrases(x = x$phrase_tag, term = tolower(x$token), 
                          pattern = "(A|N)*N(P+D*(A|N)*N)*", 
                          is_regex = TRUE, detailed = FALSE)
stats <- subset(stats, ngram > 1 & freq > 3)
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
barchart(key ~ freq, data = head(stats, 20), col = "magenta", 
         main = "Keywords - simple noun phrases", xlab = "Frequency")