## Dataset

The data set under analysis can be downloaded from the following link: Data set

Four folders can be found, one for each of four languages; each folder contains corpora of text obtained from blogs, news, and Twitter. In this report we are going to work with the English (en_US) folder.

### Corpus generation

In order to work with the files, a corpus is created with the tm package.

library(tm)
source <- DirSource("Data/en_US")   # directory containing the three English (en_US) files
VC_en_US <- VCorpus(source)         # volatile corpus built from that directory

### Statistics

Now, general information about each file is obtained:

# Number of characters of each file
for (i in 1:3) {
  a <- VC_en_US[[i]]
  print(a)
}
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 206824505
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 203223159
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 162096031
# Number of text lines per file
for (i in 1:3) {
  t_i <- as.character(VC_en_US[[i]])
  largo <- length(t_i)   # "largo" (Spanish for length) = number of lines in the file
  print(paste("File ", meta(VC_en_US[[i]], "id"), " has ", largo, " lines of text", sep = ""))
}
## [1] "File en_US.blogs.txt has 899288 lines of text"
## [1] "File en_US.news.txt has 1010242 lines of text"
## [1] "File en_US.twitter.txt has 2360148 lines of text"

In order to continue, the Corpus is now cleaned.

# Cleaning
VC_en_US <- tm_map(VC_en_US, content_transformer(tolower))       # conversion to lower case
VC_en_US <- tm_map(VC_en_US, removeWords, stopwords("english"))  # removal of stopwords
VC_en_US <- tm_map(VC_en_US, stemDocument)                       # stemming: reducing inflected or derived words to their stem
VC_en_US <- tm_map(VC_en_US, removeNumbers)                      # removal of numbers
VC_en_US <- tm_map(VC_en_US, stripWhitespace)                    # elimination of extra whitespace

### Statistics on single words

Now we are going to determine the most common words in the corpus VC_en_US, per document, using the tidytext package; the AFINN sentiment lexicon will then be used to see which words contribute most to positive and negative sentiment.

library(tidytext)
library(dplyr)
library(ggplot2)

#### For en_US_blogs

data(stop_words)
en_US_blogs <- data_frame(line = 1:length(VC_en_US[["en_US.blogs.txt"]][["content"]]),
                          text = VC_en_US[["en_US.blogs.txt"]][["content"]])
en_US_blogs <- unnest_tokens(en_US_blogs, word, text)
en_US_blogs <- anti_join(en_US_blogs, stop_words)
en_US_blogs <- count(en_US_blogs, word, sort = TRUE)
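
Since the counts are sorted in decreasing order, the most frequent words can be inspected directly, for example:

head(en_US_blogs, 10)   # the ten most frequent words (after stopword removal and stemming) in the blogs file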

get_sentiments("afinn")
## # A tibble: 2,476 x 2
##    word       score
##    <chr>      <int>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,466 more rows
en_US_blogs_sentiments <- left_join(en_US_blogs, get_sentiments("afinn"))
en_US_blogs_sentiments$value <- en_US_blogs_sentiments$n * en_US_blogs_sentiments$score   # word frequency times AFINN score
en_US_blogs_sentiments$class <- ifelse(en_US_blogs_sentiments$value > 0, "positive", "negative")
en_US_blogs_sentiments_summary <- aggregate(data = en_US_blogs_sentiments, value ~ class, sum)
en_US_blogs_sentiments_summary$value <- abs(en_US_blogs_sentiments_summary$value)
g1 <- ggplot(en_US_blogs_sentiments_summary, aes(class, value)) + geom_col()
g1

Words that contribute the most to positive and negative feelings:

positive<-arrange(en_US_blogs_sentiments,desc(value))
positive<-positive[c(1:10),]
positive<-positive$word
print("Positive words that contribute the most: ")
## [1] "Positive words that contribute the most: "
positive
##  [1] "love"    "fun"     "hope"    "nice"    "perfect" "top"     "enjoy"  
##  [8] "win"     "care"    "god"
negative<-arrange(en_US_blogs_sentiments,value)
negative<-negative[c(1:10),]
negative<-negative$word
print("Negative words that contribute the most: ")
## [1] "Negative words that contribute the most: "
negative
##  [1] "bad"   "die"   "lost"  "kill"  "miss"  "dead"  "hate"  "wrong"
##  [9] "hard"  "death"

#### For en_US_news

data(stop_words)
en_US_news <- data_frame(line = 1:length(VC_en_US[["en_US.news.txt"]][["content"]]),
                         text = VC_en_US[["en_US.news.txt"]][["content"]])
en_US_news <- unnest_tokens(en_US_news, word, text)
en_US_news <- anti_join(en_US_news, stop_words)
en_US_news <- count(en_US_news, word, sort = TRUE)

get_sentiments("afinn")
## # A tibble: 2,476 x 2
##    word       score
##    <chr>      <int>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,466 more rows
en_US_news_sentiments <- left_join(en_US_news, get_sentiments("afinn"))
en_US_news_sentiments$value <- en_US_news_sentiments$n * en_US_news_sentiments$score
en_US_news_sentiments$class <- ifelse(en_US_news_sentiments$value > 0, "positive", "negative")
en_US_news_sentiments_summary <- aggregate(data = en_US_news_sentiments, value ~ class, sum)
en_US_news_sentiments_summary$value <- abs(en_US_news_sentiments_summary$value)
g2 <- ggplot(en_US_news_sentiments_summary, aes(class, value)) + geom_col()
g2

Words that contribute the most to positive and negative feelings:

positive<-arrange(en_US_news_sentiments,desc(value))
positive<-positive[c(1:10),]
positive<-positive$word
print("Positive words that contribute the most: ")
## [1] "Positive words that contribute the most: "
positive
##  [1] "win"     "love"    "top"     "support" "won"     "care"    "hope"   
##  [8] "fan"     "fun"     "award"
negative<-arrange(en_US_news_sentiments,value)
negative<-negative[c(1:10),]
negative<-negative$word
print("Negative words that contribute the most: ")
## [1] "Negative words that contribute the most: "
negative
##  [1] "lost"   "kill"   "loss"   "fire"   "bad"    "die"    "death" 
##  [8] "pay"    "miss"   "arrest"

#### For en_US_twitter

data(stop_words)
en_US_twitter <- data_frame(line = 1:length(VC_en_US[["en_US.twitter.txt"]][["content"]]),
                            text = VC_en_US[["en_US.twitter.txt"]][["content"]])
en_US_twitter <- unnest_tokens(en_US_twitter, word, text)
en_US_twitter <- anti_join(en_US_twitter, stop_words)
en_US_twitter <- count(en_US_twitter, word, sort = TRUE)

get_sentiments("afinn")
## # A tibble: 2,476 x 2
##    word       score
##    <chr>      <int>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,466 more rows
en_US_twitter_sentiments <- left_join(en_US_twitter, get_sentiments("afinn"))
en_US_twitter_sentiments$value <- en_US_twitter_sentiments$n * en_US_twitter_sentiments$score
en_US_twitter_sentiments$class <- ifelse(en_US_twitter_sentiments$value > 0, "positive", "negative")
en_US_twitter_sentiments_summary <- aggregate(data = en_US_twitter_sentiments, value ~ class, sum)
en_US_twitter_sentiments_summary$value <- abs(en_US_twitter_sentiments_summary$value)
g3 <- ggplot(en_US_twitter_sentiments_summary, aes(class, value)) + geom_col()
g3

Words that contribute the most to positive and negative feelings:

positive<-arrange(en_US_twitter_sentiments,desc(value))
positive<-positive[c(1:10),]
positive<-positive$word
print("Positive words that contribute the most: ")
## [1] "Positive words that contribute the most: "
positive
##  [1] "love"    "lol"     "fun"     "hope"    "win"     "haha"    "nice"   
##  [8] "wow"     "awesome" "fan"
negative<-arrange(en_US_twitter_sentiments,value)
negative<-negative[c(1:10),]
negative<-negative$word
print("Negative words that contribute the most: ")
## [1] "Negative words that contribute the most: "
negative
##  [1] "fuck"  "shit"  "bad"   "hate"  "miss"  "ass"   "bitch" "damn" 
##  [9] "hell"  "kill"

It can be seen that people use many more insults on Twitter than in the other communication media.
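
A rough way to quantify this is to compare the share of negative sentiment in each source using the summary tables computed above; the small helper below is only a sketch, and negative AFINN score is a crude proxy for insults:

# Proportion of the total (absolute) sentiment score that is negative, per source
neg_share <- function(s) s$value[s$class == "negative"] / sum(s$value)
neg_share(en_US_blogs_sentiments_summary)
neg_share(en_US_news_sentiments_summary)
neg_share(en_US_twitter_sentiments_summary)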

### Statistics on n-grams, with n = 4 (“fourgram”)

I decided to use fourgrams since a typical phrase usually needs about four words. The 20 most common fourgrams are presented for each file.

#### For en_US_blogs

en_US_blogs <- data_frame(line = 1:length(VC_en_US[["en_US.blogs.txt"]][["content"]]),
                          text = VC_en_US[["en_US.blogs.txt"]][["content"]])
en_US_blogs <- unnest_tokens(en_US_blogs, fourgram, text, token = "ngrams", n = 4)
en_US_blogs <- count(en_US_blogs, fourgram, sort = TRUE)

Most_common_en_US_blogs_four_grams <- en_US_blogs$fourgram[1:20]
Most_common_en_US_blogs_four_grams
##  [1] "amazon servic llc amazon"                   
##  [2] "servic llc amazon eu"                       
##  [3] "incorpor item c pp"                         
##  [4] "vest interests vest interests"              
##  [5] "interests vest interests vest"              
##  [6] "come amazon servic llc"                     
##  [7] "content provid subject chang"               
##  [8] "provid subject chang remov"                 
##  [9] "subject chang remov time"                   
## [10] "advertis fee advertis link"                 
## [11] "advertis link amazon.com amazon.ca"         
## [12] "amazon amazon.es certain content"           
## [13] "amazon eu associ programm"                  
## [14] "amazon eu content provid"                   
## [15] "amazon.ca amazon.co.uk amazon.de amazon.fr" 
## [16] "amazon.co.uk amazon.de amazon.fr amazon"    
## [17] "amazon.com amazon.ca amazon.co.uk amazon.de"
## [18] "amazon.de amazon.fr amazon amazon.es"       
## [19] "amazon.es certain content appear"           
## [20] "amazon.fr amazon amazon.es certain"

#### For en_US_news

en_US_news <- data_frame(line = 1:length(VC_en_US[["en_US.news.txt"]][["content"]]),
                         text = VC_en_US[["en_US.news.txt"]][["content"]])
en_US_news <- unnest_tokens(en_US_news, fourgram, text, token = "ngrams", n = 4)
en_US_news <- count(en_US_news, fourgram, sort = TRUE)
Most_common_en_US_news_four_grams <- en_US_news$fourgram[1:20]
Most_common_en_US_news_four_grams
##  [1] "g protein g carbohydrate"    "dow jone industri averag"   
##  [3] "protein g carbohydrate g"    "per serving calories g"     
##  [5] "g fat g saturated"           "presid georg w bush"        
##  [7] "g fiber mg sodium"           "new york new jersey"        
##  [9] "calories g protein g"        "g carbohydrate g fat"       
## [11] "mg cholesterol mg sodium"    "rock roll hall fame"        
## [13] "million cent per share"      "martin luther king jr"      
## [15] "carbohydrate g fat g"        "saturated mg cholesterol mg"
## [17] "fat g saturated mg"          "g saturated mg cholesterol" 
## [19] "calories g fat g"            "mg sodium g fiber"

#### For en_US_twitter

en_US_twitter <- data_frame(line = 1:length(VC_en_US[["en_US.twitter.txt"]][["content"]]),
                            text = VC_en_US[["en_US.twitter.txt"]][["content"]])
en_US_twitter <- unnest_tokens(en_US_twitter, fourgram, text, token = "ngrams", n = 4)
en_US_twitter <- count(en_US_twitter, fourgram, sort = TRUE)
Most_common_en_US_twitter_four_grams <- en_US_twitter$fourgram[1:20]
Most_common_en_US_twitter_four_grams
##  [1] "thank follow look forward" "happi mother day mom"     
##  [3] "cake cake cake cake"       "just finish mi run"       
##  [5] "happi cinco de mayo"       "happi st patrick day"     
##  [7] "make dream come true"      "s o new follow"           
##  [9] "let us know can"           "ass ass ass ass"          
## [11] "let us know think"         "pleas pleas pleas pleas"  
## [13] "martin luther king jr"     "pleas let us know"        
## [15] "get real reward just"      "real reward just watch"   
## [17] "reward just watch tv"      "happi mother day mother"  
## [19] "happi new year everyone"   "let us know need"

It can be seen that the subject matter of the texts differs from file to file. In the blogs file, fourgrams related to “amazon” are the most common; this is probably related to the presence of affiliate advertising in blogs. In the news file, food-related terms (nutrition information such as grams of protein, fat, and sodium) appear very frequently, probably also because of advertising or recipe boilerplate. In the Twitter file, the most common fourgrams are messages addressed to other people, and these generally have a positive mood.

### Plans for creating the prediction algorithm

After this analysis it is clear that some words and n-grams are used much more often than others. This will be the basis for the prediction algorithm, since it makes it possible to assign probabilities to the words that may appear next, given the preceding words.
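
As a first sketch of that idea (an illustration only, not the final algorithm), the fourgram counts computed above can be turned into conditional probabilities for the fourth word given the three preceding words. The example below uses the Twitter fourgram table and a prefix that appears in it:

library(tidyr)   # for separate()

# Split each fourgram into a three-word prefix and the word that follows it,
# then convert the counts into conditional probabilities P(next word | prefix).
fourgram_model <- en_US_twitter %>%
  separate(fourgram, into = c("w1", "w2", "w3", "next_word"), sep = " ") %>%
  mutate(prefix = paste(w1, w2, w3)) %>%
  group_by(prefix) %>%
  mutate(prob = n / sum(n)) %>%
  ungroup() %>%
  select(prefix, next_word, prob)

# Most probable continuations of an example prefix
fourgram_model %>%
  filter(prefix == "let us know") %>%
  arrange(desc(prob)) %>%
  head(5)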