Data

Our data comes from HC Corpora and can be downloaded directly from here.

First, we will read all our files:

US_blogs <- readLines("./final/en_US/en_US.blogs.txt")
US_news <- readLines("./final/en_US/en_US.news.txt")
US_twitter <- readLines("./final/en_US/en_US.twitter.txt")

DE_twitter <- readLines("./final/de_DE/de_DE.twitter.txt")
DE_news <- readLines("./final/de_DE/de_DE.news.txt")
DE_blogs <- readLines("./final/de_DE/de_DE.blogs.txt")

FI_blogs <- readLines("./final/fi_FI/fi_FI.blogs.txt")
FI_news <- readLines("./final/fi_FI/fi_FI.news.txt")
FI_twitter <- readLines("./final/fi_FI/fi_FI.twitter.txt")

RU_twitter <- readLines("./final/ru_RU/ru_RU.twitter.txt")
RU_news <- readLines("./final/ru_RU/ru_RU.news.txt")
RU_blogs <- readLines("./final/ru_RU/ru_RU.blogs.txt")

Objectives

  1. Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and relationship between the words in the corpora.
  2. Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.
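
As a first step, the table below summarizes the twelve files: in-memory size, line count and word count. A summary like this can be generated with code along the following lines (a sketch, assuming the stringi package; summarize_file is a hypothetical helper, not the exact code behind the table):

library(stringi)

# hypothetical helper: summarize one loaded file
summarize_file <- function(name, lines) {
  data.frame(
    File      = name,
    FileSize  = format(object.size(lines), units = "Mb"), # size of the loaded object
    LineCount = length(lines),
    WordCount = sum(stri_count_words(lines))
  )
}

# example for a single file; repeat for all twelve files and rbind the results
summarize_file("Blogs", US_blogs)
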
##    Country    File FileSize LineCount WordCount
## 11 Finland    News 119.7 Mb    485758  13056791
## 7  Finland   Blogs 130.4 Mb    439785  15805117
## 3  Finland Twitter    44 Mb    285214   3836079
## 2  Germany    News 105.1 Mb    244743  14742498
## 6  Germany Twitter 137.9 Mb    947774  12409380
## 10 Germany   Blogs  51.3 Mb    181958   6769294
## 8   Russia    News 124.9 Mb    196360  45485831
## 4   Russia   Blogs 132.1 Mb    337100  44147004
## 12  Russia Twitter 153.6 Mb    881414  38653696
## 5      USA    News  19.8 Mb     77259   2693898
## 1      USA   Blogs 255.4 Mb    899288  38154238
## 9      USA Twitter   319 Mb   2360148  30218125

Since working with the full datasets would require substantial computational power and time (given their sizes), we now draw samples of the data and clean them for further investigation.

library(qdap)

set.seed(09092020)
# draw 1,000-line samples from each file
sampleUS_blogs <- sample(US_blogs,1000, replace = FALSE)
sampleUS_news <- sample(US_news,1000, replace = FALSE)
sampleUS_twitter <- sample(US_twitter,1000, replace = FALSE)

sampleDE_blogs <- sample(DE_blogs,1000, replace = FALSE)
sampleDE_news <- sample(DE_news,1000, replace = FALSE)
sampleDE_twitter <- sample(DE_twitter,1000, replace = FALSE)

sampleFI_blogs <- sample(FI_blogs,1000, replace = FALSE)
sampleFI_news <- sample(FI_news,1000, replace = FALSE)
sampleFI_twitter <- sample(FI_twitter,1000, replace = FALSE)

sampleRU_blogs <- sample(RU_blogs,1000, replace = FALSE)
sampleRU_news <- sample(RU_news,1000, replace = FALSE)
sampleRU_twitter <- sample(RU_twitter,1000, replace = FALSE)

# combine sampled data per language
en_SampledData <- c(sampleUS_blogs, sampleUS_news, sampleUS_twitter)
de_SampledData <- c(sampleDE_blogs, sampleDE_news, sampleDE_twitter)
fi_SampledData <- c(sampleFI_blogs, sampleFI_news, sampleFI_twitter)
ru_SampledData <- c(sampleRU_blogs, sampleRU_news, sampleRU_twitter)

# detect sentences in combined sampled data
us_sentences <- sent_detect(en_SampledData, language = "en", model = NULL)
de_sentences <- sent_detect(de_SampledData, language = "de", model = NULL)
fi_sentences <- sent_detect(fi_SampledData, language = "fi", model = NULL)
ru_sentences <- sent_detect(ru_SampledData, language = "ru", model = NULL)

# remove unneeded variables
remove(sampleUS_blogs, sampleUS_news, sampleUS_twitter,
       sampleDE_blogs, sampleDE_news, sampleDE_twitter,
       sampleRU_blogs, sampleRU_news, sampleRU_twitter,
       sampleFI_blogs, sampleFI_news, sampleFI_twitter,
       en_SampledData, de_SampledData, fi_SampledData, ru_SampledData,
       US_blogs, US_news, US_twitter,
       DE_blogs, DE_news, DE_twitter,
       RU_blogs, RU_news, RU_twitter,
       FI_blogs, FI_news, FI_twitter)

Here we start cleaning the sentences collected from the samples above.

library(tm)
# vectorize my ENGLISH data for further cleaning
us_sentences <- VCorpus(VectorSource(us_sentences))

# cleaning of the vectorized data is done in several steps:
us_sentences <- tm_map(us_sentences, removeNumbers) # remove numbers
us_sentences <- tm_map(us_sentences, stripWhitespace) # remove extra whitespace
us_sentences <- tm_map(us_sentences, content_transformer(tolower)) # lowercase
us_sentences <- tm_map(us_sentences, removePunctuation) # remove punctuation
us_sentences <- tm_map(us_sentences, removeWords, stopwords("english")) # remove stop words

# vectorize my GERMAN data for further cleaning
de_sentences <- VCorpus(VectorSource(de_sentences))

# cleaning of the vectorized data is done in several steps:
de_sentences <- tm_map(de_sentences, removeNumbers) # remove numbers
de_sentences <- tm_map(de_sentences, stripWhitespace) # remove extra whitespace
de_sentences <- tm_map(de_sentences, content_transformer(tolower)) # lowercase
de_sentences <- tm_map(de_sentences, removePunctuation) # remove punctuation
de_sentences <- tm_map(de_sentences, removeWords, stopwords("german")) # remove stop words

# vectorize my FINNISH data for further cleaning
fi_sentences <- VCorpus(VectorSource(fi_sentences))

# cleaning of the vectorized data is done in several steps:
fi_sentences <- tm_map(fi_sentences, removeNumbers) # remove numbers
fi_sentences <- tm_map(fi_sentences, stripWhitespace) # remove extra whitespace
fi_sentences <- tm_map(fi_sentences, content_transformer(tolower)) # lowercase
fi_sentences <- tm_map(fi_sentences, removePunctuation) # remove punctuation
fi_sentences <- tm_map(fi_sentences, removeWords, stopwords("finnish")) # remove stop words

# vectorize my RUSSIAN data for further cleaning
ru_sentences <- VCorpus(VectorSource(ru_sentences))

# cleaning of the vectorized data is done in several steps:
ru_sentences <- tm_map(ru_sentences, removeNumbers) # remove numbers
ru_sentences <- tm_map(ru_sentences, stripWhitespace) # remove extra whitespace
ru_sentences <- tm_map(ru_sentences, content_transformer(tolower)) # lowercase
ru_sentences <- tm_map(ru_sentences, removePunctuation) # remove punctuation
ru_sentences <- tm_map(ru_sentences, removeWords, stopwords("russian")) # remove stop words
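
Since the four cleaning pipelines above differ only in the stop-word list, the repetition could also be factored into a small helper. A minimal sketch (clean_corpus is a hypothetical name, not part of tm):

clean_corpus <- function(corpus, stopword_lang) {
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  tm_map(corpus, removeWords, stopwords(stopword_lang))
}

# equivalent to the English pipeline above:
# us_sentences <- clean_corpus(VCorpus(VectorSource(us_sentences)), "english")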

Here we structure the cleaned data into data frames for tokenization.

library(RWeka)

# structure into data frames by extracting the text content of each corpus
us_sentences <- data.frame(text = sapply(us_sentences, as.character), stringsAsFactors = FALSE)
de_sentences <- data.frame(text = sapply(de_sentences, as.character), stringsAsFactors = FALSE)
fi_sentences <- data.frame(text = sapply(fi_sentences, as.character), stringsAsFactors = FALSE)
ru_sentences <- data.frame(text = sapply(ru_sentences, as.character), stringsAsFactors = FALSE)

# get 1,2 & 3-grams from ENGLISH structured data
us_UnoGram <- NGramTokenizer(us_sentences, Weka_control(min = 1, max = 1))
us_BiGram <- NGramTokenizer(us_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
us_TriGram <- NGramTokenizer(us_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))

# get 1,2 & 3-grams from GERMAN structured data
de_UnoGram <- NGramTokenizer(de_sentences, Weka_control(min = 1, max = 1))
de_BiGram <- NGramTokenizer(de_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
de_TriGram <- NGramTokenizer(de_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))

# get 1,2 & 3-grams from FINNISH structured data
fi_UnoGram <- NGramTokenizer(fi_sentences, Weka_control(min = 1, max = 1))
fi_BiGram <- NGramTokenizer(fi_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
fi_TriGram <- NGramTokenizer(fi_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))

# get 1,2 & 3-grams from RUSSIAN structured data
ru_UnoGram <- NGramTokenizer(ru_sentences, Weka_control(min = 1, max = 1))
ru_BiGram <- NGramTokenizer(ru_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
ru_TriGram <- NGramTokenizer(ru_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))

Here we visually explore the word frequencies in the 1-, 2- and 3-gram data.

- English Data

library(ggplot2)
library(wordcloud)
library(RColorBrewer)

# Count ENGLISH words of the unigram set
us_Unogram_count <- data.frame(table(us_UnoGram))
us_Unogram_count <- us_Unogram_count[order(us_Unogram_count$Freq,decreasing = TRUE),]
colnames(us_Unogram_count) <- c("Word","Count")


# plotting ENGLISH Unigrams
wordcloud(us_UnoGram, max.words=300, random.order=FALSE, 
          use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

barplot(us_Unogram_count[1:20,2], names = us_Unogram_count$Word[1:20],
        main="Count of top 20 English 1-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count ENGLISH words of the bigram set
us_BiGram_count <- data.frame(table(us_BiGram))
us_BiGram_count <- us_BiGram_count[order(us_BiGram_count$Freq,decreasing = TRUE),]
colnames(us_BiGram_count) <- c("Word","Count")

# plotting ENGLISH bigrams
barplot(us_BiGram_count[1:20,2], names = us_BiGram_count$Word[1:20],
        main="Count of top 20 English 2-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count ENGLISH words of the trigram set
us_TriGram_count <- data.frame(table(us_TriGram))
us_TriGram_count <- us_TriGram_count[order(us_TriGram_count$Freq,decreasing = TRUE),]
colnames(us_TriGram_count) <- c("Word","Count")

# plotting ENGLISH trigrams
barplot(us_TriGram_count[1:20,2], names = us_TriGram_count$Word[1:20],
        main="Count of top 20 English 3-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

- German Data

# Count GERMAN words of the unigram set
de_Unogram_count <- data.frame(table(de_UnoGram))
de_Unogram_count <- de_Unogram_count[order(de_Unogram_count$Freq,decreasing = TRUE),]
colnames(de_Unogram_count) <- c("Word","Count")

# plotting GERMAN words
wordcloud(de_UnoGram, max.words=300, random.order=FALSE,
         use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

barplot(de_Unogram_count[1:20,2], names = de_Unogram_count$Word[1:20],
        main="Count of top 20 German 1-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count GERMAN words of the bigram set
de_BiGram_count <- data.frame(table(de_BiGram))
de_BiGram_count <- de_BiGram_count[order(de_BiGram_count$Freq,decreasing = TRUE),]
colnames(de_BiGram_count) <- c("Word","Count")

# plotting GERMAN bigrams
barplot(de_BiGram_count[1:20,2], names = de_BiGram_count$Word[1:20],
        main="Count of top 20 German 2-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count GERMAN words of the trigram set
de_TriGram_count <- data.frame(table(de_TriGram))
de_TriGram_count <- de_TriGram_count[order(de_TriGram_count$Freq,decreasing = TRUE),]
colnames(de_TriGram_count) <- c("Word","Count")

# plotting GERMAN trigrams
barplot(de_TriGram_count[1:20,2], names = de_TriGram_count$Word[1:20],
        main="Count of top 20 German 3-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

- Finnish Data

# Count FINNISH words of the unigram set
fi_Unogram_count <- data.frame(table(fi_UnoGram))
fi_Unogram_count <- fi_Unogram_count[order(fi_Unogram_count$Freq,decreasing = TRUE),]
colnames(fi_Unogram_count) <- c("Word","Count")

# plotting FINNISH words
wordcloud(fi_UnoGram, max.words=300, random.order=FALSE, 
           use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

barplot(fi_Unogram_count[1:20,2], names = fi_Unogram_count$Word[1:20],
        main="Count of top 20 Finnish 1-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count FINNISH words of the bigram set
fi_BiGram_count <- data.frame(table(fi_BiGram))
fi_BiGram_count <- fi_BiGram_count[order(fi_BiGram_count$Freq,decreasing = TRUE),]
colnames(fi_BiGram_count) <- c("Word","Count")

# plotting FINNISH bigrams
barplot(fi_BiGram_count[1:20,2], names = fi_BiGram_count$Word[1:20],
        main="Count of top 20 Finnish 2-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count FINNISH words of the trigram set
fi_TriGram_count <- data.frame(table(fi_TriGram))
fi_TriGram_count <- fi_TriGram_count[order(fi_TriGram_count$Freq,decreasing = TRUE),]
colnames(fi_TriGram_count) <- c("Word","Count")

# plotting FINNISH trigrams
barplot(fi_TriGram_count[1:20,2], names = fi_TriGram_count$Word[1:20],
        main="Count of top 20 Finnish 3-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

- Russian Data

# Count RUSSIAN words of the unigram set
ru_Unogram_count <- data.frame(table(ru_UnoGram))
ru_Unogram_count <- ru_Unogram_count[order(ru_Unogram_count$Freq,decreasing = TRUE),]
colnames(ru_Unogram_count) <- c("Word","Count")

# plotting RUSSIAN unigrams

wordcloud(ru_UnoGram, max.words=300, random.order=FALSE, 
          use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

barplot(ru_Unogram_count[1:20,2], names = ru_Unogram_count$Word[1:20],
        main="Count of top 20 Russian 1-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count RUSSIAN words of the bigram set
ru_BiGram_count <- data.frame(table(ru_BiGram))
ru_BiGram_count <- ru_BiGram_count[order(ru_BiGram_count$Freq,decreasing = TRUE),]
colnames(ru_BiGram_count) <- c("Word","Count")

# plotting RUSSIAN bigrams
barplot(ru_BiGram_count[1:20,2], names = ru_BiGram_count$Word[1:20],
        main="Count of top 20 Russian 2-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count RUSSIAN words of the trigram set
ru_TriGram_count <- data.frame(table(ru_TriGram))
ru_TriGram_count <- ru_TriGram_count[order(ru_TriGram_count$Freq,decreasing = TRUE),]
colnames(ru_TriGram_count) <- c("Word","Count")

# plotting RUSSIAN trigrams
barplot(ru_TriGram_count[1:20,2], names = ru_TriGram_count$Word[1:20],
        main="Count of top 20 Russian 3-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
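
The count-and-plot pattern above is repeated for every language and n-gram order; it could be wrapped in a helper along these lines (plot_top_ngrams is a hypothetical name):

plot_top_ngrams <- function(ngrams, title, n = 20) {
  counts <- sort(table(ngrams), decreasing = TRUE)[1:n]  # top-n most frequent n-grams
  barplot(counts, main = title, col = rgb(0.2,0.4,0.6,0.5), las = 1, horiz = TRUE)
}

# e.g. plot_top_ngrams(ru_TriGram, "Count of top 20 Russian 3-grams")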

The last part of this task is to check how many unique words are needed to cover a given percentage of the text.

# I will plot word coverage only for 1-grams, for the four languages
woperc <- function(percentage, language) {
  if(language == "en") {totalwords <- sum(us_Unogram_count$Count)}
  else if(language == "de") {totalwords <- sum(de_Unogram_count$Count)}
  else if(language == "fi") {totalwords <- sum(fi_Unogram_count$Count)}
  else if(language == "ru") {totalwords <- sum(ru_Unogram_count$Count)}
  
  percent = 0; cumsum = 0; i = 1
  while (percent < percentage)
    {
    if(language == "en") {cumsum = cumsum + us_Unogram_count$Count[i]}
    else if(language == "de") {cumsum = cumsum + de_Unogram_count$Count[i]}
    else if(language == "fi") {cumsum = cumsum + fi_Unogram_count$Count[i]}
    else if(language == "ru") {cumsum = cumsum + ru_Unogram_count$Count[i]}
    
    percent = cumsum/totalwords
    i = i + 1
    }
  return(i - 1) # the loop increments i one step past the last word counted
}
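
The same coverage calculation can also be written more compactly with a cumulative sum over the sorted counts; a minimal sketch (coverage_count is a hypothetical helper):

coverage_count <- function(counts, percentage) {
  counts <- sort(counts, decreasing = TRUE)
  # index of the first word at which the cumulative share reaches the target
  which(cumsum(counts) / sum(counts) >= percentage)[1]
}

# e.g. coverage_count(us_Unogram_count$Count, 0.5) should agree with woperc(0.5, "en")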

percents <- c(10,20,30,40,50,60,70,80,90)
en_wordPercentage <- sapply(percents / 100, woperc, language = "en")
de_wordPercentage <- sapply(percents / 100, woperc, language = "de")
fi_wordPercentage <- sapply(percents / 100, woperc, language = "fi")
ru_wordPercentage <- sapply(percents / 100, woperc, language = "ru")

wordPercentages <- data.frame(
  Percentage = percents,
  Language = c(rep("English", 9), rep("German", 9), rep("Finnish", 9), rep("Russian", 9)),
  wordCount = c(en_wordPercentage, de_wordPercentage, fi_wordPercentage, ru_wordPercentage)
)

qplot(Percentage, wordCount, data = wordPercentages, color = Language, geom = c("line", "point")) +
  geom_text(aes(label = wordCount)) +
  ggtitle("Comparing 1-gram word-coverage percentages") +
  scale_x_continuous(breaks = percents)

Conclusion