Data

Our data comes from HC Corpora and can be downloaded directly from here.

First, we will read all our files:

US_blogs <- readLines("./final/en_US/en_US.blogs.txt")
US_news <- readLines("./final/en_US/en_US.news.txt")
US_twitter <- readLines("./final/en_US/en_US.twitter.txt")

DE_twitter <- readLines("./final/de_DE/de_DE.twitter.txt")
DE_news <- readLines("./final/de_DE/de_DE.news.txt")
DE_blogs <- readLines("./final/de_DE/de_DE.blogs.txt")

FI_blogs <- readLines("./final/fi_FI/fi_FI.blogs.txt")
FI_news <- readLines("./final/fi_FI/fi_FI.news.txt")
FI_twitter <- readLines("./final/fi_FI/fi_FI.twitter.txt")

RU_twitter <- readLines("./final/ru_RU/ru_RU.twitter.txt")
RU_news <- readLines("./final/ru_RU/ru_RU.news.txt")
RU_blogs <- readLines("./final/ru_RU/ru_RU.blogs.txt")

Objectives

  1. Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and relationship between the words in the corpora.
  2. Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.
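
As a first step, the table below summarizes the twelve files: in-memory size, line count and word count. A summary like this can be generated with code along the following lines (a sketch, assuming the stringi package; summarize_file is a hypothetical helper, not the exact code behind the table):

library(stringi)

# hypothetical helper: summarize one loaded file
summarize_file <- function(name, lines) {
  data.frame(
    File      = name,
    FileSize  = format(object.size(lines), units = "Mb"), # size of the loaded object
    LineCount = length(lines),
    WordCount = sum(stri_count_words(lines))
  )
}

# example for a single file; repeat for all twelve files and rbind the results
summarize_file("Blogs", US_blogs)
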
##    Country    File FileSize LineCount WordCount
## 11 Finland    News 119.7 Mb    485758  13056791
## 7  Finland   Blogs 130.4 Mb    439785  15805117
## 3  Finland Twitter    44 Mb    285214   3836079
## 2  Germany    News 105.1 Mb    244743  14742498
## 6  Germany Twitter 137.9 Mb    947774  12409380
## 10 Germany   Blogs  51.3 Mb    181958   6769294
## 8   Russia    News 124.9 Mb    196360  45485831
## 4   Russia   Blogs 132.1 Mb    337100  44147004
## 12  Russia Twitter 153.6 Mb    881414  38653696
## 5      USA    News  19.8 Mb     77259   2693898
## 1      USA   Blogs 255.4 Mb    899288  38154238
## 9      USA Twitter   319 Mb   2360148  30218125

Since working with the full datasets would require substantial computational power and time (given their sizes), we now draw samples of the data and clean them for further investigation.

library(qdap)

set.seed(09092020)
# draw 1,000-line samples from each file
sampleUS_blogs <- sample(US_blogs,1000, replace = FALSE)
sampleUS_news <- sample(US_news,1000, replace = FALSE)
sampleUS_twitter <- sample(US_twitter,1000, replace = FALSE)

sampleDE_blogs <- sample(DE_blogs,1000, replace = FALSE)
sampleDE_news <- sample(DE_news,1000, replace = FALSE)
sampleDE_twitter <- sample(DE_twitter,1000, replace = FALSE)

sampleFI_blogs <- sample(FI_blogs,1000, replace = FALSE)
sampleFI_news <- sample(FI_news,1000, replace = FALSE)
sampleFI_twitter <- sample(FI_twitter,1000, replace = FALSE)

sampleRU_blogs <- sample(RU_blogs,1000, replace = FALSE)
sampleRU_news <- sample(RU_news,1000, replace = FALSE)
sampleRU_twitter <- sample(RU_twitter,1000, replace = FALSE)

# combine sampled data per language
en_SampledData <- c(sampleUS_blogs, sampleUS_news, sampleUS_twitter)
de_SampledData <- c(sampleDE_blogs, sampleDE_news, sampleDE_twitter)
fi_SampledData <- c(sampleFI_blogs, sampleFI_news, sampleFI_twitter)
ru_SampledData <- c(sampleRU_blogs, sampleRU_news, sampleRU_twitter)

# detect sentences in combined sampled data
us_sentences <- sent_detect(en_SampledData, language = "en", model = NULL)
de_sentences <- sent_detect(de_SampledData, language = "de", model = NULL)
fi_sentences <- sent_detect(fi_SampledData, language = "fi", model = NULL)
ru_sentences <- sent_detect(ru_SampledData, language = "ru", model = NULL)

# remove unneeded variables
remove(sampleUS_blogs, sampleUS_news, sampleUS_twitter,
       sampleDE_blogs, sampleDE_news, sampleDE_twitter,
       sampleRU_blogs, sampleRU_news, sampleRU_twitter,
       sampleFI_blogs, sampleFI_news, sampleFI_twitter,
       en_SampledData, de_SampledData, fi_SampledData, ru_SampledData,
       US_blogs, US_news, US_twitter,
       DE_blogs, DE_news, DE_twitter,
       RU_blogs, RU_news, RU_twitter,
       FI_blogs, FI_news, FI_twitter)

Here we start cleaning the sentences collected from the samples above.

library(tm)
# vectorize my ENGLISH data for further cleaning
us_sentences <- VCorpus(VectorSource(us_sentences))

# cleaning of the vectorized data is done in several steps:
us_sentences <- tm_map(us_sentences, removeNumbers) # remove numbers
us_sentences <- tm_map(us_sentences, stripWhitespace) # remove extra whitespace
us_sentences <- tm_map(us_sentences, content_transformer(tolower)) # lowercase
us_sentences <- tm_map(us_sentences, removePunctuation) # remove punctuation
us_sentences <- tm_map(us_sentences, removeWords, stopwords("english")) # remove stop words

# vectorize my GERMAN data for further cleaning
de_sentences <- VCorpus(VectorSource(de_sentences))

# cleaning of the vectorized data is done in several steps:
de_sentences <- tm_map(de_sentences, removeNumbers) # remove numbers
de_sentences <- tm_map(de_sentences, stripWhitespace) # remove extra whitespace
de_sentences <- tm_map(de_sentences, content_transformer(tolower)) # lowercase
de_sentences <- tm_map(de_sentences, removePunctuation) # remove punctuation
de_sentences <- tm_map(de_sentences, removeWords, stopwords("german")) # remove stop words

# vectorize my FINNISH data for further cleaning
fi_sentences <- VCorpus(VectorSource(fi_sentences))

# cleaning of the vectorized data is done in several steps:
fi_sentences <- tm_map(fi_sentences, removeNumbers) # remove numbers
fi_sentences <- tm_map(fi_sentences, stripWhitespace) # remove extra whitespace
fi_sentences <- tm_map(fi_sentences, content_transformer(tolower)) # lowercase
fi_sentences <- tm_map(fi_sentences, removePunctuation) # remove punctuation
fi_sentences <- tm_map(fi_sentences, removeWords, stopwords("finnish")) # remove stop words

# vectorize my RUSSIAN data for further cleaning
ru_sentences <- VCorpus(VectorSource(ru_sentences))

# cleaning of the vectorized data is done in several steps:
ru_sentences <- tm_map(ru_sentences, removeNumbers) # remove numbers
ru_sentences <- tm_map(ru_sentences, stripWhitespace) # remove extra whitespace
ru_sentences <- tm_map(ru_sentences, content_transformer(tolower)) # lowercase
ru_sentences <- tm_map(ru_sentences, removePunctuation) # remove punctuation
ru_sentences <- tm_map(ru_sentences, removeWords, stopwords("russian")) # remove stop words
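
Since the four cleaning pipelines above differ only in the stop-word list, the repetition could also be factored into a small helper. A minimal sketch (clean_corpus is a hypothetical name, not part of tm):

clean_corpus <- function(corpus, stopword_lang) {
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  tm_map(corpus, removeWords, stopwords(stopword_lang))
}

# equivalent to the English pipeline above:
# us_sentences <- clean_corpus(VCorpus(VectorSource(us_sentences)), "english")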

Here we structure the cleaned data into data frames for tokenization.

library(RWeka)

# structure into data frames by extracting the text content of each corpus
us_sentences <- data.frame(text = sapply(us_sentences, as.character), stringsAsFactors = FALSE)
de_sentences <- data.frame(text = sapply(de_sentences, as.character), stringsAsFactors = FALSE)
fi_sentences <- data.frame(text = sapply(fi_sentences, as.character), stringsAsFactors = FALSE)
ru_sentences <- data.frame(text = sapply(ru_sentences, as.character), stringsAsFactors = FALSE)

# get 1,2 & 3-grams from ENGLISH structured data
us_UnoGram <- NGramTokenizer(us_sentences, Weka_control(min = 1, max = 1))
us_BiGram <- NGramTokenizer(us_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
us_TriGram <- NGramTokenizer(us_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))

# get 1,2 & 3-grams from GERMAN structured data
de_UnoGram <- NGramTokenizer(de_sentences, Weka_control(min = 1, max = 1))
de_BiGram <- NGramTokenizer(de_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
de_TriGram <- NGramTokenizer(de_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))

# get 1,2 & 3-grams from FINNISH structured data
fi_UnoGram <- NGramTokenizer(fi_sentences, Weka_control(min = 1, max = 1))
fi_BiGram <- NGramTokenizer(fi_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
fi_TriGram <- NGramTokenizer(fi_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))

# get 1,2 & 3-grams from RUSSIAN structured data
ru_UnoGram <- NGramTokenizer(ru_sentences, Weka_control(min = 1, max = 1))
ru_BiGram <- NGramTokenizer(ru_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
ru_TriGram <- NGramTokenizer(ru_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))

Here we visually explore the word frequencies in the 1-, 2- and 3-gram data.

- English Data

library(ggplot2)
library(wordcloud)
library(RColorBrewer)

# Count ENGLISH words of the unigram set
us_Unogram_count <- data.frame(table(us_UnoGram))
us_Unogram_count <- us_Unogram_count[order(us_Unogram_count$Freq,decreasing = TRUE),]
colnames(us_Unogram_count) <- c("Word","Count")


# plotting ENGLISH Unigrams
wordcloud(us_UnoGram, max.words=300, random.order=FALSE, 
          use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

barplot(us_Unogram_count[1:20,2], names = us_Unogram_count$Word[1:20],
        main="Count of top 20 English 1-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count ENGLISH words of the bigram set
us_BiGram_count <- data.frame(table(us_BiGram))
us_BiGram_count <- us_BiGram_count[order(us_BiGram_count$Freq,decreasing = TRUE),]
colnames(us_BiGram_count) <- c("Word","Count")

# plotting ENGLISH bigrams
barplot(us_BiGram_count[1:20,2], names = us_BiGram_count$Word[1:20],
        main="Count of top 20 English 2-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count ENGLISH words of the trigram set
us_TriGram_count <- data.frame(table(us_TriGram))
us_TriGram_count <- us_TriGram_count[order(us_TriGram_count$Freq,decreasing = TRUE),]
colnames(us_TriGram_count) <- c("Word","Count")

# plotting ENGLISH trigrams
barplot(us_TriGram_count[1:20,2], names = us_TriGram_count$Word[1:20],
        main="Count of top 20 English 3-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

- German Data

# Count GERMAN words of the unigram set
de_Unogram_count <- data.frame(table(de_UnoGram))
de_Unogram_count <- de_Unogram_count[order(de_Unogram_count$Freq,decreasing = TRUE),]
colnames(de_Unogram_count) <- c("Word","Count")

# plotting GERMAN words
wordcloud(de_UnoGram, max.words=300, random.order=FALSE,
         use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

barplot(de_Unogram_count[1:20,2], names = de_Unogram_count$Word[1:20],
        main="Count of top 20 German 1-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count GERMAN words of the bigram set
de_BiGram_count <- data.frame(table(de_BiGram))
de_BiGram_count <- de_BiGram_count[order(de_BiGram_count$Freq,decreasing = TRUE),]
colnames(de_BiGram_count) <- c("Word","Count")

# plotting GERMAN bigrams
barplot(de_BiGram_count[1:20,2], names = de_BiGram_count$Word[1:20],
        main="Count of top 20 German 2-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count GERMAN words of the trigram set
de_TriGram_count <- data.frame(table(de_TriGram))
de_TriGram_count <- de_TriGram_count[order(de_TriGram_count$Freq,decreasing = TRUE),]
colnames(de_TriGram_count) <- c("Word","Count")

# plotting GERMAN trigrams
barplot(de_TriGram_count[1:20,2], names = de_TriGram_count$Word[1:20],
        main="Count of top 20 German 3-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

- Finnish Data

# Count FINNISH words of the unigram set
fi_Unogram_count <- data.frame(table(fi_UnoGram))
fi_Unogram_count <- fi_Unogram_count[order(fi_Unogram_count$Freq,decreasing = TRUE),]
colnames(fi_Unogram_count) <- c("Word","Count")

# plotting FINNISH words
wordcloud(fi_UnoGram, max.words=300, random.order=FALSE, 
           use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

barplot(fi_Unogram_count[1:20,2], names = fi_Unogram_count$Word[1:20],
        main="Count of top 20 Finnish 1-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count FINNISH words of the bigram set
fi_BiGram_count <- data.frame(table(fi_BiGram))
fi_BiGram_count <- fi_BiGram_count[order(fi_BiGram_count$Freq,decreasing = TRUE),]
colnames(fi_BiGram_count) <- c("Word","Count")

# plotting FINNISH bigrams
barplot(fi_BiGram_count[1:20,2], names = fi_BiGram_count$Word[1:20],
        main="Count of top 20 Finnish 2-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count FINNISH words of the trigram set
fi_TriGram_count <- data.frame(table(fi_TriGram))
fi_TriGram_count <- fi_TriGram_count[order(fi_TriGram_count$Freq,decreasing = TRUE),]
colnames(fi_TriGram_count) <- c("Word","Count")

# plotting FINNISH trigrams
barplot(fi_TriGram_count[1:20,2], names = fi_TriGram_count$Word[1:20],
        main="Count of top 20 Finnish 3-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

- Russian Data

# Count RUSSIAN words of the unigram set
ru_Unogram_count <- data.frame(table(ru_UnoGram))
ru_Unogram_count <- ru_Unogram_count[order(ru_Unogram_count$Freq,decreasing = TRUE),]
colnames(ru_Unogram_count) <- c("Word","Count")

# plotting RUSSIAN unigrams

wordcloud(ru_UnoGram, max.words=300, random.order=FALSE, 
          use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))

barplot(ru_Unogram_count[1:20,2], names = ru_Unogram_count$Word[1:20],
        main="Count of top 20 Russian 1-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count RUSSIAN words of the bigram set
ru_BiGram_count <- data.frame(table(ru_BiGram))
ru_BiGram_count <- ru_BiGram_count[order(ru_BiGram_count$Freq,decreasing = TRUE),]
colnames(ru_BiGram_count) <- c("Word","Count")

# plotting RUSSIAN bigrams
barplot(ru_BiGram_count[1:20,2], names = ru_BiGram_count$Word[1:20],
        main="Count of top 20 Russian 2-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)

# Count RUSSIAN words of the trigram set
ru_TriGram_count <- data.frame(table(ru_TriGram))
ru_TriGram_count <- ru_TriGram_count[order(ru_TriGram_count$Freq,decreasing = TRUE),]
colnames(ru_TriGram_count) <- c("Word","Count")

# plotting RUSSIAN trigrams
barplot(ru_TriGram_count[1:20,2], names = ru_TriGram_count$Word[1:20],
        main="Count of top 20 Russian 3-grams",
        col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
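
The count-and-plot pattern above is repeated for every language and n-gram order; it could be wrapped in a helper along these lines (plot_top_ngrams is a hypothetical name):

plot_top_ngrams <- function(ngrams, title, n = 20) {
  counts <- sort(table(ngrams), decreasing = TRUE)[1:n]  # top-n most frequent n-grams
  barplot(counts, main = title, col = rgb(0.2,0.4,0.6,0.5), las = 1, horiz = TRUE)
}

# e.g. plot_top_ngrams(ru_TriGram, "Count of top 20 Russian 3-grams")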

The last part of this task is to check how many unique words are needed to cover a given percentage of the text.

# I will plot word coverage only for 1-grams, for the four languages
woperc <- function(percentage, language) {
  if(language == "en") {totalwords <- sum(us_Unogram_count$Count)}
  else if(language == "de") {totalwords <- sum(de_Unogram_count$Count)}
  else if(language == "fi") {totalwords <- sum(fi_Unogram_count$Count)}
  else if(language == "ru") {totalwords <- sum(ru_Unogram_count$Count)}
  
  percent = 0; cumsum = 0; i = 1
  while (percent < percentage)
    {
    if(language == "en") {cumsum = cumsum + us_Unogram_count$Count[i]}
    else if(language == "de") {cumsum = cumsum + de_Unogram_count$Count[i]}
    else if(language == "fi") {cumsum = cumsum + fi_Unogram_count$Count[i]}
    else if(language == "ru") {cumsum = cumsum + ru_Unogram_count$Count[i]}
    
    percent = cumsum/totalwords
    i = i + 1
    }
  return(i - 1) # the loop increments i one step past the last word counted
}
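
The same coverage calculation can also be written more compactly with a cumulative sum over the sorted counts; a minimal sketch (coverage_count is a hypothetical helper):

coverage_count <- function(counts, percentage) {
  counts <- sort(counts, decreasing = TRUE)
  # index of the first word at which the cumulative share reaches the target
  which(cumsum(counts) / sum(counts) >= percentage)[1]
}

# e.g. coverage_count(us_Unogram_count$Count, 0.5) should agree with woperc(0.5, "en")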

percents <- c(10,20,30,40,50,60,70,80,90)
en_wordPercentage <- sapply(percents / 100, woperc, language = "en")
de_wordPercentage <- sapply(percents / 100, woperc, language = "de")
fi_wordPercentage <- sapply(percents / 100, woperc, language = "fi")
ru_wordPercentage <- sapply(percents / 100, woperc, language = "ru")

wordPercentages <- data.frame(
  Percentage = percents,
  Language = c(rep("English", 9), rep("German", 9), rep("Finnish", 9), rep("Russian", 9)),
  wordCount = c(en_wordPercentage, de_wordPercentage, fi_wordPercentage, ru_wordPercentage)
)

qplot(Percentage, wordCount, data = wordPercentages, color = Language, geom = c("line", "point")) +
  geom_text(aes(label = wordCount)) +
  ggtitle("Comparing 1-gram word-coverage percentages") +
  scale_x_continuous(breaks = percents)

Conclusion