Our data comes from HC Corpora and can be downloaded directly from here.
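For reproducibility, a minimal download-and-unpack sketch is shown below; the mirror URL is an assumption on my part (the commonly used SwiftKey archive of the HC Corpora files) and should match the link above.
# Sketch only: fetch and unpack the corpus if ./final/ is not already present.
# The URL is an assumed mirror of the HC Corpora / SwiftKey archive.
corpus_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists("./final")) {
    download.file(corpus_url, destfile = "Coursera-SwiftKey.zip", mode = "wb") # large download
    unzip("Coursera-SwiftKey.zip") # creates the ./final/ directory used below
}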
First, we will read all our files:
US_blogs <- readLines("./final/en_US/en_US.blogs.txt")
US_news <- readLines("./final/en_US/en_US.news.txt")
US_twitter <- readLines("./final/en_US/en_US.twitter.txt")
DE_twitter <- readLines("./final/de_DE/de_DE.twitter.txt")
DE_news <- readLines("./final/de_DE/de_DE.news.txt")
DE_blogs <- readLines("./final/de_DE/de_DE.blogs.txt")
FI_blogs <- readLines("./final/fi_FI/fi_FI.blogs.txt")
FI_news <- readLines("./final/fi_FI/fi_FI.news.txt")
FI_twitter <- readLines("./final/fi_FI/fi_FI.twitter.txt")
RU_twitter <- readLines("./final/ru_RU/ru_RU.twitter.txt")
RU_news <- readLines("./final/ru_RU/ru_RU.news.txt")
RU_blogs <- readLines("./final/ru_RU/ru_RU.blogs.txt")
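A quick summary of the raw files is shown below. The chunk that produced it is not included in this section; it can be reproduced roughly as follows (the corpora list, the object.size()-based sizes, and the whitespace-based word count are my assumptions, not the original code).
# Hypothetical reconstruction of the summary table below: in-memory size,
# line count, and a whitespace-based word count for each corpus.
corpora <- list(
    "USA Blogs" = US_blogs, "USA News" = US_news, "USA Twitter" = US_twitter,
    "Germany Blogs" = DE_blogs, "Germany News" = DE_news, "Germany Twitter" = DE_twitter,
    "Finland Blogs" = FI_blogs, "Finland News" = FI_news, "Finland Twitter" = FI_twitter,
    "Russia Blogs" = RU_blogs, "Russia News" = RU_news, "Russia Twitter" = RU_twitter)
file_summary <- data.frame(
    Country   = sub(" .*", "", names(corpora)),
    File      = sub(".* ", "", names(corpora)),
    FileSize  = sapply(corpora, function(x) format(object.size(x), units = "Mb")),
    LineCount = sapply(corpora, length),
    WordCount = sapply(corpora, function(x) sum(lengths(strsplit(x, "\\s+"))))
)
file_summary[order(file_summary$Country), ]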
## Country File FileSize LineCount WordCount
## 11 Finland News 119.7 Mb 485758 13056791
## 7 Finland Blogs 130.4 Mb 439785 15805117
## 3 Finland Twitter 44 Mb 285214 3836079
## 2 Germany News 105.1 Mb 244743 14742498
## 6 Germany Twitter 137.9 Mb 947774 12409380
## 10 Germany Blogs 51.3 Mb 181958 6769294
## 8 Russia News 124.9 Mb 196360 45485831
## 4 Russia Blogs 132.1 Mb 337100 44147004
## 12 Russia Twitter 153.6 Mb 881414 38653696
## 5 USA News 19.8 Mb 77259 2693898
## 1 USA Blogs 255.4 Mb 899288 38154238
## 9 USA Twitter 319 Mb 2360148 30218125
library(qdap)
set.seed(09092020)
# take a random sample of 1,000 lines from each file
sampleUS_blogs <- sample(US_blogs,1000, replace = FALSE)
sampleUS_news <- sample(US_news,1000, replace = FALSE)
sampleUS_twitter <- sample(US_twitter,1000, replace = FALSE)
sampleDE_blogs <- sample(DE_blogs,1000, replace = FALSE)
sampleDE_news <- sample(DE_news,1000, replace = FALSE)
sampleDE_twitter <- sample(DE_twitter,1000, replace = FALSE)
sampleFI_blogs <- sample(FI_blogs,1000, replace = FALSE)
sampleFI_news <- sample(FI_news,1000, replace = FALSE)
sampleFI_twitter <- sample(FI_twitter,1000, replace = FALSE)
sampleRU_blogs <- sample(RU_blogs,1000, replace = FALSE)
sampleRU_news <- sample(RU_news,1000, replace = FALSE)
sampleRU_twitter <- sample(RU_twitter,1000, replace = FALSE)
# combine the sampled data per language by pooling the three samples into one vector
en_SampledData <- c(sampleUS_blogs, sampleUS_news, sampleUS_twitter)
de_SampledData <- c(sampleDE_blogs, sampleDE_news, sampleDE_twitter)
fi_SampledData <- c(sampleFI_blogs, sampleFI_news, sampleFI_twitter)
ru_SampledData <- c(sampleRU_blogs, sampleRU_news, sampleRU_twitter)
# detect sentences in combined sampled data
us_sentences <- sent_detect(en_SampledData, language = "en", model = NULL)
de_sentences <- sent_detect(de_SampledData, language = "de", model = NULL)
fi_sentences <- sent_detect(fi_SampledData, language = "fi", model = NULL)
ru_sentences <- sent_detect(ru_SampledData, language = "ru", model = NULL)
# remove unneeded variables
remove(sampleUS_blogs, sampleUS_news, sampleUS_twitter,
sampleDE_blogs, sampleDE_news, sampleDE_twitter,
sampleRU_blogs, sampleRU_news, sampleRU_twitter,
sampleFI_blogs, sampleFI_news, sampleFI_twitter,
en_SampledData, de_SampledData, fi_SampledData, ru_SampledData,
US_blogs, US_news, US_twitter,
DE_blogs, DE_news, DE_twitter,
RU_blogs, RU_news, RU_twitter,
FI_blogs, FI_news, FI_twitter)
library(tm)
# build a corpus from the ENGLISH data for further cleaning
us_sentences <- VCorpus(VectorSource(us_sentences))
# the corpus is cleaned in several steps:
us_sentences <- tm_map(us_sentences, removeNumbers) # remove numbers
us_sentences <- tm_map(us_sentences, stripWhitespace) # collapse extra whitespace
us_sentences <- tm_map(us_sentences, content_transformer(tolower)) # convert to lowercase
us_sentences <- tm_map(us_sentences, removePunctuation) # remove punctuation
us_sentences <- tm_map(us_sentences, removeWords, stopwords("english")) # remove stop words
# build a corpus from the GERMAN data for further cleaning
de_sentences <- VCorpus(VectorSource(de_sentences))
# the corpus is cleaned in several steps:
de_sentences <- tm_map(de_sentences, removeNumbers) # remove numbers
de_sentences <- tm_map(de_sentences, stripWhitespace) # collapse extra whitespace
de_sentences <- tm_map(de_sentences, content_transformer(tolower)) # convert to lowercase
de_sentences <- tm_map(de_sentences, removePunctuation) # remove punctuation
de_sentences <- tm_map(de_sentences, removeWords, stopwords("german")) # remove stop words
# build a corpus from the FINNISH data for further cleaning
fi_sentences <- VCorpus(VectorSource(fi_sentences))
# the corpus is cleaned in several steps:
fi_sentences <- tm_map(fi_sentences, removeNumbers) # remove numbers
fi_sentences <- tm_map(fi_sentences, stripWhitespace) # collapse extra whitespace
fi_sentences <- tm_map(fi_sentences, content_transformer(tolower)) # convert to lowercase
fi_sentences <- tm_map(fi_sentences, removePunctuation) # remove punctuation
fi_sentences <- tm_map(fi_sentences, removeWords, stopwords("finnish")) # remove stop words
# build a corpus from the RUSSIAN data for further cleaning
ru_sentences <- VCorpus(VectorSource(ru_sentences))
# the corpus is cleaned in several steps:
ru_sentences <- tm_map(ru_sentences, removeNumbers) # remove numbers
ru_sentences <- tm_map(ru_sentences, stripWhitespace) # collapse extra whitespace
ru_sentences <- tm_map(ru_sentences, content_transformer(tolower)) # convert to lowercase
ru_sentences <- tm_map(ru_sentences, removePunctuation) # remove punctuation
ru_sentences <- tm_map(ru_sentences, removeWords, stopwords("russian")) # remove stop words
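The four cleaning blocks above are identical apart from the stop-word language; a helper like the following (a refactoring sketch, not part of the original analysis) captures the same pipeline.
# Equivalent helper for the cleaning pipeline above (hypothetical refactor):
# build a VCorpus and apply the same tm_map() steps for a given stop-word language.
clean_corpus <- function(sentences, stopword_lang) {
    corp <- VCorpus(VectorSource(sentences))
    corp <- tm_map(corp, removeNumbers)                          # remove numbers
    corp <- tm_map(corp, stripWhitespace)                        # collapse extra whitespace
    corp <- tm_map(corp, content_transformer(tolower))           # convert to lowercase
    corp <- tm_map(corp, removePunctuation)                      # remove punctuation
    corp <- tm_map(corp, removeWords, stopwords(stopword_lang))  # remove stop words
    corp
}
# usage: cleaned <- clean_corpus(sentence_vector, "german")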
library(RWeka)
# flatten each cleaned corpus back into a data frame with a single text column
us_sentences <- data.frame(text = sapply(us_sentences, as.character), stringsAsFactors = FALSE)
de_sentences <- data.frame(text = sapply(de_sentences, as.character), stringsAsFactors = FALSE)
fi_sentences <- data.frame(text = sapply(fi_sentences, as.character), stringsAsFactors = FALSE)
ru_sentences <- data.frame(text = sapply(ru_sentences, as.character), stringsAsFactors = FALSE)
# get 1,2 & 3-grams from ENGLISH structured data
us_UnoGram <- NGramTokenizer(us_sentences, Weka_control(min = 1, max = 1))
us_BiGram <- NGramTokenizer(us_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
us_TriGram <- NGramTokenizer(us_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
# get 1,2 & 3-grams from GERMAN structured data
de_UnoGram <- NGramTokenizer(de_sentences, Weka_control(min = 1, max = 1))
de_BiGram <- NGramTokenizer(de_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
de_TriGram <- NGramTokenizer(de_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
# get 1,2 & 3-grams from FINNISH structured data
fi_UnoGram <- NGramTokenizer(fi_sentences, Weka_control(min = 1, max = 1))
fi_BiGram <- NGramTokenizer(fi_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
fi_TriGram <- NGramTokenizer(fi_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
# get 1,2 & 3-grams from RUSSIAN structured data
ru_UnoGram <- NGramTokenizer(ru_sentences, Weka_control(min = 1, max = 1))
ru_BiGram <- NGramTokenizer(ru_sentences, Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
ru_TriGram <- NGramTokenizer(ru_sentences, Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
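The twelve tokenizer calls above differ only in their input and in n; a small wrapper (hypothetical, not part of the original code) would do the same job.
# Hypothetical wrapper around NGramTokenizer, using the same delimiter set as the
# bigram and trigram calls above.
ngrams <- function(x, n) {
    NGramTokenizer(x, Weka_control(min = n, max = n, delimiters = " \\r\\n\\t.,;:\"()?!"))
}
# e.g. us_BiGram <- ngrams(us_sentences, 2)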
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
# Count ENGLISH words of the unigram set
us_Unogram_count <- data.frame(table(us_UnoGram))
us_Unogram_count <- us_Unogram_count[order(us_Unogram_count$Freq,decreasing = TRUE),]
colnames(us_Unogram_count) <- c("Word","Count")
# plotting ENGLISH Unigrams
wordcloud(us_UnoGram, max.words=300, random.order=FALSE,
use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))
barplot(us_Unogram_count[1:20,2], names = us_Unogram_count$Word[1:20],
main="Count of top 20 English 1-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count ENGLISH words of the bigram set
us_BiGram_count <- data.frame(table(us_BiGram))
us_BiGram_count <- us_BiGram_count[order(us_BiGram_count$Freq,decreasing = TRUE),]
colnames(us_BiGram_count) <- c("Word","Count")
# plotting ENGLISH bigrams
barplot(us_BiGram_count[1:20,2], names = us_BiGram_count$Word[1:20],
main="Count of top 20 English 2-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count ENGLISH words of the trigram set
us_TriGram_count <- data.frame(table(us_TriGram))
us_TriGram_count <- us_TriGram_count[order(us_TriGram_count$Freq,decreasing = TRUE),]
colnames(us_TriGram_count) <- c("Word","Count")
# plotting ENGLISH trigrams
barplot(us_TriGram_count[1:20,2], names = us_TriGram_count$Word[1:20],
main="Count of top 20 English 3-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
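The same count-and-plot pattern repeats below for German, Finnish, and Russian; a helper along these lines (hypothetical, not in the original code) would produce identical frequency tables.
# Hypothetical helper: turn a token vector into a sorted frequency table,
# equivalent to the table()/order()/colnames() steps used above and below.
count_ngrams <- function(tokens) {
    counts <- data.frame(table(tokens))
    counts <- counts[order(counts$Freq, decreasing = TRUE), ]
    colnames(counts) <- c("Word", "Count")
    counts
}
# e.g. de_BiGram_count <- count_ngrams(de_BiGram)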
# Count GERMAN words of the unigram set
de_Unogram_count <- data.frame(table(de_UnoGram))
de_Unogram_count <- de_Unogram_count[order(de_Unogram_count$Freq,decreasing = TRUE),]
colnames(de_Unogram_count) <- c("Word","Count")
# plotting GERMAN words
wordcloud(de_UnoGram, max.words=300, random.order=FALSE,
use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))
barplot(de_Unogram_count[1:20,2], names = de_Unogram_count$Word[1:20],
main="Count of top 20 German 1-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count GERMAN words of the bigram set
de_BiGram_count <- data.frame(table(de_BiGram))
de_BiGram_count <- de_BiGram_count[order(de_BiGram_count$Freq,decreasing = TRUE),]
colnames(de_BiGram_count) <- c("Word","Count")
# plotting GERMAN bigrams
barplot(de_BiGram_count[1:20,2], names = de_BiGram_count$Word[1:20],
main="Count of top 20 German 2-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count GERMAN words of the trigram set
de_TriGram_count <- data.frame(table(de_TriGram))
de_TriGram_count <- de_TriGram_count[order(de_TriGram_count$Freq,decreasing = TRUE),]
colnames(de_TriGram_count) <- c("Word","Count")
# plotting GERMAN trigrams
barplot(de_TriGram_count[1:20,2], names = de_TriGram_count$Word[1:20],
main="Count of top 20 German 3-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count FINNISH words of the unigram set
fi_Unogram_count <- data.frame(table(fi_UnoGram))
fi_Unogram_count <- fi_Unogram_count[order(fi_Unogram_count$Freq,decreasing = TRUE),]
colnames(fi_Unogram_count) <- c("Word","Count")
# plotting FINNISH words
wordcloud(fi_UnoGram, max.words=300, random.order=FALSE,
use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))
barplot(fi_Unogram_count[1:20,2], names = fi_Unogram_count$Word[1:20],
main="Count of top 20 Finnish 1-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count FINNISH words of the bigram set
fi_BiGram_count <- data.frame(table(fi_BiGram))
fi_BiGram_count <- fi_BiGram_count[order(fi_BiGram_count$Freq,decreasing = TRUE),]
colnames(fi_BiGram_count) <- c("Word","Count")
# plotting FINNISH bigrams
barplot(fi_BiGram_count[1:20,2], names = fi_BiGram_count$Word[1:20],
main="Count of top 20 Finnish 2-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count FINNISH words of the trigram set
fi_TriGram_count <- data.frame(table(fi_TriGram))
fi_TriGram_count <- fi_TriGram_count[order(fi_TriGram_count$Freq,decreasing = TRUE),]
colnames(fi_TriGram_count) <- c("Word","Count")
# plotting FINNISH trigrams
barplot(fi_TriGram_count[1:20,2], names = fi_TriGram_count$Word[1:20],
main="Count of top 20 Finnish 3-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count RUSSIAN words of the unigram set
ru_Unogram_count <- data.frame(table(ru_UnoGram))
ru_Unogram_count <- ru_Unogram_count[order(ru_Unogram_count$Freq,decreasing = TRUE),]
colnames(ru_Unogram_count) <- c("Word","Count")
# plotting RUSSIAN unigrams
wordcloud(ru_UnoGram, max.words=300, random.order=FALSE,
use.r.layout=FALSE, colors=brewer.pal(8,"Accent"))
barplot(ru_Unogram_count[1:20,2], names = ru_Unogram_count$Word[1:20],
main="Count of top 20 Russian 1-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count RUSSIAN words of the bigram set
ru_BiGram_count <- data.frame(table(ru_BiGram))
ru_BiGram_count <- ru_BiGram_count[order(ru_BiGram_count$Freq,decreasing = TRUE),]
colnames(ru_BiGram_count) <- c("Word","Count")
# plotting RUSSIAN bigrams
barplot(ru_BiGram_count[1:20,2], names = ru_BiGram_count$Word[1:20],
main="Count of top 20 Russian 2-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# Count RUSSIAN words of the trigram set
ru_TriGram_count <- data.frame(table(ru_TriGram))
ru_TriGram_count <- ru_TriGram_count[order(ru_TriGram_count$Freq,decreasing = TRUE),]
colnames(ru_TriGram_count) <- c("Word","Count")
# plotting RUSSIAN trigrams
barplot(ru_TriGram_count[1:20,2], names = ru_TriGram_count$Word[1:20],
main="Count of top 20 Russian 3-grams",
col=rgb(0.2,0.4,0.6,0.5),las=1,horiz = T)
# I will plot word coverage only for the 1-grams of the four languages
woperc <- function(percentage, language) {
    if (language == "en") {totalwords <- sum(us_Unogram_count$Count)}
    else if (language == "de") {totalwords <- sum(de_Unogram_count$Count)}
    else if (language == "fi") {totalwords <- sum(fi_Unogram_count$Count)}
    else if (language == "ru") {totalwords <- sum(ru_Unogram_count$Count)}
    percent <- 0; cumsum <- 0; i <- 1
    while (percent < percentage) {
        if (language == "en") {cumsum <- cumsum + us_Unogram_count$Count[i]}
        else if (language == "de") {cumsum <- cumsum + de_Unogram_count$Count[i]}
        else if (language == "fi") {cumsum <- cumsum + fi_Unogram_count$Count[i]}
        else if (language == "ru") {cumsum <- cumsum + ru_Unogram_count$Count[i]}
        percent <- cumsum / totalwords
        i <- i + 1
    }
    return(i)
}
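For reference, the same idea can be written without the explicit loop; the sketch below is illustrative only and is not the function used for the plot.
# Illustrative vectorized sketch: how many top-ranked words are needed before
# their cumulative frequency reaches a fraction p of all tokens.
coverage_words <- function(counts, p) {
    which(cumsum(counts) / sum(counts) >= p)[1]
}
# e.g. coverage_words(us_Unogram_count$Count, 0.5)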
percents <- c(10,20,30,40,50,60,70,80,90)
# apply woperc() over the coverage fractions instead of spelling out every call
en_wordPercentage <- sapply(percents / 100, woperc, language = "en")
de_wordPercentage <- sapply(percents / 100, woperc, language = "de")
fi_wordPercentage <- sapply(percents / 100, woperc, language = "fi")
ru_wordPercentage <- sapply(percents / 100, woperc, language = "ru")
wordPercentages <- data.frame(
    Percentage = percents,
    Language = c(rep("English", 9), rep("German", 9), rep("Finnish", 9), rep("Russian", 9)),
    wordCount = c(en_wordPercentage, de_wordPercentage, fi_wordPercentage, ru_wordPercentage)
)
qplot(Percentage, wordCount, data = wordPercentages, color = Language, geom = c("line", "point")) +
    geom_text(aes(label = wordCount)) +
    ggtitle("Comparing 1-gram word-coverage percentages") +
    scale_x_continuous(breaks = c(10,20,30,40,50,60,70,80,90))