The exploratory analysis below focuses on a few key features of the datasets: the median number of characters per line for each country and medium, as well as the top 10 words in each. For each dataset, the analysis is performed on a sample made up of the first 500 lines of the file.
The chart below shows the median number of characters per line for each of the datasets.
These are the top 10 words used in news articles from each country:
## country news_top10_words
## 1 US the, and, for, that, with, was, but, his, said, not
## 2 FI että, mutta, oli, myös, ovat, hän, kun, ole, joka, mukaan
## 3 DE die, der, und, den, das, von, mit, sich, auf, dem
## 4 RU что, как, это, для, его, уже, только, все, будет, чтобы
The chart below shows the share of all words accounted for by the top 10 words in each language.
In Finnish and Russian the top-10 words account for a smaller share of all words than in English and German. This may make it more difficult to predict words in Finnish and Russian. Further exploration is needed to determine whether this is a problem, and whether the same discrepancy between the languages also shows up in word pairs (a rough sketch of that check follows).
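One way to check the word-pair question would be to repeat the coverage calculation on adjacent word pairs. The function below is only a sketch and not part of the analysis above: it assumes a character vector of lines, uses simple punctuation-based tokenisation instead of the tm pipeline, and pairs words across line boundaries, which is good enough for a first look.
#sketch: share of all adjacent word pairs accounted for by the 10 most frequent pairs
top10PairShare <- function(lines) {
  words <- unlist(strsplit(tolower(lines), "[^[:alpha:]']+"))
  words <- words[words != ""]
  pairs <- paste(head(words, -1), tail(words, -1)) #adjacent word pairs
  counts <- sort(table(pairs), decreasing = TRUE)
  sum(head(counts, 10)) / sum(counts)
}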
The plan is to create a Shiny app that lets the user enter a word or short phrase in a chosen language. In return, the user is presented with the three words most likely to follow. The app will also contain visuals indicating the confidence level of each prediction. A minimal sketch of the app skeleton is shown below.
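The sketch below only illustrates the intended input/output wiring; predictNextWords is a hypothetical placeholder for the prediction model, which has not been built yet.
require(shiny)
#placeholder: the real model will return the three most likely next words with confidence scores
predictNextWords <- function(phrase, lang) {
  data.frame(word = character(3), confidence = numeric(3))
}
ui <- fluidPage(
  titlePanel("Next-word prediction"),
  selectInput("lang", "Language", choices = c("en", "fi", "de", "ru")),
  textInput("phrase", "Type a word or short phrase"),
  tableOutput("predictions")
)
server <- function(input, output) {
  output$predictions <- renderTable({
    predictNextWords(input$phrase, input$lang)
  })
}
shinyApp(ui = ui, server = server)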
The following code was used to produce the results presented in this report:
require(readr)
require(tm)
require(dplyr)
require(ggplot2)
require(ggthemes)
require(reshape2)
summariseData <- function(dataDir, country, lang) {
  df <- data.frame()
  df[1, "country"] <- country
  prefix <- paste0(dataDir, lang, "_", country, "/", lang, "_", country)
  #read a 500-line sample from each medium
  twit <- read_lines(file = paste0(prefix, ".twitter.txt"), n_max = 500)
  blog <- read_lines(file = paste0(prefix, ".blogs.txt"), n_max = 500)
  news <- read_lines(file = paste0(prefix, ".news.txt"), n_max = 500)
  #collect stats
  stats <- calculateSummary(twit)
  df$twit_median_char <- stats$median_char #median number of characters per line
  df$twit_top10_words <- stats$top10_words #the top 10 words
  df$twit_top10_pct <- stats$top10_pct #top 10 words' share of total words
  stats <- calculateSummary(blog)
  df$blog_median_char <- stats$median_char #median number of characters per line
  df$blog_top10_words <- stats$top10_words #the top 10 words
  df$blog_top10_pct <- stats$top10_pct #top 10 words' share of total words
  stats <- calculateSummary(news)
  df$news_median_char <- stats$median_char #median number of characters per line
  df$news_top10_words <- stats$top10_words #the top 10 words
  df$news_top10_pct <- stats$top10_pct #top 10 words' share of total words
  return(df)
}
calculateSummary <- function(dat) {
  df <- data.frame(id = 1)
  df$total_char <- sum(nchar(dat)) #total number of characters
  df$median_char <- median(nchar(dat)) #median number of characters per line
  #top-10 words
  corp <- VCorpus(VectorSource(dat))
  corp <- tm_map(corp, content_transformer(tolower))
  tdm <- TermDocumentMatrix(corp)
  wordCounts <- as.data.frame(as.matrix(tdm))
  wordCounts$total <- rowSums(wordCounts)
  wordCounts <- data.frame(word = rownames(wordCounts), amount = wordCounts$total)
  top10 <- wordCounts %>%
    group_by(word) %>%
    summarise(amount = sum(amount)) %>%
    arrange(desc(amount)) %>%
    mutate(cumulPct = cumsum(amount) / sum(amount)) %>%
    head(10)
  df$top10_words <- paste(top10$word, collapse = ", ") #the top 10 words as one string
  df$top10_pct <- max(top10$cumulPct) #top 10 words' share of total words
  return(df)
}
res <- summariseData("~/Projects/CDS/Capstone/Data/final/", country = "US", lang = "en")
res <- rbind(res, summariseData("~/Projects/CDS/Capstone/Data/final/", country = "FI", lang = "fi"))
res <- rbind(res, summariseData("~/Projects/CDS/Capstone/Data/final/", country = "DE", lang = "de"))
res <- rbind(res, summariseData("~/Projects/CDS/Capstone/Data/final/", country = "RU", lang = "ru"))
#Chart 1:
melted <- melt(res, id.vars = "country", measure.vars = c("twit_median_char", "blog_median_char", "news_median_char"))
g <- ggplot(melted, aes(x=as.factor(country), y=value, fill=variable))
g <- g + geom_bar(stat="identity")
g <- g + labs(x = "Country", y = "Median number of characters", title = "Median number of characters by country")
g <- g + theme_economist()
g
#Chart 2:
melted2 <- melt(res, id.vars = "country", measure.vars = c("twit_top10_pct", "blog_top10_pct", "news_top10_pct"))
g2 <- ggplot(melted2, aes(x=as.factor(country), y=value, fill=variable))
g2 <- g2 + geom_bar(position="dodge", stat="identity")
g2 <- g2 + labs(x = "Country", y = "Top 10 words' share of total words", title = "Top 10 words' share of total words used")
g2 <- g2 + theme_economist()
g2