The exploratory analysis below focuses on a few key features of the datasets: the median number of characters per line for each country and medium, as well as the top 10 words in each. For each dataset, the analysis is performed on a sample made up of the first 500 lines of the file.
The chart below shows the median number of characters per line for each of the datasets.
These are the top 10 words used in news articles from each country:
## country news_top10_words
## 1 US the, and, for, that, with, was, but, his, said, not
## 2 FI että, mutta, oli, myös, ovat, hän, kun, ole, joka, mukaan
## 3 DE die, der, und, den, das, von, mit, sich, auf, dem
## 4 RU что, как, это, для, его, уже, только, все, будет, чтобы
The chart below shows the share of all words accounted for by the top 10 words in each language.
In Finnish and Russian the top-10 words account for a smaller share of all words than in English and German. This may make it more difficult to predict words in Finnish and Russian. Further exploration is needed to determine whether this is a problem, and whether the same discrepancy between the languages also shows up in word pairs (a rough sketch of that check follows).
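One way to check the word-pair question would be to repeat the coverage calculation on adjacent word pairs. The function below is only a sketch and not part of the analysis above: it assumes a character vector of lines, uses simple punctuation-based tokenisation instead of the tm pipeline, and pairs words across line boundaries, which is good enough for a first look.
#sketch: share of all adjacent word pairs accounted for by the 10 most frequent pairs
top10PairShare <- function(lines) {
  words <- unlist(strsplit(tolower(lines), "[^[:alpha:]']+"))
  words <- words[words != ""]
  pairs <- paste(head(words, -1), tail(words, -1)) #adjacent word pairs
  counts <- sort(table(pairs), decreasing = TRUE)
  sum(head(counts, 10)) / sum(counts)
}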
The plan is to create a Shiny app that lets the user enter a word or short phrase in a chosen language. In return, the user is presented with the three words most likely to follow. The app will also contain visuals indicating the confidence level of each prediction. A minimal sketch of the app skeleton is shown below.
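The sketch below only illustrates the intended input/output wiring; predictNextWords is a hypothetical placeholder for the prediction model, which has not been built yet.
require(shiny)
#placeholder: the real model will return the three most likely next words with confidence scores
predictNextWords <- function(phrase, lang) {
  data.frame(word = character(3), confidence = numeric(3))
}
ui <- fluidPage(
  titlePanel("Next-word prediction"),
  selectInput("lang", "Language", choices = c("en", "fi", "de", "ru")),
  textInput("phrase", "Type a word or short phrase"),
  tableOutput("predictions")
)
server <- function(input, output) {
  output$predictions <- renderTable({
    predictNextWords(input$phrase, input$lang)
  })
}
shinyApp(ui = ui, server = server)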
The following code was used to produce the results presented in this report:
require(readr)
require(tm)
require(dplyr)
require(ggplot2)
require(ggthemes)
require(reshape2)
summariseData <- function(dataDir, country, lang) {
  df <- data.frame()
  df[1, "country"] <- country
  prefix <- paste0(dataDir, lang, "_", country, "/", lang, "_", country)
  #read a 500-line sample from each medium
  twit <- read_lines(file = paste0(prefix, ".twitter.txt"), n_max = 500)
  blog <- read_lines(file = paste0(prefix, ".blogs.txt"), n_max = 500)
  news <- read_lines(file = paste0(prefix, ".news.txt"), n_max = 500)
  #collect stats
  stats <- calculateSummary(twit)
  df$twit_median_char <- stats$median_char #median number of characters per line
  df$twit_top10_words <- stats$top10_words #the top 10 words
  df$twit_top10_pct <- stats$top10_pct #top 10 words' share of total words
  stats <- calculateSummary(blog)
  df$blog_median_char <- stats$median_char #median number of characters per line
  df$blog_top10_words <- stats$top10_words #the top 10 words
  df$blog_top10_pct <- stats$top10_pct #top 10 words' share of total words
  stats <- calculateSummary(news)
  df$news_median_char <- stats$median_char #median number of characters per line
  df$news_top10_words <- stats$top10_words #the top 10 words
  df$news_top10_pct <- stats$top10_pct #top 10 words' share of total words
  return(df)
}
calculateSummary <- function(dat) {
  df <- data.frame(id = 1)
  df$total_char <- sum(nchar(dat)) #total number of characters
  df$median_char <- median(nchar(dat)) #median number of characters per line
  #top-10 words
  corp <- VCorpus(VectorSource(dat))
  corp <- tm_map(corp, content_transformer(tolower))
  tdm <- TermDocumentMatrix(corp)
  wordCounts <- as.data.frame(as.matrix(tdm))
  wordCounts$total <- rowSums(wordCounts)
  wordCounts <- data.frame(word = rownames(wordCounts), amount = wordCounts$total)
  top10 <- wordCounts %>%
    group_by(word) %>%
    summarise(amount = sum(amount)) %>%
    arrange(desc(amount)) %>%
    mutate(cumulPct = cumsum(amount) / sum(amount)) %>%
    head(10)
  df$top10_words <- paste(top10$word, collapse = ", ") #the top 10 words as one string
  df$top10_pct <- max(top10$cumulPct) #top 10 words' share of total words
  return(df)
}
res <- summariseData("~/Projects/CDS/Capstone/Data/final/", country = "US", lang = "en")
res <- rbind(res, summariseData("~/Projects/CDS/Capstone/Data/final/", country = "FI", lang = "fi"))
res <- rbind(res, summariseData("~/Projects/CDS/Capstone/Data/final/", country = "DE", lang = "de"))
res <- rbind(res, summariseData("~/Projects/CDS/Capstone/Data/final/", country = "RU", lang = "ru"))
#Chart 1:
melted <- melt(res, id.vars = "country", measure.vars = c("twit_median_char", "blog_median_char", "news_median_char"))
g <- ggplot(melted, aes(x=as.factor(country), y=value, fill=variable))
g <- g + geom_bar(stat="identity")
g <- g + labs(x = "Country", y = "Median number of characters", title = "Median number of characters by country")
g <- g + theme_economist()
g
#Chart 2:
melted2 <- melt(res, id.vars = "country", measure.vars = c("twit_top10_pct", "blog_top10_pct", "news_top10_pct"))
g2 <- ggplot(melted2, aes(x=as.factor(country), y=value, fill=variable))
g2 <- g2 + geom_bar(position="dodge", stat="identity")
g2 <- g2 + labs(x = "Country", y = "Top 10 words' share of total words", title = "Top 10 words' share of total words used")
g2 <- g2 + theme_economist()
g2