Goal of analysis

The purposes of this work are:

Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and the relationships between words in the corpora.

Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.

In more detail, the goal is to answer the following questions:

1. Some words are more frequent than others - what are the distributions of word frequencies?
2. What are the frequencies of 2-grams and 3-grams in the dataset?
3. How many unique words do you need in a frequency-sorted dictionary to cover 50% of all word instances in the language? 90%?
4. How do you evaluate how many of the words come from foreign languages?

A further goal is to think of a way to increase coverage - identifying words that may not be in the corpora, or using a smaller number of words in the dictionary to cover the same number of phrases - and to create our own model for the final project.

Work with data

Download the zip file with the dataset

datafile<-"C:\\Users\\wassim\\MEGA\\program_data\\r\\Coursera-SwiftKey.zip"
if(!file.exists(datafile)){
  url<-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(url, datafile)
}

Install the needed libraries and load them

library(stringi); library(stringr); library(ggplot2); library(NLP); library(tm); library(RWeka); library(rJava); library(openNLP); library(SnowballC); library(qdap); library(wordcloud)
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'ggplot2' was built under R version 3.4.4
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## Warning: package 'tm' was built under R version 3.4.4
## Warning: package 'RWeka' was built under R version 3.4.4
## Warning: package 'rJava' was built under R version 3.4.4
## Warning: package 'openNLP' was built under R version 3.4.4
## Warning: package 'qdap' was built under R version 3.4.4
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Warning: package 'qdapRegex' was built under R version 3.4.4
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
## 
##     %+%
## Loading required package: qdapTools
## Warning: package 'qdapTools' was built under R version 3.4.4
## Loading required package: RColorBrewer
## 
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:stringr':
## 
##     %>%
## The following object is masked from 'package:base':
## 
##     Filter
## Warning: package 'wordcloud' was built under R version 3.4.4
# Note: in a fresh R session this installation check should run before the library() calls above.
packages <- c("stringi", "stringr", "ggplot2", "NLP", "tm", "RWeka", "rJava", "openNLP", "SnowballC", "qdap", "wordcloud")
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())))
}

Read the files directly from the zip archive, without unzipping

dataEnNews <- read.table(unz(datafile, "final/en_US/en_US.news.txt"),
                          encoding = "UTF-8", sep = "\t",quote = "", stringsAsFactors = FALSE)

dataEnBlogs <- read.table(unz(datafile, "final/en_US/en_US.blogs.txt"), 
                          encoding = "UTF-8", sep = "\t",quote = "", stringsAsFactors = FALSE)

dataEnTwitter <- read.table(unz(datafile, "final/en_US/en_US.twitter.txt"), 
                            encoding = "UTF-8", sep = "\t",quote = "", stringsAsFactors = FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : embedded nul(s) found in input
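
The embedded-nul warning above comes from stray binary characters in the twitter file. A hedged alternative (not used in the rest of this report; the variable name dataEnTwitterLines is illustrative) is to read the raw lines with readLines and skipNul = TRUE:

# Alternative: read raw lines from inside the zip, skipping embedded nuls.
con <- unz(datafile, "final/en_US/en_US.twitter.txt", open = "rt")
dataEnTwitterLines <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)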

Choose 1000 random rows from each dataset
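
For reproducibility, one could fix the random seed before sampling; the original analysis does not do this, and the value below is arbitrary:

# Fix the RNG seed so the 1000-row samples below are reproducible (arbitrary value).
set.seed(1234)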

dataEnNews1000 <- sample(dataEnNews[1:nrow(dataEnNews),],1000)
dataEnBlogs1000 <- sample(dataEnBlogs[1:nrow(dataEnBlogs),],1000)
dataEnTwitter1000 <- sample(dataEnTwitter[1:nrow(dataEnTwitter),],1000)

dataAll1000<-c(dataEnNews1000,dataEnBlogs1000,dataEnTwitter1000)

Clean data. The cleaning steps below split the text into sentences, convert everything to lower case, remove punctuation, and strip all remaining non-letter characters (including numbers). Stemming is optional and is only sketched after the cleaned output below.

head(dataAll1000)
## [1] "øThere are other industry factors at work. The availability of used cars has fallen while used car prices have risen. These are not factors that encourage customers to visit their local CarMax and go car shopping."                                    
## [2] "Mr. Abel was the treasurer for Jefferson County when he died on Wednesday (March 23, 2011) at Jefferson Regional Medical Center in Crystal City."                                                                                                         
## [3] "\"I'm curious ... noon out here (is different),\" Hitchcock said. \"You saw one team was really dozy in the 12:30 game (Saturday) in Washington at the start, really dozy. I sure as (heck) hope it ain't us. Because if it's us, (the series) is over.\""
## [4] "Southwest said fourth-quarter costs for each mile it flies will rise slightly in the fourth quarter. That forecast doesn't include fuel."                                                                                                                 
## [5] "The Grizzlies also have won a franchise-best 11 straight home games and will open the playoffs at home Sunday against the Clippers."                                                                                                                      
## [6] "(on local control of the St. Louis Police Department)"
dataAll1000<-sent_detect(dataAll1000)
dataAll1000<-tolower(dataAll1000)
dataAll1000<-removePunctuation(dataAll1000)
head(dataAll1000)
## [1] "øthere are other industry factors at work"                                                                                 
## [2] "the availability of used cars has fallen while used car prices have risen"                                                 
## [3] "these are not factors that encourage customers to visit their local carmax and go car shopping"                            
## [4] "mr"                                                                                                                        
## [5] "abel was the treasurer for jefferson county when he died on wednesday at jefferson regional medical center in crystal city"
## [6] "im curious "
dataAll1000 <- gsub("[^a-zA-Z ]", '', dataAll1000)
head(dataAll1000) 
## [1] "there are other industry factors at work"                                                                                  
## [2] "the availability of used cars has fallen while used car prices have risen"                                                 
## [3] "these are not factors that encourage customers to visit their local carmax and go car shopping"                            
## [4] "mr"                                                                                                                        
## [5] "abel was the treasurer for jefferson county when he died on wednesday at jefferson regional medical center in crystal city"
## [6] "im curious "
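
Stemming was mentioned above but is not applied to dataAll1000 in the rest of the analysis. A minimal sketch using wordStem from SnowballC (already loaded) would look like this:

# Optional: reduce words to their stems with the Porter stemmer from SnowballC.
stemExample <- wordStem(c("running", "cars", "availability"), language = "english")
stemExample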

Answers to the questions

Question 1 - Some words are more frequent than others - what are the distributions of word frequencies?

words<-data.frame(word = unlist(stri_extract_all_words(dataAll1000)), stringsAsFactors = FALSE)
wordcloud(words$word, scale = c(5, 0.5), max.words = 20, 
          colors = brewer.pal(8,"Dark2"), random.order=FALSE, rot.per=0.35, use.r.layout=FALSE)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) removeWords(x,
## stopwords())): transformation drops documents
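
The word cloud shows the most frequent words; to see the distribution of word frequencies itself (heavily skewed, as expected), a simple sketch using the words table built above:

# Histogram of word frequencies on a log10 scale; most words occur only a few
# times while a handful of words are extremely frequent.
wordFreq <- sort(table(words$word), decreasing = TRUE)
hist(log10(as.numeric(wordFreq)), breaks = 30,
     main = "Distribution of word frequencies",
     xlab = "log10(word frequency)")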

Question 2 - What are the frequencies of 2-grams and 3-grams in the dataset?

# Tokenize the cleaned sentences directly into 1-, 2- and 3-grams
# (tokenizing a token list a second time would build n-grams across token boundaries).
gramOne<-NGramTokenizer(dataAll1000, Weka_control(min=1, max=1))
gramOne<-data.frame(table(gramOne))
gramOne<-gramOne[order(gramOne$Freq, decreasing = TRUE),]

gramTwo<-NGramTokenizer(dataAll1000, Weka_control(min=2, max=2))
gramTwo<-data.frame(table(gramTwo))
gramTwo<-gramTwo[order(gramTwo$Freq, decreasing = TRUE),]

gramThree<-NGramTokenizer(dataAll1000, Weka_control(min=3,max=3))
gramThree<-data.frame(table(gramThree))
gramThree<-gramThree[order(gramThree$Freq, decreasing = TRUE),]
barplot(gramOne$Freq[1:20], names.arg = gramOne$gramOne[1:20],
        space = 0.5,col = 'red', las = 2, main = "Popularity of words in dataset")

barplot(gramTwo$Freq[1:20], names.arg = gramTwo$gramTwo[1:20],
        space = 0.5,col='green',las = 2, main = "Popularity of phrase of two words in dataset")

barplot(gramThree$Freq[1:20], names.arg = gramThree$gramThree[1:20],
        space = 0.5, col='blue',las = 2, main = "Popularity of phrase of three words in dataset")
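
To complement the barplots, the top bigrams and trigrams can also be listed as tables, using the frequency-sorted data frames built above:

# Top 10 most frequent 2-grams and 3-grams in the sample.
head(gramTwo, 10)
head(gramThree, 10)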

Question 3 - How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?

wordsCount<-nrow(words)
wordsUnique<-unique(words)
wordsUniqueCount<-nrow(wordsUnique)
wordsPercentUnique<-wordsUniqueCount/wordsCount
wordsPercentUnique
## [1] 0.1667435

This ratio only tells us that about 17% of the word tokens in the sample are unique word types. To answer the question directly, we need to walk down a frequency-sorted dictionary and count how many unique words are required before the cumulative frequency reaches 50% and 90% of all word instances; a sketch of that calculation follows.
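
A minimal sketch, assuming the frequency-sorted unigram table gramOne from Question 2 is still in scope:

# Cumulative coverage of word instances as words are added from the
# frequency-sorted dictionary (most frequent first).
coverage <- cumsum(gramOne$Freq) / sum(gramOne$Freq)
wordsFor50 <- which(coverage >= 0.5)[1]   # unique words needed for 50% coverage
wordsFor90 <- which(coverage >= 0.9)[1]   # unique words needed for 90% coverage
c(wordsFor50, wordsFor90)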

Question 4 - How do you evaluate how many of the words come from foreign languages?

# Collapse the sampled words into a single String for the openNLP annotators
s<-as.String(paste(words$word, collapse = " "))
sent_token_annotator <- Maxent_Sent_Token_Annotator()
word_token_annotator <- Maxent_Word_Token_Annotator()
library(openNLPdata)
## Warning: package 'openNLPdata' was built under R version 3.4.4

a2 <- annotate(s, list(sent_token_annotator, word_token_annotator))

pos_tag_annotator <- Maxent_POS_Tag_Annotator()

a3 <- annotate(s, pos_tag_annotator, a2)

## Determine the distribution of POS tags for word tokens.
a3w <- subset(a3, type == "word")
tags <- sapply(a3w$features, `[[`, "POS")
tagword<-table(tags)
if(is.na(tagword["FW"])){
  tagword["FW"]<-0
} 
#Foreign words in base
fw<-paste(round(100*tagword["FW"]/sum(tagword), 3), sep="")
ew<-paste(round(100*(sum(tagword)-tagword["FW"])/sum(tagword), 3), sep="")

data<-c(fw,ew)
data<-as.numeric(data)
lbls<-c("FW","EW")
lbls <- paste(lbls, data)
lbls <- paste(lbls,"%",sep="")

From the result we see that foreign words (tokens tagged FW by the POS tagger) make up less than 1 percent of our sample dataset.

pie(data, labels = lbls, main="Percent of foreign words in our dataset",col=rainbow(length(lbls)), radius = 1)
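
The FW tag from the POS tagger is only a rough indicator. As a cross-check, a hedged alternative (not part of the original approach) is to count how many tokens are missing from an English word list such as GradyAugmented, which ships with qdapDictionaries (loaded above as a dependency of qdap):

# Rough cross-check: share of tokens not found in an English word list.
# Misspellings and proper nouns are also counted, so this overestimates
# the true share of foreign words.
notInDictionary <- !(words$word %in% qdapDictionaries::GradyAugmented)
round(100 * mean(notInDictionary), 3)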