Exploratory Data Analysis

The purpose of this analysis is to review the three supplied text files (twitter, news, and blogs) and analyze the word patterns found in them.

The assignment lists four review goals; the first asks whether the link leads to an HTML page describing the exploratory analysis of the training data set.

# word counts below use wordcount() from the ngram package
library(ngram)

# loading the files via a connection
twitter <- readLines(con <- file("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)

# en_US.news.txt contains a control character that can end a text-mode read early, so open it in binary mode ("rb")
news <- readLines(con <- file("./Coursera-SwiftKey/final/en_US/en_US.news.txt", open = "rb"), encoding = "UTF-8")
close(con)

blogs <- readLines(con <- file("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)

# profanity list, saved locally as badwordlist.txt; source:
# http://www.frontgatemedia.com/new/wp-content/uploads/2014/03/Terms-to-Block.csv
badwordlist <- readLines(con = "badwordlist.txt")

# creating labels of the data sources for plotting
xaxislables = c("twitter","news","blogs","badwordlist")

dflength <- c(length(twitter), length(news), length(blogs),length(badwordlist))
dfwordcount <- c(wordcount(twitter), wordcount(news), wordcount(blogs),wordcount(badwordlist))

# creating a data frame of the word and length count
worddataframe <- data.frame(xaxislables, dflength, dfwordcount)
# print the data frame that holds the number of lines and the word count for each data set
print(worddataframe)
##   xaxislables dflength dfwordcount
## 1     twitter  2360148    30373583
## 2        news  1010242    34372530
## 3       blogs   899288    37334131
## 4 badwordlist      725         731
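
Twitter contributes by far the most lines but the fewest words per line (about 12.9, versus roughly 34.0 for news and 41.5 for blogs), which reflects Twitter's short message limit.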

Including Plots for Word, Line, and n-gram Counts

Bar plots of the line counts and word counts per source are generated by the plotting code in the appendix.

Sampling

A 10% sample rate was selected as a starting point for each of the three files. After the file sizes and sample sizes were computed and displayed, the sample was reduced again to allow quicker formatting into a Corpus data set; 3,000 lines were drawn for all n-gram analysis.
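
As a sketch of the sampling step (drawn from the full code in the appendix; twitter is shown, with news and blogs handled the same way):

# take a 10% training sample of the twitter lines; the remaining 90% is held out for testing
smp_size_t <- 0.10 * length(twitter)
train_twitter_indx <- sample(seq_len(length(twitter)), size = round(smp_size_t, 0))
train_twitter <- twitter[train_twitter_indx]
test_twitter <- twitter[-train_twitter_indx]

The 10% sample sizes for twitter, news, and blogs print as: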

## [1] 236014.8
## [1] 101024.2
## [1] 89928.8
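
The 3,000-line sample, once written to its own folder and read back in, becomes a Corpus holding a single document: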
##                      Length Class             Mode
## txt_train_sample.txt 2      PlainTextDocument list
##   ngrams freq       prop
## 1   the  3336 0.04766395
## 2    to  1935 0.02764681
## 3   and  1678 0.02397485
## 4     a  1678 0.02397485
## 5    of  1326 0.01894556
## 6     i  1217 0.01738820
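
Here prop is each word's share of all tokens in the sample, so the total unigram count works out to 3336 / 0.04766395 ≈ 69,990 tokens.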

##            ngrams freq         prop
## 1     one of the    23 0.0003286278
## 2       a lot of    21 0.0003000514
## 3 thanks for the    20 0.0002857633
## 4    some of the    16 0.0002286106
## 5        to be a    16 0.0002286106
## 6    you want to    14 0.0002000343
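
The trigram table reads the same way (23 / 0.0003286278 ≈ 70,000 trigrams in the sample), and frequent phrases such as "one of the" and "thanks for the" are exactly the context a next-word predictor can exploit.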

Next steps: address the ACTION items flagged in the cleaning code below and build the n-gram tables out into a next-word prediction model. The full script follows.
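
As a rough sketch of that direction (an assumption about the eventual model, not code from this analysis), the ngramoutput_3 phrase table built below already supports a crude next-word lookup:

# hypothetical helper: given two words, return the third word of the most
# frequent trigram in a get.phrasetable() result that starts with them
predict_next <- function(w1, w2, phrasetable) {
        prefix <- paste(w1, w2, "")                   # phrase-table ngrams end with a trailing space
        hits <- phrasetable[startsWith(phrasetable$ngrams, prefix), ]
        if (nrow(hits) == 0) return(NA_character_)    # a real model would back off to shorter n-grams
        strsplit(trimws(hits$ngrams[1]), " ")[[1]][3] # the table is sorted by freq, so take the top row
}
# e.g. predict_next("one", "of", ngramoutput_3) should return "the"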

library(ngram)
library(tm)
library(ggplot2)   # used for the bar plots below

set.seed(123)

# source of raw data
# https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
#

# raw data stored here
destfile1 <- "./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
destfile2 <- "./Coursera-SwiftKey/final/en_US/en_US.news.txt"
destfile3 <- "./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
fileURL <-
        "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"   

# checking to see if the files have been downloaded already
if (!file.exists(destfile1)) {
        temp <- tempfile()
        download.file(fileURL, temp, mode = "wb")
        # the zip unpacks into final/en_US/, creating the destfile paths above
        unzip(temp, exdir = "./Coursera-SwiftKey")
        unlink(temp)
}

if (file.exists(destfile1)) {
        print("Files were down loaded earlier")
        list.files("Coursera-SwiftKey/final/en_US")
}

# loading the files via a connection
twitter <- readLines(con <- file("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)

# en_US.news.txt contains a control character that can end a text-mode read early, so open it in binary mode ("rb")
news <- readLines(con <- file("./Coursera-SwiftKey/final/en_US/en_US.news.txt", open = "rb"), encoding = "UTF-8")
close(con)

blogs <- readLines(con <- file("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"), encoding = "UTF-8", skipNul = TRUE)
close(con)

# profanity list, saved locally as badwordlist.txt; source:
# http://www.frontgatemedia.com/new/wp-content/uploads/2014/03/Terms-to-Block.csv
badwordlist <- readLines(con = "badwordlist.txt")

# creating labels of the data sources for plotting
xaxislables = c("twitter","news","blogs","badwordlist")

# checking the file lengths
print("count of lines per data set")
#print(length(twitter))
#print(length(news))
#print(length(blogs))
#print(length(badwordlist))
dflength <- c(length(twitter), length(news), length(blogs),length(badwordlist))

# checking the word counts
#print("count of words per data set")
#print(wordcount(twitter))
#print(wordcount(news))
#print(wordcount(blogs))
#print(wordcount(badwordlist))
dfwordcount <- c(wordcount(twitter), wordcount(news), wordcount(blogs),wordcount(badwordlist))

# creating a data frame of the word and length count
worddataframe <- data.frame(xaxislables, dflength, dfwordcount)
print(worddataframe)

#plot line count
dflengthplot <- ggplot(worddataframe, aes(x = factor(xaxislables), y = dflength))
dflengthplot <- dflengthplot + geom_bar(stat = "identity") +
        labs(y = "# of lines", x = "text source", title = "Count of lines per corpus")
plot(dflengthplot)

#plot word count
dfwordplot <- ggplot(worddataframe, aes(x = factor(xaxislables), y = dfwordcount))
dfwordplot <- dfwordplot + geom_bar(stat = "identity") +
        labs(y = "# of words", x = "text source", title = "Count of words per corpus")
plot(dfwordplot)

# 10% of each file as the sample size, for easier testing
smp_size_t <- (0.10 * length(twitter))
smp_size_n <- (0.10 * length(news))
smp_size_b <- (0.10 * length(blogs))

print(smp_size_t)
print(smp_size_n)
print(smp_size_b)

# creating test and training sets for all three data files
train_twitter_indx <- sample(seq_len(length(twitter)), size = round(smp_size_t,0))
train_news_indx <- sample(seq_len(length(news)), size = round(smp_size_n,0))
train_blogs_indx <- sample(seq_len(length(blogs)), size = round(smp_size_b,0))

train_twitter <- twitter[train_twitter_indx]
train_news <- news[train_news_indx]
train_blogs <- blogs[train_blogs_indx]

test_twitter <- twitter[-train_twitter_indx]
test_news <- news[-train_news_indx]
test_blogs <- blogs[-train_blogs_indx]

train_sample <- c(train_twitter,train_news,train_blogs)
test_sample <- c(test_twitter,test_news,test_blogs)

writeLines(train_sample, "text_train.txt")
writeLines(test_sample, "text_test.txt")

#Getting a sample of the sample to ensure the n-grams run within a reasonable time frame (may go with 50 non-random selections to get the full sample tested)

txt_train_sample <- sample(train_sample,size = 3000)

# writing a small sample file for n-gram creation, in its own folder
if (!dir.exists("working_folder")) dir.create("working_folder")
writeLines(txt_train_sample, "./working_folder/txt_train_sample.txt")

#Cleaning up development objects to free memory before building the corpus
rm(list = ls())


#
# This example aided a great deal in getting my code to function correctly
#
# http://stackoverflow.com/questions/19615181/finding-ngrams-in-r-and-comparing-ngrams-across-corpora
#

# profanity and odd-word lists, loaded as plain text; profanity list source:
# http://www.frontgatemedia.com/new/wp-content/uploads/2014/03/Terms-to-Block.csv
badwordlist <- readLines(con = "badwordlist.txt")
oddwordlist <- readLines(con = "oddwordlist.txt")

a <- Corpus(DirSource('working_folder'))
summary(a)
a <- tm_map(a, removeNumbers)
a <- tm_map(a, removePunctuation)
a <- tm_map(a, stripWhitespace)
a <- tm_map(a, content_transformer(tolower))  # wrap base tolower so the result stays a corpus
a <- tm_map(a, removeWords, badwordlist)
a <- tm_map(a, removeWords, oddwordlist)

# ACTION ## need to remove repeated-word examples
# ACTION ## need to remove the mis-encoded character sequences (mangled curly
# ACTION ## quotes, apostrophes, and ellipses) left over from encoding damage
# ACTION ## may need to remove non-meaningful single-character tokens
# ACTION ## may need to remove non-meaningful two-character tokens such as "rt"
# ACTION ## may need to remove contractions whose apostrophes were mangled ("don't", "isn't")
#a <- tm_map(a, removeWords, stopwords("english"))  # stop words are being left in at this time
#a <- tm_map(a, stemDocument, language = "english") # stemming removed as it would hurt prediction accuracy
# stemming also worked, but it takes too long on this sample
#a <- tm_map(a, removeWords, "TRUE")
#a <- tm_map(a, removeWords, "FALSE")
# (the mis-encoded tokens noted in the ACTION items would be removed the same way)
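
# A sketch of one possible fix for the mis-encoded characters (an assumption,
# not a step applied in this analysis): convert to ASCII and drop anything
# that does not survive the conversion.
#a <- tm_map(a, content_transformer(function(x) iconv(x, "UTF-8", "ASCII", sub = "")))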

adtm <- DocumentTermMatrix(a)
adtm <- removeSparseTerms(adtm, 0.75)

#inspect(adtm) 

# copying the corpus for the 3-gram analysis before it is collapsed to a string
b <- a

data_freq <- findMostFreqTerms(adtm, n = 1000)

# this takes the tm corpus and concatenates it into one string that the
# ngram functions can work on
# https://cran.r-project.org/web/packages/ngram/vignettes/ngram-guide.pdf

a <- concatenate(as.character(a[[1]]))  # the corpus holds a single document

ng_1 <- ngram(a, n = 1)

# build the phrase table; the full table is not printed in the rendered report
ngramoutput_1 <- get.phrasetable(ng_1)

head(ngramoutput_1)

#ngram_asweka(str, min = 2, max = 2, sep = " ")

write.csv(ngramoutput_1, file = "ngramoutput_1.csv",row.names = FALSE)

# selecting unigrams with frequency >= 140 for plotting
wordsample <- ngramoutput_1[which(ngramoutput_1$freq >= 140), ]

#plot word frequencies
df_freqplot <- ggplot(wordsample, aes(x = factor(ngrams), y = freq))
df_freqplot <- df_freqplot + geom_bar(stat = "identity") +
        labs(y = "# of times word is in sample", x = "word text", title = "Most frequent words in the sample")
plot(df_freqplot)

b <- concatenate(as.character(b[[1]]))  # same concatenation for the 3-gram pass

# an ngram object with 3-grams
ng <- ngram(b, n = 3)

# build the phrase table; again not printed in full
ngramoutput_3 <- get.phrasetable(ng)

head(ngramoutput_3)

write.csv(ngramoutput_3, file = "ngramoutput_3.csv",row.names = FALSE)

# selecting trigrams with frequency >= 3 for plotting
ngramsample <- ngramoutput_3[which(ngramoutput_3$freq >= 3), ]

#plot ngram count
ngramsampleplot <- ggplot(ngramsample, aes(x = factor(ngrams), y = freq))
ngramsampleplot <- ngramsampleplot + geom_bar(stat = "identity") +
        labs(y = "# of ngrams in sample", x = "ngram text", title = "Most frequent trigrams in the sample")
plot(ngramsampleplot)