1. Data Load

# Set the project directory so the relative "data/..." paths used in this
# report resolve.
# NOTE(review): setwd() ties the script to one machine layout; it is kept
# because the relative paths in this file depend on it.
setwd("~/R/milestone/")

# Paths of the three raw text files, declared once so that reading and
# file-size reporting cannot drift apart.
data_files <- c(
  "data/en_US.twitter.txt",
  "data/en_US.blogs.txt",
  "data/en_US.news.txt"
)

# Open every file in binary mode ("rb"). In text mode a read can stop early
# when the file contains a stray control byte (notably on Windows), which
# silently under-counts lines and words.
TW <- file(data_files[1], open = "rb")
BL <- file(data_files[2], open = "rb")
NW <- file(data_files[3], open = "rb")

#Reading the files
# encoding = "UTF-8" marks the strings as UTF-8; skipNul = TRUE skips embedded
# nul bytes instead of cutting the affected line short.
twitter <- readLines(TW, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
blog    <- readLines(BL, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news    <- readLines(NW, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)

#Close the files as soon as they have been read
close(TW)
close(BL)
close(NW)

#SUMMARY OF THREE FILES : Word counts, line counts and basic data tables

#Creating a vector with the length (number of lines) of each dataset
dtlength <- c(length(twitter), length(blog), length(news))

#Creating a vector with the size of each dataset, converted from bytes to MiB
dataSize <- file.size(data_files) / 1024^2

#Creating a vector with the count of words of each dataset
CountWord <- c(
  sum(stri_count_words(twitter)),
  sum(stri_count_words(blog)),
  sum(stri_count_words(news))
)

# Column headings and row labels of the summary table.
# NOTE(review): the variable `cat` has the same name as base::cat(); the name
# is kept so that any later code referring to it keeps working.
dtcol <- c("Filename", "CountWord", "CountLines", "FileSizeMb")
cat <- c("Twitter", "Blog", "News")

# Build the data frame directly from the vectors. Going through cbind() first
# would create a character matrix and turn every count and size into text.
gpdata <- data.frame(
  cat,
  CountWord,
  dtlength,
  round(dataSize, 2),
  stringsAsFactors = FALSE
)
colnames(gpdata) <- dtcol #Naming the columns

#Printing basic summaries of the three files : Word counts, line counts, file size
kable(gpdata)

Filename	CountWord	CountLines	FileSizeMb
Twitter	30218125	2360148	159.36
Blog	38154238	899288	200.42
News	2693898	77259	196.28

2. Exploratory analysis

#2.1. Data Cleaning

#Using the file of news

# Build a volatile corpus from every file found in data/news.
# The source is read as UTF-8 and the documents are tagged as English.
news_source <- DirSource(directory = "data/news", encoding = "UTF-8")
dm <- VCorpus(news_source, readerControl = list(language = "en"))

# N-gram tokenizers built on RWeka.

# Returns a tokenizer function that splits its input into n-grams of exactly
# `n` words by passing min = max = n to Weka through Weka_control().
make_ngram_tokenizer <- function(n) {
  force(n) # fix the value now so each tokenizer keeps its own n
  function(x) {
    NGramTokenizer(x, Weka_control(min = n, max = n))
  }
}

# Single words.
unigram <- make_ngram_tokenizer(1)
# Pairs of consecutive words.
bigram <- make_ngram_tokenizer(2)

#Cleaning the corpus
# tm_map() applies one transformation to every document of the corpus;
# content_transformer() wraps a plain text function so it can be used that way.

dm <- tm_map(dm, content_transformer(removePunctuation))
dm <- tm_map(dm, content_transformer(removeNumbers))
# NOTE(review): this conversion looks redundant now that tolower is wrapped in
# content_transformer() - confirm before removing it.
dm <- tm_map(dm, PlainTextDocument)
dm <- tm_map(dm, content_transformer(tolower))
# Collapse whitespace last, so the gaps left behind by the removed digits are
# squeezed as well.
dm <- tm_map(dm, content_transformer(stripWhitespace))

# Document-term matrices of the cleaned corpus: one row per document, one
# column per term, where the terms are produced by the given tokenizer.

unigram_control <- list(tokenizer = unigram)
bigram_control <- list(tokenizer = bigram)

tunigram <- DocumentTermMatrix(dm, control = unigram_control)
tbigram <- DocumentTermMatrix(dm, control = bigram_control)

# Total occurrences of every term of a document-term matrix.
#
# dtm: a document-term matrix (terms in columns).
# Returns a named numeric vector of column totals, most frequent term first.
sorted_term_counts <- function(dtm) {
  sort(colSums(as.matrix(dtm)), decreasing = TRUE)
}

# Bar chart of the terms whose frequency exceeds a threshold.
#
# term_df:    data frame with columns `word` and `freq`.
# min_freq:   only terms with freq strictly greater than this are drawn.
# term_label: plural noun used in the title, e.g. "Words" or "Bigrams".
# Returns the ggplot object; at top level it is printed automatically.
plot_frequent_terms <- function(term_df, min_freq, term_label) {
  frequent <- term_df[term_df$freq > min_freq, ]
  # Derive the title from the threshold so the two can never disagree.
  plot_title <- paste0(
    term_label, " that appear over ", format(min_freq, big.mark = ","),
    "\ntimes in the News dataset"
  )
  # reorder() puts the most frequent term first instead of sorting the bars
  # alphabetically.
  ggplot(frequent, aes(x = reorder(word, -freq), y = freq)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("") +
    ylab("Frequency") +
    ggtitle(plot_title)
}

# Plot Word Frequencies appearing in the dataset viewed like Unigram

freq <- sorted_term_counts(tunigram)
df <- data.frame(word = names(freq), freq = freq)

plot_frequent_terms(df, 8500, "Words")

# Plot Word Frequencies appearing in the dataset viewed like Bigram

freq <- sorted_term_counts(tbigram)
df <- data.frame(word = names(freq), freq = freq)

plot_frequent_terms(df, 1900, "Bigrams")

Efficient Estimation of Word Representation in a Corpus

Jean Marie Cimula

August 31, 2016

1. Data Load

2. Exploratory analysis