#Loading libraries
library(stringi)
library(tm)
library(ggplot2)
library(knitr)
library(RWeka)
library(SnowballC)
# Remove the objects listed by ls() from the current environment so the
# analysis starts from a clean workspace.
# NOTE(review): this also discards whatever the caller had in that
# environment; running the script in a fresh R session would make the call
# unnecessary - confirm nothing relies on it before removing.
rm(list=ls())
# 1. Data Load
# Project root; every data path used below is relative to this directory.
# NOTE(review): setwd() ties the script to one machine's directory layout. It
# is kept because the relative paths in this file depend on it.
setwd("~/R/milestone/")

# On-disk location of each corpus, named with the label shown in the summary
# table. The order here is the row order of the table.
corpus_paths <- c(
  Twitter = "data/en_US.twitter.txt",
  Blog    = "data/en_US.blogs.txt",
  News    = "data/en_US.news.txt"
)

# Read all lines of one text file.
#
# The connection is opened in binary mode ("rb"): in text mode on Windows a
# stray Ctrl-Z (0x1A) byte is treated as end-of-file and silently truncates
# the read. skipNul = TRUE drops embedded nul bytes instead of letting them
# cut the current line short. The connection is always closed, even when
# reading fails.
# NOTE(review): the News line count recorded below this chunk looks far too
# low for a file of that size, which is consistent with such a truncation -
# re-run and confirm.
#
# path: path to a UTF-8 text file.
# Returns a character vector with one element per line.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}

# Reading the files
twitter <- read_corpus_lines(corpus_paths[["Twitter"]])
blog <- read_corpus_lines(corpus_paths[["Blog"]])
news <- read_corpus_lines(corpus_paths[["News"]])

# SUMMARY OF THREE FILES: word counts, line counts and file sizes

# Number of lines in each dataset
dtlength <- c(length(twitter), length(blog), length(news))

# Size of each file on disk, in MiB (bytes / 1024^2)
dataSize <- unname(file.size(corpus_paths)) / 1024^2

# Total number of words in each dataset
CountWord <- c(
  sum(stri_count_words(twitter)),
  sum(stri_count_words(blog)),
  sum(stri_count_words(news))
)

# Assemble the summary. data.frame() keeps the counts and sizes numeric;
# cbind() on mixed label/number vectors would coerce every column to character.
dtcol <- c("Filename", "CountWord", "CountLines", "FileSizeMb")
gpdata <- data.frame(
  names(corpus_paths),
  CountWord,
  dtlength,
  round(dataSize, 2),
  stringsAsFactors = FALSE
)
colnames(gpdata) <- dtcol

# Printing basic summaries of the three files: word counts, line counts, file size
kable(gpdata)
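# | Filename | CountWord | CountLines | FileSizeMb |
# |:---------|----------:|-----------:|-----------:|
# | Twitter  |  30218125 |    2360148 |     159.36 |
# | Blog     |  38154238 |     899288 |     200.42 |
# | News     |   2693898 |      77259 |     196.28 |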
# 2. Exploratory analysis
# 2.1. Data Cleaning
# Build a corpus from the files found under data/news, read as UTF-8 and
# tagged as English.
dm <- VCorpus(
  DirSource(directory = "data/news", encoding = "UTF-8"),
  readerControl = list(language = "en")
)

# RWeka tokenizers: `unigram` emits single tokens (min = max = 1), `bigram`
# emits pairs of adjacent tokens (min = max = 2). Weka_control() packages the
# options handed to the Weka tokenizer.
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

# Cleaning the corpus with tm_map(), which applies a transformation to every
# document.
# - tolower() is a plain character function, so it needs content_transformer()
#   to keep the documents' class; the other three are tm transformations and
#   can be passed directly.
# - stripWhitespace runs last so it also collapses the gaps left behind by
#   removing punctuation and numbers.
# - No PlainTextDocument pass: it is not needed once tolower is wrapped, and
#   it would rebuild each document with default metadata.
dm <- tm_map(dm, content_transformer(tolower))
dm <- tm_map(dm, removePunctuation)
dm <- tm_map(dm, removeNumbers)
dm <- tm_map(dm, stripWhitespace)

# Document-term matrices (one row per document, one column per term) for
# unigrams and bigrams.
# NOTE(review): no `wordLengths` control is passed, so tm's default minimum
# term length applies (3 characters according to the termFreq documentation).
# That would drop one- and two-letter words from the unigram counts - confirm
# this is intended.
tunigram <- DocumentTermMatrix(dm, control = list(tokenizer = unigram))
tbigram <- DocumentTermMatrix(dm, control = list(tokenizer = bigram))
# Plot the most frequent unigrams in the dataset.
# colSums() totals each term over all documents; as.matrix() converts the
# document-term matrix to a dense matrix first.
freq <- sort(colSums(as.matrix(tunigram)), decreasing = TRUE)
df <- data.frame(word = names(freq), freq = freq)

# Only terms seen more than 8,500 times are drawn. reorder(word, -freq) makes
# the bars run from most to least frequent; mapping `word` directly would lay
# them out alphabetically and ignore the sort above.
ggplot(df[df$freq > 8500, ], aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("") +
  ylab("Frequency") +
  ggtitle("Words that appear over 8,500\ntimes in the News dataset")

# Plot the most frequent bigrams in the dataset.
# colSums() totals each bigram over all documents; as.matrix() converts the
# document-term matrix to a dense matrix first.
freq <- sort(colSums(as.matrix(tbigram)), decreasing = TRUE)
df <- data.frame(word = names(freq), freq = freq)

# Only bigrams seen more than 1,900 times are drawn. reorder(word, -freq)
# makes the bars run from most to least frequent; mapping `word` directly
# would lay them out alphabetically and ignore the sort above. The title says
# "Bigrams" because this chart shows word pairs, not single words.
ggplot(df[df$freq > 1900, ], aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("") +
  ylab("Frequency") +
  ggtitle("Bigrams that appear over 1,900\ntimes in the News dataset")
