This is my Milestone report for my final data science project.
The report documents my progress on the NLP (Natural Language Processing) project, from downloading the data, through summarising and exploring it, to my plans for training a predictive model with this data.
require(tm)
require(ngram)
require(ggplot2)
require(stringi)
require(stringr)
require(dplyr)
require(wordcloud)
require(RColorBrewer)
trainURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
trainDataFile <- "data/Coursera-SwiftKey.zip"
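Downloading and extracting the archive is a one-time step. The sketch below is a minimal version of it; it assumes a local data/ folder and that the en_US text files end up in the working directory (the folder structure inside the zip may differ, so junkpaths is used to flatten it).
if (!file.exists(trainDataFile)) { #download the zip only once
  dir.create("data", showWarnings = FALSE)
  download.file(trainURL, destfile = trainDataFile, mode = "wb")
}
if (!file.exists("en_US.twitter.txt")) { #extract the files, flattening the folder structure inside the zip
  unzip(trainDataFile, junkpaths = TRUE, exdir = ".")
}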
#reading the data
con <- file("en_US.twitter.txt", "r") #making a connection with the file
twt <- readLines(con) #reading the lines
close(con) #closing the connection
con_blo <- file("en_US.blogs.txt","r")
blog <- readLines(con_blo)
close(con_blo)
con_new <- file("en_US.news.txt","r")
news <- readLines(con_new)
close(con_new)
We see that there are three text files that we will use for further analysis and model training, but let us first summarise them:
size.file <- round(file.info(c("en_US.twitter.txt","en_US.news.txt","en_US.blogs.txt"))$size / 1024 ^ 2) #file sizes in MB
word_twt <- stri_count_words(twt) #word count per line of each source
word_blog <- stri_count_words(blog)
word_news <- stri_count_words(news)
## now summarising the dataset into a simple data frame
data <- data.frame(source = c("Twitter","News","Blog"),
size = paste(size.file,"MB"),
no_of_lines = c(length(word_twt),length(word_news),length(word_blog)),
no_of_words = c(sum(word_twt),sum(word_news),sum(word_blog)))
data
## source size no_of_lines no_of_words
## 1 Twitter 159 MB 2360148 30096649
## 2 News 196 MB 1010242 34762658
## 3 Blog 200 MB 899288 37546806
We see that even though Twitter has the highest number of lines, its total word count is similar to its counterparts; this is probably because a tweet is limited to a fixed number of characters.
These files are very large, which can hinder our later analysis through long processing times and high memory use, so we will trim the data for smoother processing.
Now that the data is summarised, let us proceed.
Since the data is so big, we will sample only a part of it for the rest of this project; we will use a sample of 1% of each file's lines.
set.seed(200)
sample.data <- c(sample(twt,length(twt) * 0.01),sample(blog,length(blog) * 0.01),sample(news,length(news) * 0.01))
A corpus is a structured collection of text, which is important in NLP for analysing and processing text data.
I will write a function, bcorpus, that turns the sampled text into a corpus and performs the following cleaning steps: removing e-mail addresses, Twitter handles, URLs and profanity, converting the text to lower case, and stripping numbers, punctuation and extra whitespace.
First we need the profanity (bad-word) list that the function will use to detect and remove such words from the corpus.
url = "https://github.com/dsojevic/profanity-list/blob/main/en.txt" #downloaded the profanity words from this git repo
Now let us make the bcorpus function.
bcorpus <- function(dataset) {
data <- VCorpus(VectorSource(dataset))
space <- content_transformer(function(x,pattern) gsub(pattern," ",x)) #a nested function that replaces any given pattern with a space
#getting rid of those particular patterns with space func
data <- tm_map(data,space,"\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b") #e-mail address pattern
data <- tm_map(data,space,"@[^\\s]+") #Twitter handle pattern
data <- tm_map(data,space,"(f|ht)tp(s?)://(.*)[.][a-z]+") #URL pattern
#now getting rid of all the bad words
bad_con <- file("en.txt","r") #connection to the profanity list downloaded above
bad_words <- readLines(bad_con)
close(bad_con)
data <- tm_map(data,removeWords,bad_words)
#now getting rid of other things
data <- tm_map(data,content_transformer(tolower)) # transforming the text to lower case (content_transformer keeps the corpus structure intact)
data <- tm_map(data,removeNumbers) # removing the numbers
data <- tm_map(data,removePunctuation) # removing punctuation marks
data <- tm_map(data,stripWhitespace) # getting rid of extra white spaces
return(data)
}
Now I use the bcorpus function, save the resulting corpus to disk, and also convert it into a data frame.
data <- bcorpus(sample.data) ## now using this function to turn it into corpus
saveRDS(data, file = "en_US.corpus.rds") # saving the data in disk
# turning it into a data frame & saving it into disk
corpusText <- data.frame(text = unlist(sapply(data, '[', "content")), stringsAsFactors = FALSE)
con <- file("en_US.corpus.txt", open = "w")
writeLines(corpusText$text, con)
close(con)
Now we will perform Exploratory Data Analysis to get the gist of the data, so that we can plan our model-training approach.
tdm <- TermDocumentMatrix(data) #a term-document matrix: terms in rows, documents in columns, frequencies in the cells
frequency <- sort(rowSums(as.matrix(removeSparseTerms(tdm,0.99))),decreasing = TRUE) #dropping sparse terms and summing each row to get word frequencies in descending order
word_freq <- data.frame(word = names(frequency),freq = frequency) #data frame with 2 cols, name and the frequency of the word
## plot of the top 5 most frequent words
ggplot(word_freq[1:5,], aes(x = reorder(word, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "#f68060") + xlab("Word") + ylab("Frequency")
## making a word cloud for the most appearing words
wordcloud(words = word_freq$word, freq = word_freq$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
The bar chart and word cloud show the words that appear most frequently in the corpus.
I wanted to remove stop words, but since our sampled corpus is small, the model struggled to predict words for inputs containing a lot of stop words, so I have changed my approach and kept them.
For my predictive model I will build uni-, bi-, tri- and quad-gram tables, which should be enough for model training.
For the n-gram generation I wanted to use the RWeka package, but one of its dependencies, rJava, is not compatible with my system, so I will use the tm and ngram packages that I have been using already.
First I convert the corpus to a character vector, as the ngram function only takes a character string as input.
text <- sapply(data, as.character)
unigram_result <- ngram(paste(text, collapse = " "), n = 1) # For generating unigrams from the data
unigram_table <- get.phrasetable(unigram_result) #table of the unigrams along with the frequency of each one
unigram <- as.data.frame(unigram_table) #into a df for visualization
ggplot(unigram[1:10,], aes(x = reorder(ngrams, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "blue") + xlab("Unigram") + ylab("Frequency")
save(unigram,file = "unigram.RData")
bigram_result <- ngram(paste(text,collapse = " "), n = 2)
bigram_table <- get.phrasetable(bigram_result)
bigram <- as.data.frame(bigram_table)
ggplot(bigram[1:10,], aes(x = reorder(ngrams, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "blue") + xlab("Bigram") + ylab("Frequency") +
  theme(axis.text.x = element_text(hjust = 1.0, angle = 45),
        axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
save(bigram,file = "bigram.RData")
trigram_result <- ngram(paste(text,collapse = " "),n = 3)
trigram_table <- get.phrasetable(trigram_result)
trigram <- as.data.frame(trigram_table)
ggplot(trigram[1:10,], aes(x = reorder(ngrams, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "blue") + xlab("Trigram") + ylab("Frequency") +
  theme(axis.text.x = element_text(hjust = 1.0, angle = 45),
        axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
save(trigram,file = "trigram.RData")
quadgram_result <- ngram(paste(text,collapse = " "),n = 4)
quadgram_table <- get.phrasetable(quadgram_result)
quadgram <- as.data.frame(quadgram_table)
ggplot(quadgram[1:10,], aes(x = reorder(ngrams, -freq), y = freq)) +
  geom_bar(stat = "identity", fill = "blue") + xlab("Quadgram") + ylab("Frequency") +
  theme(axis.text.x = element_text(hjust = 1.0, angle = 45),
        axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
save(quadgram,file = "quadgram.RData")
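As a preview of the next phase, below is a minimal sketch of how these saved n-gram tables could drive a simple backoff prediction: look up the last words of the user's phrase in the quadgram table, then fall back to the trigram, bigram and finally the unigram table. The next_word function here is an illustrative assumption for planning purposes, not the final model.
next_word <- function(phrase, n = 3) { #hypothetical backoff lookup over the phrasetables built above
  words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 3) #keep at most the last 3 words of the input
  for (k in length(words):1) { #try the longest context first, then back off to shorter ones
    prefix <- paste(paste(words[(length(words) - k + 1):length(words)], collapse = " "), "") #trailing space matches the phrasetable format
    tab <- list(bigram, trigram, quadgram)[[k]] #a context of k words is looked up in the (k+1)-gram table
    hits <- tab[startsWith(as.character(tab$ngrams), prefix), ]
    if (nrow(hits) > 0) {
      top <- head(hits[order(-hits$freq), ], n)
      return(sapply(strsplit(trimws(as.character(top$ngrams)), " "), tail, 1)) #return the last word of each matching n-gram
    }
  }
  trimws(head(as.character(unigram$ngrams[order(-unigram$freq)]), n)) #no match at all: fall back to the most frequent unigrams
}
next_word("thanks for the") #example call returning up to 3 candidate next words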