# Install the required CRAN packages only when they are not already available.
# Reinstalling unconditionally is slow and can fail when a package is in use
# (the previous run could not replace the installed copy of 'tm').
for (pkg in c("tm", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(tm)
## Loading required package: NLP
require(SnowballC)
## Loading required package: SnowballC
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
setwd("C:\\Users\\baner\\Downloads")
# Remote location of the Coursera-SwiftKey archive.
# NOTE(review): this is the commonly published capstone dataset URL; confirm it
# is still the source you intend to use.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Local path of the downloaded archive (same file the unzip step reads).
zip_path <- "C:/Users/baner/Downloads/Coursera-Swiftkey.zip"
# Download only when the archive itself is missing. The check must target the
# zip file, not the Downloads directory, which always exists.
if (!file.exists(zip_path)) {
  download.file(url, destfile = zip_path, mode = "wb")
}
# Unzip the archive only if the directory "Coursera-SwiftKey" does not exist
if (!dir.exists("Coursera-SwiftKey")) {
  unzip(zipfile = zip_path, exdir = "Coursera-SwiftKey")
}
1.2 Read Data
# List the English-language source files shipped with the dataset.
file_path <- list.files("C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US",full.names = TRUE)
print(file_path)
## [1] "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.blogs.txt"
## [2] "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.news.txt"
## [3] "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.twitter.txt"

# Read all lines of one text file as UTF-8.
#   path: path to the file to read.
# Returns a character vector with one element per line.
# The connection is opened in binary mode and embedded nul bytes are skipped:
# the plain readLines(path) calls previously warned about embedded nuls in the
# twitter file and an incomplete final line in the news file.
# NOTE(review): the news-file warning suggests the text-mode read may have
# stopped early; compare length(en_news) with the expected line count.
read_corpus_file <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.blogs.txt"
en_blogs <- read_corpus_file(blogs)
news <- "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.news.txt"
en_news <- read_corpus_file(news)
twitter <- "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.twitter.txt"
en_twitter <- read_corpus_file(twitter)
set.seed(5454568)
# Share of each source kept for the exploratory analysis.
# NOTE(review): 1% is a placeholder chosen to keep the corpus small; adjust it
# to the memory available.
sample_fraction <- 0.01

# Draw a random subset of a character vector.
#   lines:    character vector to sample from.
#   fraction: share of elements to keep (0-1).
# Returns the selected elements in random order.
# The indices come from the data vector itself. The earlier code used the
# length of the file-path string (always 1), so every "sample" was just the
# first line of each file.
sample_lines <- function(lines, fraction) {
  lines[sample(seq_along(lines), size = ceiling(length(lines) * fraction))]
}

sampleTwitter <- sample_lines(en_twitter, sample_fraction)
sampleBlogs <- sample_lines(en_blogs, sample_fraction)
sampleNews <- sample_lines(en_news, sample_fraction)
## Combine data samples
sampleData <- c(sampleTwitter, sampleBlogs, sampleNews)
## Save sample data and remove data not needed to free memory
writeLines(sampleData, "sampleData.txt")
rm(en_twitter, en_news, en_blogs, sampleTwitter, sampleNews, sampleBlogs)
2.2 READ SAMPLE DATA AND CLEAN THE DATA
# Load the saved sample and wrap it in a tm corpus (one document per line).
sampleData <- readLines("sampleData.txt", encoding="UTF-8")
corpus <- VCorpus(VectorSource(sampleData))
## Replace separators and non-printable characters with spaces, lowercase,
## then remove stopwords, punctuation and numbers, stem, and tidy whitespace
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "\"|/|@|\\|")
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
corpus <- tm_map(corpus, content_transformer(tolower))
# Stop words are removed BEFORE punctuation stripping and stemming: the stop
# list holds unstemmed words, some with apostrophes, which no longer match
# once the text has been stripped or stemmed.
# tm:: is explicit because quanteda (attached later) masks stopwords().
corpus <- tm_map(corpus, removeWords, tm::stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stemDocument)
# Collapse whitespace last so gaps left by the removals above are cleaned up.
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
2.3 SUMMARY AND OVERVIEW DATA
# Draw a word cloud of the cleaned corpus: at most 101 words, each occurring
# at least 5 times, placed in random order, coloured with the 8-colour
# "Set2" palette.
wordcloud(
  corpus,
  min.freq = 5,
  max.words = 101,
  random.order = TRUE,
  rot.per = 0.5,
  colors = brewer.pal(8, "Set2"),
  use.r.layout = FALSE
)
3. INTERESTING FINDINGS
library(quanteda)
## Package version: 4.0.2
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
##
## stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
library(quanteda.textstats)
# Convert the tm corpus into a quanteda corpus object
quanteda_corpus <- quanteda::corpus(corpus)
# Tokenize the corpus into words
tokens_unigram <- tokens(quanteda_corpus)
# Build bigrams and trigrams from the word tokens. tokens() ignores an
# `ngrams` argument (it only warns), which previously made the bigram and
# trigram tables copies of the unigram table. tokens_ngrams() is the
# function that forms n-grams.
tokens_bigram <- tokens_ngrams(tokens_unigram, n = 2, concatenator = " ")
tokens_trigram <- tokens_ngrams(tokens_unigram, n = 3, concatenator = " ")
# Create a document-feature matrix (dfm) from the tokens
dfm_unigram <- dfm(tokens_unigram)
dfm_bigram <- dfm(tokens_bigram)
dfm_trigram <- dfm(tokens_trigram)
# Get the frequency of each n-gram
unigram_freq <- textstat_frequency(dfm_unigram)
bigram_freq <- textstat_frequency(dfm_bigram)
trigram_freq <- textstat_frequency(dfm_trigram)
# Order by decreasing frequency
unigram <- unigram_freq[order(unigram_freq$frequency, decreasing = TRUE), ]
bigram <- bigram_freq[order(bigram_freq$frequency, decreasing = TRUE), ]
trigram <- trigram_freq[order(trigram_freq$frequency, decreasing = TRUE), ]
# Print results (re-run to regenerate the output tables)
print(head(unigram, 10)) # Top 10 unigrams
print(head(bigram, 10)) # Top 10 bigrams
print(head(trigram, 10)) # Top 10 trigrams
3.1 MOST COMMONLY USED ONE WORD
# Bar chart of the most frequent single words (up to 30).
par(mfrow = c(1, 1))
par(mar=c(5,4,2,0))
# head() returns only the rows that exist; indexing with 1:30 pads the table
# with NA rows when there are fewer than 30 distinct words.
top_unigrams <- head(unigram, 30)
barplot(top_unigrams$frequency,
        names.arg = top_unigrams$feature,
        col = "red",
        main = "Most commonly used Words (Top 30)",
        las = 2, # labels perpendicular to the axis so all 30 stay readable
        ylab = "Frequency")
3.2 MOST COMMONLY USED TWO WORD COMBINATIONS
# Bar chart of the most frequent two-word combinations (up to 30).
par(mar=c(8.5,4,2,1))
# head() returns only the rows that exist; indexing with 1:30 pads the table
# with NA rows when there are fewer than 30 distinct bigrams.
top_bigrams <- head(bigram, 30)
barplot(top_bigrams$frequency,
        names.arg = top_bigrams$feature,
        col = "blue",
        main = "Most commonly used two word combinations (Top 30)",
        las = 2, # perpendicular labels use the enlarged bottom margin
        ylab = "Frequency")
3.3 MOST COMMONLY USED THREE WORD COMBINATIONS
# Bar chart of the most frequent three-word combinations (up to 30).
par(mar=c(8.5,4,2,1))
# head() returns only the rows that exist; indexing with 1:30 pads the table
# with NA rows when there are fewer than 30 distinct trigrams.
top_trigrams <- head(trigram, 30)
barplot(top_trigrams$frequency,
        names.arg = top_trigrams$feature,
        col = "green",
        main = "Most commonly used three word combinations (Top 30)",
        las = 2, # perpendicular labels use the enlarged bottom margin
        ylab = "Frequency")
4. PLANS FOR CREATING A PREDICTION ALGORITHM AND SHINY APP
Based on this analysis, I plan to use the n-gram document-feature matrices
(dfm) to calculate the probability of the next word occurring given the
previous words. For the Shiny app, the plan is to create a simple interface
where the user can enter a string of text. The prediction model will then
return a list of suggested words for the next word.