Milestone1

# Install 'tm' only when it is missing. Reinstalling an already installed
# package on every run is slow and, on Windows, can fail to replace the
# package DLL while it is locked ("cannot remove prior installation").
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}

## package 'tm' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'tm'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\baner\AppData\Local\Programs\R\R-4.3.3\library\00LOCK\tm\libs\x64\tm.dll
## to C:\Users\baner\AppData\Local\Programs\R\R-4.3.3\library\tm\libs\x64\tm.dll:
## Permission denied

## Warning: restored 'tm'

## 
## The downloaded binary packages are in
##  C:\Users\baner\AppData\Local\Temp\RtmpoD46EC\downloaded_packages

# Install 'ggplot2' only when it is missing instead of reinstalling it on
# every run of the report.
# NOTE(review): ggplot2 is not loaded in the visible part of this report --
# confirm it is still needed.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\baner\AppData\Local\Temp\RtmpoD46EC\downloaded_packages

library(tm)

## Loading required package: NLP

require(SnowballC)

## Loading required package: SnowballC

require(wordcloud)

## Loading required package: wordcloud

## Loading required package: RColorBrewer

1. DOWNLOAD DATA AND READ DATA

1.1 Download Data

# Fetch and unpack the SwiftKey corpus once. Later chunks read the text files
# from <data_dir>/Coursera-SwiftKey/final/en_US.
data_dir <- "C:\\Users\\baner\\Downloads"

# Kept because later chunks use paths relative to this directory
# (for example "sampleData.txt").
setwd(data_dir)

# NOTE(review): the original value was a local folder, which download.file()
# cannot fetch. This is the commonly published location of the capstone
# archive -- confirm it is still current.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_path <- file.path(data_dir, "Coursera-Swiftkey.zip")

# Download only when the archive itself is missing. The previous check tested
# the Downloads folder (which always exists), referred to the undefined name
# `Url`, and passed a directory as the destination file.
if (!file.exists(zip_path)) {
  download.file(url, destfile = zip_path, mode = "wb")
}

# Unzip only when the extracted directory is not there yet.
if (!dir.exists("Coursera-SwiftKey")) {
  unzip(zipfile = zip_path, exdir = "Coursera-SwiftKey")
}

1.2 Read Data

# Collect the full paths of every file in the en_US folder and display them.
en_us_dir <- "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US"
file_path <- list.files(path = en_us_dir, full.names = TRUE)
print(file_path)

## [1] "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.blogs.txt"  
## [2] "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.news.txt"   
## [3] "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.twitter.txt"

# Read every line of a text file, marking the strings as UTF-8.
#
# path: location of the file to read.
# Returns a character vector with one element per line.
read_corpus_lines <- function(path) {
  # NOTE(review): the text-mode read of the news file warned about an
  # "incomplete final line", which suggests the read may stop early on
  # Windows. A binary connection reads the bytes as they are -- confirm the
  # line counts afterwards.
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  # skipNul drops the embedded nul bytes that readLines() warned about in the
  # twitter file.
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# The directory name is spelled "Coursera-SwiftKey" everywhere so the paths
# also resolve on case-sensitive file systems.
blogs <- "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.blogs.txt"
en_blogs <- read_corpus_lines(blogs)

news <- "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.news.txt"
en_news <- read_corpus_lines(news)

twitter <- "C:\\Users\\baner\\Downloads\\Coursera-SwiftKey\\final\\en_US/en_US.twitter.txt"
en_twitter <- read_corpus_lines(twitter)

## Warning in readLines(twitter): line 167155 appears to contain an embedded nul

## Warning in readLines(twitter): line 268547 appears to contain an embedded nul

## Warning in readLines(twitter): line 1274086 appears to contain an embedded nul

## Warning in readLines(twitter): line 1759032 appears to contain an embedded nul

2. BASIC REPORT OF THE SUMMARY STATISTICS OF THE DATA

2.1 SAMPLE DATA

The datasets are large and take a long time to process and analyze. For this initial analysis, lines are randomly sampled from each of the Blogs, News and Twitter datasets, with the sampling indices based on the length of each dataset.

set.seed(5454568)

# Share of each source kept for the exploratory analysis. Raise it for better
# coverage, lower it for faster runs.
sample_fraction <- 0.01

# Draw a random subset of lines, without replacement, from one source.
#
# lines:    character vector holding the full text of one source.
# fraction: share of the lines to keep (between 0 and 1).
# Returns the selected lines in random order.
sample_lines <- function(lines, fraction) {
  n_keep <- ceiling(length(lines) * fraction)
  lines[sample.int(length(lines), size = n_keep)]
}

# The sample must be sized from the text vectors (en_*). The previous code
# used length(twitter) etc., i.e. the length of the file-path strings, which
# is 1 and therefore kept only the first line of each file.
sampleTwitter <- sample_lines(en_twitter, sample_fraction)
sampleBlogs <- sample_lines(en_blogs, sample_fraction)
sampleNews <- sample_lines(en_news, sample_fraction)

## Combine data samples
sampleData <- c(sampleTwitter, sampleBlogs, sampleNews)

## Save sample data and remove data not needed to free memory
# useBytes = TRUE writes the strings as they are, so the file can be read
# back as UTF-8 without a re-encoding step in between.
writeLines(sampleData, "sampleData.txt", useBytes = TRUE)
rm(en_twitter, en_news, en_blogs, sampleTwitter, sampleNews, sampleBlogs)

2.2 READ SAMPLE DATA AND CLEAN THE DATA

sampleData <- readLines("sampleData.txt", encoding = "UTF-8")
corpus <- VCorpus(VectorSource(sampleData))

## Replace separators and non-printable characters with spaces, change to
## lowercase, then remove stopwords, punctuation and numbers, stem the words
## and finally collapse the leftover whitespace.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "\"|/|@|\\|")
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
corpus <- tm_map(corpus, content_transformer(tolower))
# Stopwords are removed before punctuation stripping and stemming: afterwards
# words such as "don't" or "this" have become "dont" / "thi" and no longer
# match the stopword list. tm:: is explicit because quanteda, loaded later in
# the report, masks stopwords().
corpus <- tm_map(corpus, removeWords, tm::stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stemDocument)
# Run last so the gaps left by the removals above are collapsed as well.
corpus <- tm_map(corpus, stripWhitespace)
# The former tm_map(corpus, PlainTextDocument) step was dropped: tolower is
# already wrapped in content_transformer(), so the documents keep their class.

2.3 SUMMARY AND OVERVIEW DATA

# Word cloud of the cleaned corpus: words seen at least 5 times, at most 101
# words, placed in random order with half of them rotated.
cloud_palette <- brewer.pal(8, "Set2")
wordcloud(
  corpus,
  min.freq = 5,
  max.words = 101,
  random.order = TRUE,
  rot.per = 0.5,
  colors = cloud_palette,
  use.r.layout = FALSE
)

3. INTERESTING FINDINGS

library(quanteda)

## Package version: 4.0.2
## Unicode version: 15.1
## ICU version: 74.1

## Parallel computing: 8 of 8 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following object is masked from 'package:tm':
## 
##     stopwords

## The following objects are masked from 'package:NLP':
## 
##     meta, meta<-

library(quanteda.textstats)

# Convert your corpus into a quanteda corpus object
quanteda_corpus <- corpus(corpus)

# Tokenize the corpus into words
tokens_unigram <- tokens(quanteda_corpus)
tokens_bigram  <- tokens(quanteda_corpus, ngrams = 2)

## Warning: ngrams argument is not used.

tokens_trigram <- tokens(quanteda_corpus, ngrams = 3)

## Warning: ngrams argument is not used.

# Create a document-feature matrix (dfm) from the tokens
dfm_unigram <- dfm(tokens_unigram)
dfm_bigram  <- dfm(tokens_bigram)
dfm_trigram <- dfm(tokens_trigram)

# Reorder a frequency table so the most frequent features come first.
sort_by_frequency <- function(freq_table) {
  freq_table[order(freq_table$frequency, decreasing = TRUE), ]
}

# Frequency table and its sorted copy for each n-gram size
unigram_freq <- textstat_frequency(dfm_unigram)
unigram <- sort_by_frequency(unigram_freq)

bigram_freq <- textstat_frequency(dfm_bigram)
bigram <- sort_by_frequency(bigram_freq)

trigram_freq <- textstat_frequency(dfm_trigram)
trigram <- sort_by_frequency(trigram_freq)

# Print results
# Top 10 unigrams
unigram_top10 <- head(unigram, n = 10)
print(unigram_top10)

##    feature frequency rank docfreq group
## 1      way         2    1       1   all
## 2      btw         1    2       1   all
## 3    thank         1    2       1   all
## 4       rt         1    2       1   all
## 5    gonna         1    2       1   all
## 6       dc         1    2       1   all
## 7   anytim         1    2       1   all
## 8     soon         1    2       1   all
## 9     love         1    2       1   all
## 10     see         1    2       1   all

# Top 10 bigrams
bigram_top10 <- head(bigram, n = 10)
print(bigram_top10)

##    feature frequency rank docfreq group
## 1      way         2    1       1   all
## 2      btw         1    2       1   all
## 3    thank         1    2       1   all
## 4       rt         1    2       1   all
## 5    gonna         1    2       1   all
## 6       dc         1    2       1   all
## 7   anytim         1    2       1   all
## 8     soon         1    2       1   all
## 9     love         1    2       1   all
## 10     see         1    2       1   all

# Top 10 trigrams
trigram_top10 <- head(trigram, n = 10)
print(trigram_top10)

##    feature frequency rank docfreq group
## 1      way         2    1       1   all
## 2      btw         1    2       1   all
## 3    thank         1    2       1   all
## 4       rt         1    2       1   all
## 5    gonna         1    2       1   all
## 6       dc         1    2       1   all
## 7   anytim         1    2       1   all
## 8     soon         1    2       1   all
## 9     love         1    2       1   all
## 10     see         1    2       1   all

3.1 MOST COMMONLY USED SINGLE WORDS

# Bar chart of the most frequent single words (at most 30).
par(mfrow = c(1, 1))
par(mar = c(5, 4, 2, 0))
# head() keeps only the rows that exist; the fixed 1:30 index produced NA
# rows whenever fewer than 30 features were available. Columns are addressed
# by name rather than by position.
unigram_top30 <- head(unigram, 30)
barplot(unigram_top30$frequency,
        names.arg = unigram_top30$feature,
        col = "red",
        main = "Most commonly used Words (Top 30)",
        las = 2, # labels perpendicular to the axis so 30 of them do not overlap
        ylab = "Frequency")

3.2 MOST COMMONLY USED TWO WORD COMBINATIONS

# Bar chart of the most frequent two-word combinations (at most 30).
# The large bottom margin leaves room for the vertical phrase labels.
par(mar = c(8.5, 4, 2, 1))
# head() keeps only the rows that exist; the fixed 1:30 index produced NA
# rows whenever fewer than 30 features were available. Columns are addressed
# by name rather than by position.
bigram_top30 <- head(bigram, 30)
barplot(bigram_top30$frequency,
        names.arg = bigram_top30$feature,
        col = "blue",
        main = "Most commonly used two word combinations (Top 30)",
        las = 2, # labels perpendicular to the axis so 30 of them do not overlap
        ylab = "Frequency")

3.3 MOST COMMONLY USED THREE WORD COMBINATIONS

# Bar chart of the most frequent three-word combinations (at most 30).
# The large bottom margin leaves room for the vertical phrase labels.
par(mar = c(8.5, 4, 2, 1))
# head() keeps only the rows that exist; the fixed 1:30 index produced NA
# rows whenever fewer than 30 features were available. Columns are addressed
# by name rather than by position.
trigram_top30 <- head(trigram, 30)
barplot(trigram_top30$frequency,
        names.arg = trigram_top30$feature,
        col = "green",
        main = "Most commonly used three word combinations (Top 30)",
        las = 2, # labels perpendicular to the axis so 30 of them do not overlap
        ylab = "Frequency")

4. PLANS FOR CREATING A PREDICTION ALGORITHM AND SHINY APP

Based on this analysis, I plan to use the n-gram document-feature matrices (dfm) to calculate the probability of the next word occurring given the previous words. For the Shiny app, the plan is to create an app with a simple interface where the user can enter a string of text. The prediction model will then return a list of suggested candidates for the next word.

Milestone1

Reethika Banerjee

2024-08-05