Load Required Libraries
# Attach a package by name, installing it first when it cannot be loaded.
#
# pkg: name of the package, given as a character string.
#
# Returns NULL invisibly when the package could be loaded straight away;
# otherwise returns whatever library() returns after the installation.
install_if_missing <- function(pkg) {
  is_available <- require(pkg, character.only = TRUE)
  if (is_available) {
    # Nothing to install: the require() call above has attached the package.
    return(invisible(NULL))
  }
  install.packages(pkg, dependencies = TRUE)
  library(pkg, character.only = TRUE)
}
# Packages this report attaches before running the analysis
packages <- c("tm", "SnowballC", "wordcloud", "RColorBrewer", "RWeka")
# Install (when missing) and attach every package. Only the side effect
# matters here, so the list of return values is wrapped in invisible() to
# keep it from being echoed as a list of NULLs.
invisible(lapply(packages, install_if_missing))
## Loading required package: tm
## Loading required package: NLP
## Loading required package: SnowballC
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.4.1
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
# Load necessary packages
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(RWeka)
1. Download and Read Data
1.1 Download Data
# Folder that holds the downloaded archive and the extracted data
capstone_dir <- "~/capstone_1/"
if (!dir.exists(capstone_dir)) {
  dir.create(capstone_dir)
}
setwd(capstone_dir) # Later steps use paths relative to this folder

# Define file URL and the local name of the downloaded archive
file_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera.zip"

# Download the archive only when there is no local copy yet. The default
# download method is used so no external curl binary is required, and
# mode = "wb" writes the zip as binary so it is not corrupted on platforms
# that translate line endings.
if (!file.exists(zip_file)) {
  download.file(file_url, destfile = zip_file, mode = "wb")
}

# Extraction is checked separately from the download: if the archive is
# present but the extracted folder is missing (for example after an
# interrupted run), the data is unpacked again instead of being skipped.
if (!dir.exists(file.path("final", "en_US"))) {
  unzip(zip_file)
}
1.2 Read Data
# Load the three en_US source files as character vectors, one element per
# line, declaring the text as UTF-8.

# Blogs and Twitter are read directly from their paths; embedded nul bytes
# are skipped.
blogs <- readLines(
  con = "final/en_US/en_US.blogs.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
twitter <- readLines(
  con = "final/en_US/en_US.twitter.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)

# The news file goes through a connection opened in binary mode ("rb").
# NOTE(review): presumably this avoids text-mode handling of special
# characters inside that file - confirm.
news_con <- file(description = "final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con = news_con, encoding = "UTF-8")
close(con = news_con)
2. Summary Statistics and Data Sampling
2.1 Sample Data
# Set seed for reproducibility
set.seed(5454568)

# Number of lines drawn from each data set
sample_size <- 1000

# Draw up to `n` lines at random, without replacement.
#
# lines: character vector to sample from.
# n:     number of lines wanted.
#
# sample.int() replaces sample(1:length(lines), n): it draws the same
# indices for inputs with more than one line, but does not misbehave on an
# empty input (where 1:length(lines) would be c(1, 0)). The size is capped
# at the number of available lines so a short input is returned in full
# (shuffled) instead of raising an error.
sample_lines <- function(lines, n) {
  lines[sample.int(length(lines), min(n, length(lines)))]
}

# Randomly sample lines from each dataset
sampleTwitter <- sample_lines(twitter, sample_size)
sampleBlogs <- sample_lines(blogs, sample_size)
sampleNews <- sample_lines(news, sample_size)

# Combine the sampled data
sampleData <- c(sampleTwitter, sampleBlogs, sampleNews)

# Save sampled data and free up memory held by the full data sets
writeLines(sampleData, "sampleData.txt")
rm(twitter, news, blogs, sampleTwitter, sampleNews, sampleBlogs)
2.2 Read and Clean Sample Data
# Read the sampled data back from the file written in the sampling step
sampleData <- readLines("sampleData.txt", encoding = "UTF-8")

# Create a text corpus with one document per sampled line
corpus <- VCorpus(VectorSource(sampleData))

# Custom content transformer: replace every match of `pattern` with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Apply cleaning transformations to the corpus.
# 1. Turn quotes, slashes, at-signs, pipes and non-printable characters
#    into spaces, then lower-case the text.
corpus <- tm_map(corpus, toSpace, "\"|/|@|\\|")
corpus <- tm_map(corpus, toSpace, "[^[:graph:]]")
corpus <- tm_map(corpus, content_transformer(tolower))
# 2. Remove stop words BEFORE punctuation is stripped and words are stemmed.
#    The stop word list contains unstemmed words, some with apostrophes, so
#    running this step last would miss contractions whose apostrophe was
#    already removed and words whose spelling was altered by the stemmer.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# 3. Drop punctuation and digits, then reduce the remaining words to stems.
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stemDocument)
# NOTE(review): documents built from a VectorSource are presumably already
# plain text documents, which would make this conversion redundant -
# confirm before removing it.
corpus <- tm_map(corpus, PlainTextDocument)
# 4. Collapse whitespace last so the gaps left by the removed words,
#    punctuation and digits do not survive into the tokenizer input.
corpus <- tm_map(corpus, stripWhitespace)
3. Interesting Findings
# Convert corpus to a data frame with one cleaned text string per document.
# vapply() guarantees a plain character vector regardless of corpus size.
corpus_text <- vapply(
  corpus,
  function(doc) paste(content(doc), collapse = " "),
  character(1),
  USE.NAMES = FALSE
)
corpus.dataframe <- data.frame(text = corpus_text, stringsAsFactors = FALSE)

# Count every n-gram of exactly `n` words.
#
# text: character vector of documents.
# n:    number of words per n-gram.
#
# Returns a data frame with the n-gram in the first column and its count in
# the column `Freq`.
ngram_frequencies <- function(text, n) {
  data.frame(table(NGramTokenizer(text, Weka_control(min = n, max = n))))
}

# Generate unigrams, bigrams, and trigrams.
# The text column (a character vector) is passed to the tokenizer, not the
# data frame itself: coercing a whole data frame to character yields a single
# deparsed string, which adds a spurious "c" token and lets n-grams run
# across document boundaries.
uniGramToken <- ngram_frequencies(corpus.dataframe$text, 1)
biGramToken <- ngram_frequencies(corpus.dataframe$text, 2)
triGramToken <- ngram_frequencies(corpus.dataframe$text, 3)

# Order tokens by frequency, most frequent first
unigram <- uniGramToken[order(uniGramToken$Freq, decreasing = TRUE), ]
bigram <- biGramToken[order(biGramToken$Freq, decreasing = TRUE), ]
trigram <- triGramToken[order(triGramToken$Freq, decreasing = TRUE), ]
# Draw a bar chart of the most frequent n-grams.
#
# freq:    data frame sorted by decreasing frequency, with the n-gram in the
#          first column and its count in the second.
# bar_col: fill colour of the bars.
# title:   main title of the chart.
# margins: plot margins passed to par(mar = ), as c(bottom, left, top, right).
# top_n:   maximum number of n-grams to show.
#
# head() is used instead of freq[1:top_n, ] so that a table with fewer than
# `top_n` rows does not produce NA bars. The previous margins are restored
# on exit so later plots are not affected.
plot_top_ngrams <- function(freq, bar_col, title, margins, top_n = 30) {
  top <- head(freq, top_n)
  old_par <- par(mar = margins)
  on.exit(par(old_par), add = TRUE)
  invisible(barplot(top[[2]],
    names.arg = as.character(top[[1]]),
    col = bar_col,
    main = title,
    las = 2, # Labels perpendicular to the axis so long n-grams stay readable
    ylab = "Frequency"
  ))
}

# One chart per figure
par(mfrow = c(1, 1))

# Plot most commonly used words (unigrams)
plot_top_ngrams(
  unigram, "red",
  "Most commonly used Words (Top 30)",
  c(5, 4, 2, 0)
)

# Plot most commonly used two-word combinations (bigrams); the larger bottom
# margin leaves room for the longer labels
plot_top_ngrams(
  bigram, "blue",
  "Most commonly used two word combinations (Top 30)",
  c(8.5, 4, 2, 1)
)

# Plot most commonly used three-word combinations (trigrams)
plot_top_ngrams(
  trigram, "green",
  "Most commonly used three word combinations (Top 30)",
  c(8.5, 4, 2, 1)
)

4. Next Steps
- Prediction Algorithm: Plan to develop a predictive text model based on n-gram frequency analysis.
- Shiny App: The app will provide an interactive interface for text prediction and visualization.
- Feedback: Seek input on optimizing the cleaning process and enhancing predictive accuracy.