The objective of this project is to develop a predictive text application capable of suggesting the next word a user is likely to type. The application will be built using a statistical language model and deployed through a Shiny web application.
This report summarizes the exploratory analysis performed on the three English-language datasets provided for the project.
The purpose of this analysis is to understand the size, structure and characteristics of the data and to identify an appropriate strategy for building an efficient next-word prediction model.
The datasets were successfully downloaded, extracted and loaded into R. Each file contains English text collected from a different source:
Table 1 provides basic descriptive statistics for the three datasets.
# Load required libraries
library(stringi)
library(ggplot2)

# Folder containing the three en_US text files. Full paths are built with
# file.path() instead of calling setwd(), so the working directory of the
# R session is left untouched.
data_dir <- "C:/Users/DIVYA/Documents/Project track 2/Data Science/Coursera-SwiftKey/final/en_US"
blogs_path <- file.path(data_dir, "en_US.blogs.txt")
news_path <- file.path(data_dir, "en_US.news.txt")
twitter_path <- file.path(data_dir, "en_US.twitter.txt")

# Stop early with a readable message if any input file is missing
data_paths <- c(blogs_path, news_path, twitter_path)
missing_paths <- data_paths[!file.exists(data_paths)]
if (length(missing_paths) > 0) {
  stop("Data file(s) not found: ",
       paste(missing_paths, collapse = ", "),
       call. = FALSE)
}

# Read data files, one element per line of text.
# skipNul = TRUE makes readLines() skip embedded NUL characters.
blogs <- readLines(blogs_path, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_path, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter_path, encoding = "UTF-8", skipNul = TRUE)

# File sizes (bytes on disk converted to MB)
blogs_size <- file.size(blogs_path) / 1024^2
news_size <- file.size(news_path) / 1024^2
twitter_size <- file.size(twitter_path) / 1024^2

# Line counts
blogs_lines <- length(blogs)
news_lines <- length(news)
twitter_lines <- length(twitter)

# Word counts (total over all lines)
blogs_words <- sum(stri_count_words(blogs))
news_words <- sum(stri_count_words(news))
twitter_words <- sum(stri_count_words(twitter))

# Average words per line
blogs_avg <- blogs_words / blogs_lines
news_avg <- news_words / news_lines
twitter_avg <- twitter_words / twitter_lines

# Summary table: one row per dataset
summary_table <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  FileSize_MB = round(c(blogs_size, news_size, twitter_size), 2),
  LineCount = c(blogs_lines, news_lines, twitter_lines),
  WordCount = c(blogs_words, news_words, twitter_words),
  AvgWordsPerLine = round(c(blogs_avg, news_avg, twitter_avg), 2)
)
print(summary_table)
## Dataset FileSize_MB LineCount WordCount AvgWordsPerLine
## 1 Blogs 200.42 899288 37546806 41.75
## 2 News 196.28 1010206 34761151 34.41
## 3 Twitter 159.36 2360148 30096690 12.75
Distribution of words per line
A histogram of words per line was created for each dataset.
Key Findings
# Histograms of words per line, one panel per dataset.
# The per-line word counts are what must be plotted: blogs_lines, news_lines
# and twitter_lines each hold a single number (the total line count), so a
# histogram of them cannot show a distribution.
blogs_words_per_line <- stri_count_words(blogs)
news_words_per_line <- stri_count_words(news)
twitter_words_per_line <- stri_count_words(twitter)

# par() returns the previous settings, which are restored after plotting
old_par <- par(mfrow = c(1, 3))
hist(blogs_words_per_line,
     breaks = 50,
     col = "lightblue",
     main = "Blogs",
     xlab = "Words per Line")
hist(news_words_per_line,
     breaks = 50,
     col = "lightgreen",
     main = "News",
     xlab = "Words per Line")
hist(twitter_words_per_line,
     breaks = 50,
     col = "lightpink",
     main = "Twitter",
     xlab = "Words per Line")
par(old_par)
To support next-word prediction, sequences of words were analyzed.
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(NLP)
library(stringi)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
id="sampling"
# Draw a reproducible 1% random sample of lines from each source
set.seed(12345)
blogs_sample <- blogs[sample.int(length(blogs), length(blogs) * 0.01)]
news_sample <- news[sample.int(length(news), length(news) * 0.01)]
twitter_sample <- twitter[sample.int(length(twitter), length(twitter) * 0.01)]

# Pool the three samples into a single character vector
sample_data <- c(blogs_sample, news_sample, twitter_sample)
id="cleaning"
# Build a tm corpus from the sampled lines and normalise the text.
corpus <- VCorpus(VectorSource(sample_data))
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words BEFORE stripping punctuation: the English stop-word list
# contains contractions written with an apostrophe (e.g. "don't", "i'm"),
# which no longer match once the apostrophes have been removed.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# Collapse whitespace last so the gaps left by the removals above are cleaned up
corpus <- tm_map(corpus, stripWhitespace)
id="complete_unigram_analysis"
library(stringi)
library(ggplot2)

# Take a small (1%) sample of each source for faster processing
set.seed(123)
blogs_sample <- sample(blogs, length(blogs) * 0.01)
news_sample <- sample(news, length(news) * 0.01)
twitter_sample <- sample(twitter, length(twitter) * 0.01)

# Combine samples
text_sample <- c(
  blogs_sample,
  news_sample,
  twitter_sample
)

# Merge all text into one string
all_text <- paste(text_sample, collapse = " ")

# Convert to lowercase
all_text <- tolower(all_text)

# Replace everything except a-z and spaces (punctuation, digits, special
# characters) with a space.
# NOTE(review): apostrophes are replaced as well, so contractions are split
# into two tokens ("don't" -> "don" "t"). Because the text was merged into a
# single string, adjacent words from different source lines also become
# neighbours in `words`.
all_text <- gsub("[^a-z ]", " ", all_text)

# Split into words on runs of whitespace
words <- unlist(strsplit(all_text, "\\s+"))

# Remove blanks
words <- words[nzchar(words)]

# Count word frequencies, most frequent first
word_freq <- sort(table(words),
                  decreasing = TRUE)

# Create data frame of the top words. The index is capped at the number of
# distinct words so that a small vocabulary does not yield NA rows.
top_idx <- seq_len(min(10, length(word_freq)))
top_unigrams <- data.frame(
  Word = as.character(names(word_freq)[top_idx]),
  Frequency = as.numeric(word_freq[top_idx])
)

# Display results
print(top_unigrams)
## Word Frequency
## 1 the 47869
## 2 to 27734
## 3 and 24326
## 4 a 24045
## 5 i 20348
## 6 of 19882
## 7 in 16709
## 8 it 11483
## 9 that 11322
## 10 for 11165
The frequencies of the 10 most common words (unigrams) are shown in a bar plot.
id="plot_unigrams"
# Horizontal bar chart of the most frequent words.
# The words are mapped to x and the counts to y, and coord_flip() then turns
# the bars horizontal. Mapping the words to y *and* flipping is a double flip
# that depends on ggplot2's automatic orientation detection.
ggplot(top_unigrams,
       aes(x = reorder(Word, Frequency), y = Frequency)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Words",
       x = "Words",
       y = "Frequency") +
  theme_minimal()
The distribution of unigram frequencies is shown in a histogram.
id="unigram_histogram"
# One row per distinct word, holding how often that word occurs
unigram_df <- data.frame(Frequency = as.numeric(word_freq))

# Histogram of how many distinct words fall into each frequency bin
ggplot(data = unigram_df, mapping = aes(x = Frequency)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black") +
  ggtitle("Distribution of Unigram Frequencies") +
  xlab("Frequency") +
  ylab("Number of Unique Words") +
  theme_minimal()
id="base_bigrams"
# Create bigrams: pair every word with the word that follows it
bigrams <- paste(
  head(words, -1),
  tail(words, -1)
)

# Count bigram frequencies, most frequent first
bigram_freq <- sort(table(bigrams),
                    decreasing = TRUE)

# Top bigrams. The index is capped at the number of distinct bigrams so that
# a small input does not yield NA rows.
top_idx <- seq_len(min(10, length(bigram_freq)))
top_bigrams <- data.frame(
  Bigram = as.character(names(bigram_freq)[top_idx]),
  Frequency = as.numeric(bigram_freq[top_idx])
)
top_bigrams
## Bigram Frequency
## 1 in the 4241
## 2 of the 4217
## 3 it s 2311
## 4 to the 2160
## 5 i m 2093
## 6 for the 2083
## 7 on the 1972
## 8 to be 1603
## 9 don t 1573
## 10 at the 1437
The frequencies of the 10 most common word pairs (bigrams) are shown in a bar plot.
id="bigram_barplot"
library(ggplot2)

# Horizontal bar chart of the most frequent bigrams.
# The bigrams are mapped to x and the counts to y, and coord_flip() then
# turns the bars horizontal. Mapping the bigrams to y *and* flipping is a
# double flip that depends on ggplot2's automatic orientation detection.
ggplot(top_bigrams,
       aes(x = reorder(Bigram, Frequency),
           y = Frequency)) +
  geom_bar(stat = "identity",
           fill = "lightpink") +
  coord_flip() +
  labs(title = "Top 10 Most Frequent Bigrams",
       x = "Bigram",
       y = "Frequency") +
  theme_minimal()
The distribution of bigram frequencies is shown in a histogram.
id="bigram_histogram"
# One row per distinct bigram, holding how often that bigram occurs
bigram_df <- data.frame(Frequency = as.numeric(bigram_freq))

# Histogram of how many distinct bigrams fall into each frequency bin
ggplot(data = bigram_df, mapping = aes(x = Frequency)) +
  geom_histogram(bins = 50, fill = "lightpink", color = "black") +
  ggtitle("Distribution of Bigram Frequencies") +
  xlab("Frequency") +
  ylab("Count of Bigrams") +
  theme_minimal()
id="base_trigrams"
# Create trigrams: every word together with the two words that follow it.
# head()/tail() with negative n drop elements from the end/start and return
# empty vectors for short inputs, so fewer than three words gives no
# trigrams rather than a spurious "NA" entry.
trigrams <- paste(
  head(words, -2),
  head(tail(words, -1), -1),
  tail(words, -2)
)

# Count trigram frequencies, most frequent first
trigram_freq <- sort(table(trigrams),
                     decreasing = TRUE)

# Top trigrams. The index is capped at the number of distinct trigrams so
# that a small input does not yield NA rows.
top_idx <- seq_len(min(10, length(trigram_freq)))
top_trigrams <- data.frame(
  Trigram = as.character(names(trigram_freq)[top_idx]),
  Frequency = as.numeric(trigram_freq[top_idx])
)
top_trigrams
## Trigram Frequency
## 1 i don t 525
## 2 one of the 335
## 3 it s a 292
## 4 i can t 277
## 5 a lot of 268
## 6 thanks for the 235
## 7 i m not 232
## 8 you don t 189
## 9 i didn t 187
## 10 don t know 181
The frequencies of the 10 most common three-word sequences (trigrams) are shown in a bar plot.
id="trigram_barplot"
# Horizontal bar chart of the most frequent trigrams, ordered by count
ggplot(data = top_trigrams,
       mapping = aes(x = reorder(Trigram, Frequency), y = Frequency)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  ggtitle("Top 10 Most Frequent Trigrams") +
  xlab("Trigram") +
  ylab("Frequency") +
  theme_minimal()
The distribution of trigram frequencies is shown in a histogram.
id="trigram_histogram"
# One row per distinct trigram, holding how often that trigram occurs
trigram_df <- data.frame(Frequency = as.numeric(trigram_freq))

# Histogram of how many distinct trigrams fall into each frequency bin
ggplot(data = trigram_df, mapping = aes(x = Frequency)) +
  geom_histogram(bins = 50, fill = "darkgreen", color = "black") +
  ggtitle("Distribution of Trigram Frequencies") +
  xlab("Frequency") +
  ylab("Count of Trigrams") +
  theme_minimal()
Several observations emerged from the analysis:
The final prediction model will be based on n-gram language modeling.
The process will be as follows:
The final Shiny application will provide:
The interface will be designed to be simple, intuitive and suitable for everyday use.