Capstone Project - Milestone Report

Loading the necessary libraries

library(tidyverse)
library(tidytext)
library(pryr)
library(knitr)
library(stringi)
library(dplyr)
library(tm)
library(ggplot2)

0. Summary

This report explores text data from blogs, news articles, and Twitter posts to understand common language patterns. The goal is to prepare for building a text prediction application that can suggest the next word while a user is typing. The analysis compares the three text sources using basic summaries such as file size, line counts, word counts, and line length distributions. It also reviews the most frequent words and common word sequences using n-gram analysis. The main findings are that Twitter text is shorter, while blog and news text contains longer sentences. These differences are important because the final prediction model should work across different writing styles. The final application will use common word patterns to predict the next word and will be deployed as a Shiny web application.

1. Exploratory Analysis

1.1 Reading in the data and compiling basic statistics.

We ensured that the subfolder ‘en_US’ is located in our working directory.

# Load the three raw English-language text files. Each file is read line by
# line and stored as a one-column tibble (`text`, one row per line).
# The paths are relative to the working directory.
blog <- tibble(text = read_lines("en_US/en_US.blogs.txt"))
news <- tibble(text = read_lines("en_US/en_US.news.txt"))
twitter <- tibble(text = read_lines("en_US/en_US.twitter.txt"))

# Summary table with one row per source file.
# Line counts are taken from the loaded tibbles. The word and character totals
# are hard-coded: they were extracted with PowerShell because computing them
# in R took too long.
# NOTE(review): the abandoned in-R attempt passed the whole tibbles to
# stri_count_words() / nchar(); applying them to the `text` column instead
# (e.g. sum(stri_count_words(blog$text))) is probably what was intended --
# confirm before replacing the constants below.
summary_statistics <- data.frame(
  File_Name = c("Blog", "News", "Twitter"),
  Number_of_Lines = c(nrow(blog), nrow(news), nrow(twitter)),
  words_count = c(37334441, 34372598, 30373832),
  Characters_count = c(208361438, 203791405, 162385042)
)

The basic statistics from the files are:

# Render the per-file summary as a formatted table in the report.
knitr::kable(summary_statistics)

File_Name	Number_of_Lines	words_count	Characters_count
Blog	899288	37334441	208361438
News	1010242	34372598	203791405
Twitter	2360148	30373832	162385042

1.2 Sampling and cleaning the Data

Our strategy is to sample 10% of each of the three data sets and combine the samples into a single data set called corpus, which we then clean. Most of the cleaning is done at the tokenization level.

# Fix the RNG seed so the random samples are reproducible between runs.
set.seed(2026)

# Draw a random fraction of the rows of one source and tag every sampled row
# with where it came from.
#   data         tibble with one row per line of text
#   source_label value stored in the new `source` column
#   fraction     share of rows to keep (default: 10%)
# Returns the sampled rows with two extra columns: `source` and `count`.
# `count` is 1 for every row so later steps can sum it to obtain frequencies.
sample_lines <- function(data, source_label, fraction = 0.10) {
  data %>%
    slice_sample(prop = fraction) %>%
    mutate(source = source_label, count = 1)
}

# The draws are made in the order news, blog, twitter; changing the order
# would change which lines are selected for a given seed.
sample_news <- sample_lines(news, "news")
sample_blog <- sample_lines(blog, "blog")
sample_twitter <- sample_lines(twitter, "twitter")

# Stack the three samples into the working corpus.
corpus <- bind_rows(sample_blog, sample_news, sample_twitter)

# `sample` is kept as an alias for any code that still refers to the combined
# data by that name.
sample <- corpus

2. Unigram-Word Analysis.

We use unigram tokenization to explore word frequency across the corpus and each of the three samples.

# Split every sampled line into single-word tokens (one row per word). The
# `source` and `count` columns of the corpus are carried along with each token.
unigrams <- corpus %>%
  unnest_tokens(word, text, token = "ngrams", n = 1)

# Total frequency of each word in `tokens`, most frequent first.
# Frequencies are the sum of the per-row `count` weights.
tally_words <- function(tokens) {
  tokens %>%
    count(word, wt = count, name = "total", sort = TRUE)
}

# Same tally restricted to the tokens of a single source; the source label is
# attached again as the last column of the result.
tally_words_for <- function(tokens, source_name) {
  tokens %>%
    filter(source == source_name) %>%
    tally_words() %>%
    mutate(source = source_name)
}

word_freq <- tally_words(unigrams)
word_blog_freq <- tally_words_for(unigrams, "blog")
word_news_freq <- tally_words_for(unigrams, "news")
word_twitter_freq <- tally_words_for(unigrams, "twitter")

# head() returns at most 10 rows; indexing with [1:10, ] would pad the result
# with NA rows whenever fewer than 10 distinct words are available.
top10 <- data.frame(head(word_freq, 10))

top10_per_source <- bind_rows(
  head(word_blog_freq, 10),
  head(word_news_freq, 10),
  head(word_twitter_freq, 10)
)

Below is the bar plot of the top 10 most frequent words in the sample.

# Horizontal bar chart of the ten most frequent words in the combined sample.
# `names.arg` is spelled out in full rather than abbreviated to `names`.
barplot(
  height = top10$total,
  names.arg = top10$word,
  horiz = TRUE,
  las = 2,
  col = "#69b3a2",
  main = "Top 10 most frequent words",
  xlab = " frequency",
  ylab = "word"
)

The panel below displays the 10 most frequent words from the sample of each file.

# Draw the ten most frequent words of each source in three side-by-side
# panels. par() returns the previous settings, which are restored afterwards
# so that later plots are not drawn in the 1 x 3 layout.
old_par <- par(mfrow = c(1, 3), mar = c(11, 5, 3, 2), cex = 0.8)

# One entry per panel: the frequency table to plot, the bar colour, the title.
word_panels <- list(
  list(freq = word_blog_freq, fill = "#f0fff0", title = "Blog:Top 10 words "),
  list(freq = word_news_freq, fill = "#f0e68e", title = "News:Top 10 words "),
  list(
    freq = word_twitter_freq,
    fill = "#5f9ea0",
    title = "Twitter:Top 10 words "
  )
)

for (panel in word_panels) {
  # head() yields at most 10 rows and never pads with NA rows.
  top_words <- head(panel$freq, 10)
  barplot(
    height = top_words$total,
    names.arg = top_words$word,
    col = panel$fill,
    las = 2,
    horiz = TRUE,
    main = panel$title,
    xlab = " frequency",
    ylab = "word"
  )
}

par(old_par)

Bigram Analysis

# Tokenise the combined sample into bigrams (pairs of consecutive words) and
# split each bigram into its first and second word.
bigrams0 <- corpus %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # Lines with fewer than two words produce an NA bigram; drop them so they
  # are not counted as a spurious "NA NA" pair in the frequency tables.
  filter(!is.na(bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ")

We display below the 10 most frequent bigrams from the combined sample.

# Frequency of every (word1, word2) pair in the combined sample, most frequent
# first. count() returns an ungrouped table, so no grouping is left behind for
# the steps that follow.
bigram_freq <- bigrams0 %>%
  count(word1, word2, wt = count, name = "total", sort = TRUE)

# Keep the ten most frequent pairs (at most 10 rows, never padded with NA) and
# rebuild the two-word label used on the axis.
top_bigrams <- bigram_freq %>%
  head(10) %>%
  mutate(bigram = paste(word1, word2))

barplot(
  height = top_bigrams$total,
  names.arg = top_bigrams$bigram,
  col = "#0000ff",
  las = 2,
  horiz = TRUE,
  main = "Top 10 most frequent bigrams",
  xlab = " frequency",
  ylab = "bigrams"
)

We now display the 10 most frequent bigrams in the sample from each file.

# Draw the ten most frequent bigrams of each source in three side-by-side
# panels. par() returns the previous settings, which are restored afterwards
# so that later plots are not drawn in the 1 x 3 layout.
old_par <- par(mfrow = c(1, 3), mar = c(11, 5, 3, 2), cex = 0.8)

# One entry per panel: the source to filter on, the bar colour, the title.
bigram_panels <- list(
  list(
    source = "blog",
    fill = "#dda0dd",
    title = " Blog:Top 10 most bigrams"
  ),
  list(
    source = "news",
    fill = "#ffdab9",
    title = " News:Top 10 most bigrams"
  ),
  list(
    source = "twitter",
    fill = "#ffa07a",
    title = " Twitter:Top 10 most bigrams"
  )
)

for (panel in bigram_panels) {
  wanted_source <- panel$source

  # Frequency of each word pair within this source, most frequent first.
  # count() returns an ungrouped table; head() keeps at most 10 rows and
  # never pads with NA rows.
  top_bigrams <- bigrams0 %>%
    filter(source == wanted_source) %>%
    count(word1, word2, wt = count, name = "total", sort = TRUE) %>%
    head(10) %>%
    mutate(bigram = paste(word1, word2))

  barplot(
    height = top_bigrams$total,
    names.arg = top_bigrams$bigram,
    col = panel$fill,
    las = 2,
    horiz = TRUE,
    main = panel$title,
    xlab = " frequency"
  )
}

par(old_par)

Strategy for Prediction.

We will first break up the input text into tokens and, by default, use a trigram model to predict the next word, falling back to a bigram model if the input text contains only one word.