install.packages(c("stringi", "ggplot2", "tm", "wordcloud", "dplyr", "tidytext", "knitr", "rmarkdown"))

---
title: "Data Science Capstone Milestone Report"
author: "Your Name"
date: "2026-05-16"
output: html_document
---

Introduction

The goal of this project is to analyze text datasets from blogs, news, and Twitter in order to understand word patterns and eventually build a predictive text application.

Load Libraries

library(stringi)
library(ggplot2)
library(tm)
library(wordcloud)
library(dplyr)

Load Data

# Point the working directory at the folder that holds the en_US text files.
# The path is written with plain double quotes and forward slashes:
# typographic quotes are not valid R string delimiters, and unescaped
# backslashes such as "\U" or "\G" are rejected by the R parser.
# Forward slashes are accepted as path separators on Windows.
setwd("C:/Users/Garima/Downloads/Coursera-SwiftKey/final/en_US")

# Read each source file into a character vector with one element per line.
# encoding = "UTF-8" marks the resulting strings as UTF-8, and
# skipNul = TRUE skips embedded NUL bytes instead of warning about them.
blogs <- readLines("en_US.blogs.txt",
                   encoding = "UTF-8",
                   skipNul = TRUE)

news_data <- readLines("en_US.news.txt",
                       encoding = "UTF-8",
                       skipNul = TRUE)

twitter <- readLines("en_US.twitter.txt",
                     encoding = "UTF-8",
                     skipNul = TRUE)

Summary Statistics

# Paths of the three source files, relative to the working directory. These
# are the same names that are passed to readLines() when the data is loaded,
# so the sizes reported below refer to the files that were actually read.
source_files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")

# file.size() returns NA for a path that does not exist; say so explicitly
# instead of letting NA sizes slip silently into the summary table.
missing_files <- source_files[!file.exists(source_files)]
if (length(missing_files) > 0) {
  warning("Source file(s) not found, size will be NA: ",
          paste(missing_files, collapse = ", "),
          call. = FALSE)
}

# One row per source: size on disk, number of lines read, and total words.
summary_data <- data.frame(
  File = c("Blogs", "News", "Twitter"),

  # file.size() reports bytes; divide by 1024^2 to express it in megabytes.
  File_Size_MB = file.size(source_files) / 1024^2,

  # Each element of the character vectors is one line of the file.
  Line_Count = c(
    length(blogs),
    length(news_data),
    length(twitter)
  ),

  # stri_count_words() counts words per line; sum() totals them per file.
  Word_Count = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news_data)),
    sum(stri_count_words(twitter))
  )
)

summary_data
##      File File_Size_MB Line_Count Word_Count
## 1   Blogs           NA     899288   37546806
## 2    News           NA    1010206   34761151
## 3 Twitter           NA    2360148   30096690

Sampling Data

# Fix the random seed so the same lines are drawn on every run.
set.seed(123)

# Draw 1000 lines from each source. The draws are made in the order
# blogs, news, Twitter so the random number stream is consumed identically.
blog_sample <- sample(blogs, 1000)
news_sample <- sample(news_data, 1000)
twitter_sample <- sample(twitter, 1000)

# Combine the three draws into a single character vector of 3000 lines.
sample_data <- c(blog_sample, news_sample, twitter_sample)

Data Cleaning

# Build a tm corpus from the sampled lines; VectorSource() treats each
# element of the character vector as one document.
corpus <- Corpus(VectorSource(sample_data))

# Clean the text one transformation at a time. The "##" lines after each
# step are the warnings that were printed when that step ran.
# NOTE(review): "transformation drops documents" is raised by
# tm_map.SimpleCorpus on every step; it presumably does not mean documents
# were actually removed -- confirm by checking length(corpus) before and
# after the transformations.

# Convert all text to lower case. content_transformer() wraps the base
# function so that tm_map() can apply it to the corpus content.
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
# Strip punctuation characters.
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
# Strip digits.
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
# Collapse runs of whitespace left behind by the removals above.
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents

Word Frequency

# Term-document matrix for the cleaned corpus: one row per term,
# one column per document.
tdm <- TermDocumentMatrix(corpus)

# Expand to an ordinary matrix so base row operations can be used.
matrix_data <- as.matrix(tdm)

# Total occurrences of each term across all documents, most frequent first.
term_totals <- rowSums(matrix_data)
word_freq <- sort(term_totals, decreasing = TRUE)

# Tabulate the terms with their counts; the named vector also supplies
# the row names of the data frame.
freq_df <- data.frame(word = names(word_freq), freq = word_freq)

# Show the ten most frequent terms.
head(freq_df, n = 10)
##      word freq
## the   the 4319
## and   and 2205
## that that  990
## for   for  953
## you   you  723
## with with  599
## was   was  599
## have have  465
## this this  448
## are   are  417

Top 20 Frequent Words

# Horizontal bar chart of the most frequent terms.
# head() is used instead of freq_df[1:20, ] so that a table with fewer than
# 20 terms yields only the rows that exist rather than padded all-NA rows.
# freq_df is already sorted by decreasing frequency, so these are the top 20.
ggplot(head(freq_df, 20),
       aes(x = reorder(word, freq),   # order bars by frequency, not alphabet
           y = freq)) +
  geom_col() +                        # bar height taken directly from freq
  coord_flip() +                      # horizontal bars keep the words readable
  labs(title = "Top 20 Frequent Words",
       x = "Words",
       y = "Frequency")

Histogram of Word Frequencies

# Distribution of per-term counts across the sampled vocabulary,
# drawn with 50 suggested breaks.
hist(freq_df[["freq"]],
     xlab = "Frequency",
     main = "Histogram of Word Frequencies",
     breaks = 50)

Word Cloud

# Word cloud of the terms, sized by their counts and capped at 100 words.
cloud_terms <- freq_df[["word"]]
cloud_counts <- freq_df[["freq"]]
wordcloud(cloud_terms, cloud_counts, max.words = 100)

# Findings

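Some important findings from the exploratory analysis are:

- The Twitter file has the most lines (2,360,148) but the fewest words (30,096,690), while the blogs file has the fewest lines (899,288) but the most words (37,546,806), so blog entries are much longer on average than tweets.
- In the 3,000-line sample, the most frequent terms are common function words such as "the" (4,319 occurrences), "and" (2,205), and "that" (990).
- Frequencies fall off quickly: the tenth most frequent word ("are", 417) occurs about one tenth as often as the most frequent word ("the", 4,319).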

Future Plan

The next step of the project will focus on building a predictive text model using N-gram techniques. The model will predict the next word based on previous words entered by the user. A Shiny application will be developed to provide an interactive interface for real-time prediction.