install.packages(c("stringi", "ggplot2", "tm", "wordcloud", "dplyr", "tidytext", "knitr", "rmarkdown"))

---
title: "Data Science Capstone Milestone Report"
author: "Your Name"
date: "2026-05-16"
output: html_document
---

Introduction

The goal of this project is to analyze text datasets from blogs, news, and Twitter in order to understand word patterns and eventually build a predictive text application.

Load Libraries

library(stringi)
library(ggplot2)
library(tm)
library(wordcloud)
library(dplyr)

Load Data

# Point the session at the folder that holds the en_US corpus files; the file
# names passed to readLines() below are resolved relative to this directory.
# The path must use straight double quotes and forward slashes: curly quotes
# are not string delimiters in R, and a backslash followed by "U" inside a
# quoted string is parsed as a Unicode escape rather than a path separator.
setwd("C:/Users/Garima/Downloads/Coursera-SwiftKey/final/en_US")

# Read each corpus as a character vector with one element per line.
# encoding = "UTF-8" marks the strings as UTF-8; skipNul = TRUE drops embedded
# NUL bytes instead of warning about them.
blogs <- readLines("en_US.blogs.txt",
                   encoding = "UTF-8",
                   skipNul = TRUE)

news_data <- readLines("en_US.news.txt",
                       encoding = "UTF-8",
                       skipNul = TRUE)

twitter <- readLines("en_US.twitter.txt",
                     encoding = "UTF-8",
                     skipNul = TRUE)

Summary Statistics

# Per-source overview: file size on disk (MB), number of lines, and number of
# words. Rows are ordered Blogs, News, Twitter in every column.
summary_data <- data.frame(
  File = c("Blogs", "News", "Twitter"),

  # Sizes are looked up with the same relative file names that were passed to
  # readLines() when blogs, news_data and twitter were loaded. Pointing at a
  # different location (e.g. a "data/" sub-folder that does not exist) makes
  # the size lookup return NA for every file.
  File_Size_MB = file.size(c(
    "en_US.blogs.txt",
    "en_US.news.txt",
    "en_US.twitter.txt"
  )) / 1024^2,

  Line_Count = c(
    length(blogs),
    length(news_data),
    length(twitter)
  ),

  # stri_count_words() returns one count per line; sum() totals them per file.
  Word_Count = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news_data)),
    sum(stri_count_words(twitter))
  )
)

summary_data

##      File File_Size_MB Line_Count Word_Count
## 1   Blogs           NA     899288   37546806
## 2    News           NA    1010206   34761151
## 3 Twitter           NA    2360148   30096690

Sampling Data

# Fix the RNG state so the same lines are drawn on every run.
set.seed(123)

# Draw 1000 lines (without replacement) from each source, in the order
# blogs, news, twitter, and concatenate them into one character vector.
sample_data <- unlist(
  lapply(list(blogs, news_data, twitter), sample, size = 1000),
  use.names = FALSE
)

Data Cleaning

# Build a tm corpus from the sampled lines and normalise it by applying, in
# order: lower-casing, punctuation removal, digit removal, and whitespace
# collapsing. Reduce() threads the corpus through each transformation, i.e.
# tm_map(tm_map(tm_map(tm_map(<corpus>, tolower), ...), ...), stripWhitespace).
#
# In the recorded run each of the four tm_map() calls emitted the warning
# "transformation drops documents" from tm_map.SimpleCorpus.
corpus <- Reduce(
  function(docs, transformation) tm_map(docs, transformation),
  list(
    content_transformer(tolower),
    removePunctuation,
    removeNumbers,
    stripWhitespace
  ),
  Corpus(VectorSource(sample_data))
)

Word Frequency

# Term-document matrix of the cleaned corpus (rows are terms).
tdm <- TermDocumentMatrix(corpus)

# Convert to an ordinary matrix so rowSums() can total each term's row.
matrix_data <- as.matrix(tdm)

# Total occurrences per term across all documents, most frequent first.
word_freq <- sort(rowSums(matrix_data), decreasing = TRUE)

# One row per term; the names of word_freq are carried over as row names too.
freq_df <- data.frame(word = names(word_freq), freq = word_freq)

# Show the ten most frequent terms.
head(freq_df, n = 10L)

##      word freq
## the   the 4319
## and   and 2205
## that that  990
## for   for  953
## you   you  723
## with with  599
## was   was  599
## have have  465
## this this  448
## are   are  417

Top 20 Frequent Words

# Horizontal bar chart of the 20 most frequent terms.
# head() takes the first 20 rows of freq_df (or fewer if the table is shorter);
# indexing with [1:20, ] would instead pad the data with NA rows in that case.
# reorder() sorts the bars by frequency, and geom_col() draws bar heights
# directly from the freq column (the same result as geom_bar(stat = "identity")).
ggplot(head(freq_df, 20),
       aes(x = reorder(word, freq),
           y = freq)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 Frequent Words",
       x = "Words",
       y = "Frequency")

Histogram of Word Frequencies

# Distribution of per-term counts; breaks = 50 asks hist() for roughly 50 bins.
hist(
  freq_df[["freq"]],
  xlab = "Frequency",
  main = "Histogram of Word Frequencies",
  breaks = 50
)

Word Cloud

# Word cloud sized by term frequency, limited to at most 100 words.
with(
  freq_df,
  wordcloud(words = word, freq = freq, max.words = 100)
)

# Findings

Some important findings from the exploratory analysis are:

- Twitter data contains shorter sentences.
- Blogs contain more descriptive language.
- News data is more formal and structured.
- Common stop words dominate the datasets.
- Data cleaning significantly reduces noise in the data.

Future Plan

The next step of the project will focus on building a predictive text model using N-gram techniques. The model will predict the next word based on previous words entered by the user. A Shiny application will be developed to provide an interactive interface for real-time prediction.