title: "Text Data Analysis Report"
author: "Shantanu Bangar"
date: "2024-08-03"
output: html_document
# Install and load required libraries ----
# requireNamespace() only checks that pacman is installed; it is called through
# `pacman::` below, so it never has to be attached with require().
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# p_load() installs any of these packages that are missing and attaches them.
pacman::p_load(dplyr, ggplot2, wordcloud, tm, tidytext, RColorBrewer)
# Read in the data ----

# Directory that holds the three en_US text files.
data_dir <- "C:\\Users\\Shantanu\\Downloads\\Coursera-SwiftKey\\final\\en_US"

# Read every line of a text file.
#
# path: location of the file to read.
# Returns a character vector with one element per line, marked as UTF-8.
read_text_lines <- function(path) {
  # Open the connection explicitly in binary mode instead of letting
  # readLines() open the path in text mode.
  # NOTE(review): text-mode reads on Windows can end early at a Ctrl-Z (0x1A)
  # byte; the "incomplete final line" warning and the 77259-line News total
  # recorded further down are consistent with that. Regenerate those figures
  # after re-running.
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  # skipNul drops embedded nul bytes instead of warning about them.
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

twitter_data <- read_text_lines(file.path(data_dir, "en_US.twitter.txt"))
blogs_data <- read_text_lines(file.path(data_dir, "en_US.blogs.txt"))
news_data <- read_text_lines(file.path(data_dir, "en_US.news.txt"))
# Line and word totals per dataset ----

# Count space-delimited tokens over all elements of a character vector.
#
# text_lines: character vector, one element per line of text.
# Returns the total token count (0 for an empty vector).
#
# The split is on the single space character, so this is a rough word count:
# a run of spaces yields empty tokens that are counted, and tabs do not
# separate words.
count_words <- function(text_lines) {
  # lengths() is vectorised and always returns an integer vector, unlike
  # sapply(..., length), which returns a list for empty input.
  sum(lengths(strsplit(text_lines, " ")))
}

# Number of lines
num_lines <- data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Lines = c(length(twitter_data), length(blogs_data), length(news_data))
)

# Calculate word count for each dataset
twitter_words <- count_words(twitter_data)
blogs_words <- count_words(blogs_data)
news_words <- count_words(news_data)
num_words <- data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Words = c(twitter_words, blogs_words, news_words)
)
# Show the per-dataset line totals, then the per-dataset word totals
print(num_lines)
## Dataset Lines
## 1 Twitter 2360148
## 2 Blogs 899288
## 3 News 77259
print(num_words)
## Dataset Words
## 1 Twitter 30373543
## 2 Blogs 37334131
## 3 News 2643969
# Sample data from each dataset ----
set.seed(123) # For reproducibility
sample_size <- 10000 # Adjust sample size based on available memory

# Draw lines at random, without replacement.
#
# text_lines: character vector to draw from.
# size: requested number of lines.
# Returns `size` lines, or all of `text_lines` in random order when it holds
# fewer than `size` elements (sample() would otherwise stop with an error).
draw_sample <- function(text_lines, size) {
  sample(text_lines, size = min(size, length(text_lines)))
}

# The draws happen in this fixed order so the seed above reproduces them.
twitter_sample <- draw_sample(twitter_data, sample_size)
blogs_sample <- draw_sample(blogs_data, sample_size)
news_sample <- draw_sample(news_data, sample_size)

# Combine sampled data for analysis
sample_data <- c(twitter_sample, blogs_sample, news_sample)
# Wrap the sampled lines in a tm corpus (one document per line), then
# normalise the text one transformation at a time.
corpus <- sample_data %>%
  VectorSource() %>%
  Corpus()
# Lower-case all text
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
# Strip punctuation characters
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
# Strip digits
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
# Collapse runs of whitespace
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
# Create Document Term Matrix ----
dtm <- DocumentTermMatrix(corpus)

# Total occurrences of each term over all documents.
# tm keeps the matrix in sparse triplet form (a slam simple_triplet_matrix):
# stored entry k has column index dtm$j[k] and count dtm$v[k]. Summing the
# stored counts by column gives the same totals as colSums(as.matrix(dtm))
# without allocating a dense documents x terms matrix.
term_totals <- rowsum(dtm$v, dtm$j)
word_freq <- numeric(nTerms(dtm))
# Row names of term_totals are the column indices that have stored entries.
word_freq[as.integer(rownames(term_totals))] <- term_totals[, 1]
names(word_freq) <- Terms(dtm)

# One row per term; the named vector also supplies the row names.
word_freq_df <- data.frame(word = names(word_freq), freq = word_freq)
# Keep the 20 highest-frequency terms, most common first
top_words <- head(arrange(word_freq_df, desc(freq)), 20)

# Bar chart of those terms; reorder() places the bars in descending frequency
ggplot(data = top_words, mapping = aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  labs(
    x = "Words",
    y = "Frequency",
    title = "Top 20 Most Frequent Words"
  ) +
  theme_minimal()

# Word cloud over the term-frequency table, limited to 100 words and coloured
# with the 8-colour "Dark2" ColorBrewer palette
with(
  word_freq_df,
  wordcloud(
    words = word,
    freq = freq,
    max.words = 100,
    colors = brewer.pal(n = 8, name = "Dark2")
  )
)
