file_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
file_name <- basename(file_url)
if (!dir.exists("data")) {
dir.create("data")
}
local_file <- file.path("data", file_name)
if (!file.exists(local_file)) {
download.file(url = file_url, destfile = local_file, mode = "wb")
message("File downloaded and saved to: ", local_file)
} else {
message("File already exists at: ", local_file)
}
## File already exists at: data/Coursera-SwiftKey.zip
if (grepl("\\.zip$", local_file, ignore.case = TRUE)) {
unzipped_files <- unzip(local_file, exdir = "data")
file_to_read <- unzipped_files[1] # Use first file in ZIP
} else {
file_to_read <- local_file
}
Here I compute the number of lines and the total number of words in each text document.
summarize <- function(filepath = "./data/final/en_US/en_US.twitter.txt", name = "Twitter") {
fullText <- readLines(filepath) # read every line of the file
textWords <- unlist(strsplit(tolower(fullText), "\\W+")) # split on non-word characters
textWords <- textWords[textWords != ""] # drop empty tokens
print(c(name, " Total Lines: ", length(fullText)))
print(c(name, "Total Words: ", length(textWords)))
}
summarize("./data/final/en_US/en_US.twitter.txt", "Twitter")
## [1] "Twitter" " Total Lines: " "2360148"
## [1] "Twitter" "Total Words: " "31003538"
summarize("./data/final/en_US/en_US.news.txt", "News")
## [1] "News" " Total Lines: " "1010242"
## [1] "News" "Total Words: " "35624455"
summarize("./data/final/en_US/en_US.blogs.txt", "Blogs")
## [1] "Blogs" " Total Lines: " "899288"
## [1] "Blogs" "Total Words: " "38309710"
For this portion, I decided to use only words that are five or more letters long so that “the” and “and” wouldn’t be the league leaders in every data source.
conTwitter <- file("./data/final/en_US/en_US.twitter.txt", "r")
conNews <- file("./data/final/en_US/en_US.news.txt", "r")
conBlogs <- file("./data/final/en_US/en_US.blogs.txt", "r")
numberLines <- 20000 # using 20,000 lines for now to limit runtime; will tweak later for higher performance and more representative sampling
myTextTwitter <- readLines(conTwitter, numberLines) ## Read the first 20,000 lines of the Twitter data
myTextNews <- readLines(conNews, numberLines) ## Read the first 20,000 lines of the news data
myTextBlogs <- readLines(conBlogs, numberLines) ## Read the first 20,000 lines of the blogs data
close(conTwitter)
close(conNews)
close(conBlogs)
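The counting and plotting code below leans on several tidyverse packages: dplyr for the pipes and filter()/arrange(), forcats for fct_reorder(), tidyr for pivot_wider(), and ggplot2 for the charts. Assuming they are not already attached in an earlier chunk, a minimal setup would be:
library(dplyr) # %>%, filter(), arrange(), mutate(), bind_rows()
library(forcats) # fct_reorder() for ordering bars by frequency
library(tidyr) # pivot_wider() for the cross-source comparison table
library(ggplot2) # bar charts and the comparison scatter plot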
countWords <- function(text_vector, source_name) {
all_words <- unlist(strsplit(tolower(text_vector), "\\W+")) # split on non-word characters
all_words <- all_words[all_words != ""] # drop empty tokens
word_counts <- table(all_words) # tally each unique word
word_counts_df <- as.data.frame(word_counts)
names(word_counts_df) <- c("word", "freq")
word_counts_df$source <- source_name # tag the counts with their source
word_counts_df %>% filter(freq >= 5) # Only keep words appearing ≥5 times
}
top20 <- function(text_df, lowerCutoff = 1, upperCutoff = 100, numWords = 20) {
text_df <- text_df %>%
filter(nchar(as.character(word)) >= lowerCutoff) %>% # Exclude short words
filter(nchar(as.character(word)) <= upperCutoff) %>% # Exclude long words
arrange(desc(freq)) %>%
head(numWords) %>% # Keep the numWords most frequent words
mutate(word = fct_reorder(word, freq))
text_df
}
twitterCounts <- countWords(myTextTwitter, "twitter")
newsCounts <- countWords(myTextNews, "news")
blogsCounts <- countWords(myTextBlogs, "blogs")
twitterTop20 <- top20(twitterCounts, lowerCutoff = 5, numWords = 20)
blogsTop20 <- top20(blogsCounts, lowerCutoff = 5, numWords = 20)
newsTop20 <- top20(newsCounts, lowerCutoff = 5, numWords = 20)
# Plotting Twitter Word Usage
ggplot(twitterTop20, aes(x = word, y = freq, fill = freq)) +
geom_bar(stat = "identity", width = 0.8) +
scale_fill_gradient(low = "#5e81ac", high = "#bf616a") +
labs(title = "Top 20 Most Frequent On Twitter",
x = "Word",
y = "Frequency",
fill = "Count") +
coord_flip() + # Horizontal bars for better readability
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.y = element_text(margin = margin(r = 5)),
panel.grid.major.y = element_blank(),
legend.position = "right"
)
# Plotting Blogs Word Usage
ggplot(blogsTop20, aes(x = word, y = freq, fill = freq)) +
geom_bar(stat = "identity", width = 0.8) +
scale_fill_gradient(low = "#5e81ac", high = "#bf616a") +
labs(title = "Top 20 Most Frequent on Blogs",
x = "Word",
y = "Frequency",
fill = "Count") +
coord_flip() + # Horizontal bars for better readability
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.y = element_text(margin = margin(r = 5)),
panel.grid.major.y = element_blank(),
legend.position = "right"
)
# Plotting News Word Usage
ggplot(newsTop20, aes(x = word, y = freq, fill = freq)) +
geom_bar(stat = "identity", width = 0.8) +
scale_fill_gradient(low = "#5e81ac", high = "#bf616a") + # Modern color gradient
labs(title = "Top 20 Most Frequent On News",
x = "Word",
y = "Frequency",
fill = "Count") +
coord_flip() + # Horizontal bars for better readability
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5),
axis.text.y = element_text(margin = margin(r = 5)),
panel.grid.major.y = element_blank(),
legend.position = "right"
)
# Combine all counts
word_comparison <- bind_rows(twitterCounts, newsCounts, blogsCounts) %>%
pivot_wider(
names_from = source,
values_from = freq,
values_fill = 0
) %>%
filter(nchar(as.character(word)) >= 5 & nchar(as.character(word)) <= 15) %>%
mutate(total = twitter + news + blogs) %>%
arrange(desc(total)) %>%
# Take top 1000 words for visualization
head(1000)
comparison_plot <- ggplot(word_comparison, aes(x = news, y = blogs)) +
# Use text labels as points (size by total frequency, color by Twitter frequency)
geom_text(
aes(label = word, size = total, color = twitter),
check_overlap = TRUE,
alpha = 0.8
) +
# Color gradient (blue to red)
scale_color_gradientn(
colors = c("#5e81ac", "#d08770", "#bf616a"),
name = "Twitter\nFrequency"
) +
# Size scale for text
scale_size_continuous(
range = c(3, 6),
name = "Total\nFrequency",
guide = "none"
) +
scale_x_continuous(trans = "log10") +
scale_y_continuous(trans = "log10") +
labs(
title = "Word Frequency Across Sources",
subtitle = "Text size: Total frequency | Color: Twitter frequency",
x = "Frequency in News (log scale)",
y = "Frequency in Blogs (log scale)"
) +
theme_minimal() +
theme(
legend.position = "right",
plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
panel.grid.major = element_line(color = "gray90")
) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray50")
print(comparison_plot)
As you can see above, the news is more likely to use words you typically see on the news (not exactly shocking) like “washington”, “officials”, and the names of specific cities. As blogs are more personal in nature, it’s not entirely surprising that words like “birthday”, “jesus”, and “happy” get used more there. One word that makes far more appearances on Twitter than it does in the news or on blogs is, not surprisingly, “twitter”. The word “tweet”, however, doesn’t show this same tendency. One thing that surprised me about this graph is that “about” is king of the words that are five or more letters.

## Plans for creating Shiny app

I have looked up some documentation on this subject, and it appears that the easiest way to proceed will be to use the tm and RWeka libraries: preprocess the data with the VCorpus and DocumentTermMatrix functions, create n-gram matrices, and build tables of their frequencies. I am not sure exactly how I will proceed from there, but I will have to create the predictive text function such that probabilities are maximized based on the existing n-grams.
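As a rough sketch of that plan (the sample size, the cleaning steps, and the naive most-frequent-continuation lookup are my own assumptions rather than final design choices, and the helper names like bigramDF and predictNext are just for illustration), building a bigram frequency table with tm and RWeka and then predicting the next word might look something like this. Note that RWeka requires a working Java installation.
library(tm) # VCorpus(), DocumentTermMatrix()
library(RWeka) # NGramTokenizer()

# Build a corpus from a small sample of the Twitter lines read earlier
sampleText <- myTextTwitter[1:1000]
corpus <- VCorpus(VectorSource(sampleText))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)

# Tokenize into bigrams and count them in a DocumentTermMatrix
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm <- DocumentTermMatrix(corpus, control = list(tokenize = bigramTokenizer))

# Collapse the matrix into a bigram frequency table, sorted by frequency
bigramFreq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
bigramDF <- data.frame(bigram = names(bigramFreq), freq = as.integer(bigramFreq), stringsAsFactors = FALSE)

# Naive prediction: given one word, return the most frequent word that follows it
predictNext <- function(prevWord, bigrams = bigramDF) {
matches <- bigrams[startsWith(bigrams$bigram, paste0(tolower(prevWord), " ")), ]
if (nrow(matches) == 0) return(NA_character_)
strsplit(matches$bigram[1], " ")[[1]][2] # bigrams are already sorted by frequency
}

predictNext("happy") # hypothetical usage: most frequent word seen after "happy" in the sample
For the actual app this would presumably need to scale up (larger samples, trigrams and higher-order n-grams, and some backoff rule for contexts that were never seen), but the general shape of the pipeline should stay the same.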