Downloading The Data

file_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"  
file_name <- basename(file_url)

if (!dir.exists("data")) {
  dir.create("data")
}

local_file <- file.path("data", file_name)

if (!file.exists(local_file)) {
  download.file(url = file_url, destfile = local_file, mode = "wb")
  message("File downloaded and saved to: ", local_file)
} else {
  message("File already exists at: ", local_file)
}
## File already exists at: data/Coursera-SwiftKey.zip
if (grepl("\\.zip$", local_file, ignore.case = TRUE)) {
  unzipped_files <- unzip(local_file, exdir = "data")
  file_to_read <- unzipped_files[1]  # Use first file in ZIP
} else {
  file_to_read <- local_file
}

Preliminary Exploration of Data

Here I get the number of lines and the total number of words in each text document.

summarize <- function(filepath = "./data/final/en_US/en_US.twitter.txt", name = "Twitter") {
  fullText <- readLines(filepath)                           # read every line of the file
  textWords <- unlist(strsplit(tolower(fullText), "\\W+"))  # split on non-word characters
  textWords <- textWords[textWords != ""]                   # drop empty tokens
  print(c(name, " Total Lines: ", length(fullText)))
  print(c(name, "Total Words: ", length(textWords)))
}
  
summarize("./data/final/en_US/en_US.twitter.txt", "Twitter")
## [1] "Twitter"        " Total Lines: " "2360148"       
## [1] "Twitter"       "Total Words: " "31003538"
summarize("./data/final/en_US/en_US.news.txt", "News")
## [1] "News"           " Total Lines: " "1010242"       
## [1] "News"          "Total Words: " "35624455"
summarize("./data/final/en_US/en_US.blogs.txt", "Blogs")
## [1] "Blogs"          " Total Lines: " "899288"        
## [1] "Blogs"         "Total Words: " "38309710"

Getting a more usable slice of the data and graphing it

For this portion I decided to use only words that are five or more letters long, so that “the” and “and” wouldn’t be the league leaders in every data source.

conTwitter <- file("./data/final/en_US/en_US.twitter.txt", "r") 
conNews <- file("./data/final/en_US/en_US.news.txt", "r") 
conBlogs <- file("./data/final/en_US/en_US.blogs.txt", "r") 

numberLines <- 20000  # using 20,000 lines for now to limit runtime; will tweak later for better performance and more representative sampling
myTextTwitter <- readLines(conTwitter, numberLines)  # read the first 20,000 Twitter lines
myTextNews <- readLines(conNews, numberLines)        # read the first 20,000 news lines
myTextBlogs <- readLines(conBlogs, numberLines)      # read the first 20,000 blog lines
close(conTwitter) 
close(conNews) 
close(conBlogs)
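
As noted in the chunk above, reading only the first 20,000 lines is a stopgap. A minimal sketch of drawing a random sample instead might look like the following, assuming the full file fits in memory; the seed and sample size are arbitrary.

set.seed(1234)  # arbitrary seed so the sample is reproducible
allTwitter <- readLines("./data/final/en_US/en_US.twitter.txt")
sampleTwitter <- sample(allTwitter, size = numberLines)  # random lines instead of the first 20,000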

countWords <- function(text_vector, source_name) {
  all_words <- unlist(strsplit(tolower(text_vector), "\\W+"))
  all_words <- all_words[all_words != ""]
  
  word_counts <- table(all_words)
  word_counts_df <- as.data.frame(word_counts)
  
  names(word_counts_df) <- c("word", "freq")
  word_counts_df$source <- source_name
  word_counts_df %>% filter(freq >= 5)  # Only keep words appearing ≥5 times
}

top20 <- function(text_df, lowerCutoff = 1, upperCutoff = 100, numWords = 20) {
  text_df <- text_df %>%
  filter(nchar(as.character(word)) >= lowerCutoff) %>%  # Exclude short words
  filter(nchar(as.character(word)) <= upperCutoff) %>%  # Exclude long words
  arrange(desc(freq)) %>% 
  head(numWords) %>%            #
  mutate(word = fct_reorder(word, freq)) 
  
  text_df
}

twitterCounts <- countWords(myTextTwitter, "twitter")
newsCounts <- countWords(myTextNews, "news")
blogsCounts <- countWords(myTextBlogs, "blogs")

twitterTop20 <- top20(twitterCounts, lowerCutoff = 5, numWords = 20)
blogsTop20 <- top20(blogsCounts, lowerCutoff = 5, numWords = 20)
newsTop20 <- top20(newsCounts, lowerCutoff = 5, numWords = 20)

# Plotting Twitter Word Usage
ggplot(twitterTop20, aes(x = word, y = freq, fill = freq)) +
  geom_bar(stat = "identity", width = 0.8) +
  scale_fill_gradient(low = "#5e81ac", high = "#bf616a") +  
  labs(title = "Top 20 Most Frequent On Twitter",
       x = "Word",
       y = "Frequency",
       fill = "Count") +
  coord_flip() +  # Horizontal bars for better readability
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.text.y = element_text(margin = margin(r = 5)),
    panel.grid.major.y = element_blank(),
    legend.position = "right"
  )

# Plotting Blogs Word Usage
ggplot(blogsTop20, aes(x = word, y = freq, fill = freq)) +
  geom_bar(stat = "identity", width = 0.8) +
  scale_fill_gradient(low = "#5e81ac", high = "#bf616a") +  
  labs(title = "Top 20 Most Frequent on Blogs",
       x = "Word",
       y = "Frequency",
       fill = "Count") +
  coord_flip() +  # Horizontal bars for better readability
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.text.y = element_text(margin = margin(r = 5)),
    panel.grid.major.y = element_blank(),
    legend.position = "right"
  )

# Plotting News Word Usage
ggplot(newsTop20, aes(x = word, y = freq, fill = freq)) +
  geom_bar(stat = "identity", width = 0.8) +
  scale_fill_gradient(low = "#5e81ac", high = "#bf616a") +  # Modern color gradient
  labs(title = "Top 20 Most Frequent On News",
       x = "Word",
       y = "Frequency",
       fill = "Count") +
  coord_flip() +  # Horizontal bars for better readability
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.text.y = element_text(margin = margin(r = 5)),
    panel.grid.major.y = element_blank(),
    legend.position = "right"
  )

Making a comparative graph

#Combine all counts 
word_comparison <- bind_rows(twitterCounts, newsCounts, blogsCounts) %>%
  pivot_wider(
    names_from = source, 
    values_from = freq, 
    values_fill = 0  
  ) %>%
  filter(nchar(as.character(word)) >= 5 & nchar(as.character(word)) <= 15) %>%
  mutate(total = twitter + news + blogs) %>%
  arrange(desc(total)) %>%
  # Take top 1000 words for visualization
  head(1000)

comparison_plot <- ggplot(word_comparison, aes(x = news, y = blogs)) +
  # Use text labels as points (size by total frequency, color by Twitter frequency)
  geom_text(
    aes(label = word, size = total, color = twitter),
    check_overlap = TRUE,
    alpha = 0.8
  ) +
  # Color gradient (blue to red)
  scale_color_gradientn(
    colors = c("#5e81ac", "#d08770", "#bf616a"),
    name = "Twitter\nFrequency"
  ) +
  # Size scale for text
  scale_size_continuous(
    range = c(3, 6),  
    name = "Total\nFrequency",
    guide = "none"  
  ) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  labs(
    title = "Word Frequency Across Sources",
    subtitle = "Text size: Total frequency | Color: Twitter frequency",
    x = "Frequency in News (log scale)",
    y = "Frequency in Blogs (log scale)"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
    panel.grid.major = element_line(color = "gray90")
  ) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray50")

print(comparison_plot)

As you can see above, the news is more likely to use words you typically see on the news (not exactly shocking), like “washington”, “officials”, and the names of specific cities. Since blogs are more personal in nature, it’s not entirely surprising that words like “birthday”, “jesus”, and “happy” get used more there. One word that appears far more often on Twitter than in the news or on blogs is, not surprisingly, “twitter”. The word “tweet”, however, doesn’t show the same tendency. One thing that surprised me about this graph is that “about” is king of the words that are five or more letters long.

Plans for creating the Shiny app

I have looked up some documentation on this subject, and it appears that the easiest way forward will be to use the tm and RWeka libraries: preprocess the data with the VCorpus and DocumentTermMatrix functions, build n-gram matrices, and tabulate their frequencies. I am not sure exactly how I will proceed from there, but I will need to create a predictive-text function that maximizes the probability of the next word based on the existing n-grams.
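
As a preview of that approach, below is a minimal, untested sketch of how bigram frequencies might be tabulated with tm and RWeka, plus a toy lookup of the most frequent continuation of a word. It assumes both packages (and the Java dependency RWeka requires) are installed; the small sample size and the predictNext helper are purely illustrative, not the final design.

library(tm)
library(RWeka)

# Sketch only: build a small corpus from the Twitter sample read earlier
miniCorpus <- VCorpus(VectorSource(head(myTextTwitter, 2000)))
miniCorpus <- tm_map(miniCorpus, content_transformer(tolower))
miniCorpus <- tm_map(miniCorpus, removePunctuation)
miniCorpus <- tm_map(miniCorpus, stripWhitespace)

# Tokenizer that produces bigrams
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

# Term-document matrix of bigrams, collapsed into a sorted frequency table
bigramTDM  <- TermDocumentMatrix(miniCorpus, control = list(tokenize = bigramTokenizer))
bigramFreq <- sort(rowSums(as.matrix(bigramTDM)), decreasing = TRUE)

# Toy prediction: given one word, return the most frequent word that follows it
predictNext <- function(word) {
  matches <- grep(paste0("^", word, " "), names(bigramFreq), value = TRUE)
  if (length(matches) == 0) return(NA_character_)
  sub(paste0("^", word, " "), "", matches[1])
}

predictNext("happy")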