Introduction

This report presents an initial exploratory analysis of the SwiftKey dataset. The goal is to demonstrate that the data can be loaded and processed, to summarize its key characteristics, and to outline preliminary ideas for building a text prediction algorithm and Shiny app.

Data Loading

# Load required packages (tidyverse attaches stringr, dplyr, tibble, and ggplot2)
library(tidyverse)
library(tidytext)

# Load the data; skipNul = TRUE silences warnings about embedded nul characters in the news file
twitter <- readLines("en_US.twitter.txt", warn = FALSE, skipNul = TRUE)
blogs   <- readLines("en_US.blogs.txt", warn = FALSE, skipNul = TRUE)
news    <- readLines("en_US.news.txt", warn = FALSE, skipNul = TRUE)

Basic Summary Statistics

# Line count
line_counts <- c(length(twitter), length(blogs), length(news))

# Word count
word_counts <- c(
  sum(str_count(twitter, "\\S+")),
  sum(str_count(blogs, "\\S+")),
  sum(str_count(news, "\\S+"))
)

# Combine into a data frame
summary_table <- data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Line_Count = line_counts,
  Word_Count = word_counts
)

summary_table
##   Dataset Line_Count Word_Count
## 1 Twitter    2360148   30373543
## 2   Blogs     899288   37334131
## 3    News    1010242   34372530

Distribution of Line Lengths

# Build a tibble of per-line character counts, labeled by source
line_lengths <- tibble(
  Source = rep(c("Twitter", "Blogs", "News"),
               times = c(length(twitter), length(blogs), length(news))),
  Length = c(nchar(twitter), nchar(blogs), nchar(news))
)

# Plot; xlim(0, 1000) trims the long tail of very long blog and news lines
ggplot(line_lengths, aes(x = Length, fill = Source)) +
  geom_histogram(bins = 100, alpha = 0.6, position = "identity") +
  xlim(0, 1000) +
  labs(title = "Histogram of Line Lengths",
       x = "Number of Characters", y = "Frequency")

Most Common Words in Twitter

# Tokenize the Twitter lines into words and count their frequencies
twitter_words <- tibble(text = twitter) |>
  unnest_tokens(word, text) |>
  count(word, sort = TRUE)

# Plot top 20
twitter_words |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Words in Twitter Data", x = "Word", y = "Frequency")
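Ranked this way, the list is dominated by common function words ("the", "to", "i", and so on). As a minimal sketch of one way to surface more informative words, the stop_words lexicon bundled with tidytext can be filtered out first, reusing the twitter_words table built above:

# Remove English stop words before ranking (stop_words ships with tidytext)
twitter_words |>
  anti_join(stop_words, by = "word") |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Non-Stop Words in Twitter Data",
       x = "Word", y = "Frequency")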

Interesting Findings

# Longest line in each dataset
max_lengths <- c(
  max(nchar(twitter)),
  max(nchar(blogs)),
  max(nchar(news))
)

data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Max_Line_Length = max_lengths
)
##   Dataset Max_Line_Length
## 1 Twitter             140
## 2   Blogs           40833
## 3    News           11384

The 140-character Twitter maximum reflects the platform's tweet length limit at the time the corpus was collected, while the longest blog and news lines run into the tens of thousands of characters.

# Love vs hate: count tweets containing each word
love <- sum(str_detect(twitter, "\\blove\\b"))
hate <- sum(str_detect(twitter, "\\bhate\\b"))
ratio <- love / hate
c("Love" = love, "Hate" = hate, "Love-to-Hate Ratio" = ratio)
##               Love               Hate Love-to-Hate Ratio 
##       77639.000000       15561.000000           4.989332

# Biostats tweet
biostats_tweet <- grep("biostats", twitter, value = TRUE)
biostats_tweet
## [1] "i know how you feel.. i have biostats on tuesday and i have yet to study =/"
# Kickboxing quote match
quote <- "A computer once beat me at chess, but it was no match for me at kickboxing"
sum(twitter == quote)
## [1] 3
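
Next Steps

The corpora are large enough that sampling will be needed before model building. As a minimal sketch of a first step toward the prediction algorithm, the snippet below counts bigrams on a small random sample of the Twitter data; the sample size and seed are arbitrary choices for illustration, and the resulting frequency table is the kind of structure a simple next-word predictor in the Shiny app could draw on.

# Sample a manageable subset of tweets (size chosen arbitrarily for illustration)
set.seed(42)
twitter_sample <- sample(twitter, 10000)

# Count bigrams; frequent bigrams become candidate next-word predictions
twitter_bigrams <- tibble(text = twitter_sample) |>
  unnest_tokens(bigram, text, token = "ngrams", n = 2) |>
  count(bigram, sort = TRUE)

head(twitter_bigrams)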