Introduction

This report presents an initial exploratory analysis of the SwiftKey dataset. The goal is to demonstrate that the data can be loaded and processed, to summarize its key characteristics, and to outline preliminary ideas for building a text prediction algorithm and Shiny app.

Data Loading

# Load required packages (tidyverse attaches stringr, dplyr, tibble, and ggplot2)
library(tidyverse)
library(tidytext)

# Load the data; skipNul = TRUE silences warnings about embedded nul characters in the news file
twitter <- readLines("en_US.twitter.txt", warn = FALSE, skipNul = TRUE)
blogs   <- readLines("en_US.blogs.txt", warn = FALSE, skipNul = TRUE)
news    <- readLines("en_US.news.txt", warn = FALSE, skipNul = TRUE)

Basic Summary Statistics

# Line count
line_counts <- c(length(twitter), length(blogs), length(news))

# Word count
word_counts <- c(
  sum(str_count(twitter, "\\S+")),
  sum(str_count(blogs, "\\S+")),
  sum(str_count(news, "\\S+"))
)

# Combine into a data frame
summary_table <- data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Line_Count = line_counts,
  Word_Count = word_counts
)

summary_table
##   Dataset Line_Count Word_Count
## 1 Twitter    2360148   30373543
## 2   Blogs     899288   37334131
## 3    News    1010242   34372530

Distribution of Line Lengths

# Build a tibble of per-line character counts, labeled by source
line_lengths <- tibble(
  Source = rep(c("Twitter", "Blogs", "News"),
               times = c(length(twitter), length(blogs), length(news))),
  Length = c(nchar(twitter), nchar(blogs), nchar(news))
)

# Plot; xlim(0, 1000) trims the long tail of very long blog and news lines
ggplot(line_lengths, aes(x = Length, fill = Source)) +
  geom_histogram(bins = 100, alpha = 0.6, position = "identity") +
  xlim(0, 1000) +
  labs(title = "Histogram of Line Lengths",
       x = "Number of Characters", y = "Frequency")

Most Common Words in Twitter

# Tokenize the Twitter lines into words and count their frequencies
twitter_words <- tibble(text = twitter) |>
  unnest_tokens(word, text) |>
  count(word, sort = TRUE)

# Plot top 20
twitter_words |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Words in Twitter Data", x = "Word", y = "Frequency")
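Ranked this way, the list is dominated by common function words ("the", "to", "i", and so on). As a minimal sketch of one way to surface more informative words, the stop_words lexicon bundled with tidytext can be filtered out first, reusing the twitter_words table built above:

# Remove English stop words before ranking (stop_words ships with tidytext)
twitter_words |>
  anti_join(stop_words, by = "word") |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Non-Stop Words in Twitter Data",
       x = "Word", y = "Frequency")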

Interesting Findings

# Longest line in each dataset
max_lengths <- c(
  max(nchar(twitter)),
  max(nchar(blogs)),
  max(nchar(news))
)

data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Max_Line_Length = max_lengths
)
##   Dataset Max_Line_Length
## 1 Twitter             140
## 2   Blogs           40833
## 3    News           11384

The 140-character Twitter maximum reflects the platform's tweet length limit at the time the corpus was collected, while the longest blog and news lines run into the tens of thousands of characters.

# Love vs hate: count tweets containing each word
love <- sum(str_detect(twitter, "\\blove\\b"))
hate <- sum(str_detect(twitter, "\\bhate\\b"))
ratio <- love / hate
c("Love" = love, "Hate" = hate, "Love-to-Hate Ratio" = ratio)
##               Love               Hate Love-to-Hate Ratio 
##       77639.000000       15561.000000           4.989332

# Biostats tweet
biostats_tweet <- grep("biostats", twitter, value = TRUE)
biostats_tweet
## [1] "i know how you feel.. i have biostats on tuesday and i have yet to study =/"
# Kickboxing quote match
quote <- "A computer once beat me at chess, but it was no match for me at kickboxing"
sum(twitter == quote)
## [1] 3
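
Next Steps

The corpora are large enough that sampling will be needed before model building. As a minimal sketch of a first step toward the prediction algorithm, the snippet below counts bigrams on a small random sample of the Twitter data; the sample size and seed are arbitrary choices for illustration, and the resulting frequency table is the kind of structure a simple next-word predictor in the Shiny app could draw on.

# Sample a manageable subset of tweets (size chosen arbitrarily for illustration)
set.seed(42)
twitter_sample <- sample(twitter, 10000)

# Count bigrams; frequent bigrams become candidate next-word predictions
twitter_bigrams <- tibble(text = twitter_sample) |>
  unnest_tokens(bigram, text, token = "ngrams", n = 2) |>
  count(bigram, sort = TRUE)

head(twitter_bigrams)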