This report outlines the initial exploratory data analysis for the SwiftKey dataset. The goal is to demonstrate successful loading and processing of the data and to provide a summary of important characteristics, along with preliminary ideas for building a text prediction algorithm and Shiny app.
# Load the data ----
# Each corpus is read into a character vector with one element per line.
# - encoding = "UTF-8" marks the strings as UTF-8 (it does not re-encode), so
#   later character counts and tokenisation do not fall back to the platform's
#   native encoding.
#   NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
# - skipNul = TRUE drops embedded nul bytes. Without it a nul terminates the
#   line being read, and warn = FALSE would hide the warning about it.
# - warn = FALSE is kept so a missing final newline does not raise a warning.
twitter <- readLines(
  "en_US.twitter.txt",
  warn = FALSE, encoding = "UTF-8", skipNul = TRUE
)
blogs <- readLines(
  "en_US.blogs.txt",
  warn = FALSE, encoding = "UTF-8", skipNul = TRUE
)
news <- readLines(
  "en_US.news.txt",
  warn = FALSE, encoding = "UTF-8", skipNul = TRUE
)
# Per-corpus size summary ----
# Corpora are always listed in the order Twitter, Blogs, News so that the
# counts line up with the Dataset labels below.

# Number of lines (vector elements) in each corpus
line_counts <- lengths(list(twitter, blogs, news))

# Number of words in each corpus, where a word is any run of non-whitespace
# characters ("\\S+")
word_counts <- vapply(
  list(twitter, blogs, news),
  \(lines) sum(str_count(lines, "\\S+")),
  integer(1)
)

# One row per corpus; printed by the bare expression at the end
summary_table <- data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Line_Count = line_counts,
  Word_Count = word_counts
)
summary_table
## Dataset Line_Count Word_Count
## 1 Twitter 2360148 30373543
## 2 Blogs 899288 37334131
## 3 News 1010242 34372530
# Line-length distribution ----
# Long-format table: one row per line of text, labelled with the corpus it
# came from and carrying its length in characters.
line_lengths <- tibble(
  Source = c(
    rep("Twitter", length(twitter)),
    rep("Blogs", length(blogs)),
    rep("News", length(news))
  ),
  Length = nchar(c(twitter, blogs, news))
)

# Overlaid, semi-transparent histograms, one per corpus. The x scale is
# limited to 0-1000 characters, so longer lines are left out of the plot.
line_lengths |>
  ggplot(mapping = aes(x = Length, fill = Source)) +
  geom_histogram(position = "identity", alpha = 0.6, bins = 100) +
  labs(
    title = "Histogram of Line Lengths",
    x = "Number of Characters",
    y = "Frequency"
  ) +
  xlim(0, 1000)
# Word frequencies in the Twitter corpus ----
# Split every tweet into one-token-per-row form, then tally each distinct
# word; sort = TRUE puts the most frequent words first.
twitter_words <- tibble(text = twitter) |>
  unnest_tokens(output = word, input = text) |>
  count(word, sort = TRUE)

# Horizontal bar chart of the 20 most frequent words. slice_max() keeps ties
# by default, so more than 20 rows are possible if counts tie at the cutoff.
# reorder() sorts the bars by frequency rather than alphabetically.
twitter_words |>
  slice_max(order_by = n, n = 20) |>
  ggplot(mapping = aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Top 20 Words in Twitter Data",
    x = "Word",
    y = "Frequency"
  ) +
  coord_flip()
# Longest line per corpus ----
# Character count of the longest line in each corpus, in the order
# Twitter, Blogs, News.
max_lengths <- unlist(
  lapply(
    list(twitter, blogs, news),
    function(lines) max(nchar(lines))
  )
)

# Printed as a two-column table
data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Max_Line_Length = max_lengths
)
## Dataset Max_Line_Length
## 1 Twitter 140
## 2 Blogs 40833
## 3 News 11384
# Love vs Hate ----
# Count the tweets that contain the lower-case word "love" (resp. "hate")
# delimited by word boundaries. str_detect() gives one TRUE/FALSE per tweet,
# so a tweet is counted once however often the word appears in it.
love <- twitter |>
  str_detect("\\blove\\b") |>
  sum()
hate <- twitter |>
  str_detect("\\bhate\\b") |>
  sum()

# How many "love" tweets there are for every "hate" tweet
ratio <- love / hate

c(Love = love, Hate = hate, "Love-to-Hate Ratio" = ratio)
## Love Hate Love-to-Hate Ratio
## 77639.000000 15561.000000 4.989332
# Biostats tweet ----
# Keep every tweet whose text matches the pattern "biostats" and print the
# matching tweets in full.
biostats_tweet <- twitter[grepl("biostats", twitter)]
biostats_tweet
## [1] "i know how you feel.. i have biostats on tuesday and i have yet to study =/"
# Kickboxing quote match ----
# Count the tweets whose entire text is exactly this sentence (a whole-string,
# case-sensitive comparison, not a substring search).
# Note: the variable name masks base::quote as a value; calls to quote() still
# resolve to the function.
quote <- "A computer once beat me at chess, but it was no match for me at kickboxing"
sum(quote == twitter)
## [1] 3