The purpose of this report is to demonstrate familiarity with the text data provided for the Data Science Capstone Project and to outline plans for building a next-word prediction algorithm and a Shiny web application.
This document provides:

- An overview of the datasets
- Basic summary statistics
- Key findings from exploratory analysis
- A brief description of the planned prediction algorithm and Shiny app
The report is written concisely so that it can be followed by a non-data-scientist manager.
The data used in this project comes from the SwiftKey corpus and includes English-language text from three sources:

- Blogs (`en_US.blogs.txt`)
- News articles (`en_US.news.txt`)
- Twitter messages (`en_US.twitter.txt`)
```r
library(tm)
library(ggplot2)
library(stringi)
library(dplyr)

blogs   <- readLines("en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

print(c(length(blogs), length(news), length(twitter)))
```
All three datasets were successfully loaded into R.
```r
data_summary <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  Lines  = c(length(blogs), length(news), length(twitter)),
  Words  = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news)),
    sum(stri_count_words(twitter))
  )
)
print(data_summary)
```
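Beyond line and word counts, file size and the longest line give a rough sense of scale. The sketch below is an optional check and assumes the three source files sit in the working directory under the names used above.

```r
# Optional scale check (assumes the three .txt files are in the working directory)
extra_summary <- data.frame(
  Source       = c("Blogs", "News", "Twitter"),
  Size_MB      = round(file.size(c("en_US.blogs.txt",
                                   "en_US.news.txt",
                                   "en_US.twitter.txt")) / 1024^2, 1),
  Longest_Line = c(max(nchar(blogs)), max(nchar(news)), max(nchar(twitter)))
)
print(extra_summary)
```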
```r
set.seed(1234)

# Sample 1% of each source to keep the exploratory analysis fast
sample_blogs   <- sample(blogs,   as.integer(length(blogs)   * 0.01))
sample_news    <- sample(news,    as.integer(length(news)    * 0.01))
sample_twitter <- sample(twitter, as.integer(length(twitter) * 0.01))

sample_data <- c(sample_blogs, sample_news, sample_twitter)

corpus <- VCorpus(VectorSource(sample_data))
```
```r
# Normalize the sampled text: lowercase, then remove punctuation, numbers, and extra whitespace
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
```
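As a quick sanity check (not part of the original pipeline), the first sampled line can be compared with its cleaned counterpart:

```r
# Spot check: raw sampled line vs. its cleaned version in the corpus
cat("Raw:    ", sample_data[1], "\n")
cat("Cleaned:", as.character(corpus[[1]]), "\n")
```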
```r
words_per_line <- stri_count_words(sample_data)
df_words <- data.frame(words = words_per_line)

ggplot(df_words, aes(x = words)) +
  geom_histogram(binwidth = 5, fill = "steelblue", color = "black") +
  labs(
    title = "Distribution of Words per Line",
    x = "Number of Words",
    y = "Frequency"
  )
```
The histogram shows that most lines are relatively short, driven largely by the Twitter content.
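To check how much of this skew comes from Twitter, a small sketch (using the sample vectors created above) compares typical line lengths by source:

```r
# Words per line by source, to see how much of the skew is driven by Twitter
by_source <- data.frame(
  source = rep(c("Blogs", "News", "Twitter"),
               times = c(length(sample_blogs), length(sample_news),
                         length(sample_twitter))),
  words  = stri_count_words(c(sample_blogs, sample_news, sample_twitter))
)

by_source %>%
  group_by(source) %>%
  summarise(mean_words = mean(words), median_words = median(words))
```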
```r
# Build a term-document matrix and rank terms by total frequency
tdm <- TermDocumentMatrix(corpus)
term_freq <- rowSums(as.matrix(tdm))
term_freq <- sort(term_freq, decreasing = TRUE)

top_words <- data.frame(
  word = names(term_freq)[1:10],
  frequency = term_freq[1:10]
)
print(top_words)

ggplot(top_words, aes(x = reorder(word, frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  coord_flip() +
  labs(
    title = "Top 10 Most Frequent Words",
    x = "Word",
    y = "Frequency"
  )
```
A small number of words account for a large portion of the text, which is typical of natural language data.
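One way to quantify this is to ask how many unique words are needed to cover 50% and 90% of all word instances in the sample; the sketch below reuses the `term_freq` vector computed above.

```r
# Number of unique words needed to cover 50% and 90% of word instances
coverage <- cumsum(term_freq) / sum(term_freq)
cat(which(coverage >= 0.5)[1], "unique words cover 50% of all word instances\n")
cat(which(coverage >= 0.9)[1], "unique words cover 90% of all word instances\n")
```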
```r
# An even smaller 0.1% sample can be drawn if processing time becomes a concern
sample_blogs   <- sample(blogs,   as.integer(length(blogs)   * 0.001))
sample_news    <- sample(news,    as.integer(length(news)    * 0.001))
sample_twitter <- sample(twitter, as.integer(length(twitter) * 0.001))
```