Basic summary: This report presents the exploratory analysis of the corpus data and outlines the goals for the eventual prediction algorithm and Shiny app.
Tasks to accomplish:
- Demonstrate that you’ve downloaded the data and have successfully loaded it in.
- Create a basic report of summary statistics about the data sets.
- Report any interesting findings that you amassed so far.
- Get feedback on your plans for creating a prediction algorithm and Shiny app.
library(tidytext, warn.conflicts = FALSE)
library(tidyr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2, warn.conflicts = FALSE)
library(qdapRegex, warn.conflicts = FALSE)
library(stringr, warn.conflicts = FALSE)
library(tm, warn.conflicts = FALSE)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Read each corpus line by line; skipNul = TRUE drops embedded NUL
# characters that would otherwise truncate lines.
blogs_file <- file("C:/Users/Jianyang/Downloads/Coursera/Data Science Capstone/final/en_US/en_US.blogs.txt", "r")
blogs <- readLines(blogs_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(blogs_file)
twitter_file <- file("C:/Users/Jianyang/Downloads/Coursera/Data Science Capstone/final/en_US/en_US.twitter.txt", "r")
twitter <- readLines(twitter_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(twitter_file)
# The news file is opened in binary mode ("rb") so that readLines does not
# stop early at a stray control character on Windows.
news_file <- file("C:/Users/Jianyang/Downloads/Coursera/Data Science Capstone/final/en_US/en_US.news.txt", "rb")
news <- readLines(news_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(news_file)
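As a quick sanity check that the download completed, the raw file sizes can also be inspected before any processing. A minimal sketch using base R's file.size() and the same local paths as above:
files <- c("C:/Users/Jianyang/Downloads/Coursera/Data Science Capstone/final/en_US/en_US.blogs.txt",
           "C:/Users/Jianyang/Downloads/Coursera/Data Science Capstone/final/en_US/en_US.twitter.txt",
           "C:/Users/Jianyang/Downloads/Coursera/Data Science Capstone/final/en_US/en_US.news.txt")
# Report each file's size in megabytes
data.frame(source = c("blogs", "twitter", "news"),
           size_mb = round(file.size(files) / 1024^2, 1))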
len_blog <- length(blogs)
len_twitter <- length(twitter)
len_news <- length(news)
# str_count() with no pattern counts characters, so these are character
# counts rather than word counts.
charcount_blog <- sum(str_count(blogs))
charcount_twitter <- sum(str_count(twitter))
charcount_news <- sum(str_count(news))
table <- data.frame(source = c("blogs", "twitter", "news"),
                    lines_count = c(len_blog, len_twitter, len_news),
                    char_count = c(charcount_blog, charcount_twitter, charcount_news))
print(table)
##    source lines_count char_count
## 1   blogs      899288  206824257
## 2 twitter     2360148  162095975
## 3    news     1010242  203223153
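For comparison, approximate word counts can be obtained by counting whitespace-separated tokens instead. A rough sketch (a proper tokenizer would treat punctuation and contractions more carefully):
# "\\S+" matches each run of non-whitespace characters, i.e. one token
wordcount_blog <- sum(str_count(blogs, "\\S+"))
wordcount_twitter <- sum(str_count(twitter, "\\S+"))
wordcount_news <- sum(str_count(news, "\\S+"))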
set.seed(123)  # make the 1% sample reproducible
twitter_sample <- sample(twitter, length(twitter) * 0.01, replace = FALSE)
blogs_sample <- sample(blogs, length(blogs) * 0.01, replace = FALSE)
news_sample <- sample(news, length(news) * 0.01, replace = FALSE)
data_sample <- c(twitter_sample, blogs_sample, news_sample)
# Drop lines containing non-ASCII characters (iconv marks them "NotKnown");
# guard against the case where no such lines exist, since x[-integer(0)]
# would return an empty vector.
NotKnown <- grep("NotKnown", iconv(data_sample, "latin1", "ASCII", sub = "NotKnown"))
if (length(NotKnown) > 0) data_sample <- data_sample[-NotKnown]
# Remove URLs, Twitter artifacts, and digits *before* stripping punctuation,
# otherwise the patterns below (which rely on ":", "@", "#") cannot match.
data_sample <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", data_sample)  # URLs
data_sample <- gsub("RT :|@[a-zA-Z]*: ", "", data_sample)               # retweet markers
data_sample <- gsub("@\\w+", "", data_sample)                           # handles
data_sample <- gsub(" #\\S*", "", data_sample)                          # hashtags
data_sample <- gsub("&", "", data_sample)                               # stray ampersands
data_sample <- gsub("[[:digit:]]", "", data_sample)                     # digits
data_sample <- tolower(data_sample)
data_sample <- removePunctuation(data_sample)
data_sample <- rm_white(data_sample)
# tibble() replaces the deprecated data_frame(); character columns are never
# converted to factors, so no stringsAsFactors argument is needed.
data_sample_df <- tibble(line = seq_along(data_sample), text = data_sample)
unigram_freq <- data_sample_df %>%
  unnest_tokens(unigram, text) %>%          # default token = "words", i.e. unigrams
  filter(!unigram %in% stop_words$word) %>%
  count(unigram, sort = TRUE)
ggplot(head(unigram_freq, 10), aes(reorder(unigram, n), n)) +
  geom_col() + coord_flip() +
  xlab("Unigrams") + ylab("Frequency") +
  ggtitle("Most frequent unigrams")
bigram_freq <- data_sample_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
ggplot(head(bigram_freq, 10), aes(reorder(bigram, n), n)) +
  geom_col() + coord_flip() +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("Most frequent bigrams")
Next steps: build higher-order n-gram models (trigrams) and a better filter to remove irrelevant tokens (e.g. NA); a trigram sketch follows below.
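A sketch of the planned trigram table, mirroring the bigram pipeline above; the is.na() filter drops the NA tokens noted as a problem:
trigram_freq <- data_sample_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!is.na(word1), !is.na(word2), !is.na(word3)) %>%
  unite(trigram, word1, word2, word3, sep = " ") %>%
  count(trigram, sort = TRUE)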
The prediction model itself will be based on these frequency tables. While the user is typing, the app should suggest words that match all or part of the string typed so far, and once the user enters a space (after the first word), it should suggest the most likely following words.
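To make that mechanism concrete, here is a minimal sketch of the frequency lookup (predict_next is a hypothetical helper, not a package function): given the last word typed, it returns the most frequent continuations from the bigram table built above.
predict_next <- function(history, k = 3) {
  bigram_freq %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(word1 == tolower(history)) %>%  # bigram_freq is already sorted by n
    head(k) %>%
    pull(word2)
}
# e.g. predict_next("happy") returns up to three words that most often
# follow "happy" in the sample.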