22 Aug 2020

Introduction

Basic summary: This report presents an exploratory analysis of the capstone text data and outlines the goals for the eventual prediction algorithm and Shiny app.

Tasks to accomplish:

  - Demonstrate that you’ve downloaded the data and have successfully loaded it in.
  - Create a basic report of summary statistics about the data sets.
  - Report any interesting findings that you amassed so far.
  - Get feedback on your plans for creating a prediction algorithm and Shiny app.

1. Download the data and load it in

  1. Load the necessary R packages.
  2. Download the data folder and load the files into R.
library(tidytext, warn.conflicts = FALSE)
library(tidyr, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2, warn.conflicts = FALSE)
library(qdapRegex, warn.conflicts = FALSE)
library(stringr, warn.conflicts = FALSE)
library(tm, warn.conflicts = FALSE)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
blogs_file <- file("C:/Users/Jianyang/Downloads/Coursera/Data Science Capstone/final/en_US/en_US.blogs.txt", "r")
blogs <- readLines(blogs_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(blogs_file)

twitter_file <- file("C:/Users/Jianyang/Downloads/Coursera/Data Science Capstone/final/en_US/en_US.twitter.txt", "r") 
twitter <- readLines(twitter_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(twitter_file)

# "rb" (binary) mode guards against an embedded control character in the news
# file that can truncate readLines() in text mode on Windows
news_file <- file("C:/Users/Jianyang/Downloads/Coursera/Data Science Capstone/final/en_US/en_US.news.txt", "rb")
news <- readLines(news_file, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(news_file)
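
As a quick check that all three files loaded successfully, one can inspect how much memory each object occupies (a sketch using base R’s object.size(); output not reproduced here):

format(object.size(blogs), units = "MB")
format(object.size(twitter), units = "MB")
format(object.size(news), units = "MB")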

2. Create a basic report of summary statistics

  1. List key characteristics of the data.
len_blog <- length(blogs)
len_twitter <- length(twitter)
len_news <- length(news)

# note: str_count() with its default empty pattern counts characters, not words
charcount_blog <- sum(str_count(blogs))
charcount_twitter <- sum(str_count(twitter))
charcount_news <- sum(str_count(news))

summary_table <- data.frame(source = c("blogs", "twitter", "news"),
                            lines_count = c(len_blog, len_twitter, len_news),
                            char_count = c(charcount_blog, charcount_twitter, charcount_news))

print(summary_table)
##    source lines_count char_count
## 1   blogs      899288  206824257
## 2 twitter     2360148  162095975
## 3    news     1010242  203223153
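
Because the counts above are characters rather than words, approximate word counts could be obtained by counting runs of non-whitespace instead (a sketch; its output is not reproduced here):

wordcount_blog <- sum(str_count(blogs, "\\S+"))
wordcount_twitter <- sum(str_count(twitter, "\\S+"))
wordcount_news <- sum(str_count(news, "\\S+"))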

3. Report any interesting findings

  1. Take a sample of the data (1%) for detailed analysis.
set.seed(123)
twitter_sample <- sample(twitter, round(length(twitter) * 0.01), replace = FALSE)
blogs_sample <- sample(blogs, round(length(blogs) * 0.01), replace = FALSE)
news_sample <- sample(news, round(length(news) * 0.01), replace = FALSE)

data_sample <- c(twitter_sample, blogs_sample, news_sample)
  2. Data pre-processing.
# drop lines containing characters that cannot be converted to ASCII
NotKnown <- grep("NotKnown", iconv(data_sample, "latin1", "ASCII", sub = "NotKnown"))
if (length(NotKnown) > 0) data_sample <- data_sample[-NotKnown]

# strip URLs, HTML entities, retweet markers, mentions, hashtags, and digits
# before lowercasing and punctuation removal, while those markers still exist
data_sample <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", data_sample)
data_sample <- gsub("&amp;?", "", data_sample)
data_sample <- gsub("RT :|@[a-zA-Z]*: ", "", data_sample)
data_sample <- gsub("@\\w+", "", data_sample)
data_sample <- gsub(" #\\S*", "", data_sample)
data_sample <- gsub("[[:digit:]]", "", data_sample)

data_sample <- tolower(data_sample)
data_sample <- removePunctuation(data_sample)
data_sample <- rm_white(data_sample)
  3. Exploratory data analysis - unigrams and bigrams.
data_sample_df <- tibble(line = seq_along(data_sample), text = data_sample)
unigram_freq <- data_sample_df %>%
    unnest_tokens(unigram, text) %>%                 # default token = "words"
    filter(!unigram %in% stop_words$word) %>%
    count(unigram, sort = TRUE)

ggplot(head(unigram_freq, 10), aes(reorder(unigram, n), n)) +
    geom_col() + coord_flip() +
    xlab("Unigrams") + ylab("Frequency") +
    ggtitle("Most frequent unigrams")

bigram_freq <- data_sample_df %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    filter(!is.na(bigram)) %>%                       # short lines yield NA bigrams
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word) %>%
    unite(bigram, word1, word2, sep = " ") %>%
    count(bigram, sort = TRUE)

ggplot(head(bigram_freq, 10), aes(reorder(bigram, n), n)) +
    geom_col() + coord_flip() +
    xlab("Bigrams") + ylab("Frequency") +
    ggtitle("Most frequent bigrams")

4. Plans for creating a prediction algorithm and Shiny app

Need to create more models: extend the n-grams to trigrams, and build a better filter to remove irrelevant tokens (e.g. the NA values that tokenization produces for very short lines). A sketch of the trigram step follows.
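
The sketch below reuses data_sample_df and drops the NA tokens that unnest_tokens() produces for lines shorter than three words:

trigram_freq <- data_sample_df %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
    filter(!is.na(trigram)) %>%                      # drop NA tokens from short lines
    count(trigram, sort = TRUE)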

Otherwise, the prediction model should be based on these frequency tables. For instance, the app should suggest words that match all or part of the string already typed, and when the user enters a space (after the first word), it should suggest the most likely following words, as sketched below.
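
A minimal sketch of that lookup, assuming the bigram_freq table built above; the function name predict_next is illustrative, not part of the current code:

# given the last typed word, return the n most frequent following words
predict_next <- function(word, n = 3) {
    bigram_freq %>%
        separate(bigram, c("word1", "word2"), sep = " ") %>%
        filter(word1 == tolower(word)) %>%
        head(n) %>%
        pull(word2)
}

predict_next("happy")   # e.g. suggestions after typing "happy "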