Week2: Milestone Report

Purpose

The purpose of this report is to:

Demonstrate that I have downloaded the data and have successfully loaded it in.
Create a basic report of summary statistics about the data sets.

Word counts, line counts and basic data tables

Report any interesting findings that I have amassed so far.

Basic plots, such as histograms to illustrate features of the data

Plans for creating a prediction algorithm and Shiny app.

Downloading the data

# Download and unpack the Coursera-SwiftKey data set.
# The archive is saved to ../original_data_set and extracted into the same
# directory; the en_US text files read later in this report live under
# ../original_data_set/final/en_US.
data_set_dir <- "../original_data_set"
if (!file.exists(data_set_dir)) {
  dir.create(data_set_dir)
}
data_set_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
data_set_zip <- file.path(data_set_dir, "Coursera-SwiftKey.zip")

# Skip the download when the archive is already on disk, so re-running the
# report does not fetch the whole file again.
if (!file.exists(data_set_zip)) {
  # mode = "wb" requests a binary transfer; the default text mode can corrupt
  # a zip archive on Windows.
  download.file(data_set_url, destfile = data_set_zip, mode = "wb")
}

# Unzip the data set next to the archive.
unzip(zipfile = data_set_zip, exdir = data_set_dir)

Attaching required packages

library(stringi)
library(tm)
library(dplyr)
library(SnowballC)
library(RWeka)
library(ggplot2)

Creating a basic report of summary statistics about the data sets

Word counts, line counts and basic data tables

# Summary statistics about the data sets:
# total word count and line count for each of the three en_US source files.
library(stringi)

deta_file_list <- c("../original_data_set/final/en_US/en_US.blogs.txt",
                    "../original_data_set/final/en_US/en_US.news.txt",
                    "../original_data_set/final/en_US/en_US.twitter.txt")

# Result table: one row per source, one column per statistic.
basic_data_table <- matrix(0, nrow = 3, ncol = 2,
                           dimnames = list(c("blogs", "news", "twitter"),
                                           c("Word counts", "line counts")))

# Read each file through a binary connection as UTF-8 text, skipping embedded
# NUL bytes, and keep the lines in a list named after the table rows.
raw_data <- lapply(deta_file_list, function(path) {
  con <- file(path, "rb")
  file_lines <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  file_lines
})
names(raw_data) <- rownames(basic_data_table)

# Column 1: words summed over all lines; column 2: number of lines.
basic_data_table[, 1] <- vapply(raw_data,
                                function(file_lines) sum(stri_count_words(file_lines)),
                                numeric(1))
basic_data_table[, 2] <- lengths(raw_data)

# Print Basic Data Table
basic_data_table

##         Word counts line counts
## blogs      37546246      899288
## news       34762395     1010242
## twitter    30093410     2360148

Reporting interesting findings

Create the corpus

# Build the text corpus used for the n-gram analysis below.
# A fixed seed makes the random sample reproducible.
set.seed(123)

# Sample 1% of lines from each source file
sample_blog <- sample(raw_data$blogs, 0.01 * length(raw_data$blogs))
sample_news <- sample(raw_data$news, 0.01 * length(raw_data$news))
sample_twitter <- sample(raw_data$twitter, 0.01 * length(raw_data$twitter))
sample <- c(sample_blog, sample_news, sample_twitter)

# Convert to ASCII. sub = "" drops only the characters that cannot be
# converted; without it iconv() returns NA for every line that contains such
# a character, so whole lines would be lost from the sample.
sample <- iconv(sample, "UTF-8", "ASCII", sub = "")

corpus <- Corpus(VectorSource(as.data.frame(sample, stringsAsFactors = FALSE)))

# Clean-up pipeline: lower-case, strip punctuation and digits, stem, and
# collapse repeated whitespace.
# NOTE(review): tm_map(PlainTextDocument) directly after tm_map(tolower) looks
# like the usual re-wrapping workaround for applying a plain character
# function; tm also offers content_transformer(tolower) for this -- confirm
# against the installed tm version before changing it.
corpus <- corpus %>%
  tm_map(tolower) %>%
  tm_map(PlainTextDocument) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stemDocument) %>%
  tm_map(stripWhitespace)

What are the distributions of word frequencies?

# Tokenize the corpus into unigrams and tabulate how often each one occurs,
# most frequent first.
tokenize_word_freq <- NGramTokenizer(corpus, Weka_control(min = 1, max = 1))
dist_word_freq <- data.frame(table(tokenize_word_freq))
dist_word_freq <- dist_word_freq[order(dist_word_freq$Freq, decreasing = TRUE), ]

# Plot the 20 most frequent unigrams.
# head() returns only the rows that exist, whereas [1:20, ] pads with NA rows
# when there are fewer than 20. reorder() makes the bars follow descending
# frequency; a factor axis is otherwise drawn in level (alphabetical) order.
ggplot(head(dist_word_freq, 20),
       aes(x = reorder(tokenize_word_freq, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("Unigrams") + ylab("Frequency") +
  ggtitle("Distributions of word frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

What are the frequencies of 2-grams and 3-grams in the dataset?
2-grams

# Tokenize the corpus into 2-grams and tabulate how often each one occurs,
# most frequent first.
tokenize_2grams_freq <- NGramTokenizer(corpus, Weka_control(min = 2, max = 2))
dist_2grams_freq <- data.frame(table(tokenize_2grams_freq))
dist_2grams_freq <- dist_2grams_freq[order(dist_2grams_freq$Freq, decreasing = TRUE), ]

# Plot the 20 most frequent 2-grams.
# head() returns only the rows that exist, whereas [1:20, ] pads with NA rows
# when there are fewer than 20. reorder() makes the bars follow descending
# frequency; a factor axis is otherwise drawn in level (alphabetical) order.
ggplot(head(dist_2grams_freq, 20),
       aes(x = reorder(tokenize_2grams_freq, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("2-grams") + ylab("Frequency") +
  ggtitle("Distributions of 2-grams frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

3-grams

# Tokenize the corpus into 3-grams and tabulate how often each one occurs,
# most frequent first.
tokenize_3grams_freq <- NGramTokenizer(corpus, Weka_control(min = 3, max = 3))
dist_3grams_freq <- data.frame(table(tokenize_3grams_freq))
dist_3grams_freq <- dist_3grams_freq[order(dist_3grams_freq$Freq, decreasing = TRUE), ]

# Plot the 20 most frequent 3-grams.
# head() returns only the rows that exist, whereas [1:20, ] pads with NA rows
# when there are fewer than 20. reorder() makes the bars follow descending
# frequency; a factor axis is otherwise drawn in level (alphabetical) order.
ggplot(head(dist_3grams_freq, 20),
       aes(x = reorder(tokenize_3grams_freq, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("3-grams") + ylab("Frequency") +
  ggtitle("Distributions of 3-grams frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Plans for creating a prediction algorithm and Shiny app

I’m planning to develop a Shiny application that predicts the next word (or sentence) from the user’s text input, using n-gram model data.

Week2: Milestone Report

HH

March 4, 2019

Purpose

Downloading the data

Attaching required packages

Creating a basic report of summary statistics about the data sets

Reporting interesting findings

Plans for creating a prediction algorithm and Shiny app