In this project, after setting the working directory, the data are downloaded as a zip file from the course URL and unzipped. Because of the large size of the data, a 0.2% sample of each file is selected for analysis. After the sample is cleaned and the raw files are summarized statistically, the data are tokenized into N-grams, the N-grams are sorted by frequency, and plots are produced from the sorted counts.
The goal of this milestone report is to strengthen skills in handling big data: downloading and cleaning the data, summarizing and sorting it by the requested analytic fields, and preparing for the design of a prediction algorithm and a Shiny app.
setwd("C:/Users/msc/Desktop/F/Data Science/Course 10/Milestone Report")
getwd()
[1] "C:/Users/msc/Desktop/F/Data Science/Course 10/Milestone Report"
library(RWeka)
library(dplyr)
library(stringi)
library(tm)
library(ggplot2)
library(NLP)
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip")
}
blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
Stat_summary <- data.frame(
  File = c("Blogs", "News", "Twitter"),
  Size = sapply(list(blogs, news, twitter), function(x) format(object.size(x), "MB")),
  No_Entries = sapply(list(blogs, news, twitter), function(x) length(x)),
  Total_Chars = sapply(list(blogs, news, twitter), function(x) sum(nchar(x))),
  Max_Chars = sapply(list(blogs, news, twitter), function(x) max(nchar(x)))
)
Stat_summary
     File     Size No_Entries Total_Chars Max_Chars
1   Blogs 248.5 Mb     899288   206824505     40833
2    News  19.2 Mb      77259    15639408      5760
3 Twitter 301.4 Mb    2360148   162096031       140
stri_stats_general(blogs)
Lines LinesNEmpty Chars CharsNWhite
899288 899165 206043906 169609063
stri_stats_general(news)
Lines LinesNEmpty Chars CharsNWhite
77259 77259 15615538 13048828
stri_stats_general(twitter)
Lines LinesNEmpty Chars CharsNWhite
2360148 2360148 161961345 133947948
lenBlog <- length(blogs)
lenBlog
[1] 899288
lenNews <- length(news)
lenNews
[1] 77259
lenTwit <- length(twitter)
lenTwit
[1] 2360148
Removal of non-English characters
blogs <- iconv(blogs, "latin1", "ASCII", sub="")
news <- iconv(news, "latin1", "ASCII", sub="")
twitter <- iconv(twitter, "latin1", "ASCII", sub="")
Because of the large data size, only 0.2% of each file is taken as the sample.
set.seed(245)
dataSample <- c(sample(blogs, length(blogs) * 0.002),
                sample(news, length(news) * 0.002),
                sample(twitter, length(twitter) * 0.002))
summary(dataSample)
Length Class Mode
6672 character character
Creating a sample corpus from the 0.2% data sample
corpus_sample <- VCorpus(VectorSource(dataSample))
corpus_sample <- tm_map(corpus_sample, content_transformer(removePunctuation))
corpus_sample <- tm_map(corpus_sample, content_transformer(removeNumbers))
corpus_sample <- tm_map(corpus_sample, content_transformer(tolower))
Tokenize the sample data and build term-document matrices so that N-gram frequencies can be computed and sorted
uni_token <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bi_token <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tri_token <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
uni_matrix <- TermDocumentMatrix(corpus_sample, control = list(tokenize = uni_token))
bi_matrix <- TermDocumentMatrix(corpus_sample, control = list(tokenize = bi_token))
tri_matrix <- TermDocumentMatrix(corpus_sample, control = list(tokenize = tri_token))
uni_corpus <- findFreqTerms(uni_matrix, lowfreq = 20)
bi_corpus <- findFreqTerms(bi_matrix, lowfreq = 20)
tri_corpus <- findFreqTerms(tri_matrix, lowfreq = 20)
uni_corpus_Fqc <- rowSums(as.matrix(uni_matrix[uni_corpus, ]))
uni_corpus_Fqc <- data.frame(word = names(uni_corpus_Fqc), frequency = uni_corpus_Fqc)
bi_corpus_Fqc <- rowSums(as.matrix(bi_matrix[bi_corpus, ]))
bi_corpus_Fqc <- data.frame(word = names(bi_corpus_Fqc), frequency = bi_corpus_Fqc)
tri_corpus_Fqc <- rowSums(as.matrix(tri_matrix[tri_corpus, ]))
tri_corpus_Fqc <- data.frame(word = names(tri_corpus_Fqc), frequency = tri_corpus_Fqc)
uni_corpus_Fqc_Seq <- arrange(uni_corpus_Fqc, desc(frequency))
uni_corpus_Fqc_Seq <- head(uni_corpus_Fqc_Seq, n = 10)
bi_corpus_Fqc_Seq <- arrange(bi_corpus_Fqc, desc(frequency))
bi_corpus_Fqc_Seq <- head(bi_corpus_Fqc_Seq, n = 10)
tri_corpus_Fqc_Seq <- arrange(tri_corpus_Fqc, desc(frequency))
tri_corpus_Fqc_Seq <- head(tri_corpus_Fqc_Seq, n = 10)
g1 <- ggplot(data = head(uni_corpus_Fqc_Seq,10), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = "grey") +
ggtitle(paste("Unigrams")) +
xlab("Unigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g2 <- ggplot(data = head(bi_corpus_Fqc_Seq,10), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = "pink") +
ggtitle(paste("Bigrams")) +
xlab("Bigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
g3 <- ggplot(data = head(tri_corpus_Fqc_Seq,10), aes(x = reorder(word, -frequency), y = frequency)) +
geom_bar(stat = "identity", fill = "purple") +
ggtitle(paste("Trigrams")) +
xlab("Trigrams") + ylab("Frequency") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
words_blogs <- stri_count_words(blogs)
summary(words_blogs)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
   0.00    9.00   28.00   41.71   60.00 6725.00
words_news <- stri_count_words(news)
summary(words_news)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.0 19.0 32.0 34.6 46.0 1123.0
words_twitter <- stri_count_words(twitter)
summary(words_twitter)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 7.00 12.00 12.75 18.00 47.00
gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
The blogs file is about 248.5 Mb, the news file 19.2 Mb, and the Twitter file 301.4 Mb, but the number of entries differs widely: roughly 900,000 blog entries, about 77,000 news entries, and more than 2,000,000 tweets, reflecting Twitter's 140-character limit. Blogs and news have similar words-per-entry distributions, while Twitter has far more but much shorter entries because of that character limit. Words such as "the" and "and" are the most frequent unigrams, "of the" and "and the" are the most frequent bigrams, and "thanks for the" and "one of the" are the most frequent trigrams.
Building the corpus and the N-gram frequency tables plays a significant role in evaluating word frequencies and in predicting the following word from uni-, bi-, and tri-grams. The Shiny app is intended to run this analysis interactively without the user having to reduce the sample size.
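As an illustration of how these N-gram tables could feed the prediction algorithm, the following is a minimal sketch of a back-off lookup built on the uni_corpus_Fqc, bi_corpus_Fqc, and tri_corpus_Fqc data frames created above (which only contain terms with frequency >= 20). The function name predict_next_word and the back-off order are assumptions made for illustration, not the implementation that will be used in the final Shiny app.
# Minimal sketch of a back-off next-word prediction (assumed design, not the final app):
# try trigrams that start with the last two words typed, fall back to bigrams that
# start with the last word, and return the most frequent completions.
predict_next_word <- function(phrase, n = 3) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  words <- words[words != ""]
  # Trigram lookup: match "w1 w2 ?" using the last two words of the phrase.
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- tri_corpus_Fqc[startsWith(as.character(tri_corpus_Fqc$word),
                                      paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      hits <- hits[order(-hits$frequency), ]
      return(sapply(strsplit(as.character(head(hits$word, n)), " "), tail, 1))
    }
  }
  # Back off to the bigram table: match "w ?" using the last word only.
  if (length(words) >= 1) {
    hits <- bi_corpus_Fqc[startsWith(as.character(bi_corpus_Fqc$word),
                                     paste0(tail(words, 1), " ")), ]
    if (nrow(hits) > 0) {
      hits <- hits[order(-hits$frequency), ]
      return(sapply(strsplit(as.character(head(hits$word, n)), " "), tail, 1))
    }
  }
  # Final fallback: the most frequent unigrams overall.
  as.character(head(arrange(uni_corpus_Fqc, desc(frequency))$word, n))
}
predict_next_word("one of")   # expected to suggest "the" given the trigram counts above
A fuller implementation would work from larger, unfiltered N-gram tables and would typically add a smoothing or weighting scheme (for example Katz back-off) rather than relying on raw counts alone.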