Capstone Project Assignment Week 2

Overview

The goal of this project is just to display that you've gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs http://rpubs.com/ that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.

The motivation for this project is to: 1. Demonstrate that you've downloaded the data and have successfully loaded it in. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that you amassed so far. 4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

library(wordcloud)
library(dplyr)
library(ggplot2)
library(tm)
library(stopwords)
library(quanteda)

Loading Data

# Relative file names used later in this report (e.g. "en_US.blogs.txt")
# are resolved against this directory.
setwd("D:/Coursera/DataScience Spe/Capston Project/en_US")

##The data we will be using in this case, is the En_US text data

        # Read every line of a text file through a connection opened in
        # binary mode ("rb"). A text-mode connection on Windows treats a
        # Ctrl-Z (0x1A) byte as end-of-file, so readLines() would silently
        # stop early on a file that contains one.
        read_text_lines <- function(path) {
                con <- file(path, open = "rb")
                # Release the connection even if readLines() fails.
                on.exit(close(con), add = TRUE)
                readLines(con, encoding = "UTF-8", skipNul = TRUE)
        }

        d_blogs <- read_text_lines("D:/Coursera/DataScience Spe/Capston Project/en_US/en_US.blogs.txt")
        d_news <- read_text_lines("D:/Coursera/DataScience Spe/Capston Project/en_US/en_US.news.txt")
        d_tw <- read_text_lines("D:/Coursera/DataScience Spe/Capston Project/en_US/en_US.twitter.txt")

Statistics

Next, we briefly show a basic summary of the data we are analyzing.

# Summarise one text file.
#
# Arguments:
#   lines  character vector holding the lines of the file
#   t_fl   path of the file on disk (used for the size and as the label)
#
# Returns a character vector of length 5:
#   file name, size in MB (two decimals), number of lines,
#   index of the line with the most characters (NA when there are no
#   lines), and the total number of whitespace-separated words.
d_stat <- function(lines, t_fl) {
        size_mb <- file.size(t_fl) / (1024 * 1024)
        size_label <- format(round(as.double(size_mb), 2), nsmall = 2)

        # nchar() is vectorised, so no per-line lapply() is needed.
        longest_line <- which.max(nchar(lines))
        # which.max() returns integer(0) for empty input; keep the result
        # at a fixed length of 5 so it still fits one row of a table.
        if (length(longest_line) == 0L) {
                longest_line <- NA_integer_
        }

        # Trim first: a line with leading whitespace would otherwise yield
        # an empty first token and be over-counted by one word.
        word_count <- sum(lengths(strsplit(trimws(lines), "\\s+")))

        c(t_fl, size_label, length(lines), longest_line, word_count)
}
        
        # One summary vector per source file.
        d_blogs_stat <- d_stat(d_blogs, "en_US.blogs.txt")
        d_news_stat <- d_stat(d_news, "en_US.news.txt")
        d_tw_stat <- d_stat(d_tw, "en_US.twitter.txt")

        # Concatenate the three vectors, lay them out one file per row,
        # and label the columns before printing the table.
        s_dataf <- c(d_blogs_stat, d_news_stat, d_tw_stat)
        s_table <- setNames(
                data.frame(matrix(unlist(s_dataf), nrow = 3, byrow = TRUE)),
                c("Name", "Size(MB)", "Line Count", "Line", "Words Count")
        )
        s_table

##                Name Size(MB) Line Count   Line Words Count
## 1   en_US.blogs.txt   200.42     899288 483415    37334131
## 2    en_US.news.txt   196.28      77259  14556     2643969
## 3 en_US.twitter.txt   159.36    2360148     26    30373583

Sampling Data, subset testing data

For testing purposes, we subset each data set to a small random sample of exactly 8% of its lines.

# Fix the random seed so the same 8% samples, and every table and chart
# derived from them, are reproduced on each run of the report.
set.seed(1234)
sample_fraction <- 0.080

# floor() makes the truncation of the fractional sample size explicit.
d_blogs_test <- sample(d_blogs, floor(length(d_blogs) * sample_fraction), replace = FALSE)
        d_news_test <- sample(d_news, floor(length(d_news) * sample_fraction), replace = FALSE)
        d_tw_test <- sample(d_tw, floor(length(d_tw) * sample_fraction), replace = FALSE)

Creating Functions for Cleaning and Exploratory Data Analysis

For the exploratory analysis it is important to build a corpus, clean it, and establish our goal. In this case we write and run three functions in order to identify the top 10 most frequently used words in each data set.

        # Build a tm corpus from a character vector of text lines.
        # All lines are joined with single spaces into one string, so the
        # resulting corpus holds a single document. The corpus is returned
        # invisibly.
        corpus_data<- function(t_fl) {
                joined_text <- paste(t_fl, collapse = " ")
                invisible(Corpus(VectorSource(joined_text)))
        }
        # Normalise a tm corpus for word counting.
        #
        # Argument:
        #   cp_d  a tm corpus
        #
        # Steps, in order: drop digits, lower-case, drop punctuation,
        # collapse runs of whitespace, convert to plain-text documents.
        # Returns the transformed corpus.
        cleaning <- function(cp_d) {
                cp_d <- tm_map(cp_d, removeNumbers)
                # tolower() is a base function, not a tm transformation, so
                # it is wrapped in content_transformer() to keep the
                # documents valid tm text documents.
                cp_d <- tm_map(cp_d, content_transformer(tolower))
                cp_d <- tm_map(cp_d, removePunctuation)
                cp_d <- tm_map(cp_d, stripWhitespace)
                cp_d <- tm_map(cp_d, PlainTextDocument)
                return (cp_d)
        }
        # Word-frequency table for a corpus.
        #
        # Argument:
        #   cp_d  a tm corpus
        #
        # Returns a data frame sorted by decreasing frequency, with the
        # terms as row names and two columns: FQ (total count of the term
        # across the corpus) and Word (the term itself).
        hg_fq_w <- function (cp_d) {
                # Column sums of the dense document-term matrix give the
                # total count of each term.
                term_totals <- colSums(as.matrix(DocumentTermMatrix(cp_d)))
                freq_table <- as.data.frame(sort(term_totals, decreasing = TRUE))
                freq_table[["word"]] <- rownames(freq_table)
                names(freq_table) <- c("FQ", "Word")
                freq_table
        }

Most frequent words

        # For each sample: build the corpus, clean it, then count words.

        ###en_US.blogs.txt
        corp_blogs <- cleaning(corpus_data(d_blogs_test))
        blogs_hg_fq_w <- hg_fq_w(corp_blogs)

        ###en_US.news.txt
        corp_news <- cleaning(corpus_data(d_news_test))
        news_hg_fq_w <- hg_fq_w(corp_news)

        ###en_US.twitter.txt
        corp_tw <- cleaning(corpus_data(d_tw_test))
        tw_hg_fq_w <- hg_fq_w(corp_tw)

        ###en_US.blogs.txt
        # head() returns at most ten rows; indexing with [1:10, ] would pad
        # the table with NA rows if fewer than ten words were found.
        head(blogs_hg_fq_w, 10)

##          FQ Word
## the  147330  the
## and   86527  and
## that  36589 that
## for   28845  for
## you   23527  you
## with  22790 with
## was   21950  was
## this  20554 this
## have  17478 have
## but   16073  but

        ###en_US.news.txt
        # head() returns at most ten rows; indexing with [1:10, ] would pad
        # the table with NA rows if fewer than ten words were found.
        head(news_hg_fq_w, 10)

##         FQ Word
## the  12085  the
## and   5389  and
## for   2123  for
## that  2103 that
## with  1581 with
## said  1546 said
## was   1339  was
## his   1004  his
## from   946 from
## but    943  but

        ###en_US.twitter.txt
        # head() returns at most ten rows; indexing with [1:10, ] would pad
        # the table with NA rows if fewer than ten words were found.
        head(tw_hg_fq_w, 10)

##         FQ Word
## the  75151  the
## you  43620  you
## and  34806  and
## for  30484  for
## that 18793 that
## with 14082 with
## your 13669 your
## have 13451 have
## this 12888 this
## are  12514  are

Charts and Wordcloud

As part of our analysis, it is important to show our findings graphically so that the reader can understand the content more easily.

        # Bar chart of the first `n_words` rows of a word-frequency table
        # with columns FQ (count) and Word (term), titled `chart_title`.
        # head() is used so a table with fewer rows is not padded with NAs.
        plot_top_words <- function(freq_table, chart_title, n_words = 10) {
                ggplot(data = head(freq_table, n_words),
                       aes(x = FQ, y = Word, colour = Word)) +
                        geom_bar(stat = "identity", fill = "white") +
                        labs(title = chart_title)
        }

        # Word cloud of the first `n_words` rows of the same kind of table,
        # coloured with the 8-colour brewer.pal() palette `palette_name`.
        plot_word_cloud <- function(freq_table, palette_name, n_words = 100) {
                top_rows <- head(freq_table, n_words)
                invisible(wordcloud(top_rows$Word, top_rows$FQ,
                                    colors = brewer.pal(8, palette_name)))
        }

        ###en_US.blogs.txt
        chart_blogs <- plot_top_words(blogs_hg_fq_w, "Top 10 US Blogs most Frequent Words")
        chart_blogs

        plot_word_cloud(blogs_hg_fq_w, "Set1")

        ###en_US.news.txt
        chart_news <- plot_top_words(news_hg_fq_w, "Top 10 US News most Frequent Words")
        chart_news

        plot_word_cloud(news_hg_fq_w, "Set2")

        ###en_US.twitter.txt
        chart_tw <- plot_top_words(tw_hg_fq_w, "Top 10 US Twitter most Frequent Words")
        chart_tw

        plot_word_cloud(tw_hg_fq_w, "Set3")

NGrams models

We will show how to build a basic n-gram model. Using the exploratory analysis we performed, building this basic n-gram model will help us predict the next word based on the previous 1, 2, or 3 words.

 ###Generating tokens
        # Tokenise raw text into word tokens with numbers, punctuation,
        # separators and symbols removed, then lower-case the tokens and
        # drop every token matching the default stopwords() list.
        build_tokens <- function(text_lines) {
                word_tokens <- tokens(text_lines, what = "word",
                                      remove_numbers = TRUE,
                                      remove_punct = TRUE,
                                      remove_separators = TRUE,
                                      remove_symbols = TRUE)
                tokens_select(tokens_tolower(word_tokens), stopwords(),
                              selection = "remove")
        }

        d_blogs_token <- build_tokens(d_blogs_test)
        d_news_token <- build_tokens(d_news_test)
        d_tw_token <- build_tokens(d_tw_test)
        
        ###Building Uni-gram model
        # The token objects are already lower-cased and free of stopwords
        # and punctuation, so dfm() needs no removal arguments here; passing
        # `remove` / `remove_punct` to dfm() is deprecated for tokens input.

        ###en_US.blogs.txt
        d_blog_uni <- tokens_ngrams(d_blogs_token, n = 1)
        df_blog_uni <- dfm(d_blog_uni, tolower = TRUE)

        ###en_US.news.txt
        d_news_uni <- tokens_ngrams(d_news_token, n = 1)
        df_news_uni <- dfm(d_news_uni, tolower = TRUE)

        ###en_US.twitter.txt
        d_tw_uni <- tokens_ngrams(d_tw_token, n = 1)
        df_tw_uni <- dfm(d_tw_uni, tolower = TRUE)
        

        ###Building Bi-gram model
        # The token objects are already lower-cased and free of stopwords
        # and punctuation, so dfm() needs no removal arguments here; passing
        # `remove` / `remove_punct` to dfm() is deprecated for tokens input.

        ###en_US.blogs.txt
        d_blog_bi <- tokens_ngrams(d_blogs_token, n = 2)
        df_blog_bi <- dfm(d_blog_bi, tolower = TRUE)

        ###en_US.news.txt
        d_news_bi <- tokens_ngrams(d_news_token, n = 2)
        df_news_bi <- dfm(d_news_bi, tolower = TRUE)

        ###en_US.twitter.txt
        d_tw_bi <- tokens_ngrams(d_tw_token, n = 2)
        df_tw_bi <- dfm(d_tw_bi, tolower = TRUE)
        
        ###Building Tri-gram model
        # The token objects are already lower-cased and free of stopwords
        # and punctuation, so dfm() needs no removal arguments here; passing
        # `remove` / `remove_punct` to dfm() is deprecated for tokens input.

        ###en_US.blogs.txt
        d_blog_tri <- tokens_ngrams(d_blogs_token, n = 3)
        df_blog_tri <- dfm(d_blog_tri, tolower = TRUE)

        ###en_US.news.txt
        d_news_tri <- tokens_ngrams(d_news_token, n = 3)
        df_news_tri <- dfm(d_news_tri, tolower = TRUE)

        ###en_US.twitter.txt
        d_tw_tri <- tokens_ngrams(d_tw_token, n = 3)
        df_tw_tri <- dfm(d_tw_tri, tolower = TRUE)

        ###Top 15 Unigrams
        # Fifteen highest-count features of the blogs unigram matrix.
        topfeatures(df_blog_uni, n = 15)

##    one   like   just    can   time    get people   know    now   also    new 
##  10071   8007   7911   7764   7147   5626   4809   4803   4685   4382   4372 
##   even   make    day   good 
##   4185   4069   4026   4001

        # Fifteen highest-count features of the news unigram matrix.
        topfeatures(df_news_uni, n = 15)

##   said    one    new    can   also   year  first   last  state    two  years 
##   1549    476    423    383    372    370    347    333    325    319    316 
## people   time   just   like 
##    313    302    292    283

        # Fifteen highest-count features of the Twitter unigram matrix.
        topfeatures(df_tw_uni, n = 15)

##   just   like    get   love   good    can     rt thanks    day    now    one 
##  12134   9759   8915   8570   8050   7165   7159   7126   7091   6625   6578 
##   know      u   time  great 
##   6340   6139   6006   5931

        ###Top 15 Bigrams
        # Fifteen highest-count features of the blogs bigram matrix.
        topfeatures(df_blog_bi, n = 15)

##   years_ago    new_york   right_now even_though   feel_like   last_year 
##         446         398         391         367         360         335 
##     can_see  first_time   make_sure   last_week     can_get  last_night 
##         328         326         308         280         263         258 
##   every_day     one_day high_school 
##         251         251         244

        # Fifteen highest-count features of the news bigram matrix.
        topfeatures(df_news_bi, n = 15)

##     last_year      new_york   high_school      st_louis    new_jersey 
##            94            83            59            53            48 
##     years_ago united_states san_francisco   los_angeles     last_week 
##            47            39            35            32            32 
##   even_though     two_years    last_month    first_time     right_now 
##            31            30            28            26            25

        # Fifteen highest-count features of the Twitter bigram matrix.
        topfeatures(df_tw_bi, n = 15)

##       right_now      last_night looking_forward    good_morning  happy_birthday 
##            1351             909             776             667             647 
##       feel_like        just_got       good_luck     follow_back   thanks_follow 
##             568             556             535             495             492 
##      looks_like        let_know         can_get       next_week   please_follow 
##             488             457             444             393             337

        ###Top 15 Trigrams
        # Fifteen highest-count features of the blogs trigram matrix.
        topfeatures(df_blog_tri, n = 15)

##        new_york_city       new_york_times  amazon_services_llc 
##                   67                   49                   40 
##  services_llc_amazon        llc_amazon_eu     couple_weeks_ago 
##                   40                   40                   39 
##        two_years_ago         world_war_ii       many_years_ago 
##                   36                   26                   25 
##         new_york_n.y          let_us_know       happy_new_year 
##                   25                   25                   24 
## preheat_oven_degrees    several_years_ago        two_weeks_ago 
##                   23                   23                   23

        # Fifteen highest-count features of the news trigram matrix.
        topfeatures(df_news_tri, n = 15)

##          new_york_city   pates_fountain_parks classic_pates_fountain 
##                     12                     11                      8 
##        st_louis_county     gov_chris_christie president_barack_obama 
##                      7                      7                      7 
##            12u_14u_16u         new_york_times assistant_u.s_attorney 
##                      7                      6                      6 
##            run_run_run         past_two_years      u.s_supreme_court 
##                      6                      5                      5 
##            10u_12u_14u            14u_16u_18u   per_serving_calories 
##                      5                      5                      5

        # Fifteen highest-count features of the Twitter trigram matrix.
        topfeatures(df_tw_tri, n = 15)

##            let_us_know         happy_new_year      happy_mothers_day 
##                    161                    151                    125 
##     happy_mother's_day looking_forward_seeing          cinco_de_mayo 
##                    113                     83                     75 
##          just_got_back  good_morning_everyone          cant_wait_see 
##                     60                     45                     44 
##    thanks_following_us          just_got_home     please_follow_back 
##                     44                     43                     42 
##            come_see_us     follow_back_please         keep_good_work 
##                     41                     40                     39

Interesting findings that we amassed so far

While researching how to build this model, I found several packages that could help, and looked for the most efficient way to create the n-gram prediction models. Among my findings was the quanteda package, which really helped me with all of the analysis of the textual data. You can find its general description and functions at the link below: https://tutorials.quanteda.io/

Plans for creating a prediction algorithm and Shiny app

Further exploration of n-gram models in order to obtain optimized code for running in the Shiny app, taking into account its limitations.