The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.

The motivation for this project is to:

  1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings that you amassed so far.
  4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Tasks (review criteria)

  1. Does the link lead to an HTML page describing the exploratory analysis of the training data set?
  2. Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?
  3. Has the data scientist made basic plots, such as histograms to illustrate features of the data?
  4. Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# textdata: Reference Class that profiles one plain-text corpus file.
#
# Creating an object with a file path eagerly computes the line count, the
# length of the longest line, a tokenised word table and a top-20 word plot.
# The optional search helpers (wordDiv, wordSearch, statementCount) return
# NULL until they are called with explicit patterns.
#
# Fields
#   filepath        path of the text file being profiled
#   linecount       number of newline bytes found in the file
#   charcount       length, in characters, of the longest line that was read
#   wordratio       result of the last wordDiv() call (NULL until then)
#   wordsearchres   lines returned by the last wordSearch() call
#   statementcount  line numbers returned by the last statementCount() call
#   dfwrangle       one-row-per-word tibble produced by datawrangle()
#   wordplot        ggplot object produced by wordPlot()
#
# Methods whose first argument is `.self` are "external" reference methods:
# R supplies the object automatically, so callers never pass it themselves.
textdata <- setRefClass(
  "textdata",
  fields = list(
    filepath = "character",
    linecount = "ANY",
    charcount = "ANY",
    wordratio = "ANY",
    wordsearchres = "ANY",
    statementcount = "ANY",
    dfwrangle = "ANY",
    wordplot = "ANY"
  ),
  methods = list(
    # Build the full profile for `filepath`.
    initialize = function(filepath) {
      # Reference Class machinery (for example $copy()) creates objects with
      # no arguments; leave every field unset in that case instead of failing.
      if (missing(filepath)) {
        return(invisible(.self))
      }
      .self$filepath <- filepath
      .self$linecount <- .self$lineCounter()
      .self$charcount <- .self$charCounter(countbreak = 0)
      .self$wordratio <- .self$wordDiv()
      .self$wordsearchres <- .self$wordSearch()
      .self$statementcount <- .self$statementCount()
      .self$dfwrangle <- .self$datawrangle()
      .self$wordplot <- .self$wordPlot()
      invisible(.self)
    },

    # Helper: read up to `n` lines (all of them by default) as a character
    # vector. The connection is opened in binary mode so every method reads
    # the file the same way, embedded NUL bytes are skipped rather than
    # truncating the line, and the connection is closed even on error.
    readFileLines = function(.self, n = -1L) {
      con <- file(.self$filepath, open = "rb")
      on.exit(close(con), add = TRUE)
      readLines(con, n = n, skipNul = TRUE)
    },

    # Count newline bytes by streaming the file in 64 KiB raw chunks.
    # Returns an integer. A final line without a trailing newline is not
    # counted.
    lineCounter = function(.self) {
      con <- file(.self$filepath, open = "rb")
      on.exit(close(con), add = TRUE)
      newline <- as.raw(10L)
      nlines <- 0L
      while (length(chunk <- readBin(con, "raw", 65536)) > 0) {
        nlines <- nlines + sum(chunk == newline)
      }
      nlines
    },

    # Length, in characters, of the longest line.
    #   countbreak  NULL: do nothing and return NULL.
    #               positive whole number: inspect only that many leading
    #               lines. Any other value (including 0): inspect every line.
    # Stores the result in the `charcount` field and returns it.
    charCounter = function(.self, countbreak = NULL) {
      if (is.null(countbreak)) {
        return(NULL)
      }
      limited <- countbreak >= 1 &&
        countbreak == round(countbreak) &&
        countbreak <= .Machine$integer.max
      # Reading only the requested prefix avoids loading the whole file.
      text_lines <- .self$readFileLines(n = if (limited) countbreak else -1L)
      # The leading 0L keeps the result at 0 for an empty file.
      longest <- max(0L, nchar(text_lines))
      .self$charcount <- longest
      longest
    },

    # Ratio of the number of lines matching `word1` to the number of lines
    # matching `word2` (both are regular expressions passed to grep()).
    # Returns NULL unless both patterns are supplied. The ratio is Inf or NaN
    # when `word2` matches no line. Stores the result in `wordratio`.
    wordDiv = function(.self, word1 = NULL, word2 = NULL) {
      if (is.null(word1) || is.null(word2)) {
        return(NULL)
      }
      text_lines <- .self$readFileLines()
      ratio <- length(grep(word1, text_lines)) / length(grep(word2, text_lines))
      .self$wordratio <- ratio
      ratio
    },

    # Print and return every line matching the regular expression `word`.
    # Returns NULL when no pattern is supplied. Stores the result in
    # `wordsearchres`.
    wordSearch = function(.self, word = NULL) {
      if (is.null(word)) {
        return(NULL)
      }
      hits <- grep(word, .self$readFileLines(), value = TRUE)
      print(hits)
      .self$wordsearchres <- hits
      hits
    },

    # Line numbers of every line matching the regular expression `statement`
    # (take length() of the result for a frequency). Returns NULL when no
    # pattern is supplied. Stores the result in `statementcount`.
    statementCount = function(.self, statement = NULL) {
      if (is.null(statement)) {
        return(NULL)
      }
      hit_lines <- grep(statement, .self$readFileLines())
      .self$statementcount <- hit_lines
      hit_lines
    },

    # Tokenise the file into one row per word (columns: line, word, source),
    # dropping tokens that contain a digit and the words listed in
    # `stop_words`.
    datawrangle = function(.self) {
      text_lines <- .self$readFileLines()
      # seq_along() stays empty for an empty file, unlike 1:length().
      tibble(line = seq_along(text_lines), text = text_lines) %>%
        unnest_tokens(word, text) %>%
        mutate(source = toString(.self$filepath)) %>%
        filter(!grepl("[0-9]", word)) %>%
        anti_join(stop_words, by = "word")
    },

    # Horizontal bar chart of the 20 most frequent words in `dfwrangle`.
    # Returns the ggplot object.
    wordPlot = function(.self) {
      top_words <- .self$dfwrangle %>%
        count(word, sort = TRUE) %>%
        head(n = 20) %>%
        mutate(word = reorder(word, n))

      ggplot(top_words, aes(x = n, y = word)) +
        geom_col(fill = "#00abff") +
        theme_bw() +
        ggtitle("Most frequently used words in the English language")
    }
  )
)

# Stack three word tables (tibbles or data frames with the same columns) into
# one, keeping the rows of tib1 first, then tib2, then tib3.
# Returns the combined table.
combineTibbles <- function(tib1, tib2, tib3) {
  # Return the result directly; the original only returned it invisibly as
  # the side effect of assigning to an unused local.
  rbind(tib1, tib2, tib3)
}

Read data and instantiate objects containing results

Before reading the files, create a directory named “final”, then download the data set and extract it into that directory.

# Locations of the three English corpus files inside the extracted archive.
blogsfile <- "./final/en_us/en_US.blogs.txt"
newsfile <- "./final/en_us/en_US.news.txt"
twitfile <- "./final/en_us/en_US.twitter.txt"

# Each constructor call reads its file and computes the line count, the
# longest line length, the word table and the word plot.
blogsinfo <- textdata(blogsfile)
newsinfo <- textdata(newsfile)
twitinfo <- textdata(twitfile)
## Warning in readLines(.self$filepath): line 167155 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 268547 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 1274086 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 1759032 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 167155 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 268547 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 1274086 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 1759032 appears to contain an
## embedded nul

Summary for blogs file

# charcount holds the length of the longest line in characters, not a word
# count, so it is labelled accordingly.
print("Longest line (characters): ")
## [1] "Longest line (characters): "
print(blogsinfo$charcount)
## [1] 40833
print("Line Count: ")
## [1] "Line Count: "
print(blogsinfo$linecount)
## [1] 899288

Blogs Plot

# Show the top-20 word-frequency chart stored when blogsinfo was created.
blogsinfo$wordplot

Summary for news file

  • The longest line is 11384 characters long.
  • The total number of lines is 1010242.
# charcount holds the length of the longest line in characters, not a word
# count, so it is labelled accordingly.
print("Longest line (characters): ")
## [1] "Longest line (characters): "
print(newsinfo$charcount)
## [1] 11384
print("Line Count: ")
## [1] "Line Count: "
print(newsinfo$linecount)
## [1] 1010242

News Word Plot

# Show the top-20 word-frequency chart stored when newsinfo was created.
newsinfo$wordplot

Summary for twitter

# charcount holds the length of the longest line in characters, not a word
# count, so it is labelled accordingly.
print("Longest line (characters): ")
## [1] "Longest line (characters): "
print(twitinfo$charcount)
## [1] 140
print("Line Count: ")
## [1] "Line Count: "
print(twitinfo$linecount)
## [1] 2360148

twitter plot

# Show the top-20 word-frequency chart stored when twitinfo was created.
twitinfo$wordplot

Twitter Word Frequency

Summary for All Source Combined

# Merge the three per-source word tables into one.
dfbind <- combineTibbles(blogsinfo$dfwrangle, newsinfo$dfwrangle, twitinfo$dfwrangle)
# Reuse an existing object as the carrier for the combined word table.
# NOTE(review): textdata is a Reference Class, so this assignment creates an
# alias, not a copy -- the next line also replaces newsinfo$dfwrangle, and the
# other fields (linecount, charcount, wordplot) still describe the news file.
updatetext <- newsinfo
updatetext$dfwrangle <- dfbind


# Report statistics for all three sources together. updatetext only carries
# the news file's own counts, so combine the per-file values instead:
# the longest line overall and the total number of lines.
print("Longest line (characters): ")
## [1] "Longest line (characters): "
print(max(blogsinfo$charcount, newsinfo$charcount, twitinfo$charcount))
## [1] 40833
print("Line Count: ")
## [1] "Line Count: "
print(sum(blogsinfo$linecount, newsinfo$linecount, twitinfo$linecount))
## [1] 4269678

All Source Word Frequency

# Rebuild the top-20 chart from the combined word table now held in dfwrangle
# (the stored wordplot field still shows the news file alone).
updatetext$wordPlot()