The goal of this project is to demonstrate that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.
library(methods)   # setRefClass()
library(tibble)    # tibble()
library(tidytext)  # unnest_tokens(), stop_words
library(ggplot2)   # ggplot(), geom_col()
library(dplyr)     # filter(), mutate(), count(), anti_join(), %>%
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##     filter, lag
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union
textdata <- setRefClass("textdata",
fields = list(filepath="character", linecount="ANY", charcount="ANY", wordratio="ANY", wordsearchres="ANY", statementcount="ANY", dfwrangle="ANY", wordplot="ANY"),
methods = list(
initialize = function(filepath) {
.self$filepath <- filepath
.self$linecount <- .self$lineCounter()
.self$charcount <- .self$charCounter(countbreak = 0)
.self$wordratio <- .self$wordDiv()
.self$wordsearchres <- .self$wordSearch()
.self$statementcount <- .self$statementCount()
.self$dfwrangle <- .self$datawrangle()
.self$wordplot <- .self$wordPlot()
},
lineCounter = function(.self){
f <- file(.self$filepath, open="rb")
nlines <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0){
nlines <- nlines + sum(chunk == as.raw(10L))
}
close(f)
sprintf("The file has %s lines.", nlines)
return(nlines)
}
,
charCounter = function(.self,countbreak=NULL){
if (is.null(countbreak)==TRUE){
return(NULL)
} else {
counter = 0
charct = 0L
for (line in readLines(.self$filepath)){
#print(line)
current <- nchar(line)
if (current > charct){
charct = current
}
counter = counter + 1
if (counter == countbreak & countbreak !=0){
break
}
}
.self$charcount = charct
return(charct)
}
}
,
wordDiv = function(.self,word1=NULL,word2=NULL){
if (is.null(word1)==TRUE || is.null(word2)==TRUE){
return(NULL)
} else {
f <- file(.self$filepath, open="rb")
filelines <- readLines(f)
div <- length(grep(word1, filelines))/length(grep(word2, filelines))
#print(div)
close(f)
sprintf("We get %s",div)
.self$wordratio = div
return(div)
}
}
,
wordSearch = function(.self,word=NULL){
if (is.null(word)==TRUE){
return(NULL)
} else {
f <- file(.self$filepath, open="rb")
filelines <- readLines(f)
res <- grep(word, filelines, value = T)
close(f)
print(res)
.self$wordsearchres = res
return(res)
}
}
,
statementCount = function(.self,statement=NULL){
if (is.null(statement)==TRUE){
return(NULL)
} else{
f <- file(.self$filepath, open="rb")
filelines <- readLines(f)
lns <- grep(statement, filelines)
close(f)
sprintf("Frequency of statement %s", length(lns))
.self$statementcount = lns
return(lns)
}
}
,
datawrangle = function(.self){
strfile <- readLines(.self$filepath)
df_source <- tibble(line = 1:length(strfile), text = strfile)
df_source_unnest <- df_source %>%
unnest_tokens(word, text) %>%
mutate(source = toString(.self$filepath))
#Clean out numbers
df_source_unnest <- df_source_unnest %>% filter(!grepl("[0-9]", word))
df <- df_source_unnest
df <- df %>% anti_join(stop_words, by = "word")
return(df)
}
,
wordPlot = function(.self){
df <- .self$dfwrangle
df_wordcount <- df %>% count(word, sort = TRUE)
df_wordcount %>% head(n = 20) %>% mutate(word = reorder(word, n)) %>%
ggplot(aes(x = n, y = word)) +
geom_col(fill = "#00abff") +
theme_bw() +
ggtitle("Most frequently used words in the English language")
}
)
)
combineTibbles <- function(tib1, tib2, tib3) {
  # Stack the three token data frames into one combined corpus.
  rbind(tib1, tib2, tib3)
}
Before reading the files, create a directory named “final”, then download the course corpus archive and extract it into that directory.
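The same setup step can be scripted. The snippet below is only a sketch: corpus_url is a placeholder, since the actual download link is not reproduced here.

# Sketch of the setup step; corpus_url is a placeholder, not the real link.
corpus_url <- "<URL of the course corpus archive>"
if (!dir.exists("./final")) dir.create("./final")
zipfile <- "./corpus.zip"
if (!file.exists(zipfile)) download.file(corpus_url, destfile = zipfile, mode = "wb")
unzip(zipfile, exdir = ".")  # adjust exdir so the en_US files end up under ./final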
blogsfile = "./final/en_us/en_US.blogs.txt"
newsfile = "./final/en_us/en_US.news.txt"
twitfile = "./final/en_us/en_US.twitter.txt"
blogsinfo <- textdata(blogsfile)
newsinfo <- textdata(newsfile)
twitinfo <- textdata(twitfile)
## Warning in readLines(.self$filepath): line 167155 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 268547 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 1274086 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 1759032 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 167155 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 268547 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 1274086 appears to contain an
## embedded nul
## Warning in readLines(.self$filepath): line 1759032 appears to contain an
## embedded nul
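The warnings above come from embedded NUL bytes in en_US.twitter.txt and do not affect the counts. One possible refinement, not applied in the class above, is to have readLines() skip the NULs explicitly:

# Optional refinement (not used above): drop embedded NUL bytes while reading.
filelines <- readLines(twitfile, skipNul = TRUE)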
## Summary for blogsinfo
blogsinfo$charcount
blogsinfo$linecount
print("Longest line (characters): ")
## [1] "Longest line (characters): "
print(blogsinfo$charcount)
## [1] 40833
print("Line Count: ")
## [1] "Line Count: "
print(blogsinfo$linecount)
## [1] 899288
blogsinfo$wordplot
## Summary for newsinfo
newsinfo$charcount
newsinfo$linecount
print("Longest line (characters): ")
## [1] "Longest line (characters): "
print(newsinfo$charcount)
## [1] 11384
print("Line Count: ")
## [1] "Line Count: "
print(newsinfo$linecount)
## [1] 1010242
newsinfo$wordplot
## Summary for twitinfo
twitinfo$charcount
twitinfo$linecount
print("Longest line (characters): ")
## [1] "Longest line (characters): "
print(twitinfo$charcount)
## [1] 140
print("Line Count: ")
## [1] "Line Count: "
print(twitinfo$linecount)
## [1] 2360148
twitinfo$wordplot
# Merge the token data frames from the three sources
dfbind <- combineTibbles(blogsinfo$dfwrangle, newsinfo$dfwrangle, twitinfo$dfwrangle)
## Copy an instantiated class
updatetext <- newsinfo$copy()  # copy() gives an independent object; plain assignment would alias newsinfo
updatetext$dfwrangle <- dfbind
updatetext$charcount
updatetext$linecount
print("Longest line (characters): ")
## [1] "Longest line (characters): "
print(updatetext$charcount)
## [1] 11384
print("Line Count: ")
## [1] "Line Count: "
print(updatetext$linecount)
## [1] 1010242
updatetext$wordPlot()
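Looking ahead to the prediction algorithm, the same tidytext pipeline extends naturally from single words to n-grams. The sketch below is illustrative only: it reuses the packages loaded above, assumes the blogs file fits in memory, and produces the kind of bigram frequency table that the eventual next-word model and Shiny app would be built on.

# Illustrative sketch (not part of the analysis above): bigram frequencies
# as a starting point for next-word prediction.
blogs_lines <- readLines(blogsfile, skipNul = TRUE)
bigram_counts <- tibble(line = seq_along(blogs_lines), text = blogs_lines) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)
head(bigram_counts, 10)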