This report describes my exploratory analysis of the text corpus and my goals for the NLP algorithm and app I plan to build.
knitr::opts_chunk$set(cache = TRUE)
library(stringi)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
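# Read the three English source files (blogs, news, Twitter); skipNul = TRUE skips embedded NUL characters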
blogText <- readLines("en_US.blogs.txt", skipNul = TRUE)
newsText <- readLines("en_US.news.txt", skipNul = TRUE, warn = FALSE)
twitterText <- readLines("en_US.twitter.txt", skipNul = TRUE)
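# Approximate the word count of each line: number of runs of non-word characters plus one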
blogw <- sapply(gregexpr("\\W+", blogText), length) + 1
newsw <- sapply(gregexpr("\\W+", newsText), length) + 1
twitterw <- sapply(gregexpr("\\W+", twitterText), length) + 1
words <- c(sum(blogw), sum(newsw), sum(twitterw))
lines <- c(length(blogText), length(newsText), length(twitterText))
textSts <- as.data.frame(rbind(lines, words))
names(textSts) <- c("blogs", "news", "twitter")
textSts
##          blogs    news  twitter
## lines   899288   77259  2360148
## words 39386844 2837489 32874052
Given the line and word counts above, I will work with samples of the corpus files rather than their entirety.
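# Use the first 1,000 lines of each file as a small working sample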
blogText <- blogText[1:1000]
newsText <- newsText[1:1000]
twitterText <- twitterText[1:1000]
#Combining them into one sample
allText <- tolower(c(blogText, newsText, twitterText))
#expanding contractions (irregular forms first, then the general rules)
allText <- gsub("let's","let us",allText)
allText <- gsub("won't","will not",allText)
allText <- gsub("can't","can not",allText)
allText <- gsub("n't"," not",allText)
allText <- gsub("'s","",allText)
allText <- gsub("'d"," would",allText)
allText <- gsub("'ve"," have",allText)
allText <- gsub("'ll"," will",allText)
allText <- gsub("'m"," am",allText)
allText <- gsub("'re"," are",allText)
# Unigrams: look ahead from each delimiter and capture the word that follows (column 3 of each match matrix)
UGRAM <- stri_match_all_regex(allText,"([ ,.!?\"(]|^)(?=([a-zA-Z]+'?[a-zA-Z]*))")
x <- as.character(c())
for (i in 1:length(UGRAM)){
x <- c(x, UGRAM[[i]][,3])
}
x <- unique(x)
y <- is.na(x)
x <- x[y==FALSE]
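# Keep only words longer than two characters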
k = 1
y <- as.character(c())
for (i in 1:length(x)){
if (stri_length(x[i]) > 2){
y[k] <- x[i]
k = k+1
}
}
x <- unique(y)
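# For each word, count how many sample lines contain it (fixed substring match)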
xx <- as.numeric(c())
for (i in 1:length(x)){
y <- grep(x[i],allText, fixed = TRUE)
xx[i] <- length(y)
}
x_wcount <- as.data.frame(cbind(x,xx))
names(x_wcount) <- c("Word", "count")
x_wcount$Word <- as.character(x_wcount$Word)
x_wcount$count <- as.numeric(as.character(x_wcount$count))
x_wcount <- arrange(x_wcount, desc(count))
# Bigrams: look ahead from each delimiter and capture the next two words
TGRAM <- stri_match_all_regex(allText,"([ ,.!?\"(]|^)(?=([a-zA-Z]+'?[a-zA-Z]* [a-zA-Z]+'?[a-zA-Z]*))")
x2 <- as.character(c())
for (i in 1:length(TGRAM)){
x2 <- c(x2, TGRAM[[i]][,3])
}
x2 <- unique(x2)
y <- is.na(x2)
x2 <- x2[y==FALSE]
# Keep only bigrams whose first word is longer than two characters
k = 1
y <- as.character(c())
for (i in 1:length(x2)){
if (as.numeric(stri_locate_first(x2[i], regex = " ")[1,1]) > 2){
y[k] <- x2[i]
k = k+1
}
}
x2 <- unique(y)
# For each bigram, count how many sample lines contain it
xx2 <- as.numeric(c())
for (i in 1:length(x2)){
y2 <- grep(x2[i],allText, fixed = TRUE)
xx2[i] <- length(y2)
}
px_wcount <- as.data.frame(cbind(x2,xx2))
names(px_wcount) <- c("Word", "count")
px_wcount$Word <- as.character(px_wcount$Word)
px_wcount$count <- as.numeric(as.character(px_wcount$count))
px_wcount <- arrange(px_wcount, desc(count))
# Trigrams: look ahead from each delimiter and capture the next three words
THGRAM <- stri_match_all_regex(allText,"([ ,.!?\"(]|^)(?=([a-zA-Z]+'?[a-zA-Z]* [a-zA-Z]+'?[a-zA-Z]* [a-zA-Z]+'?[a-zA-Z]*))")
x3 <- as.character(c())
for (i in 1:length(THGRAM)){
x3 <- c(x3, THGRAM[[i]][,3])
}
x3 <- unique(x3)
y <- is.na(x3)
x3 <- x3[y==FALSE]
# For each trigram, count how many sample lines contain it
xx3 <- as.numeric(c())
for (i in 1:length(x3)){
y3 <- grep(x3[i],allText, fixed = TRUE)
xx3[i] <- length(y3)
}
px3_wcount <- as.data.frame(cbind(x3,xx3))
names(px3_wcount) <- c("Word", "count")
px3_wcount$Word <- as.character(px3_wcount$Word)
px3_wcount$count <- as.numeric(as.character(px3_wcount$count))
px3_wcount <- arrange(px3_wcount, desc(count))
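For reference, the most frequent entries in each table can be listed with head():
head(x_wcount, 10)   # most frequent single words
head(px_wcount, 10)  # most frequent word pairs
head(px3_wcount, 10) # most frequent word triples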
I intend to use this information to help build a Katz back-off model. I will also research ways to reduce the memory required, and investigate why tokenizing with the stringi and quanteda packages produces single-letter words. I ran into these issues while developing this report, and I expect similar problems in my app if I do not find alternative solutions.
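As a rough illustration of where these tables could lead (a minimal sketch only: it uses the raw count tables built above, applies none of the discounting that Katz's method requires, and predictNext is my own placeholder name), a simple back-off lookup might look like this:
# Sketch: predict the next word from the last one or two words typed by trying
# the trigram table first, then the bigram table, then falling back to the
# most frequent unigram. All three tables are already sorted by count.
predictNext <- function(phrase) {
words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
if (length(words) == 2) {
hits <- px3_wcount[grepl(paste0("^", words[1], " ", words[2], " "), px3_wcount$Word), ]
if (nrow(hits) > 0) return(strsplit(hits$Word[1], " ")[[1]][3])
}
hits <- px_wcount[grepl(paste0("^", tail(words, 1), " "), px_wcount$Word), ]
if (nrow(hits) > 0) return(strsplit(hits$Word[1], " ")[[1]][2])
x_wcount$Word[1]
}
For example, predictNext("one of") would return the third word of the highest-count trigram in px3_wcount that begins with "one of", falling back to the bigram and unigram tables when no such trigram exists in the sample.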