Reading Files

# Blogs source: open a read-only text connection, pull every line into a
# character vector, then release the connection.
blogs <- file(description = "en_US.blogs.txt", open = "r")
blogs_lines <- readLines(con = blogs)
close(con = blogs)

# News source. A text-mode read of this file reported
# "incomplete final line found on 'en_US.news.txt'".
# NOTE(review): presumably the file contains a control character that a
# text-mode connection treats as end-of-file on some platforms, which would
# silently truncate the corpus -- confirm by comparing length(news_lines)
# against the file's real line count.
# Opening the connection in binary mode ("rb") hands the raw bytes to
# readLines(), which accepts LF, CRLF or CR as the line terminator whatever
# mode the connection was opened in.
news <- file("en_US.news.txt", "rb")
news_lines <- readLines(news)
close(news)

# Twitter source. A plain readLines() call on this file warned about embedded
# nul bytes on four lines (167155, 268547, 1274086 and 1759032). By default an
# embedded nul terminates the line being read, so the text after the nul is
# lost. skipNul = TRUE drops the nul bytes and keeps the rest of each line.
twitter <- file("en_US.twitter.txt", "r")
twitter_lines <- readLines(twitter, skipNul = TRUE)
close(twitter)

Cleaning

# Normalise raw text lines and keep only those long enough to yield a trigram.
#
# Steps, in order:
#   1. lower-case the text;
#   2. replace every whitespace-delimited token containing "@", "#",
#      "http://" or "https://" with a single space;
#   3. convert latin1 -> ASCII, dropping anything that cannot be represented;
#   4. delete punctuation and digits;
#   5. discard lines left with fewer than three words.
#
# x: character vector, one line of text per element.
# Returns (visibly) the cleaned character vector; it can be shorter than `x`.
CleanR <- function(x) {
  sampleTxt <- tolower(x)
  sampleTxt <- gsub("([^[:space:]]*)(@|#|http://|https://)([^[:space:]]*)", " ", sampleTxt)
  # sub = "" removes, rather than NA-ing, whatever has no ASCII equivalent.
  sampleTxt <- iconv(sampleTxt, "latin1", "ASCII", sub = "")
  # [[:punct:]] already includes the apostrophe, so no separate pass is needed.
  sampleTxt <- gsub("[[:punct:]]", "", sampleTxt)
  sampleTxt <- gsub("[[:digit:]]", "", sampleTxt)
  # Count whitespace-delimited words per line. At this point the text holds
  # only ASCII with punctuation and digits stripped, so runs of non-space
  # characters are the words.
  word_runs <- regmatches(sampleTxt, gregexpr("[^[:space:]]+", sampleTxt))
  sampleTxt[lengths(word_runs) > 2]
}
# Run each source through the same cleaning routine, overwriting the raw lines.
twitter_lines <- CleanR(x = twitter_lines)
news_lines <- CleanR(x = news_lines)
blogs_lines <- CleanR(x = blogs_lines)

Sampling

# Down-sample each cleaned source without replacement: 50% of blogs,
# 80% of news, 50% of twitter. floor() makes the (otherwise fractional)
# sample size an explicit whole number.
blogs_lines <- sample(blogs_lines, floor(length(blogs_lines) * 0.5))
news_lines <- sample(news_lines, floor(length(news_lines) * 0.8))
# Each source must be drawn from its own lines: sampling twitter from
# blogs_lines would discard every tweet and duplicate blog text instead.
twitter_lines <- sample(twitter_lines, floor(length(twitter_lines) * 0.5))

TriGrams

# Build the trigram frequency table from the three sampled sources and save it.
# NOTE(review): corpus(), tokens(), tokens_select(), tokens_ngrams(), dfm() and
# topfeatures() look like quanteda functions; the library() call is not visible
# in this part of the file -- confirm it is loaded earlier.

# Pool the three sources into one character vector.
allFiles <- c(blogs_lines,news_lines, twitter_lines)
# The result is stored under the same name as the corpus() function it calls.
corpus <- corpus(allFiles)
# Each intermediate object is dropped as soon as the next stage has been built.
remove(blogs_lines, news_lines, twitter_lines)
token <- tokens(corpus, remove_punct = TRUE)
remove(corpus)
# English stopwords are removed BEFORE n-grams are formed.
# NOTE(review): get_word3() does not strip stopwords from the phrase it is
# given, so a phrase ending in stopwords may never match a stored trigram --
# verify this is intended.
toks_nostop <- tokens_select(token, pattern = stopwords("en"), selection = "remove")
remove(token)
# Three-token n-grams; get_word3() looks them up as "w1_w2_w3" names.
ngramTri <- tokens_ngrams(toks_nostop, n = 3)
# Keep at most the 500000 top features of the document-feature matrix.
topNgramTRI <- topfeatures(dfm(ngramTri), 500000)
# Persist the table so it can be reloaded with readRDS() without recomputing.
saveRDS(topNgramTRI, "topNgramTRI.rds")

Read In Top TriGrams

# Load the saved trigram table from "topNgramTRI.rds" in the working directory.
#
# Returns the deserialised object visibly. Nothing is assigned in the global
# environment, so the caller must keep the result, e.g.
#   topNgramTRI <- readIn()
readIn <- function() {
  readRDS("topNgramTRI.rds")
}

# Suggest next words for a phrase by looking up its last two words in a
# trigram table.
#
# x:      a single character string (only the first element is used).
# ngrams: named vector whose names are trigrams written "w1_w2_w3";
#         defaults to the global `topNgramTRI`.
#
# Returns a character vector holding the third word of every trigram whose
# first two words equal the last two words of `x`, in table order. Returns
# character(0) when `x` has fewer than two words or nothing matches.
get_word3 <- function(x, ngrams = topNgramTRI) {
  if (length(x) == 0L) {
    return(character(0))
  }
  # Normalise the phrase the way the stored trigrams were: lower-case, with
  # punctuation and digits deleted. This also keeps user-typed regex
  # metacharacters out of the lookup.
  phrase <- tolower(x[[1]])
  phrase <- gsub("[[:punct:]]|[[:digit:]]", "", phrase)
  words <- strsplit(trimws(phrase), "[[:space:]]+")[[1]]
  words <- words[!is.na(words) & nzchar(words)]
  if (length(words) < 2L) {
    return(character(0))
  }
  # The trailing "_" forces the second word to match in full, so "the cat"
  # does not also pick up "the_cats_..." entries.
  prefix <- paste0(paste(tail(words, 2L), collapse = "_"), "_")
  candidates <- names(ngrams)
  if (is.null(candidates)) {
    return(character(0))
  }
  hits <- candidates[startsWith(candidates, prefix)]
  # What follows the two-word prefix is the predicted word.
  substring(hits, nchar(prefix) + 1L)
}

Coursera Data Science Capstone: Week 4

Shawn

3/6/2022

Reading Files

Cleaning

Sampling