Text Mining

Tokenizing

Tokenizing splits a text string into individual terms (tokens). The tm package ships several tokenizers, and you can also supply your own.

library(tm)
## Loading required package: NLP
Text=c("We will never know the real answer, before you try")
MC_tokenizer(Text)
##  [1] "We"     "will"   "never"  "know"   "the"    "real"   "answer"
##  [8] ""       "before" "you"    "try"
scan_tokenizer(Text)
##  [1] "We"      "will"    "never"   "know"    "the"     "real"    "answer,"
##  [8] "before"  "you"     "try"
# custom tokenizer: split on runs of whitespace (punctuation stays attached)
strsplit_space_tokenizer <- function(x) unlist(strsplit(x, "[[:space:]]+"))
strsplit_space_tokenizer(Text)
##  [1] "We"      "will"    "never"   "know"    "the"     "real"    "answer,"
##  [8] "before"  "you"     "try"

n-Gram Tokenizing

An n-gram tokenizer returns every run of n consecutive words instead of single words. RWeka's NGramTokenizer needs a working Java installation (used through rJava).

library(tm)
if (Sys.getenv("JAVA_HOME") != "")
  Sys.setenv(JAVA_HOME = "")  # clear a preset JAVA_HOME so rJava can locate the JVM on its own
# rJava (and therefore RWeka) requires a Java runtime to be installed
library(rJava)
library(RWeka)
Text=c("We will never know the real answer, before you try")
NGramTokenizer(Text, Weka_control(min = 2, max = 2))  # bigrams only: min = max = 2
## [1] "We will"       "will never"    "never know"    "know the"     
## [5] "the real"      "real answer"   "answer before" "before you"   
## [9] "you try"