# Package used for text classification: quanteda
# Data structure used for data transformation: the corpus class from that package
#install.packages("quanteda")
library (quanteda)
# Read the first 2000 rows of the spam data set
raw_data <- read.csv(
  "/Users/lytran/Desktop/R_cheetsheet/Data for practice/spam_classificiation.csv",
  nrows = 2000
)
# quanteda works on a corpus object, so the message text (column v2) is first
# coerced to character and then wrapped in a corpus
raw_data[["v2"]] <- as.character(raw_data[["v2"]])
mycorp <- corpus(raw_data[["v2"]])
# Inspect the 10th document
mycorp[10]
## text10
## "Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"
# Break each document of the corpus into tokens, dropping punctuation marks
mytoken <- tokens(
  x = mycorp,
  remove_punct = TRUE
)
# Inspect the tokens of the 10th document
mytoken[10]
## tokens from 1 document.
## text10 :
## [1] "Had" "your" "mobile" "11" "months"
## [6] "or" "more" "U" "R" "entitled"
## [11] "to" "Update" "to" "the" "latest"
## [16] "colour" "mobiles" "with" "camera" "for"
## [21] "Free" "Call" "The" "Mobile" "Update"
## [26] "Co" "FREE" "on" "08002986030"
# Clean tokens: normalise capitalisation, then reduce each word to its stem.
# (Punctuation was already removed by tokens(..., remove_punct = TRUE).)
#
# Lower-casing is done BEFORE stemming. When the mixed-case tokens were stemmed
# first, "Update" came back unchanged while "Mobile" became "Mobil", so the
# result of stemming depended on how a word happened to be capitalised and the
# final tokens still contained the unstemmed "update".
newtokens <- tokens_tolower(mytoken)
newtokens[10]
## tokens from 1 document.
## text10 :
## [1] "had" "your" "mobile" "11" "months"
## [6] "or" "more" "u" "r" "entitled"
## [11] "to" "update" "to" "the" "latest"
## [16] "colour" "mobiles" "with" "camera" "for"
## [21] "free" "call" "the" "mobile" "update"
## [26] "co" "free" "on" "08002986030"
newtokens2 <- tokens_wordstem(newtokens)
newtokens2[10]
## Expected output (not re-run; "updat" is what the English Snowball stemmer is
## expected to return for lower-case "update" -- TODO confirm):
## tokens from 1 document.
## text10 :
## [1] "had" "your" "mobil" "11" "month"
## [6] "or" "more" "u" "r" "entitl"
## [11] "to" "updat" "to" "the" "latest"
## [16] "colour" "mobil" "with" "camera" "for"
## [21] "free" "call" "the" "mobil" "updat"
## [26] "co" "free" "on" "08002986030"
# Reference: https://rstudio-pubs-static.s3.amazonaws.com/381321_188aaabd730f4e42a7dda6da5b9f8652.html