Example of email spam classification

Packages:

  • Used for text classification: quanteda

  • Used for data transformation: the corpus object provided by that package (quanteda)

# install.packages("quanteda")
library(quanteda)

# Read the first 2000 rows of the spam data set.
raw_data <- read.csv(
  file = "/Users/lytran/Desktop/R_cheetsheet/Data for practice/spam_classificiation.csv",
  nrows = 2000
)

# Column v2 holds the message text. Coerce it to character, then wrap it in
# a quanteda corpus, the container used by the tokenisation step.
raw_data[["v2"]] <- as.character(raw_data[["v2"]])
mycorp <- corpus(raw_data[["v2"]])

# Inspect one document of the corpus.
mycorp[10]
##                                                                                                                                                       text10 
## "Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"
# Split every document of the corpus into word tokens, dropping punctuation.
mytoken <- tokens(x = mycorp, remove_punct = TRUE)

# Inspect the tokens of the same document as above.
mytoken[10]
## tokens from 1 document.
## text10 :
##  [1] "Had"         "your"        "mobile"      "11"          "months"     
##  [6] "or"          "more"        "U"           "R"           "entitled"   
## [11] "to"          "Update"      "to"          "the"         "latest"     
## [16] "colour"      "mobiles"     "with"        "camera"      "for"        
## [21] "Free"        "Call"        "The"         "Mobile"      "Update"     
## [26] "Co"          "FREE"        "on"          "08002986030"
# Clean the tokens (disregard capitalization, reduce words to their stems) so
# that variants of the same word collapse into a single feature.
#
# Lowercase BEFORE stemming. When stemming ran first, some capitalised tokens
# were left unstemmed ("Update" stayed "Update" and only became "update" after
# lowercasing), whereas a lowercase "update" stems to "updat". The same word
# would then produce two different features depending on its original case.
newtokens <- tokens_tolower(mytoken)
newtokens[10]
## tokens from 1 document.
## text10 :
##  [1] "had"         "your"        "mobile"      "11"          "months"     
##  [6] "or"          "more"        "u"           "r"           "entitled"   
## [11] "to"          "update"      "to"          "the"         "latest"     
## [16] "colour"      "mobiles"     "with"        "camera"      "for"        
## [21] "free"        "call"        "the"         "mobile"      "update"     
## [26] "co"          "free"        "on"          "08002986030"

# Stem the already-lowercased tokens; newtokens2 is the fully cleaned result.
newtokens2 <- tokens_wordstem(newtokens)
newtokens2[10]
## tokens from 1 document.
## text10 :
##  [1] "had"         "your"        "mobil"       "11"          "month"      
##  [6] "or"          "more"        "u"           "r"           "entitl"     
## [11] "to"          "updat"       "to"          "the"         "latest"     
## [16] "colour"      "mobil"       "with"        "camera"      "for"        
## [21] "free"        "call"        "the"         "mobil"       "updat"      
## [26] "co"          "free"        "on"          "08002986030"

Reference: https://rstudio-pubs-static.s3.amazonaws.com/381321_188aaabd730f4e42a7dda6da5b9f8652.html