Unsupervised Classification

Lekshman Ramesh

March 5, 2017

Why?

Data Reading

library(rjson)
library(plyr)
library(Matrix)
library(xgboost)
library(stringr)
# Read the two review dumps. Each file is read line by line and every line is
# parsed as one JSON object; the flattened records are stacked into a data
# frame with ldply().
# readLines() is given the path directly so that it opens and closes the file
# itself, even if reading fails part-way (no dangling connection).
tr.review <- "D:/Lekshman/Kaggle/Text Analytics/reviews_Cell_Phones_and_Accessories_5.json/Cell_Phones_and_Accessories_5.json"
input <- readLines(tr.review, -1L)
cell <- ldply(lapply(input, function(x) t(unlist(fromJSON(x)))))
tr.review <- "D:/Lekshman/Kaggle/Text Analytics/reviews_Cell_Phones_and_Accessories_5.json/Patio_Lawn_and_Garden_5.json"
input <- readLines(tr.review, -1L)
lawn <- ldply(lapply(input, function(x) t(unlist(fromJSON(x)))))
# Draw 2000 random reviews from each data set. The population size comes from
# nrow() instead of a hard-coded count, so every row can be drawn and no
# out-of-range index (which would yield an all-NA row) can occur.
# sample() stops with an error if a file holds fewer than 2000 reviews.
# NOTE(review): the draw is not seeded, so the subset differs between runs.
# Only columns 5:7 are kept; later steps read `reviewText`, so it is presumably
# one of them -- TODO confirm against the JSON field order.
cell_red <- cell[sample(nrow(cell), 2000), 5:7]
lawn_red <- lawn[sample(nrow(lawn), 2000), 5:7]

Data Processing

# Build the true class labels from the actual sample sizes before stacking,
# so the labels stay aligned with the rows whatever the sample sizes are
# (no hard-coded 1:2000 / 2001:4000 positions).
cell_labels <- rep("Cell Phones", nrow(cell_red))
lawn_labels <- rep("Gardening Equipment", nrow(lawn_red))
cell_red <- rbind(cell_red, lawn_red)
# Replacing all punctuation and digits with spaces.
# Note: [[:punct:]] also matches the underscore, so underscores are removed too.
cell_red$rev1 <- gsub("[[:punct:]]", " ", cell_red$reviewText)
cell_red$rev1 <- gsub("[[:digit:]]", " ", cell_red$rev1)
# Collapse runs of whitespace left behind by the replacements
cell_red$rev1 <- gsub("\\s+", " ", str_trim(cell_red$rev1))
# Assigning actual classes
cell_red$class <- c(cell_labels, lawn_labels)
# Removing blanks: test the cleaned text (the text that is actually modelled),
# so reviews made up only of punctuation/digits are dropped as well, and treat
# a missing review as blank instead of letting NA produce an all-NA row.
cell_red <- cell_red[!is.na(cell_red$rev1) & cell_red$rev1 != "", ]

Sparse Matrix Creation using tm library

library(tm)
## Loading required package: NLP
library(SnowballC)
# Build a corpus with one document per cleaned review
data2 <- Corpus(VectorSource(cell_red$rev1))
# The former tm_map(data2, PlainTextDocument) step is intentionally omitted:
# it was only a workaround for applying bare functions such as tolower, it is
# not needed when content_transformer() is used, and on tm >= 0.7 (where
# Corpus() returns a SimpleCorpus) it collapses the corpus with the warning
# "transformation drops documents".
# Converting to lower case FIRST: the stop-word list is lower case, so
# capitalised stop words ("The", "This", ...) would otherwise survive.
data2 <- tm_map(data2, content_transformer(tolower))
# Removing stop words like a, the, for, etc. that appear often but provide
# no information
data2 <- tm_map(data2, removeWords, stopwords())
# Converting to root word
data2 <- tm_map(data2, stemDocument)
# Rows are documents, columns are terms
data2_tdm <- DocumentTermMatrix(data2)
# Keep only terms that occur at least 5 times
vars <- findFreqTerms(data2_tdm, 5)
data2_tdm_red <- data2_tdm[, intersect(colnames(data2_tdm), vars)]
dim(data2_tdm_red)
# Output recorded from the original run; the sample is random, so the exact
# dimensions vary from run to run.
## [1] 3997 3684

Unsupervised Learning Model

library(topicmodels)
#Assigning LDA parameters for Gibbs sampling
 burnin <- 4000 #Ignoring first few iterations
 iter <- 2000 # Iterations post burn in
 thin <- 500 #Gap between retained iterations, so retained draws are far apart
 seed <- list(2003, 5, 63, 100001, 765) #Seeds for reproducibility, one per run
 nstart <- 5 #No of reruns
 best <- TRUE #Return only the best-scoring model, not every draw
 k <- 2 #No. of classes. Often needs to be guessed but not in this case luckily
#LDA() rejects documents that contain none of the retained terms, so drop
#those rows first and remember which documents were kept so the true labels
#can be lined up with the predictions. When no document is empty this keeps
#every row and the model input is unchanged.
 kept_docs <- which(rowSums(as.matrix(data2_tdm_red)) > 0)
 lda_input <- data2_tdm_red[kept_docs, ]
#Running the model
ldaOut <- LDA(lda_input, k, method = "Gibbs",
              control = list(nstart = nstart, seed = seed, best = best,
                             burnin = burnin, iter = iter, thin = thin))
#Comparing Predictions to Actual
#Topic numbers are arbitrary labels; a good split shows each class
#concentrated in a different topic column.
ldaOut.topics <- as.matrix(topics(ldaOut))
table(cell_red$class[kept_docs], ldaOut.topics)
# Output recorded from the original run; counts vary with the random sample.
##                      ldaOut.topics
##                          1    2
##   Cell Phones         1830  168
##   Gardening Equipment  576 1423