# Author: Lekshman Ramesh
# Date: March 5, 2017
library(rjson)
library(plyr)
library(Matrix)
library(xgboost)
library(stringr)
# Read a file containing one JSON object per line into a data frame with one
# row per record.
#
# Each line is parsed with fromJSON(), flattened with unlist() and transposed
# into a one-row matrix so that ldply() can stack the records row-wise.
#
# path: location of the JSON-lines file.
# Returns: a data frame with one row per input line.
read_json_lines <- function(path) {
  con <- file(path, "r")
  # Close the connection even if reading or parsing fails part-way through.
  on.exit(close(con), add = TRUE)
  records <- readLines(con, -1L)
  ldply(lapply(records, function(x) t(unlist(fromJSON(x)))))
}

# NOTE(review): both files are read from the directory named after the
# cell-phones archive -- confirm the lawn/garden file really lives there.
data_dir <- "D:/Lekshman/Kaggle/Text Analytics/reviews_Cell_Phones_and_Accessories_5.json"

# Cell phone and accessory reviews
cell <- read_json_lines(file.path(data_dir, "Cell_Phones_and_Accessories_5.json"))
# Patio, lawn and garden reviews
lawn <- read_json_lines(file.path(data_dir, "Patio_Lawn_and_Garden_5.json"))
# Draw an equal-sized random sample of reviews from each category, keeping
# columns 5 to 7 (these include the reviewText column used further down),
# and stack them: cell-phone rows first, lawn/garden rows second.
# Row counts come from nrow() rather than hard-coded totals so the sample
# always covers exactly the rows that exist in each data frame.
n_per_class <- 2000
cell_red <- cell[sample(nrow(cell), n_per_class), 5:7]
lawn_red <- lawn[sample(nrow(lawn), n_per_class), 5:7]
cell_red <- rbind(cell_red, lawn_red)
# Replacing all punctuation (underscore included) and digits with spaces,
# then trimming and collapsing runs of whitespace into a single space
cell_red$rev1 <- gsub("[[:punct:]]", " ", cell_red$reviewText)
cell_red$rev1 <- gsub("[[:digit:]]", " ", cell_red$rev1)
cell_red$rev1 <- gsub("\\s+", " ", str_trim(cell_red$rev1))
# Assigning actual classes: cell_red is built as an equal-sized sample of
# cell-phone reviews followed by lawn/garden reviews, so the first half of the
# rows gets the first label and the second half the second label.
n_half <- nrow(cell_red) %/% 2L
cell_red$class <- rep(c("Cell Phones", "Gardening Equipment"), each = n_half)
# Removing blanks. Missing reviews are dropped as well: comparing NA with ""
# yields NA, which would otherwise index in rows made entirely of NA.
has_text <- !is.na(cell_red$reviewText) & cell_red$reviewText != ""
cell_red <- cell_red[has_text, ]
library(tm) # on load reports: Loading required package: NLP
library(SnowballC)
# Build a corpus with one document per cleaned review
data2 <- Corpus(VectorSource(cell_red$rev1))
# Adjusting the format to PlainTextDocument. Essential for creation of
# TermDocumentMatrix
data2 <- tm_map(data2, PlainTextDocument)
# Converting to lower case. This has to happen before stop-word removal:
# the stop-word list is lower case, so capitalised forms such as "The" would
# otherwise not match and would stay in the text.
data2 <- tm_map(data2, content_transformer(tolower))
# Removing stop words like a, the, for, etc. that appear often but provide no
# information
data2 <- tm_map(data2, removeWords, stopwords())
# Converting to root word
data2 <- tm_map(data2, stemDocument)
# Documents in rows, terms in columns
data2_tdm <- DocumentTermMatrix(data2)
# Keep only terms that occur at least 5 times across the corpus
vars <- findFreqTerms(data2_tdm, 5)
data2_tdm_red <- data2_tdm[, intersect(colnames(data2_tdm), vars)]
# Documents x retained terms. The original run (stop words removed before
# lower-casing) printed 3997 3684; the term count may differ now.
dim(data2_tdm_red)
library(topicmodels)
# Number of topics; known in advance here because the corpus was assembled
# from exactly two product categories.
k <- 2

# Gibbs sampler settings
burnin <- 4000 # iterations discarded at the start of each run
iter <- 2000 # iterations performed after the burn-in
thin <- 500 # spacing between retained iterations, to keep them independent
nstart <- 5 # number of independent runs
seed <- list(2003, 5, 63, 100001, 765) # one seed per run, for reproducibility
best <- TRUE # return only the best of the runs

# Fit the topic model on the reduced document-term matrix
ldaOut <- LDA(
  data2_tdm_red,
  k,
  method = "Gibbs",
  control = list(
    nstart = nstart,
    seed = seed,
    best = best,
    burnin = burnin,
    iter = iter,
    thin = thin
  )
)

# Cross-tabulate the known category of each review against its assigned topic
ldaOut.topics <- as.matrix(topics(ldaOut))
table(cell_red$class, ldaOut.topics)
## ldaOut.topics
## 1 2
## Cell Phones 1830 168
## Gardening Equipment 576 1423