# Lekshman Ramesh
# March 5, 2017
library(rjson)
library(plyr)
library(Matrix)
library(xgboost)
library(stringr)
# Path to the cell-phone review file. It is read line by line and every line
# is parsed as a separate JSON record below.
tr.review <-
  "D:/Lekshman/Kaggle/Text Analytics/reviews_Cell_Phones_and_Accessories_5.json/Cell_Phones_and_Accessories_5.json"

# Read all lines into memory, then release the connection.
con <- file(tr.review, open = "r")
input <- readLines(con, n = -1L)
close(con)

# Parse each line, flatten the parsed record into a one-row matrix, and let
# ldply() stack those rows into a single data frame.
cell <- ldply(
  lapply(input, function(json_line) {
    t(unlist(fromJSON(json_line)))
  })
)

# Show the first parsed review.
cell[1, ]
## reviewerID asin reviewerName helpful1 helpful2
## 1 A30TL5EWN6DFXT 120401325X christina 0 0
## reviewText
## 1 They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again
## overall summary unixReviewTime reviewTime
## 1 4 Looks Good 1400630400 05 21, 2014
# Keeping only required variables. Judging by the first row printed above,
# positions 5:7 are helpful2, reviewText and overall.
cell_red <- cell[, 5:7]

# Split into low-rated (1-2) and high-rated (4-5) reviews; rating 3 is left out.
data1 <- cell_red[cell_red$overall %in% c(1, 2), ]
data2 <- cell_red[cell_red$overall %in% c(4, 5), ]

# Down-sample the high-rated reviews to (at most) the number of low-rated
# ones so the two classes are balanced. The sizes come from the data instead
# of being hard-coded, so a different input file no longer produces NA rows
# or an out-of-range sample.
n_per_class <- min(nrow(data1), nrow(data2))
data2 <- data2[sample.int(nrow(data2), n_per_class), ]
cell_red <- rbind(data1, data2)

# Work on a random subset of at most 10,000 reviews.
# NOTE(review): no set.seed() is called, so the subset differs between runs.
n_keep <- min(10000L, nrow(cell_red))
cell_red <- cell_red[sample.int(nrow(cell_red), n_keep), ]
## Pre-processing data
# Tag negations with a trailing "___" (presumably so that they are not
# dropped together with the stop words later on -- TODO confirm).
# Only the whole words "not" / "cannot" are tagged, so words that merely
# contain "not" ("nothing", "another", "note") stay intact. Because "_" is a
# word character, an already tagged "not___" can never be matched again, and
# both "not" and "n't" end up as the same "not___" token.
cell_red$rev1 <- gsub("\\b(not|cannot)\\b", "\\1___", cell_red$reviewText)
cell_red$rev1 <- gsub("n't", " not___", cell_red$rev1)

# Removing all punctuation except underscores and hyphens, then all digits.
cell_red$rev1 <- gsub("[^[:alnum:][:blank:]_\\-]", " ", cell_red$rev1)
cell_red$rev1 <- gsub("[[:digit:]]", " ", cell_red$rev1)

# Trim both ends and collapse runs of whitespace into a single space.
cell_red$rev1 <- gsub("\\s+", " ", str_trim(cell_red$rev1))

# Excluding blanks. The test is made on the cleaned text, so reviews that
# consisted only of punctuation/digits are removed as well, and missing
# reviews no longer turn into all-NA rows.
cell_red <- cell_red[!is.na(cell_red$rev1) & nzchar(cell_red$rev1), ]
library(tm)
## Loading required package: NLP
# Build a tm corpus from the cleaned review text.
data2 <- Corpus(VectorSource(cell_red$rev1))

# Adjusting the format to PlainTextDocument; per the original author's note
# this is essential for the creation of the term-document matrix.
data2 <- tm_map(data2, PlainTextDocument)

# Removing stop words (the default stopwords() list).
data2 <- tm_map(data2, removeWords, words = stopwords())
data2_tdm <- DocumentTermMatrix(data2)

# Removing infrequent words: keep only the terms findFreqTerms() returns for
# a lower frequency bound of 5.
vars <- findFreqTerms(data2_tdm, lowfreq = 5)
data2_tdm_red <- data2_tdm[, intersect(colnames(data2_tdm), vars)]

# Rebuild the reduced matrix as a Matrix-package sparse matrix from its
# row-index / column-index / value components.
call_mat <- sparseMatrix(
  i = data2_tdm_red$i,
  j = data2_tdm_red$j,
  x = data2_tdm_red$v,
  dims = c(data2_tdm_red$nrow, data2_tdm_red$ncol)
)

# Dimensions of the matrix
dim(call_mat)
## [1] 9995 5526
# Binary target for xgboost: 1 for reviews rated 4 or 5, 0 otherwise.
xgmat_red <- xgb.DMatrix(
  call_mat,
  label = as.numeric(cell_red$overall %in% c(4, 5))
)

# Baseline booster settings for the first cross-validation run.
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eta = 0.01,
  subsample = 0.7,
  colsample_bytree = 0.7,
  max_depth = 50
)

# 4-fold cross-validation over 700 boosting rounds, reporting every 100th
# round. `nrounds` is spelled out in full instead of relying on partial
# argument matching of `nround`.
cv.res_red <- xgb.cv(
  params = param,
  data = xgmat_red,
  nrounds = 700,
  nfold = 4,
  print_every_n = 100
)
## [1] train-error:0.196433+0.006491 test-error:0.310352+0.010736
## [101] train-error:0.055061+0.001924 test-error:0.202501+0.002147
## [201] train-error:0.035284+0.000759 test-error:0.189695+0.005499
## [301] train-error:0.023812+0.001648 test-error:0.178789+0.004877
## [401] train-error:0.017575+0.001035 test-error:0.171086+0.003875
## [501] train-error:0.012840+0.000683 test-error:0.164482+0.004009
## [601] train-error:0.009705+0.000529 test-error:0.161880+0.003067
## [700] train-error:0.007904+0.000739 test-error:0.158379+0.003624
# Candidate settings: every combination of three tree depths, three row
# subsample rates and three column subsample rates; nrounds and eta are fixed.
xgb_grid_1 <- expand.grid(
  nrounds = 200,
  eta = 0.05,
  max_depth = c(50, 75, 100),
  subsample = c(0.5, 0.7, 0.9),
  colsample_bytree = c(0.5, 0.7, 0.9)
)

# Parameter tuning: run 4-fold CV once per grid row and keep the metrics of
# the final boosting round. Results are collected in a preallocated list
# rather than by growing a table inside the loop.
cv_rows <- vector("list", nrow(xgb_grid_1))
for (i in seq_len(nrow(xgb_grid_1))) {
  param <- list(
    objective = "binary:logistic",
    booster = "gbtree",
    eta = xgb_grid_1$eta[i],
    subsample = xgb_grid_1$subsample[i],
    colsample_bytree = xgb_grid_1$colsample_bytree[i],
    max_depth = xgb_grid_1$max_depth[i]
  )
  # The number of rounds comes from the grid, so changing `nrounds` there is
  # enough; it is no longer duplicated as a literal here.
  cv.res_red <- xgb.cv(
    params = param,
    data = xgmat_red,
    nrounds = xgb_grid_1$nrounds[i],
    nfold = 4,
    verbose = 0
  )
  cv_op <- cv.res_red$evaluation_log
  # Last row of the log = metrics after the final boosting round.
  cv_op <- cv_op[nrow(cv_op), ]
  cv_rows[[i]] <- cv_op
}

# Rows are stacked newest-first, as before: row 1 of cv_final belongs to the
# LAST grid row. To line the results up with their settings, compare against
# xgb_grid_1[rev(seq_len(nrow(xgb_grid_1))), ].
cv_final <- do.call(rbind, rev(cv_rows))