Movie Reviews - Tokenization & Sentiments Analysis

# Movie review - tokenization
library(tm); library(glmnet); library(MASS)

## Loading required package: NLP

## Warning: package 'NLP' was built under R version 3.1.2

## Loading required package: Matrix

## Warning: package 'Matrix' was built under R version 3.1.2

## Loaded glmnet 1.9-8

## Warning: package 'MASS' was built under R version 3.1.2

setwd("~/Documents/NYDSA/Lectures/Week8/week_8_day1_linear_reg_assignments")

# CSV file with 2000 movie reviews with (positive /negative) sentiments
sentiment <- read.csv("movie-reviews-sentiment.csv", header = T, as.is = T)
#Exploration & text cleanup
dim(sentiment); levels(as.factor(sentiment$Sentiment))

## [1] 2000    2

## [1] "Neg" "Pos"

sentiment$Sentiment[sentiment$Sentiment == "Neg"] <- 0
sentiment$Sentiment[sentiment$Sentiment == "Pos"] <- 1
sentiment$Text <- sapply(sentiment$Text, tolower)

# tokenization # split: words split as distinct elements of a vector
# removeStop: stop words # removePuncts, removeNumber
Tokenize <- function(document, split = TRUE, removeStop = TRUE, removePunct = TRUE, removeNumber = TRUE, removeWhite = TRUE, wordStem = TRUE) {
    if (removeStop == TRUE) {
        document <- removeWords(document, stopwords("SMART"))  # remove stop words
    }
    if (removePunct == TRUE) {
        document <- removePunctuation(document)  
    }
    if (removeNumber == TRUE) {
        document <- removeNumbers(document) 
    }
    if (removeWhite == TRUE) {
        document <- stripWhitespace(document) 
    }
    if (split == TRUE) {
        document <- unlist(strsplit(document, " "))
    }
    document
}

textList <- as.list(sentiment$Text)
textData <- sapply(textList, Tokenize)
head(textData,1)

## [[1]]
##   [1] "plot"           "teen"           "couples"        "church"        
##   [5] "party"          "drink"          "drive"          "accident"      
##   [9] "guys"           "dies"           "girlfriend"     "continues"     
##  [13] "life"           "nightmares"     "deal"           "watch"         
##  [17] "movie"          "sorta"          "find"           "critique"      
##  [21] "mindfuck"       "movie"          "teen"           "generation"    
##  [25] "touches"        "cool"           "idea"           "presents"      
##  [29] "bad"            "package"        "makes"          "review"        
##  [33] "harder"         "write"          "generally"      "applaud"       
##  [37] "films"          "attempt"        "break"          "mold"          
##  [41] "mess"           "head"           "lost"           "highway"       
##  [45] "memento"        "good"           "bad"            "ways"          
##  [49] "making"         "types"          "films"          "folks"         
##  [53] "snag"           "correctly"      "pretty"         "neat"          
##  [57] "concept"        "executed"       "terribly"       "problems"      
##  [61] "movie"          "main"           "problem"        "simply"        
##  [65] "jumbled"        "starts"         "normal"         "downshifts"    
##  [69] "fantasy"        "world"          "audience"       "member"        
##  [73] "idea"           "dreams"         "characters"     "coming"        
##  [77] "back"           "dead"           "dead"           "strange"       
##  [81] "apparitions"    "disappearances" "looooot"        "chase"         
##  [85] "scenes"         "tons"           "weird"          "things"        
##  [89] "happen"         "simply"         "explained"      "personally"    
##  [93] "mind"           "unravel"        "film"           "give"          
##  [97] "clue"           "kind"           "fed"            "film"          
## [101] "biggest"        "problem"        "big"            "secret"        
## [105] "hide"           "hide"           "completely"     "final"         
## [109] "minutes"        "make"           "things"         "entertaining"  
## [113] "thrilling"      "engaging"       "meantime"       "sad"           
## [117] "part"           "arrow"          "dig"            "flicks"        
## [121] "figured"        "half"           "point"          "strangeness"   
## [125] "start"          "make"           "bit"            "sense"         
## [129] "make"           "film"           "entertaining"   "guess"         
## [133] "bottom"         "line"           "movies"         "make"          
## [137] "audience"       "secret"         "password"       "enter"         
## [141] "world"          "understanding"  "showing"        "melissa"       
## [145] "sagemiller"     "running"        "visions"        "minutes"       
## [149] "movie"          "plain"          "lazy"           "people"        
## [153] "chasing"        "giving"         "scenes"         "offering"      
## [157] "insight"        "strangeness"    "movie"          "apparently"    
## [161] "studio"         "film"           "director"       "chopped"       
## [165] "shows"          "ve"             "pretty"         "decent"        
## [169] "teen"           "mindfuck"       "movie"          "guess"         
## [173] "suits"          "decided"        "turning"        "music"         
## [177] "video"          "edge"           "make"           "sense"         
## [181] "actors"         "pretty"         "good"           "part"          
## [185] "wes"            "bentley"        "playing"        "exact"         
## [189] "character"      "american"       "beauty"         "neighborhood"  
## [193] "biggest"        "kudos"          "sagemiller"     "holds"         
## [197] "entire"         "film"           "feeling"        "character"     
## [201] "unraveling"     "film"           "stick"          "entertain"     
## [205] "confusing"      "rarely"         "excites"        "feels"         
## [209] "pretty"         "redundant"      "runtime"        "pretty"        
## [213] "cool"           "ending"         "explanation"    "craziness"     
## [217] "horror"         "teen"           "slasher"        "flick"         
## [221] "packaged"       "apparently"     "assuming"       "genre"         
## [225] "hot"            "kids"           "wrapped"        "production"    
## [229] "years"          "ago"            "sitting"        "shelves"       
## [233] "skip"           "joblo"          "coming"         "nightmare"     
## [237] "elm"            "street"         "blair"          "witch"         
## [241] "crow"           "crow"           "salvation"      "lost"          
## [245] "highway"        "memento"        "stir"           "echoes"

review.length <- sapply(textData, length)
# Relationship between comments length and pos/neg sentiments
boxplot(review.length ~ sentiment$Sentiment)

qqnorm(review.length);qqline(review.length)

Y <- as.factor(sentiment$Sentiment)
#Model#1 logistic regression
modelReviewLength <- glm(Y ~ review.length, family = "binomial")
summary(modelReviewLength)

## 
## Call:
## glm(formula = Y ~ review.length, family = "binomial")
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.762  -1.142  -0.235   1.177   1.427  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.7184527  0.1183250  -6.072 1.26e-09 ***
## review.length  0.0025278  0.0003873   6.527 6.70e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2772.6  on 1999  degrees of freedom
## Residual deviance: 2727.3  on 1998  degrees of freedom
## AIC: 2731.3
## 
## Number of Fisher Scoring iterations: 4

#specific word "great" and affects to the sentiments
greatIdx <- grep("great", sentiment$Text)
Great <- rep(0, 2000); Great[greatIdx] <- 1; Great <- as.factor(Great); Great[1:50]

##  [1] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0
## [36] 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0
## Levels: 0 1

#model#2 logistic
modelGreat <- glm(Y ~ Great, family = "binomial")
summary(modelGreat)

## 
## Call:
## glm(formula = Y ~ Great, family = "binomial")
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.35832  -1.06189  -0.02761   1.29749   1.29749  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.27793    0.05842  -4.758 1.96e-06 ***
## Great1       0.69375    0.09273   7.481 7.36e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2772.6  on 1999  degrees of freedom
## Residual deviance: 2715.7  on 1998  degrees of freedom
## AIC: 2719.7
## 
## Number of Fisher Scoring iterations: 4

#frequency of words and affects to the sentiments
dataCorpus <- Corpus(VectorSource(textData))
ctrl <- list(wordLengths = c(1, Inf))
Matrix <- DocumentTermMatrix(dataCorpus, control = ctrl)
dataMatrix <- as.matrix(Matrix); dataMatrix[1:10,14:20]

##     Terms
## Docs aaliyah aalyah aameetings aamir aardman aaron aatish
##   1        0      0          0     0       0     0      0
##   2        0      0          0     0       0     0      0
##   3        0      0          0     0       0     0      0
##   4        0      0          0     0       0     0      0
##   5        0      0          0     0       0     0      0
##   6        0      0          0     0       0     0      0
##   7        0      0          0     0       0     0      0
##   8        0      0          0     0       0     0      0
##   9        0      0          0     0       0     0      0
##   10       0      0          0     0       0     0      0

dim(dataMatrix) # matrix too long

## [1]  2000 42147

wordFreq <- colSums(dataMatrix) #frequency
hist(wordFreq, breaks = 100)

dataMatrix200 <- dataMatrix[, colSums(dataMatrix) > 200] #frequency <200 removed
dim(dataMatrix200)

## [1] 2000  404

data <- data.frame(Y, dataMatrix200) # Y <- as.factor(sentiment$Sentiment)
#model#3 Elastic Net/LASSO
mod.enet <- glmnet(dataMatrix200, Y, family = "binomial", alpha=.5)
plot(mod.enet)

#model#4 choose lambda with cross-validation
cv <- cv.glmnet(dataMatrix200, Y, family = "binomial", nfolds = 5)
plot(cv)

cv$lambda.min # the optimal lambda

## [1] 0.009212902

cv2 <- cv.glmnet(dataMatrix200, Y, family = "binomial", nfolds = 5, type.measure="class")   
plot(cv2)

cv2$lambda.min

## [1] 0.01011115

length(coef(cv2));

## [1] 405

#Partitioning 2000 sentiments to create train & test datasets
sample1 <- sample(1000, 100); sample2 <- sample(c(1001: 2000), 100); samples <- c(sample1, sample2)
testX <- dataMatrix200[samples, ]
trainX <- dataMatrix200[-samples, ]
testY <- as.factor(sentiment$Sentiment)[samples]
trainY <- as.factor(sentiment$Sentiment)[-samples]   
dim(trainX); length(trainY)

## [1] 1800  404

## [1] 1800

#model#5
mod.train <- cv.glmnet(trainX, trainY, family = "binomial", nfolds = 5)
coef(mod.train)

## 405 x 1 sparse Matrix of class "dgCMatrix"
##                           1
## (Intercept)   -0.1319254194
## act            .           
## acting         .           
## action         .           
## actor          .           
## actors         .           
## alien          .           
## aliens         0.0381084340
## america        0.0617305918
## american       0.0851027421
## apparently    -0.1352001587
## appears       -0.0783744503
## art            .           
## attempt       -0.2966916797
## attempts      -0.2661345528
## attention      0.1479443048
## audience       .           
## back           0.0131565610
## bad           -0.4270670600
## based          .           
## basically      .           
## batman         .           
## battle         .           
## beautiful      0.1201821481
## beginning      .           
## begins         .           
## big            .           
## bit            0.0366397402
## black          .           
## body           .           
## book           .           
## boring        -0.6805718882
## boy            .           
## bring          .           
## brings         0.0234467349
## brother        .           
## call           .           
## called         .           
## camera         .           
## car            .           
## care           .           
## career         .           
## case           .           
## cast           .           
## chance         .           
## change         0.1443396556
## character      .           
## characters     .           
## child          .           
## children       .           
## chris          .           
## city           .           
## classic        0.0471879920
## close          .           
## comedy         .           
## comic          .           
## coming         .           
## completely     .           
## computer       .           
## cool           .           
## couple         .           
## credits        .           
## crew           .           
## dark           0.0542090805
## daughter       .           
## david          .           
## day            .           
## days           .           
## de             .           
## dead           .           
## deal           .           
## death          .           
## deep           .           
## dialogue       .           
## directed       .           
## direction      .           
## director      -0.0790850469
## disney         0.0052353683
## dog            .           
## dr             .           
## drama          .           
## early          .           
## earth          .           
## easily         0.1521080402
## easy           .           
## effects        .           
## elements       .           
## emotional      .           
## end            .           
## ending         0.0312543989
## ends           .           
## enjoy          0.0608969813
## entertaining   0.1030964838
## entertainment  .           
## entire         .           
## events         .           
## eventually     0.1487439398
## evil           .           
## expect         .           
## experience     0.1180982367
## extremely      0.2012431928
## eyes           .           
## face           .           
## fact           .           
## fall           .           
## falls         -0.1331280494
## family         0.0797067140
## fans           .           
## father         0.0346669701
## feature        .           
## features       .           
## feel           .           
## feeling        .           
## feels          .           
## fiction        0.0668647065
## fight          .           
## film           0.0063884966
## films          0.0178236333
## final          .           
## finally        .           
## find           .           
## finds          .           
## fine           .           
## forced         .           
## form           .           
## found          .           
## friend         .           
## friends        .           
## full           .           
## fun            0.1781509134
## funny          .           
## future         0.0546951374
## game           .           
## genre          .           
## george         .           
## girl           .           
## girlfriend     .           
## girls          .           
## give          -0.0629137538
## giving         .           
## god            .           
## good           0.0510809544
## great          0.2116730275
## group          .           
## guess         -0.1306993620
## guy            .           
## guys           .           
## half           .           
## hand           .           
## happen         .           
## happy          0.0338138601
## hard           .           
## head           .           
## heart          0.0228914793
## hell           .           
## hero           .           
## high           .           
## history        0.0567633045
## hit            .           
## hollywood      .           
## home           .           
## hope           .           
## horror         0.0068297526
## hour           .           
## hours          .           
## house          .           
## human          .           
## humor          .           
## husband        0.0102258287
## idea           .           
## important      .           
## impressive     .           
## including      .           
## interest       .           
## interesting    .           
## involved       .           
## involving      .           
## jack           .           
## jackie         0.0231894820
## james          .           
## job            0.1517756945
## joe            .           
## john           .           
## jokes         -0.0309671613
## kevin          .           
## kids           .           
## kill           .           
## killed         .           
## killer         .           
## kind           .           
## king           .           
## lack          -0.0105268191
## late          -0.0205112361
## laugh          .           
## laughs         .           
## lead           .           
## leads          0.0398881362
## leave          0.1181448898
## lee            .           
## left           .           
## level          .           
## life           0.1298925741
## line           .           
## lines          .           
## live           .           
## lives          .           
## living         .           
## ll             .           
## local          .           
## long           .           
## lost           .           
## lot            .           
## love           0.0206464307
## made           .           
## main           .           
## major          .           
## make           .           
## makes          0.0305686071
## making         .           
## man            .           
## manages        .           
## material      -0.2859729212
## matter         .           
## meet           .           
## meets          .           
## men            .           
## michael        .           
## mind           0.0399427456
## minutes       -0.0861862016
## mission        .           
## moment         .           
## moments        .           
## money          .           
## mother         .           
## movie         -0.0121534716
## movies         0.0350766423
## mr             .           
## murder         .           
## music          .           
## named          0.0304226421
## nice           .           
## night          .           
## note           .           
## number         .           
## obvious       -0.1133152623
## opening        .           
## order          .           
## original       .           
## oscar          0.0129428730
## parents        .           
## part           .           
## parts          .           
## past           .           
## paul           .           
## people         0.0444745226
## perfect        0.3690445747
## performance    0.0988212869
## performances   0.2597152955
## person         .           
## peter          .           
## picture        0.0613747699
## piece          .           
## place          .           
## planet         .           
## play           .           
## played         .           
## playing        .           
## plays          .           
## plot          -0.1251739619
## point         -0.0469902894
## police         .           
## poor          -0.2681581383
## power          .           
## premise        .           
## pretty         .           
## problem        .           
## problems      -0.0871712779
## production     .           
## put            .           
## question       .           
## quickly        .           
## real           .           
## reason        -0.3232393465
## recent         .           
## relationship   0.0223936704
## release        .           
## released       .           
## rest           .           
## result         .           
## review         .           
## robert         .           
## robin          .           
## role           .           
## roles          .           
## romantic       .           
## room           .           
## run            .           
## running        .           
## ryan           .           
## s              .           
## save          -0.0697944412
## scene          .           
## scenes         .           
## school         .           
## science        .           
## score          .           
## scream         .           
## screen         .           
## screenplay     .           
## script        -0.1872864425
## sense          .           
## sequel        -0.0029311159
## sequence       .           
## sequences      .           
## series        -0.0058379131
## set            .           
## sets           0.0291215221
## sex            .           
## ship           .           
## short          .           
## shot           .           
## show           .           
## shows          .           
## side           0.0808195600
## simple         0.1823879627
## simply         .           
## single         .           
## small          0.1527627762
## smith          .           
## son            .           
## sort           .           
## sound          .           
## space          .           
## special        .           
## star           0.0311229001
## stars          .           
## start          .           
## starts         .           
## stop           .           
## story          .           
## strong         0.0942661295
## stuff          .           
## stupid        -0.3836426608
## style          .           
## success        .           
## summer         .           
## supporting     .           
## supposed      -0.4061896018
## surprise       .           
## takes          0.0926324573
## taking         .           
## tale           0.1894671740
## talent        -0.1408489475
## talk           .           
## team           .           
## television     .           
## tells          .           
## theater        .           
## thing          .           
## things         .           
## thought        .           
## thriller      -0.0021342024
## time           .           
## times          0.0664177078
## title          .           
## told           0.0006993311
## tom            0.0223517653
## top            .           
## town           .           
## true           0.2836383810
## turn           .           
## turns          .           
## tv            -0.0604732918
## van            .           
## ve             .           
## version        .           
## video          .           
## viewer         .           
## violence       .           
## voice          .           
## war            0.0985905568
## watch          .           
## watching       .           
## white          .           
## wife           .           
## wild           .           
## william        .           
## williams       .           
## woman          .           
## women          .           
## wonderful      0.2088641961
## word           .           
## words          .           
## work           .           
## works          0.0617171703
## world          0.0858371508
## worse         -0.2042089577
## worst         -0.6509248015
## worth          .           
## written        .           
## wrong          .           
## year           .           
## years          .           
## york           .           
## young          .

Predicts <- predict(mod.train, newx = testX, type = "class")
table(Predicts, testY)

##         testY
## Predicts  0  1
##        0 86 19
##        1 14 81

select <- select[-1] # remove an intercept
vars <- colnames(dataMatrix200)[which(select != 0)] #non-zero variables
length(vars)

## [1] 107

dataMatrix.final <- data.frame(trainX[, vars])
dim(dataMatrix.final)

## [1] 1800  107

#model#7
modelFinal <- glm(trainY ~ ., family = "binomial", data = dataMatrix.final)        
summary(modelFinal)

## 
## Call:
## glm(formula = trainY ~ ., family = "binomial", data = dataMatrix.final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.44551  -0.93430  -0.02174   0.93861   2.27317  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.199012   0.138200  -1.440 0.149858    
## alien        0.054292   0.080455   0.675 0.499796    
## aliens       0.275352   0.150445   1.830 0.067213 .  
## america      0.366973   0.142168   2.581 0.009844 ** 
## american     0.288563   0.080382   3.590 0.000331 ***
## apparently  -0.599993   0.186880  -3.211 0.001325 ** 
## art          0.089016   0.128015   0.695 0.486834    
## attempt     -0.635794   0.160444  -3.963 7.41e-05 ***
## attempts    -0.666058   0.176296  -3.778 0.000158 ***
## back         0.091825   0.073033   1.257 0.208642    
## based        0.096196   0.134582   0.715 0.474749    
## battle       0.018307   0.123743   0.148 0.882384    
## big         -0.132633   0.064315  -2.062 0.039184 *  
## book        -0.027429   0.097006  -0.283 0.777364    
## bring        0.159462   0.165424   0.964 0.335066    
## chance      -0.065665   0.160533  -0.409 0.682508    
## characters   0.019989   0.045310   0.441 0.659088    
## city         0.161851   0.083724   1.933 0.053218 .  
## crew        -0.008190   0.116631  -0.070 0.944018    
## direction    0.165895   0.155520   1.067 0.286103    
## director    -0.233017   0.074829  -3.114 0.001846 ** 
## earth       -0.102256   0.092402  -1.107 0.268451    
## end          0.028070   0.071424   0.393 0.694314    
## ends        -0.341319   0.166398  -2.051 0.040246 *  
## enjoy        0.536041   0.171966   3.117 0.001826 ** 
## events       0.152169   0.166181   0.916 0.359833    
## expect       0.172036   0.164097   1.048 0.294462    
## experience   0.319268   0.169386   1.885 0.059449 .  
## fall        -0.017749   0.169519  -0.105 0.916614    
## falls       -0.443775   0.158544  -2.799 0.005125 ** 
## fans         0.125694   0.124135   1.013 0.311273    
## father       0.255795   0.100738   2.539 0.011110 *  
## feeling      0.066380   0.142251   0.467 0.640757    
## feels        0.188157   0.169672   1.109 0.267454    
## fight       -0.037530   0.095468  -0.393 0.694238    
## film         0.019846   0.017510   1.133 0.257029    
## forced      -0.051971   0.162322  -0.320 0.748839    
## friend       0.064855   0.109583   0.592 0.553965    
## full         0.201135   0.128213   1.569 0.116704    
## funny       -0.002160   0.062306  -0.035 0.972349    
## girl        -0.148484   0.096382  -1.541 0.123420    
## girlfriend  -0.051128   0.150351  -0.340 0.733811    
## girls       -0.151862   0.070542  -2.153 0.031335 *  
## give        -0.343702   0.108216  -3.176 0.001493 ** 
## god          0.069978   0.116540   0.600 0.548199    
## good         0.118944   0.043412   2.740 0.006147 ** 
## group        0.161459   0.108088   1.494 0.135235    
## high         0.065727   0.078556   0.837 0.402772    
## house       -0.056522   0.093221  -0.606 0.544299    
## husband      0.213475   0.121556   1.756 0.079057 .  
## jack        -0.026140   0.083013  -0.315 0.752840    
## james       -0.008448   0.094801  -0.089 0.928992    
## john         0.011248   0.060001   0.187 0.851299    
## king         0.019128   0.091837   0.208 0.835010    
## lead        -0.008630   0.157485  -0.055 0.956300    
## leads        0.569816   0.166065   3.431 0.000601 ***
## level       -0.208587   0.155512  -1.341 0.179824    
## lot          0.071825   0.089649   0.801 0.423029    
## major        0.107003   0.148084   0.723 0.469934    
## make        -0.122044   0.060857  -2.005 0.044918 *  
## manages      0.182251   0.151846   1.200 0.230047    
## michael     -0.082125   0.085826  -0.957 0.338629    
## mind         0.153922   0.117952   1.305 0.191910    
## mother       0.135238   0.093154   1.452 0.146563    
## movie       -0.099361   0.021182  -4.691 2.72e-06 ***
## music        0.088788   0.089325   0.994 0.320231    
## number      -0.002901   0.150973  -0.019 0.984670    
## paul        -0.235150   0.122498  -1.920 0.054905 .  
## people       0.103857   0.056928   1.824 0.068099 .  
## perfect      0.768431   0.146716   5.238 1.63e-07 ***
## performance  0.235513   0.075774   3.108 0.001883 ** 
## peter        0.042390   0.129661   0.327 0.743722    
## plays        0.117290   0.090468   1.296 0.194809    
## plot        -0.325863   0.058812  -5.541 3.01e-08 ***
## police       0.064496   0.134616   0.479 0.631861    
## pretty      -0.092158   0.097302  -0.947 0.343573    
## problem     -0.266619   0.120692  -2.209 0.027168 *  
## question    -0.093072   0.140720  -0.661 0.508357    
## real        -0.041694   0.072464  -0.575 0.565041    
## s            0.112185   0.099525   1.127 0.259656    
## screenplay  -0.176891   0.140786  -1.256 0.208953    
## script      -0.398583   0.083151  -4.793 1.64e-06 ***
## sense        0.169002   0.093646   1.805 0.071123 .  
## sequences   -0.090904   0.131444  -0.692 0.489200    
## set          0.045142   0.101601   0.444 0.656824    
## show        -0.021330   0.070780  -0.301 0.763139    
## side         0.281704   0.132089   2.133 0.032950 *  
## single      -0.166050   0.171337  -0.969 0.332473    
## special      0.034331   0.078835   0.435 0.663214    
## story        0.062341   0.041341   1.508 0.131560    
## stuff       -0.131977   0.161170  -0.819 0.412862    
## supporting   0.469664   0.161872   2.901 0.003714 ** 
## surprise     0.387793   0.167825   2.311 0.020850 *  
## taking       0.064651   0.157306   0.411 0.681080    
## tale         0.544714   0.172431   3.159 0.001583 ** 
## thought     -0.119110   0.122439  -0.973 0.330646    
## time        -0.011264   0.048214  -0.234 0.815273    
## town         0.093534   0.076986   1.215 0.224388    
## turns       -0.066507   0.121333  -0.548 0.583596    
## video       -0.154207   0.118641  -1.300 0.193677    
## violence     0.066716   0.122242   0.546 0.585224    
## voice        0.243777   0.131545   1.853 0.063855 .  
## watch        0.167260   0.093673   1.786 0.074167 .  
## women       -0.094339   0.127306  -0.741 0.458671    
## work        -0.108142   0.076697  -1.410 0.158544    
## works        0.449952   0.140999   3.191 0.001417 ** 
## world        0.248925   0.062929   3.956 7.63e-05 ***
## worse       -0.691686   0.168482  -4.105 4.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2495.3  on 1799  degrees of freedom
## Residual deviance: 1973.2  on 1692  degrees of freedom
## AIC: 2189.2
## 
## Number of Fisher Scoring iterations: 5

Predict.Final <- predict(modelFinal, newdata = data.frame(testX), type = "response")
Predict.Finalclass <- (Predict.Final > 0.5)
table(Predict.Finalclass, testY)

##                   testY
## Predict.Finalclass  0  1
##              FALSE 74 29
##              TRUE  26 71

Movie Reviews - Tokenization & Sentiments Analysis

saneja@gatech.edu