# Movie review - tokenization
library(tm); library(glmnet); library(MASS)
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.1.2
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.1.2
## Loaded glmnet 1.9-8
## Warning: package 'MASS' was built under R version 3.1.2
setwd("~/Documents/NYDSA/Lectures/Week8/week_8_day1_linear_reg_assignments")
# CSV file with 2000 movie reviews with (positive /negative) sentiments
sentiment <- read.csv("movie-reviews-sentiment.csv", header = T, as.is = T)
#Exploration & text cleanup
dim(sentiment); levels(as.factor(sentiment$Sentiment))
## [1] 2000 2
## [1] "Neg" "Pos"
sentiment$Sentiment[sentiment$Sentiment == "Neg"] <- 0
sentiment$Sentiment[sentiment$Sentiment == "Pos"] <- 1
sentiment$Text <- sapply(sentiment$Text, tolower)
# tokenization # split: words split as distinct elements of a vector
# removeStop: stop words # removePuncts, removeNumber
Tokenize <- function(document, split = TRUE, removeStop = TRUE, removePunct = TRUE, removeNumber = TRUE, removeWhite = TRUE, wordStem = TRUE) {
if (removeStop == TRUE) {
document <- removeWords(document, stopwords("SMART")) # remove stop words
}
if (removePunct == TRUE) {
document <- removePunctuation(document)
}
if (removeNumber == TRUE) {
document <- removeNumbers(document)
}
if (removeWhite == TRUE) {
document <- stripWhitespace(document)
}
if (split == TRUE) {
document <- unlist(strsplit(document, " "))
}
document
}
textList <- as.list(sentiment$Text)
textData <- sapply(textList, Tokenize)
head(textData,1)
## [[1]]
## [1] "plot" "teen" "couples" "church"
## [5] "party" "drink" "drive" "accident"
## [9] "guys" "dies" "girlfriend" "continues"
## [13] "life" "nightmares" "deal" "watch"
## [17] "movie" "sorta" "find" "critique"
## [21] "mindfuck" "movie" "teen" "generation"
## [25] "touches" "cool" "idea" "presents"
## [29] "bad" "package" "makes" "review"
## [33] "harder" "write" "generally" "applaud"
## [37] "films" "attempt" "break" "mold"
## [41] "mess" "head" "lost" "highway"
## [45] "memento" "good" "bad" "ways"
## [49] "making" "types" "films" "folks"
## [53] "snag" "correctly" "pretty" "neat"
## [57] "concept" "executed" "terribly" "problems"
## [61] "movie" "main" "problem" "simply"
## [65] "jumbled" "starts" "normal" "downshifts"
## [69] "fantasy" "world" "audience" "member"
## [73] "idea" "dreams" "characters" "coming"
## [77] "back" "dead" "dead" "strange"
## [81] "apparitions" "disappearances" "looooot" "chase"
## [85] "scenes" "tons" "weird" "things"
## [89] "happen" "simply" "explained" "personally"
## [93] "mind" "unravel" "film" "give"
## [97] "clue" "kind" "fed" "film"
## [101] "biggest" "problem" "big" "secret"
## [105] "hide" "hide" "completely" "final"
## [109] "minutes" "make" "things" "entertaining"
## [113] "thrilling" "engaging" "meantime" "sad"
## [117] "part" "arrow" "dig" "flicks"
## [121] "figured" "half" "point" "strangeness"
## [125] "start" "make" "bit" "sense"
## [129] "make" "film" "entertaining" "guess"
## [133] "bottom" "line" "movies" "make"
## [137] "audience" "secret" "password" "enter"
## [141] "world" "understanding" "showing" "melissa"
## [145] "sagemiller" "running" "visions" "minutes"
## [149] "movie" "plain" "lazy" "people"
## [153] "chasing" "giving" "scenes" "offering"
## [157] "insight" "strangeness" "movie" "apparently"
## [161] "studio" "film" "director" "chopped"
## [165] "shows" "ve" "pretty" "decent"
## [169] "teen" "mindfuck" "movie" "guess"
## [173] "suits" "decided" "turning" "music"
## [177] "video" "edge" "make" "sense"
## [181] "actors" "pretty" "good" "part"
## [185] "wes" "bentley" "playing" "exact"
## [189] "character" "american" "beauty" "neighborhood"
## [193] "biggest" "kudos" "sagemiller" "holds"
## [197] "entire" "film" "feeling" "character"
## [201] "unraveling" "film" "stick" "entertain"
## [205] "confusing" "rarely" "excites" "feels"
## [209] "pretty" "redundant" "runtime" "pretty"
## [213] "cool" "ending" "explanation" "craziness"
## [217] "horror" "teen" "slasher" "flick"
## [221] "packaged" "apparently" "assuming" "genre"
## [225] "hot" "kids" "wrapped" "production"
## [229] "years" "ago" "sitting" "shelves"
## [233] "skip" "joblo" "coming" "nightmare"
## [237] "elm" "street" "blair" "witch"
## [241] "crow" "crow" "salvation" "lost"
## [245] "highway" "memento" "stir" "echoes"
review.length <- sapply(textData, length)
# Relationship between comments length and pos/neg sentiments
boxplot(review.length ~ sentiment$Sentiment)

qqnorm(review.length);qqline(review.length)

Y <- as.factor(sentiment$Sentiment)
#Model#1 logistic regression
modelReviewLength <- glm(Y ~ review.length, family = "binomial")
summary(modelReviewLength)
##
## Call:
## glm(formula = Y ~ review.length, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.762 -1.142 -0.235 1.177 1.427
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7184527 0.1183250 -6.072 1.26e-09 ***
## review.length 0.0025278 0.0003873 6.527 6.70e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2772.6 on 1999 degrees of freedom
## Residual deviance: 2727.3 on 1998 degrees of freedom
## AIC: 2731.3
##
## Number of Fisher Scoring iterations: 4
#specific word "great" and affects to the sentiments
greatIdx <- grep("great", sentiment$Text)
Great <- rep(0, 2000); Great[greatIdx] <- 1; Great <- as.factor(Great); Great[1:50]
## [1] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0
## [36] 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0
## Levels: 0 1
#model#2 logistic
modelGreat <- glm(Y ~ Great, family = "binomial")
summary(modelGreat)
##
## Call:
## glm(formula = Y ~ Great, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.35832 -1.06189 -0.02761 1.29749 1.29749
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.27793 0.05842 -4.758 1.96e-06 ***
## Great1 0.69375 0.09273 7.481 7.36e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2772.6 on 1999 degrees of freedom
## Residual deviance: 2715.7 on 1998 degrees of freedom
## AIC: 2719.7
##
## Number of Fisher Scoring iterations: 4
#frequency of words and affects to the sentiments
dataCorpus <- Corpus(VectorSource(textData))
ctrl <- list(wordLengths = c(1, Inf))
Matrix <- DocumentTermMatrix(dataCorpus, control = ctrl)
dataMatrix <- as.matrix(Matrix); dataMatrix[1:10,14:20]
## Terms
## Docs aaliyah aalyah aameetings aamir aardman aaron aatish
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
## 10 0 0 0 0 0 0 0
dim(dataMatrix) # matrix too long
## [1] 2000 42147
wordFreq <- colSums(dataMatrix) #frequency
hist(wordFreq, breaks = 100)

dataMatrix200 <- dataMatrix[, colSums(dataMatrix) > 200] #frequency <200 removed
dim(dataMatrix200)
## [1] 2000 404
data <- data.frame(Y, dataMatrix200) # Y <- as.factor(sentiment$Sentiment)
#model#3 Elastic Net/LASSO
mod.enet <- glmnet(dataMatrix200, Y, family = "binomial", alpha=.5)
plot(mod.enet)

#model#4 choose lambda with cross-validation
cv <- cv.glmnet(dataMatrix200, Y, family = "binomial", nfolds = 5)
plot(cv)

cv$lambda.min # the optimal lambda
## [1] 0.009212902
cv2 <- cv.glmnet(dataMatrix200, Y, family = "binomial", nfolds = 5, type.measure="class")
plot(cv2)

cv2$lambda.min
## [1] 0.01011115
length(coef(cv2));
## [1] 405
#Partitioning 2000 sentiments to create train & test datasets
sample1 <- sample(1000, 100); sample2 <- sample(c(1001: 2000), 100); samples <- c(sample1, sample2)
testX <- dataMatrix200[samples, ]
trainX <- dataMatrix200[-samples, ]
testY <- as.factor(sentiment$Sentiment)[samples]
trainY <- as.factor(sentiment$Sentiment)[-samples]
dim(trainX); length(trainY)
## [1] 1800 404
## [1] 1800
#model#5
mod.train <- cv.glmnet(trainX, trainY, family = "binomial", nfolds = 5)
coef(mod.train)
## 405 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.1319254194
## act .
## acting .
## action .
## actor .
## actors .
## alien .
## aliens 0.0381084340
## america 0.0617305918
## american 0.0851027421
## apparently -0.1352001587
## appears -0.0783744503
## art .
## attempt -0.2966916797
## attempts -0.2661345528
## attention 0.1479443048
## audience .
## back 0.0131565610
## bad -0.4270670600
## based .
## basically .
## batman .
## battle .
## beautiful 0.1201821481
## beginning .
## begins .
## big .
## bit 0.0366397402
## black .
## body .
## book .
## boring -0.6805718882
## boy .
## bring .
## brings 0.0234467349
## brother .
## call .
## called .
## camera .
## car .
## care .
## career .
## case .
## cast .
## chance .
## change 0.1443396556
## character .
## characters .
## child .
## children .
## chris .
## city .
## classic 0.0471879920
## close .
## comedy .
## comic .
## coming .
## completely .
## computer .
## cool .
## couple .
## credits .
## crew .
## dark 0.0542090805
## daughter .
## david .
## day .
## days .
## de .
## dead .
## deal .
## death .
## deep .
## dialogue .
## directed .
## direction .
## director -0.0790850469
## disney 0.0052353683
## dog .
## dr .
## drama .
## early .
## earth .
## easily 0.1521080402
## easy .
## effects .
## elements .
## emotional .
## end .
## ending 0.0312543989
## ends .
## enjoy 0.0608969813
## entertaining 0.1030964838
## entertainment .
## entire .
## events .
## eventually 0.1487439398
## evil .
## expect .
## experience 0.1180982367
## extremely 0.2012431928
## eyes .
## face .
## fact .
## fall .
## falls -0.1331280494
## family 0.0797067140
## fans .
## father 0.0346669701
## feature .
## features .
## feel .
## feeling .
## feels .
## fiction 0.0668647065
## fight .
## film 0.0063884966
## films 0.0178236333
## final .
## finally .
## find .
## finds .
## fine .
## forced .
## form .
## found .
## friend .
## friends .
## full .
## fun 0.1781509134
## funny .
## future 0.0546951374
## game .
## genre .
## george .
## girl .
## girlfriend .
## girls .
## give -0.0629137538
## giving .
## god .
## good 0.0510809544
## great 0.2116730275
## group .
## guess -0.1306993620
## guy .
## guys .
## half .
## hand .
## happen .
## happy 0.0338138601
## hard .
## head .
## heart 0.0228914793
## hell .
## hero .
## high .
## history 0.0567633045
## hit .
## hollywood .
## home .
## hope .
## horror 0.0068297526
## hour .
## hours .
## house .
## human .
## humor .
## husband 0.0102258287
## idea .
## important .
## impressive .
## including .
## interest .
## interesting .
## involved .
## involving .
## jack .
## jackie 0.0231894820
## james .
## job 0.1517756945
## joe .
## john .
## jokes -0.0309671613
## kevin .
## kids .
## kill .
## killed .
## killer .
## kind .
## king .
## lack -0.0105268191
## late -0.0205112361
## laugh .
## laughs .
## lead .
## leads 0.0398881362
## leave 0.1181448898
## lee .
## left .
## level .
## life 0.1298925741
## line .
## lines .
## live .
## lives .
## living .
## ll .
## local .
## long .
## lost .
## lot .
## love 0.0206464307
## made .
## main .
## major .
## make .
## makes 0.0305686071
## making .
## man .
## manages .
## material -0.2859729212
## matter .
## meet .
## meets .
## men .
## michael .
## mind 0.0399427456
## minutes -0.0861862016
## mission .
## moment .
## moments .
## money .
## mother .
## movie -0.0121534716
## movies 0.0350766423
## mr .
## murder .
## music .
## named 0.0304226421
## nice .
## night .
## note .
## number .
## obvious -0.1133152623
## opening .
## order .
## original .
## oscar 0.0129428730
## parents .
## part .
## parts .
## past .
## paul .
## people 0.0444745226
## perfect 0.3690445747
## performance 0.0988212869
## performances 0.2597152955
## person .
## peter .
## picture 0.0613747699
## piece .
## place .
## planet .
## play .
## played .
## playing .
## plays .
## plot -0.1251739619
## point -0.0469902894
## police .
## poor -0.2681581383
## power .
## premise .
## pretty .
## problem .
## problems -0.0871712779
## production .
## put .
## question .
## quickly .
## real .
## reason -0.3232393465
## recent .
## relationship 0.0223936704
## release .
## released .
## rest .
## result .
## review .
## robert .
## robin .
## role .
## roles .
## romantic .
## room .
## run .
## running .
## ryan .
## s .
## save -0.0697944412
## scene .
## scenes .
## school .
## science .
## score .
## scream .
## screen .
## screenplay .
## script -0.1872864425
## sense .
## sequel -0.0029311159
## sequence .
## sequences .
## series -0.0058379131
## set .
## sets 0.0291215221
## sex .
## ship .
## short .
## shot .
## show .
## shows .
## side 0.0808195600
## simple 0.1823879627
## simply .
## single .
## small 0.1527627762
## smith .
## son .
## sort .
## sound .
## space .
## special .
## star 0.0311229001
## stars .
## start .
## starts .
## stop .
## story .
## strong 0.0942661295
## stuff .
## stupid -0.3836426608
## style .
## success .
## summer .
## supporting .
## supposed -0.4061896018
## surprise .
## takes 0.0926324573
## taking .
## tale 0.1894671740
## talent -0.1408489475
## talk .
## team .
## television .
## tells .
## theater .
## thing .
## things .
## thought .
## thriller -0.0021342024
## time .
## times 0.0664177078
## title .
## told 0.0006993311
## tom 0.0223517653
## top .
## town .
## true 0.2836383810
## turn .
## turns .
## tv -0.0604732918
## van .
## ve .
## version .
## video .
## viewer .
## violence .
## voice .
## war 0.0985905568
## watch .
## watching .
## white .
## wife .
## wild .
## william .
## williams .
## woman .
## women .
## wonderful 0.2088641961
## word .
## words .
## work .
## works 0.0617171703
## world 0.0858371508
## worse -0.2042089577
## worst -0.6509248015
## worth .
## written .
## wrong .
## year .
## years .
## york .
## young .
Predicts <- predict(mod.train, newx = testX, type = "class")
table(Predicts, testY)
## testY
## Predicts 0 1
## 0 86 19
## 1 14 81
select <- select[-1] # remove an intercept
vars <- colnames(dataMatrix200)[which(select != 0)] #non-zero variables
length(vars)
## [1] 107
dataMatrix.final <- data.frame(trainX[, vars])
dim(dataMatrix.final)
## [1] 1800 107
#model#7
modelFinal <- glm(trainY ~ ., family = "binomial", data = dataMatrix.final)
summary(modelFinal)
##
## Call:
## glm(formula = trainY ~ ., family = "binomial", data = dataMatrix.final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.44551 -0.93430 -0.02174 0.93861 2.27317
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.199012 0.138200 -1.440 0.149858
## alien 0.054292 0.080455 0.675 0.499796
## aliens 0.275352 0.150445 1.830 0.067213 .
## america 0.366973 0.142168 2.581 0.009844 **
## american 0.288563 0.080382 3.590 0.000331 ***
## apparently -0.599993 0.186880 -3.211 0.001325 **
## art 0.089016 0.128015 0.695 0.486834
## attempt -0.635794 0.160444 -3.963 7.41e-05 ***
## attempts -0.666058 0.176296 -3.778 0.000158 ***
## back 0.091825 0.073033 1.257 0.208642
## based 0.096196 0.134582 0.715 0.474749
## battle 0.018307 0.123743 0.148 0.882384
## big -0.132633 0.064315 -2.062 0.039184 *
## book -0.027429 0.097006 -0.283 0.777364
## bring 0.159462 0.165424 0.964 0.335066
## chance -0.065665 0.160533 -0.409 0.682508
## characters 0.019989 0.045310 0.441 0.659088
## city 0.161851 0.083724 1.933 0.053218 .
## crew -0.008190 0.116631 -0.070 0.944018
## direction 0.165895 0.155520 1.067 0.286103
## director -0.233017 0.074829 -3.114 0.001846 **
## earth -0.102256 0.092402 -1.107 0.268451
## end 0.028070 0.071424 0.393 0.694314
## ends -0.341319 0.166398 -2.051 0.040246 *
## enjoy 0.536041 0.171966 3.117 0.001826 **
## events 0.152169 0.166181 0.916 0.359833
## expect 0.172036 0.164097 1.048 0.294462
## experience 0.319268 0.169386 1.885 0.059449 .
## fall -0.017749 0.169519 -0.105 0.916614
## falls -0.443775 0.158544 -2.799 0.005125 **
## fans 0.125694 0.124135 1.013 0.311273
## father 0.255795 0.100738 2.539 0.011110 *
## feeling 0.066380 0.142251 0.467 0.640757
## feels 0.188157 0.169672 1.109 0.267454
## fight -0.037530 0.095468 -0.393 0.694238
## film 0.019846 0.017510 1.133 0.257029
## forced -0.051971 0.162322 -0.320 0.748839
## friend 0.064855 0.109583 0.592 0.553965
## full 0.201135 0.128213 1.569 0.116704
## funny -0.002160 0.062306 -0.035 0.972349
## girl -0.148484 0.096382 -1.541 0.123420
## girlfriend -0.051128 0.150351 -0.340 0.733811
## girls -0.151862 0.070542 -2.153 0.031335 *
## give -0.343702 0.108216 -3.176 0.001493 **
## god 0.069978 0.116540 0.600 0.548199
## good 0.118944 0.043412 2.740 0.006147 **
## group 0.161459 0.108088 1.494 0.135235
## high 0.065727 0.078556 0.837 0.402772
## house -0.056522 0.093221 -0.606 0.544299
## husband 0.213475 0.121556 1.756 0.079057 .
## jack -0.026140 0.083013 -0.315 0.752840
## james -0.008448 0.094801 -0.089 0.928992
## john 0.011248 0.060001 0.187 0.851299
## king 0.019128 0.091837 0.208 0.835010
## lead -0.008630 0.157485 -0.055 0.956300
## leads 0.569816 0.166065 3.431 0.000601 ***
## level -0.208587 0.155512 -1.341 0.179824
## lot 0.071825 0.089649 0.801 0.423029
## major 0.107003 0.148084 0.723 0.469934
## make -0.122044 0.060857 -2.005 0.044918 *
## manages 0.182251 0.151846 1.200 0.230047
## michael -0.082125 0.085826 -0.957 0.338629
## mind 0.153922 0.117952 1.305 0.191910
## mother 0.135238 0.093154 1.452 0.146563
## movie -0.099361 0.021182 -4.691 2.72e-06 ***
## music 0.088788 0.089325 0.994 0.320231
## number -0.002901 0.150973 -0.019 0.984670
## paul -0.235150 0.122498 -1.920 0.054905 .
## people 0.103857 0.056928 1.824 0.068099 .
## perfect 0.768431 0.146716 5.238 1.63e-07 ***
## performance 0.235513 0.075774 3.108 0.001883 **
## peter 0.042390 0.129661 0.327 0.743722
## plays 0.117290 0.090468 1.296 0.194809
## plot -0.325863 0.058812 -5.541 3.01e-08 ***
## police 0.064496 0.134616 0.479 0.631861
## pretty -0.092158 0.097302 -0.947 0.343573
## problem -0.266619 0.120692 -2.209 0.027168 *
## question -0.093072 0.140720 -0.661 0.508357
## real -0.041694 0.072464 -0.575 0.565041
## s 0.112185 0.099525 1.127 0.259656
## screenplay -0.176891 0.140786 -1.256 0.208953
## script -0.398583 0.083151 -4.793 1.64e-06 ***
## sense 0.169002 0.093646 1.805 0.071123 .
## sequences -0.090904 0.131444 -0.692 0.489200
## set 0.045142 0.101601 0.444 0.656824
## show -0.021330 0.070780 -0.301 0.763139
## side 0.281704 0.132089 2.133 0.032950 *
## single -0.166050 0.171337 -0.969 0.332473
## special 0.034331 0.078835 0.435 0.663214
## story 0.062341 0.041341 1.508 0.131560
## stuff -0.131977 0.161170 -0.819 0.412862
## supporting 0.469664 0.161872 2.901 0.003714 **
## surprise 0.387793 0.167825 2.311 0.020850 *
## taking 0.064651 0.157306 0.411 0.681080
## tale 0.544714 0.172431 3.159 0.001583 **
## thought -0.119110 0.122439 -0.973 0.330646
## time -0.011264 0.048214 -0.234 0.815273
## town 0.093534 0.076986 1.215 0.224388
## turns -0.066507 0.121333 -0.548 0.583596
## video -0.154207 0.118641 -1.300 0.193677
## violence 0.066716 0.122242 0.546 0.585224
## voice 0.243777 0.131545 1.853 0.063855 .
## watch 0.167260 0.093673 1.786 0.074167 .
## women -0.094339 0.127306 -0.741 0.458671
## work -0.108142 0.076697 -1.410 0.158544
## works 0.449952 0.140999 3.191 0.001417 **
## world 0.248925 0.062929 3.956 7.63e-05 ***
## worse -0.691686 0.168482 -4.105 4.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2495.3 on 1799 degrees of freedom
## Residual deviance: 1973.2 on 1692 degrees of freedom
## AIC: 2189.2
##
## Number of Fisher Scoring iterations: 5
Predict.Final <- predict(modelFinal, newdata = data.frame(testX), type = "response")
Predict.Finalclass <- (Predict.Final > 0.5)
table(Predict.Finalclass, testY)
## testY
## Predict.Finalclass 0 1
## FALSE 74 29
## TRUE 26 71