# Echo the source of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)
# Commented-out alternative: load the reviews from the tab-separated file.
#dsMovies <- read.table(file = 'movie-reviews-dataset.tsv', sep = '\t', header = TRUE)
# Read the reviews from CSV, keeping text columns as character vectors.
dsMovies <- read.csv(
  file = "dsMovies.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Per-column summary of the loaded data frame.
summary(dsMovies)
## type message
## Length:7663 Length:7663
## Class :character Class :character
## Mode :character Mode :character
# Compact view of the column types and first values.
str(dsMovies)
## 'data.frame': 7663 obs. of 2 variables:
## $ type : chr "negative" "negative" "negative" "negative" ...
## $ message: chr "it's a mindless action flick with a twist -- far better suited to video-viewing than the multiplex . " "after a while , the only way for a reasonably intelligent person to get through the country bears is to ponder how a whole segm"| __truncated__ "we get light showers of emotion a couple of times , but then -- strangely -- these wane to an inconsistent and ultimately unsat"| __truncated__ "summer's far too fleeting to squander on offal like this . " ...
# Preview the first rows of the data set (label and review text).
head(dsMovies)
## type
## 1 negative
## 2 negative
## 3 negative
## 4 negative
## 5 negative
## 6 negative
## message
## 1 it's a mindless action flick with a twist -- far better suited to video-viewing than the multiplex .
## 2 after a while , the only way for a reasonably intelligent person to get through the country bears is to ponder how a whole segment of pop-music history has been allowed to get wet , fuzzy and sticky .
## 3 we get light showers of emotion a couple of times , but then -- strangely -- these wane to an inconsistent and ultimately unsatisfying drizzle .
## 4 summer's far too fleeting to squander on offal like this .
## 5 the film is grossly contradictory in conveying its social message , if indeed there is one .
## 6 often lingers just as long on the irrelevant as on the engaging , which gradually turns what time is it there ? into how long is this movie ?
# Preview the last rows of the data set (label and review text).
tail(dsMovies)
## type
## 7658 positive
## 7659 positive
## 7660 positive
## 7661 positive
## 7662 positive
## 7663 positive
## message
## 7658 [has] an immediacy and an intimacy that sucks you in and dares you not to believe it's all true .
## 7659 it treats ana's journey with honesty that is tragically rare in the depiction of young women in film .
## 7660 captivates as it shows excess in business and pleasure , allowing us to find the small , human moments , and leaving off with a grand whimper .
## 7661 a refreshingly realistic , affectation-free coming-of-age tale .
## 7662 how good this film might be , depends if you believe that the shocking conclusion is too much of a plunge or not .
## 7663 great fun both for sports aficionados and for ordinary louts whose idea of exercise is climbing the steps of a stadium-seat megaplex .
# Treat the sentiment label as a categorical variable.
dsMovies[["type"]] <- factor(dsMovies[["type"]])
# Number of reviews per sentiment label.
table(dsMovies$type)
##
## negative positive
## 2332 5331
# Horizontal bar chart of the class counts.
barplot(
  table(dsMovies$type),
  horiz = TRUE,
  col = "#990066",
  xlab = "Quantity",
  ylab = "Type"
)
library("tm")
## Loading required package: NLP
# Build a tm corpus with one document per review message.
dsMoviesCorpus <- VCorpus(x = VectorSource(x = dsMovies$message))
# Corpus-level summary (metadata and document count).
print(dsMoviesCorpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 7663
# Summary of the first two documents.
inspect(dsMoviesCorpus[c(1, 2)])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 2
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 101
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 201
library("tm")
# Printing a single document only shows its metadata/size summary ...
print(dsMoviesCorpus[[1]])
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 101
# ... so coerce it to character to read the actual review text.
as.character(dsMoviesCorpus[[1]])
## [1] "it's a mindless action flick with a twist -- far better suited to video-viewing than the multiplex . "
library("tm")
# Text of the first five reviews, as a list keyed by document id.
lapply(dsMoviesCorpus[1:5], function(doc) as.character(doc))
## $`1`
## [1] "it's a mindless action flick with a twist -- far better suited to video-viewing than the multiplex . "
##
## $`2`
## [1] "after a while , the only way for a reasonably intelligent person to get through the country bears is to ponder how a whole segment of pop-music history has been allowed to get wet , fuzzy and sticky . "
##
## $`3`
## [1] "we get light showers of emotion a couple of times , but then -- strangely -- these wane to an inconsistent and ultimately unsatisfying drizzle . "
##
## $`4`
## [1] "summer's far too fleeting to squander on offal like this . "
##
## $`5`
## [1] "the film is grossly contradictory in conveying its social message , if indeed there is one . "
# Step 1: lower-case every document. tolower() is a plain character function,
# so it is wrapped with content_transformer() before being handed to tm_map().
dsMovies_clean <- tm_map(dsMoviesCorpus, content_transformer(tolower))
# Before / after comparison for document 1.
as.character(dsMoviesCorpus[[1]])
## [1] "it's a mindless action flick with a twist -- far better suited to video-viewing than the multiplex . "
as.character(dsMovies_clean[[1]])
## [1] "it's a mindless action flick with a twist -- far better suited to video-viewing than the multiplex . "
# Step 2: strip digits.
dsMovies_clean <- tm_map(dsMovies_clean, removeNumbers)
# Before / after comparison for document 32, which contains a year.
as.character(dsMoviesCorpus[[32]])
## [1] "one of the worst films of 2002 . "
as.character(dsMovies_clean[[32]])
## [1] "one of the worst films of . "
# Uncomment to list the stop words that are about to be removed.
#stopwords()
#stopwords('spanish')
# Step 3: remove stop words, first the Spanish list, then the default
# (English) list.
# NOTE(review): the sample reviews shown in this file are English; confirm that
# removing the Spanish list is intended.
dsMovies_clean <- tm_map(dsMovies_clean, removeWords, stopwords(kind = "spanish"))
dsMovies_clean <- tm_map(dsMovies_clean, removeWords, stopwords(kind = "en"))
# Before / after comparison for document 6.
as.character(dsMoviesCorpus[[6]])
## [1] "often lingers just as long on the irrelevant as on the engaging , which gradually turns what time is it there ? into how long is this movie ? "
as.character(dsMovies_clean[[6]])
## [1] "often lingers just long irrelevant engaging , gradually turns time ? long movie ? "
# Before / after comparison for document 1200 (hyphenated expression).
as.character(dsMoviesCorpus[[1200]])
## [1] "fairly run-of-the-mill . "
as.character(dsMovies_clean[[1200]])
## [1] "fairly run---mill . "
# Step 4: drop punctuation characters.
dsMovies_clean <- tm_map(dsMovies_clean, removePunctuation)
# Before / after comparison for document 6.
as.character(dsMoviesCorpus[[6]])
## [1] "often lingers just as long on the irrelevant as on the engaging , which gradually turns what time is it there ? into how long is this movie ? "
as.character(dsMovies_clean[[6]])
## [1] "often lingers just long irrelevant engaging gradually turns time long movie "
# Step 5: collapse the runs of blanks left behind by the previous steps.
dsMovies_clean <- tm_map(dsMovies_clean, stripWhitespace)
# Before / after comparison for document 452.
as.character(dsMoviesCorpus[[452]])
## [1] "jarecki and gibney do find enough material to bring kissinger's record into question and explain how the diplomat's tweaked version of statecraft may have cost thousands and possibly millions of lives . "
as.character(dsMovies_clean[[452]])
## [1] "jarecki gibney find enough material bring kissingers record question explain diplomats tweaked version statecraft may cost thousands possibly millions lives "
# Document-term matrix of the cleaned corpus: one row per review, one column
# per term.
dsMovies_dtm <- DocumentTermMatrix(x = dsMovies_clean)
# Inspect the sparse-matrix structure (dimensions, terms, weighting).
str(dsMovies_dtm)
## List of 6
## $ i : int [1:78753] 1 1 1 1 1 1 1 1 1 2 ...
## $ j : int [1:78753] 136 1327 5259 5569 9238 9551 14253 15273 15831 402 ...
## $ v : num [1:78753] 1 1 1 1 1 1 1 1 1 1 ...
## $ nrow : int 7663
## $ ncol : int 16567
## $ dimnames:List of 2
## ..$ Docs : chr [1:7663] "1" "2" "3" "4" ...
## ..$ Terms: chr [1:16567] "<c2><bd>" "<c3><a9>lan" "<e2><80><93>" "<e2><80><94>" ...
## - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
## - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# Randomly split the documents into a training set (5364 rows) and a test set
# (the remaining rows).
#
# The rows of dsMovies are not in random order (head() shows only negative
# reviews, tail() only positive ones), so taking the first 5364 rows as the
# training set yields train and test sets with different class proportions.
# A seeded random sample keeps the same set sizes, draws both sets from the
# same distribution and keeps the split reproducible.
#
# NOTE: the `##` outputs recorded further down in this file were produced with
# the earlier sequential split and will change when the script is re-run.
set.seed(123)
dsMovies_train_rows <- sort(sample(nrow(dsMovies), 5364))
dsMovies_test_rows <- setdiff(seq_len(nrow(dsMovies)), dsMovies_train_rows)
# The DTM rows and the label vector are indexed with the same row numbers so
# documents and labels stay aligned.
dsMovies_dtm_train <- dsMovies_dtm[dsMovies_train_rows, ]
dsMovies_dtm_test <- dsMovies_dtm[dsMovies_test_rows, ]
dsMovies_train_labels <- dsMovies$type[dsMovies_train_rows]
dsMovies_test_labels <- dsMovies$type[dsMovies_test_rows]
# Share of each sentiment class among the training labels.
prop.table(table(dsMovies_train_labels))
## dsMovies_train_labels
## negative positive
## 0.3296048 0.6703952
# Share of each sentiment class among the test labels; compare with the
# training proportions to check that both sets are similarly balanced.
prop.table(table(dsMovies_test_labels))
## dsMovies_test_labels
## negative positive
## 0.2453241 0.7546759
library("wordcloud")
## Loading required package: RColorBrewer
# Word cloud of the cleaned corpus, limited to terms that occur at least
# 40 times and placed in random order.
wordcloud(
  dsMovies_clean,
  random.order = TRUE,
  min.freq = 40
)
## Warning in wordcloud(dsMovies_clean, min.freq = 40, random.order = TRUE):
## film could not be fit on page. It will not be plotted.
# Split the raw reviews by sentiment label once; both clouds below reuse
# these subsets (they were previously computed twice).
bad <- subset(dsMovies, type == "negative")
good <- subset(dsMovies, type == "positive")
# One word cloud per sentiment, each capped at 80 words and drawn with the
# same font-size range so the two plots are comparable.
wordcloud(bad$message, max.words = 80, scale = c(5, 0.5))
wordcloud(good$message, max.words = 80, scale = c(5, 0.5))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(tidyr)
# Bing sentiment lexicon; joined against the review tokens further below.
bing <- get_sentiments(lexicon = "bing")
# Tokenise the `message` column into one row per word (column `word`).
tidy_dsMovies <- unnest_tokens(dsMovies, output = word, input = message)
# Load tidytext's stop-word table and drop every token found in it.
data("stop_words")
dsMoviesCl <- anti_join(tidy_dsMovies, stop_words)
## Joining, by = "word"
# Word frequencies over the stop-word-free tokens, most frequent first.
res <- count(dsMoviesCl, word, sort = TRUE)
# Vertical bar chart of the ten most frequent words.
barplot(
  res$n[1:10],
  names.arg = res$word[1:10],
  xlab = "Word",
  ylab = "Quantity",
  horiz = FALSE,
  col = "#FF6100",
  cex.names = 0.7
)
# Same ten rows as a table.
res[1:10, ]
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 film 1087
## 2 movie 874
## 3 story 357
## 4 comedy 289
## 5 time 261
## 6 characters 232
## 7 funny 230
## 8 director 208
## 9 life 208
## 10 love 182
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Count how often each lexicon word occurs in the reviews, keeping its
# sentiment label; only tokens present in the bing lexicon survive the join.
bing_word_counts <- tidy_dsMovies %>%
  inner_join(bing) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
# Plot words seen more than 50 times. Negative words get a negative count so
# their bars point downwards, and words are ordered by that signed count.
bing_word_counts %>%
  filter(n > 50) %>%
  mutate(
    n = ifelse(sentiment == "negative", -n, n),
    word = reorder(word, n)
  ) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")
# Terms that occur at least 20 times in the training documents.
dsMovies_freq_words <- findFreqTerms(dsMovies_dtm_train, lowfreq = 20)
str(dsMovies_freq_words)
## chr [1:429] "acted" "acting" "action" "actor" "actors" ...
# Restrict both matrices to those columns; the vocabulary is chosen from the
# training set only and then applied unchanged to the test set.
dsMovies_dtm_freq_train <- dsMovies_dtm_train[, dsMovies_freq_words]
dsMovies_dtm_freq_test <- dsMovies_dtm_test[, dsMovies_freq_words]
# Summary of the reduced training matrix.
dsMovies_dtm_freq_train
## <<DocumentTermMatrix (documents: 5364, terms: 429)>>
## Non-/sparse entries: 21067/2280089
## Sparsity : 99%
## Maximal term length: 14
## Weighting : term frequency (tf)
# Summary of the reduced test matrix.
dsMovies_dtm_freq_test
## <<DocumentTermMatrix (documents: 2299, terms: 429)>>
## Non-/sparse entries: 8863/977408
## Sparsity : 99%
## Maximal term length: 14
## Weighting : term frequency (tf)
#' Convert term counts to a categorical presence indicator.
#'
#' @param x A numeric vector (or matrix) of term counts.
#' @return A character object of the same shape as `x`, holding "Yes" where
#'   the count is greater than zero and "No" otherwise.
converts_counts <- function(x) {
  # Return the result directly: assigning it to `x` first was a dead store
  # that also made the function return its value invisibly.
  ifelse(x > 0, "Yes", "No")
}
# Recode every term column (MARGIN = 2) from counts to "Yes"/"No" indicators.
dsMovies_train <- apply(
  X = dsMovies_dtm_freq_train,
  MARGIN = 2,
  FUN = converts_counts
)
dsMovies_test <- apply(
  X = dsMovies_dtm_freq_test,
  MARGIN = 2,
  FUN = converts_counts
)
library(e1071)
# Fit a naive Bayes classifier on the training indicators and labels.
dsMovies_classifier <- naiveBayes(x = dsMovies_train, y = dsMovies_train_labels)
# Predict a label for every test document.
dsMovies_text_pred <- predict(
  object = dsMovies_classifier,
  newdata = dsMovies_test
)
# Number of test documents predicted for each class.
table(dsMovies_text_pred)
## dsMovies_text_pred
## negative positive
## 473 1826
library(gmodels)
# Disabled alternative: a richer cross-tabulation of predicted vs actual labels.
#CrossTable(dsMovies_text_pred,dsMovies_test_labels,prop.chisq = FALSE,prop.t = FALSE, dnn = c('predicted','actual'))
# Confusion matrix: rows are the predicted labels, columns the true test labels.
table(dsMovies_text_pred,dsMovies_test_labels)
## dsMovies_test_labels
## dsMovies_text_pred negative positive
## negative 230 243
## positive 334 1492
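# Without smoothing, a term value never observed with class y in the training
# data gets \(P(X_i = x_i | Y = y) = 0\), which zeroes the whole posterior for
# that class. The classifier is therefore refit with Laplace smoothing.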
library(e1071)
# Refit the classifier with a Laplace smoothing value of 1.
dsMovies_classifier <- naiveBayes(
  x = dsMovies_train,
  y = dsMovies_train_labels,
  laplace = 1
)
# Predict the test documents again with the smoothed model.
dsMovies_text_pred <- predict(
  object = dsMovies_classifier,
  newdata = dsMovies_test
)
# Confusion matrix: rows are the predicted labels, columns the true test labels.
table(dsMovies_text_pred,dsMovies_test_labels)
## dsMovies_test_labels
## dsMovies_text_pred negative positive
## negative 222 237
## positive 342 1498