1. Libraries

# Main Library
library(ggplot2)

# Text Mining Library
library("tm") # for text mining
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library("SnowballC") # for text stemming
library("wordcloud") # word-cloud generator 
## Loading required package: RColorBrewer
library("RColorBrewer") # color palettes

2. Kaggle Dataset

TMDB 5000 Movie Dataset

3. Pre-Processing Data

Workflow:

Getting Data (csv) -> preprocessing -> eda (exploratory data analysis) -> data visualization using ggplot2

Formulate question: What is the most frequently used word in movie titles?

3.1 Pre-processing (subset ‘title’ from movie data frame)

mv is a dataframe with 4803 observations and 20 variables

# Read the TMDB movie metadata, keeping text columns as character vectors
# rather than factors.
mv <- read.csv(
  file = "data_input/tmdb_5000_movies.csv",
  stringsAsFactors = FALSE
)
# str(mv)

# Pull the title column out as a plain character vector and preview it.
title <- mv[["title"]]
head(title, 10)
##  [1] "Avatar"                                  
##  [2] "Pirates of the Caribbean: At World's End"
##  [3] "Spectre"                                 
##  [4] "The Dark Knight Rises"                   
##  [5] "John Carter"                             
##  [6] "Spider-Man 3"                            
##  [7] "Tangled"                                 
##  [8] "Avengers: Age of Ultron"                 
##  [9] "Harry Potter and the Half-Blood Prince"  
## [10] "Batman v Superman: Dawn of Justice"
# Only the titles are used from here on; drop the full data frame to free
# memory.
rm(list = "mv")

3.2 Exploratory Data Analysis

Load the data as a corpus

# Wrap the title vector as a source and build the corpus from it.
docs <- Corpus(
  VectorSource(title)
)
# Show the corpus summary.
print(docs)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 4803

Inspect the content of the document

# inspect(docs)

3.2.1 Text transformation

Transformations are performed using the tm_map() function, for example to replace special characters in the text.

Replacing : and - with an empty string (i.e. removing them):

# Content transformer that deletes every match of `pattern` from a document.
# NOTE: despite its name, toSpace substitutes an empty string, not a space,
# so the text on either side of a match is joined together.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern = pattern, replacement = "", x = x)
})
# Strip colons from the titles
docs <- tm_map(docs, toSpace, ":")
## Warning in tm_map.SimpleCorpus(docs, toSpace, ":"): transformation drops
## documents
# Strip hyphens from the titles
docs <- tm_map(docs, toSpace, "-")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "-"): transformation drops
## documents

3.2.2 Cleaning the Text

# Text clean-up pipeline: every step reassigns `docs` with the result of one
# tm_map() transformation.
# Lower-casing runs before stopword removal so that the lower-case stopword
# list below also matches words that were capitalised in the titles.
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove a small hand-picked set of stopwords (only the words listed below,
# not tm's full stopwords("english") list)
docs <- tm_map(docs, removeWords, c("the", "at", "of", "on", 
                                    "and", "vs", "an"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, c("the", "at", "of",
## "on", : transformation drops documents
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
# Collapse runs of whitespace left behind by the removals above
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents

3.2.3 Build a term-document matrix

A term-document matrix is a table containing the frequency of the words. Row names are words (terms) and column names are documents. The function TermDocumentMatrix() from the tm text mining package can be used as follows:

# Build the term-document matrix: one row per term, one column per title.
dtm <- TermDocumentMatrix(docs)
# Convert to a dense matrix so base rowSums() can be used; each row total is
# that term's overall frequency across all titles.
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
# Pin `word` to character so the column type does not depend on the R
# version's stringsAsFactors default (factor before R 4.0.0), matching the
# explicit setting used when the CSV is read.
d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)
head(d, 20)
##              word freq
## man           man   66
## love         love   56
## with         with   51
## movie       movie   43
## you           you   41
## for           for   41
## dead         dead   40
## last         last   40
## from         from   36
## good         good   32
## house       house   32
## story       story   30
## big           big   30
## day           day   30
## american american   30
## black       black   27
## life         life   27
## girl         girl   27
## time         time   26
## all           all   26

4. Visualization - Word Cloud + ggplot2

# Fix the RNG seed: wordcloud() places and rotates words randomly, so without
# a seed the figure changes on every run.
set.seed(1234)
# Draw at most the 50 most frequent words. random.order = FALSE plots them in
# decreasing frequency, and rot.per = 0.35 is the proportion of words drawn
# rotated by 90 degrees.
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 50, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

# Keep the 20 most frequent words (d is already sorted by descending freq)
# and turn `word` into a factor ordered by frequency so the bars are ranked.
top20 <- head(d, 20)
top20$word <- with(top20, reorder(word, freq))

# Horizontal bar chart with a count label on each bar; the label layer
# inherits the fill mapping from the top-level aes().
ggplot(data = top20,
       mapping = aes(x = word, y = freq, fill = word, label = freq)) +
  geom_col(show.legend = FALSE) +
  geom_label(colour = "white", fontface = "bold", show.legend = FALSE) +
  coord_flip() +
  labs(title = "Top 20 Most Used Words in Movie Title", x = "Word", y = "Word Count")

5. Final Thoughts

The most used word in movie titles was "man".

Top 5 words

  1. man
  2. love
  3. with
  4. movie
  5. you

What Did I Learn?

References

sthda.com rpubs.com