# Main Library
library(ggplot2)
# Text Mining Library
library("tm") # for text mining
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library("SnowballC") # for text stemming
library("wordcloud") # word-cloud generator
## Loading required package: RColorBrewer
library("RColorBrewer") # color palettes
Workflow:
Getting Data (csv) -> preprocessing -> eda (exploratory data analysis) -> data visualization using ggplot2
Formulate the question: What is the most frequently used word in movie titles?
`mv` is a data frame with 4803 observations and 20 variables
# Read the TMDB movie metadata; text columns stay plain character vectors.
mv <- read.csv(
  file = "data_input/tmdb_5000_movies.csv",
  stringsAsFactors = FALSE
)
# str(mv)
# Only the movie titles are needed for the word-frequency analysis.
title <- mv[, "title"]
head(title, 10)
## [1] "Avatar"
## [2] "Pirates of the Caribbean: At World's End"
## [3] "Spectre"
## [4] "The Dark Knight Rises"
## [5] "John Carter"
## [6] "Spider-Man 3"
## [7] "Tangled"
## [8] "Avengers: Age of Ultron"
## [9] "Harry Potter and the Half-Blood Prince"
## [10] "Batman v Superman: Dawn of Justice"
# The titles are extracted, so release the full data frame to save memory.
rm(list = "mv")
Load the data as a corpus
# Wrap the character vector of titles as a tm corpus, then show its summary.
docs <- Corpus(
  VectorSource(title)
)
docs
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 4803
Inspect the content of the document
# inspect(docs)
Transformations are performed with the tm_map() function, for example to remove special characters from the text.
Replacing `:` and `-` with an empty string:
# Transformer for tm_map(): removes every match of `pattern` from the text.
# NOTE: despite the name, matches are replaced with "" (deleted), not with a
# space. `pattern` is handed to gsub() as-is, so it is treated as a regex.
toSpace <- content_transformer(
  function(x, pattern) {
    gsub(pattern = pattern, replacement = "", x = x)
  }
)
# Text-cleaning pipeline: each step rewrites `docs` in place, so the order of
# the steps below matters.
# Delete ":" from every title (toSpace substitutes "" for the pattern).
docs <- tm_map(docs, toSpace, ":")
## Warning in tm_map.SimpleCorpus(docs, toSpace, ":"): transformation drops
## documents
# Delete "-" the same way, which joins hyphenated words into a single token.
docs <- tm_map(docs, toSpace, "-")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "-"): transformation drops
## documents
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove a small hand-picked list of common stopwords.
# NOTE(review): the list is all lower case, so this presumably relies on the
# lower-casing step above having run first -- confirm before reordering.
docs <- tm_map(docs, removeWords, c("the", "at", "of", "on",
"and", "vs", "an"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, c("the", "at", "of",
## "on", : transformation drops documents
# Remove punctuation
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
# Eliminate extra white space left behind by the removals above
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
A term-document matrix is a table containing the frequency of the words. Row names are words (terms) and column names are documents. The function TermDocumentMatrix()
from the tm text mining package can be used as follows:
# Build the term-document matrix and expand it to an ordinary dense matrix.
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
# Row sums give each word's total count over all titles; rank them from the
# most to the least frequent.
v <- rowSums(m)
v <- sort(v, decreasing = TRUE)
# One row per word, with the word itself and its total frequency.
d <- data.frame(word = names(v), freq = v)
head(d, n = 20)
## word freq
## man man 66
## love love 56
## with with 51
## movie movie 43
## you you 41
## for for 41
## dead dead 40
## last last 40
## from from 36
## good good 32
## house house 32
## story story 30
## big big 30
## day day 30
## american american 30
## black black 27
## life life 27
## girl girl 27
## time time 26
## all all 26
# Word cloud of at most 50 of the title words in `d`, sized by frequency.
# wordcloud() picks word positions and which words to rotate at random, so
# fix the RNG seed first to get the same figure on every run.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 50, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Keep the first 20 rows of `d` (already sorted by descending frequency) and
# order the factor levels of `word` by count so the bars appear ranked.
top20 <- head(d, n = 20)
top20$word <- reorder(top20$word, top20$freq)
# Horizontal bar chart with the count printed as a label on each bar.
# geom_col() is the shorthand for geom_bar(stat = "identity"); geom_label()
# inherits the fill mapping from the top-level aes().
ggplot(
  data = top20,
  mapping = aes(x = word, y = freq, fill = word, label = freq)
) +
  geom_col(show.legend = FALSE) +
  geom_label(colour = "white", fontface = "bold", show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Top 20 Most Used Words in Movie Title",
    x = "Word",
    y = "Word Count"
  )
The most used word in movie titles was "man".
Top 5 words
What did I learn?
- tm library
- wordcloud library
- geom_col() + coord_flip() + labs() + geom_label()