0. Set Up Working Directory in R:
getwd()
## [1] "C:/Users/outan/Desktop"
setwd("C:/Users/Outan/Desktop")
1. Import a Text CSV Data File and Build a Corpus in R:
library(tm)
## Warning: package 'tm' was built under R version 4.0.2
library(SnowballC)
Sample_CSV <- read.csv("Text_Data_Starcraft.csv", stringsAsFactors = FALSE, header = FALSE)
Corpus <- Corpus(VectorSource(Sample_CSV$V1))
str(Sample_CSV)
## 'data.frame': 7 obs. of 1 variable:
## $ V1: chr "1. Terran Marine?" "Jacked up and good to go!" "2. Terran Medic: " "Please state the nature of the medical emergency " ...
inspect(Corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7
##
## [1] 1. Terran Marine?
## [2] Jacked up and good to go!
## [3] 2. Terran Medic:
## [4] Please state the nature of the medical emergency
## [5] 3. Terran Battle Cruiser!!!!!!
## [6] "Good Day Commander"
## [7] 4. Terran? very very good, but very hard, Protoss ? Good and hard, Zerg ?Okay okay, but hard
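If the CSV file is not available, the same corpus can be rebuilt from an in-memory character vector holding the seven documents shown by inspect() above (Sample_Text and Corpus_From_Vector are illustrative names, not part of the original script):
Sample_Text <- c("1. Terran Marine?",
                 "Jacked up and good to go!",
                 "2. Terran Medic: ",
                 "Please state the nature of the medical emergency ",
                 "3. Terran Battle Cruiser!!!!!!",
                 "\"Good Day Commander\"",
                 "4. Terran? very very good, but very hard, Protoss ? Good and hard, Zerg ?Okay okay, but hard")
Corpus_From_Vector <- Corpus(VectorSource(Sample_Text))  # same structure as the corpus built from the CSV
inspect(Corpus_From_Vector)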
2. Text Data Cleaning in R: Breaking Text Data into Tokens, i.e., Words, Numbers, Phrases, Symbols, or Other Elements.
Step 1. Convert Case: Upper to Lower.
Corpus <- tm_map(Corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(Corpus, content_transformer(tolower)):
## transformation drops documents
inspect(Corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7
##
## [1] 1. terran marine?
## [2] jacked up and good to go!
## [3] 2. terran medic:
## [4] please state the nature of the medical emergency
## [5] 3. terran battle cruiser!!!!!!
## [6] "good day commander"
## [7] 4. terran? very very good, but very hard, protoss ? good and hard, zerg ?okay okay, but hard
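The "transformation drops documents" warning is a known side effect of applying tm_map() to a SimpleCorpus and can generally be ignored; the inspect() output confirms that all seven documents are still present. content_transformer() can also wrap any custom string function. A small, hypothetical sketch (replace_question is not part of the original script) that would turn the stray question marks in document 7 into spaces:
replace_question <- content_transformer(function(x) gsub("\\?", " ", x))  # hypothetical helper: replace "?" with a space
# Corpus <- tm_map(Corpus, replace_question)  # optional; left commented out because Step 4 removes all punctuation anyway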
Step 2. Remove Numbers from Text Data.
Corpus <- tm_map(Corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(Corpus, removeNumbers): transformation drops
## documents
inspect(Corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7
##
## [1] . terran marine?
## [2] jacked up and good to go!
## [3] . terran medic:
## [4] please state the nature of the medical emergency
## [5] . terran battle cruiser!!!!!!
## [6] "good day commander"
## [7] . terran? very very good, but very hard, protoss ? good and hard, zerg ?okay okay, but hard
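removeNumbers() also works on a plain character vector, which makes it easy to preview a transformation before applying it to the whole corpus; the string below is document [5] from the output above:
removeNumbers("3. terran battle cruiser!!!!!!")  # returns ". terran battle cruiser!!!!!!", matching row [5] above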
Step 3. Strip Whitespace from Text Data: Multiple Spaces to a Single Space.
Corpus <- tm_map(Corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(Corpus, stripWhitespace): transformation drops
## documents
inspect(Corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7
##
## [1] . terran marine?
## [2] jacked up and good to go!
## [3] . terran medic:
## [4] please state the nature of the medical emergency
## [5] . terran battle cruiser!!!!!!
## [6] "good day commander"
## [7] . terran? very very good, but very hard, protoss ? good and hard, zerg ?okay okay, but hard
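stripWhitespace() can be previewed the same way on a single string (the extra spaces below are added for illustration):
stripWhitespace("good    day   commander")  # collapses each run of spaces to a single space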
Step 4. Remove All Punctuation Marks from Text Data.
Corpus <- tm_map(Corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(Corpus, removePunctuation): transformation drops
## documents
inspect(Corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7
##
## [1] terran marine
## [2] jacked up and good to go
## [3] terran medic
## [4] please state the nature of the medical emergency
## [5] terran battle cruiser
## [6] good day commander
## [7] terran very very good but very hard protoss good and hard zerg okay okay but hard
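removePunctuation() also accepts optional arguments; for instance, preserve_intra_word_dashes = TRUE keeps hyphens inside compound words while still dropping other punctuation. A minimal sketch on a made-up string (not taken from the data set):
removePunctuation("siege-mode online!", preserve_intra_word_dashes = TRUE)  # keeps the intra-word dash, removes the exclamation mark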
Step 5-1. Stopwords: Remove Standard English Stopwords from Text Data.
Corpus <- tm_map(Corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(Corpus, removeWords, stopwords("english")):
## transformation drops documents
inspect(Corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7
##
## [1] terran marine
## [2] jacked good go
## [3] terran medic
## [4] please state nature medical emergency
## [5] terran battle cruiser
## [6] good day commander
## [7] terran good hard protoss good hard zerg okay okay hard
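The words removed in this step come from tm's built-in English stopword list ("and", "to", "the", "of", and so on, which explains the changes visible above). The list can be examined directly:
head(stopwords("english"))    # first few entries of the built-in English stopword list
length(stopwords("english"))  # total number of entries in the list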
Step 5-2. Stopwords: Remove Additional Custom Words from Text Data.
st_word <- c(stopwords("en"), "medical", "nature", "cruiser", "marine", "jacked", "emergency", "medic", "please", "state", "commander", "day", "battle")
Corpus <- tm_map(Corpus, removeWords, st_word)
## Warning in tm_map.SimpleCorpus(Corpus, removeWords, st_word): transformation
## drops documents
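Step 5-2 does not re-inspect the corpus; a quick check confirms that the game-specific words in st_word were removed along with the standard stopwords:
inspect(Corpus)  # the words listed in st_word should no longer appear in any document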
Step 6. Stemming: Reduce Words to Their Root Form.
Corpus <- tm_map(Corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(Corpus, stemDocument): transformation drops
## documents
inspect(Corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 7
##
## [1] terran
## [2] good go
## [3] terran
## [4]
## [5] terran
## [6] good
## [7] terran good hard protoss good hard zerg okay okay hard
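stemDocument() uses the Porter stemming algorithm provided by the SnowballC package loaded at the start. SnowballC's wordStem() exposes the stemmer directly, which is a convenient way to preview how related word forms collapse to one stem (the example words are illustrative, not from the corpus):
wordStem(c("commanders", "commanding", "commanded"), language = "english")  # all three forms reduce to the same stem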
3. Text Data Matrix in R:
Expression 1. Term-Oriented (Word-Oriented) Matrix: the Term-Document Matrix (TDM).
tdm <- TermDocumentMatrix(Corpus)
inspect(tdm)
## <<TermDocumentMatrix (terms: 6, documents: 7)>>
## Non-/sparse entries: 11/31
## Sparsity : 74%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1 2 3 4 5 6 7
## good 0 1 0 0 0 1 2
## hard 0 0 0 0 0 0 3
## okay 0 0 0 0 0 0 2
## protoss 0 0 0 0 0 0 1
## terran 1 0 1 0 1 0 1
## zerg 0 0 0 0 0 0 1
tfidf_tdm <- weightTfIdf(tdm)
## Warning in weightTfIdf(tdm): empty document(s): 4
inspect(tfidf_tdm)
## <<TermDocumentMatrix (terms: 6, documents: 7)>>
## Non-/sparse entries: 11/31
## Sparsity : 74%
## Maximal term length: 7
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Docs
## Terms 1 2 3 4 5 6 7
## good 0.0000000 1.222392 0.0000000 0 0.0000000 1.222392 0.24447848
## hard 0.0000000 0.000000 0.0000000 0 0.0000000 0.000000 0.84220648
## okay 0.0000000 0.000000 0.0000000 0 0.0000000 0.000000 0.56147098
## protoss 0.0000000 0.000000 0.0000000 0 0.0000000 0.000000 0.28073549
## terran 0.8073549 0.000000 0.8073549 0 0.8073549 0.000000 0.08073549
## zerg 0.0000000 0.000000 0.0000000 0 0.0000000 0.000000 0.28073549
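With the term-document matrix built, tm provides helpers for quick exploration. findFreqTerms() lists terms that reach a minimum frequency, and converting the sparse matrix with as.matrix() allows ordinary arithmetic such as summing term counts:
findFreqTerms(tdm, lowfreq = 2)                   # terms that appear at least twice across all documents
sort(rowSums(as.matrix(tdm)), decreasing = TRUE)  # total frequency of each term (terms are rows in a TDM)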
Expression 2. Document-Oriented Matrix: the Document-Term Matrix (DTM).
dtm <- DocumentTermMatrix(Corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 7, terms: 6)>>
## Non-/sparse entries: 11/31
## Sparsity : 74%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs good hard okay protoss terran zerg
## 1 0 0 0 0 1 0
## 2 1 0 0 0 0 0
## 3 0 0 0 0 1 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 1 0
## 6 1 0 0 0 0 0
## 7 2 3 2 1 1 1
tfidf_dtm <- weightTfIdf(dtm)
## Warning in weightTfIdf(dtm): empty document(s): 4
inspect(tfidf_dtm)
## <<DocumentTermMatrix (documents: 7, terms: 6)>>
## Non-/sparse entries: 11/31
## Sparsity : 74%
## Maximal term length: 7
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs good hard okay protoss terran zerg
## 1 0.0000000 0.0000000 0.000000 0.0000000 0.80735492 0.0000000
## 2 1.2223924 0.0000000 0.000000 0.0000000 0.00000000 0.0000000
## 3 0.0000000 0.0000000 0.000000 0.0000000 0.80735492 0.0000000
## 4 0.0000000 0.0000000 0.000000 0.0000000 0.00000000 0.0000000
## 5 0.0000000 0.0000000 0.000000 0.0000000 0.80735492 0.0000000
## 6 1.2223924 0.0000000 0.000000 0.0000000 0.00000000 0.0000000
## 7 0.2444785 0.8422065 0.561471 0.2807355 0.08073549 0.2807355
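The document-term matrix is the usual input for downstream analysis. Two common follow-up steps, sketched here with illustrative object names, are dropping very sparse terms and converting the matrix to a plain data frame:
dtm_dense <- removeSparseTerms(dtm, sparse = 0.8)  # drop terms with sparsity above 0.8 (i.e., keep terms appearing in roughly 20%+ of documents)
dtm_df <- as.data.frame(as.matrix(dtm))            # one row per document, one column per term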