Introduction: What Is Text Mining?
Text mining extracts structured information, such as word counts and frequency patterns, from free-form text.
The steps involved in text mining are:
1. Read the raw text
2. Clean and normalize the words
3. Remove common (stop) words and other unwanted words
4. Count word frequencies
5. Visualize the results (word cloud, frequency plots)
Data Location
The transcript "Trump Florida Rally 2-18-17.txt" is the file read below (the variable names retain a UNPrf prefix from an earlier "un-profile.txt" example).
Data Description
"Trump Florida Rally 2-18-17.txt" is a free-form text file: paragraphs of transcript text separated by blank lines.
Setup
Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
## Loading required package: RColorBrewer
Functions
# frequency category helper function: bucket a frequency into a labelled band
# (labels are right-aligned to a fixed width so that alphabetical order
# matches numeric order when the categories are sorted later)
FreqCategory <- function(value) {
  strCategory <- ifelse(value <=    5, "     5",
                 ifelse(value <=   10, "    10",
                 ifelse(value <=   20, "    20",
                 ifelse(value <=   50, "    50",
                 ifelse(value <=  100, "   100",
                 ifelse(value <=  500, "   500",
                 ifelse(value <= 1000, " 1,000",
                                       ">1,000")))))))
  strCategory
}
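A quick sanity check of the helper (an illustrative call, not part of the original output):
# vectorized via ifelse(); should return "     5" "    50" ">1,000"
FreqCategory(c(3, 42, 2000))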
Read Data
vcsUNPrfLines <- readLines("C:/Users/Acer/Documents/WeSchool_R&BA/Trim 4/R/filesforrtextminingshinyapp_lect6/Trump Florida Rally 2-18-17.txt")
## Warning in readLines("C:/Users/Acer/Documents/WeSchool_R&BA/Trim 4/R/
## filesforrtextminingshinyapp_lect6/Trump Florida Rally 2-18-17.txt"):
## incomplete final line found on 'C:/Users/Acer/Documents/WeSchool_R&BA/Trim
## 4/R/filesforrtextminingshinyapp_lect6/Trump Florida Rally 2-18-17.txt'
head(vcsUNPrfLines)
## [1] "Thank you, everybody, thank you. I didn't know that Melania was going to be saying the Lord's Prayer, but I thought that was very beautiful, thank you, thank you."
## [2] ""
## [3] "It's so great to be here in Florida. My second home with you. This is a state I truly love. This is a state where we all had great victory together. Thank you."
## [4] ""
## [5] "It's now been a month since my inauguration. And I am here to tell you about our incredible progress in making America great again. I'm also here to tell you about our plans for the future and they're big and they're bold and It's what our country is all about, believe me."
## [6] ""
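The absolute Windows path above is machine-specific. A more portable read, sketched under the assumption that the transcript is kept in a ./data folder relative to the project (warn=FALSE also silences the incomplete-final-line warning):
# portable alternative (the relative path is an assumption, not the one used above)
vcsUNPrfLines <- readLines("./data/Trump Florida Rally 2-18-17.txt", warn=FALSE)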
Line Count
# line count = length of vector
intLineCount <- length(vcsUNPrfLines)
intLineCount
## [1] 132
Line Count: 132
Words Per Line
# split
lstUNPrfLines <- str_split(vcsUNPrfLines," ")
# words per line
vciUNPrfWperL <- unlist(lapply(lstUNPrfLines, length))
# print average words per line
mean(vciUNPrfWperL)
## [1] 40.20455
Average Words Per Line: 40.2045455
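The mean alone hides the spread (note that blank lines count as a single empty "word" after str_split). A base R one-liner for the full distribution, not part of the original output:
# five-number summary plus mean of words per line
summary(vciUNPrfWperL)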
Word Count
# unlist to get vector of words
vcsUNPrfWords <- unlist(lstUNPrfLines)
# total word count = length of vector
intWordCount <- length(vcsUNPrfWords)
# print
intWordCount
## [1] 5307
Word Count: 5307
Show Words
# head
head(vcsUNPrfWords,100)
## [1] "Thank" "you," "everybody," "thank"
## [5] "you." "I" "didn't" "know"
## [9] "that" "Melania" "was" "going"
## [13] "to" "be" "saying" "the"
## [17] "Lord's" "Prayer," "but" "I"
## [21] "thought" "that" "was" "very"
## [25] "beautiful," "thank" "you," "thank"
## [29] "you." "" "It's" "so"
## [33] "great" "to" "be" "here"
## [37] "in" "Florida." "My" "second"
## [41] "home" "with" "you." "This"
## [45] "is" "a" "state" "I"
## [49] "truly" "love." "This" "is"
## [53] "a" "state" "where" "we"
## [57] "all" "had" "great" "victory"
## [61] "together." "Thank" "you." ""
## [65] "It's" "now" "been" "a"
## [69] "month" "since" "my" "inauguration."
## [73] "And" "I" "am" "here"
## [77] "to" "tell" "you" "about"
## [81] "our" "incredible" "progress" "in"
## [85] "making" "America" "great" "again."
## [89] "I'm" "also" "here" "to"
## [93] "tell" "you" "about" "our"
## [97] "plans" "for" "the" "future"
Clean Words
# lower case
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# remove numbers
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:digit:]]", "")
# remove punctuation
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:punct:]]", "")
# remove white spaces
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:space:]]", "")
# remove special chars (hyphen placed last so "&-_" is not read as a range)
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[~@#$%&_=<>-]", "")
# remove empty strings
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# remove any stray literal $ (escaped; a bare "$" is the end-of-string anchor)
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="\\$", "")
# head
head(vcsUNPrfWords,100)
## [1] "thank" "you" "everybody" "thank"
## [5] "you" "i" "didnt" "know"
## [9] "that" "melania" "was" "going"
## [13] "to" "be" "saying" "the"
## [17] "lords" "prayer" "but" "i"
## [21] "thought" "that" "was" "very"
## [25] "beautiful" "thank" "you" "thank"
## [29] "you" "its" "so" "great"
## [33] "to" "be" "here" "in"
## [37] "florida" "my" "second" "home"
## [41] "with" "you" "this" "is"
## [45] "a" "state" "i" "truly"
## [49] "love" "this" "is" "a"
## [53] "state" "where" "we" "all"
## [57] "had" "great" "victory" "together"
## [61] "thank" "you" "its" "now"
## [65] "been" "a" "month" "since"
## [69] "my" "inauguration" "and" "i"
## [73] "am" "here" "to" "tell"
## [77] "you" "about" "our" "incredible"
## [81] "progress" "in" "making" "america"
## [85] "great" "again" "im" "also"
## [89] "here" "to" "tell" "you"
## [93] "about" "our" "plans" "for"
## [97] "the" "future" "and" "theyre"
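For reference, the step-by-step rules above can be collapsed into a single keep-only-letters pass. This is a sketch under the assumption that only the letters a-z should survive; for plain ASCII input it yields the same tokens as the pipeline above:
# one-pass alternative: lower-case, then strip everything except a-z
vcsClean <- str_to_lower(unlist(lstUNPrfLines))
vcsClean <- str_replace_all(vcsClean, pattern="[^a-z]", "")
vcsClean <- vcsClean[vcsClean != ""]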
Normal Word Data Frame
# make data frame of words (kept as character, not factor)
dfrUNPrfWords <- data.frame(Words=vcsUNPrfWords, stringsAsFactors=FALSE)
# show the first rows
head(dfrUNPrfWords,10)
## Words
## 1 thank
## 2 you
## 3 everybody
## 4 thank
## 5 you
## 6 i
## 7 didnt
## 8 know
## 9 that
## 10 melania
Normal Word Count
# summarise data: frequency of each word, most frequent first
dfrUNPrfFreq <- dfrUNPrfWords %>%
  group_by(Words) %>%
  summarise(Freq=n()) %>%
  arrange(desc(Freq))
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 the 224
## 2 to 187
## 3 and 186
## 4 of 106
## 5 a 90
## 6 you 90
Normal Word Cloud
# wordcloud normal word count
wordcloud(dfrUNPrfFreq$Words[1:100], dfrUNPrfFreq$Freq[1:100], random.order=F, max.words=100, colors=brewer.pal(8, "Dark2"))
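wordcloud() positions words randomly even with random.order=F, so each run draws a different layout. Seeding the RNG first makes the cloud reproducible (the seed value is arbitrary):
# fix the RNG state for a reproducible layout
set.seed(1234)
wordcloud(dfrUNPrfFreq$Words[1:100], dfrUNPrfFreq$Freq[1:100], random.order=F, max.words=100, colors=brewer.pal(8, "Dark2"))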
Significant Word Data Frame
# significant words only
# remove all words with len <= 2
dfrUNPrfWords <- filter(dfrUNPrfWords, str_length(Words)>2)
# remove all common words
# original found at http://en.wikipedia.org/wiki/Stop_words
vcsCmnWords <- c("all","also","and","any","are","but","can","cant","cry","due","etc","few","for","get","had","has","hasnt","have","her","here","hers","herself","him","himself","his","how","inc","into","its","ltd","may","nor","not","now","off","once","one","only","onto","our","ours","out","over","own","part","per","put","see","seem","she","than","that","the","their","them","then","thence","there","these","they","this","those","though","thus","too","top","upon","very","via","was","were","what","when","which","while","who","whoever","whom","whose","why","will","with","within","without","would","yet","you","your","yours")
dfrUNPrfWords <- filter(dfrUNPrfWords, !(Words %in% vcsCmnWords))
# remove all bad words ...
# original found at http://en.wiktionary.org/wiki/Category:English_swear_words
vcsBadWords <- c("arse","ass","asshole","bastard","bitch","bloody","bollocks","child-fucker","cunt","damn","fuck","goddamn","godsdamn","hell","motherfucker","shit","shitass","whore")
dfrUNPrfWords <- filter(dfrUNPrfWords, !(Words %in% vcsBadWords))
# show
head(dfrUNPrfWords)
## Words
## 1 thank
## 2 everybody
## 3 thank
## 4 didnt
## 5 know
## 6 melania
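A hand-rolled stop-word list is easy to tweak but also easy to leave incomplete. As an alternative sketch, the curated lexicons bundled with the tidytext package could be used instead; tidytext is not loaded elsewhere in this document, and a different list would change the counts below:
# alternative: filter against tidytext's stop_words lexicon
library(tidytext)
dfrUNPrfWords <- filter(dfrUNPrfWords, !(Words %in% stop_words$word))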
Significant Word Count
# summarise data: frequency of each significant word, most frequent first
dfrUNPrfFreq <- dfrUNPrfWords %>%
  group_by(Words) %>%
  summarise(Freq=n()) %>%
  arrange(desc(Freq))
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 going 61
## 2 want 37
## 3 people 35
## 4 great 33
## 5 country 30
## 6 theyre 29
Significant Word Count Tail
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 yes 1
## 2 yesterday 1
## 3 yesterdays 1
## 4 youll 1
## 5 yourselves 1
## 6 youth 1
Remove Sparse Words
# remove sparse words ... words with Freq <= 5
dfrUNPrfFreq <- filter(dfrUNPrfFreq, Freq>5)
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 tax 6
## 2 television 6
## 3 women 6
## 4 working 6
## 5 years 6
## 6 youve 6
Word Count Final
# final word count = number of distinct words remaining after filtering
intWordCountFinal <- length(dfrUNPrfFreq$Words)
# print
intWordCountFinal
## [1] 92
Frequency Category
# add frequency category column
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat=FreqCategory(Freq))
# new data frame for frequency of the categorized frequencies
dfrUNPrfFocf <- dfrUNPrfFreq %>% group_by(Fcat) %>% summarise(Rfrq=n())
# freeze the (already sorted) categories as ordered factor levels
dfrUNPrfFocf$Fcat <- factor(dfrUNPrfFocf$Fcat, levels=dfrUNPrfFocf$Fcat, ordered=TRUE)
# head
head(dfrUNPrfFocf,10)
## # A tibble: 4 x 2
##   Fcat      Rfrq
##   <ord>    <int>
## 1 "    10"    57
## 2 "    20"    26
## 3 "    50"     8
## 4 "   100"     1
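The nested ifelse() ladder in FreqCategory can also be written with base R's cut(), which makes the break points explicit. A sketch assuming the same padded labels (right-closed intervals match the value <= n tests):
# equivalent bucketing with cut()
FreqCategory2 <- function(value) {
  as.character(cut(value,
                   breaks=c(-Inf, 5, 10, 20, 50, 100, 500, 1000, Inf),
                   labels=c("     5","    10","    20","    50",
                            "   100","   500"," 1,000",">1,000")))
}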
Word Cloud
# wordcloud
wordcloud(dfrUNPrfFreq$Words[1:50], dfrUNPrfFreq$Freq[1:50], random.order=F, max.words=100, colors=brewer.pal(8, "Dark2"))
Word Plot
# word plot: top 30 words by frequency
ggplot(slice(dfrUNPrfFreq,1:30), aes(x=reorder(Words,-Freq), y=Freq)) +
  geom_bar(stat="identity", fill=rainbow(30)) +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Word Frequency - Top 30 Words") +
  theme(plot.title=element_text(size=rel(1.5), colour="blue")) +
  coord_flip()
Frequency Plot
ggplot(dfrUNPrfFocf, aes(Fcat,Rfrq)) +
  geom_bar(stat="identity", width=0.8, fill=rainbow(length(dfrUNPrfFocf$Fcat))) +
  xlab("Words With Frequency Up To") + ylab("Frequency") +
  theme(axis.text.x=element_text(angle=60, hjust=1, vjust=1),
        axis.text.y=element_text(angle=60, hjust=1, vjust=1),
        plot.title=element_text(size=rel(1.5), colour="blue")) +
  ggtitle("Frequency Of Word Count")
Word Length
# word lengths (in characters) of the remaining significant words
dfrUNPrfChrs <- data.frame(Chars=nchar(dfrUNPrfFreq$Words))
ggplot(dfrUNPrfChrs, aes(x=Chars)) +
  geom_histogram(binwidth=1, fill='blue') +
  geom_vline(xintercept=mean(nchar(dfrUNPrfFreq$Words)), color='black', size=1.5, alpha=.5) +
  xlab("Word Length (Chars)") + ylab("Number Of Words (Frequency)")
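The black vertical line marks the mean word length; computing it directly gives the number behind the line (base R, not in the original output):
# mean length in characters of the remaining significant words
mean(nchar(dfrUNPrfFreq$Words))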
Wind Up
print("Wind Up")
## [1] "Wind Up"