Introduction: What Is Text Mining?
Text mining extracts structured information, such as word counts and frequency patterns, from free-form text.
The steps involved in text mining are:
1. Read the raw text
2. Clean and normalize the words
3. Remove common (stop) words and other unwanted words
4. Count word frequencies
5. Visualize the results (word cloud, frequency plots)
Data Location
The transcript "Trump Florida Rally 2-18-17.txt" is the file read below (the variable names retain a UNPrf prefix from an earlier "un-profile.txt" example).
Data Description
"Trump Florida Rally 2-18-17.txt" is a free-form text file: paragraphs of transcript text separated by blank lines.
Setup
Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
## Loading required package: RColorBrewer
Functions
# frequency category helper function: bucket a frequency into a labelled band
# (labels are right-aligned to a fixed width so that alphabetical order
# matches numeric order when the categories are sorted later)
FreqCategory <- function(value) {
  strCategory <- ifelse(value <=    5, "     5",
                 ifelse(value <=   10, "    10",
                 ifelse(value <=   20, "    20",
                 ifelse(value <=   50, "    50",
                 ifelse(value <=  100, "   100",
                 ifelse(value <=  500, "   500",
                 ifelse(value <= 1000, " 1,000",
                                       ">1,000")))))))
  strCategory
}
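A quick sanity check of the helper (an illustrative call, not part of the original output):
# vectorized via ifelse(); should return "     5" "    50" ">1,000"
FreqCategory(c(3, 42, 2000))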
Read Data
vcsUNPrfLines <- readLines("C:/Users/Acer/Documents/WeSchool_R&BA/Trim 4/R/filesforrtextminingshinyapp_lect6/Trump Florida Rally 2-18-17.txt")
## Warning in readLines("C:/Users/Acer/Documents/WeSchool_R&BA/Trim 4/R/
## filesforrtextminingshinyapp_lect6/Trump Florida Rally 2-18-17.txt"):
## incomplete final line found on 'C:/Users/Acer/Documents/WeSchool_R&BA/Trim
## 4/R/filesforrtextminingshinyapp_lect6/Trump Florida Rally 2-18-17.txt'
head(vcsUNPrfLines)
## [1] "Thank you, everybody, thank you. I didn't know that Melania was going to be saying the Lord's Prayer, but I thought that was very beautiful, thank you, thank you."
## [2] ""
## [3] "It's so great to be here in Florida. My second home with you. This is a state I truly love. This is a state where we all had great victory together. Thank you."
## [4] ""
## [5] "It's now been a month since my inauguration. And I am here to tell you about our incredible progress in making America great again. I'm also here to tell you about our plans for the future and they're big and they're bold and It's what our country is all about, believe me."
## [6] ""
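The absolute Windows path above is machine-specific. A more portable read, sketched under the assumption that the transcript is kept in a ./data folder relative to the project (warn=FALSE also silences the incomplete-final-line warning):
# portable alternative (the relative path is an assumption, not the one used above)
vcsUNPrfLines <- readLines("./data/Trump Florida Rally 2-18-17.txt", warn=FALSE)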
Line Count
# line count = length of vector
intLineCount <- length(vcsUNPrfLines)
intLineCount
## [1] 132
Line Count: 132
Words Per Line
# split
lstUNPrfLines <- str_split(vcsUNPrfLines," ")
# words per line
vciUNPrfWperL <- unlist(lapply(lstUNPrfLines, length))
# print average words per line
mean(vciUNPrfWperL)
## [1] 40.20455
Average Words Per Line: 40.2045455
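The mean alone hides the spread (note that blank lines count as a single empty "word" after str_split). A base R one-liner for the full distribution, not part of the original output:
# five-number summary plus mean of words per line
summary(vciUNPrfWperL)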
Word Count
# unlist to get vector of words
vcsUNPrfWords <- unlist(lstUNPrfLines)
# total word count = length of vector
intWordCount <- length(vcsUNPrfWords)
# print
intWordCount
## [1] 5307
Word Count: 5307
Show Words
# head
head(vcsUNPrfWords,100)
## [1] "Thank" "you," "everybody," "thank"
## [5] "you." "I" "didn't" "know"
## [9] "that" "Melania" "was" "going"
## [13] "to" "be" "saying" "the"
## [17] "Lord's" "Prayer," "but" "I"
## [21] "thought" "that" "was" "very"
## [25] "beautiful," "thank" "you," "thank"
## [29] "you." "" "It's" "so"
## [33] "great" "to" "be" "here"
## [37] "in" "Florida." "My" "second"
## [41] "home" "with" "you." "This"
## [45] "is" "a" "state" "I"
## [49] "truly" "love." "This" "is"
## [53] "a" "state" "where" "we"
## [57] "all" "had" "great" "victory"
## [61] "together." "Thank" "you." ""
## [65] "It's" "now" "been" "a"
## [69] "month" "since" "my" "inauguration."
## [73] "And" "I" "am" "here"
## [77] "to" "tell" "you" "about"
## [81] "our" "incredible" "progress" "in"
## [85] "making" "America" "great" "again."
## [89] "I'm" "also" "here" "to"
## [93] "tell" "you" "about" "our"
## [97] "plans" "for" "the" "future"
Clean Words
# lower case
vcsUNPrfWords <- str_to_lower(vcsUNPrfWords)
# remove numbers
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:digit:]]", "")
# remove punctuation
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:punct:]]", "")
# remove white spaces
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[[:space:]]", "")
# remove special chars (hyphen placed last so "&-_" is not read as a range)
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="[~@#$%&_=<>-]", "")
# remove empty strings
vcsUNPrfWords <- vcsUNPrfWords[vcsUNPrfWords != ""]
# remove any stray literal $ (escaped; a bare "$" is the end-of-string anchor)
vcsUNPrfWords <- str_replace_all(vcsUNPrfWords, pattern="\\$", "")
# head
head(vcsUNPrfWords,100)
## [1] "thank" "you" "everybody" "thank"
## [5] "you" "i" "didnt" "know"
## [9] "that" "melania" "was" "going"
## [13] "to" "be" "saying" "the"
## [17] "lords" "prayer" "but" "i"
## [21] "thought" "that" "was" "very"
## [25] "beautiful" "thank" "you" "thank"
## [29] "you" "its" "so" "great"
## [33] "to" "be" "here" "in"
## [37] "florida" "my" "second" "home"
## [41] "with" "you" "this" "is"
## [45] "a" "state" "i" "truly"
## [49] "love" "this" "is" "a"
## [53] "state" "where" "we" "all"
## [57] "had" "great" "victory" "together"
## [61] "thank" "you" "its" "now"
## [65] "been" "a" "month" "since"
## [69] "my" "inauguration" "and" "i"
## [73] "am" "here" "to" "tell"
## [77] "you" "about" "our" "incredible"
## [81] "progress" "in" "making" "america"
## [85] "great" "again" "im" "also"
## [89] "here" "to" "tell" "you"
## [93] "about" "our" "plans" "for"
## [97] "the" "future" "and" "theyre"
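For reference, the step-by-step rules above can be collapsed into a single keep-only-letters pass. This is a sketch under the assumption that only the letters a-z should survive; for plain ASCII input it yields the same tokens as the pipeline above:
# one-pass alternative: lower-case, then strip everything except a-z
vcsClean <- str_to_lower(unlist(lstUNPrfLines))
vcsClean <- str_replace_all(vcsClean, pattern="[^a-z]", "")
vcsClean <- vcsClean[vcsClean != ""]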
Normal Word Data Frame
# make data frame of words (kept as character, not factor)
dfrUNPrfWords <- data.frame(Words=vcsUNPrfWords, stringsAsFactors=FALSE)
# show the first rows
head(dfrUNPrfWords,10)
## Words
## 1 thank
## 2 you
## 3 everybody
## 4 thank
## 5 you
## 6 i
## 7 didnt
## 8 know
## 9 that
## 10 melania
Normal Word Count
# summarise data: frequency of each word, most frequent first
dfrUNPrfFreq <- dfrUNPrfWords %>%
  group_by(Words) %>%
  summarise(Freq=n()) %>%
  arrange(desc(Freq))
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 the 224
## 2 to 187
## 3 and 186
## 4 of 106
## 5 a 90
## 6 you 90
Normal Word Cloud
# wordcloud normal word count
wordcloud(dfrUNPrfFreq$Words[1:100], dfrUNPrfFreq$Freq[1:100], random.order=F, max.words=100, colors=brewer.pal(8, "Dark2"))
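wordcloud() positions words randomly even with random.order=F, so each run draws a different layout. Seeding the RNG first makes the cloud reproducible (the seed value is arbitrary):
# fix the RNG state for a reproducible layout
set.seed(1234)
wordcloud(dfrUNPrfFreq$Words[1:100], dfrUNPrfFreq$Freq[1:100], random.order=F, max.words=100, colors=brewer.pal(8, "Dark2"))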
Significant Word Data Frame
# significant words only
# remove all words with len <= 2
dfrUNPrfWords <- filter(dfrUNPrfWords, str_length(Words)>2)
# remove all common words
# original found at http://en.wikipedia.org/wiki/Stop_words
vcsCmnWords <- c("all","also","and","any","are","but","can","cant","cry","due","etc","few","for","get","had","has","hasnt","have","her","here","hers","herself","him","himself","his","how","inc","into","its","ltd","may","nor","not","now","off","once","one","only","onto","our","ours","out","over","own","part","per","put","see","seem","she","than","that","the","their","them","then","thence","there","these","they","this","those","though","thus","too","top","upon","very","via","was","were","what","when","which","while","who","whoever","whom","whose","why","will","with","within","without","would","yet","you","your","yours")
dfrUNPrfWords <- filter(dfrUNPrfWords, !(Words %in% vcsCmnWords))
# remove all bad words ...
# original found at http://en.wiktionary.org/wiki/Category:English_swear_words
vcsBadWords <- c("arse","ass","asshole","bastard","bitch","bloody","bollocks","child-fucker","cunt","damn","fuck","goddamn","godsdamn","hell","motherfucker","shit","shitass","whore")
dfrUNPrfWords <- filter(dfrUNPrfWords, !(Words %in% vcsBadWords))
# show
head(dfrUNPrfWords)
## Words
## 1 thank
## 2 everybody
## 3 thank
## 4 didnt
## 5 know
## 6 melania
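A hand-rolled stop-word list is easy to tweak but also easy to leave incomplete. As an alternative sketch, the curated lexicons bundled with the tidytext package could be used instead; tidytext is not loaded elsewhere in this document, and a different list would change the counts below:
# alternative: filter against tidytext's stop_words lexicon
library(tidytext)
dfrUNPrfWords <- filter(dfrUNPrfWords, !(Words %in% stop_words$word))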
Significant Word Count
# summarise data: frequency of each significant word, most frequent first
dfrUNPrfFreq <- dfrUNPrfWords %>%
  group_by(Words) %>%
  summarise(Freq=n()) %>%
  arrange(desc(Freq))
head(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 going 61
## 2 want 37
## 3 people 35
## 4 great 33
## 5 country 30
## 6 theyre 29
Significant Word Count Tail
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 yes 1
## 2 yesterday 1
## 3 yesterdays 1
## 4 youll 1
## 5 yourselves 1
## 6 youth 1
Remove Sparse Words
# remove sparse words ... words with Freq <= 5
dfrUNPrfFreq <- filter(dfrUNPrfFreq, Freq>5)
tail(dfrUNPrfFreq)
## # A tibble: 6 x 2
## Words Freq
## <chr> <int>
## 1 tax 6
## 2 television 6
## 3 women 6
## 4 working 6
## 5 years 6
## 6 youve 6
Word Count Final
# final word count = number of distinct words remaining after filtering
intWordCountFinal <- length(dfrUNPrfFreq$Words)
# print
intWordCountFinal
## [1] 92
Frequency Category
# add frequency category column
dfrUNPrfFreq <- mutate(dfrUNPrfFreq, Fcat=FreqCategory(Freq))
# new data frame for frequency of the categorized frequencies
dfrUNPrfFocf <- dfrUNPrfFreq %>% group_by(Fcat) %>% summarise(Rfrq=n())
# freeze the (already sorted) categories as ordered factor levels
dfrUNPrfFocf$Fcat <- factor(dfrUNPrfFocf$Fcat, levels=dfrUNPrfFocf$Fcat, ordered=TRUE)
# head
head(dfrUNPrfFocf,10)
## # A tibble: 4 x 2
##   Fcat      Rfrq
##   <ord>    <int>
## 1 "    10"    57
## 2 "    20"    26
## 3 "    50"     8
## 4 "   100"     1
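The nested ifelse() ladder in FreqCategory can also be written with base R's cut(), which makes the break points explicit. A sketch assuming the same padded labels (right-closed intervals match the value <= n tests):
# equivalent bucketing with cut()
FreqCategory2 <- function(value) {
  as.character(cut(value,
                   breaks=c(-Inf, 5, 10, 20, 50, 100, 500, 1000, Inf),
                   labels=c("     5","    10","    20","    50",
                            "   100","   500"," 1,000",">1,000")))
}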
Word Cloud
# wordcloud
wordcloud(dfrUNPrfFreq$Words[1:50], dfrUNPrfFreq$Freq[1:50], random.order=F, max.words=100, colors=brewer.pal(8, "Dark2"))
Word Plot
# word plot: top 30 words by frequency
ggplot(slice(dfrUNPrfFreq,1:30), aes(x=reorder(Words,-Freq), y=Freq)) +
  geom_bar(stat="identity", fill=rainbow(30)) +
  ylab("Frequency") +
  xlab("Words") +
  ggtitle("Word Frequency - Top 30 Words") +
  theme(plot.title=element_text(size=rel(1.5), colour="blue")) +
  coord_flip()
Frequency Plot
ggplot(dfrUNPrfFocf, aes(Fcat,Rfrq)) +
  geom_bar(stat="identity", width=0.8, fill=rainbow(length(dfrUNPrfFocf$Fcat))) +
  xlab("Words With Frequency Up To") + ylab("Frequency") +
  theme(axis.text.x=element_text(angle=60, hjust=1, vjust=1),
        axis.text.y=element_text(angle=60, hjust=1, vjust=1),
        plot.title=element_text(size=rel(1.5), colour="blue")) +
  ggtitle("Frequency Of Word Count")
Word Length
# word lengths (in characters) of the remaining significant words
dfrUNPrfChrs <- data.frame(Chars=nchar(dfrUNPrfFreq$Words))
ggplot(dfrUNPrfChrs, aes(x=Chars)) +
  geom_histogram(binwidth=1, fill='blue') +
  geom_vline(xintercept=mean(nchar(dfrUNPrfFreq$Words)), color='black', size=1.5, alpha=.5) +
  xlab("Word Length (Chars)") + ylab("Number Of Words (Frequency)")
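The black vertical line marks the mean word length; computing it directly gives the number behind the line (base R, not in the original output):
# mean length in characters of the remaining significant words
mean(nchar(dfrUNPrfFreq$Words))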
Wind Up
print("Wind Up")
## [1] "Wind Up"