ParagraphClassification

Library

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.4.4

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

## Warning: package 'stringr' was built under R version 3.4.4

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.4

library(Rstem)

Load a text paragraph

# Read the blog post from disk; each element of `words` is one line of the file.
words <- readLines(con = "cybersecurity blog.txt")

## Warning in readLines("cybersecurity blog.txt"): incomplete final line found
## on 'cybersecurity blog.txt'

# Preview the first six lines that were read.
head(words, n = 6)

## [1] "On Wednesday, Denver-based deepwatch announced it has secured $23 million in a Series A funding round. The cybersecurity provider will use the sizable Series A to fuel research and development, as well as to expand its market reach."                                                                                                                             
## [2] ""                                                                                                                                                                                                                                                                                                                                                                     
## [3] "“We’ve been doubling in size since starting in 2015, going from a startup to 130 employees and over 100 customers — many of them blue chip companies,” said Chief Financial Officer Ron Peele. “We expect that growth rate to continue.”Deepwatch uses a combination of software and in-house cybersecurity expertise to help companies protect their digital assets."
## [4] ""                                                                                                                                                                                                                                                                                                                                                                     
## [5] "“A lot of what we do is detection. We look through the data, in the form of logs, to see what activity is happening with our customers’ digital assets,” said Peele. “From that activity, we do machine learning that helps us identify patterns and threats. From what we learn from that, we can alert someone or take corresponding action.”"                      
## [6] ""

Split the text into a vector of words

# Break every line into space-separated tokens, then flatten the per-line
# list returned by str_split() into a single character vector.
vcsWords <- unlist(str_split(words, pattern = " "))
head(vcsWords, n = 10)

##  [1] "On"           "Wednesday,"   "Denver-based" "deepwatch"   
##  [5] "announced"    "it"           "has"          "secured"     
##  [9] "$23"          "million"

Clean all words

# Normalise the tokens so that different spellings of a word compare equal.

# convert to lower-case
vcsWords <- str_to_lower(vcsWords)

# remove punctuation
vcsWords <- str_replace_all(vcsWords, pattern = "[[:punct:]]", "")

# remove digits
vcsWords <- str_replace_all(vcsWords, pattern = "[[:digit:]]", "")

# remove any remaining whitespace (e.g. tabs; the split was on single spaces)
vcsWords <- str_replace_all(vcsWords, pattern = "[[:space:]]", "")

# remove leftover special characters. The hyphen is escaped so that it is a
# literal "-": unescaped, "&-_" is a range from "&" to "_" that also covers
# digits, upper-case letters and other symbols. "+" and "^" are listed
# explicitly because that accidental range used to strip them and the
# punctuation class above may not cover symbol characters.
vcsWords <- str_replace_all(vcsWords, pattern = "[~@#$%&\\-_=<>+^]", "")

# drop tokens that became empty after cleaning
vcsWords <- vcsWords[vcsWords != ""]

head(vcsWords, 100)

##   [1] "on"                "wednesday"         "denverbased"      
##   [4] "deepwatch"         "announced"         "it"               
##   [7] "has"               "secured"           "million"          
##  [10] "in"                "a"                 "series"           
##  [13] "a"                 "funding"           "round"            
##  [16] "the"               "cybersecurity"     "provider"         
##  [19] "will"              "use"               "the"              
##  [22] "sizable"           "series"            "a"                
##  [25] "to"                "fuel"              "research"         
##  [28] "and"               "development"       "as"               
##  [31] "well"              "as"                "to"               
##  [34] "expand"            "its"               "market"           
##  [37] "reach"             "weve"              "been"             
##  [40] "doubling"          "in"                "size"             
##  [43] "since"             "starting"          "in"               
##  [46] "going"             "from"              "a"                
##  [49] "startup"           "to"                "employees"        
##  [52] "and"               "over"              "customers"        
##  [55] "many"              "of"                "them"             
##  [58] "blue"              "chip"              "companies"        
##  [61] "said"              "chief"             "financial"        
##  [64] "officer"           "ron"               "peele"            
##  [67] "we"                "expect"            "that"             
##  [70] "growth"            "rate"              "to"               
##  [73] "continuedeepwatch" "uses"              "a"                
##  [76] "combination"       "of"                "software"         
##  [79] "and"               "inhouse"           "cybersecurity"    
##  [82] "expertise"         "to"                "help"             
##  [85] "companies"         "protect"           "their"            
##  [88] "digital"           "assets"            "a"                
##  [91] "lot"               "of"                "what"             
##  [94] "we"                "do"                "is"               
##  [97] "detection"         "we"                "look"             
## [100] "through"

make dataframe of words

# Put the cleaned tokens into a one-column data frame (column `words`),
# one row per token occurrence.
word.frame <- data.frame(
  words = vcsWords
)
head(word.frame, n = 10)

##          words
## 1           on
## 2    wednesday
## 3  denverbased
## 4    deepwatch
## 5    announced
## 6           it
## 7          has
## 8      secured
## 9      million
## 10          in

Calculate frequency of words

# Collapse to one row per distinct word with its occurrence count (`Freq`),
# ordered from most to least frequent.
word.frame <- word.frame %>%
  group_by(words) %>%
  summarise(Freq = n()) %>%
  arrange(desc(Freq))
head(word.frame, n = 6)

## # A tibble: 6 x 2
##   words  Freq
##   <fct> <int>
## 1 to       16
## 2 a        14
## 3 the      14
## 4 and      12
## 5 of       10
## 6 we        9

make a word cloud

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 3.4.4

## Loading required package: RColorBrewer

## Warning: package 'RColorBrewer' was built under R version 3.4.4

# Plot the (up to) 100 most frequent words. head() is used instead of indexing
# with 1:100 so that a text with fewer than 100 distinct words does not pass NA
# entries to wordcloud(). FALSE is spelled out because `F` can be reassigned.
with(
  head(word.frame, 100),
  wordcloud(words, freq = Freq, random.order = FALSE, max.words = 100,
            colors = brewer.pal(8, "Dark2"))
)

Keep only the important words

# Keep only informative words: drop very short tokens, stop words and
# profanity.

# The tokens in word.frame were lower-cased and stripped of punctuation during
# cleaning, so the lookup lists are normalised the same way. Without this,
# entries containing an apostrophe or a hyphen could never match a token.
# Duplicated entries are removed as well.
normalise_terms <- function(terms) {
  unique(str_replace_all(str_to_lower(terms), pattern = "[[:punct:]]", ""))
}

# keep words with more than 2 letters
word.frame <- filter(word.frame, str_length(words) > 2)

# remove stop words
cmnwords <- normalise_terms(
  c("all","also","and","any","are","but","can","cant","cry","due",
    "etc","few","for","get","had","has","hasnt","have","her","here",
    "hers","herself","him","himself","his","how","inc","into","its",
    "ltd","may","nor","not","now","off","once","one","only","onto",
    "our","ours","out","over","own","part","per","put","see","seem",
    "she","than","that","the","their","them","then","thence","there",
    "these","they","this","those","though","thus","too","top","upon",
    "very","via","was","were","what","when","which","while","who","whoever",
    "whom","whose","why","will","with","within","without","would","yet",
    "you","your","yours","the","from","a","aa","of","it's","it","is","was","were")
)

word.frame <- filter(word.frame, !(words %in% cmnwords))

# remove abusive words
abusive.words <- normalise_terms(
  c("arse","ass","asshole","bastard","bitch","bloody","bollocks","child-fucker","cunt","damn","fuck","goddamn","godsdamn","hell","motherfucker","shit","shitass","whore")
)

word.frame <- filter(word.frame, !(words %in% abusive.words))

head(word.frame, 10)

## # A tibble: 10 x 2
##    words          Freq
##    <fct>         <int>
##  1 said              6
##  2 cybersecurity     5
##  3 assets            4
##  4 companies         4
##  5 deepwatch         4
##  6 peele             4
##  7 customers         3
##  8 denver            3
##  9 funding           3
## 10 protect           3

Re-sort the remaining words by frequency

# Order the filtered table from most to least frequent word.
word.frame <- arrange(word.frame, desc(Freq))

head(word.frame, n = 10)

## # A tibble: 10 x 2
##    words          Freq
##    <fct>         <int>
##  1 said              6
##  2 cybersecurity     5
##  3 assets            4
##  4 companies         4
##  5 deepwatch         4
##  6 peele             4
##  7 customers         3
##  8 denver            3
##  9 funding           3
## 10 protect           3

Remove low-frequency words (keep only words that occur more than once)

# Discard words that occur only once, then inspect the least frequent survivors.
word.frame <- word.frame %>%
  filter(Freq > 1)
tail(word.frame, n = 6)

## # A tibble: 6 x 2
##   words       Freq
##   <fct>      <int>
## 1 operations     2
## 2 reach          2
## 3 recruit        2
## 4 series         2
## 5 want           2
## 6 where          2

word cloud of new dataframe

# Plot the filtered, high-frequency words (at most 100 of them).
# FALSE is spelled out because the shorthand `F` is an ordinary variable that
# can be reassigned.
wordcloud(word.frame$words,
          freq = word.frame$Freq,
          random.order = FALSE,
          max.words = 100,
          colors = brewer.pal(8, "Dark2"))

load the file with words and their categories

# Load the lookup table that maps words to categories; the preview below shows
# its columns `words` and `type`. Strings are kept as character vectors.
# TRUE/FALSE are spelled out because `T`/`F` are ordinary variables that can be
# reassigned.
catgryWords <- read.csv("word_data1.csv", header = TRUE, stringsAsFactors = FALSE)
head(catgryWords)

##       words    type
## 1 animation glamour
## 2  animator glamour
## 3      best glamour
## 4       boy glamour
## 5 bollywood glamour
## 6      boom glamour

Categorizing the paragraph

# Plain character vector of the words that survived the frequency filter.
high.freqwords <- as.character(word.frame[["words"]])

# Keep the rows of the category table whose word is one of those words.
matches <- catgryWords[is.element(catgryWords$words, high.freqwords), ]
matches

##              words               type
## 346  cybersecurity         technology
## 357        digital         technology
## 504         denver         technology
## 550      companies         technology
## 785       security         technology
## 1358       funding         technology
## 1402      critical         technology
## 2485          said International News

# Count the matched words per category, largest count first.
matches <- matches %>%
  group_by(type) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq))
matches

## # A tibble: 2 x 2
##   type                freq
##   <chr>              <int>
## 1 technology             7
## 2 International News     1

# Report the best-matching category. `matches` is sorted by descending
# frequency, so its first row (if any) is the winner. When no high-frequency
# word appears in the category table, say so instead of printing "NA".
if (nrow(matches) > 0) {
  print(paste("paragraph belongs to", matches$type[1], "category"))
} else {
  print("paragraph could not be matched to any category")
}

## [1] "paragraph belongs to technology category"

ParagraphClassification

Rohan Khollamkar

7 April 2019