Context

Acquiring Data

##Getting Data

## Load the job postings that were scraped from Indeed and saved as a CSV.
## The file is picked interactively through the system file chooser.
df_BigData <- read.csv(file = file.choose(), header = TRUE)

## Put the data frame's columns on the search path so they can be referenced
## by bare name.
## NOTE(review): this masks stringr::words with the `words` column; prefer
## explicit df_BigData$words in new code.
attach(what = df_BigData)

## The following object is masked from package:stringr:
## 
##     words

## Keep only the rows whose `words` value is exactly one of the skill keywords
## listed below, and only the columns needed for the later word clouds.
##
## The column is read explicitly as df_BigData$words instead of relying on the
## attach()ed bare name `words`, which collides with stringr::words.
## A single %in% against the keyword vector replaces the chain of OR-ed tests;
## %in% never yields NA, so no which() wrapper is needed.
skill_keywords <- c(
  "big data", "aws", "machine learning", ".net", "java", "sqlserver",
  "hadoop", "datawarehouse", "python", "oracle", "sas", "r", "c/c++",
  "data science", "dynamodb", "sql server", "experimental design", "mysql",
  "agile", "data mining", "shell script", "enterprise software",
  "embedded software"
)

Filter_Worddata <- df_BigData[
  df_BigData$words %in% skill_keywords,
  c("doc_id", "job_title", "state", "words")
]

## Number of rows that matched a keyword.
nrow(Filter_Worddata)

## [1] 5184

Word Cloud

## Now we will use a data mining operation and build our word cloud for better visualization of the data which we have extracted and filtered from our big data repository.


## Build a corpus from the matched keywords (one document per filtered row),
## then print its summary.
keyWord_Cloud <- Filter_Worddata$words %>%
  VectorSource() %>%
  Corpus()
keyWord_Cloud

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5184

## Clean the keyword corpus: replace punctuation with spaces, then lower-case.
##
## The former tm_map(PlainTextDocument) step is dropped on purpose: the corpus
## prints as a SimpleCorpus, and mapping PlainTextDocument over one makes tm
## warn "transformation drops documents" instead of keeping one document per
## row. Plain string functions are wrapped in content_transformer() so they
## are applied to the document content and the corpus structure is preserved.
keyWord_Cloud <- keyWord_Cloud %>%
  tm_map(content_transformer(str_replace_all),
         pattern = "[[:punct:]]", replacement = " ") %>%
  tm_map(content_transformer(tolower))


## Build a corpus from the job titles of the filtered rows (one document per
## row), then print its summary.
jobTitle_Cloud <- Filter_Worddata$job_title %>%
  VectorSource() %>%
  Corpus()
jobTitle_Cloud

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5184

## Clean the job-title corpus: replace punctuation with spaces, then
## lower-case.
##
## The former tm_map(PlainTextDocument) step is dropped on purpose: the corpus
## prints as a SimpleCorpus, and mapping PlainTextDocument over one makes tm
## warn "transformation drops documents" instead of keeping one document per
## row. Plain string functions are wrapped in content_transformer() so they
## are applied to the document content and the corpus structure is preserved.
jobTitle_Cloud <- jobTitle_Cloud %>%
  tm_map(content_transformer(str_replace_all),
         pattern = "[[:punct:]]", replacement = " ") %>%
  tm_map(content_transformer(tolower))


## Plotting the results

## Draw the keyword cloud: at most 100 words, placed in decreasing order of
## frequency, each coloured at random from the two-colour palette.
wordcloud(
  keyWord_Cloud,
  max.words = 100,
  random.order = FALSE,
  random.color = TRUE,
  colors = c("orange", "black")
)

## Draw the job-title cloud: at most 100 words, placed in decreasing order of
## frequency, each coloured at random from the three-colour palette.
wordcloud(
  jobTitle_Cloud,
  max.words = 100,
  random.order = FALSE,
  random.color = TRUE,
  colors = c("blue", "green", "red")
)