library(rvest)
## Warning: package 'rvest' was built under R version 3.4.4
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.4.4
library(knitr)
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  1.4.2     v readr   1.1.1
## v tidyr   0.8.0     v purrr   0.2.4
## v tibble  1.4.2     v forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::annotate()     masks NLP::annotate()
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
library(SnowballC)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
## Loading required package: RColorBrewer
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(RCurl)
## Loading required package: bitops
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(bitops)
library(knitr)

Import the file to determine which terms are relevant to our goal:

# Load the term-frequency table from the project repository.
#
# The first line of the file is skipped at read time (skip = 1) instead of
# being read as data and dropped afterwards. Reading it as data forces every
# column -- including the frequency counts -- to be parsed as text/factor,
# which makes later numeric use (e.g. a frequency axis) behave as categorical.
# header = FALSE is kept so the columns retain the default names V1, V2, V3, ...
# NOTE(review): assumes the first line of the CSV is a header row, as implied
# by the original code discarding row 1 -- confirm against the file.
# Row names now start at 1 rather than 2.
TermFrequency <- read.csv(
  url("https://raw.githubusercontent.com/Shetura36/Data-607-Assignments/master/Project3/TermFrequency_adjusted_10.csv"),
  header = FALSE,
  sep = ",",
  skip = 1
)

We now need to determine the relevant terms that encapsulate all the terms we might come across and that can be grouped together.

# Keep the first 13 rows of the term table. These are the 13 terms selected as
# the most frequent in Data Science job postings on Indeed; they were chosen
# because they appear the most and can serve as a link point for most of the
# other top-100 terms.
# NOTE(review): "top" relies on TermFrequency already being ordered by
# descending frequency -- confirm upstream.
Top13 <- TermFrequency[seq_len(min(13, nrow(TermFrequency))), , drop = FALSE]

# Save the selection to the working directory (row names are included, as
# write.csv does by default).
write.csv(x = Top13, file = "Top13RelevantDataScienceTerms.csv")

Visualization of Relevant Term Frequency

# Bar chart of the selected terms: V2 on the x axis (labelled "Terms") and
# V3 as the bar height (labelled "Frequency").
dsterms <- ggplot(Top13, aes(x = V2, y = V3)) +
  geom_col() +
  labs(x = "Terms", y = "Frequency")

# Slant the x-axis labels so the term names do not overlap; evaluating the
# expression at top level prints the plot.
dsterms + theme(axis.text.x = element_text(angle = 60, hjust = 1))

This data is quite telling: it shows that even though hard technical skills are crucial, soft skills are just as important, as teamwork and communication rank quite high on the list of competencies that recruiting teams are searching for.