library(rvest)
## Loading required package: xml2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(httr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v readr   1.3.1
## v tibble  2.1.3     v purrr   0.3.2
## v tidyr   0.8.3     v stringr 1.4.0
## v ggplot2 3.2.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:httr':
## 
##     config
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyr)

Data Science Skills

Data Science General Skills

# Download the text that is mined for skill keywords below.
# The raw.githubusercontent.com address returns the plain file contents; the
# github.com/.../blob/... address returns an HTML page, whose markup and
# navigation text would otherwise be tokenised and counted as well.
text_string <- read_file("https://raw.githubusercontent.com/AlainKuiete/DATA607/master/datascienceskills")
# Skill keywords to look for among the tokens of the downloaded text.
# The text is lower-cased and stripped of punctuation before matching, so every
# entry must be lower-case. Empty strings are deliberately absent: they would
# match the empty tokens produced when the text is split on whitespace.
search_word <- c(
  "python", "machine", "learning", "analytic", "team", "teams", "r", "sql",
  "communication", "analytical", "java", "scala", "hadoop", "spark",
  "computer", "science", "engineering", "solving", "tableau", "software",
  "statistical", "modeling", "quantitative", "customer", "statistics",
  "management", "models", "predictive", "problems"
)


# Count how often each skill keyword occurs in the downloaded text.
# Steps: lower-case the text, strip punctuation, split on whitespace into one
# token per row, count each distinct token, keep only the keywords of interest
# and express each count as a share of all keyword occurrences.
Results <- tibble(text = text_string) %>%
  mutate(
    text = tolower(text),
    text = str_remove_all(text, "[[:punct:]]"),
    tokens = str_split(text, "\\s+")
  ) %>%
  # Name the list-column explicitly: tidyr >= 1.0 warns when `unnest()` is
  # called without saying which column to expand.
  unnest(tokens) %>%
  # Leading or trailing whitespace yields empty tokens; they are not words.
  filter(nzchar(tokens)) %>%
  count(tokens) %>%
  filter(tokens %in% search_word) %>%
  mutate(freq = n / sum(n)) %>%
  arrange(desc(n))

The table with words and frequencies.

# Turn the tokens into a factor ordered by descending count so that the bars
# are drawn from the most to the least frequent keyword, then show the table.
Results <- Results %>%
  mutate(tokens = reorder(tokens, -n))
Results

# Polar bar chart of the relative frequency of each keyword.
ggplot2::ggplot(Results, aes(x = tokens, y = freq, fill = tokens)) +
  geom_bar(stat = "identity", width = 0.75, colour = "black", size = 1) +
  coord_polar(theta = "x") +
  xlab("") +
  ylab("") +
  ggtitle("Word Frequency") +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)

# Interactive version of the word-frequency bar chart.
# The ggplot object is converted with ggplotly(); the plotly logo and the
# "edit chart" link are switched off in the widget configuration.
plotly::ggplotly(
  ggplot2::ggplot(Results, aes(x = tokens, y = freq, fill = tokens)) +
    geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
    xlab("") +
    ylab("") +
    ggtitle("Word Frequency") +
    labs(x = NULL, y = NULL) +
    theme(
      legend.position = "none",
      plot.subtitle = element_text(vjust = 1),
      plot.caption = element_text(vjust = 1),
      axis.text.x = element_text(angle = 90),
      panel.background = element_rect(fill = "honeydew1"),
      plot.background = element_rect(fill = "antiquewhite")
    )
) %>%
  # `config` exists in both httr and plotly (see the masking message when the
  # packages are attached), so the plotly version is requested explicitly.
  # FALSE is spelled out because `F` is an ordinary, reassignable variable.
  plotly::config(displaylogo = FALSE, showLink = FALSE)
Downloading data
# Fetch the general-skills table (a Keyword column plus one count column per
# job site) and inspect the structure of what was read.
dsg <- read.csv(
  file = "https://raw.githubusercontent.com/AlainKuiete/DATA607/master/ds_general_skills_revised.csv"
)
str(dsg)
## 'data.frame':    30 obs. of  5 variables:
##  $ Keyword    : Factor w/ 27 levels "","\"data scientist\" \"[keyword]\"",..: 14 8 25 11 10 15 27 7 13 20 ...
##  $ LinkedIn   : Factor w/ 23 levels "","1,212","1,310",..: 16 15 12 11 8 6 5 4 3 2 ...
##  $ Indeed     : Factor w/ 23 levels "","1,125","1,413",..: 11 12 8 7 6 4 3 2 23 22 ...
##  $ SimplyHired: Factor w/ 23 levels "","1,153","1,497",..: 10 11 9 8 4 3 2 23 22 21 ...
##  $ Monster    : Factor w/ 23 levels "","1,207","1,815",..: 7 10 8 4 6 3 2 22 18 17 ...

Subsetting the Data Science general skills

# Keep only the first 15 rows of the general-skills table.
dskg <- dsg[seq_len(15), ]

Renaming the columns and converting the counts to numeric

# Give the columns short, consistent names, then turn the job-site counts into
# numbers: the values carry thousands separators (e.g. "1,212"), which are
# removed before the conversion.
colnames(dskg) <- c("D.Skills", "LinkedIn", "Indeed", "SimplyHired", "Monster")
dskg[c("LinkedIn", "Indeed", "SimplyHired", "Monster")] <- lapply(
  dskg[c("LinkedIn", "Indeed", "SimplyHired", "Monster")],
  function(counts) as.numeric(gsub(",", "", counts))
)

Computation

# Total number of listings per job site, ignoring missing counts.
s.dskg <- dskg %>%
  summarise(
    sL = sum(LinkedIn, na.rm = TRUE),
    sI = sum(Indeed, na.rm = TRUE),
    sS = sum(SimplyHired, na.rm = TRUE),
    sM = sum(Monster, na.rm = TRUE)
  )

# Grand total across the four job sites.
tsg <- sum(s.dskg)

# Share of the grand total contributed by each skill (all four sites combined).
g.skills <- transmute(
  dskg,
  D.Skills,
  pct = (LinkedIn + Indeed + SimplyHired + Monster) / tsg
)
g.skills

Visualisation

Plotting with horizontal column
library(ggplot2)

# Horizontal bar chart of the general skills whose share exceeds 0.1 %,
# ordered by share.
g.skills %>%
  filter(pct > 0.001) %>%
  mutate(phrases = reorder(D.Skills, pct)) %>%
  ggplot(mapping = aes(x = phrases, y = pct)) +
  geom_col(mapping = aes(fill = phrases), show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "General Skills for Data Science",
    x = NULL,
    y = "Overall percentage"
  )

Analysis, Machine Learning, Statistics, Computer Science and Communication are the general skills required of Data Scientists.

Data Science Software Skills

Downloading data
# Fetch the software-skills job-listing table.
dss <- read.csv(
  file = "https://raw.githubusercontent.com/AlainKuiete/DATA607/master/ds_job_listing_software.csv"
)

Subsetting the Data Science software skills

# Keep the first 30 rows and the first 5 columns of the software-skills table,
# then inspect the structure of the result.
dsk <- dss[seq_len(30), seq_len(5)]
str(dsk)
## 'data.frame':    30 obs. of  5 variables:
##  $ Keyword    : Factor w/ 42 levels "","\"data scientist\" \"[keyword]\"",..: 31 33 39 37 15 18 34 40 17 35 ...
##  $ LinkedIn   : Factor w/ 40 levels "","1,024","1,040",..: 33 27 17 9 8 7 6 5 4 3 ...
##  $ Indeed     : Factor w/ 37 levels "","1,012","1,134",..: 23 22 11 5 6 4 3 2 37 34 ...
##  $ SimplyHired: Factor w/ 39 levels "","1,059","1,164",..: 16 15 14 4 3 2 39 38 37 35 ...
##  $ Monster    : Factor w/ 40 levels "","1,002","1,062",..: 18 17 5 3 4 2 39 36 35 33 ...

Renaming the columns and converting the counts to numeric

# Give the columns short, consistent names, then turn the job-site counts into
# numbers: the values carry thousands separators (e.g. "1,024"), which are
# removed before the conversion.
colnames(dsk) <- c("D.Skills", "LinkedIn", "Indeed", "SimplyHired", "Monster")
dsk[c("LinkedIn", "Indeed", "SimplyHired", "Monster")] <- lapply(
  dsk[c("LinkedIn", "Indeed", "SimplyHired", "Monster")],
  function(counts) as.numeric(gsub(",", "", counts))
)

Computation

# Total number of listings per job site. Missing counts are ignored, as in the
# general-skills computation; without `na.rm = TRUE` a single NA would turn
# the grand total, and with it every percentage, into NA.
s.dsks <- summarise(
  dsk,
  sL = sum(LinkedIn, na.rm = TRUE),
  sI = sum(Indeed, na.rm = TRUE),
  sS = sum(SimplyHired, na.rm = TRUE),
  sM = sum(Monster, na.rm = TRUE)
)

# Grand total across the four job sites.
# NOTE(review): `ts` shadows stats::ts(); the name is kept because later code
# may refer to it.
ts <- sum(s.dsks)

# Share of the grand total contributed by each skill (all four sites combined).
skills <- dsk %>%
  transmute(D.Skills, pct = (LinkedIn + Indeed + SimplyHired + Monster) / ts)

# First ten skills in the table's own row order; head() never pads with NA
# rows if fewer than ten are available.
skill <- head(skills, 10)
skill

Visualisation

# Order the skills by descending share so the tallest bar is drawn first.
skill <- mutate(skill, D.Skills = reorder(D.Skills, -pct))

# Bar chart of the share of each of the ten software skills.
ggplot(skill, aes(x = D.Skills, y = pct)) +
  geom_col(aes(fill = D.Skills), position = "dodge") +
  xlab("Computer Skills") +
  ylab("Overall percentage") +
  # Title spelling corrected ("Sofware" -> "Software"); it is shown to readers.
  ggtitle("Top 10 Software Skills for Data Science")

Python and R are the most recommended software skills for data scientists.

Reference: The Most in Demand Skills for Data Scientists by Jeff Hale, Towards Data Science.
# Google job-search results URL for the query "data sciences jobs", kept as a
# reference link. It is not used by any code in the visible part of this file.
url1 <- "https://www.google.com/search?safe=active&rlz=1C1EJFC_enUS834US834&sxsrf=ACYBGNSOYZtHkxHTh41kYm0q3DH8TnMuHw:1572152661833&ei=VSW1Xa3EMsHk_Ab9_afoAw&q=data+sciences+jobs&oq=data+sciences+jobs&gs_l=psy-ab.3..0i10l10.41861.55760..56318...1.0..1.3382.6568.0j1j7-1j1j1......0....1..gws-wiz.....10..35i362i39.rCNbLJ7wjdA&uact=5&ibp=htl;jobs&sa=X&ved=2ahUKEwjMi9q01bvlAhWCm-AKHSUxD-wQp4wCMAB6BAgFEAE#fpstate=tldetail&htidocid=n1I1nIYsv9Heqo3hAAAAAA%3D%3D&htivrt=jobs"

counting words with R

# Link to the "counting words with R" article on codementor.io, kept as a
# reference. It is not used by any code in the visible part of this file.
url2 <- "https://www.codementor.io/jhwatts2010/counting-words-with-r-ds35hzgmj"