Data Science Skills
Data Science General Skills
# Read the skills corpus as plain text. The raw.githubusercontent.com URL is
# used because the github.com/.../blob/... address serves an HTML page rather
# than the file contents.
text_string <- read_file(
  "https://raw.githubusercontent.com/AlainKuiete/DATA607/master/datascienceskills"
)

# Keywords to count. Every entry is lowercase because the text is lowercased
# before matching, and no empty string is listed so that blank tokens produced
# by the whitespace split are never counted as a keyword.
search_word <- c(
  "python", "machine", "learning", "analytic", "team", "teams", "r", "sql",
  "communication", "analytical", "java", "scala", "hadoop", "spark",
  "computer", "science", "engineering", "solving", "tableau", "software",
  "statistical", "modeling", "quantitative", "customer", "statistics",
  "management", "models", "predictive", "problems"
)

# Lowercase the text, strip punctuation, split on whitespace, then count the
# tokens that are in `search_word`. `freq` is each keyword's share of the
# matched tokens; rows are sorted from most to least frequent.
Results <- tibble(text = text_string) %>%
  mutate(text = tolower(text)) %>%
  mutate(text = str_remove_all(text, "[[:punct:]]")) %>%
  mutate(tokens = str_split(text, "\\s+")) %>%
  unnest(tokens) %>%
  count(tokens) %>%
  filter(tokens %in% search_word) %>%
  mutate(freq = n / sum(n)) %>%
  arrange(desc(n))
The table with words and frequencies.
# Turn the keywords into a factor ordered from highest to lowest count so the
# plots draw them in that order, then print the table.
Results <- Results %>%
  mutate(tokens = reorder(tokens, -n))
Results

# Polar bar chart of keyword frequency, one coloured wedge per keyword.
ggplot2::ggplot(
  data = Results,
  mapping = aes(x = tokens, y = freq, fill = tokens)
) +
  geom_bar(stat = "identity", width = 0.75, colour = "black", size = 1) +
  coord_polar(theta = "x") +
  xlab("") +
  ylab("") +
  ggtitle("Word Frequency") +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL)

# Interactive (plotly) bar chart of keyword frequency.
# `config()` is called as `plotly::config()` so this chunk does not depend on
# plotly being attached (it is only referenced through `plotly::` here), and
# the logical options are spelled `FALSE` rather than the reassignable `F`.
plotly::ggplotly(
  ggplot2::ggplot(Results, aes(x = tokens, y = freq, fill = tokens)) +
    geom_bar(width = 0.75, stat = "identity", colour = "black", size = 1) +
    xlab("") + ylab("") + ggtitle("Word Frequency") +
    theme(legend.position = "none") +
    labs(x = NULL, y = NULL) +
    theme(
      plot.subtitle = element_text(vjust = 1),
      plot.caption = element_text(vjust = 1),
      axis.text.x = element_text(angle = 90)
    ) +
    theme(
      panel.background = element_rect(fill = "honeydew1"),
      plot.background = element_rect(fill = "antiquewhite")
    )
) %>%
  # Hide the plotly logo and the "edit chart" link in the mode bar.
  plotly::config(displaylogo = FALSE, showLink = FALSE)
Downloading data
# Load the general-skills listing counts and inspect the column types.
dsg <- read.csv(
  file = "https://raw.githubusercontent.com/AlainKuiete/DATA607/master/ds_general_skills_revised.csv"
)
str(object = dsg)
## 'data.frame': 30 obs. of 5 variables:
## $ Keyword : Factor w/ 27 levels "","\"data scientist\" \"[keyword]\"",..: 14 8 25 11 10 15 27 7 13 20 ...
## $ LinkedIn : Factor w/ 23 levels "","1,212","1,310",..: 16 15 12 11 8 6 5 4 3 2 ...
## $ Indeed : Factor w/ 23 levels "","1,125","1,413",..: 11 12 8 7 6 4 3 2 23 22 ...
## $ SimplyHired: Factor w/ 23 levels "","1,153","1,497",..: 10 11 9 8 4 3 2 23 22 21 ...
## $ Monster : Factor w/ 23 levels "","1,207","1,815",..: 7 10 8 4 6 3 2 22 18 17 ...
Subsetting the Data Science general skills
# Keep only the first 15 rows (all columns) of the general-skills table.
dskg <- dsg[seq_len(15), ]
Cleaning my dataframe: renaming the columns and converting the counts to numeric
# Give the five columns short, consistent names.
names(dskg) <- c("D.Skills", "LinkedIn", "Indeed", "SimplyHired", "Monster")

# Drop the thousands separators from each job-site column and convert the
# result to numeric.
dskg[c("LinkedIn", "Indeed", "SimplyHired", "Monster")] <- lapply(
  dskg[c("LinkedIn", "Indeed", "SimplyHired", "Monster")],
  function(counts) as.numeric(gsub(",", "", counts))
)
Computation
# Per-site totals of the listing counts; missing counts are ignored.
s.dskg <- dskg %>%
  summarise(
    sL = sum(LinkedIn, na.rm = TRUE),
    sI = sum(Indeed, na.rm = TRUE),
    sS = sum(SimplyHired, na.rm = TRUE),
    sM = sum(Monster, na.rm = TRUE)
  )

# Grand total across the four job sites.
tsg <- sum(s.dskg)

# Each skill's share of the grand total. A skill with a missing count on any
# site gets an NA share, because the row sum uses plain `+`.
g.skills <- dskg %>%
  mutate(pct = (LinkedIn + Indeed + SimplyHired + Monster) / tsg) %>%
  select(D.Skills, pct)
g.skills
Visualisation
Plotting with horizontal column
library(ggplot2)

# Horizontal bar chart of each general skill's share, ordered by share.
# Skills whose share is not above 0.001 are left out.
g.skills %>%
  filter(pct > 0.001) %>%
  mutate(phrases = reorder(D.Skills, pct)) %>%
  ggplot(mapping = aes(x = phrases, y = pct)) +
  geom_col(mapping = aes(fill = phrases), show.legend = FALSE) +
  labs(
    title = "General Skills for Data Science",
    x = NULL,
    y = "Overall percentage"
  ) +
  coord_flip()

Analysis, Machine Learning, Statistics, Computer Science and Communication are the general skills required for Data Scientists.
Data Science Software Skills
Downloading data
# Load the software-skills listing counts.
dss <- read.csv(
  file = "https://raw.githubusercontent.com/AlainKuiete/DATA607/master/ds_job_listing_software.csv"
)
Subsetting the Data Science software skills
# Keep the first 30 rows and the first 5 columns, then inspect the column types.
dsk <- dss[seq_len(30), seq_len(5)]
str(object = dsk)
## 'data.frame': 30 obs. of 5 variables:
## $ Keyword : Factor w/ 42 levels "","\"data scientist\" \"[keyword]\"",..: 31 33 39 37 15 18 34 40 17 35 ...
## $ LinkedIn : Factor w/ 40 levels "","1,024","1,040",..: 33 27 17 9 8 7 6 5 4 3 ...
## $ Indeed : Factor w/ 37 levels "","1,012","1,134",..: 23 22 11 5 6 4 3 2 37 34 ...
## $ SimplyHired: Factor w/ 39 levels "","1,059","1,164",..: 16 15 14 4 3 2 39 38 37 35 ...
## $ Monster : Factor w/ 40 levels "","1,002","1,062",..: 18 17 5 3 4 2 39 36 35 33 ...
Cleaning my dataframe: renaming the columns and converting the counts to numeric
# Give the five columns short, consistent names.
names(dsk) <- c("D.Skills", "LinkedIn", "Indeed", "SimplyHired", "Monster")

# Drop the thousands separators from each job-site column and convert the
# result to numeric.
dsk[c("LinkedIn", "Indeed", "SimplyHired", "Monster")] <- lapply(
  dsk[c("LinkedIn", "Indeed", "SimplyHired", "Monster")],
  function(counts) as.numeric(gsub(",", "", counts))
)
Computation
# Per-site totals of the listing counts. Missing counts are ignored, as in the
# general-skills totals, so a single NA does not turn every share into NA.
s.dsks <- summarise(
  dsk,
  sL = sum(LinkedIn, na.rm = TRUE),
  sI = sum(Indeed, na.rm = TRUE),
  sS = sum(SimplyHired, na.rm = TRUE),
  sM = sum(Monster, na.rm = TRUE)
)

# Grand total across the four job sites.
ts <- sum(s.dsks)

# Each software skill's share of the grand total (rows keep their file order).
skills <- dsk %>%
  mutate(pct = (LinkedIn + Indeed + SimplyHired + Monster) / ts) %>%
  select(D.Skills, pct)

# Rank by share before keeping ten rows, so `skill` really holds the ten
# largest shares instead of whichever ten rows come first in the file.
skill <- skills %>%
  arrange(desc(pct)) %>%
  head(10)
skill
Visualisation
# Order the skills from largest to smallest share so the bars are drawn in
# that order.
skill <- mutate(skill, D.Skills = reorder(D.Skills, -1 * pct))

# Bar chart of the selected software skills, one coloured bar per skill.
# The title previously misspelled "Software".
ggplot(skill, aes(x = D.Skills, y = pct)) +
  geom_col(aes(fill = D.Skills), position = "dodge") +
  xlab("Computer Skills") +
  ylab("Overall percentage") +
  ggtitle("Top 10 Software Skills for Data Science")
