#install new package #GCOOKBOOK
#install.packages("gcookbook")
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(gcookbook)
library(forcats)
# Download the data-science skills keyword counts from GitHub. The file is
# comma-separated and its first line holds the column names.
skills_data <- read.csv(
  file = "https://raw.githubusercontent.com/Raji030/data_science_skills/main/ds_general_skills_revised.csv",
  header = TRUE,
  sep = ","
)
# Show the raw table exactly as it was loaded.
print(skills_data)
## Keyword
## 1 machine learning
## 2 analysis
## 3 statistics
## 4 computer science
## 5 communication
## 6 mathematics
## 7 visualization
## 8 AI composite
## 9 deep learning
## 10 NLP composite
## 11 software development
## 12 neural networks
## 13 data engineering
## 14 project management
## 15 software engineering
## 16
## 17 Total
## 18
## 19 add AI and artificial intelligence and subtract the overlap search term with both terms in it
## 20 AI
## 21 artificial intelligence
## 22 AI + artificial intelligence
## 23
## 24 add NLP and natural language processing and subtract the overlap search term with both terms in it
## 25 NLP
## 26 natural language processing
## 27 NLP + natural language processing
## 28
## 29 "data scientist" "[keyword]"
## 30 Oct 10, 2018
## LinkedIn Indeed SimplyHired Monster
## 1 5,701 3,439 2,561 2,340
## 2 5,168 3,500 2,668 3,306
## 3 4,893 2,992 2,308 2,399
## 4 4,517 2,739 2,093 1,900
## 5 3,404 2,344 1,791 2,053
## 6 2,605 1,961 1,497 1,815
## 7 1,879 1,413 1,153 1,207
## 8 1,568 1,125 811 687
## 9 1,310 979 675 606
## 10 1,212 910 660 582
## 11 732 627 481 784
## 12 671 485 421 305
## 13 514 339 276 200
## 14 476 397 330 348
## 15 413 295 250 512
## 16
## 17 35,063 23,545 17,975 19,044
## 18
## 19
## 20 916 690 508 680
## 21 964 754 498 679
## 22 312 319 195 672
## 23
## 24
## 25 643 466 362 576
## 26 791 621 429 575
## 27 222 177 131 569
## 28
## 29
## 30
# Inspect the column types: all five columns, including the four count
# columns, are read in as character.
glimpse(skills_data)
## Rows: 30
## Columns: 5
## $ Keyword <chr> "machine learning", "analysis", "statistics", "computer sc…
## $ LinkedIn <chr> "5,701", "5,168", "4,893", "4,517", "3,404", "2,605", "1,8…
## $ Indeed <chr> "3,439", "3,500", "2,992", "2,739", "2,344", "1,961", "1,4…
## $ SimplyHired <chr> "2,561", "2,668", "2,308", "2,093", "1,791", "1,497", "1,1…
## $ Monster <chr> "2,340", "3,306", "2,399", "1,900", "2,053", "1,815", "1,2…
# Remove the spacer rows, i.e. rows in which every cell is empty.
# A cell counts as blank when it is "" or NA. Treating NA as blank matters:
# with a plain `== ""` test an NA cell makes the row's result NA, and using
# NA as a row index would insert an all-NA row instead of dropping anything.
blank_cells <- is.na(skills_data) | skills_data == ""
# Keep every row that has at least one non-blank cell.
data_1 <- skills_data[rowSums(!blank_cells) > 0, , drop = FALSE]
data_1
## Keyword
## 1 machine learning
## 2 analysis
## 3 statistics
## 4 computer science
## 5 communication
## 6 mathematics
## 7 visualization
## 8 AI composite
## 9 deep learning
## 10 NLP composite
## 11 software development
## 12 neural networks
## 13 data engineering
## 14 project management
## 15 software engineering
## 17 Total
## 19 add AI and artificial intelligence and subtract the overlap search term with both terms in it
## 20 AI
## 21 artificial intelligence
## 22 AI + artificial intelligence
## 24 add NLP and natural language processing and subtract the overlap search term with both terms in it
## 25 NLP
## 26 natural language processing
## 27 NLP + natural language processing
## 29 "data scientist" "[keyword]"
## 30 Oct 10, 2018
## LinkedIn Indeed SimplyHired Monster
## 1 5,701 3,439 2,561 2,340
## 2 5,168 3,500 2,668 3,306
## 3 4,893 2,992 2,308 2,399
## 4 4,517 2,739 2,093 1,900
## 5 3,404 2,344 1,791 2,053
## 6 2,605 1,961 1,497 1,815
## 7 1,879 1,413 1,153 1,207
## 8 1,568 1,125 811 687
## 9 1,310 979 675 606
## 10 1,212 910 660 582
## 11 732 627 481 784
## 12 671 485 421 305
## 13 514 339 276 200
## 14 476 397 330 348
## 15 413 295 250 512
## 17 35,063 23,545 17,975 19,044
## 19
## 20 916 690 508 680
## 21 964 754 498 679
## 22 312 319 195 672
## 24
## 25 643 466 362 576
## 26 791 621 429 575
## 27 222 177 131 569
## 29
## 30
# Keep only the skill rows and the "Total" row. Everything below "Total" is
# explanatory notes plus the raw AI / NLP search counts that are already
# combined into the "AI composite" and "NLP composite" rows.
# The "Total" row is located by name rather than by position: the previous
# hard-coded range 17:30 ran past the 26 rows of data_1 and only worked
# because out-of-range negative indices are ignored.
total_row <- match("Total", data_1$Keyword)
if (is.na(total_row)) {
  stop("No \"Total\" row found in data_1$Keyword", call. = FALSE)
}
data_2 <- data_1[seq_len(total_row), , drop = FALSE]
data_2
## Keyword LinkedIn Indeed SimplyHired Monster
## 1 machine learning 5,701 3,439 2,561 2,340
## 2 analysis 5,168 3,500 2,668 3,306
## 3 statistics 4,893 2,992 2,308 2,399
## 4 computer science 4,517 2,739 2,093 1,900
## 5 communication 3,404 2,344 1,791 2,053
## 6 mathematics 2,605 1,961 1,497 1,815
## 7 visualization 1,879 1,413 1,153 1,207
## 8 AI composite 1,568 1,125 811 687
## 9 deep learning 1,310 979 675 606
## 10 NLP composite 1,212 910 660 582
## 11 software development 732 627 481 784
## 12 neural networks 671 485 421 305
## 13 data engineering 514 339 276 200
## 14 project management 476 397 330 348
## 15 software engineering 413 295 250 512
## 17 Total 35,063 23,545 17,975 19,044
# Give the two composite rows their full skill names; every other keyword is
# left untouched.
data_2$Keyword <- local({
  full_names <- c(
    "AI composite" = "artificial intelligence",
    "NLP composite" = "natural language processing"
  )
  keywords <- data_2$Keyword
  # Position of each keyword in the lookup table, NA when it is not listed.
  hit <- match(keywords, names(full_names))
  listed <- !is.na(hit)
  keywords[listed] <- unname(full_names[hit[listed]])
  keywords
})
print(data_2)
## Keyword LinkedIn Indeed SimplyHired Monster
## 1 machine learning 5,701 3,439 2,561 2,340
## 2 analysis 5,168 3,500 2,668 3,306
## 3 statistics 4,893 2,992 2,308 2,399
## 4 computer science 4,517 2,739 2,093 1,900
## 5 communication 3,404 2,344 1,791 2,053
## 6 mathematics 2,605 1,961 1,497 1,815
## 7 visualization 1,879 1,413 1,153 1,207
## 8 artificial intelligence 1,568 1,125 811 687
## 9 deep learning 1,310 979 675 606
## 10 natural language processing 1,212 910 660 582
## 11 software development 732 627 481 784
## 12 neural networks 671 485 421 305
## 13 data engineering 514 339 276 200
## 14 project management 476 397 330 348
## 15 software engineering 413 295 250 512
## 17 Total 35,063 23,545 17,975 19,044
# The four job-site count columns, handled together instead of one copy of
# the conversion per column.
site_cols <- c("LinkedIn", "Indeed", "SimplyHired", "Monster")

# Convert counts stored as text such as "5,701" to numbers by stripping every
# non-digit character (here: the thousands separators).
data_2 <- data_2 %>%
  mutate(
    across(all_of(site_cols), ~ as.numeric(gsub("[^[:digit:]]", "", .x)))
  )

# Add each skill's combined count across the four sites, drop the sheet's own
# "Total" row (matched by name rather than by its row position, so the step
# keeps working if rows are added or removed), and rename the skill column.
data_3 <- data_2 %>%
  mutate(total_count = rowSums(across(all_of(site_cols)))) %>%
  filter(!Keyword %in% "Total") %>%
  rename(Data_Skills = Keyword)
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
# Pick the five most- and the five least-requested skills by combined count.
# with_ties = FALSE guarantees exactly five rows per side; with the default
# (with_ties = TRUE) a tie at the cut-off returns extra rows, which would
# exceed the ten colours supplied to the manual fill scale in the plots.
Top_5 <- data_3 %>%
  slice_max(total_count, n = 5, with_ties = FALSE)
Bottom_5 <- data_3 %>%
  slice_min(total_count, n = 5, with_ties = FALSE)
# Stack both groups (top five first) into one table for plotting.
Top_Bottom <- bind_rows(Top_5, Bottom_5)
# Draw a horizontal bar chart of the skills in `data`, ranked by one count
# column.
#
# data      : data frame with a Data_Skills column and the count column.
# count_col : unquoted name of the count column to plot and to rank by.
# title     : plot title.
# x_label   : x-axis label.
#
# Bars are ordered by `count_col`. Fill colours are assigned in that same
# order (first five levels red, next five green), so the red/green split
# assumes `data` holds exactly ten skills. Returns the ggplot object.
plot_top_bottom <- function(data, count_col, title, x_label) {
  data %>%
    mutate(Data_Skills = fct_reorder(Data_Skills, {{ count_col }})) %>%
    select(Data_Skills, {{ count_col }}) %>%
    ggplot(aes(x = {{ count_col }}, y = Data_Skills, fill = Data_Skills)) +
    geom_col(width = 0.8, color = "purple") +
    scale_fill_manual(values = rep(c("red", "green"), each = 5)) +
    geom_text(aes(label = {{ count_col }}), size = 3) +
    theme_minimal() +
    ggtitle(title) +
    labs(y = "Top5 & Bottom5 Skills", x = x_label)
}

# NOTE(review): Top_Bottom is selected on total_count, so the per-site charts
# below show the overall top/bottom skills with that site's counts, not each
# site's own top and bottom five. Confirm this is the intended comparison.

# Combined count across all four job sites.
Top_Bottom_Overall <- plot_top_bottom(
  Top_Bottom, total_count,
  title = "Top 5 & Bottom 5- Overall Data Scientist Skills ",
  x_label = "Overall Count"
)
Top_Bottom_Overall

# LinkedIn counts.
Top_Bottom_LinkedIn <- plot_top_bottom(
  Top_Bottom, LinkedIn,
  title = "Top 5 & Bottom 5- LinkedIn Data Scientist Skills ",
  x_label = "LinkedIn Count"
)
Top_Bottom_LinkedIn

# Indeed counts.
Top_Bottom_Indeed <- plot_top_bottom(
  Top_Bottom, Indeed,
  title = "Top 5 & Bottom 5- Indeed Data Scientist Skills",
  x_label = "Indeed Count"
)
Top_Bottom_Indeed

# SimplyHired counts.
Top_Bottom_SimplyHired <- plot_top_bottom(
  Top_Bottom, SimplyHired,
  title = "Top 5 & Bottom 5- SimplyHired Data Scientist Skills",
  x_label = "SimplyHired Count"
)
Top_Bottom_SimplyHired

# Monster counts.
Top_Bottom_Monster <- plot_top_bottom(
  Top_Bottom, Monster,
  title = "Top 5 & Bottom 5- Monster Data Scientist Skills",
  x_label = "Monster Count"
)
Top_Bottom_Monster
# Analysis ----
# Render the five charts once more, in the same order as before, ahead of the
# written analysis.
invisible(lapply(
  list(
    Top_Bottom_Overall,
    Top_Bottom_LinkedIn,
    Top_Bottom_Indeed,
    Top_Bottom_SimplyHired,
    Top_Bottom_Monster
  ),
  print
))
# Analysis:
# 1. LinkedIn - Machine learning is the top skill and software engineering
#    is the bottom skill, in contrast to the overall ranking. (LinkedIn's
#    total count is almost twice SimplyHired's.)
# 2. Indeed - The top five skills are in the same order as the overall
#    ranking, but the bottom two are reversed: software engineering is
#    lowest and data engineering is second-lowest, the opposite of the
#    overall order. The counts for the top two skills are within 100 of
#    each other.
# 3. SimplyHired - Same as Indeed.
# 4. Monster - The top skill's count is 37% higher than the second skill's.
#    Machine learning drops to third place, and communication ranks higher
#    than it does overall and on the other three job websites. Software
#    engineering is the lowest or second-lowest skill overall and on the
#    other job websites, but on Monster it is the second-highest of the
#    bottom five.