# Install the gcookbook package (uncomment the line below if it is not yet installed)

#install.packages("gcookbook")

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(stringr)
library(ggplot2)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:tidyr':
## 
##     extract

library(gcookbook)
library(forcats)

Getting the data

# Read the raw keyword-count table from the CSV file hosted on GitHub.
skills_data <- read.csv(
  file = "https://raw.githubusercontent.com/Raji030/data_science_skills/main/ds_general_skills_revised.csv",
  header = TRUE,
  sep = ","
)

skills_data

##                                                                                               Keyword
## 1                                                                                    machine learning
## 2                                                                                            analysis
## 3                                                                                          statistics
## 4                                                                                    computer science
## 5                                                                                       communication
## 6                                                                                         mathematics
## 7                                                                                       visualization
## 8                                                                                        AI composite
## 9                                                                                       deep learning
## 10                                                                                      NLP composite
## 11                                                                               software development
## 12                                                                                    neural networks
## 13                                                                                   data engineering
## 14                                                                                 project management
## 15                                                                               software engineering
## 16                                                                                                   
## 17                                                                                              Total
## 18                                                                                                   
## 19      add AI and artificial intelligence and subtract the overlap search term with both terms in it
## 20                                                                                                 AI
## 21                                                                            artificial intelligence
## 22                                                                       AI + artificial intelligence
## 23                                                                                                   
## 24 add NLP and natural language processing and subtract the overlap search term with both terms in it
## 25                                                                                                NLP
## 26                                                                        natural language processing
## 27                                                                  NLP + natural language processing
## 28                                                                                                   
## 29                                                                       "data scientist" "[keyword]"
## 30                                                                                       Oct 10, 2018
##    LinkedIn Indeed SimplyHired Monster
## 1     5,701  3,439       2,561   2,340
## 2     5,168  3,500       2,668   3,306
## 3     4,893  2,992       2,308   2,399
## 4     4,517  2,739       2,093   1,900
## 5     3,404  2,344       1,791   2,053
## 6     2,605  1,961       1,497   1,815
## 7     1,879  1,413       1,153   1,207
## 8     1,568  1,125         811     687
## 9     1,310    979         675     606
## 10    1,212    910         660     582
## 11      732    627         481     784
## 12      671    485         421     305
## 13      514    339         276     200
## 14      476    397         330     348
## 15      413    295         250     512
## 16                                    
## 17   35,063 23,545      17,975  19,044
## 18                                    
## 19                                    
## 20      916    690         508     680
## 21      964    754         498     679
## 22      312    319         195     672
## 23                                    
## 24                                    
## 25      643    466         362     576
## 26      791    621         429     575
## 27      222    177         131     569
## 28                                    
## 29                                    
## 30

# Compact overview of the raw table: one line per column with its type and
# first values.
dplyr::glimpse(skills_data)

## Rows: 30
## Columns: 5
## $ Keyword     <chr> "machine learning", "analysis", "statistics", "computer sc…
## $ LinkedIn    <chr> "5,701", "5,168", "4,893", "4,517", "3,404", "2,605", "1,8…
## $ Indeed      <chr> "3,439", "3,500", "2,992", "2,739", "2,344", "1,961", "1,4…
## $ SimplyHired <chr> "2,561", "2,668", "2,308", "2,093", "1,791", "1,497", "1,1…
## $ Monster     <chr> "2,340", "3,306", "2,399", "1,900", "2,053", "1,815", "1,2…

Data Cleaning:

# Remove spacer rows: drop every row whose cells are all blank.
# A cell counts as blank when it is an empty string or NA. Treating NA as
# blank keeps the row mask free of NA values, which would otherwise insert
# all-NA rows into the subset.
blank_cell <- is.na(skills_data) | skills_data == ""
data_1 <- skills_data[rowSums(!blank_cell) > 0, ]
data_1

##                                                                                               Keyword
## 1                                                                                    machine learning
## 2                                                                                            analysis
## 3                                                                                          statistics
## 4                                                                                    computer science
## 5                                                                                       communication
## 6                                                                                         mathematics
## 7                                                                                       visualization
## 8                                                                                        AI composite
## 9                                                                                       deep learning
## 10                                                                                      NLP composite
## 11                                                                               software development
## 12                                                                                    neural networks
## 13                                                                                   data engineering
## 14                                                                                 project management
## 15                                                                               software engineering
## 17                                                                                              Total
## 19      add AI and artificial intelligence and subtract the overlap search term with both terms in it
## 20                                                                                                 AI
## 21                                                                            artificial intelligence
## 22                                                                       AI + artificial intelligence
## 24 add NLP and natural language processing and subtract the overlap search term with both terms in it
## 25                                                                                                NLP
## 26                                                                        natural language processing
## 27                                                                  NLP + natural language processing
## 29                                                                       "data scientist" "[keyword]"
## 30                                                                                       Oct 10, 2018
##    LinkedIn Indeed SimplyHired Monster
## 1     5,701  3,439       2,561   2,340
## 2     5,168  3,500       2,668   3,306
## 3     4,893  2,992       2,308   2,399
## 4     4,517  2,739       2,093   1,900
## 5     3,404  2,344       1,791   2,053
## 6     2,605  1,961       1,497   1,815
## 7     1,879  1,413       1,153   1,207
## 8     1,568  1,125         811     687
## 9     1,310    979         675     606
## 10    1,212    910         660     582
## 11      732    627         481     784
## 12      671    485         421     305
## 13      514    339         276     200
## 14      476    397         330     348
## 15      413    295         250     512
## 17   35,063 23,545      17,975  19,044
## 19                                    
## 20      916    690         508     680
## 21      964    754         498     679
## 22      312    319         195     672
## 24                                    
## 25      643    466         362     576
## 26      791    621         429     575
## 27      222    177         131     569
## 29                                    
## 30

# Keep the per-skill rows plus the "Total" row and discard everything after it.
# The rows after "Total" are the component searches behind the AI/NLP composite
# rows and the source notes, so they are redundant for the analysis.
# The cut-off is located by keyword instead of by fixed row positions, so it
# stays correct if the number of rows before "Total" changes.
total_row <- match("Total", data_1$Keyword)
stopifnot(!is.na(total_row))
data_2 <- data_1[seq_len(total_row), ]
data_2

##                 Keyword LinkedIn Indeed SimplyHired Monster
## 1      machine learning    5,701  3,439       2,561   2,340
## 2              analysis    5,168  3,500       2,668   3,306
## 3            statistics    4,893  2,992       2,308   2,399
## 4      computer science    4,517  2,739       2,093   1,900
## 5         communication    3,404  2,344       1,791   2,053
## 6           mathematics    2,605  1,961       1,497   1,815
## 7         visualization    1,879  1,413       1,153   1,207
## 8          AI composite    1,568  1,125         811     687
## 9         deep learning    1,310    979         675     606
## 10        NLP composite    1,212    910         660     582
## 11 software development      732    627         481     784
## 12      neural networks      671    485         421     305
## 13     data engineering      514    339         276     200
## 14   project management      476    397         330     348
## 15 software engineering      413    295         250     512
## 17                Total   35,063 23,545      17,975  19,044

# Spell out the abbreviated composite skill names in the Keyword column.
data_2$Keyword <- replace(
  data_2$Keyword,
  data_2$Keyword == "AI composite",
  "artificial intelligence"
)
data_2$Keyword <- replace(
  data_2$Keyword,
  data_2$Keyword == "NLP composite",
  "natural language processing"
)
data_2

##                        Keyword LinkedIn Indeed SimplyHired Monster
## 1             machine learning    5,701  3,439       2,561   2,340
## 2                     analysis    5,168  3,500       2,668   3,306
## 3                   statistics    4,893  2,992       2,308   2,399
## 4             computer science    4,517  2,739       2,093   1,900
## 5                communication    3,404  2,344       1,791   2,053
## 6                  mathematics    2,605  1,961       1,497   1,815
## 7                visualization    1,879  1,413       1,153   1,207
## 8      artificial intelligence    1,568  1,125         811     687
## 9                deep learning    1,310    979         675     606
## 10 natural language processing    1,212    910         660     582
## 11        software development      732    627         481     784
## 12             neural networks      671    485         421     305
## 13            data engineering      514    339         276     200
## 14          project management      476    397         330     348
## 15        software engineering      413    295         250     512
## 17                       Total   35,063 23,545      17,975  19,044

Data Transformation: convert the counts from character to numeric, add a total count for each skill, and remove the Total row

# Columns holding the per-site posting counts.
site_cols <- c("LinkedIn", "Indeed", "SimplyHired", "Monster")

# The counts were read as text with thousands separators (e.g. "5,701"):
# strip every non-digit character, then convert to numeric.
data_2 <- data_2 %>%
  mutate(across(
    all_of(site_cols),
    function(counts) as.numeric(gsub("[^[:digit:]]", "", counts))
  ))

# Build the per-skill table:
#   * drop the "Total" summary row by name rather than by row position,
#   * add total_count, the sum of the four site counts for each skill,
#   * rename Keyword to Data_Skills.
data_3 <- data_2 %>%
  filter(Keyword != "Total") %>%
  mutate(total_count = rowSums(across(all_of(site_cols)))) %>%
  rename(Data_Skills = Keyword)

Select the top 5 and bottom 5 skills

library(dbplyr)

## 
## Attaching package: 'dbplyr'

## The following objects are masked from 'package:dplyr':
## 
##     ident, sql

# The five skills with the highest and the five with the lowest overall count.
# with_ties = FALSE caps each slice at exactly five rows. With the default
# (with_ties = TRUE), tied counts would return extra rows, and the charts that
# use Top_Bottom supply exactly ten fill colours.
Top_5 <- data_3 %>%
  slice_max(total_count, n = 5, with_ties = FALSE)

Bottom_5 <- data_3 %>%
  slice_min(total_count, n = 5, with_ties = FALSE)

# Stack both groups into a single table for plotting.
Top_Bottom <- bind_rows(Top_5, Bottom_5)

Overall Top 5 & Bottom 5- Data Scientist Skills

# Horizontal bar chart of the overall count for the skills in Top_Bottom.
Top_Bottom_Overall <- Top_Bottom %>%
  select(Data_Skills, total_count) %>%
  # Order the skill levels by count so the bars are sorted.
  mutate(Data_Skills = fct_reorder(Data_Skills, total_count)) %>%
  ggplot(mapping = aes(x = total_count, y = Data_Skills, fill = Data_Skills)) +
  geom_bar(stat = "identity", width = 0.8, color = "purple") +
  # Unnamed fill values are matched to the factor levels in order: the first
  # five levels are red and the next five green (assumes ten skills).
  scale_fill_manual(values = rep(c("red", "green"), each = 5)) +
  geom_text(mapping = aes(label = total_count), size = 3) +
  theme_minimal() +
  labs(
    title = "Top 5 & Bottom 5- Overall Data Scientist Skills ",
    x = "Overall Count",
    y = "Top5 & Bottom5 Skills"
  )

Top_Bottom_Overall

LinkedIn Top 5 & Bottom 5 Data Scientist Skills

# Horizontal bar chart of the LinkedIn count for the skills in Top_Bottom.
Top_Bottom_LinkedIn <- Top_Bottom %>%
  select(Data_Skills, LinkedIn) %>%
  # Order the skill levels by LinkedIn count so the bars are sorted.
  mutate(Data_Skills = fct_reorder(Data_Skills, LinkedIn)) %>%
  ggplot(mapping = aes(x = LinkedIn, y = Data_Skills, fill = Data_Skills)) +
  geom_bar(stat = "identity", width = 0.8, color = "purple") +
  # Unnamed fill values are matched to the factor levels in order: the first
  # five levels are red and the next five green (assumes ten skills).
  scale_fill_manual(values = rep(c("red", "green"), each = 5)) +
  geom_text(mapping = aes(label = LinkedIn), size = 3) +
  theme_minimal() +
  labs(
    title = "Top 5 & Bottom 5- LinkedIn Data Scientist Skills ",
    x = "LinkedIn Count",
    y = "Top5 & Bottom5 Skills"
  )

Top_Bottom_LinkedIn

Indeed Top 5 & Bottom 5 Data Scientist Skills

# Horizontal bar chart of the Indeed count for the skills in Top_Bottom.
Top_Bottom_Indeed <- Top_Bottom %>%
  select(Data_Skills, Indeed) %>%
  # Order the skill levels by Indeed count so the bars are sorted.
  mutate(Data_Skills = fct_reorder(Data_Skills, Indeed)) %>%
  ggplot(mapping = aes(x = Indeed, y = Data_Skills, fill = Data_Skills)) +
  geom_bar(stat = "identity", width = 0.8, color = "purple") +
  # Unnamed fill values are matched to the factor levels in order: the first
  # five levels are red and the next five green (assumes ten skills).
  scale_fill_manual(values = rep(c("red", "green"), each = 5)) +
  geom_text(mapping = aes(label = Indeed), size = 3) +
  theme_minimal() +
  labs(
    title = "Top 5 & Bottom 5- Indeed Data Scientist Skills",
    x = "Indeed Count",
    y = "Top5 & Bottom5 Skills"
  )

Top_Bottom_Indeed

SimplyHired Top 5 & Bottom 5 Data Scientist Skills

# Horizontal bar chart of the SimplyHired count for the skills in Top_Bottom.
Top_Bottom_SimplyHired <- Top_Bottom %>%
  select(Data_Skills, SimplyHired) %>%
  # Order the skill levels by SimplyHired count so the bars are sorted.
  mutate(Data_Skills = fct_reorder(Data_Skills, SimplyHired)) %>%
  ggplot(mapping = aes(x = SimplyHired, y = Data_Skills, fill = Data_Skills)) +
  geom_bar(stat = "identity", width = 0.8, color = "purple") +
  # Unnamed fill values are matched to the factor levels in order: the first
  # five levels are red and the next five green (assumes ten skills).
  scale_fill_manual(values = rep(c("red", "green"), each = 5)) +
  geom_text(mapping = aes(label = SimplyHired), size = 3) +
  theme_minimal() +
  labs(
    title = "Top 5 & Bottom 5- SimplyHired Data Scientist Skills",
    x = "SimplyHired Count",
    y = "Top5 & Bottom5 Skills"
  )

Top_Bottom_SimplyHired

Monster Top 5 & Bottom 5 Data Scientist Skills

# Horizontal bar chart of the Monster count for the skills in Top_Bottom.
Top_Bottom_Monster <- Top_Bottom %>%
  select(Data_Skills, Monster) %>%
  # Order the skill levels by Monster count so the bars are sorted.
  mutate(Data_Skills = fct_reorder(Data_Skills, Monster)) %>%
  ggplot(mapping = aes(x = Monster, y = Data_Skills, fill = Data_Skills)) +
  geom_bar(stat = "identity", width = 0.8, color = "purple") +
  # Unnamed fill values are matched to the factor levels in order: the first
  # five levels are red and the next five green (assumes ten skills).
  scale_fill_manual(values = rep(c("red", "green"), each = 5)) +
  geom_text(mapping = aes(label = Monster), size = 3) +
  theme_minimal() +
  labs(
    title = "Top 5 & Bottom 5- Monster Data Scientist Skills",
    x = "Monster Count",
    y = "Top5 & Bottom5 Skills"
  )

Top_Bottom_Monster

# Analysis
# Display the five charts again, side by side with the written analysis.
# Each bare object name is a top-level expression, so it is auto-printed,
# which renders the plot.

# Overall counts (sum across the four job sites)
Top_Bottom_Overall

# LinkedIn counts
Top_Bottom_LinkedIn

# Indeed counts
Top_Bottom_Indeed

# SimplyHired counts
Top_Bottom_SimplyHired

# Monster counts
Top_Bottom_Monster

Analysis: 1. LinkedIn - Machine learning is the top skill and software engineering is the bottom skill, compared with the overall ranking. (LinkedIn's total count is almost twice that of SimplyHired.) 2. Indeed - The top five skills overlap with the overall ranking, but the bottom two skills are data engineering and then software engineering, which is the opposite of the overall order. The counts for the top two skills are close to each other, differing by fewer than 100. 3. SimplyHired - Same as Indeed. 4. Monster - The top skill's count is 37% higher than that of the second skill, machine learning drops to third place, and communication ranks higher than it does overall and on the other three job websites. Software engineering is the bottom or second-from-bottom skill overall and on the other job websites, but on Monster it ranks second among the bottom five.

Project3_DATA607

Mahmud Hasan Al Raji & Ivan Tikhonov & Joyce Aldrich & Ariana Nolan

2022-10-22