607 Week8 Project3

Overview

This project is an opportunity to practice both the hard technical skills around R and SQL as well as the soft skills of distance collaboration with classmates. The primary question of this project is: Which are the most valued data science skills? Groups are to seek an answer to this question while also completing each of the following:

Form a group of 3-5 members
Establish the tools the group will use to collaborate
Determine what data to collect and how to load it
Use R to tidy and transform the data
Load the data into a relational database as a set of normalized tables
Use R to perform exploritory analysis (and any additional transformations needed)
Provide documentation of data sources, transformation, analysis, and results
Submit all code, documentation, and links to collaboration spaces
Give a 3-5 minute presentation about the project at the meetup following the project due date

Documentation

Documentation for this project was developed and maintained in a sepereate file. This was done both to allow parallel development but also to minimize the length each document. The documentaion can be found at:

http://rpubs.com/StephRoark/430122

Load the Selected Data Sources

#load the necessary supporting packages
library(stringr)
library(dplyr)
library(tidyr)
library(Hmisc)
library(ggplot2)


file_location_1 <- 'https://raw.githubusercontent.com/ZacharyHerold/CUNY-DATA607/master/SCData.csv'
ds_user_profiles_skills <- read.table(file_location_1, sep = ",", header = TRUE, stringsAsFactors = FALSE)


file_location_2 <- 'https://raw.githubusercontent.com/ZacharyHerold/CUNY-DATA607/master/ds_general_skills_revised.csv'
ds_job_postings_skills <- read.table(file_location_2, sep = ",", header = TRUE, stringsAsFactors = FALSE)


file_location_3 <- 'https://raw.githubusercontent.com/ZacharyHerold/CUNY-DATA607/master/ds_job_listing_software.csv'
ds_job_postings_software <- read.table(file_location_3, sep = ",", header = TRUE, fill = TRUE, stringsAsFactors = FALSE, strip.white = TRUE, comment.char = "")

Tidy and Transform the Data

Tidy and transform the user profile datadata

#ds_user_profiles_skills: 
    ds_user_profiles <- ds_user_profiles_skills %>%
      mutate(Skill = tolower(Skill)) %>%
      mutate(Skill = capitalize(Skill)) %>%
      mutate(Skill_Type = ifelse(Skill %in% c("R"
                                              ,"Python"
                                              ,"Sql"
                                              ,"Matlab"
                                              ,"Java"
                                              ,"C++"
                                              ,"Sas"
                                              ,"Hadoop")
                                 ,"Software/Language Skill"
                                 ,"General Skill")) %>%
      select("Skill_Type", "Skill", "Percentage")

Pull out and tidy the count of job postings by site

#ds_job_postings_n_postings
    #pull out the total number of postings (to be used later)
    ds_jp_n_postings <- ds_job_postings_software[c(41),]
    
    ds_jp_n_postings <- ds_jp_n_postings %>%
      select("LinkedIn"
             , "Indeed"
             , "SimplyHired"
             , "Monster")
    
    ds_jp_n_postings <- gather(ds_jp_n_postings, "Site", "N_Postings", 1:4)
    ds_jp_n_postings <- ds_jp_n_postings %>%
      mutate(N_Postings = str_replace_all(N_Postings, ",", "")) %>%
      mutate(N_Postings = as.numeric(N_Postings))
    #ds_jp_n_postings

Tidy and transform the two job posting data sets

#ds_job_postings_skills: 
    #limit to desired rows
    ds_job_postings_skills <- ds_job_postings_skills[c(1:15, 17),]
    
    #cleanup
    ds_job_postings_skills <- ds_job_postings_skills %>%
      mutate(Skill_Type = "General Skill")  %>%
      select("Skill_Type"
             , "Keyword"
             , "LinkedIn"
             , "Indeed"
             , "SimplyHired"
             , "Monster")
 
    
#ds_job_postings_software:
    
    #limit to desired rows
    ds_job_postings_software <- ds_job_postings_software[c(1:37, 39),]
    
    #cleanup
    ds_job_postings_software <- ds_job_postings_software %>%
      #filter(c(1:37)) %>%
      mutate(Skill_Type = "Software/Language Skill") %>%
      select("Skill_Type"
             , "Keyword"
             , "LinkedIn"
             , "Indeed"
             , "SimplyHired"
             , "Monster")

# union (append) the two job posting datasets
    ds_job_postings <- union(ds_job_postings_skills
                             ,ds_job_postings_software)
              
#tidy the unioned job postings dataset
    ds_job_postings <- ds_job_postings %>%
      rename(Skill = "Keyword") %>%
      filter(Skill != "Total") %>%
      mutate(Skill = tolower(Skill)) %>%
      mutate(Skill = capitalize(Skill)) 
    
#transform to "normalized" structure
    ds_job_postings <-gather(ds_job_postings, "Site", "N_Records", 3:6)
    
#cast to numeric
    ds_job_postings <- ds_job_postings %>%
      mutate(N_Records = str_replace_all(N_Records, ",", "")) %>%
      
      filter(!is.na(N_Records)) %>%
      filter(N_Records != "") %>%
      mutate(N_Records = as.numeric(N_Records))

In job posting data set, add percent of postings

ds_job_postings <- inner_join(ds_job_postings
                              , ds_jp_n_postings
                              , by = "Site")

ds_job_postings <- ds_job_postings %>%
  mutate(Pct_Postings = round(N_Records/N_Postings*100, 2)) %>%
  select(-N_Postings)
 
head(ds_job_postings, n=10)

##                 Skill_Type            Skill     Site N_Records
## 1  Software/Language Skill       Javascript LinkedIn       328
## 2  Software/Language Skill     Scikit-learn LinkedIn       474
## 3  Software/Language Skill              Sql LinkedIn      3879
## 4            General Skill         Analysis LinkedIn      5168
## 5            General Skill Data engineering LinkedIn       514
## 6  Software/Language Skill            Mysql LinkedIn       278
## 7            General Skill    Communication LinkedIn      3404
## 8  Software/Language Skill            Hbase LinkedIn       302
## 9  Software/Language Skill       Tensorflow LinkedIn       844
## 10 Software/Language Skill           Docker LinkedIn       290
##    Pct_Postings
## 1          3.81
## 2          5.51
## 3         45.05
## 4         60.02
## 5          5.97
## 6          3.23
## 7         39.54
## 8          3.51
## 9          9.80
## 10         3.37

Export Tidy Data to be Loaded to a Relational Database

write.csv(x = ds_user_profiles
          , file = "C:/Users/cbailey/Documents/GitHub/607-Week8-Project3/05_ds_profiles.csv"
          ,row.names = FALSE)


write.csv(x = ds_job_postings
          , file = "C:/Users/cbailey/Documents/GitHub/607-Week8-Project3/05_ds_job_postings.csv"
          ,row.names = FALSE)

Exploritory Analysis

Pull the top skills (listed >50% of postings) across sites

jp_avg_across_sites <- ds_job_postings %>%
  group_by(Skill_Type, Skill) %>%
  summarise(Avg_Pct_Postings = mean(Pct_Postings)) %>%
  arrange(desc(Avg_Pct_Postings)) 

jp_avg_across_sites <- as.data.frame(jp_avg_across_sites)

jp_avg_across_sites_top <- jp_avg_across_sites %>% 
  filter(Avg_Pct_Postings >= 50)
jp_avg_across_sites_top

##                Skill_Type            Skill Avg_Pct_Postings
## 1 Software/Language Skill           Python          72.8400
## 2           General Skill         Analysis          71.5175
## 3           General Skill Machine learning          65.6225
## 4           General Skill       Statistics          59.8450
## 5 Software/Language Skill                R          59.7400
## 6           General Skill Computer science          52.7875

Pull the top skills (listed >50% of postings) across sites
Also, review the frequency across sites and the order within sites

jp_avg_within_sites <- ds_job_postings %>%
  group_by(Site, Skill_Type, Skill) %>%
  summarise(Avg_Pct_Postings = mean(Pct_Postings)) %>%
  arrange(Site, desc(Avg_Pct_Postings)) %>%
  filter(Avg_Pct_Postings >= 50)

jp_avg_within_sites <- as.data.frame(jp_avg_within_sites)
jp_avg_within_sites

##           Site              Skill_Type            Skill Avg_Pct_Postings
## 1       Indeed Software/Language Skill           Python            74.31
## 2       Indeed           General Skill         Analysis            68.12
## 3       Indeed           General Skill Machine learning            66.93
## 4       Indeed Software/Language Skill                R            60.45
## 5       Indeed           General Skill       Statistics            58.23
## 6       Indeed           General Skill Computer science            53.31
## 7       Indeed Software/Language Skill              Sql            51.15
## 8     LinkedIn Software/Language Skill           Python            73.72
## 9     LinkedIn           General Skill Machine learning            66.21
## 10    LinkedIn           General Skill         Analysis            60.02
## 11    LinkedIn           General Skill       Statistics            56.83
## 12    LinkedIn Software/Language Skill                R            52.88
## 13    LinkedIn           General Skill Computer science            52.46
## 14     Monster           General Skill         Analysis            88.25
## 15     Monster Software/Language Skill           Python            67.91
## 16     Monster           General Skill       Statistics            64.04
## 17     Monster Software/Language Skill                R            63.13
## 18     Monster           General Skill Machine learning            62.47
## 19     Monster           General Skill    Communication            54.81
## 20     Monster           General Skill Computer science            50.72
## 21 SimplyHired Software/Language Skill           Python            75.42
## 22 SimplyHired           General Skill         Analysis            69.68
## 23 SimplyHired           General Skill Machine learning            66.88
## 24 SimplyHired Software/Language Skill                R            62.50
## 25 SimplyHired           General Skill       Statistics            60.28
## 26 SimplyHired           General Skill Computer science            54.66
## 27 SimplyHired Software/Language Skill              Sql            53.70

Compare that to the skills listed by data scientists

# change term in ds profile data set to allow for joining later
ds_user_profiles_top <-ds_user_profiles %>%
  mutate(Skill = str_replace_all(Skill, "Data analysis", "Analysis")) %>%
  arrange(desc(Percentage)) %>%
  top_n(7, Percentage)

ds_user_profiles

##                 Skill_Type                 Skill Percentage
## 1            General Skill         Data analysis      55.69
## 2  Software/Language Skill                     R      48.01
## 3  Software/Language Skill                Python      46.31
## 4            General Skill           Data mining      45.94
## 5            General Skill      Machine learning      45.32
## 6            General Skill            Statistics      41.52
## 7  Software/Language Skill                   Sql      39.00
## 8            General Skill             Analytics      35.16
## 9  Software/Language Skill                Matlab      30.55
## 10 Software/Language Skill                  Java      30.02
## 11           General Skill  Statistical modeling      27.15
## 12           General Skill      Algorithm design      27.04
## 13 Software/Language Skill                   C++      25.99
## 14           General Skill Business intelligence      23.78
## 15           General Skill              Big data      23.68
## 16 Software/Language Skill                   Sas      22.78
## 17 Software/Language Skill                Hadoop      20.98
## 18           General Skill           Programming      19.77
## 19           General Skill              Research      19.70
## 20           General Skill  Software engineering      19.51

# join the job posting and user profile data sets
ds_overlap <- inner_join(ds_user_profiles_top
                         ,jp_avg_across_sites
                         ,by = c("Skill", "Skill_Type"))
ds_overlap <- ds_overlap %>% 
  mutate(Avg_Pct_Postings = round(Avg_Pct_Postings,2)) %>%
  select(Skill = Skill
         , Skill_Type = Skill_Type
         , DS_Profile_Pct = Percentage
         , DS_Posting_Pct = Avg_Pct_Postings)

ds_overlap

##              Skill              Skill_Type DS_Profile_Pct DS_Posting_Pct
## 1         Analysis           General Skill          55.69          71.52
## 2                R Software/Language Skill          48.01          59.74
## 3           Python Software/Language Skill          46.31          72.84
## 4 Machine learning           General Skill          45.32          65.62
## 5       Statistics           General Skill          41.52          59.84
## 6              Sql Software/Language Skill          39.00          49.76

Graphic Exploration

Scatter plot of Data Scientist User Profile Skills vs Data Science Job Posting Skills

scatter <- ggplot(ds_overlap, aes(x= DS_Profile_Pct, y = DS_Posting_Pct)) 
scatter <- scatter + geom_point(aes(color = Skill))
scatter <- scatter + xlim(0, 100) + ylim(0, 100)
scatter

Bar graph of the top 20 skills listed on Data Science Profiles

bar <- ggplot(data = ds_user_profiles
              , aes(x = reorder(Skill, -Percentage)
                    ,y = Percentage
                    ,fill = Skill_Type))
bar <- bar + xlab("Skill")
bar <- bar + geom_bar(stat="identity")
bar <- bar + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
bar

Bar graph of top 20 skills from job postings (averaged across sites)

jp_avg_across_sites <- jp_avg_across_sites %>% 
  top_n(20, Avg_Pct_Postings)

bar <- ggplot(data = jp_avg_across_sites
              , aes(x = reorder(Skill, -Avg_Pct_Postings)
                    ,y = Avg_Pct_Postings
                    ,fill = Skill_Type))
bar <- bar + xlab("Skill")
bar <- bar + geom_bar(stat="identity")
bar <- bar + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
bar

Conclusions

While the exact order of “top” skills varies slightly by source, the following six skills were consistently identified by data scientists’ and data scientist job postings across multiple sites.

Data Analysis
R
Python
Machine Learining
Statistics
SQL

These six skills show that “top” skills include both general technical skills (analysis, machine learning, and Statistics) and software/language skills (R, Python, SQL). This is more clearly seen in the two bar graph displays in which the color for each skill type is roughly equally present.

Other skills such as computer science or communication were mentioned by some sources but not consistently across sources. While it may not have consistently placed as a top skill in our selected data sets, communication skills were found to be invaluable while working on this project. Learning to work together to plan and execute this project across many time zones has been an interesting challenge and we have all learned invaluable lessons.