3/11/2020

Soft Skills

# Soft-skill keywords, flattened from the CSV into a plain character vector.
keywords <- unname(unlist(read.csv(
  "https://raw.githubusercontent.com/chilleundso/DATA607/master/Project3/softskills.csv",
  stringsAsFactors = FALSE
)))

# Initiate one new column per keyword and record the positions of those
# columns (the trailing length(keywords) columns at this point).
# NOTE: the positions go stale once Description is dropped further down, so
# the keyword columns are selected by name from there on.
df[keywords] <- NA
keywordColIndex <- seq(length(df) - length(keywords) + 1, length(df), 1)

# Flag whether each description contains each keyword: occurs (1) or does
# not occur (0). Matching is literal and case-insensitive; multiple
# occurrences are coerced to 1 and a missing description yields NA.
# str_detect() is vectorised over the descriptions, so every keyword column
# is filled in a single call rather than one data-frame row at a time.
for (keyword in keywords) {
  df[[keyword]] <- as.integer(
    str_detect(df$Description, fixed(keyword, ignore_case = TRUE))
  )
}

# Get rid of the descriptions
df$Description <- NULL

# Gather soft skills into long form: one row per (posting id, skill).
# The id column X and the keyword columns are picked by name instead of the
# previous hard-coded position (44), which silently selected the wrong
# columns whenever the number of columns in df changed.
# gather() is kept (rather than pivot_longer()) so the row order of df2 is
# unchanged.
df2 <- df[, c("X", keywords)] %>%
  gather(skill, flag, -X) %>%
  filter(flag == 1) %>%
  select(id = X, skill) %>%
  mutate(skill_type = "soft")

Hard Skills

# Build a long data frame with one row per (posting id, hard skill).
# Each entry of df_raw$Skill is scanned for single-quoted items ('...').
# str_extract_all() is vectorised, so all postings are processed in one call.
# This replaces a row loop that only initialised `sdf` when the very first
# posting had skills (and failed otherwise) and that grew `sdf` with
# bind_rows() on every iteration.
skill_matches <- str_extract_all(df_raw$Skill, "'.+?'")

# id is the zero-based row position of the posting in df_raw
# (presumably matching the X column used as the soft-skill id; confirm).
sdf <- data.frame(
  id = rep(seq_along(skill_matches) - 1, lengths(skill_matches)),
  skill = as.character(unlist(skill_matches)),
  stringsAsFactors = FALSE
)

# Remove quotes and trim whitespace
sdf$skill <- str_replace_all(sdf$skill, "'", "")
sdf$skill <- str_replace_all(sdf$skill, '"', "")
sdf$skill <- str_trim(sdf$skill)
# rep() keeps this assignment valid even when no skills were extracted.
sdf$skill_type <- rep("hard", nrow(sdf))

# Combine soft (df2) and hard (sdf) skills into one long table
skills <- bind_rows(df2, sdf)

Job Title

df <- df_raw
job.indeed.df <- df %>%
  select(Id = X, Job_Title)

# Pattern building: substrings that mark a title as belonging to a role or
# seniority level. Each entry is treated as literal text (see below).
pattern.analyst <- c('analyst','statistician','analysis','analytics')
pattern.engineer <- c('engineer', 'engg', 'technician','technologist','designer','architect')
pattern.scientist <- c('scientist','doctor','dr.')
pattern.junior <- c('junior','jr', 'entry','internship','jr.')
pattern.senior <- c('senior', 'sr','experienced','sr.')

# Intermediate data frame for titles: the id and title of every posting.
# The 0/1 indicator columns are appended below.
final.data.df <- data.frame(
  Id = job.indeed.df$Id,
  Job_Title = as.character(job.indeed.df$Job_Title),
  stringsAsFactors = FALSE
)

# Flag each title: 1 if it contains any of the role's patterns
# (case-insensitive), 0 otherwise. A missing title yields 0.
title.patterns <- list(
  analyst = pattern.analyst,
  engineer = pattern.engineer,
  scientist = pattern.scientist,
  junior = pattern.junior,
  senior = pattern.senior
)
for (role in names(title.patterns)) {
  # A bare "." is a regex wildcard, so 'dr.' used to match any "dr" followed
  # by another character ("Android", "Drug", ...). Wrapping the dot in a
  # character class makes it match a literal period only.
  role.regex <- paste(
    gsub(".", "[.]", title.patterns[[role]], fixed = TRUE),
    collapse = "|"
  )
  # grepl() is vectorised over all titles, so no per-row loop is needed and
  # an empty data frame is handled without error.
  final.data.df[[role]] <- as.numeric(
    grepl(role.regex, final.data.df$Job_Title, ignore.case = TRUE)
  )
}

Hard Skills For Data Roles

  • Python is ranked #1 & R is ranked #4
  • Skills vary after the top 2

Hard Skills for Data Scientists

  • More than half of job posts mention Python, machine learning, or R

Hard Skills for Data Analysts

  • SQL seems to be the most desired skill by far

Hard Skills for Data Engineers

  • Python and SQL are top skills
  • R isn’t in the top 10

Hard Skills for Junior Data Scientists

  • Top skills for junior data scientists are similar to data science in general
  • Tableau at #4 shows the value of visualization skills

Hard Skills for Senior Data Scientists

  • Top skills for senior data scientists are Python, Machine Learning, R, and SQL

Job Demand for Skill Level

  • High demand for senior roles

Hard Skills by Industry

  • Consulting and Business Services - optimizing businesses
  • Internet and Software - leverage the data they have

Word Cloud

# Word cloud of the soft and hard skills that high-rating (5-star) companies
# are looking for in engineer vs analyst/scientist positions.
# df.stars.5 has one row per (5-star job posting, skill) after the join.
df.stars.5 <- df_all %>%
  filter(No_of_Stars == 5) %>%
  select(job_id, Queried_Salary, Company_Industry, analyst, engineer, scientist, junior, senior) %>%
  inner_join(skills, by = c('job_id' = 'id'))

# Number of 5-star engineer postings mentioning each skill
df.stars.5.engg <- df.stars.5 %>%
  filter(engineer == 1) %>%
  group_by(skill) %>%
  summarize(total_count = n())

# Number of 5-star analyst or scientist postings mentioning each skill
df.stars.5.ana.sci <- df.stars.5 %>%
  filter(analyst + scientist > 0) %>%
  group_by(skill) %>%
  summarize(total_count = n())


# Fixed seed so the word layout is reproducible
set.seed(1234)
# scale narrows the font-size range from the default c(4, 0.5): with the
# default, many skills "could not be fit on page" and were silently left
# out of the plot. Tune further if warnings persist on a smaller device.
wordcloud(words = df.stars.5.engg$skill, freq = df.stars.5.engg$total_count, min.freq = 1,
          scale = c(3, 0.3),
          max.words=100, random.order=FALSE, rot.per=0.10,
          colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Design Experience could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Microsoft SQL Server could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Responsible could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Shell Scripting could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Business Intelligence could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Google Cloud Platform could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Load Balancing could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Natural Language Processing could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Oral communication skills could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : PostgreSQL could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Power BI could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Presentation could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Problem solving could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Responsibility could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Scheduling could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : TensorFlow could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Teradata could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : TS/SCI Clearance could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Work-Life Balance could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : Writing could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = df.stars.5.engg$skill, freq = df.stars.
## 5.engg$total_count, : ZooKeeper could not be fit on page. It will not be
## plotted.

# Word cloud of skills requested in 5-star analyst/scientist postings.
# scale narrows the font-size range from the default c(4, 0.5): with the
# default, many skills "could not be fit on page" and were silently left
# out of the plot. Tune further if warnings persist on a smaller device.
wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.5.ana.sci$total_count, min.freq = 1,
          scale = c(3, 0.3),
          max.words=100, random.order=FALSE, rot.per=0.10,
          colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Entrepreneurial could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Interpersonal skills could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Natural Language Processing could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Organizational skills could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Personal Skills could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Presentation Skills could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Problem solving could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Business Analysis could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Data-driven decision-making could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Effective communicator could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Integrated Library System could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Leadership Experience could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Project management could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Shell Scripting could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Team-oriented could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Team Building could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Time management could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Time Management could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(words = df.stars.5.ana.sci$skill, freq = df.stars.
## 5.ana.sci$total_count, : Written communication skills could not be fit on page.
## It will not be plotted.

# Salary range by title for high-rating (5-star) companies.
# df.stars.5 holds one row per (job, skill) because of the join with
# `skills`, so it is reduced to one row per job_id first; otherwise every
# count below would be the number of skill mentions rather than the number
# of roles. Postings without any listed skill are not in df.stars.5 and so
# are not counted here either.
df.stars.5.Salary <- df.stars.5 %>%
  distinct(job_id, .keep_all = TRUE) %>%
  group_by(Queried_Salary) %>%
  summarize(total_count = n(),
            ana_total = sum(analyst),
            eng_totals = sum(engineer),
            sci_total = sum(scientist),
            jr_total = sum(junior),
            sr_total = sum(senior))

kable(df.stars.5.Salary)
Queried_Salary total_count ana_total eng_totals sci_total jr_total sr_total
<80000 37 37 0 0 0 0
>160000 47 17 0 30 0 0
100000-119999 111 33 54 24 0 0
120000-139999 136 55 34 77 0 9
140000-159999 59 17 28 14 0 8
80000-99999 61 25 36 0 0 13
# ==> The table above shows no junior-level roles and no engineer roles paying more than
#     160,000 USD
# State-wise job postings for data science jobs:
# count postings per Location, then draw a US choropleth shaded by count.
df.states <- df_all %>%
  select(job_id, Location) %>%
  group_by(Location) %>%
  summarize(total_count = n())

# State outline polygons (one row per boundary vertex)
all_states <- map_data("state")

# stateFromLower() is defined elsewhere; presumably it maps each Location
# code to the lower-case state name used by map_data("state") -- confirm.
df.states$region <- stateFromLower(df.states$Location)
Total <- merge(all_states, df.states, by = "region")

Total <- Total[Total$region != "REMOTE", ]

# merge() does not preserve the row order of its inputs; restore the vertex
# sequence so geom_polygon() draws each state outline correctly.
Total <- Total[order(Total$order), ]

# fill refers to the column by name: using Total$total_count inside aes()
# bypasses ggplot2's data masking and is discouraged.
p <- ggplot() +
  geom_polygon(
    data = Total,
    aes(x = long, y = lat, group = group, fill = total_count),
    colour = "white"
  ) +
  scale_fill_continuous(low = "#56B4E9", high = "#0072B2", guide = "colorbar")

p