So far, we have analyzed sample data sets from LinkedIn and Twitter to understand the trends in ‘Data Science’. Now, let’s take a large dataset containing all the jobs related to ‘Data Science’ to validate our hypothesis. For this purpose, we need a company that is among the largest employers for data science roles and is also reputable, with positive reviews in the job market.
One of the largest job portals, glassdoor.com, shows ‘Amazon’ as the best candidate for this purpose.
A search on Amazon.jobs for the keyword “Data Science” in the location “United States” returned 6,630 results. This data was scraped from the website with a Python script, with requests routed through the ‘Crawlera’ proxy service.
from time import time
from time import sleep
from datetime import datetime
from requests import get
from random import randint
from random import choice
from IPython.core.display import clear_output
from warnings import warn
import json
import csv
''' Amazon Job Search URL:
https://www.amazon.jobs/en/search?offset=0&result_limit=10&sort=relevant&job_type=Full-Time&
cities[]=Seattle%2C%20Washington%2C%20USA&cities[]=San%20Francisco%2C%20California%2C%20USA&
cities[]=Sunnyvale%2C%20California%2C%20USA&cities[]=Bellevue%2C%20Washington%2C%20USA&
cities[]=East%20Palo%20Alto%2C%20California%2C%20USA&cities[]=Santa%20Monica%2C%20California%2C%20USA&
category_type=Corporate&distanceType=Mi&radius=24km&latitude=&longitude=&loc_group_id=&loc_query=USA&base_query=&
city=&country=USA&region=&county=&query_options=&
'''
def get_all_jobs(pages):
    requests = 0
    start_time = time()
    total_runtime = datetime.now()
    # Pool of user agents; choice() picks one at random for each request.
    user_agent_list = [
        # Chrome
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        # Firefox
        'Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        'Mozilla/5.0 (X11; U; Linux Core i7-4980HQ; de; rv:32.0; compatible; JobboerseBot; http://www.jobboerse.com/bot.htm) Gecko/20100101 Firefox/38.0',
        'Mozilla/5.0 (Windows NT 5.1; rv:36.0) Gecko/20100101 Firefox/36.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
        'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
    ]
    for page in pages:
        try:
            user_agent = choice(user_agent_list)
            headers = {'User-Agent': user_agent}
            response = get('https://www.amazon.jobs/en/search.json?base_query=&city=&country=USA&county=&'
                           'facets%5B%5D=location&facets%5B%5D=business_category&facets%5B%5D=category&'
                           'facets%5B%5D=schedule_type_id&facets%5B%5D=employee_class&facets%5B%5D=normalized_location'
                           '&facets%5B%5D=job_function_id&job_function_id%5B%5D=job_function_corporate_80rdb4&'
                           'latitude=&loc_group_id=&loc_query=USA&longitude=&'
                           'normalized_location%5B%5D=Seattle%2C+Washington%2C+USA&'
                           'normalized_location%5B%5D=San+Francisco'
                           '%2C+California%2C+USA&normalized_location%5B%5D=Sunnyvale%2C+California%2C+USA&'
                           'normalized_location%5B%5D=Bellevue%2C+Washington%2C+USA&'
                           'normalized_location%5B%5D=East+Palo+Alto%2C+California%2C+USA&'
                           'normalized_location%5B%5D=Santa+Monica%2C+California%2C+USA&offset={}&query_options=&'
                           'radius=24km&region=&result_limit=10&schedule_type_id%5B%5D=Full-Time&'
                           'sort=relevant'.format(page),
                           headers=headers,
                           # You will need your own Crawlera account; put its API key below.
                           proxies={
                               "http": "http://<YOUR_CRAWLERA_API_KEY>:@proxy.crawlera.com:8010/"
                           })
            # Monitor the frequency of requests
            requests += 1
            # Pause the loop for 8 - 15 seconds and track the elapsed time
            sleep(randint(8, 15))
            current_time = time()
            elapsed_time = current_time - start_time
            print("Amazon Request: {}; Frequency: {} requests/s; Total Run Time: {}".format(
                requests, requests / elapsed_time, datetime.now() - total_runtime))
            clear_output(wait=True)
            # Throw a warning for non-200 status codes
            if response.status_code != 200:
                warn("Request: {}; Status code: {}".format(requests, response.status_code))
            # Break the loop if the number of requests exceeds the expected total
            if requests > 999:
                warn("Number of requests was greater than expected.")
                break
            yield from get_job_infos(response)
        except AttributeError as e:
            print(e)
            continue
def get_job_infos(response):
    # Parse the JSON payload and yield one tuple per job posting.
    amazon_jobs = json.loads(response.text)
    for website in amazon_jobs['jobs']:
        site = website['company_name']
        title = website['title']
        location = website['normalized_location']
        job_link = 'https://www.amazon.jobs' + website['job_path']
        basic_qualifications = website['basic_qualifications']
        business_category = website['business_category']
        description = website['description']
        description_short = website['description_short']
        job_category = website['job_category']
        job_family = website['job_family']
        posted_date = website['posted_date']
        preferred_qualifications = website['preferred_qualifications']
        yield (site, title, location, job_link, basic_qualifications, business_category,
               description, description_short, job_category, job_family, posted_date,
               preferred_qualifications)
def main():
    # Offsets start at 0 and increase by 10 per page (result_limit=10), covering all 6,630 results.
    pages = [str(i) for i in range(0, 6630, 10)]
    # Write the results to a csv file
    with open('amazon_jobs.csv', "w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Website", "Title", "Location", "Job URL", "Basic Qualifications",
                         "Business Category", "Description", "Description_short", "Job category",
                         "Job family", "Posted date", "Preferred Qualifications"])
        writer.writerows(get_all_jobs(pages))

if __name__ == "__main__":
    main()

The dataset is then loaded into an AWS database, making sure that no passwords are stored in the code. All the datasets used in this project were loaded with the same approach.
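The login details live in a small local csv file that is read at run time, so nothing sensitive ever appears in the scripts. Below is a minimal sketch of the assumed layout of that file; the column names match what the loading code reads, while the values shown are placeholders:
# Hypothetical AWS_login.csv, kept outside the project repository
creds <- data.frame(
  login_name     = "my_user",
  login_password = "my_secret",
  login_schema   = "my_schema",
  login_host     = "mydb.example.us-east-1.rds.amazonaws.com"
)
write.csv(creds, "C:\\password-files-for-r\\AWS_login.csv", row.names = FALSE)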
library(DBI)     # dbConnect(), dbWriteTable()
library(readr)   # read_csv()

# read login credentials from a local file instead of hard-coding them
password_file <- "C:\\password-files-for-r\\AWS_login.csv"
passwords <- read.csv(password_file)
df_login <- passwords
vardb_user <- df_login$login_name
vardb_password <- df_login$login_password
vardb_schema <- df_login$login_schema
vardb_host <- df_login$login_host
#cat("Host: ", vardb_host, " Schema=", vardb_schema, " username=", vardb_user, " password=", vardb_password)
mydb = dbConnect(RMySQL::MySQL(), user=vardb_user, password=vardb_password, port=3306, dbname=vardb_schema, host=vardb_host)
# load amazon_table.csv into R
setwd("LOCATION OF .CSV ON YOUR COMPUTER")
amazon_table <- read_csv("amazon_table.csv")
# correct file encoding for upload to MySQL
write.table(amazon_table,file="tmp.txt", fileEncoding ="utf8")
mydata_utf8 <- read.table(file="tmp.txt",encoding="utf8")
#upload table to MySQL
written_df <- dbWriteTable(mydb, "DESIRED TABLE NAME",
                           mydata_utf8, append=TRUE, row.names=FALSE)

library(tidyverse)
library(tidytext)
library(quanteda)
library(corpus)
library(scales)
knitr::opts_chunk$set(eval = TRUE, results = FALSE)
amzn_jobs_raw <- read.csv("amazon_jobs.csv")
#amzn_jobs_raw <- dbReadTable(mydb, "amazon_table")

Select only the columns required for the analysis. Both ‘Basic Qualifications’ and ‘Preferred Qualifications’ appear to be key columns (apart from job category), so let’s select them both and subset the data to the year 2020 alone.
The data reveals that ‘Software Development’ offers the most data science jobs at Amazon! Let’s look into the table a bit. The top three job categories, Software Development (36%), Product Management (technical + non-technical, 11% + 11%) and Finance & Accounting (5%), together contribute 63% of the data science jobs.
amzn_jobs_latest <- amzn_jobs_raw %>% select(Job.URL, Basic.Qualifications, Job.category, Posted.date, Preferred.Qualifications) %>%
filter(str_sub(Posted.date, -4, -1) == "2020")
jobs_cnt <- amzn_jobs_latest %>% count(Job.category,sort = TRUE)
jobs_cnt = mutate(jobs_cnt, jobs_pct = round((n / sum(n))*100))
knitr::kable(head(jobs_cnt, 10))

| Job.category | n | jobs_pct |
|---|---|---|
| Software Development | 2177 | 36 |
| Project/Program/Product Management–Technical | 669 | 11 |
| Project/Program/Product Management–Non-Tech | 649 | 11 |
| Finance & Accounting | 274 | 5 |
| Business Intelligence | 233 | 4 |
| Systems, Quality, & Security Engineering | 224 | 4 |
| Solutions Architect | 208 | 3 |
| Operations, IT, & Support Engineering | 147 | 2 |
| Sales, Advertising, & Account Management | 141 | 2 |
| Design | 127 | 2 |
head(jobs_cnt,5) %>% arrange(jobs_pct) %>% mutate(Job.category = fct_reorder(Job.category, jobs_pct)) %>%
ggplot( aes(y=Job.category, x=jobs_pct)) +
geom_bar( stat="identity",color="black")+
geom_text(aes(label=paste0(jobs_pct,"%")),color = "orange",hjust=1,vjust=0.3)+
xlab("Percentage of Jobs")+ ylab("")+
theme( axis.line = element_line(colour = "black",
size = 1, linetype = "solid"))+
ggtitle("'Data Science' Jobs by Job Category") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

On closer look, ‘Preferred Qualifications’ seems to carry more information about the job than ‘Basic Qualifications’. Next, we remove the legal and PR boilerplate from that column before running the text analysis.
amzn_jobs_latest$Preferred.Qualifications <- sub('Amazon is committed.*$', '', amzn_jobs_latest$Preferred.Qualifications)
amzn_jobs_latest$Preferred.Qualifications <- sub('race.*$', '', amzn_jobs_latest$Preferred.Qualifications)
amzn_jobs_latest$Preferred.Qualifications <- sub('equal opportunity.*$', '', amzn_jobs_latest$Preferred.Qualifications)

ngram_list <- c('br', 'race', 'national origin', 'gender', 'identity', 'sexual', 'orientation', 'protected', 'veteran', 'status', 'disability', 'age', 'protected status', 'â', 'equal', 'opportunity', 'amazon', 'employer', 'https', 'en', 'legally', 'discriminate', 'workplace', 'national', 'amazonâ')
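To make the clean-up concrete, here is a minimal sketch of the effect of the first sub() call on a made-up qualification string:
sample_qual <- "5+ years of SQL experience. Amazon is committed to a diverse and inclusive workplace."
sub('Amazon is committed.*$', '', sample_qual)
# [1] "5+ years of SQL experience. "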
Pref_freq <- amzn_jobs_latest %>%
unnest_tokens(word, `Preferred.Qualifications`) %>%
anti_join(stop_words) %>%
filter(
!str_detect(word, pattern = "[[:digit:]]"), # removes any words with numeric digits
!str_detect(word, pattern = "[[:punct:]]"), # removes any remaining punctuations
!str_detect(word, pattern = "(.)\\1{2,}"), # removes any words with 3 or more repeated letters
!str_detect(word, pattern = "\\b(.)\\b"), # removes any remaining single letter words
!word %in% ngram_list # removes the custom legal/PR stopwords
) %>%
count(word) %>%
filter(n >= 1000) %>% # filter for words used 1000 or more times
spread(word, n) %>% # convert to wide format
map_df(replace_na, 0) # replace NAs with 0
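As a quick illustration of what the regex filters above catch, here is a minimal sketch on a few made-up tokens:
toys <- c("aws", "2020", "zzzz", "r", "c++")
toys[!str_detect(toys, "[[:digit:]]") &
     !str_detect(toys, "[[:punct:]]") &
     !str_detect(toys, "(.)\\1{2,}") &
     !str_detect(toys, "\\b(.)\\b")]
# [1] "aws"   (digits, punctuation, 3+ repeated letters and single letters are dropped)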
Pref_freq_tall <- as.data.frame(t(Pref_freq))
Pref_freq_tall <- tibble::rownames_to_column(Pref_freq_tall, "Keywords")
Pref_freq_tall <- Pref_freq_tall[order(-Pref_freq_tall$V1),]
Pref_freq_tall = mutate(Pref_freq_tall, one_pct = round((V1 / sum(V1))*100))
#knitr:: kable(head(Pref_freq_tall,10))
head(Pref_freq_tall,10) %>% arrange(one_pct) %>% mutate(Keywords = fct_reorder(Keywords, one_pct)) %>%
ggplot(aes( x=Keywords,y=one_pct)) +
geom_bar( stat="identity",color="black")+
geom_text(aes(label=paste0(one_pct,"%")),color = "orange",hjust=1,vjust=0.3)+
coord_flip()+
xlab("")+ ylab("Percentage of Frequency")+
theme( axis.line = element_line(colour = "black",
size = 1, linetype = "solid"))+
ggtitle("Percentage of One Word Frequencies") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

Pref_freq_bi <- amzn_jobs_latest %>%
unnest_tokens(bigram, `Preferred.Qualifications`,token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(
!word1 %in% stop_words$word, # remove stopwords from both words in bi-gram
!word2 %in% stop_words$word,
!str_detect(word1, pattern = "[[:digit:]]"), # removes any words with numeric digits
!str_detect(word2, pattern = "[[:digit:]]"),
!str_detect(word1, pattern = "[[:punct:]]"), # removes any remaining punctuations
!str_detect(word2, pattern = "[[:punct:]]"),
!str_detect(word1, pattern = "(.)\\1{2,}"), # removes any words with 3 or more repeated letters
!str_detect(word2, pattern = "(.)\\1{2,}"),
!str_detect(word1, pattern = "\\b(.)\\b"), # removes any remaining single letter words
!str_detect(word2, pattern = "\\b(.)\\b"),
!word1 %in% ngram_list, # remove stopwords from both words in bi-gram
!word2 %in% ngram_list
) %>%
unite("bigram", c(word1, word2), sep = " ") %>%
count(bigram) %>%
filter(n >= 800) %>% # filter for bi-grams used 800 or more times
spread(bigram, n) %>% # convert to wide format
map_df(replace_na, 0) # replace NAs with 0
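For intuition, here is a minimal sketch of what unnest_tokens() emits with token = "ngrams", n = 2; the input sentence is made up:
tibble(text = "machine learning experience strongly preferred") %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# bigram: "machine learning", "learning experience",
#         "experience strongly", "strongly preferred"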
Pref_freq_bi_tall <- as.data.frame(t(Pref_freq_bi))
Pref_freq_bi_tall <- tibble::rownames_to_column(Pref_freq_bi_tall, "Keywords")
Pref_freq_bi_tall <- Pref_freq_bi_tall[order(-Pref_freq_bi_tall$V1),]
#knitr:: kable(head(Pref_freq_bi_tall,10))
Pref_freq_bi_tall = mutate(Pref_freq_bi_tall, one_pct = round((V1 / sum(V1))*100))
head(Pref_freq_bi_tall,10) %>% arrange(one_pct) %>% mutate(Keywords = fct_reorder(Keywords, one_pct)) %>%
ggplot(aes( x=Keywords,y=one_pct)) +
geom_bar( stat="identity",color="black")+
geom_text(aes(label=paste0(one_pct,"%")),color = "orange",hjust=1,vjust=0.3)+
coord_flip()+
xlab("")+ ylab("Percentage of Frequency")+
theme( axis.line = element_line(colour = "black",
size = 1, linetype = "solid"))+
ggtitle("Percentage of Two Words Frequencies") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

Pref_freq_tri <- amzn_jobs_latest %>%
unnest_tokens(trigram, `Preferred.Qualifications`,token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2","word3"), sep = " ") %>%
filter(
!word1 %in% stop_words$word, # remove stopwords from all three words in tri-gram
!word2 %in% stop_words$word,
!word3 %in% stop_words$word,
!str_detect(word1, pattern = "[[:digit:]]"), # removes any words with numeric digits
!str_detect(word2, pattern = "[[:digit:]]"),
!str_detect(word3, pattern = "[[:digit:]]"),
!str_detect(word1, pattern = "[[:punct:]]"), # removes any remaining punctuations
!str_detect(word2, pattern = "[[:punct:]]"),
!str_detect(word3, pattern = "[[:punct:]]"),
!str_detect(word1, pattern = "(.)\\1{2,}"), # removes any words with 3 or more repeated letters
!str_detect(word2, pattern = "(.)\\1{2,}"),
!str_detect(word3, pattern = "(.)\\1{2,}"),
!str_detect(word1, pattern = "\\b(.)\\b"), # removes any remaining single letter words
!str_detect(word2, pattern = "\\b(.)\\b"),
!str_detect(word3, pattern = "\\b(.)\\b"),
!word1 %in% ngram_list, # remove custom stopwords from all three words in tri-gram
!word2 %in% ngram_list,
!word3 %in% ngram_list
) %>%
unite("trigram", c(word1, word2, word3), sep = " ") %>%
count(trigram) %>%
filter(n >= 500) %>% # filter for tri-grams used 500 or more times
spread(trigram, n) %>% # convert to wide format
map_df(replace_na, 0) # replace NAs with 0
Pref_freq_tri_tall <- as.data.frame(t(Pref_freq_tri))
Pref_freq_tri_tall <- tibble::rownames_to_column(Pref_freq_tri_tall, "Keywords")
Pref_freq_tri_tall <- Pref_freq_tri_tall[order(-Pref_freq_tri_tall$V1),]
#knitr:: kable(head(Pref_freq_tri_tall,10))
Pref_freq_tri_tall = mutate(Pref_freq_tri_tall, one_pct = round((V1 / sum(V1))*100))
head(Pref_freq_tri_tall,10) %>% arrange(one_pct) %>% mutate(Keywords = fct_reorder(Keywords, one_pct)) %>%
ggplot(aes( x=Keywords,y=one_pct)) +
geom_bar( stat="identity",color="black")+
geom_text(aes(label=paste0(one_pct,"%")),color = "orange",hjust=1,vjust=0.3)+
coord_flip()+
xlab("")+ ylab("Percentage of Frequency")+
theme( axis.line = element_line(colour = "black",
size = 1, linetype = "solid"))+
ggtitle("Percentage of Three Words Frequencies") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

soft_skills <- dictionary(list(
  Professional_Experience = c('demonstrated ability', 'track record', 'proven ability', 'experience building', 'professional'),
  Leadership = c('leadership principles', 'affirmative action', 'fast paced', 'lead', 'leadership', 'guide'),
  Communication = c('written communication', 'communication skills', 'verbal'),
  Management = c('management', 'mba'),
  Business_Knowledge = c('Business'),
  Commitment = c('committed', 'commitment'),
  Deep_dive = c('deep dive', 'investigate', 'dive'),
  Learning = c('learning', 'learner', 'curious', 'inquisitive'),
  Creativity = c('creative', 'design', 'visual'),
  Problem_Solver = c('solver', 'puzzle', 'problem', 'framework', 'complex', 'thinker'),
  Confidence = c('confidence', 'believe')
))
technical_skills <- dictionary(list(
  General_Coding = c('code', 'coding standards', 'source control', 'programming'),
  Software_Development = c('software development', 'software engineering'),
  Subject_Matter_Expert = c('subject matter'),
  Computer_Science = c('computer science'),
  Python = c('Python', 'python'),
  Database = c('database', 'Teradata'),
  SQL = 'SQL', R = 'R',
  Statistics = c('statistics', 'stats'),
  C_Family = c('C', 'C++', 'C#'),
  Java = 'Java', Perl = 'Perl', Scala = 'Scala',
  Machine_Learning = c('machine learning', 'deep learning', 'Tensorflow'),
  AI = c('ai', 'artificial intelligence', 'iot', 'AI'),
  Julia = 'Julia', Matlab = 'Matlab',
  Big_Data = c('big data', 'Apache Spark', 'Azure', 'Apache Hive', 'Hadoop'),
  Other_Statistical_tools = c('SAS', 'minitab', 'SPSS', 'prism'),
  Tableau = c('Tableau', 'power bi'),
  Business_Intelligence = c('BI', 'bi', 'business intelligence')
))
all_skills <- dictionary(list(
  Professional_Experience = c('demonstrated ability', 'track record', 'proven ability', 'experience building', 'professional'),
  Leadership = c('leadership principles', 'affirmative action', 'fast paced', 'lead', 'leadership', 'guide'),
  Communication = c('written communication', 'communication skills', 'verbal'), Management = c('management', 'mba'),
  Business_Knowledge = c('Business'), Commitment = c('committed', 'commitment'), Deep_dive = c('deep dive', 'investigate', 'dive'),
  Learning = c('learning', 'learner', 'curious', 'inquisitive'), Creativity = c('creative', 'design', 'visual'),
  Problem_Solver = c('solver', 'puzzle', 'problem', 'framework', 'complex', 'thinker'), Confidence = c('confidence', 'believe'),
  General_Coding = c('code', 'coding standards', 'source control', 'programming'),
  Software_Development = c('software development', 'software engineering'), Subject_Matter_Expert = c('subject matter'),
  Computer_Science = c('computer science'), Python = c('Python', 'python'), Database = c('database', 'Teradata'),
  SQL = 'SQL', R = 'R', Statistics = c('statistics', 'Minitab', 'stats'), C_family = c('C', 'C++', 'C#'),
  Java = 'Java', Perl = 'Perl', Scala = 'Scala',
  Machine_Learning = c('machine learning', 'deep learning', 'Tensorflow'), AI = c('ai', 'artificial intelligence', 'iot', 'AI'),
  Julia = 'Julia', Matlab = 'Matlab', Big_data = c('big data', 'Apache Spark', 'Azure', 'Apache Hive', 'Hadoop'),
  Other_Statistical_tools = c('SAS', 'minitab', 'SPSS', 'prism'), Tableau = c('Tableau', 'power bi'),
  Business_Intelligence = c('BI', 'bi', 'business intelligence')
))
R_python_Matlab_other <- dictionary(list(
  R = 'R', Python = c('Python', 'python'), Matlab = 'matlab',
  Other_Statistical_tools = c('SAS', 'minitab', 'SPSS', 'prism')
))

data_dict <- dfm(amzn_jobs_latest$Preferred.Qualifications, dictionary=soft_skills)
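Before turning to the real results, here is a minimal sketch of how such a dictionary lookup works; the demo_dict object and the two example strings are made up:
demo_dict <- dictionary(list(
  ML  = c('machine learning', 'deep learning'),
  SQL = 'SQL'
))
demo_dfm <- dfm(c("deep learning and SQL required", "SQL is a plus"),
                dictionary = demo_dict)
topfeatures(demo_dfm)
# sql  ml
#   2   1   (total matches per dictionary category; exact output may vary by quanteda version)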
top_soft <- as.data.frame(topfeatures(data_dict))
top_soft <- tibble::rownames_to_column(top_soft, "Keywords")
colnames(top_soft) <- c("Keywords","freq")
top_soft = mutate(top_soft, skills_pct = round((freq / sum(freq))*100))
top_soft %>% arrange(skills_pct) %>% mutate(Keywords = fct_reorder(Keywords, skills_pct)) %>%
ggplot(aes( x=Keywords,y=skills_pct)) +
geom_bar( stat="identity",color="black")+
geom_text(aes(label=paste0(skills_pct,"%")),color = "orange",hjust=1,vjust=0.3)+
coord_flip()+
xlab("")+ ylab("")+
theme( axis.line = element_line(colour = "black",
size = 1, linetype = "solid"))+
ggtitle("Top Soft Skills for 'Data Science' at Amazon") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

data_dict2 <- dfm(amzn_jobs_latest$Preferred.Qualifications, dictionary=technical_skills)
top_tech <- as.data.frame(topfeatures(data_dict2))
top_tech <- tibble::rownames_to_column(top_tech, "Keywords")
colnames(top_tech) <- c("Keywords","freq")
top_tech = mutate(top_tech, skills_pct = round((freq / sum(freq))*100))
top_tech %>% arrange(skills_pct) %>% mutate(Keywords = fct_reorder(Keywords, skills_pct)) %>%
ggplot(aes( x=Keywords,y=skills_pct)) +
geom_bar( stat="identity",color="black")+
geom_text(aes(label=paste0(skills_pct,"%")),color = "orange",hjust=1,vjust=0.3)+
coord_flip()+
xlab("")+ ylab("")+
theme( axis.line = element_line(colour = "black",
size = 1, linetype = "solid"))+
ggtitle("Top Technical Skills for 'Data Science' at Amazon") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

data_dict4 <- dfm(amzn_jobs_latest$Preferred.Qualifications, dictionary=all_skills)
top_skill <- as.data.frame(topfeatures(data_dict4))
top_skill <- tibble::rownames_to_column(top_skill, "Keywords")
colnames(top_skill) <- c("Keywords","freq")
top_skill = mutate(top_skill, skills_pct = round((freq / sum(freq))*100))
top_skill %>% arrange(skills_pct) %>% mutate(Keywords = fct_reorder(Keywords, skills_pct)) %>%
ggplot(aes( x=Keywords,y=skills_pct)) +
geom_bar( stat="identity",color="black")+
geom_text(aes(label=paste0(skills_pct,"%")),color = "orange",hjust=1,vjust=0.3)+
coord_flip()+
xlab("")+ ylab("")+
theme( axis.line = element_line(colour = "black",
size = 1, linetype = "solid"))+
ggtitle("Top Technical Skills for 'Data Science' at Amazon") +
theme(plot.title = element_text(lineheight=.8, face="bold"))

data_dict3 <- dfm(amzn_jobs_latest$Preferred.Qualifications, dictionary=R_python_Matlab_other)
top_stats <- as.data.frame(topfeatures(data_dict3))
top_stats <- tibble::rownames_to_column(top_stats, "Keywords")
colnames(top_stats) <- c("Keywords","freq")
top_stats = mutate(top_stats, skills_pct = round((freq / sum(freq))*100))
top_stats %>% arrange(skills_pct) %>% mutate(Keywords = fct_reorder(Keywords, skills_pct)) %>%
ggplot(aes( x=Keywords,y=skills_pct)) +
geom_bar( stat="identity",color="black")+
geom_text(aes(label=paste0(skills_pct,"%")),color = "orange",hjust=1,vjust=0.3)+
coord_flip()+
xlab("")+ ylab("")+
theme( axis.line = element_line(colour = "black",
size = 1, linetype = "solid"))+
ggtitle("Top Statistical Tools for 'Data Science' at Amazon") +
theme(plot.title = element_text(lineheight=.8, face="bold"))