library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
Looking at the Glassdoor Data Science Job Postings scraped on 06/05/2020
# Load the three Glassdoor scrapes (files suffixed SF, TX and WA).
# The files are comma-separated, so read.csv() is used: read.csv2() keeps
# dec = "," even when sep is overridden, which prevents columns holding values
# such as "3.5" from being parsed as numbers.
# NOTE(review): assumes decimals in these files use "." -- confirm.
temp1 <- read.csv("G:/My Documents/Data_Job_SF.csv")
temp2 <- read.csv("G:/My Documents/Data_Job_TX.csv")
temp3 <- read.csv("G:/My Documents/Data_Job_WA.csv")

# Stack the three files and keep only postings whose title contains
# "Data Scientist". filter() already drops rows where the test is NA, so no
# explicit comparison with TRUE is needed.
all_data <- bind_rows(temp1, temp2, temp3) %>%
  filter(str_detect(Job_title, "Data Scientist"))

# Rows x columns of the combined, filtered data.
dim(all_data)
## [1] 608 12
Data fields
# List the columns of the combined data as a styled single-column table.
kable_styling(
  kbl(names(all_data))
)
x |
---|
Job_title |
Company |
State |
City |
Min_Salary |
Max_Salary |
Job_Desc |
Industry |
Rating |
Date_Posted |
Valid_until |
Job_Type |
Industry
# Number of postings per industry, largest first, as a styled table.
all_data %>%
  count(Industry, sort = TRUE) %>%
  kbl() %>%
  kable_styling()
Industry | n |
---|---|
Information Technology | 188 |
Business Services | 128 |
(blank) | 86 |
Aerospace & Defense | 56 |
Government | 22 |
Biotech & Pharmaceuticals | 18 |
Finance | 17 |
Retail | 17 |
Manufacturing | 13 |
Oil, Gas, Energy & Utilities | 13 |
Insurance | 12 |
Accounting & Legal | 9 |
Health Care | 8 |
Media | 8 |
Education | 3 |
Consumer Services | 2 |
Real Estate | 2 |
Transportation & Logistics | 2 |
Arts, Entertainment & Recreation | 1 |
Construction, Repair & Maintenance | 1 |
Non-Profit | 1 |
Telecommunications | 1 |
Descriptions
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
# Give every posting a running number within its industry ...
tidy_desc <- all_data %>%
  group_by(Industry) %>%
  mutate(linenumber = row_number()) %>%
  ungroup()

# ... then split each job description into one row per word and keep only
# the columns needed for the word counts below.
tidy_desc <- tidy_desc %>%
  unnest_tokens(output = word, input = Job_Desc) %>%
  select(Industry, linenumber, word)
# Stop-word list: three custom words (tagged with lexicon "custom") placed
# ahead of the rows of the `stop_words` table.
custom_stop_words <- tibble(
  word = c("data", "science", "experience"),
  lexicon = "custom"
) %>%
  bind_rows(stop_words)
# Word cloud for "Information Technology" postings: drop stop words, count
# the remaining words, and plot at most five of them (max.words = 5).
it_df <- filter(tidy_desc, Industry == "Information Technology")
it_df %>%
  anti_join(custom_stop_words) %>% # joins on the shared `word` column
  count(word) %>%
  {
    wordcloud(.$word, .$n, max.words = 5)
  }
## Joining, by = "word"
# Word cloud for "Business Services" postings: drop stop words, count the
# remaining words, and plot at most five of them (max.words = 5).
bs_df <- filter(tidy_desc, Industry == "Business Services")
bs_df %>%
  anti_join(custom_stop_words) %>% # joins on the shared `word` column
  count(word) %>%
  {
    wordcloud(.$word, .$n, max.words = 5)
  }
## Joining, by = "word"
# Word cloud for "Aerospace & Defense" postings: drop stop words, count the
# remaining words, and plot at most five of them (max.words = 5).
ad_df <- filter(tidy_desc, Industry == "Aerospace & Defense")
ad_df %>%
  anti_join(custom_stop_words) %>% # joins on the shared `word` column
  count(word) %>%
  {
    wordcloud(.$word, .$n, max.words = 5)
  }
## Joining, by = "word"
# Word cloud for "Biotech & Pharmaceuticals" postings: drop stop words, count
# the remaining words, and plot at most five of them (max.words = 5).
bp_df <- filter(tidy_desc, Industry == "Biotech & Pharmaceuticals")
bp_df %>%
  anti_join(custom_stop_words) %>% # joins on the shared `word` column
  count(word) %>%
  {
    wordcloud(.$word, .$n, max.words = 5)
  }
## Joining, by = "word"
160k+ Data-Related Job Listings Extracted From Glassdoor (link)
Limited to Data Scientists
# Load the larger Glassdoor extract and keep only postings whose title
# contains "Data Scientist".
# read.csv() replaces read.csv2(sep = ","): read.csv2() keeps dec = ",", so
# decimal-valued columns would be read as text instead of numbers.
# NOTE(review): assumes decimals in this file use "." -- confirm.
# filter() already drops rows where str_detect() is NA, so the explicit
# comparison with TRUE is not needed.
temp4 <- read.csv("G:/My Documents/glassdoor.csv") %>%
  filter(str_detect(header.jobTitle, "Data Scientist"))

# Rows x columns after filtering.
dim(temp4)
## [1] 6184 163
Country: harder to extract, because the field mixes country codes, full country names, city names, and inconsistent capitalisation
# Tally postings by the raw country field and show them as a styled table.
temp4 %>%
  count(map.country) %>%
  kbl() %>%
  kable_styling()
map.country | n |
---|---|
(blank) | 1726 |
ae | 2 |
AE | 34 |
AR | 16 |
Argentina | 5 |
AT | 39 |
AU | 41 |
Australia | 44 |
Austria | 24 |
Bangalore | 4 |
BE | 67 |
Belgium | 8 |
Bengaluru | 1 |
BG | 3 |
BR | 27 |
Brazil | 7 |
ca | 9 |
CA | 139 |
CAN | 2 |
Canada | 37 |
CH | 78 |
CHE | 80 |
Chennai | 1 |
China | 16 |
CL | 1 |
CN | 6 |
CO | 1 |
Costa Rica | 2 |
Croatia | 2 |
Czech Republic | 2 |
de | 4 |
DE | 415 |
Denmark | 11 |
Deutschland | 5 |
DK | 7 |
EE | 1 |
EG | 4 |
Egypt | 3 |
England | 8 |
es | 2 |
ES | 33 |
FI | 1 |
Finland | 8 |
FR | 307 |
FRA | 4 |
France | 89 |
FRANCE | 5 |
gb | 1 |
GB | 150 |
Germany | 80 |
GR | 1 |
Greece | 11 |
HK | 66 |
Hong Kong | 50 |
HU | 4 |
Hungary | 4 |
ID | 48 |
IE | 24 |
IL | 176 |
in | 7 |
IN | 363 |
India | 67 |
Indonesia | 4 |
IR | 1 |
Ireland | 23 |
Israel | 38 |
it | 1 |
IT | 25 |
Italy | 48 |
Japan | 11 |
jp | 2 |
JP | 15 |
Kenya | 3 |
Kingdom | 20 |
KR | 6 |
LT | 2 |
lu | 1 |
LU | 6 |
Malaysia | 5 |
Mexico | 10 |
mx | 1 |
MX | 32 |
MY | 39 |
Nederland | 2 |
Netherlands | 24 |
New Zealand | 9 |
ng | 1 |
NG | 11 |
NL | 261 |
NO | 2 |
NZ | 7 |
Pakistan | 1 |
PH | 49 |
Philippines | 5 |
PK | 5 |
PL | 62 |
Poland | 20 |
Portugal | 5 |
PT | 13 |
Romania | 6 |
RS | 2 |
RU | 3 |
Russia | 1 |
SA | 1 |
Saudi Arabia | 4 |
SE | 15 |
Serbia | 1 |
SG | 162 |
SGP | 2 |
SI | 1 |
Singapore | 42 |
SINGAPORE | 2 |
Slovakia | 2 |
South Africa | 12 |
Sp | 3 |
Spain | 45 |
Sweden | 48 |
Switzerland | 13 |
Taiwan | 1 |
TH | 2 |
Thailand | 3 |
TR | 23 |
Turkey | 6 |
TW | 6 |
UA | 7 |
UK | 33 |
Ukraine | 1 |
United Arab Emirates | 10 |
United Kingdom | 267 |
United States | 46 |
UNITED STATES | 4 |
Uruguay | 1 |
us | 5 |
US | 191 |
USA | 7 |
Vietnam | 2 |
VN | 6 |
ZA | 45 |
Data fields: potentially a lot of uninteresting fields, but the data does include salary, industry, and description.
# List every column of the larger extract as a styled single-column table.
kable_styling(
  kbl(names(temp4))
)
x |
---|
benefits.benefitRatingDecimal |
benefits.comments |
benefits.highlights |
benefits.numRatings |
benefits.employerSummary |
breadCrumbs |
gaTrackerData.category |
gaTrackerData.empId |
gaTrackerData.empName |
gaTrackerData.empSize |
gaTrackerData.expired |
gaTrackerData.industry |
gaTrackerData.industryId |
gaTrackerData.jobId.long |
gaTrackerData.jobId.int |
gaTrackerData.jobTitle |
gaTrackerData.location |
gaTrackerData.locationId |
gaTrackerData.locationType |
gaTrackerData.pageRequestGuid.guid |
gaTrackerData.pageRequestGuid.guidValid |
gaTrackerData.pageRequestGuid.part1 |
gaTrackerData.pageRequestGuid.part2 |
gaTrackerData.sector |
gaTrackerData.sectorId |
gaTrackerData.profileConversionTrackingParams.trackingCAT |
gaTrackerData.profileConversionTrackingParams.trackingSRC |
gaTrackerData.profileConversionTrackingParams.trackingXSP |
gaTrackerData.jobViewTrackingResult.jobViewDisplayTimeMillis |
gaTrackerData.jobViewTrackingResult.requiresTracking |
gaTrackerData.jobViewTrackingResult.trackingUrl |
header.adOrderId |
header.advertiserType |
header.applicationId |
header.applyButtonDisabled |
header.applyUrl |
header.blur |
header.coverPhoto |
header.easyApply |
header.employerId |
header.employerName |
header.expired |
header.gocId |
header.hideCEOInfo |
header.jobTitle |
header.locId |
header.location |
header.locationType |
header.logo |
header.logo2x |
header.organic |
header.overviewUrl |
header.posted |
header.rating |
header.saved |
header.savedJobId |
header.sgocId |
header.sponsored |
header.userAdmin |
header.uxApplyType |
header.featuredVideo |
header.normalizedJobTitle |
header.urgencyLabel |
header.urgencyLabelForMessage |
header.urgencyMessage |
header.needsCommission |
header.payHigh |
header.payLow |
header.payMed |
header.payPeriod |
header.salaryHigh |
header.salaryLow |
header.salarySource |
job.description |
job.discoverDate |
job.eolHashCode |
job.importConfigId |
job.jobReqId.long |
job.jobReqId.int |
job.jobSource |
job.jobTitleId |
job.listingId.long |
job.listingId.int |
map.country |
map.employerName |
map.lat |
map.lng |
map.location |
map.address |
map.postalCode |
overview.allBenefitsLink |
overview.allPhotosLink |
overview.allReviewsLink |
overview.allSalariesLink |
overview.foundedYear |
overview.hq |
overview.industry |
overview.industryId |
overview.revenue |
overview.sector |
overview.sectorId |
overview.size |
overview.stock |
overview.type |
overview.description |
overview.mission |
overview.website |
overview.allVideosLink |
overview.competitors |
overview.companyVideo |
photos |
rating.ceo.name |
rating.ceo.photo |
rating.ceo.photo2x |
rating.ceo.ratingsCount |
rating.ceoApproval |
rating.recommendToFriend |
rating.starRating |
reviews |
salary.country.cc3LetterISO |
salary.country.ccISO |
salary.country.continent.continentCode |
salary.country.continent.continentName |
salary.country.continent.id |
salary.country.continent.new |
salary.country.countryFIPS |
salary.country.currency.currencyCode |
salary.country.currency.defaultFractionDigits |
salary.country.currency.displayName |
salary.country.currency.id |
salary.country.currency.name |
salary.country.currency.negativeTemplate |
salary.country.currency.new |
salary.country.currency.positiveTemplate |
salary.country.currency.symbol |
salary.country.currencyCode |
salary.country.defaultLocale |
salary.country.defaultName |
salary.country.defaultShortName |
salary.country.employerSolutionsCountry |
salary.country.id |
salary.country.longName |
salary.country.major |
salary.country.name |
salary.country.new |
salary.country.population |
salary.country.shortName |
salary.country.tld |
salary.country.type |
salary.country.uniqueName |
salary.country.usaCentricDisplayName |
salary.currency.currencyCode |
salary.currency.defaultFractionDigits |
salary.currency.displayName |
salary.currency.id |
salary.currency.name |
salary.currency.negativeTemplate |
salary.currency.new |
salary.currency.positiveTemplate |
salary.currency.symbol |
salary.lastSalaryDate |
salary.salaries |
wwfu |
Industry
# Number of postings per industry in the larger extract, largest first,
# as a styled table.
temp4 %>%
  count(overview.industry, sort = TRUE) %>%
  kbl() %>%
  kable_styling()
overview.industry | n |
---|---|
(blank) | 1817 |
IT Services | 466 |
Staffing & Outsourcing | 466 |
Internet | 408 |
Consulting | 348 |
Computer Hardware & Software | 336 |
Enterprise Software & Network Solutions | 257 |
Biotech & Pharmaceuticals | 181 |
Advertising & Marketing | 136 |
Insurance Operators | 119 |
Investment Banking & Asset Management | 111 |
Accounting | 107 |
Banks & Building Societies | 105 |
Publishing | 61 |
Research & Development | 58 |
Healthcare Services & Hospitals | 56 |
Aerospace & Defence | 55 |
Energy | 49 |
Video Games | 48 |
Telecommunications Services | 47 |
Electrical & Electronic Manufacturing | 42 |
Lending | 42 |
Transportation Management | 42 |
Industrial Manufacturing | 40 |
Cable, Internet & Telephone Providers | 37 |
Financial Transaction Processing | 35 |
Government Agencies | 33 |
Financial Analytics & Research | 31 |
Department, Clothing, & Shoe Shops | 27 |
Transportation Equipment Manufacturing | 27 |
Food & Drink Manufacturing | 26 |
Travel Agencies | 25 |
Consumer Products Manufacturing | 22 |
Other Retail Shops | 22 |
Chemical Manufacturing | 20 |
Social Services | 20 |
Architectural & Engineering Services | 19 |
Education Training Services | 19 |
Miscellaneous Manufacturing | 19 |
Wholesale | 19 |
Airlines | 18 |
Gambling | 18 |
Health, Beauty & Fitness | 18 |
TV Broadcasting & Cable Networks | 17 |
General Merchandise & Superstores | 16 |
Insurance Agencies & Brokerages | 16 |
Grocery Shops & Supermarkets | 14 |
Colleges & Universities | 13 |
Music Production & Distribution | 13 |
Oil & Gas Exploration & Production | 13 |
Utilities | 12 |
Vehicle Dealers | 12 |
Estate Agents | 11 |
Hotel & Resorts | 11 |
Sports & Recreation | 11 |
Consumer Product Hire | 10 |
Logistics & Supply Chain | 10 |
Sporting Goods Shops | 9 |
Beauty & Personal Accessories Shops | 8 |
Consumer Electronics & Appliance Shops | 8 |
Film Production & Distribution | 8 |
Farm Support Services | 6 |
Fast-Food & Quick-Service Restaurants | 6 |
Healthcare Product Manufacturing | 6 |
Mining | 6 |
Camping & Caravan Parks | 5 |
Catering & Food Service Contractors | 5 |
Brokerage Services | 4 |
Building & Construction | 4 |
Express Delivery Services | 4 |
Food & Beverage Shops | 4 |
Home Furniture and Houseware Shops | 4 |
Media & Entertainment Retail Shops | 4 |
Oil & Gas Services | 4 |
Security Services | 4 |
Shipping | 4 |
Ticket Sales | 4 |
Car Hire | 3 |
Charitable Foundations | 3 |
Food Production | 3 |
Pharmacies & Health Shops | 3 |
Primary & Secondary Education | 3 |
Vehicle Repair & Maintenance | 3 |
Commercial Equipment Repair & Maintenance | 2 |
Haulage | 2 |
Legal | 2 |
Membership Organisations | 2 |
Metal & Mineral Manufacturing | 2 |
Motor Vehicle Parts & Accessories Shops | 2 |
Museums, Zoos & Amusement Parks | 2 |
Pet & Pet Supply Shops | 2 |
Removal Services | 2 |
Bus & Coach Services | 1 |
Casual Restaurants | 1 |
Convenience Stores & Roadside Services | 1 |
General Repair & Maintenance | 1 |
Local Councils | 1 |
News Outlets | 1 |
Radio | 1 |
Rail Freight | 1 |
Telecommunications Manufacturing | 1 |
Venture Capital & Private Equity | 1 |