library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
Looking at the Glassdoor Data Science Job Postings scraped on 06/05/2020
# Load the three regional Glassdoor exports. The files are comma-separated
# (the original call passed sep = ","), so read.csv() is used: read.csv2()
# also sets dec = ",", which stops values written with a decimal point from
# being parsed as numbers.
temp1 <- read.csv("G:/My Documents/Data_Job_SF.csv")
temp2 <- read.csv("G:/My Documents/Data_Job_TX.csv")
temp3 <- read.csv("G:/My Documents/Data_Job_WA.csv")

# Stack the regions and keep only postings whose title contains
# "Data Scientist". str_detect() already returns a logical vector, so no
# comparison against TRUE is needed.
all_data <- bind_rows(temp1, temp2, temp3) %>%
  filter(str_detect(Job_title, "Data Scientist"))
dim(all_data)
## [1] 608 12
Data fields
# Show the column names of all_data as a styled table.
all_data %>%
  names() %>%
  kbl() %>%
  kable_styling()
| x |
|---|
| Job_title |
| Company |
| State |
| City |
| Min_Salary |
| Max_Salary |
| Job_Desc |
| Industry |
| Rating |
| Date_Posted |
| Valid_until |
| Job_Type |
Industry
# Count postings per Industry, largest count first, and render as a
# styled table.
all_data %>%
  count(Industry, sort = TRUE) %>%
  kbl() %>%
  kable_styling()
| Industry | n |
|---|---|
| Information Technology | 188 |
| Business Services | 128 |
| | 86 |
| Aerospace & Defense | 56 |
| Government | 22 |
| Biotech & Pharmaceuticals | 18 |
| Finance | 17 |
| Retail | 17 |
| Manufacturing | 13 |
| Oil, Gas, Energy & Utilities | 13 |
| Insurance | 12 |
| Accounting & Legal | 9 |
| Health Care | 8 |
| Media | 8 |
| Education | 3 |
| Consumer Services | 2 |
| Real Estate | 2 |
| Transportation & Logistics | 2 |
| Arts, Entertainment & Recreation | 1 |
| Construction, Repair & Maintenance | 1 |
| Non-Profit | 1 |
| Telecommunications | 1 |
Descriptions
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
# One row per word of each job description, keeping the posting's Industry
# and its row index within that Industry (stored as linenumber).
tidy_desc <- all_data %>%
  group_by(Industry) %>%
  mutate(linenumber = row_number()) %>%
  ungroup() %>%
  select(Industry, linenumber, Job_Desc) %>%
  unnest_tokens(output = word, input = Job_Desc)
# Stop-word list: three extra words tagged with the "custom" lexicon,
# followed by the rows of stop_words.
custom_stop_words <- tibble(
  word = c("data", "science", "experience"),
  lexicon = "custom"
) %>%
  bind_rows(stop_words)
# Word cloud for Information Technology postings: drop every word listed in
# custom_stop_words, count the remaining words, and plot at most five.
it_df <- tidy_desc %>%
  filter(Industry == "Information Technology")

# The join key is stated explicitly so the anti-join does not depend on
# whichever column names the two tables happen to share, and so it does not
# emit the "Joining, by = ..." message.
it_df %>%
  anti_join(custom_stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 5))
## Joining, by = "word"
# Word cloud for Business Services postings: drop every word listed in
# custom_stop_words, count the remaining words, and plot at most five.
bs_df <- tidy_desc %>%
  filter(Industry == "Business Services")

# The join key is stated explicitly so the anti-join does not depend on
# whichever column names the two tables happen to share, and so it does not
# emit the "Joining, by = ..." message.
bs_df %>%
  anti_join(custom_stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 5))
## Joining, by = "word"
# Word cloud for Aerospace & Defense postings: drop every word listed in
# custom_stop_words, count the remaining words, and plot at most five.
ad_df <- tidy_desc %>%
  filter(Industry == "Aerospace & Defense")

# The join key is stated explicitly so the anti-join does not depend on
# whichever column names the two tables happen to share, and so it does not
# emit the "Joining, by = ..." message.
ad_df %>%
  anti_join(custom_stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 5))
## Joining, by = "word"
# Word cloud for Biotech & Pharmaceuticals postings: drop every word listed
# in custom_stop_words, count the remaining words, and plot at most five.
bp_df <- tidy_desc %>%
  filter(Industry == "Biotech & Pharmaceuticals")

# The join key is stated explicitly so the anti-join does not depend on
# whichever column names the two tables happen to share, and so it does not
# emit the "Joining, by = ..." message.
bp_df %>%
  anti_join(custom_stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 5))
## Joining, by = "word"
160k+ Data-Related Job Listings Extracted From Glassdoor (link)
Limited to Data Scientists
# Load the large Glassdoor extract. The file is comma-separated (the original
# call passed sep = ","), so read.csv() is used: read.csv2() with sep = ","
# still assumes a decimal comma (dec = ","), which leaves values written with
# a decimal point unparsed as text.
temp4 <- read.csv("G:/My Documents/glassdoor.csv")

# Keep only postings whose job title contains "Data Scientist".
# str_detect() already returns a logical vector, so no comparison against
# TRUE is needed.
temp4 <- temp4 %>%
  filter(str_detect(header.jobTitle, "Data Scientist"))
dim(temp4)
## [1] 6184 163
Country: harder to extract, because the field mixes country codes in varying case, full country names, and even city names.
# Tabulate postings by the raw map.country value and render as a styled
# table.
temp4 %>%
  count(map.country) %>%
  kbl() %>%
  kable_styling()
| map.country | n |
|---|---|
| | 1726 |
| ae | 2 |
| AE | 34 |
| AR | 16 |
| Argentina | 5 |
| AT | 39 |
| AU | 41 |
| Australia | 44 |
| Austria | 24 |
| Bangalore | 4 |
| BE | 67 |
| Belgium | 8 |
| Bengaluru | 1 |
| BG | 3 |
| BR | 27 |
| Brazil | 7 |
| ca | 9 |
| CA | 139 |
| CAN | 2 |
| Canada | 37 |
| CH | 78 |
| CHE | 80 |
| Chennai | 1 |
| China | 16 |
| CL | 1 |
| CN | 6 |
| CO | 1 |
| Costa Rica | 2 |
| Croatia | 2 |
| Czech Republic | 2 |
| de | 4 |
| DE | 415 |
| Denmark | 11 |
| Deutschland | 5 |
| DK | 7 |
| EE | 1 |
| EG | 4 |
| Egypt | 3 |
| England | 8 |
| es | 2 |
| ES | 33 |
| FI | 1 |
| Finland | 8 |
| FR | 307 |
| FRA | 4 |
| France | 89 |
| FRANCE | 5 |
| gb | 1 |
| GB | 150 |
| Germany | 80 |
| GR | 1 |
| Greece | 11 |
| HK | 66 |
| Hong Kong | 50 |
| HU | 4 |
| Hungary | 4 |
| ID | 48 |
| IE | 24 |
| IL | 176 |
| in | 7 |
| IN | 363 |
| India | 67 |
| Indonesia | 4 |
| IR | 1 |
| Ireland | 23 |
| Israel | 38 |
| it | 1 |
| IT | 25 |
| Italy | 48 |
| Japan | 11 |
| jp | 2 |
| JP | 15 |
| Kenya | 3 |
| Kingdom | 20 |
| KR | 6 |
| LT | 2 |
| lu | 1 |
| LU | 6 |
| Malaysia | 5 |
| Mexico | 10 |
| mx | 1 |
| MX | 32 |
| MY | 39 |
| Nederland | 2 |
| Netherlands | 24 |
| New Zealand | 9 |
| ng | 1 |
| NG | 11 |
| NL | 261 |
| NO | 2 |
| NZ | 7 |
| Pakistan | 1 |
| PH | 49 |
| Philippines | 5 |
| PK | 5 |
| PL | 62 |
| Poland | 20 |
| Portugal | 5 |
| PT | 13 |
| Romania | 6 |
| RS | 2 |
| RU | 3 |
| Russia | 1 |
| SA | 1 |
| Saudi Arabia | 4 |
| SE | 15 |
| Serbia | 1 |
| SG | 162 |
| SGP | 2 |
| SI | 1 |
| Singapore | 42 |
| SINGAPORE | 2 |
| Slovakia | 2 |
| South Africa | 12 |
| Sp | 3 |
| Spain | 45 |
| Sweden | 48 |
| Switzerland | 13 |
| Taiwan | 1 |
| TH | 2 |
| Thailand | 3 |
| TR | 23 |
| Turkey | 6 |
| TW | 6 |
| UA | 7 |
| UK | 33 |
| Ukraine | 1 |
| United Arab Emirates | 10 |
| United Kingdom | 267 |
| United States | 46 |
| UNITED STATES | 4 |
| Uruguay | 1 |
| us | 5 |
| US | 191 |
| USA | 7 |
| Vietnam | 2 |
| VN | 6 |
| ZA | 45 |
Data fields: potentially a lot of uninteresting fields, but the data does include salary, industry, and description.
# Show the column names of temp4 as a styled table.
temp4 %>%
  names() %>%
  kbl() %>%
  kable_styling()
| x |
|---|
| benefits.benefitRatingDecimal |
| benefits.comments |
| benefits.highlights |
| benefits.numRatings |
| benefits.employerSummary |
| breadCrumbs |
| gaTrackerData.category |
| gaTrackerData.empId |
| gaTrackerData.empName |
| gaTrackerData.empSize |
| gaTrackerData.expired |
| gaTrackerData.industry |
| gaTrackerData.industryId |
| gaTrackerData.jobId.long |
| gaTrackerData.jobId.int |
| gaTrackerData.jobTitle |
| gaTrackerData.location |
| gaTrackerData.locationId |
| gaTrackerData.locationType |
| gaTrackerData.pageRequestGuid.guid |
| gaTrackerData.pageRequestGuid.guidValid |
| gaTrackerData.pageRequestGuid.part1 |
| gaTrackerData.pageRequestGuid.part2 |
| gaTrackerData.sector |
| gaTrackerData.sectorId |
| gaTrackerData.profileConversionTrackingParams.trackingCAT |
| gaTrackerData.profileConversionTrackingParams.trackingSRC |
| gaTrackerData.profileConversionTrackingParams.trackingXSP |
| gaTrackerData.jobViewTrackingResult.jobViewDisplayTimeMillis |
| gaTrackerData.jobViewTrackingResult.requiresTracking |
| gaTrackerData.jobViewTrackingResult.trackingUrl |
| header.adOrderId |
| header.advertiserType |
| header.applicationId |
| header.applyButtonDisabled |
| header.applyUrl |
| header.blur |
| header.coverPhoto |
| header.easyApply |
| header.employerId |
| header.employerName |
| header.expired |
| header.gocId |
| header.hideCEOInfo |
| header.jobTitle |
| header.locId |
| header.location |
| header.locationType |
| header.logo |
| header.logo2x |
| header.organic |
| header.overviewUrl |
| header.posted |
| header.rating |
| header.saved |
| header.savedJobId |
| header.sgocId |
| header.sponsored |
| header.userAdmin |
| header.uxApplyType |
| header.featuredVideo |
| header.normalizedJobTitle |
| header.urgencyLabel |
| header.urgencyLabelForMessage |
| header.urgencyMessage |
| header.needsCommission |
| header.payHigh |
| header.payLow |
| header.payMed |
| header.payPeriod |
| header.salaryHigh |
| header.salaryLow |
| header.salarySource |
| job.description |
| job.discoverDate |
| job.eolHashCode |
| job.importConfigId |
| job.jobReqId.long |
| job.jobReqId.int |
| job.jobSource |
| job.jobTitleId |
| job.listingId.long |
| job.listingId.int |
| map.country |
| map.employerName |
| map.lat |
| map.lng |
| map.location |
| map.address |
| map.postalCode |
| overview.allBenefitsLink |
| overview.allPhotosLink |
| overview.allReviewsLink |
| overview.allSalariesLink |
| overview.foundedYear |
| overview.hq |
| overview.industry |
| overview.industryId |
| overview.revenue |
| overview.sector |
| overview.sectorId |
| overview.size |
| overview.stock |
| overview.type |
| overview.description |
| overview.mission |
| overview.website |
| overview.allVideosLink |
| overview.competitors |
| overview.companyVideo |
| photos |
| rating.ceo.name |
| rating.ceo.photo |
| rating.ceo.photo2x |
| rating.ceo.ratingsCount |
| rating.ceoApproval |
| rating.recommendToFriend |
| rating.starRating |
| reviews |
| salary.country.cc3LetterISO |
| salary.country.ccISO |
| salary.country.continent.continentCode |
| salary.country.continent.continentName |
| salary.country.continent.id |
| salary.country.continent.new |
| salary.country.countryFIPS |
| salary.country.currency.currencyCode |
| salary.country.currency.defaultFractionDigits |
| salary.country.currency.displayName |
| salary.country.currency.id |
| salary.country.currency.name |
| salary.country.currency.negativeTemplate |
| salary.country.currency.new |
| salary.country.currency.positiveTemplate |
| salary.country.currency.symbol |
| salary.country.currencyCode |
| salary.country.defaultLocale |
| salary.country.defaultName |
| salary.country.defaultShortName |
| salary.country.employerSolutionsCountry |
| salary.country.id |
| salary.country.longName |
| salary.country.major |
| salary.country.name |
| salary.country.new |
| salary.country.population |
| salary.country.shortName |
| salary.country.tld |
| salary.country.type |
| salary.country.uniqueName |
| salary.country.usaCentricDisplayName |
| salary.currency.currencyCode |
| salary.currency.defaultFractionDigits |
| salary.currency.displayName |
| salary.currency.id |
| salary.currency.name |
| salary.currency.negativeTemplate |
| salary.currency.new |
| salary.currency.positiveTemplate |
| salary.currency.symbol |
| salary.lastSalaryDate |
| salary.salaries |
| wwfu |
Industry
# Count postings per overview.industry, largest count first, and render as
# a styled table.
temp4 %>%
  count(overview.industry, sort = TRUE) %>%
  kbl() %>%
  kable_styling()
| overview.industry | n |
|---|---|
| | 1817 |
| IT Services | 466 |
| Staffing & Outsourcing | 466 |
| Internet | 408 |
| Consulting | 348 |
| Computer Hardware & Software | 336 |
| Enterprise Software & Network Solutions | 257 |
| Biotech & Pharmaceuticals | 181 |
| Advertising & Marketing | 136 |
| Insurance Operators | 119 |
| Investment Banking & Asset Management | 111 |
| Accounting | 107 |
| Banks & Building Societies | 105 |
| Publishing | 61 |
| Research & Development | 58 |
| Healthcare Services & Hospitals | 56 |
| Aerospace & Defence | 55 |
| Energy | 49 |
| Video Games | 48 |
| Telecommunications Services | 47 |
| Electrical & Electronic Manufacturing | 42 |
| Lending | 42 |
| Transportation Management | 42 |
| Industrial Manufacturing | 40 |
| Cable, Internet & Telephone Providers | 37 |
| Financial Transaction Processing | 35 |
| Government Agencies | 33 |
| Financial Analytics & Research | 31 |
| Department, Clothing, & Shoe Shops | 27 |
| Transportation Equipment Manufacturing | 27 |
| Food & Drink Manufacturing | 26 |
| Travel Agencies | 25 |
| Consumer Products Manufacturing | 22 |
| Other Retail Shops | 22 |
| Chemical Manufacturing | 20 |
| Social Services | 20 |
| Architectural & Engineering Services | 19 |
| Education Training Services | 19 |
| Miscellaneous Manufacturing | 19 |
| Wholesale | 19 |
| Airlines | 18 |
| Gambling | 18 |
| Health, Beauty & Fitness | 18 |
| TV Broadcasting & Cable Networks | 17 |
| General Merchandise & Superstores | 16 |
| Insurance Agencies & Brokerages | 16 |
| Grocery Shops & Supermarkets | 14 |
| Colleges & Universities | 13 |
| Music Production & Distribution | 13 |
| Oil & Gas Exploration & Production | 13 |
| Utilities | 12 |
| Vehicle Dealers | 12 |
| Estate Agents | 11 |
| Hotel & Resorts | 11 |
| Sports & Recreation | 11 |
| Consumer Product Hire | 10 |
| Logistics & Supply Chain | 10 |
| Sporting Goods Shops | 9 |
| Beauty & Personal Accessories Shops | 8 |
| Consumer Electronics & Appliance Shops | 8 |
| Film Production & Distribution | 8 |
| Farm Support Services | 6 |
| Fast-Food & Quick-Service Restaurants | 6 |
| Healthcare Product Manufacturing | 6 |
| Mining | 6 |
| Camping & Caravan Parks | 5 |
| Catering & Food Service Contractors | 5 |
| Brokerage Services | 4 |
| Building & Construction | 4 |
| Express Delivery Services | 4 |
| Food & Beverage Shops | 4 |
| Home Furniture and Houseware Shops | 4 |
| Media & Entertainment Retail Shops | 4 |
| Oil & Gas Services | 4 |
| Security Services | 4 |
| Shipping | 4 |
| Ticket Sales | 4 |
| Car Hire | 3 |
| Charitable Foundations | 3 |
| Food Production | 3 |
| Pharmacies & Health Shops | 3 |
| Primary & Secondary Education | 3 |
| Vehicle Repair & Maintenance | 3 |
| Commercial Equipment Repair & Maintenance | 2 |
| Haulage | 2 |
| Legal | 2 |
| Membership Organisations | 2 |
| Metal & Mineral Manufacturing | 2 |
| Motor Vehicle Parts & Accessories Shops | 2 |
| Museums, Zoos & Amusement Parks | 2 |
| Pet & Pet Supply Shops | 2 |
| Removal Services | 2 |
| Bus & Coach Services | 1 |
| Casual Restaurants | 1 |
| Convenience Stores & Roadside Services | 1 |
| General Repair & Maintenance | 1 |
| Local Councils | 1 |
| News Outlets | 1 |
| Radio | 1 |
| Rail Freight | 1 |
| Telecommunications Manufacturing | 1 |
| Venture Capital & Private Equity | 1 |