The key steps involved in this process included:
# Load the packages used for scraping and wrangling
library(tidyverse)
library(rvest)
library(xml2)
library(stringi)

# Create a dataframe to capture all listings associated with the search and the selected parameters
listings <- data.frame(matrix(ncol = 5))
colnames(listings) <- c("Company", "Job_Title", "Location", "Links", "Job_Description")

# Loop through the result pages for a single search (Indeed paginates in steps of 10)
for (i in seq(10, 100, 10)) {
  # Landing page returned by the search query
  url_start <- "https://www.indeed.com/jobs?q=data+scientist&l=Atlanta%2C+GA"
  # URL for each subsequent page of results
  url <- paste0(url_start, "&start=", i)
  # Read all info on the page
  target <- xml2::read_html(url)
  #Sys.sleep(2)

  # Company names on the page
  Company <- target %>%
    rvest::html_nodes(".company") %>%
    rvest::html_text() %>%
    stringi::stri_trim_both()

  # Job titles on the page
  Job_Title <- target %>%
    rvest::html_nodes("div") %>%
    rvest::html_nodes(xpath = '//*[@data-tn-element = "jobTitle"]') %>%
    rvest::html_attr("title")

  # Job locations on the page
  Location <- target %>%
    rvest::html_nodes(".location") %>%
    rvest::html_text()

  # Job links on the page --> these link to the individual job pages
  Links <- target %>%
    rvest::html_nodes('[data-tn-element="jobTitle"]') %>%
    rvest::html_attr("href")

  # Collect the full job description from each linked page
  Job_Description <- c()
  for (j in seq_along(Links)) {
    p_url <- paste0("https://www.indeed.com", Links[j])
    pg <- xml2::read_html(p_url)
    description <- pg %>%
      rvest::html_nodes("span") %>%
      rvest::html_nodes(xpath = '//*[@class="jobsearch-JobComponent-description icl-u-xs-mt--md"]') %>%
      rvest::html_text() %>%
      stringi::stri_trim_both()
    Job_Description <- c(Job_Description, description)
  }

  df <- data.frame(Job_Title, Company, Location, Links, Job_Description)
  listings <- rbind(listings, df)  # build the final dataframe
}

listings <- listings %>% mutate(ID = row_number())  # add a row number as an ID
# Write each individual job description to a text file named by its row number
for (i in 1:nrow(listings)) {
  write(listings$Job_Description[i], paste0(listings$ID[i], ".txt"))
}

# Export the remainder of the dataframe to csv
Atlanta <- listings %>% select(Job_Title, Company, Location, Links)
write.csv(Atlanta, file = "Atlanta_Table.csv")

# Read the exported table back in from GitHub for cleaning
Atl <- read_csv("https://raw.githubusercontent.com/zachsfr/Data-607-Project-Three/Sean-Branch/Atlanta_Table.csv", trim_ws = TRUE)
Atl <- Atl %>%
  filter(X1 != 1) %>%                    # drop the all-NA placeholder row
  select(!X1) %>%                        # drop the exported row-number column
  rowid_to_column("ID") %>%
  mutate(across(.cols = everything(), ~str_squish(.)))
Atl <- Atl %>%
  separate(Location, c("City", "State"), ",") %>%
  separate(State, c("tmp", "State", "Zip_Code"), "\\s") %>%
  select(-c(tmp, Links))
# Create categories of job level based on keywords in the job title
Atl <- Atl %>%
  mutate(Job_Level = case_when(
    grepl("Intern", Job_Title) ~ "Intern",
    grepl("^Data Analyst|Entry Level", Job_Title, ignore.case = TRUE) ~ "Junior",
    grepl("Senior Data Scientist|Director|Lead|Principal|Sr.|President", Job_Title, ignore.case = TRUE) ~ "Senior",
    grepl("^Associate.+|Senior Data Analyst|Data Engineer|Senior Associate|Machine Learning|ML|AI|Manage.+|Data Scientist|Specialist|Data Science", Job_Title, ignore.case = TRUE) ~ "Mid_Level")) %>%
  relocate(Job_Level, .after = Job_Title)

write.csv(Atl, file = "Atlanta_Clean.csv")

# Python (run through reticulate): libraries for the generalized Indeed scraper
from bs4 import BeautifulSoup
import requests
import numpy as np
import csv
import pandas as pd
from time import sleep
from random import randint
from datetime import datetime
import re

# Build the Indeed search URL for a given position and location
def get_url(position, location):
    template = "https://www.indeed.com/jobs?q={}&l={}"
    url = template.format(position, location)
    return url

# R: quick check that the Python function is visible through reticulate
py$get_url("Test", "Test")

# Extract the fields of interest from a single job card
def get_record(card):
    atag = card.h2.a
    try:
        job_title = atag.get('title')
    except AttributeError:
        job_title = ''
    try:
        company = card.find('span', 'company').text.strip()
    except AttributeError:
        company = ''
    try:
        location = card.find('div', 'recJobLoc').get('data-rc-loc')
    except AttributeError:
        location = ''
    try:
        job_summary = card.find('div', 'summary').text.strip()
    except AttributeError:
        job_summary = ''
    try:
        post_date = card.find('span', 'date').text.strip()
    except AttributeError:
        post_date = ''
    try:
        salary = card.find('span', 'salarytext').text.strip()
    except AttributeError:
        salary = ''
    extract_date = datetime.today().strftime('%Y-%m-%d')
    job_url = 'https://www.indeed.com' + atag.get('href')
    return (job_title, company, location, job_summary, salary, post_date, extract_date, job_url)

# Browser-like request headers sent with every page request
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47'
}

# Collect every job card from every result page for a given search
def get_data(position, location):
    records = []
    url = get_url(position, location)
    while True:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        for card in cards:
            record = get_record(card)
            records.append(record)
        try:
            # Follow the "Next" link; a random delay keeps requests polite
            url = 'https://www.indeed.com' + soup.find('a', {'aria-label': 'Next'}).get('href')
            delay = randint(1, 50)
            sleep(delay)
        except AttributeError:
            # No "Next" link means we reached the last page
            break
    return pd.DataFrame(records)

# R: wrap the Python scraper so it can be called like any other R function
indeed_scraper <- function(position, location){
  x <- py$get_data(position, location)
  x
}

# Scrape New York listings and save them to csv
NY <- indeed_scraper("Data Science", "NY")
write.csv(NY, file = "Data-Science-Nyn.csv")

# Download and unzip the saved Atlanta descriptions if they are not already present
if (!file.exists('Data-607-Project-Three-Dan-Branch')) {
  download.file(
    "https://github.com/zachsfr/Data-607-Project-Three/archive/refs/heads/Dan-Branch.zip",
    destfile = 'Dan-Branch.zip', mode = "wb")
  unzip("Dan-Branch.zip")
}
# Build a corpus from the downloaded job-description text files
library(tm)
atlanta <- "Data-607-Project-Three-Dan-Branch/atlanta"
atlanta_corpus <- VCorpus(DirSource(atlanta, encoding = "UTF-8"), readerControl = list(language = "en"))

# Multi-word skills (and symbol-heavy names like C# and C++) are collapsed to single tokens so they survive tokenization
find <- c("artificial intelligence","amazon web services","[^[[:alnum:]][Cc]\\#","[^[[:alnum:]][Cc]\\+\\+","computer science","computer vision","data analysis","data engineering","data wrangling","deep learning","large datasets","machine learning","natural language processing","neural networks","object oriented","project management","[^[[:alnum:]][Rr][^[[:alnum:]]","scikit-learn","software development","software engineering","time series")
repl <- c("ai","aws"," csharp"," cplusplus","computerscience","computervision","dataanalysis","dataengineering","datawrangling","deeplearning","largedatasets","machinelearning","nlp","neuralnetworks","oop","projectmanagement"," rrrr","scikitlearn","softwaredevelopment","softwareengineering","timeseries")
# Apply each find/replace pair to the corpus
for (i in seq_along(find)) {
  atlanta_corpus <- tm_map(atlanta_corpus,
                           content_transformer(function(x) gsub(x, pattern = find[i], replacement = repl[i])))
}

atlanta_corpus <- tm_map(atlanta_corpus, removePunctuation)

With the skills consolidated, we use the tm package to generate the document-term matrix and then transform it to a dataframe.

document_term <- DocumentTermMatrix(atlanta_corpus)
document_term <- document_term %>%
  as.matrix() %>%
  as.data.frame()

The skills we search for are stored in ds_skills_list, and the dataframe containing only these columns of interest is ds_skills_df.

ds_skills_list <- c("ai","airflow","analysis","aws","azure","bigquery","c","caffe","caffe2","cassandra","communication","computerscience","computervision","cplusplus","csharp","d3","dataanalysis","dataengineering","datawrangling","databases","deeplearning","docker","excel","fintech","git","hadoop","hbase","hive","java","javascript","keras","kubernetes","largedatasets","linux","machinelearning","mathematics","matlab","mongodb","mysql","neuralnetworks","nlp","nosql","numpy","oop","pandas","perl","pig","projectmanagement","publications","python","pytorch","rrrr","sas","scala","scikitlearn","scipy","sklearn","softwaredevelopment","softwareengineering","spark","spss","sql","statistics","tableau","tensorflow","theano","timeseries","unix","visualization")
# Keep only the skills that actually appear as terms in the document-term matrix
ds_skills_in_document_term <- cbind(ds_skills_list, ds_skills_list %in% colnames(document_term))
ds_skills_in_document_term <- as.data.frame(ds_skills_in_document_term)
ds_skills_in_document_term <- ds_skills_in_document_term %>%
  filter(V2 == "TRUE")
ds_skills_df <- document_term %>%
  select(ds_skills_in_document_term$ds_skills_list)

Finally, we tidy up the listing names and column names in ds_skills_df.

ds_skills_df <- rownames_to_column(ds_skills_df)
ds_skills_df <- rename(ds_skills_df, "listing" = "rowname", "r" = "rrrr")
ds_skills_df <- ds_skills_df %>%
  mutate(listing = substr(listing, 0, nchar(listing) - 4))  # strip the ".txt" extension

To store the results, we set up a MariaDB database:

1. Install the latest version of the MariaDB server, which is very similar to MySQL and is the best fit for our Raspberry Pi OS (Debian 10).
2. Edit the "/etc/mysql/mariadb.conf.d/50-server.cnf" file to allow the server to bind to non-local addresses.
3. Edit the "/etc/mysql/mariadb.conf.d/50-client.cnf" file's socket location to match the server one.
4. Enter MariaDB as the root user.
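With the server configured, one quick way to confirm that the non-local bind works is to connect from R. The sketch below uses the DBI and RMariaDB packages; the host, credentials, and database name are placeholders for illustration, not values from the project.

# Sketch only: verify that the newly configured MariaDB server accepts
# remote connections from R. Host, credentials, and dbname are placeholders.
library(DBI)
library(RMariaDB)

con <- dbConnect(
  MariaDB(),
  host     = "192.168.1.50",   # placeholder: the Raspberry Pi's LAN address
  port     = 3306,
  username = "jobs_user",      # placeholder credentials
  password = "change_me",
  dbname   = "indeed_jobs"     # placeholder database name
)

dbGetQuery(con, "SELECT VERSION();")  # should return the MariaDB version string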
Each scraped job gets a row in the Listings table and one in the Skills table, and the "listing" column is set to delete a row when the listing it points to is deleted (ON DELETE CASCADE). The design could be normalized further by restructuring the Listings table such that job levels are in a table similar to Degrees, and represented ordinally by an integer in likewise fashion, and (city, state) combinations are represented instead by an integer which points to a row containing that tuple in a Location table.
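As a rough illustration of the schema just described and the proposed normalization, the DDL might look like the sketch below, reusing the con object from the connection sketch above. Only the Listings, Skills, and Location table names and the ON DELETE CASCADE behavior come from the write-up; every column name and type is an assumption.

# Sketch only: illustrative DDL, not the project's actual schema.
# Column names and types are assumptions.
library(DBI)

# Proposed normalization: (city, state) tuples live in their own table
dbExecute(con, "
  CREATE TABLE Location (
    location_id INT AUTO_INCREMENT PRIMARY KEY,
    city  VARCHAR(100),
    state CHAR(2)
  )")

# Proposed normalization: job levels stored ordinally, analogous to Degrees
dbExecute(con, "
  CREATE TABLE Job_Levels (
    job_level_id TINYINT PRIMARY KEY,   -- e.g. 1 = Intern ... 4 = Senior
    job_level    VARCHAR(20)
  )")

dbExecute(con, "
  CREATE TABLE Listings (
    listing      INT PRIMARY KEY,       -- the ID assigned during scraping
    job_title    VARCHAR(255),
    company      VARCHAR(255),
    job_level_id TINYINT,
    location_id  INT,
    FOREIGN KEY (job_level_id) REFERENCES Job_Levels(job_level_id),
    FOREIGN KEY (location_id)  REFERENCES Location(location_id)
  )")

# One row of skill counts per listing; rows disappear with their listing
dbExecute(con, "
  CREATE TABLE Skills (
    listing         INT,
    python          INT,
    machinelearning INT,
    -- ...one column per skill in ds_skills_list...
    FOREIGN KEY (listing) REFERENCES Listings(listing) ON DELETE CASCADE
  )")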