# Chunk default for the knitted document: echo = TRUE shows each chunk's
# source code alongside its output.
knitr::opts_chunk$set(echo = TRUE)
library(rvest)
## Loading required package: xml2
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
library(RCurl)
## Warning: package 'RCurl' was built under R version 4.0.3
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
# Example input: a test vector of company names to look up.
# Replace it with your own character vector of company names.
company_names <- c("Microsoft", "apple", "chipotle")
#' Look up SIRET numbers on societe.com by company name
#'
#' For each name, requests the societe.com search page, follows the first
#' link matched by the CSS selector `#search > div > a`, and reads the text
#' of `#siret_number > span` on the linked page. Pauses 0.5 seconds after
#' each lookup.
#'
#' A lookup that fails (missing/empty name, no search hit, or a request
#' error) yields `NA` for that row and a warning instead of aborting the
#' whole batch.
#'
#' @param company_names Character vector of company names to search for.
#' @return A data.frame with one row per input name and the columns
#'   `Company` (the input) and `Siret_No` (the scraped text or `NA`).
scrape_co_info <- function(company_names) {
  url_base <- "https://www.societe.com/cgi-bin/search?champs="
  site_root <- "https://www.societe.com/"
  siret_rule <- "#siret_number > span"
  href_rule <- "#search > div > a"

  siret_nos <- rep(NA_character_, length(company_names))

  # URLs are built per element inside the loop: paste0(url_base, character(0))
  # would recycle an empty input into one bare search URL.
  for (i in seq_along(company_names)) {
    term <- as.character(company_names[i])
    if (is.na(term) || !nzchar(term)) {
      next
    }
    print.noquote(paste("Getting Data for", term))

    siret_nos[i] <- tryCatch(
      {
        # Encode the name so spaces, '&', accents etc. form a valid query.
        search_url <- paste0(url_base, utils::URLencode(term, reserved = TRUE))
        href <- read_html(search_url) %>%
          html_node(href_rule) %>%
          html_attr("href")

        if (is.na(href)) {
          warning("No search result for '", term, "'", call. = FALSE)
          NA_character_
        } else {
          # Drop leading slashes so a root-relative href does not produce
          # a doubled slash after the site root.
          company_url <- paste0(site_root, sub("^/+", "", href))
          read_html(company_url) %>%
            html_node(siret_rule) %>%
            html_text()
        }
      },
      error = function(e) {
        warning(
          "Could not get SIRET for '", term, "': ", conditionMessage(e),
          call. = FALSE
        )
        NA_character_
      }
    )
    Sys.sleep(0.5)
  }

  data.frame(Company = company_names, Siret_No = siret_nos)
}
# Run the name -> SIRET lookup for the example companies; the result is a
# data.frame with the columns Company and Siret_No.
siret_nos <- scrape_co_info(company_names)
## [1] Getting Data for Microsoft
## [1] Getting Data for apple
## [1] Getting Data for chipotle
# Reverse lookup: get the company name from a SIRET number.
# Extract the Siret_No column from the lookup result to use as input.
siret_no <- siret_nos$Siret_No
#' Look up company names on societe.com by SIRET number
#'
#' For each SIRET, requests the matching societe.com "etablissement" page
#' and reads the text of the `#identite_deno` node.
#'
#' A lookup that fails (missing/empty SIRET or a request error) yields `NA`
#' for that row and a warning instead of aborting the whole batch.
#'
#' @param siret_nos Vector of SIRET numbers (character recommended).
#' @return A data.frame with one row per input and the columns `SIRET`
#'   (the input) and `Company` (the scraped text or `NA`).
scrape_co_name <- function(siret_nos) {
  # NOTE(review): the path slug is fixed to one company's name; presumably
  # the site resolves the page from the trailing SIRET alone -- confirm.
  url_prefix <- paste0(
    "https://www.societe.com/etablissement/",
    "club-utilisateurs-microsoft-dynamics-france-"
  )
  rule <- "#identite_deno"

  company_names <- rep(NA_character_, length(siret_nos))

  # URLs are built per element inside the loop: paste0() with a zero-length
  # input would still return one URL and trigger a bogus request.
  for (i in seq_along(siret_nos)) {
    siret <- as.character(siret_nos[i])
    if (is.na(siret) || !nzchar(siret)) {
      next
    }

    company_names[i] <- tryCatch(
      read_html(paste0(url_prefix, siret, ".html")) %>%
        html_node(rule) %>%
        html_text(),
      error = function(e) {
        warning(
          "Could not get company name for SIRET '", siret, "': ",
          conditionMessage(e),
          call. = FALSE
        )
        NA_character_
      }
    )
  }

  data.frame(SIRET = siret_nos, Company = company_names)
}
# Run the SIRET -> name lookup; the result is not assigned, so it is only
# printed.
scrape_co_name(siret_no)
## SIRET Company
## 1 50855907700013 CLUB UTILISATEURS MICROSOFT DYNAMICS FRANCE - 92200
## 2 78921691800027 ACTIVITES POUR PROMOUVOIR LE LOGEMENT EVOLUTIF - 31000
## 3 83092777800019 CHIPOTLE - 35760