# Show the source code of every chunk in the knitted document.
knitr::opts_chunk$set(echo = TRUE)
library(rvest)
## Loading required package: xml2
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()
library(RCurl)
## Warning: package 'RCurl' was built under R version 4.0.3
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
# Test vector of company names. Replace it with the real vector of company
# names you want to look up.
company_names <- c("Microsoft", "apple", "chipotle")

#make the appropriate urls

#this uses company names to get siret numbers
#' Look up SIRET numbers on societe.com from company names
#'
#' For each name, requests the societe.com search page, follows the first
#' link matched by the search-result selector and reads the SIRET number
#' from the page that link points to. Progress is printed per company and
#' the function pauses half a second after each lookup.
#'
#' @param company_names Character vector of company names to search for.
#' @return A data frame with one row per input name and the columns
#'   `Company` (the name as given) and `Siret_No` (the scraped SIRET as
#'   text, or `NA` when the name is `NA`, the search produced no link, or
#'   the company page has no SIRET element).
scrape_co_info <- function(company_names) {
  url_base <- "https://www.societe.com/cgi-bin/search?champs="
  site_root <- "https://www.societe.com/"
  siret_rule <- "#siret_number > span"
  href_rule <- "#search > div > a"
  siret_nos <- rep(NA_character_, length(company_names))

  # seq_along() makes empty input a no-op; 1:length(x) would iterate over
  # c(1, 0) and paste0() would still build one URL from an empty vector.
  for (i in seq_along(company_names)) {
    name <- as.character(company_names[i])
    if (is.na(name)) {
      # Nothing to search for; keep NA instead of searching for "NA".
      next
    }
    print.noquote(paste("Getting Data for", name))

    # Percent-encode the name so spaces, "&", accents etc. stay inside the
    # `champs` query parameter instead of breaking the URL.
    query <- utils::URLencode(enc2utf8(name), reserved = TRUE, repeated = TRUE)
    href <- read_html(paste0(url_base, query)) %>%
      html_node(href_rule) %>%
      html_attr("href")

    if (is.na(href)) {
      # No search hit: do not request "<site>/NA", just report and move on.
      warning("No search result found for ", name, call. = FALSE)
    } else {
      # Drop leading slashes so the joined URL has no "//" in its path.
      company_url <- paste0(site_root, sub("^/+", "", href))
      siret_nos[i] <- read_html(company_url) %>%
        html_node(siret_rule) %>%
        html_text()
    }

    # Pause between companies to avoid hammering the site.
    Sys.sleep(0.5)
  }

  data.frame(Company = company_names, Siret_No = siret_nos)
}

# Scrape the SIRET number for every test company; the result is a data frame
# with the columns Company and Siret_No.
siret_nos <- scrape_co_info(company_names)
## [1] Getting Data for Microsoft
## [1] Getting Data for apple
## [1] Getting Data for chipotle
# Reverse lookup: get the company name from the SIRET number.
# Pull the SIRET column out of the data frame built above.
siret_no <- siret_nos$Siret_No

#' Look up company names on societe.com from SIRET numbers
#'
#' Builds one establishment-page URL per SIRET number, downloads the page
#' and reads the text of its `#identite_deno` element.
#'
#' @param siret_nos Vector of SIRET numbers.
#' @return A data frame with one row per input and the columns `SIRET` (the
#'   input as given) and `Company` (the scraped text, or `NA` when the SIRET
#'   is `NA` or the page has no `#identite_deno` element).
scrape_co_name <- function(siret_nos) {
  # NOTE(review): the slug in front of the SIRET is hard-coded to a single
  # company. The output recorded in this script shows other companies being
  # returned through it, so the site appears to resolve the page from the
  # SIRET alone -- confirm before relying on it.
  url_prefix <- paste0(
    "https://www.societe.com/etablissement/",
    "club-utilisateurs-microsoft-dynamics-france-"
  )
  rule <- "#identite_deno"
  company_names <- rep(NA_character_, length(siret_nos))

  # seq_along() makes empty input a no-op; 1:length(x) would iterate over
  # c(1, 0) and paste0() would still build one URL from an empty vector.
  for (i in seq_along(siret_nos)) {
    if (is.na(siret_nos[i])) {
      # A missing SIRET would otherwise be requested as "...-NA.html".
      next
    }
    page_url <- paste0(url_prefix, siret_nos[i], ".html")
    company_names[i] <- read_html(page_url) %>%
      html_node(rule) %>%
      html_text()

    # Same pause between requests as scrape_co_info().
    Sys.sleep(0.5)
  }

  data.frame(SIRET = siret_nos, Company = company_names)
}
# Look up and print the registered company name for each scraped SIRET.
scrape_co_name(siret_no)
##            SIRET                                                Company
## 1 50855907700013    CLUB UTILISATEURS MICROSOFT DYNAMICS FRANCE - 92200
## 2 78921691800027 ACTIVITES POUR PROMOUVOIR LE LOGEMENT EVOLUTIF - 31000
## 3 83092777800019                                       CHIPOTLE - 35760