Aim

In this paper I present the process of web scraping data from the Polish portal Otomoto.pl, which I then used for further analysis.


# General-purpose data wrangling
library(tidyverse)  
# Parsing of HTML/XML files  
library(rvest)    
# String manipulation
library(stringr)   
# Verbose regular expressions
library(rebus)     
# Eases DateTime manipulation
library(lubridate)
# Data manipulation (also part of the tidyverse)
library(dplyr)

# search URL with the chosen filters (segment, price, year, mileage, engine, fuel type, condition, country)

url<-"https://www.otomoto.pl/osobowe/seg-city-car--seg-combi--seg-compact--seg-minivan--seg-sedan/od-2009/?search%5Bfilter_float_price%3Afrom%5D=15000&search%5Bfilter_float_price%3Ato%5D=25000&search%5Bfilter_float_year%3Ato%5D=2018&search%5Bfilter_float_mileage%3Afrom%5D=100000&search%5Bfilter_float_mileage%3Ato%5D=200000&search%5Bfilter_float_engine_capacity%3Afrom%5D=1000&search%5Bfilter_float_engine_capacity%3Ato%5D=2000&search%5Bfilter_float_engine_power%3Afrom%5D=80&search%5Bfilter_float_engine_power%3Ato%5D=200&search%5Bfilter_enum_fuel_type%5D%5B0%5D=petrol&search%5Bfilter_enum_fuel_type%5D%5B1%5D=diesel&search%5Bfilter_enum_fuel_type%5D%5B2%5D=petrol-lpg&search%5Bfilter_enum_damaged%5D=0&search%5Bfilter_enum_registered%5D=1&search%5Bfilter_enum_no_accident%5D=1&search%5Border%5D=filter_float_engine_power%3Adesc&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=polska"

web <- read_html(url)
# find the number of the last result page from the pagination links
pages <- html_nodes(web, ".page")
pages <- html_text(pages)
pages <- as.numeric(pages[length(pages)])

# function for scraping one CSS selector (nazwa) across all result pages;
# it relies on the global objects pages and list_of_pages defined below
scraping <- function(nazwa) {
  result <- character(0)
  for (i in 1:pages) {
    web <- read_html(list_of_pages[i])
    value <- html_nodes(web, nazwa)
    value <- html_text(value)
    result <- c(result, value)
  }
  return(result)
}


# create URLs for all result pages
list_of_pages <- str_c(url, '&page=', 1:pages)
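
For illustration, the helper can now be called with any single CSS selector from the results page; the loop further below applies it to every selector in zmienne. A minimal example (prices_raw is just an illustrative name, and the exact raw strings returned by the site are assumed):

# example call: raw price strings from every results page
# (they still contain "PLN" and whitespace, which the cleaning step below removes)
prices_raw <- scraping(".offer-price__number")
head(prices_raw)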

zmienne <- c(".offer-title__link",".offer-price__number",".offer-item__params-item:nth-child(1) span",
             ".offer-item__params-item:nth-child(2) span",".offer-item__params-item:nth-child(3) span",
             ".offer-item__params-item:nth-child(4) span",".offer-item__location h4")
zmienne_nazwy <- c("mark", "price","year","mileage","capacity","fuel", "location")

#scrape data for each variable

start_time <- Sys.time()
for (i in seq_along(zmienne)) {
  nam <- scraping(zmienne[i])
  assign(zmienne_nazwy[i], nam)   # store the result under the matching variable name
}
end_time <- Sys.time()
end_time - start_time
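
Note that the loop above downloads every results page once per selector, so each page is requested length(zmienne) times. A hedged alternative sketch (not the approach used in this paper) parses each page a single time and extracts all selectors from the already-parsed document; scrape_page, all_pages and price_alt are illustrative names:

# alternative sketch: one request per page, all selectors extracted at once
scrape_page <- function(page_url) {
  web <- read_html(page_url)
  vals <- lapply(zmienne, function(sel) html_text(html_nodes(web, sel)))
  names(vals) <- zmienne_nazwy
  vals
}

all_pages <- lapply(list_of_pages, scrape_page)
# e.g. concatenate the raw "price" strings from every page
price_alt <- unlist(lapply(all_pages, `[[`, "price"))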

#### cleaning data ####

# mark: drop newlines and keep only the first word of the offer title (the make)
mark <- gsub("\n", "", mark)
mark <- word(str_trim(mark), 1)

# price: strip "PLN" and spaces, convert to numeric
price <- gsub("PLN\n", "", price)
price <- as.numeric(gsub(" ", "", price))

# year: remove spaces and convert to numeric
year <- as.numeric(gsub(" ", "", year))

# mileage: strip the "km" unit and spaces, convert to numeric
mileage <- gsub("km", "", mileage)
mileage <- as.numeric(gsub(" ", "", mileage))

# capacity: strip the "cm3" unit and spaces, convert to numeric
capacity <- gsub("cm3", "", capacity)
capacity <- as.numeric(gsub(" ", "", capacity))

# location: keep only the voivodeship, i.e. the text inside the parentheses
location <- regmatches(location, gregexpr("(?<=\\().*?(?=\\))", location, perl = TRUE))
location <- unlist(location)
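
The location shown on a listing has roughly the form "City (Voivodeship)" (format assumed from the pattern above), so the look-around regular expression keeps only the text between the parentheses. A small illustration on a made-up value:

x <- "Lublin (Lubelskie)"
regmatches(x, gregexpr("(?<=\\().*?(?=\\))", x, perl = TRUE))
# returns list("Lubelskie"), which is why unlist() is applied above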


# combine the cleaned vectors into one data frame
data <- data.frame(id = 1:length(mark), mark, price, year, mileage, capacity, fuel, location)
dane <- data  # keep a backup copy
data$fuel2 <- as.character(data$fuel)

# recode the fuel variable:
# Benzyna (petrol)           -> 1
# Benzyna+LPG (petrol + LPG) -> 2
# Diesel                     -> 3

data$fuel2[data$fuel2=="Benzyna"]<-1
data$fuel2[data$fuel2=="Diesel"]<-3
data$fuel2[data$fuel2=="Benzyna+LPG"]<-2

# location: collect the distinct voivodeship names
a <- table(data$location)
a <- names(a)
data$location2 <- as.character(data$location)

# replace each voivodeship name with its index in a
for (i in seq_along(a)) {
  data$location2[data$location2 == a[i]] <- i
}
data$fuel2 <- as.numeric(data$fuel2)
data$location2 <- as.numeric(data$location2)
data$age <- 2019 - as.numeric(as.character(data$year))
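
The numbering loop above can also be collapsed into a single match() call, since a already holds the distinct voivodeship names; an equivalent sketch:

# equivalent sketch: index of each voivodeship within a
data$location2 <- match(as.character(data$location), a)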

As a result, I obtain observations with the following variables:

##         mark price year mileage capacity        fuel           location
## 1       Seat 24900 2009  198000     2000     Benzyna          Lubelskie
## 2 Volkswagen 25000 2009  188000     1968      Diesel        Mazowieckie
## 3       Opel 17900 2009  190000     1598 Benzyna+LPG            Łódzkie
## 4       Saab 22999 2009  180000     1910      Diesel           Lubuskie
## 5        BMW 24900 2009  163000     1995      Diesel Zachodniopomorskie
## 6       Saab 19999 2009  181000     1910      Diesel          Pomorskie