In this paper I present the process of web scraping data from the Polish portal Otomoto.pl, which I then used for further analysis.
# General-purpose data wrangling
library(tidyverse)
# Parsing of HTML/XML files
library(rvest)
# String manipulation
library(stringr)
# Verbose regular expressions
library(rebus)
# Eases DateTime manipulation
library(lubridate)
# Data manipulation (already attached with tidyverse; loaded explicitly here)
library(dplyr)
# search URL with the chosen filters (segment, year, price, mileage, engine, fuel type, condition, country)
url<-"https://www.otomoto.pl/osobowe/seg-city-car--seg-combi--seg-compact--seg-minivan--seg-sedan/od-2009/?search%5Bfilter_float_price%3Afrom%5D=15000&search%5Bfilter_float_price%3Ato%5D=25000&search%5Bfilter_float_year%3Ato%5D=2018&search%5Bfilter_float_mileage%3Afrom%5D=100000&search%5Bfilter_float_mileage%3Ato%5D=200000&search%5Bfilter_float_engine_capacity%3Afrom%5D=1000&search%5Bfilter_float_engine_capacity%3Ato%5D=2000&search%5Bfilter_float_engine_power%3Afrom%5D=80&search%5Bfilter_float_engine_power%3Ato%5D=200&search%5Bfilter_enum_fuel_type%5D%5B0%5D=petrol&search%5Bfilter_enum_fuel_type%5D%5B1%5D=diesel&search%5Bfilter_enum_fuel_type%5D%5B2%5D=petrol-lpg&search%5Bfilter_enum_damaged%5D=0&search%5Bfilter_enum_registered%5D=1&search%5Bfilter_enum_no_accident%5D=1&search%5Border%5D=filter_float_engine_power%3Adesc&search%5Bbrand_program_id%5D%5B0%5D=&search%5Bcountry%5D=polska"
web <- read_html(url)
# find the number of the last results page (the last ".page" node holds the highest page number)
pages <- html_nodes(web, ".page")
pages <- html_text(pages)
pages <- as.numeric(pages[length(pages)])
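# Optional safeguard (a sketch, assuming a single results page may have no ".page" nodes):
# fall back to one page so the scraping loop below still runs.
if (length(pages) == 0 || is.na(pages)) pages <- 1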
# scraping(): collects the text of a given CSS selector (nazwa) from every results page;
# it relies on the vector list_of_pages created below
scraping <- function(nazwa) {
  result <- character(0)
  for (i in 1:pages) {
    web <- read_html(list_of_pages[i])
    value <- html_nodes(web, nazwa)
    value <- html_text(value)
    result <- c(result, value)
  }
  return(result)
}
# create URLs for all results pages
list_of_pages <- str_c(url, '&page=', 1:pages)
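# A politer variant of scraping() (a sketch, not used below): pause between requests so
# the server is not flooded; the 1-second delay is an arbitrary choice.
scraping_polite <- function(nazwa) {
  result <- character(0)
  for (i in 1:pages) {
    Sys.sleep(1) # throttle requests
    web <- read_html(list_of_pages[i])
    result <- c(result, html_text(html_nodes(web, nazwa)))
  }
  return(result)
}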
# CSS selectors for the variables to scrape and the names they will be stored under
zmienne <- c(".offer-title__link", ".offer-price__number",
             ".offer-item__params-item:nth-child(1) span",
             ".offer-item__params-item:nth-child(2) span",
             ".offer-item__params-item:nth-child(3) span",
             ".offer-item__params-item:nth-child(4) span",
             ".offer-item__location h4")
zmienne_nazwy <- c("mark", "price", "year", "mileage", "capacity", "fuel", "location")
# scrape the data for each variable and store it under the corresponding name
start_time <- Sys.time()
for (i in seq_along(zmienne)) {
  nam <- scraping(zmienne[i])
  assign(zmienne_nazwy[i], nam)
}
end_time <- Sys.time()
end_time - start_time # total scraping time
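# Quick consistency check (a sketch): every scraped vector should have one entry per
# offer, otherwise data.frame() below would recycle values or fail.
sapply(zmienne_nazwy, function(x) length(get(x)))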
#### cleaning the scraped data ####
mark <- gsub("\n", "", mark)
mark <- word(str_trim(mark), 1) # keep only the brand name
price <- gsub("PLN\n", "", price)
price <- as.numeric(gsub(" ", "", price)) # drop thousands separators
year <- as.numeric(gsub(" ", "", year))
mileage <- gsub("km", "", mileage)
mileage <- as.numeric(gsub(" ", "", mileage))
capacity <- gsub("cm3", "", capacity)
capacity <- as.numeric(gsub(" ", "", capacity))
# keep only the voivodeship name given in parentheses
location <- regmatches(location, gregexpr("(?<=\\().*?(?=\\))", location, perl = TRUE))
location <- unlist(location)
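# Sanity check (a sketch): if an offer had no "(...)" part, unlist() silently drops it
# and location would no longer line up with the other vectors.
length(location) == length(mark)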
data <- data.frame(id = c(1:length(mark)), mark, price, year, mileage, capacity, fuel, location)
dane <- data # save a copy of the raw data frame
data$fuel2 <- as.character(data$fuel)
# recode the fuel variable:
#   Benzyna (petrol)         -> 1
#   Benzyna+LPG (petrol+LPG) -> 2
#   Diesel                   -> 3
data$fuel2[data$fuel2 == "Benzyna"] <- 1
data$fuel2[data$fuel2 == "Diesel"] <- 3
data$fuel2[data$fuel2 == "Benzyna+LPG"] <- 2
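# An equivalent, more compact recode (a sketch; gives the same 1/2/3 coding as above) using dplyr:
data <- data %>%
  mutate(fuel2 = case_when(fuel == "Benzyna"     ~ 1,
                           fuel == "Benzyna+LPG" ~ 2,
                           fuel == "Diesel"      ~ 3))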
# location: map each voivodeship name to a numeric code
a <- table(data$location)
a <- names(a) # alphabetically sorted voivodeship names
data$location2 <- as.character(data$location)
for (i in seq_along(a)) {
  data$location2[data$location2 == a[i]] <- i
}
data$fuel2 <- as.numeric(data$fuel2)
data$location2 <- as.numeric(data$location2)
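# The same mapping in one line (a sketch): match() returns the position of each
# voivodeship in the sorted vector a, which is exactly the numeric code used above.
data$location2 <- match(as.character(data$location), a)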
data$age <- 2019 - as.numeric(as.character(data$year)) # car age as of 2019
As a result, I obtain observations with the following variables (first rows shown below):
## mark price year mileage capacity fuel location
## 1 Seat 24900 2009 198000 2000 Benzyna Lubelskie
## 2 Volkswagen 25000 2009 188000 1968 Diesel Mazowieckie
## 3 Opel 17900 2009 190000 1598 Benzyna+LPG Łódzkie
## 4 Saab 22999 2009 180000 1910 Diesel Lubuskie
## 5 BMW 24900 2009 163000 1995 Diesel Zachodniopomorskie
## 6 Saab 19999 2009 181000 1910 Diesel Pomorskie