library(xml2)
library(rvest)
library(stringr)
library(jsonlite)
# Please make sure both .R files are in the same directory.
# Enter the Lazada URLs to scrape.
urlToScrap <- c(
  "https://www.lazada.com.my/products/iphone-12-64gb-128gb-256gb-i1559136225-s5247366377.html",
  "https://www.lazada.com.my/products/samsung-galaxy-m32-smartphone-8gb-ram-128gb-rom-1-year-samsung-warranty-free-shipping-i2301265385-s9789875175.html",
  "https://www.lazada.com.my/products/apple-ipad-pro-129-inch-5th-generation-wi-fi-i2233155396-s9460855321.html"
)

# Locate Scrapper.R (which defines Scrap()) next to this script.
# Inside RStudio the directory of the file open in the editor is used; when
# RStudio is not available (e.g. Rscript), fall back to the working directory
# instead of failing on the rstudioapi call.
scriptDir <- if (requireNamespace("rstudioapi", quietly = TRUE) &&
  rstudioapi::isAvailable()) {
  dirname(rstudioapi::getSourceEditorContext()$path)
} else {
  getwd()
}
source(file.path(scriptDir, "Scrapper.R"))

# Scrape every URL, then bind the per-product rows in one step rather than
# growing the data frame inside a loop. lapply() also copes with an empty URL
# vector, where 1:length(x) would iterate over c(1, 0).
scraped <- lapply(urlToScrap, Scrap)

# Seeding the bind with an empty data frame keeps `products` a data frame even
# when no URLs were supplied.
products <- do.call(rbind, c(list(data.frame()), scraped))
print(products)
## Title
## 1 iPhone 12 64GB / 128GB / 256GB
## 2 Samsung Galaxy M32 Smartphone (8GB RAM + 128GB ROM) 1 Year Samsung Warranty , Free Shipping
## 3 Apple iPad Pro 12.9-inch 5th Generation (Wi-Fi)
## Price Color Storage Brand Rating
## 1 RM3,399.00 PRODUCT(RED) N/A Apple 35 Ratings
## 2 RM949.00 Black 128GB Samsung 472 Ratings
## 3 RM4,599.00 Space Grey 128GB Apple 1 Ratings
# Scrapper.R file: defines Scrap(), which the script above loads via source()
#' Scrape one Lazada product page into a one-row data frame.
#'
#' Reads the page, pulls the text of a fixed set of CSS selectors and strips
#' carriage returns / line feeds from each piece of text.
#'
#' @param urlToScrap URL of the product page; passed directly to `read_html()`.
#' @return A data frame with exactly one row and the columns `Title`, `Price`,
#'   `Color`, `Storage`, `Brand` and `Rating`. A field whose selector matches
#'   nothing is `NA`; `Storage` is "N/A" when fewer than two variant headers
#'   are found on the page.
Scrap <- function(urlToScrap) {
  webpage <- read_html(urlToScrap)

  # Text of every node matching `css`, with CR/LF characters removed.
  node_text <- function(css) {
    str_replace_all(html_text(html_nodes(webpage, css)), "[\r\n]", "")
  }

  # Indexing with [1] keeps every field at length one: it picks the first
  # match and yields NA when the selector matched nothing, so data.frame()
  # below cannot fail on zero-length columns or produce extra rows.
  title <- node_text("div#module_product_title_1")[1]
  price <- node_text("span.pdp-price")[1]

  # Variant headers: the first is reported as the color, the second (when
  # present) as the storage size.
  variants <- node_text("div.sku-prop-content-header")
  color <- variants[1]
  storage <- if (length(variants) > 1) variants[2] else "N/A"

  # Brand text: drop everything from "More" onwards, then keep only what
  # follows the last colon.
  brand <- node_text("div#module_product_brand_1")
  brand <- gsub("\\More.*$", "", brand)
  brand <- trimws(sub(".*:", "", brand))[1]

  rating <- node_text("a.pdp-review-summary__link")[1]

  data.frame(
    Title = title,
    Price = price,
    Color = color,
    Storage = storage,
    Brand = brand,
    Rating = rating
  )
}