library(xml2)
library(rvest)
library(stringr)
library(jsonlite)

#Please make sure both the .R files are in the same directory.

# Enter the Lazada product URLs to scrape
urlToScrap <- c(
  "https://www.lazada.com.my/products/iphone-12-64gb-128gb-256gb-i1559136225-s5247366377.html",
  "https://www.lazada.com.my/products/samsung-galaxy-m32-smartphone-8gb-ram-128gb-rom-1-year-samsung-warranty-free-shipping-i2301265385-s9789875175.html",
  "https://www.lazada.com.my/products/apple-ipad-pro-129-inch-5th-generation-wi-fi-i2233155396-s9460855321.html"
)

# Load Scrap() from Scrapper.R, which is expected to sit in the same directory
# as this script.
# NOTE(review): rstudioapi::getSourceEditorContext() needs a running RStudio
# session, so the script presumably has to be run from inside RStudio.
scriptDir <- dirname(rstudioapi::getSourceEditorContext()$path)
source(file.path(scriptDir, "Scrapper.R"))

# Scrape every URL and stack the per-product rows into one data frame.
# Seeding the rbind() with an empty data frame keeps `products` a data frame
# even when `urlToScrap` is empty; the previous `1:length(urlToScrap)` loop
# would have iterated over c(1, 0) in that case.
products <- do.call(
  rbind,
  c(list(data.frame()), lapply(urlToScrap, Scrap))
)

print(products)
##                                                                                         Title
## 1                                                              iPhone 12 64GB / 128GB / 256GB
## 2 Samsung Galaxy M32 Smartphone (8GB RAM + 128GB ROM) 1 Year Samsung Warranty , Free Shipping
## 3                                             Apple iPad Pro 12.9-inch 5th Generation (Wi-Fi)
##        Price        Color Storage   Brand      Rating
## 1 RM3,399.00 PRODUCT(RED)     N/A   Apple  35 Ratings
## 2   RM949.00        Black   128GB Samsung 472 Ratings
## 3 RM4,599.00   Space Grey   128GB   Apple   1 Ratings


# Store data in JSON format

# Serialise the scraped products to pretty-printed JSON and echo it.
# TRUE is spelled out because the shortcut `T` is an ordinary variable that
# can be reassigned.
json_data <- toJSON(products, pretty = TRUE)
cat(json_data)
## [
##   {
##     "Title": "iPhone 12 64GB / 128GB / 256GB",
##     "Price": "RM3,399.00",
##     "Color": "PRODUCT(RED)",
##     "Storage": "N/A",
##     "Brand": "Apple",
##     "Rating": "35 Ratings"
##   },
##   {
##     "Title": "Samsung Galaxy M32 Smartphone (8GB RAM + 128GB ROM) 1 Year Samsung Warranty , Free Shipping",
##     "Price": "RM949.00",
##     "Color": "Black",
##     "Storage": "128GB",
##     "Brand": "Samsung",
##     "Rating": "472 Ratings"
##   },
##   {
##     "Title": "Apple iPad Pro 12.9-inch 5th Generation (Wi-Fi)",
##     "Price": "RM4,599.00",
##     "Color": "Space Grey",
##     "Storage": "128GB",
##     "Brand": "Apple",
##     "Rating": "1 Ratings"
##   }
## ]
# Save the JSON text as products.json inside the script's directory.
write(json_data, file = file.path(scriptDir, "products.json"))



# Scrapper.R file (defines Scrap(), which the script above loads via source())

# Scrape the key details of a single product page.
#
# Downloads the page at `urlToScrap`, extracts the title, price, variant
# headers (used as colour and storage), brand and rating text via CSS
# selectors, removes line breaks and returns the values as a data frame.
#
# Args:
#   urlToScrap: URL of the product page; handed to read_html().
#
# Returns:
#   A data frame with the columns Title, Price, Color, Storage, Brand and
#   Rating. Title, Brand and Rating are NA when their selector matches
#   nothing; Storage is "N/A" when fewer than two variant headers are found.
Scrap <- function(urlToScrap) {
  webpage <- read_html(urlToScrap)

  # Text of every node matching `css`, with CR/LF characters removed.
  node_text <- function(css) {
    str_replace_all(html_text(html_nodes(webpage, css)), "[\r\n]", "")
  }

  # A selector that matches nothing yields character(0), which would make
  # data.frame() fail on differing row counts; use NA for that field instead.
  or_na <- function(x) {
    if (length(x) == 0L) NA_character_ else x
  }

  formattedTitle <- node_text("div#module_product_title_1")

  # Only the first price node is kept; indexing an empty result gives NA.
  formattedPrice <- node_text("span.pdp-price")[1]

  # Variant headers: the first is treated as the colour, the second (when
  # present) as the storage size.
  formattedType <- node_text("div.sku-prop-content-header")
  color <- formattedType[1]
  storage <- if (length(formattedType) > 1) formattedType[2] else "N/A"

  # Brand text: presumably drops a trailing "More ..." part (pattern kept
  # exactly as it was), then keeps what follows the last ":" and trims it.
  formattedBrand <- node_text("div#module_product_brand_1")
  formattedBrand <- gsub("\\More.*$", "", formattedBrand)
  formattedBrand <- trimws(sub(".*:", "", formattedBrand))

  formattedRating <- node_text("a.pdp-review-summary__link")

  data.frame(
    Title = or_na(formattedTitle),
    Price = formattedPrice,
    Color = color,
    Storage = storage,
    Brand = or_na(formattedBrand),
    Rating = or_na(formattedRating)
  )
}