# Load the required packages
library(xml2)
library(rvest)
library(stringr)
# URL of the product page to be scraped
url <- "https://www.amazon.sg/Sony-ILCEM3K-Full-frame-Mirrorless-Interchangeable-Lens/dp/B07B45D8WV/ref=sr_1_1?keywords=camera&qid=1636541556&sr=8-1"
# Download the page and parse its HTML
webpage <- url %>% read_html()
# Select the product-title span and extract its raw text
title_html <- webpage %>% html_nodes("span#productTitle")
title <- title_html %>% html_text()
# Preview the raw title (still contains the surrounding line breaks)
head(title)
## [1] "\n\n\n\n\n\n\n\nSony a7 III (ILCEM3K/B) Full-frame Mirrorless Interchangeable-Lens Camera with 28-70mm Lens with 3-Inch LCD, Black\n\n\n\n\n\n\n"
##
##
##
##
##
##
##
##
## Sony a7 III (ILCEM3K/B) Full-frame Mirrorless Interchangeable-Lens Camera with 28-70mm Lens with 3-Inch LCD, Black
# Clean the scraped title: strip line breaks, trim surrounding whitespace,
# and keep only the first match. Indexing with [1] guarantees a length-one
# value (NA when the selector matched nothing), so the one-row data frame
# built later cannot fail on a length mismatch; this mirrors how the price
# and rating are reduced to a single element.
title <- str_replace_all(title, "[\r\n]", "")
title <- str_trim(title)[1]
# Product price: collect the text of every span with class "a-offscreen"
price_html <- webpage %>% html_nodes("span.a-offscreen")
price <- price_html %>% html_text()
# Strip line breaks, then keep only the first matched element
price <- price %>% str_replace_all("[\r\n]", "")
price <- price[1]
# Show the price
head(price)
## [1] "S$3,204.85"
## S$3,204.85
# Scrape the product description block
desc_html <- html_nodes(webpage, 'div#productDescription')
desc <- html_text(desc_html)
# Strip line breaks and trim surrounding whitespace
desc <- str_replace_all(desc, "[\r\n]" , "")
# Keep only the first match. When the page has no description block the
# selector returns nothing, and [1] turns that empty vector into a single NA
# so the one-row data frame built later still gets a length-one column
# (the price and rating are reduced the same way).
desc <- str_trim(desc)[1]
# Show the description
head(desc)
## [1] "Style:w/ 28-70mm | Configuration:BaseAdvanced 24.2MP Full-frame Image Sensor w/ 1.8X readout speed Advanced 24.2MP Back-Illuminated 35mm Full-frame Image Sensor. Improved AF and tracking plus up to 10fps continuous shooting capture decisive moments. Reliable operability ensures confident shooting. A light, compact body enhances mobility. 4K HDR3 4 movie recording capability."
# Product rating: text of the span with id "acrPopover"
rate_html <- webpage %>% html_nodes("span#acrPopover")
rate <- rate_html %>% html_text()
# Strip line breaks, trim both ends, and keep only the first match
rate <- rate %>%
  str_replace_all("[\r\n]", "") %>%
  str_trim()
rate <- rate[1]
# Show the rating
head(rate)
## [1] "4.8 out of 5 stars"
# Scrape the product color from the product-overview section
color_html <- html_nodes(webpage, 'div#productOverview_feature_div')
color_html <- html_nodes(color_html, 'span.a-size-base')
# Trimmed text of every overview span, in document order
overview_text <- str_trim(html_text(color_html))
# Locate the value through its label instead of relying only on a fixed
# position, which silently picks a different attribute when the overview has
# more or fewer rows.
# NOTE(review): assumes each label span ("Color"/"Colour") is immediately
# followed by its value span -- confirm against the live page markup.
color_label_pos <- which(
  grepl("^colou?r:?$", overview_text, ignore.case = TRUE)
)[1]
# No label found: fall back to position 12, the position this script has
# always used, so the reference page keeps yielding the same value.
color_pos <- if (is.na(color_label_pos)) 12 else color_label_pos + 1
# Out-of-range positions yield NA rather than an error
color <- overview_text[color_pos]
# Show the product color
head(color)
## [1] "Black"
# Assemble the scraped fields into a data frame, one column per field
product_data <- data.frame(
  Title = title,
  Price = price,
  Description = desc,
  Rating = rate,
  Color = color
)
# Inspect the structure of the resulting data frame
str(product_data)
## 'data.frame': 1 obs. of 5 variables:
## $ Title : chr "Sony a7 III (ILCEM3K/B) Full-frame Mirrorless Interchangeable-Lens Camera with 28-70mm Lens with 3-Inch LCD, Black"
## $ Price : chr "S$3,204.85"
## $ Description: chr "Style:w/ 28-70mm | Configuration:BaseAdvanced 24.2MP Full-frame Image Sensor w/ 1.8X readout speed Advanced 24."| __truncated__
## $ Rating : chr "4.8 out of 5 stars"
## $ Color : chr "Black"
# Load jsonlite, which provides toJSON() for the conversion below
library(jsonlite)
# Serialise the data frame to JSON
json_data <- product_data %>% toJSON()
# Write the JSON text to the console
cat(json_data)
## [{"Title":"Sony a7 III (ILCEM3K/B) Full-frame Mirrorless Interchangeable-Lens Camera with 28-70mm Lens with 3-Inch LCD, Black","Price":"S$3,204.85","Description":"Style:w/ 28-70mm | Configuration:BaseAdvanced 24.2MP Full-frame Image Sensor w/ 1.8X readout speed Advanced 24.2MP Back-Illuminated 35mm Full-frame Image Sensor. Improved AF and tracking plus up to 10fps continuous shooting capture decisive moments. Reliable operability ensures confident shooting. A light, compact body enhances mobility. 4K HDR3 4 movie recording capability.","Rating":"4.8 out of 5 stars","Color":"Black"}]
# The End, Thanks!