# title: "Tutorial 3 Q2" | author: "Noraziah Suliman" | date: "11/21/2021"
# Attach the packages the rest of this script relies on. read_html(),
# html_nodes() and html_text() are provided via rvest; str_replace_all()
# comes from stringr. Without these calls a fresh R session stops at the
# first read_html() with "could not find function".
library(rvest)
library(stringr)

# URL of the product page to be scraped.
url <- "https://www.amazon.com/Samsung-SM-A505G-Factory-Unlocked-Renewed/dp/B07RL93F7T/ref=sr_1_3?keywords=iphone%2B4gb%2Bram%2B64gb&qid=1637416288&sr=8-3&th=1"

# Download the page and parse its HTML; every selector below queries this
# parsed document.
webpage <- read_html(url)
# Product title: select the <h1 id="title"> node(s) from the parsed page.
title_html <- html_nodes(x = webpage, css = "h1#title")
# Extract the node text and strip carriage returns / line feeds. Only those
# two characters are removed; any other whitespace is left untouched.
title <- str_replace_all(
  string = html_text(title_html),
  pattern = "[\r\n]",
  replacement = ""
)
# Show the scraped title.
head(title)
## [1] "Samsung Galaxy A50 SM-A505G 64GB 4GB RAM 25 MP 6.4\" Factory Unlocked- Black (Renewed)"
# Product price: select the <span id="price_inside_buybox"> node(s).
price_html <- html_nodes(x = webpage, css = "span#price_inside_buybox")
# Extract the node text and strip carriage returns / line feeds only.
price <- str_replace_all(
  string = html_text(price_html),
  pattern = "[\r\n]",
  replacement = ""
)
# Show the scraped price.
head(price)
## [1] "$169.00"
# Product color: select the expanded "color_name" dimension text span.
color_html <- html_nodes(
  x = webpage,
  css = "span#inline-twister-expanded-dimension-text-color_name"
)
# Extract the node text and strip carriage returns / line feeds only.
color <- str_replace_all(
  string = html_text(color_html),
  pattern = "[\r\n]",
  replacement = ""
)
# Show the scraped color.
head(color)
## [1] "Black"
# Product size: select the expanded "size_name" dimension text span.
size_html <- html_nodes(
  x = webpage,
  css = "span#inline-twister-expanded-dimension-text-size_name"
)
# Extract the node text and strip carriage returns / line feeds only.
size <- str_replace_all(
  string = html_text(size_html),
  pattern = "[\r\n]",
  replacement = ""
)
# Show the scraped size.
head(size)
## [1] "64GB"
# Product rating: select the <span id="acrPopover"> node(s).
rate_html <- html_nodes(x = webpage, css = "span#acrPopover")
# Extract the node text and strip carriage returns / line feeds only.
# Spaces and tabs are NOT removed here; wrap the result in str_trim() and
# assign it back if surrounding whitespace ever needs to be dropped.
rate <- str_replace_all(
  string = html_text(rate_html),
  pattern = "[\r\n]",
  replacement = ""
)
# Show the scraped rating. The recorded output below has two identical
# entries, i.e. the selector matched two nodes on this page.
head(rate)
## [1] "4.1 out of 5 stars" "4.1 out of 5 stars"
# Assemble the scraped title, price, size and color into a data frame, one
# column per field. The rating vector is not included.
product_data <- data.frame(
  Title = title,
  Price = price,
  Size = size,
  Color = color
)
# Inspect the structure of the resulting data frame.
str(object = product_data)
## 'data.frame': 1 obs. of 4 variables:
## $ Title: chr "Samsung Galaxy A50 SM-A505G 64GB 4GB RAM 25 MP 6.4\" Factory Unlocked- Black (Renewed)"
## $ Price: chr "$169.00"
## $ Size : chr "64GB"
## $ Color: chr "Black"
# jsonlite provides toJSON() for serialising R objects as JSON.
library(jsonlite)
# Serialise the product data frame to a JSON string.
json_data <- toJSON(x = product_data)
# Write the JSON text to the console.
cat(json_data)
## [{"Title":"Samsung Galaxy A50 SM-A505G 64GB 4GB RAM 25 MP 6.4\" Factory Unlocked- Black (Renewed)","Price":"$169.00","Size":"64GB","Color":"Black"}]