Andrea | Andrés | Arlette | Carlos
Para obtener los reviews de los productos de Amazon, primero debemos obtener el código ASIN de cada producto, ya que las URL de Amazon se forman a partir de este código.
Por ejemplo, la URL https://www.amazon.com/dp/B00P1L7BPU nos llevará a la página del libro “La casa de los espíritus [The House of the Spirits]”, donde B00P1L7BPU es el código ASIN. Entre los libros incluidos en la evaluación están: La casa de los espíritus [The House of the Spirits], Cosmos: A Personal Voyage y Why We Sleep: Unlocking the Power of Sleep and Dreams.
library(RCurl) library(XML) library(stringr) library(rvest) library(purrr) library(knitr) library(lubridate) library(robotstxt) library(polite) library(tidyverse)
Vamos a utilizar la siguiente función para extraer varias características (título del producto, autor del review, título del review, estrellas, fecha y comentario), con el fin de poder analizar posteriormente estos datos.
# Function to scrape elements from Amazon reviews
scrape_amazon <- function(url, throttle = 5){
# Set throttle between URL calls
sec = 0
if(throttle < 0) warning("throttle was less than 0: set to 0")
if(throttle > 0) sec = max(0, throttle + runif(1, -1, 1))
session <- bow(url, force = TRUE)
# obtain HTML of URL
doc <- scrape(session)
# # Parse relevant elements from HTML
# title <- doc %>%
# html_nodes(".a-color-base") %>%
# html_text()
# title<- title[10:length(title)]
#
author <- doc %>%
html_nodes(".a-profile-name") %>%
html_text()
author <- author[-1:-2]
date <- doc %>%
html_nodes(".review-date") %>%
html_text() %>%
gsub(".*on ", "", .)
date <- date[-1:-2]
review_format <- doc %>%
html_nodes(".review-format-strip") %>%
html_text()
stars <- doc %>%
html_nodes(".review-rating") %>%
html_text() %>%
str_extract("\\d") %>%
as.numeric()
stars <- stars[-1:-2]
comments <- doc %>%
html_nodes(".review-text") %>%
html_text()
# Combine attributes into a single data frame
df <- data.frame(author, date, review_format, stars, comments, stringsAsFactors = F)
return(df)
}
Con esta función (ya precargada en el tutorial) podemos extraer los datos mencionados.
# Poner el número de páginas deseadas
pages <- 2
# Objeto vacio para guardar los datos
reviews_all <- NULL
# iterar sobre cada página
prod_code <- c("B00P1L7BPU","B06XTYCPST","B0752XRB5F")
# ciclo para realizar scrapping de características de cada producto en prod_code
for(i in 1:length(prod_code)){
url <- paste0("https://www.amazon.com/product-reviews/",prod_code[i],
"/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews")
#obtener el nombre del producto y realizar una limpieza del mismo
session <- bow(url, force = TRUE)
prod <- scrape(session) %>%
html_nodes( "a.a-link-normal") %>%
html_text() %>%
gsub("\n", "", .) %>%
trimws()
prod<- prod[1]
for(page_num in 1:pages){
url <- paste0("https://www.amazon.com/product-reviews/",
prod_code[i],"/ref=cm_cr_arp_d_paging_btm_next_",
page_num,
"?ie=UTF8&reviewerType=all_reviews&pageNumber=",
page_num)
reviews <- scrape_amazon(url, throttle = 3)
reviews_all <- rbind(reviews_all, cbind(prod, reviews))
#books_reviews[[i]]<- reviews_all
}
#str(reviews_all)
}
La casa de los espíritus summary(libro1$stars)
Cosmos: A Personal Voyage summary(libro2$stars)
Why We Sleep: Unlocking the Power of Sleep and Dreams summary(libro3$stars)
libro1 <- subset(reviews_all, reviews_all$prod == "La casa de los espíritus [The House of the Spirits]")
libro2 <- subset(reviews_all, reviews_all$prod == "Cosmos: A Personal Voyage")
libro3 <- subset(reviews_all, reviews_all$prod == "Why We Sleep: Unlocking the Power of Sleep and Dreams")
summary(libro1$stars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 4.0 5.0 4.4 5.0 5.0
summary(libro2$stars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 5.0 5.0 4.9 5.0 5.0
summary(libro3$stars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 3.00 3.25 5.00 5.00
library(dlookr)
## Warning: package 'dlookr' was built under R version 4.0.4
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:base':
##
## transform
library(dplyr)
boxplotLibros<- boxplot(stars~prod,
data=reviews_all,
main="*Stars by books*",
xlab="*Books*",
ylab="*Stars*",
col="gold",
border="blue"
)
print(boxplotLibros)
## $stats
## [,1] [,2] [,3]
## [1,] 5 3 1
## [2,] 5 4 2
## [3,] 5 5 3
## [4,] 5 5 5
## [5,] 5 5 5
##
## $n
## [1] 20 20 20
##
## $conf
## [,1] [,2] [,3]
## [1,] 5 4.646701 1.940104
## [2,] 5 5.353299 4.059896
##
## $out
## [1] 4 4 1
##
## $group
## [1] 1 1 2
##
## $names
## [1] "Cosmos: A Personal Voyage"
## [2] "La casa de los espíritus [The House of the Spirits]"
## [3] "Why We Sleep: Unlocking the Power of Sleep and Dreams"