#' Scrape the reviews shown on a single Amazon page
#'
#' Downloads `url` and extracts, via CSS selectors, the reviewer name, the
#' review date, the purchase-format strip, the star rating and the review
#' text, returning them side by side in one data frame.
#'
#' @param url Character scalar; address of the page to download.
#' @param throttle Approximate number of seconds to wait before the request.
#'   When positive, a uniform jitter drawn from [-1, 1] is added (the result
#'   is floored at 0). A negative value raises a warning and means no wait.
#' @return A data.frame with columns `author`, `date`, `review_format`,
#'   `stars` (numeric) and `comments`. `data.frame()` raises an error if the
#'   selectors match a different number of nodes.
scrape_amazon <- function(url, throttle = 0) {
  # Install / load relevant packages. requireNamespace() is the cheap way to
  # test for an installed package; installed.packages() scans every library.
  if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
  pacman::p_load(RCurl, XML, dplyr, stringr, rvest, purrr)

  # Set throttle between URL calls
  sec <- 0
  if (throttle < 0) warning("throttle was less than 0: set to 0")
  if (throttle > 0) sec <- max(0, throttle + runif(1, -1, 1))
  # Actually wait: previously `sec` was computed but never used, so the
  # `throttle` argument had no effect on the request rate.
  if (sec > 0) Sys.sleep(sec)

  # Obtain HTML of URL
  doc <- read_html(url)

  author <- doc %>%
    html_nodes(".a-profile-name") %>%
    html_text()

  # Drop everything up to and including the last "on " in the date text.
  date <- doc %>%
    html_nodes(".review-date") %>%
    html_text() %>%
    gsub(".*on ", "", .)

  review_format <- doc %>%
    html_nodes(".review-format-strip") %>%
    html_text()

  # The rating is taken as the first digit found in the rating text.
  stars <- doc %>%
    html_nodes(".review-rating") %>%
    html_text() %>%
    str_extract("\\d") %>%
    as.numeric()

  comments <- doc %>%
    html_nodes(".review-text") %>%
    html_text()

  # Combine attributes into a single data frame
  data.frame(
    author, date, review_format, stars, comments,
    stringsAsFactors = FALSE
  )
}
# ---- Product with ASIN B07PZTVLB3: scrape every review page ----
cod_asin <- "B07PZTVLB3"
web_url <- paste0("https://www.amazon.com/dp/", cod_asin, "/?pagenumber=1")

# Scrape the first page on its own to inspect the shape of the result.
comentarios <- scrape_amazon(web_url)
str(comentarios)
# Recorded run: 'data.frame' with 13 obs. of 5 variables (author, date,
# review_format, stars, comments); `comments` still carries the leading
# newlines and whitespace padding from the page markup.

pages <- 15

# Product title: text of the #productTitle node with newlines and outer
# whitespace removed.
product <- read_html(web_url) %>%
  html_nodes("#productTitle") %>%
  html_text() %>%
  gsub("\n", "", .) %>%
  trimws()
product
## [1] "Destroying Fear: Strategies to Overthrow the Enemy's Tactics and Walk in Total Freedom"

# Scrape each page and bind the results once. The previous loop rebuilt
# `web_url` for every page but never called scrape_amazon(), so the page-1
# reviews were appended `pages` times.
# NOTE(review): this block spells the query parameter `pagenumber`, while
# other blocks in this file use `pageNumber` -- confirm which one the site
# honours.
comentariostotales <- do.call(
  rbind,
  lapply(seq_len(pages), function(page_num) {
    page_url <- paste0(
      "https://www.amazon.com/dp/", cod_asin, "/?pagenumber=", page_num
    )
    page_reviews <- scrape_amazon(page_url, throttle = 3)
    # cbind() cannot recycle `product` against zero rows; rbind() skips NULL.
    if (nrow(page_reviews) == 0) {
      return(NULL)
    }
    cbind(product, page_reviews)
  })
)
# ---- Product with ASIN B08GZGCT6Q: scrape the first `pages` review pages ----
CodeID <- "B08GZGCT6Q"
url <- paste0("https://www.amazon.com/dp/", CodeID, "/?pageNumber=1")

# Single-page scrape of page 1.
reviews <- scrape_amazon(url)

pages <- 5

# Product title: text of the #productTitle node, newlines stripped and
# outer whitespace trimmed.
product <- trimws(
  gsub("\n", "", html_text(html_nodes(read_html(url), "#productTitle")))
)
product
## [1] "Mystery of the Power Words: Speak the Words That Move Mountains and Make Hell Tremble"

# Append each page's reviews, tagged with the product title, to the
# accumulator. Every request asks for a throttle of 3 seconds.
reviews_all <- NULL
for (page_num in seq_len(pages)) {
  url <- paste0("http://www.amazon.com/dp/", CodeID, "/?pageNumber=", page_num)
  reviews <- scrape_amazon(url, throttle = 3)
  reviews_all <- rbind(reviews_all, cbind(product, reviews))
}
# ---- Product with ASIN B0062Y7D26: scrape the first `pages` review pages ----
prod_code <- "B0062Y7D26"
url <- paste0("https://www.amazon.com/dp/", prod_code, "/?pageNumber=1")

# Single-page scrape of page 1, then show its structure.
reviews_Adriana <- scrape_amazon(url)
str(reviews_Adriana)
# Recorded run: 'data.frame' with 13 obs. of 5 variables (author, date,
# review_format, stars, comments); the review texts are a mix of Spanish and
# English and keep their leading newline/whitespace padding.

pages <- 5

# Product title: text of the #productTitle node, newlines stripped and
# outer whitespace trimmed.
product <- trimws(
  gsub("\n", "", html_text(html_nodes(read_html(url), "#productTitle")))
)
product
## [1] "La ladrona de libros (Spanish Edition)"

# Append each page's reviews, tagged with the product title, to the
# accumulator. Every request asks for a throttle of 3 seconds.
reviews_allAdriana <- NULL
for (page_num in seq_len(pages)) {
  url <- paste0("http://www.amazon.com/dp/", prod_code, "/?pageNumber=", page_num)
  reviews_Adriana <- scrape_amazon(url, throttle = 3)
  reviews_allAdriana <- rbind(reviews_allAdriana, cbind(product, reviews_Adriana))
}
# ---- Product with ASIN 0375851569: scrape every review page ----
cod_asin <- "0375851569"
web_url <- paste0("https://www.amazon.com/dp/", cod_asin, "/?pagenumber=1")

# Scrape the first page on its own to inspect the shape of the result.
CometariosDrS <- scrape_amazon(web_url)
str(CometariosDrS)
# Recorded run: 'data.frame' with 13 obs. of 5 variables (author, date,
# review_format, stars, comments); `comments` still carries the leading
# newlines and whitespace padding from the page markup.

pages <- 5

# Product title: text of the #productTitle node with newlines and outer
# whitespace removed.
product <- read_html(web_url) %>%
  html_nodes("#productTitle") %>%
  html_text() %>%
  gsub("\n", "", .) %>%
  trimws()
product
## [1] "Dr. Seuss's Beginner Book Collection (Cat in the Hat, One Fish Two Fish, Green Eggs and Ham, Hop on Pop, Fox in Socks)"

# Scrape each page and bind the results once. The previous loop rebuilt
# `web_url` for every page but never called scrape_amazon(), so the page-1
# reviews were appended `pages` times.
# NOTE(review): this block spells the query parameter `pagenumber`, while
# other blocks in this file use `pageNumber` -- confirm which one the site
# honours.
ComentTot_DrSeuss <- do.call(
  rbind,
  lapply(seq_len(pages), function(page_num) {
    page_url <- paste0(
      "https://www.amazon.com/dp/", cod_asin, "/?pagenumber=", page_num
    )
    page_reviews <- scrape_amazon(page_url, throttle = 3)
    # cbind() cannot recycle `product` against zero rows; rbind() skips NULL.
    if (nrow(page_reviews) == 0) {
      return(NULL)
    }
    cbind(product, page_reviews)
  })
)
# ---- Combine the four review tables and count reviews per product/rating ----

# Print the column names of each table before stacking them.
names(comentariostotales)
names(reviews_all)
names(reviews_allAdriana)
names(ComentTot_DrSeuss)
# Recorded run: every call printed the same six names --
# "product" "author" "date" "review_format" "stars" "comments".

# Stack all products into one table.
df_juntos <- bind_rows(
  comentariostotales,
  reviews_all,
  reviews_allAdriana,
  ComentTot_DrSeuss
)

# Number of rows per (product, stars) pair, largest count first.
analisis <- arrange(
  summarise(group_by(df_juntos, product, stars), Cantidad = n()),
  desc(Cantidad)
)
## `summarise()` regrouping output by 'product' (override with `.groups` argument)
analisis
## # A tibble: 9 x 3
## # Groups: product [4]
## product stars Cantidad
## <chr> <dbl> <int>
## 1 Destroying Fear: Strategies to Overthrow the Enemy's Tactics a~ 5 195
## 2 La ladrona de libros (Spanish Edition) 5 50
## 3 Dr. Seuss's Beginner Book Collection (Cat in the Hat, One Fish~ 5 45
## 4 Mystery of the Power Words: Speak the Words That Move Mountain~ 5 45
## 5 La ladrona de libros (Spanish Edition) 4 15
## 6 Dr. Seuss's Beginner Book Collection (Cat in the Hat, One Fish~ 2 10
## 7 Dr. Seuss's Beginner Book Collection (Cat in the Hat, One Fish~ 4 10
## 8 Mystery of the Power Words: Speak the Words That Move Mountain~ 1 5
## 9 Mystery of the Power Words: Speak the Words That Move Mountain~ 4 5
# ---- Per-product rating counts and box plots ----
glimpse(comentariostotales)
# Recorded run: 195 rows, 6 columns (product, author, date, review_format,
# stars, comments).

# Each summary table below holds one row per distinct star value with its
# review count. The box plots are drawn from the raw per-review `stars`
# column: plotting the summary table's `stars` column (as was done before)
# only shows the set of distinct ratings and ignores how often each occurs.

# Destroying Fear
DestroyingFear <- comentariostotales %>%
  group_by(stars) %>%
  summarise(cantidad = n())
boxplot(
  comentariostotales$stars,
  main = "Destroying Fear: Strategies to Overthrow the Enemy's Tactics and Walk in Total Freedom"
)

# Mystery of the Power Words
MysteryofthePowerWords <- reviews_all %>%
  group_by(stars) %>%
  summarise(cantidad = n())
boxplot(
  reviews_all$stars,
  main = "Mystery of the Power Words: Speak the Words That Move Mountains and Make Hell Tremble"
)

# La ladrona de libros
Laladronadelibros <- reviews_allAdriana %>%
  group_by(stars) %>%
  summarise(cantidad = n())
boxplot(
  reviews_allAdriana$stars,
  main = "La ladrona de libros (Spanish Edition)"
)

# Dr. Seuss's Beginner Book Collection
Dr_Seuss <- ComentTot_DrSeuss %>%
  group_by(stars) %>%
  summarise(cantidad = n())
boxplot(
  ComentTot_DrSeuss$stars,
  main = "Dr. Seuss's Beginner Book Collection"
)