Usamos una función para extraer varias características (título del producto, autor del review, título del review, estrellas, fecha y comentario), con el fin de poder analizar posteriormente estos datos.

#' Scrape the reviews shown on an Amazon page into a data frame
#'
#' Downloads `url`, extracts reviewer name, review date, review format,
#' star rating and review text with CSS selectors, and returns one row per
#' review.
#'
#' @param url Page address handed to `read_html()`.
#' @param throttle Approximate pause, in seconds, before the request is made.
#'   A positive value is jittered by up to one second either way; zero means
#'   no pause; a negative value raises a warning and is treated as zero.
#' @return A data frame with columns `author`, `date`, `review_format`,
#'   `stars` (numeric) and `comments`.
scrape_amazon <- function(url, throttle = 0){

  # Install / load relevant packages. p_load() attaches them as well; the
  # top-level code in this file uses `%>%`, read_html(), bind_rows(), ...
  # without library() calls of its own, so this line must stay.
  if (!"pacman" %in% installed.packages()[, "Package"]) install.packages("pacman")
  pacman::p_load(RCurl, XML, dplyr, stringr, rvest, purrr)

  # Work out the pause between URL calls
  sec <- 0
  if (throttle < 0) warning("throttle was less than 0: set to 0")
  if (throttle > 0) sec <- max(0, throttle + runif(1, -1, 1))

  # Actually wait: previously `sec` was computed but never used, so the
  # `throttle` argument had no effect.
  if (sec > 0) Sys.sleep(sec)

  # Obtain HTML of URL
  doc <- read_html(url)

  author <- doc %>%
    html_nodes(".a-profile-name") %>%
    html_text()

  # Keep only what follows the last "on " in the date line
  date <- doc %>%
    html_nodes(".review-date") %>%
    html_text() %>%
    gsub(".*on ", "", .)

  review_format <- doc %>%
    html_nodes(".review-format-strip") %>%
    html_text()

  # First digit of the rating text, as a number
  stars <- doc %>%
    html_nodes(".review-rating") %>%
    html_text() %>%
    str_extract("\\d") %>%
    as.numeric()

  comments <- doc %>%
    html_nodes(".review-text") %>%
    html_text()

  # data.frame() recycles shorter vectors when the lengths divide evenly,
  # which would pair fields from different reviews without any signal.
  field_lengths <- lengths(list(author, date, review_format, stars, comments))
  if (length(unique(field_lengths)) > 1) {
    warning(
      "scraped fields have different lengths (",
      paste(field_lengths, collapse = ", "),
      "): rows may be misaligned",
      call. = FALSE
    )
  }

  # Combine attributes into a single data frame
  data.frame(author, date, review_format, stars, comments,
             stringsAsFactors = FALSE)
}

Libro Destroying Fear

# ASIN of the first book and the address of its first page.
cod_asin <- "B07PZTVLB3"
web_url <- sprintf("https://www.amazon.com/dp/%s/?pagenumber=1", cod_asin)

# Scrape that page and inspect the structure of the result.
comentarios <- scrape_amazon(web_url)
str(comentarios)
## 'data.frame':    13 obs. of  5 variables:
##  $ author       : chr  "Tisha Vargas" "Rick" "Michelle Casey" "Kindle Customer" ...
##  $ date         : chr  "October 3, 2019" "October 25, 2019" "October 7, 2019" "October 19, 2019" ...
##  $ review_format: chr  "Verified Purchase" "Verified Purchase" "Verified Purchase" "Verified Purchase" ...
##  $ stars        : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ comments     : chr  "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  This book had amazing insight.  It brought alot of things to light for me a"| __truncated__ "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  They don’t have enough stars as an option to give this book or any book of "| __truncated__ "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  Jesus used John Ramirez to absolutely transform my life through deliverance"| __truncated__ "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  This Book is for EVERYONE Sincere about Overcoming fear! I Highly recommend"| __truncated__ ...
# Number of pages to request for this book.
pages <- 15

# Accumulator for the reviews of all pages; starts empty.
comentariostotales <- NULL

# Product title: text of the #productTitle node with newlines removed and
# surrounding whitespace trimmed.
product <- trimws(
  gsub("\n", "", html_text(html_nodes(read_html(web_url), "#productTitle")))
)

product
## [1] "Destroying Fear: Strategies to Overthrow the Enemy's Tactics and Walk in Total Freedom"
# Scrape every page and stack the results under the product title.
# The previous loop built `web_url` but never fetched it, so the page-1 data
# frame `comentarios` was appended `pages` times.
# NOTE(review): the sibling loops in this file spell the query parameter
# "pageNumber"; confirm which spelling the site honours.
paginas <- vector("list", pages)
for (page_num in seq_len(pages)) {

  web_url <- paste0("https://www.amazon.com/dp/", cod_asin, "/?pagenumber=", page_num)

  comentarios_pagina <- scrape_amazon(web_url, throttle = 3)

  # cbind() cannot pair the single title with a zero-row data frame, so pages
  # without reviews are skipped.
  if (nrow(comentarios_pagina) > 0) {
    paginas[[page_num]] <- cbind(product, comentarios_pagina)
  }
}

# Bind once at the end instead of growing the data frame on every iteration;
# rbind() ignores the NULL entries.
comentariostotales <- do.call(rbind, c(list(comentariostotales), paginas))

Libro Mystery of the Power Words

# ASIN of the second book and the address of its first page.
CodeID <- "B08GZGCT6Q"
url <- sprintf("https://www.amazon.com/dp/%s/?pageNumber=1", CodeID)

# Scrape the first page.
reviews <- scrape_amazon(url)

# Number of pages to request and the (initially empty) accumulator.
pages <- 5
reviews_all <- NULL

# Product title: text of the #productTitle node with newlines removed and
# surrounding whitespace trimmed.
product <- trimws(
  gsub("\n", "", html_text(html_nodes(read_html(url), "#productTitle")))
)

product
## [1] "Mystery of the Power Words: Speak the Words That Move Mountains and Make Hell Tremble"
# Scrape each page and stack the results under the product title.
# NOTE(review): the rendered counts for this book (45 / 5 / 5 over 5 pages)
# are exact multiples of the page count, which suggests every page number
# returned the same reviews; confirm that this URL form really paginates.
review_pages <- vector("list", pages)
for (page_num in seq_len(pages)) {  # seq_len() is empty when pages == 0
  url <- paste0("http://www.amazon.com/dp/",
                CodeID,
                "/?pageNumber=",
                page_num)
  reviews <- scrape_amazon(url, throttle = 3)

  # cbind() cannot pair the single title with a zero-row data frame, so pages
  # without reviews are skipped.
  if (nrow(reviews) > 0) {
    review_pages[[page_num]] <- cbind(product, reviews)
  }
}

# Bind once at the end instead of growing the data frame on every iteration;
# rbind() ignores the NULL entries.
reviews_all <- do.call(rbind, c(list(reviews_all), review_pages))

Libro La Ladrona de Libros

# ASIN of the third book and the address of its first page.
prod_code <- "B0062Y7D26"
url <- sprintf("https://www.amazon.com/dp/%s/?pageNumber=1", prod_code)

# Scrape the first page.
reviews_Adriana <- scrape_amazon(url)

# Show the structure of the scraped data.
str(reviews_Adriana)
## 'data.frame':    13 obs. of  5 variables:
##  $ author       : chr  "Raul" "Rose M" "Gaby B." "Sec1958" ...
##  $ date         : chr  "November 30, 2015" "October 22, 2020" "May 29, 2015" "April 20, 2019" ...
##  $ review_format: chr  "Verified Purchase" "Verified Purchase" "Verified Purchase" "Verified Purchase" ...
##  $ stars        : num  5 5 5 5 4 5 5 5 5 5 ...
##  $ comments     : chr  "\n\n\n\n\n\n\n\n\n\n  \n    \n  Este es un libro sencillamente fantástico y realmente te roba el corazón. Si bi"| __truncated__ "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  I gave this book as a gift to a well educated relative, retired from teachi"| __truncated__ "\n\n\n\n\n\n\n\n\n\n  \n    \n  \"He odiado las palabras y las he amado, y espero haber estado a su altura.\"Ma"| __truncated__ "\n\n\n\n\n\n\n\n\n\n  \n    \n  La historia conmovedora de una niña alemana durante la Segunda Guerra Mundial. "| __truncated__ ...
# Number of pages to request for this book.
pages <- 5

# Accumulator for the reviews of all pages; starts empty.
reviews_allAdriana <- NULL

# Product title: text of the #productTitle node with newlines removed and
# surrounding whitespace trimmed.
product <- trimws(
  gsub("\n", "", html_text(html_nodes(read_html(url), "#productTitle")))
)

product
## [1] "La ladrona de libros (Spanish Edition)"
# Scrape each page and stack the results under the product title.
# NOTE(review): the rendered counts for this book (50 / 15 over 5 pages) are
# exact multiples of the page count, which suggests every page number
# returned the same reviews; confirm that this URL form really paginates.
review_pages_adriana <- vector("list", pages)
for (page_num in seq_len(pages)) {  # seq_len() is empty when pages == 0
  url <- paste0("http://www.amazon.com/dp/",
                prod_code,
                "/?pageNumber=",
                page_num)
  reviews_Adriana <- scrape_amazon(url, throttle = 3)

  # cbind() cannot pair the single title with a zero-row data frame, so pages
  # without reviews are skipped.
  if (nrow(reviews_Adriana) > 0) {
    review_pages_adriana[[page_num]] <- cbind(product, reviews_Adriana)
  }
}

# Bind once at the end instead of growing the data frame on every iteration;
# rbind() ignores the NULL entries.
reviews_allAdriana <- do.call(
  rbind,
  c(list(reviews_allAdriana), review_pages_adriana)
)

Libro Dr Seuss

# ASIN of the fourth book and the address of its first page.
cod_asin <- "0375851569"
web_url <- sprintf("https://www.amazon.com/dp/%s/?pagenumber=1", cod_asin)

# Scrape that page and inspect the structure of the result.
CometariosDrS <- scrape_amazon(web_url)
str(CometariosDrS)
## 'data.frame':    13 obs. of  5 variables:
##  $ author       : chr  "Ariyuana" "goingBananas" "Wendy" "cyclehum" ...
##  $ date         : chr  "February 27, 2021" "May 27, 2018" "June 10, 2017" "February 15, 2018" ...
##  $ review_format: chr  "Verified Purchase" "Verified Purchase" "Verified Purchase" "Verified Purchase" ...
##  $ stars        : num  5 4 5 2 2 5 5 5 5 5 ...
##  $ comments     : chr  "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  I’m so glad I bought this before hearing about Dr. Seuss getting “cancelled"| __truncated__ "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  Cute starter set in the book size I was looking for 6.75”x 9.25”.  The book"| __truncated__ "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  WHO could not love Dr. Seuss books?!  I enjoy them as much in my adulthood "| __truncated__ "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  These books would be a great buy, but one came with the cover upside-down o"| __truncated__ ...
# Number of pages to request for this book.
pages <- 5

# Accumulator for the reviews of all pages; starts empty.
ComentTot_DrSeuss <- NULL

# Product title: text of the #productTitle node with newlines removed and
# surrounding whitespace trimmed.
product <- trimws(
  gsub("\n", "", html_text(html_nodes(read_html(web_url), "#productTitle")))
)

product
## [1] "Dr. Seuss's Beginner Book Collection (Cat in the Hat, One Fish Two Fish, Green Eggs and Ham, Hop on Pop, Fox in Socks)"
# Scrape every page and stack the results under the product title.
# The previous loop built `web_url` but never fetched it, so the page-1 data
# frame `CometariosDrS` was appended `pages` times.
# NOTE(review): other loops in this file spell the query parameter
# "pageNumber"; confirm which spelling the site honours.
paginas_drseuss <- vector("list", pages)
for (page_num in seq_len(pages)) {

  web_url <- paste0("https://www.amazon.com/dp/", cod_asin, "/?pagenumber=", page_num)

  comentarios_pagina <- scrape_amazon(web_url, throttle = 3)

  # cbind() cannot pair the single title with a zero-row data frame, so pages
  # without reviews are skipped.
  if (nrow(comentarios_pagina) > 0) {
    paginas_drseuss[[page_num]] <- cbind(product, comentarios_pagina)
  }
}

# Bind once at the end instead of growing the data frame on every iteration;
# rbind() ignores the NULL entries.
ComentTot_DrSeuss <- do.call(rbind, c(list(ComentTot_DrSeuss), paginas_drseuss))

Una su dataframe y el de sus compañeros mediante una función de dplyr.

# Print the column names of each per-book data frame; the outputs below show
# that all four share the same six columns.
names(comentariostotales)
## [1] "product"       "author"        "date"          "review_format"
## [5] "stars"         "comments"
names(reviews_all)
## [1] "product"       "author"        "date"          "review_format"
## [5] "stars"         "comments"
names(reviews_allAdriana)
## [1] "product"       "author"        "date"          "review_format"
## [5] "stars"         "comments"
names(ComentTot_DrSeuss)
## [1] "product"       "author"        "date"          "review_format"
## [5] "stars"         "comments"
# Stack the four data frames row-wise into one.
df_juntos <- bind_rows(
  list(comentariostotales, reviews_all, reviews_allAdriana, ComentTot_DrSeuss)
)

Analice la variable stars e indique cuál está mejor valorado mediante el cálculo de estadísticas descriptivas.

# Count the reviews for every (product, stars) pair and list the most
# frequent pairs first. The result stays grouped by product, as the printed
# header below shows.
analisis <- arrange(
  summarise(group_by(df_juntos, product, stars), Cantidad = n()),
  desc(Cantidad)
)
## `summarise()` regrouping output by 'product' (override with `.groups` argument)
analisis
## # A tibble: 9 x 3
## # Groups:   product [4]
##   product                                                         stars Cantidad
##   <chr>                                                           <dbl>    <int>
## 1 Destroying Fear: Strategies to Overthrow the Enemy's Tactics a~     5      195
## 2 La ladrona de libros (Spanish Edition)                              5       50
## 3 Dr. Seuss's Beginner Book Collection (Cat in the Hat, One Fish~     5       45
## 4 Mystery of the Power Words: Speak the Words That Move Mountain~     5       45
## 5 La ladrona de libros (Spanish Edition)                              4       15
## 6 Dr. Seuss's Beginner Book Collection (Cat in the Hat, One Fish~     2       10
## 7 Dr. Seuss's Beginner Book Collection (Cat in the Hat, One Fish~     4       10
## 8 Mystery of the Power Words: Speak the Words That Move Mountain~     1        5
## 9 Mystery of the Power Words: Speak the Words That Move Mountain~     4        5

Cree un boxplot por libro de la variable stars

# Print a compact overview of the first book's data frame: row and column
# counts, then each column with its type and first values.
glimpse(comentariostotales)
## Rows: 195
## Columns: 6
## $ product       <chr> "Destroying Fear: Strategies to Overthrow the Enemy's...
## $ author        <chr> "Tisha Vargas", "Rick", "Michelle Casey", "Kindle Cus...
## $ date          <chr> "October 3, 2019", "October 25, 2019", "October 7, 20...
## $ review_format <chr> "Verified Purchase", "Verified Purchase", "Verified P...
## $ stars         <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
## $ comments      <chr> "\n\n\n\n\n\n\n\n\n\n  \n  \n    \n  This book had am...
 # Frequency table: one row per distinct star rating with its review count.
 DestroyingFear <- comentariostotales%>%
   group_by(stars) %>%
    summarise(cantidad = n())
## `summarise()` ungrouping output (override with `.groups` argument)
 # Draw the boxplot from the raw ratings. The frequency table holds each
 # distinct rating only once, so plotting its `stars` column ignored how many
 # reviews gave each rating.
 boxplot(comentariostotales$stars,
         main = "Destroying Fear: Strategies to Overthrow the Enemy's Tactics and Walk in Total Freedom")

#MysteryofthePowerWords

 # Frequency table: one row per distinct star rating with its review count.
 MysteryofthePowerWords <- reviews_all%>%
   group_by(stars) %>%
   summarise(cantidad = n())
## `summarise()` ungrouping output (override with `.groups` argument)
 # Draw the boxplot from the raw ratings. The frequency table holds each
 # distinct rating only once, so plotting its `stars` column ignored how many
 # reviews gave each rating.
 boxplot(reviews_all$stars,
         main = "Mystery of the Power Words: Speak the Words That Move Mountains and Make Hell Tremble")

#La Ladrona de Libros

 # Frequency table: one row per distinct star rating with its review count.
 Laladronadelibros<- reviews_allAdriana%>%
   group_by(stars) %>%
   summarise(cantidad = n())
## `summarise()` ungrouping output (override with `.groups` argument)
 # Draw the boxplot from the raw ratings. The frequency table holds each
 # distinct rating only once, so plotting its `stars` column ignored how many
 # reviews gave each rating.
 boxplot(reviews_allAdriana$stars,
         main = "La ladrona de libros (Spanish Edition)")

#Dr Seuss

 # Frequency table: one row per distinct star rating with its review count.
 Dr_Seuss<- ComentTot_DrSeuss%>%
   group_by(stars) %>%
   summarise(cantidad = n())
## `summarise()` ungrouping output (override with `.groups` argument)
 # Draw the boxplot from the raw ratings. The frequency table holds each
 # distinct rating only once, so plotting its `stars` column ignored how many
 # reviews gave each rating.
 boxplot(ComentTot_DrSeuss$stars,
         main = "Dr. Seuss's Beginner Book Collection")