### Group 4

Arlette | Andrea | Andres | Carlos

Load the libraries
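
The library chunk itself is not shown; judging from the functions called below, a plausible set of packages is the following (these names are inferred from the calls, not taken from the original chunk):

# Packages inferred from the calls used later in this document (assumption)
library(polite)     # bow(), scrape()
library(rvest)      # html_nodes(), html_text()
library(tidyverse)  # dplyr, tidyr, stringr, readr (read_delim / write_delim)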

Creating the data frame of book reviews

# Function to scrape elements from Amazon reviews
scrape_amazon <- function(url, throttle = 5){
  
  # Set throttle between URL calls
  sec = 0
  if(throttle < 0) warning("throttle was less than 0: set to 0")
  if(throttle > 0) sec = max(0, throttle + runif(1, -1, 1))
  
  # pause before the request so the throttle actually takes effect
  Sys.sleep(sec)
  
  session <- bow(url, force = TRUE)
  # obtain HTML of URL
  doc <- scrape(session)
  
  # # Parse relevant elements from HTML
  # title <- doc %>%
  #   html_nodes(".a-color-base") %>%
  #   html_text() 
  # title<- title[10:length(title)]
  # 
  author <- doc %>%
    html_nodes(".a-profile-name") %>%
    html_text()
  
  author <- author[-1:-2]
  
  date <- doc %>%
    html_nodes(".review-date") %>%
    html_text() %>% 
    gsub(".*on ", "", .)
  
  date <- date[-1:-2]
  
  review_format <- doc %>% 
    html_nodes(".review-format-strip") %>% 
    html_text() 
  
  stars <- doc %>%
    html_nodes(".review-rating") %>%
    html_text() %>%
    str_extract("\\d") %>%
    as.numeric() 
  
  stars <- stars[-1:-2]
  
  comments <- doc %>%
    html_nodes(".review-text") %>%
    html_text() 
  
  # Combine attributes into a single data frame
  df <- data.frame(author, date, review_format, stars, comments, stringsAsFactors = F)
  
  return(df)
}
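
Before looping over several products, the function can be sanity-checked on a single review page; the URL and 3-second throttle below are only an example reusing one of the ASINs from prod_code:

# Single-page check (example only; same ASIN as the first entry of prod_code)
test_url <- paste0("https://www.amazon.com/product-reviews/B00P1L7BPU",
                   "/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews")
test_reviews <- scrape_amazon(test_url, throttle = 3)
str(test_reviews)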

# Set the number of pages to scrape per product
pages <- 2

# Empty object to store the data
reviews_all <- NULL

# iterate over each product and each page

prod_code <- c("B00P1L7BPU","B06XTYCPST","B0752XRB5F")

# loop to scrape the review attributes of each product in prod_code
for(i in 1:length(prod_code)){

  url <- paste0("https://www.amazon.com/product-reviews/",prod_code[i],
                "/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews")
  
  # get the product name and clean it up
  
  session <- bow(url, force = TRUE)
  prod <- scrape(session) %>%
    html_nodes( "a.a-link-normal") %>% 
    html_text() %>% 
    gsub("\n", "", .) %>% 
    trimws()
  
  prod <- prod[1]
  
  for(page_num in 1:pages){
    
    url <- paste0("https://www.amazon.com/product-reviews/",
                  prod_code[i],"/ref=cm_cr_arp_d_paging_btm_next_",
                  page_num,
                  "?ie=UTF8&reviewerType=all_reviews&pageNumber=",
                  page_num)
    
    
    
    reviews <- scrape_amazon(url, throttle = 3)
    reviews_all <- rbind(reviews_all, cbind(prod, reviews))
    
    #books_reviews[[i]]<- reviews_all
  }
  #str(reviews_all)
}

Save the data frame to a CSV file

write_delim(reviews_all,"AmzReviews.csv",delim = ";")

Lab 2

1. Load and clean your data as covered in the tutorial (for the comments variable).

# Load the data
booksReviews<- read_delim("AmzReviews.csv",delim = ";")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   prod = col_character(),
##   author = col_character(),
##   date = col_character(),
##   review_format = col_character(),
##   stars = col_double(),
##   comments = col_character()
## )
Clean_String <- function(string){
    # lowercase
    temp <- tolower(string)
    # remove everything that is not a letter or whitespace
    temp <- stringr::str_replace_all(temp, "[^a-zA-Z\\s]", " ")
    # collapse repeated whitespace into a single space
    temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
    return(temp)
}

# Apply the cleaning function to the comments
booksReviews$comments <- Clean_String(booksReviews$comments)
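
A quick check on a made-up string shows what the cleaning does (the input below is purely illustrative):

# Hypothetical example: digits and punctuation are replaced and whitespace collapsed,
# so this returns "great book loved the nd chapter "
Clean_String("Great book!!! Loved the 2nd chapter :)")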

2. Tokenize the resulting data frame into n-grams of size 1 to 2, removing stop words (for this, use the unnest_ngrams function with the arguments shown in the code below).

library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.4
# Convert the text into tokens

booksReviewsT <- booksReviews %>%
  mutate(id=paste(prod,author,date,sep="-")) %>%
  select(id,stars,comments) %>%
  unnest_tokens(input = comments,output = word)


# load the stop words dataset
data(stop_words)

# remove the stop words

tidyReviews <- booksReviewsT %>%
  anti_join(stop_words)
## Joining, by = "word"
booksReviewsTN_2 <- booksReviews %>%
  mutate(id=paste(prod,author,date,sep="-")) %>%
  select(id,stars,comments) %>%
  unnest_ngrams(input = comments,output = bigram,n=2,n_min = 1)


# First split the bigrams into their two words
booksReviewsTN_2_separated <- booksReviewsTN_2 %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 12107 rows [1, 3,
## 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, ...].
# Remove n-grams containing stop words (the NA values flagged above come from the
# unigrams produced by n_min = 1, which have no second word)

booksReviewsTN_2_filtered <- booksReviewsTN_2_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# Re-join the bigrams we split apart

booksReviewsTN_2_filtered <- booksReviewsTN_2_filtered %>%
  mutate(word2 = ifelse(is.na(word2), " ", word2)) %>%
  unite(bigram, word1, word2, sep = " ")


3. Compute the TF-IDF metric for the results of the previous exercise.

# Preview the filtered n-grams

head(booksReviewsTN_2_filtered)
## # A tibble: 6 x 3
##   id                                                              stars bigram  
##   <chr>                                                           <dbl> <chr>   
## 1 La casa de los espíritus [The House of the Spirits]-Benjamin-S~     5 "la  "  
## 2 La casa de los espíritus [The House of the Spirits]-Benjamin-S~     5 "la cas~
## 3 La casa de los espíritus [The House of the Spirits]-Benjamin-S~     5 "casa  "
## 4 La casa de los espíritus [The House of the Spirits]-Benjamin-S~     5 "casa d~
## 5 La casa de los espíritus [The House of the Spirits]-Benjamin-S~     5 "de  "  
## 6 La casa de los espíritus [The House of the Spirits]-Benjamin-S~     5 "de los"
tidyReviewsCounts <- booksReviewsTN_2_filtered %>%
  count(id, bigram, sort = TRUE)

head(tidyReviewsCounts)
## # A tibble: 6 x 3
##   id                                                              bigram       n
##   <chr>                                                           <chr>    <int>
## 1 Why We Sleep: Unlocking the Power of Sleep and Dreams-Ian Mann~ "sleep ~    33
## 2 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ "de  "      14
## 3 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ "la  "      14
## 4 Why We Sleep: Unlocking the Power of Sleep and Dreams-Dr. Merc~ "sleep ~    14
## 5 La casa de los espíritus [The House of the Spirits]-eltonsmith~ "de  "      12
## 6 La casa de los espíritus [The House of the Spirits]-eltonsmith~ "la  "      11
tidyReviewsTFIDF <- tidyReviewsCounts %>%
  bind_tf_idf(bigram,id, n) %>%
  arrange(desc(tf_idf))

# Inspect the TF-IDF results

tidyReviewsTFIDF
## # A tibble: 6,166 x 6
##    id                                       bigram         n     tf   idf tf_idf
##    <chr>                                    <chr>      <int>  <dbl> <dbl>  <dbl>
##  1 La casa de los espíritus [The House of ~ "movie  "      2 0.143   3.40  0.486
##  2 La casa de los espíritus [The House of ~ "typos  "      4 0.129   3.40  0.439
##  3 La casa de los espíritus [The House of ~ "descript~     1 0.1     4.09  0.409
##  4 La casa de los espíritus [The House of ~ "masterpi~     1 0.1     4.09  0.409
##  5 La casa de los espíritus [The House of ~ "detail  "     1 0.0909  4.09  0.372
##  6 La casa de los espíritus [The House of ~ "dissapea~     1 0.0909  4.09  0.372
##  7 La casa de los espíritus [The House of ~ "scenario~     1 0.0909  4.09  0.372
##  8 La casa de los espíritus [The House of ~ "story lo~     1 0.0909  4.09  0.372
##  9 La casa de los espíritus [The House of ~ "happened~     1 0.0909  3.40  0.309
## 10 La casa de los espíritus [The House of ~ "knowing ~     1 0.0909  3.40  0.309
## # ... with 6,156 more rows
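
For reference, bind_tf_idf computes, for each bigram within a review id, tf = count / total n-grams in that review, idf = ln(number of reviews / number of reviews containing the bigram), and tf_idf = tf * idf. The sketch below reproduces the same numbers by hand as a cross-check (total_docs and tfidf_manual are made-up names):

# Manual reproduction of bind_tf_idf (natural-log idf); object names are hypothetical
total_docs <- n_distinct(tidyReviewsCounts$id)

tfidf_manual <- tidyReviewsCounts %>%
  group_by(id) %>%
  mutate(tf = n / sum(n)) %>%                           # term frequency within each review
  ungroup() %>%
  group_by(bigram) %>%
  mutate(idf = log(total_docs / n_distinct(id))) %>%    # inverse document frequency
  ungroup() %>%
  mutate(tf_idf = tf * idf) %>%
  arrange(desc(tf_idf))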

4. Generate a word cloud of the TF-IDF for stars==1 and stars==5, i.e., the worst- and best-rated reviews. Analyze the results.

# Join back with the n-gram data frame to recover the stars variable

tidyReviewsFinal <- tidyReviewsTFIDF %>%
  left_join(booksReviewsTN_2_filtered) 
## Joining, by = c("id", "bigram")
head(tidyReviewsFinal)
## # A tibble: 6 x 7
##   id                                      bigram      n    tf   idf tf_idf stars
##   <chr>                                   <chr>   <int> <dbl> <dbl>  <dbl> <dbl>
## 1 La casa de los espíritus [The House of~ "movie~     2 0.143  3.40  0.486     5
## 2 La casa de los espíritus [The House of~ "movie~     2 0.143  3.40  0.486     5
## 3 La casa de los espíritus [The House of~ "typos~     4 0.129  3.40  0.439     1
## 4 La casa de los espíritus [The House of~ "typos~     4 0.129  3.40  0.439     1
## 5 La casa de los espíritus [The House of~ "typos~     4 0.129  3.40  0.439     1
## 6 La casa de los espíritus [The House of~ "typos~     4 0.129  3.40  0.439     1
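
Note that the join repeats a row every time the same bigram occurs more than once within a review (visible in the "typos" rows above). If a one-row-per-(id, bigram) table is preferred, a distinct() after the join removes the repeats (tidyReviewsFinal_dedup is a hypothetical name):

# Deduplicated version of the join; the object name is hypothetical
tidyReviewsFinal_dedup <- tidyReviewsTFIDF %>%
  left_join(booksReviewsTN_2_filtered, by = c("id", "bigram")) %>%
  distinct()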
# Word cloud of 1-star and 5-star reviews
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.4
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.0.3
library(RColorBrewer)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.4
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidyReviewsFinal %>%
  filter(stars==5 | stars==1) %>%
  group_by(bigram,stars) %>%
  summarise(tf_idf=mean(tf_idf)) %>%
  reshape2::acast(bigram ~ stars, value.var = "tf_idf", fill = 0) %>%
  wordcloud::comparison.cloud(scale=c(3,.9),max.words = 50)
## `summarise()` has grouped output by 'bigram'. You can override using the `.groups` argument.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## magnificent historical could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## recognition software could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## sleep study could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## software could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## spanish speaker could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## star rating could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## insomnia could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## character could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## horrible could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## rating could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## released could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## study could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## includes could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## disorder could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## speaker could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## quality could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## funny could not be fit on page. It will not be plotted.