### Grupo 4
# Arlette | Andrea | Andres | Carlos
#' Scrape the review elements from one Amazon review page
#'
#' Waits for roughly `throttle` seconds, fetches `url` through `bow()` and
#' `scrape()`, and extracts reviewer name, review date, review format, star
#' rating and review text from the returned HTML.
#'
#' @param url URL of the review page to fetch.
#' @param throttle Approximate pause, in seconds, before the request. A random
#'   jitter between -1 and 1 second is added. A value of 0 disables the pause;
#'   a negative value raises a warning and is treated as 0.
#' @return A data frame with columns `author`, `date`, `review_format`,
#'   `stars` (numeric) and `comments`. `data.frame()` raises an error if the
#'   extracted vectors end up with incompatible lengths.
scrape_amazon <- function(url, throttle = 5) {
  # Set throttle between URL calls
  sec <- 0
  if (throttle < 0) warning("throttle was less than 0: set to 0")
  if (throttle > 0) sec <- max(0, throttle + runif(1, -1, 1))
  # Actually wait: the delay used to be computed and then discarded, which
  # made the `throttle` argument a no-op.
  if (sec > 0) Sys.sleep(sec)

  session <- bow(url, force = TRUE)
  # Obtain the HTML of the URL
  doc <- scrape(session)

  # Title extraction is disabled; kept for reference:
  # title <- doc %>%
  #   html_nodes(".a-color-base") %>%
  #   html_text()
  # title <- title[10:length(title)]

  # The first two matches of author, date and stars are dropped.
  # NOTE(review): presumably these belong to the featured reviews at the top
  # of the page - confirm against the live markup.
  author <- doc %>%
    html_nodes(".a-profile-name") %>%
    html_text()
  author <- author[-(1:2)]

  # Keep only the text after the last "on " (greedy match), i.e. the date part
  date <- doc %>%
    html_nodes(".review-date") %>%
    html_text() %>%
    gsub(".*on ", "", .)
  date <- date[-(1:2)]

  review_format <- doc %>%
    html_nodes(".review-format-strip") %>%
    html_text()

  # The rating is the first digit found in the rating text
  stars <- doc %>%
    html_nodes(".review-rating") %>%
    html_text() %>%
    str_extract("\\d") %>%
    as.numeric()
  stars <- stars[-(1:2)]

  comments <- doc %>%
    html_nodes(".review-text") %>%
    html_text()

  # Combine attributes into a single data frame
  data.frame(
    author, date, review_format, stars, comments,
    stringsAsFactors = FALSE
  )
}
# Number of review pages to fetch for each product
pages <- 2
# Amazon product codes whose reviews are scraped
prod_code <- c("B00P1L7BPU","B06XTYCPST","B0752XRB5F")
# One slot per (product, page). The pages are bound together once after the
# loops instead of growing a data frame with rbind() on every iteration.
review_pages <- vector("list", length(prod_code) * pages)
slot <- 0
# Loop over the products; seq_along()/seq_len() iterate zero times on empty
# input, unlike 1:length(x) and 1:pages
for (i in seq_along(prod_code)) {
  url <- paste0("https://www.amazon.com/product-reviews/",prod_code[i],
                "/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews")
  # Get the product name: text of the first "a.a-link-normal" node, with
  # newlines removed and surrounding whitespace trimmed
  session <- bow(url, force = TRUE)
  prod <- scrape(session) %>%
    html_nodes( "a.a-link-normal") %>%
    html_text() %>%
    gsub("\n", "", .) %>%
    trimws()
  prod <- prod[1]
  # Scrape each requested review page of this product
  for (page_num in seq_len(pages)) {
    url <- paste0("https://www.amazon.com/product-reviews/",
                  prod_code[i],"/ref=cm_cr_arp_d_paging_btm_next_",
                  page_num,
                  "?ie=UTF8&reviewerType=all_reviews&pageNumber=",
                  page_num)
    reviews <- scrape_amazon(url, throttle = 3)
    slot <- slot + 1
    # Prefix every review row with the product name
    review_pages[[slot]] <- cbind(prod, reviews)
  }
}
# Stack all pages into a single data frame (NULL when nothing was scraped)
reviews_all <- do.call(rbind, review_pages)
# Save the reviews as a semicolon-delimited file
write_delim(reviews_all,"AmzReviews.csv",delim = ";")
# Load the saved data
booksReviews <- read_delim("AmzReviews.csv",delim = ";")
##
## -- Column specification --------------------------------------------------------
## cols(
## prod = col_character(),
## author = col_character(),
## date = col_character(),
## review_format = col_character(),
## stars = col_double(),
## comments = col_character()
## )
# Normalise text for tokenisation: lower-case it, blank out every character
# that is neither an ASCII letter nor whitespace (digits and accented letters
# are removed too), and collapse each run of whitespace into a single space.
# Works element-wise on a character vector and returns a vector of the same
# length.
Clean_String <- function(string) {
  lowered <- tolower(string)
  # Anything outside a-z / A-Z / whitespace becomes a space
  letters_only <- stringr::str_replace_all(lowered, "[^a-zA-Z\\s]", " ")
  # Squeeze repeated whitespace down to one space
  stringr::str_replace_all(letters_only, "[\\s]+", " ")
}
# Clean the review comments in place
booksReviews[["comments"]] <- Clean_String(booksReviews[["comments"]])
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.4
# Turn the text into one-word tokens, keeping the star rating and an id built
# from product, author and date
booksReviewsT <- booksReviews %>%
  mutate(id = paste(prod, author, date, sep = "-")) %>%
  select(id, stars, comments) %>%
  unnest_tokens(output = word, input = comments)
# Load the stop-word dataset
data(stop_words)
# Drop every token that appears in the stop-word list
tidyReviews <- anti_join(booksReviewsT, stop_words)
## Joining, by = "word"
# Build n-grams of one and two words (n_min = 1, n = 2) from the cleaned
# comments, keyed by a product-author-date id
booksReviewsTN_2 <- booksReviews %>%
  mutate(id = paste(prod, author, date, sep = "-")) %>%
  select(id, stars, comments) %>%
  unnest_ngrams(input = comments, output = bigram, n = 2, n_min = 1)
# Split each n-gram into its words. One-word n-grams have no second piece;
# fill = "right" sets word2 to NA for them without the "Expected 2 pieces"
# warning that the default fill = "warn" raises.
booksReviewsTN_2_separated <- booksReviewsTN_2 %>%
  separate(
    col = bigram, into = c("word1", "word2"), sep = " ", fill = "right"
  )
# Remove the n-grams that contain a stop word (an NA word2 is never matched by
# %in%, so one-word n-grams are judged on word1 only)
booksReviewsTN_2_filtered <- booksReviewsTN_2_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
# Put the separated words back together; a missing word2 is replaced by " "
booksReviewsTN_2_filtered <- booksReviewsTN_2_filtered %>%
  mutate(word2 = ifelse(is.na(word2), " ", word2)) %>%
  unite(bigram, word1, word2, sep = " ")
# Count how often each n-gram occurs in each review
tidyReviewsCounts <- booksReviewsTN_2_filtered %>%
  count(id, bigram, sort = TRUE)
# Compute tf-idf per n-gram and review, highest scores first
tidyReviewsTFIDF <- tidyReviewsCounts %>%
  bind_tf_idf(bigram, id, n) %>%
  arrange(desc(tf_idf))
# Preview the filtered n-grams
head(booksReviewsTN_2_filtered)
## # A tibble: 6 x 3
## id stars bigram
## <chr> <dbl> <chr>
## 1 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ 5 "la "
## 2 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ 5 "la cas~
## 3 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ 5 "casa "
## 4 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ 5 "casa d~
## 5 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ 5 "de "
## 6 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ 5 "de los"
# Occurrences of each n-gram within each review, most frequent first
tidyReviewsCounts <- count(booksReviewsTN_2_filtered, id, bigram, sort = TRUE)
head(tidyReviewsCounts)
## # A tibble: 6 x 3
## id bigram n
## <chr> <chr> <int>
## 1 Why We Sleep: Unlocking the Power of Sleep and Dreams-Ian Mann~ "sleep ~ 33
## 2 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ "de " 14
## 3 La casa de los espíritus [The House of the Spirits]-Benjamin-S~ "la " 14
## 4 Why We Sleep: Unlocking the Power of Sleep and Dreams-Dr. Merc~ "sleep ~ 14
## 5 La casa de los espíritus [The House of the Spirits]-eltonsmith~ "de " 12
## 6 La casa de los espíritus [The House of the Spirits]-eltonsmith~ "la " 11
# tf-idf of each n-gram, treating every review id as a document, sorted from
# highest to lowest score
tidyReviewsTFIDF <- arrange(
  bind_tf_idf(tidyReviewsCounts, bigram, id, n),
  desc(tf_idf)
)
# Show the ranked tf-idf table
tidyReviewsTFIDF
## # A tibble: 6,166 x 6
## id bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 La casa de los espíritus [The House of ~ "movie " 2 0.143 3.40 0.486
## 2 La casa de los espíritus [The House of ~ "typos " 4 0.129 3.40 0.439
## 3 La casa de los espíritus [The House of ~ "descript~ 1 0.1 4.09 0.409
## 4 La casa de los espíritus [The House of ~ "masterpi~ 1 0.1 4.09 0.409
## 5 La casa de los espíritus [The House of ~ "detail " 1 0.0909 4.09 0.372
## 6 La casa de los espíritus [The House of ~ "dissapea~ 1 0.0909 4.09 0.372
## 7 La casa de los espíritus [The House of ~ "scenario~ 1 0.0909 4.09 0.372
## 8 La casa de los espíritus [The House of ~ "story lo~ 1 0.0909 4.09 0.372
## 9 La casa de los espíritus [The House of ~ "happened~ 1 0.0909 3.40 0.309
## 10 La casa de los espíritus [The House of ~ "knowing ~ 1 0.0909 3.40 0.309
## # ... with 6,156 more rows
# Join back to the filtered n-gram table to recover the `stars` column.
# NOTE(review): that table holds one row per n-gram occurrence, so joining on
# the shared columns repeats each (id, bigram) row once per occurrence (the
# preview below shows the repeats) - confirm this weighting is intended.
tidyReviewsFinal <- left_join(tidyReviewsTFIDF, booksReviewsTN_2_filtered)
## Joining, by = c("id", "bigram")
head(tidyReviewsFinal)
## # A tibble: 6 x 7
## id bigram n tf idf tf_idf stars
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 La casa de los espíritus [The House of~ "movie~ 2 0.143 3.40 0.486 5
## 2 La casa de los espíritus [The House of~ "movie~ 2 0.143 3.40 0.486 5
## 3 La casa de los espíritus [The House of~ "typos~ 4 0.129 3.40 0.439 1
## 4 La casa de los espíritus [The House of~ "typos~ 4 0.129 3.40 0.439 1
## 5 La casa de los espíritus [The House of~ "typos~ 4 0.129 3.40 0.439 1
## 6 La casa de los espíritus [The House of~ "typos~ 4 0.129 3.40 0.439 1
# Wordcloud de reviews 1 y 5 estrellas
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.4
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.0.3
library(RColorBrewer)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.4
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Comparison word cloud of 1-star versus 5-star reviews: average the tf-idf of
# each n-gram per rating, spread the ratings into columns (0 where an n-gram
# is absent), and plot at most 50 terms
tidyReviewsFinal %>%
  filter(stars %in% c(1, 5)) %>%
  group_by(bigram, stars) %>%
  summarise(tf_idf = mean(tf_idf)) %>%
  reshape2::acast(formula = bigram ~ stars, value.var = "tf_idf", fill = 0) %>%
  wordcloud::comparison.cloud(scale = c(3, .9), max.words = 50)
## `summarise()` has grouped output by 'bigram'. You can override using the `.groups` argument.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## magnificent historical could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## recognition software could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## sleep study could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## software could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## spanish speaker could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## star rating could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## insomnia could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## character could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## horrible could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## rating could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## released could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## study could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## includes could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## disorder could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## speaker could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## quality could not be fit on page. It will not be plotted.
## Warning in wordcloud::comparison.cloud(., scale = c(3, 0.9), max.words = 50):
## funny could not be fit on page. It will not be plotted.