Analiza tekstu

Część pierwsza - zapoznanie z danymi

library(readr)
library(ggplot2)
library(dplyr)
library(tidytext)
library(wordcloud)
library(hunspell)
library(tidyr)

# Load the restaurant reviews. The path is machine-specific and has to be
# adjusted when the analysis is run on another computer.
reviews <- read_csv("C:/Users/mbuko/Downloads/reviews.csv")

# View() opens a data viewer window, which only makes sense in an interactive
# session; skip it when the script is run non-interactively.
if (interactive()) {
  View(reviews)
}

# Preview the first six rows of the data set.
head(reviews, n = 6)

## # A tibble: 6 × 3
##   date       rating review                                                      
##   <date>      <dbl> <chr>                                                       
## 1 2024-08-25      0 The experience was mediocre. The appetizers were cold, and …
## 2 2024-08-11      5 This place never disappoints! The food is always top-notch,…
## 3 2024-08-09     NA The setting is beautiful, but the food left much to be desi…
## 4 2024-08-09      1 This place never disappoints! The food is always top-notch,…
## 5 2024-09-08     NA Had a disappointing meal here. The pasta was overcooked, an…
## 6 2024-06-17      5 The restaurant was clean, and the staff was polite. However…

# Print the full vector of review dates.
reviews[["date"]]

##   [1] "2024-08-25" "2024-08-11" "2024-08-09" "2024-08-09" "2024-09-08"
##   [6] "2024-06-17" "2024-06-26" "2024-09-07" "2024-07-23" "2024-08-09"
##  [11] "2024-09-01" "2024-08-04" "2024-07-24" "2024-07-14" "2024-06-07"
##  [16] "2024-07-02" "2024-07-02" "2024-08-21" "2024-06-15" "2024-07-07"
##  [21] "2024-09-02" "2024-07-19" "2024-06-30" "2024-07-25" "2024-09-04"
##  [26] "2024-07-04" "2024-09-02" "2024-06-08" "2024-07-04" "2024-06-17"
##  [31] "2024-08-20" "2024-06-26" "2024-06-04" "2024-07-17" "2024-06-18"
##  [36] "2024-07-16" "2024-06-02" "2024-07-04" "2024-06-22" "2024-06-13"
##  [41] "2024-07-30" "2024-09-10" "2024-06-01" "2024-07-30" "2024-07-19"
##  [46] "2024-09-02" "2024-06-11" "2024-08-04" "2024-07-05" "2024-06-18"
##  [51] "2024-06-15" "2024-08-14" "2024-08-28" "2024-06-06" "2024-06-08"
##  [56] "2024-07-03" "2024-06-23" "2024-06-17" "2024-09-02" "2024-06-25"
##  [61] "2024-07-02" "2024-07-15" "2024-07-24" "2024-07-16" "2024-07-12"
##  [66] "2024-07-22" "2024-06-15" "2024-06-12" "2024-06-18" "2024-06-13"
##  [71] "2024-08-27" "2024-06-20" "2024-07-16" "2024-06-01" "2024-07-01"
##  [76] "2024-07-13" "2024-08-07" "2024-08-13" "2024-07-08" "2024-08-09"
##  [81] "2024-07-10" "2024-06-09" "2024-06-04" "2024-09-08" "2024-07-28"
##  [86] "2024-06-10" "2024-06-14" "2024-07-07" "2024-07-20" "2024-06-04"
##  [91] "2024-08-15" "2024-08-28" "2024-07-20" "2024-08-29" "2024-08-22"
##  [96] "2024-08-18" "2024-07-20" "2024-06-07" "2024-07-11" "2024-07-29"
## [101] "2024-08-25" "2024-06-18" "2024-08-07" "2024-06-14" "2024-06-15"
## [106] "2024-09-03" "2024-06-27" "2024-08-02" "2024-06-24" "2024-07-08"
## [111] "2024-08-03" "2024-06-01" "2024-08-20" "2024-08-26" "2024-07-24"
## [116] "2024-09-09" "2024-06-12" "2024-06-09" "2024-08-23" "2024-06-06"
## [121] "2024-08-23" "2024-07-21" "2024-06-20" "2024-06-16" "2024-08-03"
## [126] "2024-08-22" "2024-08-13" "2024-06-20" "2024-09-07" "2024-08-06"
## [131] "2024-06-09" "2024-09-02" "2024-07-04" "2024-06-06" "2024-08-12"
## [136] "2024-06-05" "2024-07-10" "2024-07-09" "2024-07-16" "2024-06-04"
## [141] "2024-06-05" "2024-08-14" "2024-08-25" "2024-06-06" "2024-06-18"
## [146] "2024-06-13" "2024-07-31" "2024-08-27" "2024-06-07" "2024-08-31"

# Print the full vector of ratings (missing ratings show up as NA).
reviews[["rating"]]

##   [1]  0  5 NA  1 NA  5  4  4  0  5  3  3  2  3  4  0  1  4  2  3 NA  1  3  2 NA
##  [26]  1  3  3  1  1  1  1 NA  1  1  4  4  2  3  1  2  1 NA  5  3  3 NA  1  2  1
##  [51]  2  4  4  4  2  1  5  4  0  5  1  0 NA  5  2  3  4  5 NA  1 NA  4  2  5  0
##  [76]  1  4  2  5  5  5  4  5  2  2  4  4  1  1  1  2  1  4  1  2  4  3  5  3  5
## [101]  1  2  5  2  3  1  2  4  5  5  0  1  2  2  2  5  3  0  1  2  3  0  3  3  1
## [126]  1  3  3  3  5  3  1  1  1  5  3  5  0  0  4  5  2  3  1  5  2  2  4  4  4

# Single cell: the review text (third column) of the first row.
reviews[1, "review"]

## # A tibble: 1 × 1
##   review                                                                        
##   <chr>                                                                         
## 1 The experience was mediocre. The appetizers were cold, and the main course la…

# Ratings (second column) of the first five rows.
reviews[seq_len(5), "rating"]

## # A tibble: 5 × 1
##   rating
##    <dbl>
## 1      0
## 2      5
## 3     NA
## 4      1
## 5     NA

# Third row with all of its columns.
slice(reviews, 3)

## # A tibble: 1 × 3
##   date       rating review                                                      
##   <date>      <dbl> <chr>                                                       
## 1 2024-08-09     NA The setting is beautiful, but the food left much to be desi…

# The rating column (second column), kept as a one-column tibble.
reviews["rating"]

## # A tibble: 150 × 1
##    rating
##     <dbl>
##  1      0
##  2      5
##  3     NA
##  4      1
##  5     NA
##  6      5
##  7      4
##  8      4
##  9      0
## 10      5
## # ℹ 140 more rows

# Same one-column tibble, with the number of printed rows set explicitly.
print(select(reviews, rating), n = 10)

## # A tibble: 150 × 1
##    rating
##     <dbl>
##  1      0
##  2      5
##  3     NA
##  4      1
##  5     NA
##  6      5
##  7      4
##  8      4
##  9      0
## 10      5
## # ℹ 140 more rows

# Show the structure of the data: dimensions and the type of every column.
str(reviews)

## spc_tbl_ [150 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date  : Date[1:150], format: "2024-08-25" "2024-08-11" ...
##  $ rating: num [1:150] 0 5 NA 1 NA 5 4 4 0 5 ...
##  $ review: chr [1:150] "The experience was mediocre. The appetizers were cold, and the main course lacked seasoning. The staff was frie"| __truncated__ "This place never disappoints! The food is always top-notch, and the staff is very professional. The ambiance is"| __truncated__ "The setting is beautiful, but the food left much to be desired. The steak was overcooked, and the side dishes w"| __truncated__ "This place never disappoints! The food is always top-notch, and the staff is very professional. The ambiance is"| __truncated__ ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date = col_date(format = ""),
##   ..   rating = col_double(),
##   ..   review = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Logical matrix with one row per review and one column per variable;
# TRUE marks a missing value. The printout is long (one line per review).
is.na(reviews)

##         date rating review
##   [1,] FALSE  FALSE  FALSE
##   [2,] FALSE  FALSE  FALSE
##   [3,] FALSE   TRUE  FALSE
##   [4,] FALSE  FALSE  FALSE
##   [5,] FALSE   TRUE  FALSE
##   [6,] FALSE  FALSE  FALSE
##   [7,] FALSE  FALSE  FALSE
##   [8,] FALSE  FALSE  FALSE
##   [9,] FALSE  FALSE  FALSE
##  [10,] FALSE  FALSE  FALSE
##  [11,] FALSE  FALSE  FALSE
##  [12,] FALSE  FALSE  FALSE
##  [13,] FALSE  FALSE  FALSE
##  [14,] FALSE  FALSE  FALSE
##  [15,] FALSE  FALSE  FALSE
##  [16,] FALSE  FALSE  FALSE
##  [17,] FALSE  FALSE  FALSE
##  [18,] FALSE  FALSE  FALSE
##  [19,] FALSE  FALSE  FALSE
##  [20,] FALSE  FALSE  FALSE
##  [21,] FALSE   TRUE  FALSE
##  [22,] FALSE  FALSE  FALSE
##  [23,] FALSE  FALSE  FALSE
##  [24,] FALSE  FALSE  FALSE
##  [25,] FALSE   TRUE  FALSE
##  [26,] FALSE  FALSE  FALSE
##  [27,] FALSE  FALSE  FALSE
##  [28,] FALSE  FALSE  FALSE
##  [29,] FALSE  FALSE  FALSE
##  [30,] FALSE  FALSE  FALSE
##  [31,] FALSE  FALSE  FALSE
##  [32,] FALSE  FALSE  FALSE
##  [33,] FALSE   TRUE  FALSE
##  [34,] FALSE  FALSE  FALSE
##  [35,] FALSE  FALSE  FALSE
##  [36,] FALSE  FALSE  FALSE
##  [37,] FALSE  FALSE  FALSE
##  [38,] FALSE  FALSE  FALSE
##  [39,] FALSE  FALSE  FALSE
##  [40,] FALSE  FALSE   TRUE
##  [41,] FALSE  FALSE  FALSE
##  [42,] FALSE  FALSE  FALSE
##  [43,] FALSE   TRUE  FALSE
##  [44,] FALSE  FALSE  FALSE
##  [45,] FALSE  FALSE   TRUE
##  [46,] FALSE  FALSE  FALSE
##  [47,] FALSE   TRUE  FALSE
##  [48,] FALSE  FALSE  FALSE
##  [49,] FALSE  FALSE  FALSE
##  [50,] FALSE  FALSE  FALSE
##  [51,] FALSE  FALSE   TRUE
##  [52,] FALSE  FALSE  FALSE
##  [53,] FALSE  FALSE  FALSE
##  [54,] FALSE  FALSE  FALSE
##  [55,] FALSE  FALSE  FALSE
##  [56,] FALSE  FALSE  FALSE
##  [57,] FALSE  FALSE  FALSE
##  [58,] FALSE  FALSE  FALSE
##  [59,] FALSE  FALSE  FALSE
##  [60,] FALSE  FALSE  FALSE
##  [61,] FALSE  FALSE  FALSE
##  [62,] FALSE  FALSE  FALSE
##  [63,] FALSE   TRUE  FALSE
##  [64,] FALSE  FALSE  FALSE
##  [65,] FALSE  FALSE  FALSE
##  [66,] FALSE  FALSE  FALSE
##  [67,] FALSE  FALSE  FALSE
##  [68,] FALSE  FALSE  FALSE
##  [69,] FALSE   TRUE  FALSE
##  [70,] FALSE  FALSE  FALSE
##  [71,] FALSE   TRUE  FALSE
##  [72,] FALSE  FALSE  FALSE
##  [73,] FALSE  FALSE  FALSE
##  [74,] FALSE  FALSE  FALSE
##  [75,] FALSE  FALSE  FALSE
##  [76,] FALSE  FALSE  FALSE
##  [77,] FALSE  FALSE  FALSE
##  [78,] FALSE  FALSE  FALSE
##  [79,] FALSE  FALSE  FALSE
##  [80,] FALSE  FALSE   TRUE
##  [81,] FALSE  FALSE  FALSE
##  [82,] FALSE  FALSE  FALSE
##  [83,] FALSE  FALSE  FALSE
##  [84,] FALSE  FALSE  FALSE
##  [85,] FALSE  FALSE  FALSE
##  [86,] FALSE  FALSE  FALSE
##  [87,] FALSE  FALSE  FALSE
##  [88,] FALSE  FALSE  FALSE
##  [89,] FALSE  FALSE  FALSE
##  [90,] FALSE  FALSE  FALSE
##  [91,] FALSE  FALSE  FALSE
##  [92,] FALSE  FALSE  FALSE
##  [93,] FALSE  FALSE  FALSE
##  [94,] FALSE  FALSE  FALSE
##  [95,] FALSE  FALSE  FALSE
##  [96,] FALSE  FALSE  FALSE
##  [97,] FALSE  FALSE  FALSE
##  [98,] FALSE  FALSE  FALSE
##  [99,] FALSE  FALSE  FALSE
## [100,] FALSE  FALSE  FALSE
## [101,] FALSE  FALSE  FALSE
## [102,] FALSE  FALSE  FALSE
## [103,] FALSE  FALSE  FALSE
## [104,] FALSE  FALSE  FALSE
## [105,] FALSE  FALSE  FALSE
## [106,] FALSE  FALSE   TRUE
## [107,] FALSE  FALSE  FALSE
## [108,] FALSE  FALSE  FALSE
## [109,] FALSE  FALSE  FALSE
## [110,] FALSE  FALSE  FALSE
## [111,] FALSE  FALSE  FALSE
## [112,] FALSE  FALSE  FALSE
## [113,] FALSE  FALSE  FALSE
## [114,] FALSE  FALSE  FALSE
## [115,] FALSE  FALSE   TRUE
## [116,] FALSE  FALSE  FALSE
## [117,] FALSE  FALSE  FALSE
## [118,] FALSE  FALSE  FALSE
## [119,] FALSE  FALSE  FALSE
## [120,] FALSE  FALSE   TRUE
## [121,] FALSE  FALSE  FALSE
## [122,] FALSE  FALSE  FALSE
## [123,] FALSE  FALSE  FALSE
## [124,] FALSE  FALSE  FALSE
## [125,] FALSE  FALSE  FALSE
## [126,] FALSE  FALSE  FALSE
## [127,] FALSE  FALSE  FALSE
## [128,] FALSE  FALSE  FALSE
## [129,] FALSE  FALSE  FALSE
## [130,] FALSE  FALSE  FALSE
## [131,] FALSE  FALSE  FALSE
## [132,] FALSE  FALSE  FALSE
## [133,] FALSE  FALSE  FALSE
## [134,] FALSE  FALSE  FALSE
## [135,] FALSE  FALSE  FALSE
## [136,] FALSE  FALSE   TRUE
## [137,] FALSE  FALSE  FALSE
## [138,] FALSE  FALSE  FALSE
## [139,] FALSE  FALSE  FALSE
## [140,] FALSE  FALSE  FALSE
## [141,] FALSE  FALSE  FALSE
## [142,] FALSE  FALSE  FALSE
## [143,] FALSE  FALSE  FALSE
## [144,] FALSE  FALSE   TRUE
## [145,] FALSE  FALSE  FALSE
## [146,] FALSE  FALSE  FALSE
## [147,] FALSE  FALSE  FALSE
## [148,] FALSE  FALSE  FALSE
## [149,] FALSE  FALSE  FALSE
## [150,] FALSE  FALSE   TRUE

# Number of missing values in every column.
vapply(reviews, function(column) sum(is.na(column)), numeric(1))

##   date rating review 
##      0     10     10

# Rows that have at least one missing value.
reviews[rowSums(is.na(reviews)) > 0, ]

## # A tibble: 20 × 3
##    date       rating review                                                     
##    <date>      <dbl> <chr>                                                      
##  1 2024-08-09     NA The setting is beautiful, but the food left much to be des…
##  2 2024-09-08     NA Had a disappointing meal here. The pasta was overcooked, a…
##  3 2024-09-02     NA This place never disappoints! The food is always top-notch…
##  4 2024-09-04     NA The experience was mediocre. The appetizers were cold, and…
##  5 2024-06-04     NA This is one of my favorite spots in town. The food is alwa…
##  6 2024-06-13      1 <NA>                                                       
##  7 2024-06-01     NA This is one of my favorite spots in town. The food is alwa…
##  8 2024-07-19      3 <NA>                                                       
##  9 2024-06-11     NA The location is convenient, but the food is nothing specia…
## 10 2024-06-15      2 <NA>                                                       
## 11 2024-07-24     NA Had a lovely evening here. The staff was attentive, and th…
## 12 2024-06-18     NA The location is convenient, but the food is nothing specia…
## 13 2024-08-27     NA This is one of my favorite spots in town. The food is alwa…
## 14 2024-08-09      5 <NA>                                                       
## 15 2024-09-03      1 <NA>                                                       
## 16 2024-07-24      2 <NA>                                                       
## 17 2024-06-06      2 <NA>                                                       
## 18 2024-06-05      3 <NA>                                                       
## 19 2024-06-06      1 <NA>                                                       
## 20 2024-08-31      4 <NA>

# Without na.rm the missing ratings propagate, so the mean is NA.
mean(reviews$rating)

## [1] NA

# Mean rating computed from the non-missing ratings only.
mean(reviews$rating, na.rm = TRUE)

## [1] 2.621429

# Standard deviation of the non-missing ratings.
sd(reviews$rating, na.rm = TRUE)

## [1] 1.597966

# Earliest review date in the data.
min(reviews$date)

## [1] "2024-06-01"

# Latest review date in the data.
max(reviews$date)

## [1] "2024-09-10"

# Numeric month of every review, extracted from its date.
reviews$miesiac <- as.numeric(format(reviews$date, format = "%m"))

# Mean rating per month, computed from the non-missing ratings.
summarise(
  group_by(reviews, miesiac),
  srednia_ocena = mean(rating, na.rm = TRUE)
)

## # A tibble: 4 × 2
##   miesiac srednia_ocena
##     <dbl>         <dbl>
## 1       6          3.02
## 2       7          2.14
## 3       8          2.71
## 4       9          2.36

Analiza graficzna

# Box plot of all ratings as a single box. The x axis carries no information
# here, so its labels and tick marks are hidden.
ggplot(data = reviews, mapping = aes(y = rating)) +
  geom_boxplot(fill = "blue2") +
  labs(y = "Ocena") +
  theme_minimal(base_size = 8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

# Histogram of ratings with one bar per rating value (bin width 1) and an
# axis break at every integer from 0 to 5.
ggplot(data = reviews, mapping = aes(x = rating)) +
  geom_histogram(fill = "blue2", binwidth = 1) +
  scale_x_continuous(breaks = 0:5) +
  labs(x = "Ocena", y = "Liczba") +
  theme_minimal(base_size = 8)

# Kernel density estimate of the ratings.
# The x axis shows the rating and the y axis the estimated density, so the
# axes are labelled accordingly ("Ocena" / "Gęstość").
ggplot(reviews, aes(x = rating)) +
  geom_density(fill = "blue2") +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Gęstość")

# Box plots of ratings, one box per month. The fill colour repeats the
# information on the x axis, so the legend is switched off.
# "none" is the documented value for hiding the legend.
ggplot(
  reviews,
  aes(x = as.factor(miesiac), y = rating, fill = as.factor(miesiac))
) +
  geom_boxplot() +
  theme_minimal(base_size = 8) +
  labs(x = "Miesiąc", y = "Oceny") +
  theme(legend.position = "none")

Analiza tekstu

Tokenizacja

# Give every review a running identifier so that tokens can be traced back to
# their review. seq_len() is used instead of 1:nrow() because it also yields a
# valid (empty) sequence for a data set without rows.
reviews$id <- seq_len(nrow(reviews))

# Split the review text into single words: one row per word, stored in the
# new column `word`.
tokeny <- reviews %>%
  unnest_tokens(word, review)

# Word frequencies, most frequent first.
tokeny %>%
  count(word, sort = TRUE)

## # A tibble: 237 × 2
##    word      n
##    <chr> <int>
##  1 the     589
##  2 was     251
##  3 and     194
##  4 a       132
##  5 food    113
##  6 is       92
##  7 we       83
##  8 but      80
##  9 were     80
## 10 for      71
## # ℹ 227 more rows

# Word frequencies, printing the 20 most frequent words.
tokeny %>%
  count(word, sort = TRUE) %>%
  print(n = 20)

## # A tibble: 237 × 2
##    word           n
##    <chr>      <int>
##  1 the          589
##  2 was          251
##  3 and          194
##  4 a            132
##  5 food         113
##  6 is            92
##  7 we            83
##  8 but           80
##  9 were          80
## 10 for           71
## 11 to            64
## 12 experience    52
## 13 of            51
## 14 staff         51
## 15 had           49
## 16 it            37
## 17 service       36
## 18 appetizers    34
## 19 very          34
## 20 with          34
## # ℹ 217 more rows

Usunięcie stop-words

# Copy the `stop_words` lexicon into the workspace as a regular object.
stop_words <- stop_words

# Keep only informative tokens:
#   * drop NA tokens -- reviews with missing text yield an NA token, which
#     would otherwise be counted as a frequent "word",
#   * keep words that occur more than 5 times in the whole corpus,
#   * remove stop words (join key named explicitly instead of being guessed).
tokeny <- tokeny %>%
  filter(!is.na(word)) %>%
  group_by(word) %>%
  filter(n() > 5) %>%
  ungroup() %>%
  anti_join(stop_words, by = "word")


# Word frequencies after filtering, printing the 20 most frequent words.
tokeny %>%
  count(word, sort = TRUE) %>%
  print(n = 20)

## # A tibble: 110 × 2
##    word           n
##    <chr>      <int>
##  1 food         113
##  2 experience    52
##  3 staff         51
##  4 service       36
##  5 appetizers    34
##  6 friendly      33
##  7 atmosphere    29
##  8 evening       24
##  9 cold          22
## 10 meal          22
## 11 loved         21
## 12 dinner        20
## 13 left          20
## 14 notch         20
## 15 top           20
## 16 generous      19
## 17 portions      19
## 18 special       19
## 19 bland         18
## 20 dishes        18
## # ℹ 90 more rows

# Frequency table of the remaining words, with each word's share of all
# counted tokens expressed in percent (`proc`).
word_count <- tokeny %>%
  count(word, sort = TRUE) %>%
  mutate(proc = n / sum(n) * 100)

# Word cloud of words occurring at least 10 times (at most 80 words).
# The seed fixes the random layout so the picture is reproducible.
set.seed(1)
with(
  word_count,
  wordcloud(
    words = word,
    freq = n,
    min.freq = 10,
    max.words = 80,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
)

# Ten most frequent words for every rating, drawn as one bar chart per rating.
tokeny %>%
  count(rating, word, sort = TRUE) %>%
  group_by(rating) %>%
  # slice_max() replaces the superseded top_n(); the ranking column is named
  # explicitly instead of being picked implicitly. Ties are kept.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # reorder_within() + scale_x_reordered() sort the bars inside each facet.
  ggplot(aes(reorder_within(word, n, rating), n,
             fill = as.factor(rating))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  coord_flip() +
  facet_wrap(~rating, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(fill = "Ocena",
       x = "wyrazy",
       y = "n") +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))

# Classify every token by the rating of its review:
# above 3 -> "positive", below 2 -> "negative", anything else -> "neutral".
# Tokens from reviews without a rating stay NA.
tokeny$fill <- case_when(
  is.na(tokeny$rating) ~ NA_character_,
  tokeny$rating > 3 ~ "positive",
  tokeny$rating < 2 ~ "negative",
  TRUE ~ "neutral"
)


# Ten most frequent words for every rating class stored in `fill`,
# drawn as one bar chart per class.
tokeny %>%
  count(fill, word, sort = TRUE) %>%
  group_by(fill) %>%
  # slice_max() replaces the superseded top_n(); the ranking column is named
  # explicitly instead of being picked implicitly. Ties are kept.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # reorder_within() + scale_x_reordered() sort the bars inside each facet.
  ggplot(aes(reorder_within(word, n, fill), n,
             fill = as.factor(fill))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  coord_flip() +
  facet_wrap(~fill, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(fill = "Ocena",
       x = "wyrazy",
       y = "n") +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))

Forma podstawowa

# Return the base form (stem) of a single word using hunspell.
#
# Arguments:
#   term  a character string holding one word.
#
# Value: a character string
#   * NA if `term` is missing/empty or hunspell proposes no stem,
#   * the last stem proposed by hunspell if it has more than one character,
#   * `term` itself otherwise.
stem_hunspell <- function(term) {
  # A missing or empty word has no stem; answer without a dictionary lookup.
  if (length(term) == 0 || is.na(term[[1]])) {
    return(NA_character_)
  }

  stems <- hunspell_stem(term)[[1]]

  # NA_character_ (not logical NA) keeps the return type stable.
  if (length(stems) == 0) {
    return(NA_character_)
  }

  last_stem <- stems[[length(stems)]]
  if (nchar(last_stem) > 1) {
    last_stem
  } else {
    term
  }
}



## find the stem of every word in the frequency table
words.stem <- lapply(word_count$word, stem_hunspell)


stem.list <- cbind(word_count, stem = unlist(words.stem))

# Keep only the words whose stem is known and differs from the word itself.
# Columns are addressed by name: `word_count` also carries the `proc` column,
# so a positional index of 3 points at `proc`, not at `stem`. The explicit
# is.na() checks keep NA out of the row index.
stem.list <- stem.list[
  !is.na(stem.list$word) &
    !is.na(stem.list$stem) &
    stem.list$word != stem.list$stem,
]

stem.list

##              word   n      proc         stem
## 1            food 113 7.4293228         food
## 2      experience  52 3.4188034   experience
## 3           staff  51 3.3530572        staff
## 4         service  36 2.3668639      service
## 5      appetizers  34 2.2353715    appetizer
## 6        friendly  33 2.1696252       friend
## 7      atmosphere  29 1.9066404   atmosphere
## 8         evening  24 1.5779093         even
## 9            cold  22 1.4464168         cold
## 10           meal  22 1.4464168         meal
## 11          loved  21 1.3806706         love
## 12         dinner  20 1.3149244       dinner
## 13           left  20 1.3149244         left
## 14          notch  20 1.3149244        notch
## 15            top  20 1.3149244          top
## 16       generous  19 1.2491782     generous
## 17       portions  19 1.2491782      portion
## 18        special  19 1.2491782      special
## 19          bland  18 1.1834320        bland
## 20         dishes  18 1.1834320         dish
## 21     overcooked  18 1.1834320     overcook
## 22       ambiance  17 1.1176857     ambiance
## 23      attentive  17 1.1176857    attentive
## 24        dessert  17 1.1176857      dessert
## 25        friends  17 1.1176857       friend
## 26        perfect  17 1.1176857      perfect
## 27     restaurant  17 1.1176857   restaurant
## 28          crowd  15 0.9861933        crowd
## 29         highly  15 0.9861933         high
## 30         lacked  15 0.9861933         lack
## 31           main  15 0.9861933         main
## 32       mediocre  15 0.9861933     mediocre
## 33    overwhelmed  15 0.9861933    overwhelm
## 34      recommend  15 0.9861933      commend
## 35      seasoning  15 0.9861933       season
## 36        variety  15 0.9861933      variety
## 37      delicious  14 0.9204471    delicious
## 38           time  14 0.9204471         time
## 39  disappointing  13 0.8547009      appoint
## 40        amazing  12 0.7889546        amaze
## 41          meals  12 0.7889546         meal
## 42          steak  12 0.7889546        steak
## 43            bit  11 0.7232084          bit
## 44         coming  11 0.7232084         come
## 45           cozy  11 0.7232084         cozy
## 46    disappoints  11 0.7232084      appoint
## 47       favorite  11 0.7232084     favorite
## 48           live  11 0.7232084         live
## 49           love  11 0.7232084         love
## 50       occasion  11 0.7232084     occasion
## 51   professional  11 0.7232084 professional
## 52        relaxed  11 0.7232084        relax
## 53       romantic  11 0.7232084     romantic
## 54          spots  11 0.7232084         spot
## 55           town  11 0.7232084         town
## 56      beautiful  10 0.6574622    beautiful
## 57        desired  10 0.6574622         sire
## 58       expected  10 0.6574622       expect
## 59        feeling  10 0.6574622         feel
## 60         lovely  10 0.6574622         love
## 61        minutes  10 0.6574622       minute
## 62          price  10 0.6574622        price
## 63      satisfied  10 0.6574622      satisfy
## 64        setting  10 0.6574622         sett
## 65           wait  10 0.6574622         wait
## 66         waiter  10 0.6574622         wait
## NA           <NA>  NA        NA         <NA>
## 68           bold   9 0.5917160         bold
## 69        flavors   9 0.5917160       flavor
## 70           menu   9 0.5917160         menu
## 71   presentation   9 0.5917160 presentation
## 72         burger   8 0.5259698         burg
## 73     convenient   8 0.5259698   convenient
## 74        courses   8 0.5259698       course
## 75         didn’t   8 0.5259698       didn't
## 76            dry   8 0.5259698          dry
## 77   expectations   8 0.5259698  expectation
## 78          fries   8 0.5259698          fry
## 79      highlight   8 0.5259698    highlight
## 80     lackluster   8 0.5259698   lackluster
## 81       location   8 0.5259698       locate
## 82     overpriced   8 0.5259698    overprice
## 83          pasta   8 0.5259698        pasta
## 84       pleasant   8 0.5259698     pleasant
## 85           rest   8 0.5259698           re
## 86          salty   8 0.5259698        salty
## 87          sauce   8 0.5259698        sauce
## 88          soggy   8 0.5259698        soggy
## 89         arrive   7 0.4602235       arrive
## 90    beautifully   7 0.4602235    beautiful
## 91           chef   7 0.4602235         chef
## 92          clean   7 0.4602235        clean
## 93         dining   7 0.4602235          din
## 94         finish   7 0.4602235       finish
## 95      flavorful   7 0.4602235    flavorful
## 96          greet   7 0.4602235        greet
## 97         polite   7 0.4602235       polite
## 98      potential   7 0.4602235    potential
## 99          start   7 0.4602235        start
## 100     wonderful   7 0.4602235    wonderful
## 101        casual   6 0.3944773       casual
## 102        choice   6 0.3944773       choice
## 103        cooked   6 0.3944773         cook
## 104     excellent   6 0.3944773    excellent
## 105           lot   6 0.3944773          lot
## 106   outstanding   6 0.3944773  outstanding
## 107    perfection   6 0.3944773   perfection
## 108         quick   6 0.3944773        quick
## 109         quiet   6 0.3944773        quiet
## 110         solid   6 0.3944773        solid

Bigramy

# Count every pair of adjacent words (bigram) over all reviews,
# most frequent first.
bigramy <- count(
  unnest_tokens(reviews, bigram, review, token = "ngrams", n = 2),
  bigram,
  sort = TRUE
)

# Split each bigram into its two words, then drop pairs in which either word
# is a stop word as well as rows without a bigram.
bigramy <- bigramy %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !(word1 %in% stop_words$word | word2 %in% stop_words$word),
    !is.na(word1)
  )



# Horizontal bar chart of the ten most frequent bigrams.
bigramy %>%
  unite(word, word1, word2, sep = " ") %>%
  slice_max(order_by = n, n = 10) %>%
  ggplot(mapping = aes(x = reorder(word, n), y = n, fill = word)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_brewer(palette = "Set3") +  # colour palette
  scale_y_continuous(expand = c(0, 0)) +
  labs(title = "Top 10 bigramów", x = "Bigram", y = "Liczba wystąpień") +
  theme_minimal(base_size = 10)

# Count every sequence of three adjacent words (trigram) over all reviews,
# most frequent first.
trigramy <- count(
  unnest_tokens(reviews, trigram, review, token = "ngrams", n = 3),
  trigram,
  sort = TRUE
)

Analiza sentymentu

## AFINN
# AFINN lexicon: a numeric sentiment score (`value`) per word.
afinn <- get_sentiments("afinn")

# Tokens that have an AFINN score; the join key is named explicitly.
sentiment <- tokeny %>%
  inner_join(afinn, by = "word")

# Total AFINN score per review, reusing the join above instead of repeating it.
sentiment2 <- sentiment %>%
  group_by(id) %>%
  summarise(sentiment = sum(value))

# Attach the score to the reviews; reviews without any scored token get NA.
reviews <- full_join(reviews, sentiment2, by = "id")

# Scatter plot of the AFINN sentiment score against the rating.
ggplot(data = reviews, mapping = aes(x = rating, y = sentiment)) +
  geom_point() +
  labs(x = "Ocena", y = "Sentyment") +
  theme_minimal(base_size = 8)

# Box plots of the AFINN sentiment score, one box per rating. The fill colour
# repeats the x axis, so the legend is switched off.
# "none" is the documented value for hiding the legend.
ggplot(
  reviews,
  aes(x = as.factor(rating), y = sentiment, fill = as.factor(rating))
) +
  geom_boxplot() +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Sentyment") +
  theme(legend.position = "none")

## Bing
# Bing lexicon: every word is labelled "positive" or "negative".
bing <- get_sentiments("bing")
# Turn the label into a numeric score: +1 for positive, -1 otherwise.
bing$value <- ifelse(bing$sentiment == "positive", 1, -1)

# Tokens that appear in the Bing lexicon; the join key is named explicitly.
sentiment <- tokeny %>%
  inner_join(bing, by = "word")

# Total Bing score per review, reusing the join above instead of repeating it.
sentiment2 <- sentiment %>%
  group_by(id) %>%
  summarise(sentiment_bing = sum(value))

# Attach the score to the reviews; reviews without any scored token get NA.
reviews <- full_join(reviews, sentiment2, by = "id")

# Scatter plot of the Bing sentiment score against the rating.
ggplot(data = reviews, mapping = aes(x = rating, y = sentiment_bing)) +
  geom_point() +
  labs(x = "Ocena", y = "Sentyment") +
  theme_minimal(base_size = 8)

# Box plots of the Bing sentiment score, one box per rating. The fill colour
# repeats the x axis, so the legend is switched off.
# "none" is the documented value for hiding the legend.
ggplot(
  reviews,
  aes(x = as.factor(rating), y = sentiment_bing, fill = as.factor(rating))
) +
  geom_boxplot() +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Sentyment") +
  theme(legend.position = "none")

# Compare the two lexicons: AFINN score on x, Bing score on y.
# First as a scatter plot of the individual reviews ...
ggplot(data = reviews, mapping = aes(x = sentiment, y = sentiment_bing)) +
  geom_point()

# ... then as a 2D bin count.
ggplot(data = reviews, mapping = aes(x = sentiment, y = sentiment_bing)) +
  geom_bin2d()