Analiza tekstu
Część pierwsza - zapoznanie z danymi
library(readr)
library(ggplot2)
library(dplyr)
library(tidytext)
library(wordcloud)
library(hunspell)
library(tidyr)
# Load the restaurant reviews (columns: date, rating, review).
# NOTE(review): the path is absolute and machine-specific; anyone else running
# this script has to point it at their own copy of reviews.csv.
reviews <- read_csv("C:/Users/mbuko/Downloads/reviews.csv")

# View() opens a GUI data viewer, so only call it in an interactive session;
# a non-interactive run (Rscript, knitting) has nothing to show it in.
if (interactive()) {
  View(reviews)
}

head(reviews)
## # A tibble: 6 × 3
## date rating review
## <date> <dbl> <chr>
## 1 2024-08-25 0 The experience was mediocre. The appetizers were cold, and …
## 2 2024-08-11 5 This place never disappoints! The food is always top-notch,…
## 3 2024-08-09 NA The setting is beautiful, but the food left much to be desi…
## 4 2024-08-09 1 This place never disappoints! The food is always top-notch,…
## 5 2024-09-08 NA Had a disappointing meal here. The pasta was overcooked, an…
## 6 2024-06-17 5 The restaurant was clean, and the staff was polite. However…
## [1] "2024-08-25" "2024-08-11" "2024-08-09" "2024-08-09" "2024-09-08"
## [6] "2024-06-17" "2024-06-26" "2024-09-07" "2024-07-23" "2024-08-09"
## [11] "2024-09-01" "2024-08-04" "2024-07-24" "2024-07-14" "2024-06-07"
## [16] "2024-07-02" "2024-07-02" "2024-08-21" "2024-06-15" "2024-07-07"
## [21] "2024-09-02" "2024-07-19" "2024-06-30" "2024-07-25" "2024-09-04"
## [26] "2024-07-04" "2024-09-02" "2024-06-08" "2024-07-04" "2024-06-17"
## [31] "2024-08-20" "2024-06-26" "2024-06-04" "2024-07-17" "2024-06-18"
## [36] "2024-07-16" "2024-06-02" "2024-07-04" "2024-06-22" "2024-06-13"
## [41] "2024-07-30" "2024-09-10" "2024-06-01" "2024-07-30" "2024-07-19"
## [46] "2024-09-02" "2024-06-11" "2024-08-04" "2024-07-05" "2024-06-18"
## [51] "2024-06-15" "2024-08-14" "2024-08-28" "2024-06-06" "2024-06-08"
## [56] "2024-07-03" "2024-06-23" "2024-06-17" "2024-09-02" "2024-06-25"
## [61] "2024-07-02" "2024-07-15" "2024-07-24" "2024-07-16" "2024-07-12"
## [66] "2024-07-22" "2024-06-15" "2024-06-12" "2024-06-18" "2024-06-13"
## [71] "2024-08-27" "2024-06-20" "2024-07-16" "2024-06-01" "2024-07-01"
## [76] "2024-07-13" "2024-08-07" "2024-08-13" "2024-07-08" "2024-08-09"
## [81] "2024-07-10" "2024-06-09" "2024-06-04" "2024-09-08" "2024-07-28"
## [86] "2024-06-10" "2024-06-14" "2024-07-07" "2024-07-20" "2024-06-04"
## [91] "2024-08-15" "2024-08-28" "2024-07-20" "2024-08-29" "2024-08-22"
## [96] "2024-08-18" "2024-07-20" "2024-06-07" "2024-07-11" "2024-07-29"
## [101] "2024-08-25" "2024-06-18" "2024-08-07" "2024-06-14" "2024-06-15"
## [106] "2024-09-03" "2024-06-27" "2024-08-02" "2024-06-24" "2024-07-08"
## [111] "2024-08-03" "2024-06-01" "2024-08-20" "2024-08-26" "2024-07-24"
## [116] "2024-09-09" "2024-06-12" "2024-06-09" "2024-08-23" "2024-06-06"
## [121] "2024-08-23" "2024-07-21" "2024-06-20" "2024-06-16" "2024-08-03"
## [126] "2024-08-22" "2024-08-13" "2024-06-20" "2024-09-07" "2024-08-06"
## [131] "2024-06-09" "2024-09-02" "2024-07-04" "2024-06-06" "2024-08-12"
## [136] "2024-06-05" "2024-07-10" "2024-07-09" "2024-07-16" "2024-06-04"
## [141] "2024-06-05" "2024-08-14" "2024-08-25" "2024-06-06" "2024-06-18"
## [146] "2024-06-13" "2024-07-31" "2024-08-27" "2024-06-07" "2024-08-31"
## [1] 0 5 NA 1 NA 5 4 4 0 5 3 3 2 3 4 0 1 4 2 3 NA 1 3 2 NA
## [26] 1 3 3 1 1 1 1 NA 1 1 4 4 2 3 1 2 1 NA 5 3 3 NA 1 2 1
## [51] 2 4 4 4 2 1 5 4 0 5 1 0 NA 5 2 3 4 5 NA 1 NA 4 2 5 0
## [76] 1 4 2 5 5 5 4 5 2 2 4 4 1 1 1 2 1 4 1 2 4 3 5 3 5
## [101] 1 2 5 2 3 1 2 4 5 5 0 1 2 2 2 5 3 0 1 2 3 0 3 3 1
## [126] 1 3 3 3 5 3 1 1 1 5 3 5 0 0 4 5 2 3 1 5 2 2 4 4 4
## # A tibble: 1 × 1
## review
## <chr>
## 1 The experience was mediocre. The appetizers were cold, and the main course la…
## # A tibble: 5 × 1
## rating
## <dbl>
## 1 0
## 2 5
## 3 NA
## 4 1
## 5 NA
## # A tibble: 1 × 3
## date rating review
## <date> <dbl> <chr>
## 1 2024-08-09 NA The setting is beautiful, but the food left much to be desi…
## # A tibble: 150 × 1
## rating
## <dbl>
## 1 0
## 2 5
## 3 NA
## 4 1
## 5 NA
## 6 5
## 7 4
## 8 4
## 9 0
## 10 5
## # ℹ 140 more rows
## # A tibble: 150 × 1
## rating
## <dbl>
## 1 0
## 2 5
## 3 NA
## 4 1
## 5 NA
## 6 5
## 7 4
## 8 4
## 9 0
## 10 5
## # ℹ 140 more rows
## spc_tbl_ [150 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : Date[1:150], format: "2024-08-25" "2024-08-11" ...
## $ rating: num [1:150] 0 5 NA 1 NA 5 4 4 0 5 ...
## $ review: chr [1:150] "The experience was mediocre. The appetizers were cold, and the main course lacked seasoning. The staff was frie"| __truncated__ "This place never disappoints! The food is always top-notch, and the staff is very professional. The ambiance is"| __truncated__ "The setting is beautiful, but the food left much to be desired. The steak was overcooked, and the side dishes w"| __truncated__ "This place never disappoints! The food is always top-notch, and the staff is very professional. The ambiance is"| __truncated__ ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_date(format = ""),
## .. rating = col_double(),
## .. review = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
## date rating review
## [1,] FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE
## [3,] FALSE TRUE FALSE
## [4,] FALSE FALSE FALSE
## [5,] FALSE TRUE FALSE
## [6,] FALSE FALSE FALSE
## [7,] FALSE FALSE FALSE
## [8,] FALSE FALSE FALSE
## [9,] FALSE FALSE FALSE
## [10,] FALSE FALSE FALSE
## [11,] FALSE FALSE FALSE
## [12,] FALSE FALSE FALSE
## [13,] FALSE FALSE FALSE
## [14,] FALSE FALSE FALSE
## [15,] FALSE FALSE FALSE
## [16,] FALSE FALSE FALSE
## [17,] FALSE FALSE FALSE
## [18,] FALSE FALSE FALSE
## [19,] FALSE FALSE FALSE
## [20,] FALSE FALSE FALSE
## [21,] FALSE TRUE FALSE
## [22,] FALSE FALSE FALSE
## [23,] FALSE FALSE FALSE
## [24,] FALSE FALSE FALSE
## [25,] FALSE TRUE FALSE
## [26,] FALSE FALSE FALSE
## [27,] FALSE FALSE FALSE
## [28,] FALSE FALSE FALSE
## [29,] FALSE FALSE FALSE
## [30,] FALSE FALSE FALSE
## [31,] FALSE FALSE FALSE
## [32,] FALSE FALSE FALSE
## [33,] FALSE TRUE FALSE
## [34,] FALSE FALSE FALSE
## [35,] FALSE FALSE FALSE
## [36,] FALSE FALSE FALSE
## [37,] FALSE FALSE FALSE
## [38,] FALSE FALSE FALSE
## [39,] FALSE FALSE FALSE
## [40,] FALSE FALSE TRUE
## [41,] FALSE FALSE FALSE
## [42,] FALSE FALSE FALSE
## [43,] FALSE TRUE FALSE
## [44,] FALSE FALSE FALSE
## [45,] FALSE FALSE TRUE
## [46,] FALSE FALSE FALSE
## [47,] FALSE TRUE FALSE
## [48,] FALSE FALSE FALSE
## [49,] FALSE FALSE FALSE
## [50,] FALSE FALSE FALSE
## [51,] FALSE FALSE TRUE
## [52,] FALSE FALSE FALSE
## [53,] FALSE FALSE FALSE
## [54,] FALSE FALSE FALSE
## [55,] FALSE FALSE FALSE
## [56,] FALSE FALSE FALSE
## [57,] FALSE FALSE FALSE
## [58,] FALSE FALSE FALSE
## [59,] FALSE FALSE FALSE
## [60,] FALSE FALSE FALSE
## [61,] FALSE FALSE FALSE
## [62,] FALSE FALSE FALSE
## [63,] FALSE TRUE FALSE
## [64,] FALSE FALSE FALSE
## [65,] FALSE FALSE FALSE
## [66,] FALSE FALSE FALSE
## [67,] FALSE FALSE FALSE
## [68,] FALSE FALSE FALSE
## [69,] FALSE TRUE FALSE
## [70,] FALSE FALSE FALSE
## [71,] FALSE TRUE FALSE
## [72,] FALSE FALSE FALSE
## [73,] FALSE FALSE FALSE
## [74,] FALSE FALSE FALSE
## [75,] FALSE FALSE FALSE
## [76,] FALSE FALSE FALSE
## [77,] FALSE FALSE FALSE
## [78,] FALSE FALSE FALSE
## [79,] FALSE FALSE FALSE
## [80,] FALSE FALSE TRUE
## [81,] FALSE FALSE FALSE
## [82,] FALSE FALSE FALSE
## [83,] FALSE FALSE FALSE
## [84,] FALSE FALSE FALSE
## [85,] FALSE FALSE FALSE
## [86,] FALSE FALSE FALSE
## [87,] FALSE FALSE FALSE
## [88,] FALSE FALSE FALSE
## [89,] FALSE FALSE FALSE
## [90,] FALSE FALSE FALSE
## [91,] FALSE FALSE FALSE
## [92,] FALSE FALSE FALSE
## [93,] FALSE FALSE FALSE
## [94,] FALSE FALSE FALSE
## [95,] FALSE FALSE FALSE
## [96,] FALSE FALSE FALSE
## [97,] FALSE FALSE FALSE
## [98,] FALSE FALSE FALSE
## [99,] FALSE FALSE FALSE
## [100,] FALSE FALSE FALSE
## [101,] FALSE FALSE FALSE
## [102,] FALSE FALSE FALSE
## [103,] FALSE FALSE FALSE
## [104,] FALSE FALSE FALSE
## [105,] FALSE FALSE FALSE
## [106,] FALSE FALSE TRUE
## [107,] FALSE FALSE FALSE
## [108,] FALSE FALSE FALSE
## [109,] FALSE FALSE FALSE
## [110,] FALSE FALSE FALSE
## [111,] FALSE FALSE FALSE
## [112,] FALSE FALSE FALSE
## [113,] FALSE FALSE FALSE
## [114,] FALSE FALSE FALSE
## [115,] FALSE FALSE TRUE
## [116,] FALSE FALSE FALSE
## [117,] FALSE FALSE FALSE
## [118,] FALSE FALSE FALSE
## [119,] FALSE FALSE FALSE
## [120,] FALSE FALSE TRUE
## [121,] FALSE FALSE FALSE
## [122,] FALSE FALSE FALSE
## [123,] FALSE FALSE FALSE
## [124,] FALSE FALSE FALSE
## [125,] FALSE FALSE FALSE
## [126,] FALSE FALSE FALSE
## [127,] FALSE FALSE FALSE
## [128,] FALSE FALSE FALSE
## [129,] FALSE FALSE FALSE
## [130,] FALSE FALSE FALSE
## [131,] FALSE FALSE FALSE
## [132,] FALSE FALSE FALSE
## [133,] FALSE FALSE FALSE
## [134,] FALSE FALSE FALSE
## [135,] FALSE FALSE FALSE
## [136,] FALSE FALSE TRUE
## [137,] FALSE FALSE FALSE
## [138,] FALSE FALSE FALSE
## [139,] FALSE FALSE FALSE
## [140,] FALSE FALSE FALSE
## [141,] FALSE FALSE FALSE
## [142,] FALSE FALSE FALSE
## [143,] FALSE FALSE FALSE
## [144,] FALSE FALSE TRUE
## [145,] FALSE FALSE FALSE
## [146,] FALSE FALSE FALSE
## [147,] FALSE FALSE FALSE
## [148,] FALSE FALSE FALSE
## [149,] FALSE FALSE FALSE
## [150,] FALSE FALSE TRUE
## date rating review
## 0 10 10
## # A tibble: 20 × 3
## date rating review
## <date> <dbl> <chr>
## 1 2024-08-09 NA The setting is beautiful, but the food left much to be des…
## 2 2024-09-08 NA Had a disappointing meal here. The pasta was overcooked, a…
## 3 2024-09-02 NA This place never disappoints! The food is always top-notch…
## 4 2024-09-04 NA The experience was mediocre. The appetizers were cold, and…
## 5 2024-06-04 NA This is one of my favorite spots in town. The food is alwa…
## 6 2024-06-13 1 <NA>
## 7 2024-06-01 NA This is one of my favorite spots in town. The food is alwa…
## 8 2024-07-19 3 <NA>
## 9 2024-06-11 NA The location is convenient, but the food is nothing specia…
## 10 2024-06-15 2 <NA>
## 11 2024-07-24 NA Had a lovely evening here. The staff was attentive, and th…
## 12 2024-06-18 NA The location is convenient, but the food is nothing specia…
## 13 2024-08-27 NA This is one of my favorite spots in town. The food is alwa…
## 14 2024-08-09 5 <NA>
## 15 2024-09-03 1 <NA>
## 16 2024-07-24 2 <NA>
## 17 2024-06-06 2 <NA>
## 18 2024-06-05 3 <NA>
## 19 2024-06-06 1 <NA>
## 20 2024-08-31 4 <NA>
## [1] NA
## [1] 2.621429
## [1] 1.597966
## [1] "2024-06-01"
## [1] "2024-09-10"
# Derive the numeric calendar month from the review date, then report the
# mean rating per month with missing ratings left out of the mean.
reviews[["miesiac"]] <- as.numeric(format(reviews[["date"]], "%m"))
summarise(
  group_by(reviews, miesiac),
  srednia_ocena = mean(rating, na.rm = TRUE)
)
## # A tibble: 4 × 2
## miesiac srednia_ocena
## <dbl> <dbl>
## 1 6 3.02
## 2 7 2.14
## 3 8 2.71
## 4 9 2.36
Analiza graficzna
# Boxplot of all ratings. There is a single box, so the x axis carries no
# information and its labels and ticks are hidden.
ggplot(reviews, aes(y = rating)) +
  geom_boxplot(fill = "blue2") +
  theme_minimal(base_size = 8) +
  labs(y = "Ocena") +
  theme(
    axis.text.x = element_blank(), # hide the x-axis value labels
    axis.ticks.x = element_blank() # hide the x-axis tick marks
  )

# Histogram of ratings, one bin per rating value (0-5).
ggplot(reviews, aes(x = rating)) +
  geom_histogram(binwidth = 1, fill = "blue2") +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Liczba") +
  scale_x_continuous(breaks = seq(0, 5, by = 1))

# Density estimate of ratings. The x axis is the rating and the y axis is the
# density, so the labels are "Ocena" / "Gęstość" (they were previously
# "Gęstość" / "Liczba", i.e. attached to the wrong axes).
ggplot(reviews, aes(x = rating)) +
  geom_density(fill = "blue2") +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Gęstość")

# Rating distribution per month. The fill colour duplicates the x axis, so the
# legend is suppressed ("none" is the documented value for that).
ggplot(
  reviews,
  aes(x = as.factor(miesiac), y = rating, fill = as.factor(miesiac))
) +
  geom_boxplot() +
  theme_minimal(base_size = 8) +
  labs(x = "Miesiąc", y = "Oceny") +
  theme(legend.position = "none")

# Text analysis (Analiza tekstu) ----
Tokenizacja
# Give every review a row id so tokens can be traced back to their review.
# seq_len() is used instead of 1:nrow() so an empty table yields an empty id
# vector rather than c(1, 0).
reviews$id <- seq_len(nrow(reviews))

# Split each review text into one token (word) per row.
tokeny <- reviews %>%
  unnest_tokens(word, review)

# Word frequencies, most frequent first.
count(tokeny, word, sort = TRUE)
## # A tibble: 237 × 2
## word n
## <chr> <int>
## 1 the 589
## 2 was 251
## 3 and 194
## 4 a 132
## 5 food 113
## 6 is 92
## 7 we 83
## 8 but 80
## 9 were 80
## 10 for 71
## # ℹ 227 more rows
## # A tibble: 237 × 2
## word n
## <chr> <int>
## 1 the 589
## 2 was 251
## 3 and 194
## 4 a 132
## 5 food 113
## 6 is 92
## 7 we 83
## 8 but 80
## 9 were 80
## 10 for 71
## 11 to 64
## 12 experience 52
## 13 of 51
## 14 staff 51
## 15 had 49
## 16 it 37
## 17 service 36
## 18 appetizers 34
## 19 very 34
## 20 with 34
## # ℹ 217 more rows
Usunięcie stop-words
# Copy the stop-word table attached with tidytext into the workspace.
stop_words <- stop_words

# Keep only informative tokens:
#  * drop the NA "word" produced by reviews whose text is missing, so it does
#    not show up in the frequency tables and shares computed later;
#  * keep words that occur more than 5 times in the whole corpus;
#  * remove stop words (the join key is given explicitly to avoid the
#    "Joining with `by = ...`" message).
tokeny <- tokeny %>%
  filter(!is.na(word)) %>%
  group_by(word) %>%
  filter(n() > 5) %>%
  ungroup() %>%
  anti_join(stop_words, by = "word")

print(count(tokeny, word, sort = TRUE), n = 20)
## # A tibble: 110 × 2
## word n
## <chr> <int>
## 1 food 113
## 2 experience 52
## 3 staff 51
## 4 service 36
## 5 appetizers 34
## 6 friendly 33
## 7 atmosphere 29
## 8 evening 24
## 9 cold 22
## 10 meal 22
## 11 loved 21
## 12 dinner 20
## 13 left 20
## 14 notch 20
## 15 top 20
## 16 generous 19
## 17 portions 19
## 18 special 19
## 19 bland 18
## 20 dishes 18
## # ℹ 90 more rows
# Frequency of every remaining word and its percentage share of all tokens.
word_count <- count(tokeny, word, sort = TRUE)
word_count$proc <- word_count$n / sum(word_count$n) * 100

# Word cloud of words occurring at least 10 times (at most 80 words).
# The seed is fixed so the layout is reproducible.
set.seed(1)
wordcloud(
  words = word_count$word, freq = word_count$n, min.freq = 10,
  max.words = 80, random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# Ten most frequent words for each rating, one facet per rating.
# slice_max() replaces the superseded top_n(); ties are kept, as before.
tokeny %>%
  count(rating, word, sort = TRUE) %>%
  group_by(rating) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  ggplot(aes(reorder_within(word, n, rating), n, fill = as.factor(rating))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  coord_flip() +
  facet_wrap(~rating, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(
    fill = "Ocena",
    x = "wyrazy",
    y = "n"
  ) +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))

# Collapse ratings into three classes: above 3 is positive, below 2 is
# negative, anything else is neutral. A missing rating stays NA.
tokeny$fill <- ifelse(
  tokeny$rating > 3, "positive",
  ifelse(tokeny$rating < 2, "negative", "neutral")
)
# Ten most frequent words per sentiment class (the `fill` column), one facet
# per class. slice_max() replaces the superseded top_n(); ties are kept.
tokeny %>%
  count(fill, word, sort = TRUE) %>%
  group_by(fill) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  ggplot(aes(reorder_within(word, n, fill), n, fill = as.factor(fill))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  coord_flip() +
  facet_wrap(~fill, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(
    fill = "Ocena",
    x = "wyrazy",
    y = "n"
  ) +
  theme(plot.title = element_text(lineheight = .8, face = "bold"))

# Base form of words (Forma podstawowa) ----
# Return the stem of a single term as reported by hunspell_stem().
#
# term: a single word.
# Returns NA when hunspell_stem() offers no stem; otherwise the last stem it
# offers, unless that stem is one character or shorter, in which case the
# term itself is returned unchanged.
stem_hunspell <- function(term) {
  candidates <- hunspell_stem(term)[[1]]
  n_candidates <- length(candidates)

  # No stem was offered for this term.
  if (n_candidates == 0) {
    return(NA)
  }

  # Take the last stem offered; fall back to the term for too-short stems.
  last_candidate <- candidates[[n_candidates]]
  if (nchar(last_candidate) > 1) last_candidate else term
}
# Find the stem of every word in the frequency table.
words.stem <- lapply(word_count$word, stem_hunspell)
stem.list <- cbind(word_count, stem = unlist(words.stem))

# Keep only the words whose stem exists and differs from the word itself.
# Columns are addressed by name: the previous positional filter compared
# column 1 (word) with column 3, which is `proc` and not `stem`, so it never
# removed anything and turned a missing word into an all-NA row.
differs <- !is.na(stem.list$word) &
  !is.na(stem.list$stem) &
  stem.list$word != stem.list$stem
stem.list <- stem.list[differs, ]
stem.list
## word n proc stem
## 1 food 113 7.4293228 food
## 2 experience 52 3.4188034 experience
## 3 staff 51 3.3530572 staff
## 4 service 36 2.3668639 service
## 5 appetizers 34 2.2353715 appetizer
## 6 friendly 33 2.1696252 friend
## 7 atmosphere 29 1.9066404 atmosphere
## 8 evening 24 1.5779093 even
## 9 cold 22 1.4464168 cold
## 10 meal 22 1.4464168 meal
## 11 loved 21 1.3806706 love
## 12 dinner 20 1.3149244 dinner
## 13 left 20 1.3149244 left
## 14 notch 20 1.3149244 notch
## 15 top 20 1.3149244 top
## 16 generous 19 1.2491782 generous
## 17 portions 19 1.2491782 portion
## 18 special 19 1.2491782 special
## 19 bland 18 1.1834320 bland
## 20 dishes 18 1.1834320 dish
## 21 overcooked 18 1.1834320 overcook
## 22 ambiance 17 1.1176857 ambiance
## 23 attentive 17 1.1176857 attentive
## 24 dessert 17 1.1176857 dessert
## 25 friends 17 1.1176857 friend
## 26 perfect 17 1.1176857 perfect
## 27 restaurant 17 1.1176857 restaurant
## 28 crowd 15 0.9861933 crowd
## 29 highly 15 0.9861933 high
## 30 lacked 15 0.9861933 lack
## 31 main 15 0.9861933 main
## 32 mediocre 15 0.9861933 mediocre
## 33 overwhelmed 15 0.9861933 overwhelm
## 34 recommend 15 0.9861933 commend
## 35 seasoning 15 0.9861933 season
## 36 variety 15 0.9861933 variety
## 37 delicious 14 0.9204471 delicious
## 38 time 14 0.9204471 time
## 39 disappointing 13 0.8547009 appoint
## 40 amazing 12 0.7889546 amaze
## 41 meals 12 0.7889546 meal
## 42 steak 12 0.7889546 steak
## 43 bit 11 0.7232084 bit
## 44 coming 11 0.7232084 come
## 45 cozy 11 0.7232084 cozy
## 46 disappoints 11 0.7232084 appoint
## 47 favorite 11 0.7232084 favorite
## 48 live 11 0.7232084 live
## 49 love 11 0.7232084 love
## 50 occasion 11 0.7232084 occasion
## 51 professional 11 0.7232084 professional
## 52 relaxed 11 0.7232084 relax
## 53 romantic 11 0.7232084 romantic
## 54 spots 11 0.7232084 spot
## 55 town 11 0.7232084 town
## 56 beautiful 10 0.6574622 beautiful
## 57 desired 10 0.6574622 sire
## 58 expected 10 0.6574622 expect
## 59 feeling 10 0.6574622 feel
## 60 lovely 10 0.6574622 love
## 61 minutes 10 0.6574622 minute
## 62 price 10 0.6574622 price
## 63 satisfied 10 0.6574622 satisfy
## 64 setting 10 0.6574622 sett
## 65 wait 10 0.6574622 wait
## 66 waiter 10 0.6574622 wait
## NA <NA> NA NA <NA>
## 68 bold 9 0.5917160 bold
## 69 flavors 9 0.5917160 flavor
## 70 menu 9 0.5917160 menu
## 71 presentation 9 0.5917160 presentation
## 72 burger 8 0.5259698 burg
## 73 convenient 8 0.5259698 convenient
## 74 courses 8 0.5259698 course
## 75 didn’t 8 0.5259698 didn't
## 76 dry 8 0.5259698 dry
## 77 expectations 8 0.5259698 expectation
## 78 fries 8 0.5259698 fry
## 79 highlight 8 0.5259698 highlight
## 80 lackluster 8 0.5259698 lackluster
## 81 location 8 0.5259698 locate
## 82 overpriced 8 0.5259698 overprice
## 83 pasta 8 0.5259698 pasta
## 84 pleasant 8 0.5259698 pleasant
## 85 rest 8 0.5259698 re
## 86 salty 8 0.5259698 salty
## 87 sauce 8 0.5259698 sauce
## 88 soggy 8 0.5259698 soggy
## 89 arrive 7 0.4602235 arrive
## 90 beautifully 7 0.4602235 beautiful
## 91 chef 7 0.4602235 chef
## 92 clean 7 0.4602235 clean
## 93 dining 7 0.4602235 din
## 94 finish 7 0.4602235 finish
## 95 flavorful 7 0.4602235 flavorful
## 96 greet 7 0.4602235 greet
## 97 polite 7 0.4602235 polite
## 98 potential 7 0.4602235 potential
## 99 start 7 0.4602235 start
## 100 wonderful 7 0.4602235 wonderful
## 101 casual 6 0.3944773 casual
## 102 choice 6 0.3944773 choice
## 103 cooked 6 0.3944773 cook
## 104 excellent 6 0.3944773 excellent
## 105 lot 6 0.3944773 lot
## 106 outstanding 6 0.3944773 outstanding
## 107 perfection 6 0.3944773 perfection
## 108 quick 6 0.3944773 quick
## 109 quiet 6 0.3944773 quiet
## 110 solid 6 0.3944773 solid
Bigramy
# Count every two-word sequence in the reviews, split each one into its two
# words, and keep only the pairs where the first word is present and neither
# word is a stop word.
bigramy <- reviews %>%
  unnest_tokens(bigram, review, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !is.na(word1),
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )
# Horizontal bar chart of the ten most frequent bigrams.
bigramy %>%
  unite(word, word1, word2, sep = " ") %>%
  slice_max(n, n = 10) %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = word)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_flip() +
  scale_fill_brewer(palette = "Set3") + # colour palette
  labs(x = "Bigram", y = "Liczba wystąpień", title = "Top 10 bigramów") +
  theme_minimal(base_size = 10)

# Sentiment analysis (Analiza sentymentu) ----
## AFINN
# Attach the AFINN lexicon's `value` to every token it contains. The join key
# is given explicitly to avoid the "Joining with `by = ...`" message.
afinn <- get_sentiments("afinn")
sentiment <- tokeny %>%
  inner_join(afinn, by = "word")

# Review-level score: the sum of the values of the review's matched tokens.
sentiment2 <- sentiment %>%
  group_by(id) %>%
  summarise(sentiment = sum(value))

# Attach the score to the reviews; reviews with no matched token get NA.
reviews <- full_join(reviews, sentiment2, by = "id")

# Sentiment score against rating, as a scatter plot.
ggplot(reviews, aes(x = rating, y = sentiment)) +
  geom_point() +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Sentyment")

# Sentiment score distribution per rating. The fill colour duplicates the
# x axis, so the legend is suppressed ("none" is the documented value).
ggplot(
  reviews,
  aes(x = as.factor(rating), y = sentiment, fill = as.factor(rating))
) +
  geom_boxplot() +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Sentyment") +
  theme(legend.position = "none")

## Bing
# The Bing lexicon labels words as positive or negative; encode that as
# +1 / -1 so the labels can be summed per review.
bing <- get_sentiments("bing")
bing$value <- ifelse(bing$sentiment == "positive", 1, -1)

# Attach the +1 / -1 value to every token the lexicon contains. The join key
# is given explicitly to avoid the "Joining with `by = ...`" message.
sentiment <- tokeny %>%
  inner_join(bing, by = "word")

# Review-level score: the sum of the values of the review's matched tokens.
sentiment2 <- sentiment %>%
  group_by(id) %>%
  summarise(sentiment_bing = sum(value))

# Attach the score to the reviews; reviews with no matched token get NA.
reviews <- full_join(reviews, sentiment2, by = "id")

# Bing sentiment score against rating, as a scatter plot.
ggplot(reviews, aes(x = rating, y = sentiment_bing)) +
  geom_point() +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Sentyment")

# Bing sentiment score distribution per rating. The fill colour duplicates
# the x axis, so the legend is suppressed ("none" is the documented value).
ggplot(
  reviews,
  aes(x = as.factor(rating), y = sentiment_bing, fill = as.factor(rating))
) +
  geom_boxplot() +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Sentyment") +
  theme(legend.position = "none")