Text mining - projekt

Wstęp

Należy przedstawić zakres projektu - opisać przebieg procesu analizy tekstu

Część pierwsza - zapoznanie z danymi

library(readr)
library(lubridate)
library(dplyr)
library(ggplot2)
library(forcats)
library(tidytext)
library(wordcloud)
library(hunspell)
library(tidyr)

# Load the raw restaurant reviews.
# NOTE(review): the absolute path is specific to the author's machine; adjust
# it (or switch to a project-relative path) before running elsewhere.
reviews <- read_csv("C:/Users/mbuko/OneDrive/Pulpit/Restaurant reviews.csv")
# Keep only the columns used in the analysis. They are selected by name rather
# than by position so that a change in the file's column order cannot silently
# pick the wrong fields.
reviews <- reviews[, c("Restaurant", "Review", "Rating", "Time")]

# Number of reviews recorded for each restaurant.
table(reviews[["Restaurant"]])

## 
##                                 10 Downing Street 
##                                               100 
##                                          13 Dhaba 
##                                               100 
##                    3B's - Buddies, Bar & Barbecue 
##                                               100 
##                         AB's - Absolute Barbecues 
##                                               100 
##                                 Absolute Sizzlers 
##                                               100 
##                                Al Saba Restaurant 
##                                               100 
##                               American Wild Wings 
##                                               100 
##                                              Amul 
##                                               100 
##                                      Arena Eleven 
##                                               100 
##                                      Aromas@11SIX 
##                                               100 
##                                    Asian Meal Box 
##                                               100 
##                                            B-Dubs 
##                                               100 
##               Banana Leaf Multicuisine Restaurant 
##                                               100 
##                                   Barbeque Nation 
##                                               100 
##                                   Behrouz Biryani 
##                                               100 
##                                      Being Hungry 
##                                               100 
##                                   Beyond Flavours 
##                                               100 
##                                 Biryanis And More 
##                                               100 
##                                        Cafe Eclat 
##                                               100 
##           Cascade - Radisson Hyderabad Hitec City 
##                                               100 
##                                  Chinese Pavilion 
##                                               100 
##                                        Club Rogue 
##                                               100 
##              Collage - Hyatt Hyderabad Gachibowli 
##                                               100 
##                                       Cream Stone 
##                                               100 
##                                          Delhi-39 
##                                               100 
##                                     Deli 9 Bistro 
##                                               100 
##                                        Desi Bytes 
##                                               100 
##                                      Dine O China 
##                                               100 
##                                   Diners Pavilion 
##                                               100 
##                                    Domino's Pizza 
##                                               100 
##                                       Driven Cafe 
##                                               100 
##                                    Dunkin' Donuts 
##                                               100 
##                                 Eat India Company 
##                                               100 
##                                           eat.fit 
##                                               100 
##                                            Faasos 
##                                               100 
##                  Feast - Sheraton Hyderabad Hotel 
##                                               100 
##                                          Flechazo 
##                                               100 
##                                       Frio Bistro 
##                                               100 
##                                     Gal Punjab Di 
##                                               100 
##                                              GD's 
##                                               100 
##                         Green Bawarchi Restaurant 
##                                               100 
##                         Hitech Bawarchi Food Zone 
##                                               100 
##                                  Hotel Zara Hi-Fi 
##                                               100 
##                                Hunger Maggi Point 
##                                               100 
##                                   Hyderabad Chefs 
##                                               100 
##                                 Hyderabadi Daawat 
##                                               100 
##                                       Hyper Local 
##                                               100 
## Jonathan's Kitchen - Holiday Inn Express & Suites 
##                                               100 
##                                    Karachi Bakery 
##                                               100 
##                                      Karachi Cafe 
##                                               100 
##                                               KFC 
##                                               100 
##                                        Khaan Saab 
##                                               100 
##           Komatose - Holiday Inn Express & Suites 
##                                               100 
##                               Kritunga Restaurant 
##                                               100 
##                                         KS Bakers 
##                                               100 
##                        La La Land - Bar & Kitchen 
##                                               100 
##                                           Labonel 
##                                               100 
##                              Marsala Food Company 
##                                               100 
##                                     Mathura Vilas 
##                                               100 
##             Mazzo - Marriott Executive Apartments 
##                                               100 
##                               Mohammedia Shawarma 
##                                               100 
##                                     Momos Delight 
##                                               100 
##                            Mustang Terrace Lounge 
##                                               100 
##                               NorFest - The Dhaba 
##                                               100 
##                                      Olive Garden 
##                                               100 
##                        Over The Moon Brew Company 
##                                               100 
##                                       Owm Nom Nom 
##                                               100 
##                                     Pakwaan Grand 
##                                               100 
##                                          Paradise 
##                                               100 
##                                       Pista House 
##                                               100 
##                                        Pot Pourri 
##                                               100 
##                                        PourHouse7 
##                                               100 
##                              Prism Club & Kitchen 
##                                               100 
##                            Royal Spicy Restaurant 
##                                               100 
##                          Sardarji's Chaats & More 
##                                               100 
##                    Shah Ghouse Hotel & Restaurant 
##                                               100 
##                          Shah Ghouse Spl Shawarma 
##                                               100 
##                                   Shanghai Chef 2 
##                                               100 
##             Shree Santosh Dhaba Family Restaurant 
##                                               100 
##                                             SKYHY 
##                                               100 
##                                Squeeze @ The Lime 
##                                               100 
##                                           T Grill 
##                                               100 
##                               Tandoori Food Works 
##                                               100 
##                                          Tempteys 
##                                               100 
##                                The Chocolate Room 
##                                               100 
##                             The Fisherman's Wharf 
##                                               100 
##                        The Foodie Monster Kitchen 
##                                               100 
##                                   The Glass Onion 
##                                               100 
##                                    The Indi Grill 
##                                               100 
##                     The Lal Street - Bar Exchange 
##                                               100 
##                     The Old Madras Baking Company 
##                                               100 
##                             The Tilt Bar Republic 
##                                               100 
##                                        Tiki Shack 
##                                               100 
##                                          Triptify 
##                                               100 
##                                    Udipi's Upahar 
##                                               100 
##                                        Ulavacharu 
##                                               100 
##                        Urban Asia - Kitchen & Bar 
##                                               100 
##             Yum Yum Tree - The Arabian Food Court 
##                                               100 
##                   Zega - Sheraton Hyderabad Hotel 
##                                               100 
##                          Zing's Northeast Kitchen 
##                                               100

# Compact overview of column types and the first few values.
utils::str(reviews)

## tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
##  $ Restaurant: chr [1:10000] "Beyond Flavours" "Beyond Flavours" "Beyond Flavours" "Beyond Flavours" ...
##  $ Review    : chr [1:10000] "The ambience was good, food was quite good . had Saturday lunch , which was cost effective .\nGood place for a "| __truncated__ "Ambience is too good for a pleasant evening. Service is very prompt. Food is good. Over all a good experience. "| __truncated__ "A must try.. great food great ambience. Thnx for the service by Pradeep and Subroto. My personal recommendation"| __truncated__ "Soumen das and Arun was a great guy. Only because of their behavior and sincerety, And good food off course, I "| __truncated__ ...
##  $ Rating    : num [1:10000] 5 5 5 5 5 5 5 4 5 5 ...
##  $ Time      : chr [1:10000] "5/25/2019 15:54" "5/25/2019 14:20" "5/24/2019 22:54" "5/24/2019 22:11" ...

# Count of missing values in every column.
reviews %>%
  is.na() %>%
  colSums()

## Restaurant     Review     Rating       Time 
##          0         45         39         38

# Overall mean rating, ignoring missing ratings.
mean(reviews[["Rating"]], na.rm = TRUE)

## [1] 3.601044

# Standard deviation of the ratings, ignoring missing ratings.
sd(reviews[["Rating"]], na.rm = TRUE)

## [1] 1.483461

# Parse the "month/day/year hour:minute" timestamp; as.Date() keeps the date
# part only.
reviews$Data <- as.Date(reviews$Time, format = "%m/%d/%Y %H:%M")
reviews$Year <- year(reviews$Data)

# Month as an ordered factor with Polish abbreviations. The labels are written
# out explicitly instead of relying on `locale = "pl_PL"`: locale names are
# platform-specific, and a non-UTF-8 Polish locale yields a mis-encoded label
# for October ("pa<9f>" instead of "paź"). "\u017a" is the letter "ź".
reviews$Month <- factor(
  month(reviews$Data),
  levels = 1:12,
  labels = c("sty", "lut", "mar", "kwi", "maj", "cze",
             "lip", "sie", "wrz", "pa\u017a", "lis", "gru"),
  ordered = TRUE
)

# Sequential row identifier; seq_len() is also correct for an empty table.
reviews$Id <- seq_len(nrow(reviews))

# Earliest review date.
min(reviews[["Data"]], na.rm = TRUE)

## [1] "2016-05-31"

# Latest review date.
max(reviews[["Data"]], na.rm = TRUE)

## [1] "2019-05-25"

# Distribution of the raw ratings (half-point values included).
table(reviews[["Rating"]])

## 
##    1  1.5    2  2.5    3  3.5    4  4.5    5 
## 1735    9  684   19 1193   47 2373   69 3832

# Round half-point ratings down to the whole number below, then re-check the
# distribution.
reviews <- reviews %>%
  mutate(Rating = floor(Rating))
table(reviews[["Rating"]])

## 
##    1    2    3    4    5 
## 1744  703 1240 2442 3832

Wnioski dotyczące tej części powinny wskazywać na charakter danych wykorzystanych do analizy

Część druga - analiza graficzna

# Histogram of ratings with the count printed above each bar.
ggplot(reviews, aes(x = Rating)) +
  geom_histogram(binwidth = 1, fill = "cadetblue2") +
  # The label layer must bin exactly like the bars; without `binwidth` it
  # would fall back to stat_bin()'s default of 30 bins.
  geom_text(stat = "bin", binwidth = 1,
            # Leave empty bins unlabelled.
            aes(label = ifelse(after_stat(count) > 0, after_stat(count), "")),
            vjust = -0.5, size = 3) +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Liczba") +
  scale_x_continuous(breaks = seq(1, 5, by = 1))

# Single box plot of all ratings; the x axis carries no information, so its
# text and ticks are hidden.
ggplot(data = reviews, mapping = aes(y = Rating)) +
  geom_boxplot(fill = "cadetblue2") +
  labs(y = "Ocena") +
  theme_minimal(base_size = 8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

# Mean rating per year (rows without a parsed date form their own NA group).
summarise(
  group_by(reviews, Year),
  srednia_ocena = mean(Rating, na.rm = TRUE)
)

## # A tibble: 5 × 2
##    Year srednia_ocena
##   <dbl>         <dbl>
## 1  2016          3.72
## 2  2017          3.62
## 3  2018          3.52
## 4  2019          3.67
## 5    NA        NaN

# Rating distribution per year, using only rows with both a year and a rating.
reviews %>%
  filter(!is.na(Year), !is.na(Rating)) %>%
  ggplot(aes(x = factor(Year), y = Rating, fill = factor(Year))) +
  geom_boxplot() +
  scale_x_discrete(drop = TRUE) +
  labs(x = "Rok", y = "Oceny") +
  theme_minimal(base_size = 8) +
  theme(legend.position = "none")

# Mean rating per calendar month (rows without a parsed date form an NA group).
summarise(
  group_by(reviews, Month),
  srednia_ocena = mean(Rating, na.rm = TRUE)
)

## # A tibble: 13 × 2
##    Month  srednia_ocena
##    <ord>          <dbl>
##  1 sty             3.53
##  2 lut             3.49
##  3 mar             3.46
##  4 kwi             3.63
##  5 maj             3.89
##  6 cze             3.41
##  7 lip             3.71
##  8 sie             3.53
##  9 wrz             3.42
## 10 paź             3.49
## 11 lis             3.46
## 12 gru             3.53
## 13 <NA>          NaN

# Rating distribution per month, using only rows with both a month and a rating.
reviews %>%
  filter(!is.na(Month), !is.na(Rating)) %>%
  ggplot(aes(x = as.factor(Month), y = Rating, fill = as.factor(Month))) +
  geom_boxplot() +
  scale_x_discrete(drop = TRUE) +
  labs(x = "Miesiąc", y = "Oceny") +
  theme_minimal(base_size = 8) +
  theme(legend.position = "none")

# Mean rating per restaurant, best-rated first.
arrange(
  summarise(
    group_by(reviews, Restaurant),
    srednia_ocena = mean(Rating, na.rm = TRUE)
  ),
  desc(srednia_ocena)
)

## # A tibble: 100 × 2
##    Restaurant                       srednia_ocena
##    <chr>                                    <dbl>
##  1 AB's - Absolute Barbecues                 4.88
##  2 B-Dubs                                    4.81
##  3 3B's - Buddies, Bar & Barbecue            4.76
##  4 Paradise                                  4.7 
##  5 Flechazo                                  4.66
##  6 The Indi Grill                            4.6 
##  7 Zega - Sheraton Hyderabad Hotel           4.45
##  8 Over The Moon Brew Company                4.34
##  9 Beyond Flavours                           4.28
## 10 Feast - Sheraton Hyderabad Hotel          4.22
## # ℹ 90 more rows

# Horizontal bar chart of the ten restaurants with the highest mean rating.
reviews %>%
  group_by(Restaurant) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE)) %>%
  arrange(desc(srednia_ocena)) %>%
  head(n = 10) %>%
  # Order the bars by mean rating (one value per restaurant).
  ggplot(aes(x = reorder(Restaurant, srednia_ocena), y = srednia_ocena)) +
  geom_col(fill = "cadetblue3") +
  geom_text(aes(label = round(srednia_ocena, 2)), hjust = -0.2, size = 3) +
  coord_flip() +
  # Extra room at the upper end so the value labels are not cut off.
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = "Restauracja", y = "Średnia ocena") +
  theme_minimal(base_size = 10) +
  theme(plot.title = element_text(hjust = 0.5))

# Horizontal bar chart of the ten restaurants with the lowest mean rating.
reviews %>%
  group_by(Restaurant) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE)) %>%
  arrange(srednia_ocena) %>%
  head(n = 10) %>%
  # Order the bars by mean rating (one value per restaurant).
  ggplot(aes(x = reorder(Restaurant, srednia_ocena), y = srednia_ocena)) +
  geom_col(fill = "tomato3") +
  geom_text(aes(label = round(srednia_ocena, 2)), hjust = -0.2, size = 3) +
  coord_flip() +
  # Extra room at the upper end so the value labels are not cut off.
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(x = "Restauracja", y = "Średnia ocena") +
  theme_minimal(base_size = 10) +
  theme(plot.title = element_text(hjust = 0.5))

# Number of reviews per year.
table(reviews[["Year"]])

## 
## 2016 2017 2018 2019 
##   43  213 4903 4803

# Daily mean rating from 2018-05-01 onwards, drawn as a line with point markers.
ggplot(
  data = reviews %>%
    filter(Data >= as.Date("2018-05-01")) %>%
    group_by(Data) %>%
    summarise(srednia_ocena = mean(Rating, na.rm = TRUE), .groups = "drop"),
  mapping = aes(x = Data, y = srednia_ocena)
) +
  geom_line(color = "cadetblue3", linewidth = 0.9) +
  geom_point(color = "cadetblue4", size = 1.8) +
  scale_x_date(
    date_breaks = "1 month",  # change to e.g. "1 week" for dense data
    date_labels = "%Y-%m"     # x-axis label format: year-month
  ) +
  labs(x = "Data", y = "Średnia ocena", title = "Zmiana średniej oceny w czasie") +
  theme_minimal(base_size = 10)

Wnioski dotyczące tej części powinny wskazywać na zmienność ocen - w zależności od roku, miesiąca. Ponadto należy omówić wyniki dotyczące najlepszych i najgorszych restauracji

Część trzecia - tokenizacja

# Split every review into word tokens stored in the `word` column.
tokeny <- unnest_tokens(reviews, word, Review)


# The 30 most frequent tokens before any filtering.
tokeny %>%
  count(word, sort = TRUE) %>%
  print(n = 30)

## # A tibble: 16,720 × 2
##    word        n
##    <chr>   <int>
##  1 the     23152
##  2 and     17616
##  3 was     10318
##  4 to       9902
##  5 a        9738
##  6 is       9249
##  7 i        7461
##  8 of       7455
##  9 good     7131
## 10 food     6590
## 11 for      6559
## 12 it       5860
## 13 place    5707
## 14 with     5507
## 15 in       5272
## 16 this     4798
## 17 we       3982
## 18 very     3719
## 19 not      3645
## 20 they     3409
## 21 but      3395
## 22 service  3209
## 23 have     3205
## 24 chicken  3093
## 25 you      2949
## 26 are      2800
## 27 were     2664
## 28 on       2663
## 29 had      2620
## 30 that     2492
## # ℹ 16,690 more rows

Usunięcie stop-words

# Keep only words that occur more than five times, then drop every word listed
# in tidytext's `stop_words`.
tokeny <- tokeny %>%
  group_by(word) %>%
  filter(n() > 5) %>%
  ungroup() %>%
  # Name the join key explicitly so the match cannot change if the tables ever
  # share another column name.
  anti_join(stop_words, by = "word")


# The 30 most frequent tokens after filtering.
tokeny %>%
  count(word, sort = TRUE) %>%
  print(n = 30)

## # A tibble: 3,469 × 2
##    word           n
##    <chr>      <int>
##  1 food        6590
##  2 service     3209
##  3 chicken     3093
##  4 5           2343
##  5 taste       2261
##  6 ambience    2094
##  7 time        1603
##  8 nice        1486
##  9 biryani     1338
## 10 staff       1316
## 11 restaurant  1263
## 12 visit       1260
## 13 veg         1242
## 14 experience  1185
## 15 amazing      998
## 16 awesome      864
## 17 quality      860
## 18 starters     856
## 19 served       831
## 20 4            829
## 21 rice         783
## 22 paneer       717
## 23 menu         701
## 24 friends      700
## 25 tasty        700
## 26 bad          686
## 27 quantity     685
## 28 buffet       656
## 29 3            650
## 30 love         650
## # ℹ 3,439 more rows

# Word frequencies (most frequent first) with each word's percentage share of
# all remaining tokens.
word_count <- tokeny %>%
  count(word, sort = TRUE) %>%
  mutate(proc = n / sum(n) * 100)

# Word cloud of up to 80 words that occur at least 10 times; the seed is fixed
# before drawing so the layout is reproducible.
set.seed(1)
with(
  word_count,
  wordcloud(
    words = word,
    freq = n,
    min.freq = 10,
    max.words = 80,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
)

# Ten most frequent words within each rating (ties kept), one facet per rating.
tokeny %>%
  filter(!is.na(Rating)) %>%
  count(Rating, word, sort = TRUE) %>%
  group_by(Rating) %>%
  # slice_max() replaces the superseded top_n(); the ranking column is named
  # explicitly instead of being guessed from the last column.
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # reorder_within() + scale_x_reordered() sort the words separately in each
  # facet.
  ggplot(aes(reorder_within(word, n, Rating), n,
             fill = as.factor(Rating))) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(y = n, label = n),
            hjust = 0.5,
            size = 3) +
  scale_x_reordered() +
  # clip = "off" lets the count labels extend past the panel edge.
  coord_flip(clip = "off") +
  facet_wrap(~Rating, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(fill = "Ocena",
       x = "wyrazy",
       y = "n") +
  theme(
    plot.title = element_text(lineheight = .8, face = "bold"),
    plot.margin = margin(r = 10)
  )

# Sentiment label derived from the (whole-number) rating:
# 4-5 -> "positive", 1-2 -> "negative", 3 -> "neutral".
# NOTE(review): the previous thresholds (`> 3` positive, `< 2` negative) put
# 2-star reviews into "neutral"; the split is now symmetric around 3. Confirm
# this matches the intended definition.
# Rows with a missing rating match no condition and stay NA.
tokeny$Fill <- case_when(
  tokeny$Rating > 3 ~ "positive",
  tokeny$Rating < 3 ~ "negative",
  !is.na(tokeny$Rating) ~ "neutral"
)


# Ten most frequent words within each sentiment label (ties kept), one facet
# per label.
tokeny %>%
  filter(!is.na(Rating)) %>%
  count(Fill, word, sort = TRUE) %>%
  group_by(Fill) %>%
  # slice_max() replaces the superseded top_n(); the ranking column is named
  # explicitly instead of being guessed from the last column.
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # reorder_within() + scale_x_reordered() sort the words separately in each
  # facet.
  ggplot(aes(reorder_within(word, n, Fill), n,
             fill = as.factor(Fill))) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(y = n, label = n),
            hjust = 0.5,
            size = 3) +
  scale_x_reordered() +
  # clip = "off" lets the count labels extend past the panel edge.
  coord_flip(clip = "off") +
  facet_wrap(~Fill, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(fill = "Ocena",
       x = "wyrazy",
       y = "n") +
  theme(
    plot.title = element_text(lineheight = .8, face = "bold"),
    plot.margin = margin(r = 10)
  )