Należy przedstawić zakres projektu - opisać przebieg procesu analizy tekstu
library(readr)
library(lubridate)
library(dplyr)
library(ggplot2)
library(forcats)
library(tidytext)
library(wordcloud)
library(hunspell)
library(tidyr)
# Load the raw restaurant reviews.
# NOTE(review): absolute, machine-specific path - the script only runs where
# this file exists; consider a project-relative path.
reviews <- read_csv("C:/Users/mbuko/OneDrive/Pulpit/Restaurant reviews.csv")
# Keep the four analysed columns by name rather than by position, so the
# selection does not silently change if the CSV column order changes.
reviews <- reviews[, c("Restaurant", "Review", "Rating", "Time")]
# Number of reviews per restaurant.
table(reviews$Restaurant)
##
## 10 Downing Street
## 100
## 13 Dhaba
## 100
## 3B's - Buddies, Bar & Barbecue
## 100
## AB's - Absolute Barbecues
## 100
## Absolute Sizzlers
## 100
## Al Saba Restaurant
## 100
## American Wild Wings
## 100
## Amul
## 100
## Arena Eleven
## 100
## Aromas@11SIX
## 100
## Asian Meal Box
## 100
## B-Dubs
## 100
## Banana Leaf Multicuisine Restaurant
## 100
## Barbeque Nation
## 100
## Behrouz Biryani
## 100
## Being Hungry
## 100
## Beyond Flavours
## 100
## Biryanis And More
## 100
## Cafe Eclat
## 100
## Cascade - Radisson Hyderabad Hitec City
## 100
## Chinese Pavilion
## 100
## Club Rogue
## 100
## Collage - Hyatt Hyderabad Gachibowli
## 100
## Cream Stone
## 100
## Delhi-39
## 100
## Deli 9 Bistro
## 100
## Desi Bytes
## 100
## Dine O China
## 100
## Diners Pavilion
## 100
## Domino's Pizza
## 100
## Driven Cafe
## 100
## Dunkin' Donuts
## 100
## Eat India Company
## 100
## eat.fit
## 100
## Faasos
## 100
## Feast - Sheraton Hyderabad Hotel
## 100
## Flechazo
## 100
## Frio Bistro
## 100
## Gal Punjab Di
## 100
## GD's
## 100
## Green Bawarchi Restaurant
## 100
## Hitech Bawarchi Food Zone
## 100
## Hotel Zara Hi-Fi
## 100
## Hunger Maggi Point
## 100
## Hyderabad Chefs
## 100
## Hyderabadi Daawat
## 100
## Hyper Local
## 100
## Jonathan's Kitchen - Holiday Inn Express & Suites
## 100
## Karachi Bakery
## 100
## Karachi Cafe
## 100
## KFC
## 100
## Khaan Saab
## 100
## Komatose - Holiday Inn Express & Suites
## 100
## Kritunga Restaurant
## 100
## KS Bakers
## 100
## La La Land - Bar & Kitchen
## 100
## Labonel
## 100
## Marsala Food Company
## 100
## Mathura Vilas
## 100
## Mazzo - Marriott Executive Apartments
## 100
## Mohammedia Shawarma
## 100
## Momos Delight
## 100
## Mustang Terrace Lounge
## 100
## NorFest - The Dhaba
## 100
## Olive Garden
## 100
## Over The Moon Brew Company
## 100
## Owm Nom Nom
## 100
## Pakwaan Grand
## 100
## Paradise
## 100
## Pista House
## 100
## Pot Pourri
## 100
## PourHouse7
## 100
## Prism Club & Kitchen
## 100
## Royal Spicy Restaurant
## 100
## Sardarji's Chaats & More
## 100
## Shah Ghouse Hotel & Restaurant
## 100
## Shah Ghouse Spl Shawarma
## 100
## Shanghai Chef 2
## 100
## Shree Santosh Dhaba Family Restaurant
## 100
## SKYHY
## 100
## Squeeze @ The Lime
## 100
## T Grill
## 100
## Tandoori Food Works
## 100
## Tempteys
## 100
## The Chocolate Room
## 100
## The Fisherman's Wharf
## 100
## The Foodie Monster Kitchen
## 100
## The Glass Onion
## 100
## The Indi Grill
## 100
## The Lal Street - Bar Exchange
## 100
## The Old Madras Baking Company
## 100
## The Tilt Bar Republic
## 100
## Tiki Shack
## 100
## Triptify
## 100
## Udipi's Upahar
## 100
## Ulavacharu
## 100
## Urban Asia - Kitchen & Bar
## 100
## Yum Yum Tree - The Arabian Food Court
## 100
## Zega - Sheraton Hyderabad Hotel
## 100
## Zing's Northeast Kitchen
## 100
# Show column types and the first few values of each column.
reviews %>% str()
## tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
## $ Restaurant: chr [1:10000] "Beyond Flavours" "Beyond Flavours" "Beyond Flavours" "Beyond Flavours" ...
## $ Review : chr [1:10000] "The ambience was good, food was quite good . had Saturday lunch , which was cost effective .\nGood place for a "| __truncated__ "Ambience is too good for a pleasant evening. Service is very prompt. Food is good. Over all a good experience. "| __truncated__ "A must try.. great food great ambience. Thnx for the service by Pradeep and Subroto. My personal recommendation"| __truncated__ "Soumen das and Arun was a great guy. Only because of their behavior and sincerety, And good food off course, I "| __truncated__ ...
## $ Rating : num [1:10000] 5 5 5 5 5 5 5 4 5 5 ...
## $ Time : chr [1:10000] "5/25/2019 15:54" "5/25/2019 14:20" "5/24/2019 22:54" "5/24/2019 22:11" ...
# Count missing values in every column.
reviews %>%
  is.na() %>%
  colSums()
## Restaurant Review Rating Time
## 0 45 39 38
# Mean rating, ignoring missing values.
reviews %>%
  pull(Rating) %>%
  mean(na.rm = TRUE)
## [1] 3.601044
# Standard deviation of the rating, ignoring missing values.
reviews %>%
  pull(Rating) %>%
  sd(na.rm = TRUE)
## [1] 1.483461
# Parse the "month/day/year hour:minute" timestamp; as.Date() keeps only the
# calendar date.
reviews$Data <- as.Date(reviews$Time, format = "%m/%d/%Y %H:%M")
reviews$Year <- year(reviews$Data)
# Month as a labelled factor, with labels taken from the Polish locale.
reviews$Month <- month(reviews$Data, label = TRUE, locale = "pl_PL")
# Row identifier, used further down as the join key for per-review sentiment
# scores. seq_len() stays correct for a zero-row table, where 1:nrow() would
# yield c(1, 0).
reviews$Id <- seq_len(nrow(reviews))
# Earliest and latest review date.
min(reviews$Data, na.rm = TRUE)
## [1] "2016-05-31"
max(reviews$Data, na.rm = TRUE)
## [1] "2019-05-25"
# Distribution of the raw ratings, including the half-point values.
table(reviews[["Rating"]])
##
## 1 1.5 2 2.5 3 3.5 4 4.5 5
## 1735 9 684 19 1193 47 2373 69 3832
# Round half-point ratings down to the nearest whole number.
reviews <- reviews %>%
  mutate(Rating = floor(Rating))
# Distribution after rounding.
table(reviews[["Rating"]])
##
## 1 2 3 4 5
## 1744 703 1240 2442 3832
Wnioski dotyczące tej części powinny wskazywać na charakter danych wykorzystanych do analizy
# Histogram of ratings with the count printed above each bar.
ggplot(reviews, aes(x = Rating)) +
  geom_histogram(binwidth = 1, fill = "cadetblue2") +
  # The label layer must use the same binwidth as the bars; otherwise
  # stat_bin falls back to its default of 30 bins and the labels are computed
  # on a different binning than the bars they annotate.
  geom_text(
    stat = "bin",
    binwidth = 1,
    aes(label = ifelse(after_stat(count) > 0, after_stat(count), "")),
    vjust = -0.5, size = 3
  ) +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Liczba") +
  scale_x_continuous(breaks = seq(1, 5, by = 1))
# Single box plot of all ratings; the x axis carries no information, so its
# text and ticks are hidden.
ggplot(reviews, aes(y = Rating)) +
  geom_boxplot(fill = "cadetblue2") +
  labs(y = "Ocena") +
  theme_minimal(base_size = 8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
# Mean rating per year; rows with a missing year form their own NA group.
group_by(reviews, Year) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE))
## # A tibble: 5 × 2
## Year srednia_ocena
## <dbl> <dbl>
## 1 2016 3.72
## 2 2017 3.62
## 3 2018 3.52
## 4 2019 3.67
## 5 NA NaN
# Rating distribution per year, using only rows where both the year and the
# rating are known.
reviews %>%
  filter(!is.na(Year), !is.na(Rating)) %>%
  ggplot(aes(x = as.factor(Year), y = Rating, fill = as.factor(Year))) +
  geom_boxplot() +
  labs(x = "Rok", y = "Oceny") +
  scale_x_discrete(drop = TRUE) +
  theme_minimal(base_size = 8) +
  theme(legend.position = "none")
# Mean rating per calendar month; rows with a missing month form their own
# NA group.
group_by(reviews, Month) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE))
## # A tibble: 13 × 2
## Month srednia_ocena
## <ord> <dbl>
## 1 sty 3.53
## 2 lut 3.49
## 3 mar 3.46
## 4 kwi 3.63
## 5 maj 3.89
## 6 cze 3.41
## 7 lip 3.71
## 8 sie 3.53
## 9 wrz 3.42
## 10 paź 3.49
## 11 lis 3.46
## 12 gru 3.53
## 13 <NA> NaN
# Rating distribution per calendar month, using only rows where both the
# month and the rating are known.
ggplot(
  subset(reviews, !is.na(Month) & !is.na(Rating)),
  aes(x = as.factor(Month), y = Rating, fill = as.factor(Month))
) +
  geom_boxplot() +
  theme_minimal(base_size = 8) +
  # Axis label repaired: it had been saved as mis-decoded UTF-8.
  labs(x = "Miesiąc", y = "Oceny") +
  theme(legend.position = "none") +
  scale_x_discrete(drop = TRUE)
# Mean rating per restaurant, best first.
group_by(reviews, Restaurant) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE)) %>%
  arrange(desc(srednia_ocena))
## # A tibble: 100 × 2
## Restaurant srednia_ocena
## <chr> <dbl>
## 1 AB's - Absolute Barbecues 4.88
## 2 B-Dubs 4.81
## 3 3B's - Buddies, Bar & Barbecue 4.76
## 4 Paradise 4.7
## 5 Flechazo 4.66
## 6 The Indi Grill 4.6
## 7 Zega - Sheraton Hyderabad Hotel 4.45
## 8 Over The Moon Brew Company 4.34
## 9 Beyond Flavours 4.28
## 10 Feast - Sheraton Hyderabad Hotel 4.22
## # ℹ 90 more rows
# Horizontal bar chart of the ten restaurants with the highest mean rating.
reviews %>%
  group_by(Restaurant) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE)) %>%
  arrange(desc(srednia_ocena)) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = fct_reorder(Restaurant, srednia_ocena), y = srednia_ocena)) +
  geom_col(fill = "cadetblue3") +
  geom_text(aes(label = round(srednia_ocena, 2)), hjust = -0.2, size = 3) +
  coord_flip() +
  theme_minimal(base_size = 10) +
  # Axis label repaired: it had been saved as mis-decoded UTF-8.
  labs(x = "Restauracja", y = "Średnia ocena") +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Extra room on the value axis so the labels beside the bars are not clipped.
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))
# Horizontal bar chart of the ten restaurants with the lowest mean rating.
reviews %>%
  group_by(Restaurant) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE)) %>%
  arrange(srednia_ocena) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = fct_reorder(Restaurant, srednia_ocena), y = srednia_ocena)) +
  geom_col(fill = "tomato3") +
  geom_text(aes(label = round(srednia_ocena, 2)), hjust = -0.2, size = 3) +
  coord_flip() +
  theme_minimal(base_size = 10) +
  # Axis label repaired: it had been saved as mis-decoded UTF-8.
  labs(x = "Restauracja", y = "Średnia ocena") +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Extra room on the value axis so the labels beside the bars are not clipped.
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))
# Number of reviews per year.
table(reviews[["Year"]])
##
## 2016 2017 2018 2019
## 43 213 4903 4803
# Daily mean rating from 2018-05-01 onwards, drawn as a line with points.
reviews %>%
  filter(Data >= as.Date("2018-05-01")) %>%
  group_by(Data) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = Data, y = srednia_ocena)) +
  geom_line(color = "cadetblue3", linewidth = 0.9) +
  geom_point(color = "cadetblue4", size = 1.8) +
  labs(x = "Data", y = "Średnia ocena", title = "Zmiana średniej oceny w czasie") +
  scale_x_date(
    # Tick spacing; switch to e.g. "1 week" when the data are dense.
    date_breaks = "1 month",
    # Tick label format: year-month.
    date_labels = "%Y-%m"
  ) +
  theme_minimal(base_size = 10)
Wnioski dotyczące tej części powinny wskazywać na zmienność ocen - w zależności od roku, miesiąca. Ponadto należy omówić wyniki dotyczące najlepszych i najgorszych restauracji
# Split every review into single-word tokens, one row per word.
tokeny <- unnest_tokens(reviews, word, Review)
# Thirty most frequent tokens before any filtering.
tokeny %>%
  count(word, sort = TRUE) %>%
  print(n = 30)
## # A tibble: 16,720 × 2
## word n
## <chr> <int>
## 1 the 23152
## 2 and 17616
## 3 was 10318
## 4 to 9902
## 5 a 9738
## 6 is 9249
## 7 i 7461
## 8 of 7455
## 9 good 7131
## 10 food 6590
## 11 for 6559
## 12 it 5860
## 13 place 5707
## 14 with 5507
## 15 in 5272
## 16 this 4798
## 17 we 3982
## 18 very 3719
## 19 not 3645
## 20 they 3409
## 21 but 3395
## 22 service 3209
## 23 have 3205
## 24 chicken 3093
## 25 you 2949
## 26 are 2800
## 27 were 2664
## 28 on 2663
## 29 had 2620
## 30 that 2492
## # ℹ 16,690 more rows
# Keep only words that occur more than five times in the corpus, then drop
# English stop words.
tokeny <- tokeny %>%
  group_by(word) %>%
  filter(n() > 5) %>%
  ungroup() %>%
  # Explicit key: avoids the natural-join message and guards against an
  # accidental extra shared column changing what is matched.
  anti_join(stop_words, by = "word")
# Thirty most frequent tokens after filtering.
print(count(tokeny, word, sort = TRUE), n = 30)
## # A tibble: 3,469 × 2
## word n
## <chr> <int>
## 1 food 6590
## 2 service 3209
## 3 chicken 3093
## 4 5 2343
## 5 taste 2261
## 6 ambience 2094
## 7 time 1603
## 8 nice 1486
## 9 biryani 1338
## 10 staff 1316
## 11 restaurant 1263
## 12 visit 1260
## 13 veg 1242
## 14 experience 1185
## 15 amazing 998
## 16 awesome 864
## 17 quality 860
## 18 starters 856
## 19 served 831
## 20 4 829
## 21 rice 783
## 22 paneer 717
## 23 menu 701
## 24 friends 700
## 25 tasty 700
## 26 bad 686
## 27 quantity 685
## 28 buffet 656
## 29 3 650
## 30 love 650
## # ℹ 3,439 more rows
# Word frequencies and each word's percentage share of all remaining tokens.
word_count <- tokeny %>%
  count(word, sort = TRUE) %>%
  mutate(proc = n / sum(n) * 100)
# Fixed seed so the word cloud layout is reproducible.
set.seed(1)
wordcloud(
  words = word_count$word,
  freq = word_count$n,
  min.freq = 10,
  max.words = 80,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
# Ten most frequent words for each rating value, one facet per rating.
tokeny %>%
  filter(!is.na(Rating)) %>%
  count(Rating, word, sort = TRUE) %>%
  group_by(Rating) %>%
  # slice_max() replaces the superseded top_n(10), which picked the weighting
  # column implicitly; ties are still kept.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # reorder_within() / scale_x_reordered() order the words inside each facet.
  ggplot(aes(reorder_within(word, n, Rating), n,
    fill = as.factor(Rating)
  )) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(y = n, label = n),
    hjust = 0.5,
    size = 3
  ) +
  scale_x_reordered() +
  coord_flip(clip = "off") +
  facet_wrap(~Rating, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(
    fill = "Ocena",
    x = "wyrazy",
    y = "n"
  ) +
  theme(
    plot.title = element_text(lineheight = .8, face = "bold"),
    plot.margin = margin(r = 10)
  )
# Three-way rating class: above 3 is "positive", below 2 is "negative",
# everything else (2 and 3) is "neutral". A missing rating stays NA.
tokeny$Fill <- ifelse(tokeny$Rating > 3, "positive",
  ifelse(tokeny$Rating < 2, "negative", "neutral")
)
# Ten most frequent words for each rating class, one facet per class.
tokeny %>%
  filter(!is.na(Rating)) %>%
  count(Fill, word, sort = TRUE) %>%
  group_by(Fill) %>%
  # slice_max() replaces the superseded top_n(10), which picked the weighting
  # column implicitly; ties are still kept.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  ggplot(aes(reorder_within(word, n, Fill), n,
    fill = as.factor(Fill)
  )) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(y = n, label = n),
    hjust = 0.5,
    size = 3
  ) +
  scale_x_reordered() +
  coord_flip(clip = "off") +
  facet_wrap(~Fill, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(
    fill = "Ocena",
    x = "wyrazy",
    y = "n"
  ) +
  theme(
    plot.title = element_text(lineheight = .8, face = "bold"),
    plot.margin = margin(r = 10)
  )
# Count every two-word sequence found in the reviews, most frequent first.
bigramy <- unnest_tokens(reviews, bigram, Review, token = "ngrams", n = 2)
bigramy <- count(bigramy, bigram, sort = TRUE)
# Report, element by element, whether a string is written as a number.
#
# Accepted form: an optional leading sign, then either plain digits or digits
# grouped in threes (groups optionally separated by whitespace or a
# non-breaking space), then an optional decimal part introduced by "." or ",".
#
# x: character vector to test.
# Returns a logical vector of the same length as x.
is_number <- function(x) {
  number_pattern <- "^[+-]?(?:(?:\\d{1,3}(?:[\\s\u00A0]?\\d{3})+)|\\d+)(?:[.,]\\d+)?$"
  grepl(number_pattern, x, perl = TRUE)
}
# Split each bigram into its two words and discard pairs in which either word
# is a stop word or a number, as well as rows with no first word.
bigramy <- bigramy %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !is.na(word1),
    !is_number(word1),
    !is_number(word2)
  )
# Horizontal bar chart of the most frequent remaining bigrams.
bigramy %>%
  unite(word, word1, word2, sep = " ") %>%
  slice_max(n, n = 10) %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = word)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_brewer(palette = "Set3") +
  labs(x = "Bigram", y = "Liczba wystąpień", title = "Top 10 bigramów") +
  theme_minimal(base_size = 10)
# Count every three-word sequence found in the reviews, most frequent first.
trigramy <- unnest_tokens(reviews, trigram, Review, token = "ngrams", n = 3)
trigramy <- count(trigramy, trigram, sort = TRUE)
Wnioski dotyczące tej części powinny wskazywać na zmienność tokenów - w zależności od oceny punktowej restauracji. Ponadto należy omówić wyniki dotyczące n-gramów
# AFINN lexicon: one numeric score (`value`) per word.
afinn <- get_sentiments("afinn")
# Tokens that appear in the lexicon, each with its score. The join key is
# explicit so the match cannot silently widen to other shared column names.
sentiment <- tokeny %>%
  inner_join(afinn, by = "word")
# Per-review sentiment is the sum of its token scores. It is derived from
# `sentiment` instead of repeating the same join a second time.
sentiment2 <- sentiment %>%
  group_by(Id) %>%
  summarise(sentiment = sum(value))
# Attach the score to the reviews; reviews with no scored token get NA.
reviews <- full_join(reviews, sentiment2, by = "Id")
# AFINN sentiment score distribution for each rating value.
reviews %>%
  drop_na(Rating) %>%
  ggplot(aes(x = as.factor(Rating), y = sentiment, fill = as.factor(Rating))) +
  geom_boxplot() +
  labs(x = "Ocena", y = "Sentyment") +
  theme_minimal(base_size = 8) +
  theme(legend.position = "none")
# The ten reviews with the highest AFINN score; exactly ten rows, ties at the
# cut-off are not kept.
max_review <- reviews %>%
  drop_na(sentiment) %>%
  slice_max(sentiment, n = 10, with_ties = FALSE) %>%
  select(Id, Review, sentiment)
print(max_review)
## # A tibble: 10 × 3
## Id Review sentiment
## <int> <chr> <dbl>
## 1 9622 "Follow My Instagram @BeingHydFoodie\n\nHyderabadi Daawat, A… 66
## 2 2131 "A peaceful n quite place is something u want after a hectic… 60
## 3 9623 "Hyderabadi Daawat as the name suggests has some authentic H… 56
## 4 9149 "It's nice amazing nice service nice place amazing food\n\nE… 50
## 5 6062 "Follow my Instagram page @journeywithfoodiess\n\nPourHouse7… 47
## 6 9697 "A newbie in the neighborhood calls for a visit and it was a… 46
## 7 5217 "Just the name of Biryani and Hyderabadis get hunger pangs! … 44
## 8 2144 "One of Hyderabad's most stylish fine dining places, Jonatha… 43
## 9 9033 "Arena eleven sports bar\n\nIPl is here and whats better way… 43
## 10 3255 "Recently we were invited for a food tasting session here at… 42
# The ten reviews with the lowest AFINN score; exactly ten rows, ties at the
# cut-off are not kept.
lowest_10 <- reviews %>%
  drop_na(sentiment) %>%
  slice_min(sentiment, n = 10, with_ties = FALSE) %>%
  select(Review, Id, sentiment)
print(lowest_10)
## # A tibble: 10 × 3
## Review Id sentiment
## <chr> <int> <dbl>
## 1 "Worst and worst biryani everrrrrr ..... Don't order and you… 8442 -24
## 2 "Was visiting my in laws for the first time. To celebrate th… 9292 -22
## 3 "This place is nothing as it is shown in the images. The amb… 7036 -20
## 4 "My friend and I visited this place on the 13th Of january a… 1858 -19
## 5 "Man! Who the hell are giving such high ratings to such usel… 4214 -19
## 6 "This is the worst place in Zomato gold I ever visited,waste… 7832 -19
## 7 "This place is worst.. biryani worst than ever... no quality… 1948 -18
## 8 "Pathetic restuarant. I had to struggle even for getting a t… 811 -16
## 9 "One of the worst biriyanis i ate . Home deliverd thru zomat… 2806 -16
## 10 "I am thinking where to start, there were so many bad experi… 3365 -16
# Bing lexicon: each word is labelled "positive" or "negative"; recode the
# label to +1 / -1 so scores can be summed.
bing <- get_sentiments("bing")
bing$value <- ifelse(bing$sentiment == "positive", 1, -1)
# Tokens that appear in the lexicon. The join key is explicit so the match
# cannot silently widen to other shared column names.
sentiment <- tokeny %>%
  inner_join(bing, by = "word")
# Per-review Bing score is the sum of its token values. It is derived from
# `sentiment` instead of repeating the same join a second time.
sentiment2 <- sentiment %>%
  group_by(Id) %>%
  summarise(sentiment_bing = sum(value))
# Attach the score to the reviews; reviews with no scored token get NA.
reviews <- full_join(reviews, sentiment2, by = "Id")
# Bing sentiment score distribution for each rating value.
reviews %>%
  filter(!is.na(Rating)) %>%
  ggplot(aes(x = as.factor(Rating), y = sentiment_bing, fill = as.factor(Rating))) +
  geom_boxplot() +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Sentyment") +
  # "none" hides the legend; the previous value "non" is not a valid position.
  theme(legend.position = "none")
# AFINN score against Bing score per review, as a scatter plot.
reviews %>%
  ggplot(aes(x = sentiment, y = sentiment_bing)) +
  geom_point() +
  theme_minimal(base_size = 8)
# The same comparison as a 2-D bin count, which shows density where points
# overlap.
reviews %>%
  ggplot(aes(x = sentiment, y = sentiment_bing)) +
  geom_bin2d() +
  theme_minimal(base_size = 8)