Należy przedstawić zakres projektu - opisać przebieg procesu analizy tekstu
library(readr)
library(lubridate)
library(dplyr)
library(ggplot2)
library(forcats)
library(tidytext)
library(wordcloud)
library(hunspell)
library(tidyr)
# Load the raw review export and keep only the columns used by the analysis.
# NOTE(review): the absolute path is machine-specific; adjust it (or switch
# to a project-relative path) before running the script elsewhere.
reviews_path <- "C:/Users/mbuko/OneDrive/Pulpit/Restaurant reviews.csv"
reviews <- read_csv(reviews_path)
# Select by name rather than by position so that a reordered or extended CSV
# fails loudly instead of silently picking the wrong columns.
reviews <- reviews[, c("Restaurant", "Review", "Rating", "Time")]
# Number of reviews per restaurant.
table(reviews$Restaurant)
##
## 10 Downing Street
## 100
## 13 Dhaba
## 100
## 3B's - Buddies, Bar & Barbecue
## 100
## AB's - Absolute Barbecues
## 100
## Absolute Sizzlers
## 100
## Al Saba Restaurant
## 100
## American Wild Wings
## 100
## Amul
## 100
## Arena Eleven
## 100
## Aromas@11SIX
## 100
## Asian Meal Box
## 100
## B-Dubs
## 100
## Banana Leaf Multicuisine Restaurant
## 100
## Barbeque Nation
## 100
## Behrouz Biryani
## 100
## Being Hungry
## 100
## Beyond Flavours
## 100
## Biryanis And More
## 100
## Cafe Eclat
## 100
## Cascade - Radisson Hyderabad Hitec City
## 100
## Chinese Pavilion
## 100
## Club Rogue
## 100
## Collage - Hyatt Hyderabad Gachibowli
## 100
## Cream Stone
## 100
## Delhi-39
## 100
## Deli 9 Bistro
## 100
## Desi Bytes
## 100
## Dine O China
## 100
## Diners Pavilion
## 100
## Domino's Pizza
## 100
## Driven Cafe
## 100
## Dunkin' Donuts
## 100
## Eat India Company
## 100
## eat.fit
## 100
## Faasos
## 100
## Feast - Sheraton Hyderabad Hotel
## 100
## Flechazo
## 100
## Frio Bistro
## 100
## Gal Punjab Di
## 100
## GD's
## 100
## Green Bawarchi Restaurant
## 100
## Hitech Bawarchi Food Zone
## 100
## Hotel Zara Hi-Fi
## 100
## Hunger Maggi Point
## 100
## Hyderabad Chefs
## 100
## Hyderabadi Daawat
## 100
## Hyper Local
## 100
## Jonathan's Kitchen - Holiday Inn Express & Suites
## 100
## Karachi Bakery
## 100
## Karachi Cafe
## 100
## KFC
## 100
## Khaan Saab
## 100
## Komatose - Holiday Inn Express & Suites
## 100
## Kritunga Restaurant
## 100
## KS Bakers
## 100
## La La Land - Bar & Kitchen
## 100
## Labonel
## 100
## Marsala Food Company
## 100
## Mathura Vilas
## 100
## Mazzo - Marriott Executive Apartments
## 100
## Mohammedia Shawarma
## 100
## Momos Delight
## 100
## Mustang Terrace Lounge
## 100
## NorFest - The Dhaba
## 100
## Olive Garden
## 100
## Over The Moon Brew Company
## 100
## Owm Nom Nom
## 100
## Pakwaan Grand
## 100
## Paradise
## 100
## Pista House
## 100
## Pot Pourri
## 100
## PourHouse7
## 100
## Prism Club & Kitchen
## 100
## Royal Spicy Restaurant
## 100
## Sardarji's Chaats & More
## 100
## Shah Ghouse Hotel & Restaurant
## 100
## Shah Ghouse Spl Shawarma
## 100
## Shanghai Chef 2
## 100
## Shree Santosh Dhaba Family Restaurant
## 100
## SKYHY
## 100
## Squeeze @ The Lime
## 100
## T Grill
## 100
## Tandoori Food Works
## 100
## Tempteys
## 100
## The Chocolate Room
## 100
## The Fisherman's Wharf
## 100
## The Foodie Monster Kitchen
## 100
## The Glass Onion
## 100
## The Indi Grill
## 100
## The Lal Street - Bar Exchange
## 100
## The Old Madras Baking Company
## 100
## The Tilt Bar Republic
## 100
## Tiki Shack
## 100
## Triptify
## 100
## Udipi's Upahar
## 100
## Ulavacharu
## 100
## Urban Asia - Kitchen & Bar
## 100
## Yum Yum Tree - The Arabian Food Court
## 100
## Zega - Sheraton Hyderabad Hotel
## 100
## Zing's Northeast Kitchen
## 100
# Compact overview of the column types and the first values of each column.
reviews %>%
  str()
## tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
## $ Restaurant: chr [1:10000] "Beyond Flavours" "Beyond Flavours" "Beyond Flavours" "Beyond Flavours" ...
## $ Review : chr [1:10000] "The ambience was good, food was quite good . had Saturday lunch , which was cost effective .\nGood place for a "| __truncated__ "Ambience is too good for a pleasant evening. Service is very prompt. Food is good. Over all a good experience. "| __truncated__ "A must try.. great food great ambience. Thnx for the service by Pradeep and Subroto. My personal recommendation"| __truncated__ "Soumen das and Arun was a great guy. Only because of their behavior and sincerety, And good food off course, I "| __truncated__ ...
## $ Rating : num [1:10000] 5 5 5 5 5 5 5 4 5 5 ...
## $ Time : chr [1:10000] "5/25/2019 15:54" "5/25/2019 14:20" "5/24/2019 22:54" "5/24/2019 22:11" ...
# Number of missing values in every column.
reviews %>%
  is.na() %>%
  colSums()
## Restaurant Review Rating Time
## 0 45 39 38
# Mean rating over all reviews; missing ratings are ignored.
mean(reviews[["Rating"]], na.rm = TRUE)
## [1] 3.601044
# Standard deviation of the ratings; missing ratings are ignored.
sd(reviews[["Rating"]], na.rm = TRUE)
## [1] 1.483461
# Derive calendar features from the "month/day/year hour:minute" timestamp.
# Only the date part is kept.
reviews$Data <- as.Date(reviews$Time, format = "%m/%d/%Y %H:%M")
reviews$Year <- year(reviews$Data)
# Abbreviated month labels in Polish.
# NOTE(review): the "pl_PL" locale name is platform-dependent; confirm that
# it resolves (and renders correctly) on the machine running the script.
reviews$Month <- month(reviews$Data, label = TRUE, locale = "pl_PL")
# Sequential row identifier. seq_len() stays correct for an empty table,
# whereas 1:nrow() would produce c(1, 0).
reviews$Id <- seq_len(nrow(reviews))
# Earliest review date; missing dates are ignored.
min(reviews[["Data"]], na.rm = TRUE)
## [1] "2016-05-31"
# Latest review date; missing dates are ignored.
max(reviews[["Data"]], na.rm = TRUE)
## [1] "2019-05-25"
# Frequency of each rating value as recorded in the data.
table(reviews[["Rating"]])
##
## 1 1.5 2 2.5 3 3.5 4 4.5 5
## 1735 9 684 19 1193 47 2373 69 3832
# Round fractional ratings down to the nearest whole number, then re-tabulate.
reviews <- reviews %>%
  mutate(Rating = floor(Rating))
table(reviews[["Rating"]])
##
## 1 2 3 4 5
## 1744 703 1240 2442 3832
Wnioski dotyczące tej części powinny wskazywać na charakter danych wykorzystanych do analizy
# Histogram of the ratings with the count printed above every non-empty bar.
# Both layers bin with binwidth = 1 so the labels are computed from exactly
# the same bins as the bars; without it the text layer falls back to
# stat_bin()'s default of 30 bins.
ggplot(reviews, aes(x = Rating)) +
  geom_histogram(binwidth = 1, fill = "cadetblue2") +
  geom_text(
    stat = "bin",
    binwidth = 1,
    # Empty bins get an empty label instead of a "0".
    aes(label = ifelse(after_stat(count) > 0, after_stat(count), "")),
    vjust = -0.5,
    size = 3
  ) +
  theme_minimal(base_size = 8) +
  labs(x = "Ocena", y = "Liczba") +
  scale_x_continuous(breaks = seq(1, 5, by = 1))
# Single box plot of all ratings. The x axis carries no information here,
# so its text and tick marks are hidden.
ggplot(data = reviews, mapping = aes(y = Rating)) +
  geom_boxplot(fill = "cadetblue2") +
  labs(y = "Ocena") +
  theme_minimal(base_size = 8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
# Mean rating per review year; missing ratings are ignored within each year.
summarise(
  group_by(reviews, Year),
  srednia_ocena = mean(Rating, na.rm = TRUE)
)
## # A tibble: 5 × 2
## Year srednia_ocena
## <dbl> <dbl>
## 1 2016 3.72
## 2 2017 3.62
## 3 2018 3.52
## 4 2019 3.67
## 5 NA NaN
# Rating distribution per year, one box per year. Rows without a year or
# without a rating are dropped first; the fill legend is hidden because it
# would only repeat the x axis.
reviews %>%
  filter(!is.na(Year), !is.na(Rating)) %>%
  ggplot(aes(x = factor(Year), y = Rating, fill = factor(Year))) +
  geom_boxplot() +
  labs(x = "Rok", y = "Oceny") +
  scale_x_discrete(drop = TRUE) +
  theme_minimal(base_size = 8) +
  theme(legend.position = "none")
# Mean rating per calendar month; missing ratings are ignored within each
# month.
summarise(
  group_by(reviews, Month),
  srednia_ocena = mean(Rating, na.rm = TRUE)
)
## # A tibble: 13 × 2
## Month srednia_ocena
## <ord> <dbl>
## 1 sty 3.53
## 2 lut 3.49
## 3 mar 3.46
## 4 kwi 3.63
## 5 maj 3.89
## 6 cze 3.41
## 7 lip 3.71
## 8 sie 3.53
## 9 wrz 3.42
## 10 paź 3.49
## 11 lis 3.46
## 12 gru 3.53
## 13 <NA> NaN
# Rating distribution per calendar month, one box per month. Rows without a
# month or without a rating are dropped first; the fill legend is hidden
# because it would only repeat the x axis.
ggplot(
  subset(reviews, !is.na(Month) & !is.na(Rating)),
  aes(x = as.factor(Month), y = Rating, fill = as.factor(Month))
) +
  geom_boxplot() +
  theme_minimal(base_size = 8) +
  # Axis title restored from its mis-encoded form to the intended Polish text.
  labs(x = "Miesiąc", y = "Oceny") +
  theme(legend.position = "none") +
  scale_x_discrete(drop = TRUE)
# Mean rating per restaurant, best-rated restaurants first.
arrange(
  summarise(
    group_by(reviews, Restaurant),
    srednia_ocena = mean(Rating, na.rm = TRUE)
  ),
  desc(srednia_ocena)
)
## # A tibble: 100 × 2
## Restaurant srednia_ocena
## <chr> <dbl>
## 1 AB's - Absolute Barbecues 4.88
## 2 B-Dubs 4.81
## 3 3B's - Buddies, Bar & Barbecue 4.76
## 4 Paradise 4.7
## 5 Flechazo 4.66
## 6 The Indi Grill 4.6
## 7 Zega - Sheraton Hyderabad Hotel 4.45
## 8 Over The Moon Brew Company 4.34
## 9 Beyond Flavours 4.28
## 10 Feast - Sheraton Hyderabad Hotel 4.22
## # ℹ 90 more rows
# Horizontal bar chart of the ten restaurants with the highest mean rating.
reviews %>%
  group_by(Restaurant) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE)) %>%
  arrange(desc(srednia_ocena)) %>%
  slice_head(n = 10) %>%
  # Order the bars by mean rating rather than alphabetically.
  ggplot(aes(x = fct_reorder(Restaurant, srednia_ocena), y = srednia_ocena)) +
  geom_col(fill = "cadetblue3") +
  # Print the rounded mean just past the end of each bar.
  geom_text(aes(label = round(srednia_ocena, 2)), hjust = -0.2, size = 3) +
  coord_flip() +
  theme_minimal(base_size = 10) +
  # Axis title restored from its mis-encoded form to the intended Polish text.
  labs(x = "Restauracja", y = "Średnia ocena") +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Extra headroom on the value axis so the labels are not cut off.
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))
# Horizontal bar chart of the ten restaurants with the lowest mean rating.
reviews %>%
  group_by(Restaurant) %>%
  summarise(srednia_ocena = mean(Rating, na.rm = TRUE)) %>%
  arrange(srednia_ocena) %>%
  slice_head(n = 10) %>%
  # Order the bars by mean rating rather than alphabetically.
  ggplot(aes(x = fct_reorder(Restaurant, srednia_ocena), y = srednia_ocena)) +
  geom_col(fill = "tomato3") +
  # Print the rounded mean just past the end of each bar.
  geom_text(aes(label = round(srednia_ocena, 2)), hjust = -0.2, size = 3) +
  coord_flip() +
  theme_minimal(base_size = 10) +
  # Axis title restored from its mis-encoded form to the intended Polish text.
  labs(x = "Restauracja", y = "Średnia ocena") +
  theme(plot.title = element_text(hjust = 0.5)) +
  # Extra headroom on the value axis so the labels are not cut off.
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))
# Number of reviews per year.
table(reviews[["Year"]])
##
## 2016 2017 2018 2019
## 43 213 4903 4803
# Daily mean rating from 2018-05-01 onwards, drawn as a line with a point
# marker for every day.
reviews %>%
  filter(Data >= as.Date("2018-05-01")) %>%
  group_by(Data) %>%
  summarise(
    srednia_ocena = mean(Rating, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(mapping = aes(x = Data, y = srednia_ocena)) +
  geom_line(color = "cadetblue3", linewidth = 0.9) +
  geom_point(color = "cadetblue4", size = 1.8) +
  labs(
    title = "Zmiana średniej oceny w czasie",
    x = "Data",
    y = "Średnia ocena"
  ) +
  scale_x_date(
    # One tick per month; switch to e.g. "1 week" for denser data.
    date_breaks = "1 month",
    # Tick labels formatted as year-month.
    date_labels = "%Y-%m"
  ) +
  theme_minimal(base_size = 10)
Wnioski dotyczące tej części powinny wskazywać na zmienność ocen w zależności od roku i miesiąca. Ponadto należy omówić wyniki dotyczące najlepszych i najgorszych restauracji.
# Split the review text into tokens, one token per row in column `word`,
# then list the 30 most frequent tokens.
tokeny <- unnest_tokens(reviews, word, Review)
tokeny %>%
  count(word, sort = TRUE) %>%
  print(n = 30)
## # A tibble: 16,720 × 2
## word n
## <chr> <int>
## 1 the 23152
## 2 and 17616
## 3 was 10318
## 4 to 9902
## 5 a 9738
## 6 is 9249
## 7 i 7461
## 8 of 7455
## 9 good 7131
## 10 food 6590
## 11 for 6559
## 12 it 5860
## 13 place 5707
## 14 with 5507
## 15 in 5272
## 16 this 4798
## 17 we 3982
## 18 very 3719
## 19 not 3645
## 20 they 3409
## 21 but 3395
## 22 service 3209
## 23 have 3205
## 24 chicken 3093
## 25 you 2949
## 26 are 2800
## 27 were 2664
## 28 on 2663
## 29 had 2620
## 30 that 2492
## # ℹ 16,690 more rows
# Keep only tokens that occur more than five times in the corpus, then drop
# the stop words listed in `stop_words`.
tokeny <- tokeny %>%
  group_by(word) %>%
  filter(n() > 5) %>%
  ungroup() %>%
  # Explicit join key: avoids the "Joining with `by = ...`" message and
  # guards against an accidental match on any other shared column name.
  anti_join(stop_words, by = "word")
# The 30 most frequent remaining tokens.
print(count(tokeny, word, sort = TRUE), n = 30)
## # A tibble: 3,469 × 2
## word n
## <chr> <int>
## 1 food 6590
## 2 service 3209
## 3 chicken 3093
## 4 5 2343
## 5 taste 2261
## 6 ambience 2094
## 7 time 1603
## 8 nice 1486
## 9 biryani 1338
## 10 staff 1316
## 11 restaurant 1263
## 12 visit 1260
## 13 veg 1242
## 14 experience 1185
## 15 amazing 998
## 16 awesome 864
## 17 quality 860
## 18 starters 856
## 19 served 831
## 20 4 829
## 21 rice 783
## 22 paneer 717
## 23 menu 701
## 24 friends 700
## 25 tasty 700
## 26 bad 686
## 27 quantity 685
## 28 buffet 656
## 29 3 650
## 30 love 650
## # ℹ 3,439 more rows
# Token frequencies, most frequent first, plus each token's percentage share
# of all remaining tokens.
word_count <- tokeny %>%
  count(word, sort = TRUE) %>%
  mutate(proc = n / sum(n) * 100)
# Fixed seed so the word cloud layout is reproducible between runs.
set.seed(1)
# Word cloud of at most 80 tokens that occur at least 10 times.
wordcloud(
  words = word_count$word,
  freq = word_count$n,
  min.freq = 10,
  max.words = 80,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
# Ten most frequent tokens for every rating value, one facet per rating.
tokeny %>%
  filter(!is.na(Rating)) %>%
  count(Rating, word, sort = TRUE) %>%
  group_by(Rating) %>%
  # slice_max() replaces the superseded top_n(): the ranking column is named
  # explicitly and ties are kept, as top_n() did.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # reorder_within() sorts the tokens separately inside every facet.
  ggplot(aes(reorder_within(word, n, Rating), n,
             fill = as.factor(Rating))) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(y = n, label = n),
            hjust = 0.5,
            size = 3) +
  # Strips the suffix that reorder_within() appends to the axis labels.
  scale_x_reordered() +
  # clip = "off" lets the count labels extend beyond the panel area.
  coord_flip(clip = "off") +
  facet_wrap(~Rating, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(fill = "Ocena",
       x = "wyrazy",
       y = "n") +
  theme(
    plot.title = element_text(lineheight = .8, face = "bold"),
    plot.margin = margin(r = 10)
  )
# Map every rating to a coarse sentiment class: above 3 is "positive",
# below 2 is "negative", everything else is "neutral". Missing ratings stay
# NA.
# NOTE(review): with whole-number ratings this makes 2 "neutral" while only
# 1 is "negative"; confirm the asymmetric thresholds are intended.
tokeny <- tokeny %>%
  mutate(Fill = case_when(
    is.na(Rating) ~ NA_character_,
    Rating > 3 ~ "positive",
    Rating < 2 ~ "negative",
    TRUE ~ "neutral"
  ))
# Ten most frequent tokens for every sentiment class, one facet per class.
tokeny %>%
  filter(!is.na(Rating)) %>%
  count(Fill, word, sort = TRUE) %>%
  group_by(Fill) %>%
  # slice_max() replaces the superseded top_n(): the ranking column is named
  # explicitly and ties are kept, as top_n() did.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # reorder_within() sorts the tokens separately inside every facet.
  ggplot(aes(reorder_within(word, n, Fill), n,
             fill = as.factor(Fill))) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(y = n, label = n),
            hjust = 0.5,
            size = 3) +
  # Strips the suffix that reorder_within() appends to the axis labels.
  scale_x_reordered() +
  # clip = "off" lets the count labels extend beyond the panel area.
  coord_flip(clip = "off") +
  facet_wrap(~Fill, scales = "free") +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 10) +
  labs(fill = "Ocena",
       x = "wyrazy",
       y = "n") +
  theme(
    plot.title = element_text(lineheight = .8, face = "bold"),
    plot.margin = margin(r = 10)
  )