require(data.table) #data read
## Loading required package: data.table
require(readr) ; # data read
## Loading required package: readr
library(tidyverse) # data manipulation and graphs
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 2.2.1 √ purrr 0.2.4
## √ tibble 1.4.2 √ dplyr 0.7.4
## √ tidyr 0.8.0 √ stringr 1.3.0
## √ ggplot2 2.2.1 √ forcats 0.3.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(stringr) # string manipulation
library(lubridate) # date manipulation
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
library('wordcloud') # wordcloud
## Loading required package: RColorBrewer
library(tidytext) # tidy implementation of NLP methods
library(leaflet) # maps
library(igraph) # graphs
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph) # graphs
library(topicmodels) # for LDA topic modelling
library(tm) # general text mining functions, making document term matrixes
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(SnowballC) # for stemming
library(textcat)
library(doBy)
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
## The following object is masked from 'package:data.table':
##
## melt
# Load the raw Yelp tables, one statement per line.
# Business hours and reviews are read with read_csv() and wrapped in
# data.table() so that every table can be keyed and joined the same way below.
User <- fread("yelp_user.csv")
Biz <- fread("yelp_business.csv")
Biz_Att <- fread("yelp_business_attributes.csv")
Biz_Hour <- data.table(read_csv("yelp_business_hours.csv"))
Check_in <- fread("yelp_checkin.csv")
Tip <- fread("yelp_tip.csv")
Review <- data.table(read_csv("yelp_review.csv"))
glimpse(User) # 1,326,100 users, 22 columns

# Derive two per-user features before selecting the columns of interest:
#   Act_Duration - whole years since the account was created
#   elite_Act    - number of years the user was selected as Yelp Elite
# Act_Duration is measured against Sys.Date(), so it changes with the run date.
User$Act_Duration <- round(as.numeric(Sys.Date() - as.Date(User$yelping_since)) / 365)

# Count the four-digit years listed in `elite`. The previous comma-based count
# gave 0 to users with exactly one elite year (no comma in the string).
# NOTE(review): assumes `elite` holds years such as "2015, 2016" and that
# non-elite users carry a value without any four-digit number - confirm.
User$elite_Act <- str_count(User$elite, "[0-9]{4}")

# Columns of interest for the user-level analysis.
User_R <- User %>%
  select(
    user_id, review_count, yelping_since, useful, funny, fans,
    elite, elite_Act, Act_Duration, average_stars
  )

# Force the count/score columns to numeric so the sums below do not overflow
# or fail on integer/character input.
User_R$review_count <- as.numeric(User_R$review_count)
User_R$useful <- as.numeric(User_R$useful)
User_R$funny <- as.numeric(User_R$funny)
User_R$fans <- as.numeric(User_R$fans)
User_R$average_stars <- as.numeric(User_R$average_stars)

quantile(User_R$Act_Duration)
mean(User_R$Act_Duration) # mean activity period was 4.77 years when first run
# Totals of reviews / useful / funny / fans per year of activity.
User_sum <- User_R %>%
  group_by(Act_Duration) %>%
  summarise(
    review_sum = sum(review_count),
    useful_sum = sum(useful),
    funny_sum = sum(funny),
    fans_sum = sum(fans)
  )

# One bar chart per total, by years of activity.
ggplot(data = User_sum) +
  geom_bar(aes(x = Act_Duration, y = review_sum), stat = "identity")
ggplot(data = User_sum) +
  geom_bar(aes(x = Act_Duration, y = useful_sum), stat = "identity")
ggplot(data = User_sum) +
  geom_bar(aes(x = Act_Duration, y = funny_sum), stat = "identity")
ggplot(data = User_sum) +
  geom_bar(aes(x = Act_Duration, y = fans_sum), stat = "identity")

# The same totals, grouped by number of elite years.
User_sum_E <- User_R %>%
  group_by(elite_Act) %>%
  summarise(
    review_sum = sum(review_count),
    useful_sum = sum(useful),
    funny_sum = sum(funny),
    fans_sum = sum(fans)
  )

ggplot(data = User_sum_E) +
  geom_bar(aes(x = elite_Act, y = review_sum), stat = "identity")
ggplot(data = User_sum_E) +
  geom_bar(aes(x = elite_Act, y = useful_sum), stat = "identity")
ggplot(data = User_sum_E) +
  geom_bar(aes(x = elite_Act, y = funny_sum), stat = "identity")
ggplot(data = User_sum_E) +
  geom_bar(aes(x = elite_Act, y = fans_sum), stat = "identity")
glimpse(Biz) # 174,567 businesses in total

# Split the ";"-separated category list of every business and flatten it into
# a one-column data frame of category names.
Biz_Category <- str_split(Biz$categories, ";")
Biz_Category <- as.data.frame(unlist(Biz_Category))
colnames(Biz_Category) <- c("Name") # 668,027 category entries in total

# Twenty most frequent categories.
Biz_Category %>%
  group_by(Name) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count)) %>%
  head(20)
# Restaurants, Food (3), Bars (10), Sandwiches (14), Fast Food (15),
# American (Traditional) (17), Pizza (18), Coffee & Tea (19)

# Businesses whose category string matches `pattern`, most-reviewed first and
# then highest-rated first. `%like%` is a regular-expression match, so e.g.
# "Food" also matches "Fast Food".
# desc() takes a single vector; the previous desc(review_count, stars) call
# passed two, so each sort key now gets its own desc().
select_category <- function(businesses, pattern) {
  businesses %>%
    select(business_id, review_count, stars, categories) %>%
    filter(categories %like% pattern) %>%
    arrange(desc(review_count), desc(stars))
}

Restaurant <- select_category(Biz, "Restaurants")
Food <- select_category(Biz, "Food")
Bars <- select_category(Biz, "Bars")
Sandwich <- select_category(Biz, "Sandwiches")
FastFood <- select_category(Biz, "Fast Food")
American <- select_category(Biz, "American")
Pizza <- select_category(Biz, "Pizza")
Coffee_Tea <- select_category(Biz, "Coffee & Tea")
glimpse(Check_in)

# Reshape check-ins to one row per business and one column per weekday,
# summing the check-in counts; margins = TRUE also adds row/column totals.
Check_in_R <- as.data.table(
  dcast(
    data = data.frame(Check_in),
    business_id ~ weekday,
    value.var = "checkins",
    fun.aggregate = sum,
    margins = TRUE
  )
)

# Rename column 9 to "Total".
# NOTE(review): assumes column 9 is the margin column (business_id + 7 weekdays
# + margin) - confirm against names(Check_in_R).
names(Check_in_R)[9] <- c("Total")

# Busiest businesses first.
Check_in_R <- arrange(Check_in_R, desc(Total))
glimpse(Biz_Hour)

# Frequency of each distinct weekly opening-hours pattern, most common first.
Biz_Hour[
  , .(.N),
  by = c("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
][order(-N)]

# For every column after the first, count the "-" characters in each value.
# NOTE(review): presumably an "open-close" range contains one "-" and a closed
# day contains none, which makes this a 0/1 open flag - confirm against the data.
Biz_Hour_Bi <- data.frame(
  apply(Biz_Hour[, 2:ncol(Biz_Hour)], 2, function(x) str_count(x, "-"))
)

# Suffix the flag columns with their position so they do not clash with the
# original column names when bound back on.
names(Biz_Hour_Bi) <- paste(names(Biz_Hour_Bi), seq_along(Biz_Hour_Bi), sep = "_")
Biz_Hour1 <- cbind(Biz_Hour, Biz_Hour_Bi)

# Columns 9:15 are the seven flag columns appended above.
Biz_Hour1$open_days <- rowSums(Biz_Hour1[, 9:15])
hist(Biz_Hour1$open_days)

# Percentage of businesses by number of open days.
100 * prop.table(table(Biz_Hour1$open_days))
# Build the master table with keyed data.table joins. X[Y] keeps every row of
# Y and attaches the matching rows of X, so each step keeps all rows of the
# table inside the brackets.

# 1. Business info + opening hours + check-ins, joined on business_id.
setkey(Biz, "business_id")
setkey(Biz_Hour1, "business_id")
setkey(Check_in_R, "business_id")
Master <- Biz[Biz_Hour1[Check_in_R]]
names(Master)
# Keep six columns by position.
# NOTE(review): the positions depend on the column order printed by
# names(Master) above - confirm they still point at the intended columns.
Biz_Master <- Master[, c(1:2, 10:11, 28, 36)]

# 2. Attach reviews on business_id.
glimpse(Review)
setkey(Biz_Master, "business_id")
setkey(Review, "business_id")
Master <- Review[Biz_Master]

# 3. Attach user features on user_id.
setkey(Master, "user_id")
setkey(User_R, "user_id")
Master <- Master[User_R]

# 4. Attach each business's category string on business_id.
a <- Biz %>% select(business_id, categories)
setkey(Master, "business_id")
setkey(a, "business_id")
Master <- Master[a]
names(Biz)
names(Master)

# Persist the joined table.
write_csv(Master, "D:/R_File/Master.csv")
# Keep the rows whose category string mentions "Restaurant".
# The previous `"Restaurant" %in% categories` produced a single TRUE/FALSE for
# the whole table (is any category string exactly "Restaurant"?), so it kept
# either every row or none; str_detect() tests each row.
Restaurant <- Master %>%
  filter(str_detect(categories, fixed("Restaurant")))

# Mean AFINN sentiment per business for the first 100,000 restaurant rows.
# Businesses with fewer than five scored words are dropped.
# NOTE(review): Sentiment_2 .. Sentiment_4 are used later but are not defined
# in the visible script - presumably the same pipeline on later row ranges.
Sentiment_1 <- Restaurant[c(1:100000), ] %>%
  filter(textcat(text) == "english") %>% # considering only English text
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(business_id) %>%
  summarize(sentiment = mean(score), words = n()) %>%
  ungroup() %>%
  filter(words >= 5)
print(Sentiment_1)
# Stack the per-chunk sentiment tables.
# NOTE(review): Sentiment_2 .. Sentiment_4 must already exist; they are not
# created in the visible part of this script.
Sentiment_F <- data.table(rbind(Sentiment_1, Sentiment_2, Sentiment_3, Sentiment_4))

# Drop column 6 of the restaurant table before joining.
# NOTE(review): column 6 is presumably the review text - confirm with names().
Restaurant_F <- Restaurant[, -6]
names(Restaurant_F)

# Attach the business-level sentiment to every restaurant row.
setkey(Sentiment_F, "business_id")
setkey(Restaurant_F, "business_id")
Master_F <- Sentiment_F[Restaurant_F]
Master_F1 <- Master_F[complete.cases(Master_F)]

summary(Master_F)

# Keep only rows that have a sentiment score. Master_F2 is inspected after it
# is created (the glimpse() call used to run before the assignment).
Master_F2 <- Master_F[!is.na(Master_F$sentiment)]
glimpse(Master_F2)
names(Master_F2)

# Columns kept for the business-level summary, selected by position.
Master_F3 <- Master_F2 %>% select(1:3, 6, 8:9, 12:15, 18:20, 22)
summary(Master_F3)
# Business-level means of the numeric columns of Master_F3 (columns 1:13, i.e.
# business_id plus the twelve measures); result columns are named "<col>.mean".
# The previous formula used a data-frame subset as its left-hand side, which is
# not a valid summaryBy() response; "." selects the numeric columns instead.
# NOTE(review): the original also piped into arrange(desc(c(fun))), but no
# object or column named `fun` exists, so the intended sort key is unknown and
# the sort has been dropped - add arrange(desc(<column>)) if an order is needed.
Master_F4 <- summaryBy(
  . ~ business_id,
  data = as.data.frame(Master_F3)[, 1:13],
  FUN = mean
)
# interest indicator: words.mean, review_count.mean
# Reload the saved intermediate tables.
Master_F4 <- read.csv("D:/R_File/Customer-segmentation/Master_F4.csv")

# `i.stars` holds half-star values such as 4.5. Left to type guessing it was
# read as integer, and the recorded run reported about 1.5 million parsing
# failures on that column, so it is declared as double here. All other columns
# are still guessed.
# NOTE(review): run problems(Restaurant) after loading to confirm no other
# column is affected.
Restaurant <- read_csv(
  "D:/R_File/Customer-segmentation/Restaurant.csv",
  col_types = cols(.default = col_guess(), i.stars = col_double())
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_integer(),
## review_id = col_character(),
## user_id = col_character(),
## business_id = col_character(),
## date = col_date(format = ""),
## text = col_character(),
## name = col_character(),
## yelping_since = col_date(format = ""),
## elite = col_character(),
## average_stars = col_double(),
## categories = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 1500786 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1489 i.stars no trailing characters .5 'D:/R_File/Customer-segment~ file 2 1490 i.stars no trailing characters .5 'D:/R_File/Customer-segment~ row 3 1491 i.stars no trailing characters .5 'D:/R_File/Customer-segment~ col 4 1492 i.stars no trailing characters .5 'D:/R_File/Customer-segment~ expected 5 1493 i.stars no trailing characters .5 'D:/R_File/Customer-segment~
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
# Grade every value of `x` against three ascending cut-offs:
#   x <= cutoffs[1]               -> "D"
#   cutoffs[1] < x <= cutoffs[2]  -> "C"
#   cutoffs[2] < x <= cutoffs[3]  -> "B"
#   x > cutoffs[3]                -> "A"
# These are the same boundaries as the nested ifelse() chain this replaces.
# findInterval(left.open = TRUE) returns 0..3 for those four ranges and accepts
# tied cut-offs; NA values stay NA.
grade_by_cutoffs <- function(x, cutoffs) {
  c("D", "C", "B", "A")[findInterval(x, cutoffs, left.open = TRUE) + 1L]
}

# Columns are referenced through Master_F4$ instead of attach(), so the grades
# are always computed from the current data frame.

# interest indicator: words.mean, review_count.mean
q1 <- quantile(Master_F4$words.mean, probs = c(0.5, 0.75, 0.9))
q2 <- quantile(Master_F4$review_count.mean, probs = c(0.6, 0.9, 0.98))
Master_F4$words_group <- grade_by_cutoffs(Master_F4$words.mean, q1)
table(Master_F4$words_group)
##
## A B C D
## 632 949 1584 3166
Master_F4$review_group <- grade_by_cutoffs(Master_F4$review_count.mean, q2)
table(Master_F4$review_group)
##
## A B C D
## 127 503 1897 3804
#population indicator: stars.mean, Total.mean
q3 <- quantile(Master_F4$stars.mean, probs = c(0.25, 0.5, 0.75))
Master_F4$star_group <- grade_by_cutoffs(Master_F4$stars.mean, q3)
table(Master_F4$star_group)
##
## A B C D
## 1540 1622 1411 1758
q4 <- quantile(Master_F4$Total.mean, probs = c(0.5, 0.75, 0.9))
Master_F4$checkin_group <- grade_by_cutoffs(Master_F4$Total.mean, q4)
table(Master_F4$checkin_group)
##
## A B C D
## 633 945 1557 3196
#service quality indicator: open_days.mean, sentiment.mean
q5 <- quantile(Master_F4$open_days.mean, probs = c(0.1, 0.25, 0.3))
Master_F4$open_group <- grade_by_cutoffs(Master_F4$open_days.mean, q5)
table(Master_F4$open_group)
##
## A B C D
## 3686 1056 8 1581
q6 <- quantile(Master_F4$sentiment.mean, probs = c(0.2, 0.75, 0.8))
Master_F4$sentiment_group <- grade_by_cutoffs(Master_F4$sentiment.mean, q6)
table(Master_F4$sentiment_group)
##
## A B C D
## 1266 309 3489 1267
# Business segmentation result: column 1 plus the six grade columns (14:19).
Biz_Seg <- Master_F4[, c(1, 14:19)]
write.csv(Biz_Seg, file = "Biz_Seg.csv") # see the attached file for the segmentation result
# Reviews of the single business analysed in the rest of this section.
Text1 <- filter(Restaurant, business_id == "364hhL5st0LV16UcBHRJ3A")

# Word cloud of the 30 most frequent non-stop-words in those reviews.
Text1 %>%
  unnest_tokens(word, text) %>%
  filter(!(word %in% stop_words[["word"]])) %>%
  count(word, sort = TRUE) %>%
  ungroup() %>%
  head(n = 30) %>%
  {
    wordcloud(.$word, .$n, max.words = 30, colors = brewer.pal(8, "Dark2"))
  }
# Horizontal bar chart of the ten most frequent words, excluding stop words
# and the domain words "food" and "restaurant".
Text1 %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word) %>%
  filter(!word %in% c('food', 'restaurant')) %>%
  count(word, sort = TRUE) %>%
  ungroup() %>%
  # Reverse the level order so the most frequent word ends up on top after
  # coord_flip().
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  head(10) %>%
  ggplot(aes(x = word, y = n)) +
  geom_bar(stat = "identity", colour = "white", fill = 'orange') +
  # paste0() has no `sep` argument; the stray sep = "" was pasted on as an
  # extra empty string, so it is removed.
  geom_text(
    aes(x = word, y = 1, label = paste0("(", n, ")")),
    hjust = 0, vjust = .5, size = 4, colour = "black", fontface = "bold"
  ) +
  labs(x = 'Word', y = "Word Count", title = "Word Count") +
  coord_flip() +
  theme_bw()
# Per-word sentiment contribution for this business:
#   occurences   - how many times the word appears in the reviews
#   contribution - the word's AFINN score summed over all its appearances
# The tokens are no longer collapsed with count() before the join. With that
# step every word was a single row, so n() was always 1 and the contribution
# was just the word's score, ignoring how often it was used.
Text1_Sentments <-
  Text1 %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(word) %>%
  summarize(
    occurences = n(),
    contribution = sum(score)
  )

# The 20 words with the largest absolute contribution, coloured by sign.
Text1_Sentments %>%
  top_n(20, abs(contribution)) %>%
  mutate(word = reorder(word, contribution)) %>%
  head(20) %>%
  ggplot(aes(word, contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_bw()
### 11.4. Text Mining: negative review analysis
# Mean AFINN score and scored-word count per user_id, then the ten lowest
# means (ties included), listed from highest to lowest. Scores are aggregated
# per user, so a user with several reviews gets a single averaged score.
Review_sentiment <- Text1 %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(user_id) %>%
  summarize(
    sentiment = mean(score),
    words = n()
  ) %>%
  ungroup() %>%
  arrange(desc(sentiment)) %>%
  filter(min_rank(sentiment) <= 10)

# Bring back the review date and text for those users.
negative_review <- select(
  inner_join(Review_sentiment, Text1, by = "user_id"),
  date, sentiment, text
)
print(negative_review)
## # A tibble: 11 x 3
## date sentiment text
## <date> <dbl> <chr>
## 1 2015-03-30 -1.00 When in Vegas, this is a must try, place was full~
## 2 2015-12-30 -1.00 Sorry but the dishes were not anything new. The c~
## 3 2014-03-27 -1.00 "Based on the prior reviews, I was expecting spec~
## 4 2015-12-24 -1.30 "Ok, so normally I don't write bad reviews but th~
## 5 2015-11-23 -1.33 We went there for our anniversary dinner at 5pm t~
## 6 2014-12-28 -2.00 We were deeply disappointed by the sushi and miso~
## 7 2014-01-22 -2.20 After hearing some stuff about this place. me and~
## 8 2017-05-21 -2.33 It's a small gem hiding among those heavy buffets~
## 9 2014-10-18 -2.67 First yelp review. Terrible service ever. Sushi q~
## 10 2017-04-15 -3.00 Worst freakin place ever Overpriced one little p~
## 11 2014-11-15 -3.00 "Il y a effectivement des restaurants dont on se ~
# Mean AFINN score and scored-word count per user_id, then the ten highest
# means (ties included), listed from highest to lowest.
Review_sentiment_P <- Text1 %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(user_id) %>%
  summarize(
    sentiment = mean(score),
    words = n()
  ) %>%
  ungroup() %>%
  arrange(desc(sentiment)) %>%
  filter(min_rank(desc(sentiment)) <= 10)

# Bring back the review date and text for those users.
positive_review <- select(
  inner_join(Review_sentiment_P, Text1, by = "user_id"),
  date, sentiment, text
)
print(positive_review)
## # A tibble: 10 x 3
## date sentiment text
## <date> <dbl> <chr>
## 1 2014-08-30 4.00 "Simply amazing!!!\r\nMy wife and I tried the tas~
## 2 2015-12-22 4.00 Few weeks back, I visited this restaurant with my~
## 3 2016-02-14 4.00 Second time here, and this time had a completely ~
## 4 2014-08-10 4.00 I found this place last year. Off the strip and ~
## 5 2014-12-26 4.00 Awesome dinner. Chose the prefixe 8 course meal f~
## 6 2015-01-20 4.00 Proof that delicious and artistic Japanese food c~
## 7 2016-02-14 4.00 This place was amazing for our first time, we ord~
## 8 2016-09-15 4.00 This place was wonderful from start to finish. Th~
## 9 2015-10-23 4.00 They are open for lunch now and it's amazing. Wil~
## 10 2016-12-21 3.67 "My experience at Yonaka was outstanding. My girl~