Knit to HTML -> publish to RPubs -> zip 파일로 압축 (url, html, data)
연구주제: 리뷰 분석을 통한 의류 마케팅 통찰 제공
Write text and code here.
What is (are) your main question(s)? What is your story? What does the final graphic show?
Explain where the data came from, what agency or company made it, how it is structured, what it shows, etc.
Describe and show how you cleaned and reshaped the data
# Load the raw review data; the relative path is resolved against the
# working directory of the knit.
raw_review <- read_csv(file = "women_clothes_review.csv")
## New names:
## Rows: 23486 Columns: 11
## ── Column specification
## ────────────────────────────────────────────────────────
## Delimiter: "," chr (5): Title, Review Text, Division Name, Department Name,
## Class Name dbl (6): ...1, Clothing ID, Age, Rating, Recommended IND, Positive
## Feedback ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Replace spaces in the column names with underscores
# (e.g. "Review Text" -> "Review_Text") so they can be used without backticks.
names(raw_review) <- gsub(" ", "_", names(raw_review))

# Convert the review text to lower case before tokenising.
raw_review$Review_Text <- tolower(raw_review$Review_Text)

# Bing sentiment lexicon; joined below on its `word` column and contributes
# the `sentiment` column.
bing <- get_sentiments("bing")

# One row per sentiment-bearing word of every review, with an age-group
# feature attached.
tidy_review <- raw_review %>%
  # Bucket reviewers by age: under 30, 30-59, and everything else.
  # NOTE(review): "teenage" covers every reviewer under 30, and a missing Age
  # falls through to "elderly"; the labels are kept because later code
  # relies on these exact strings.
  mutate(
    age_group = case_when(
      Age < 30 ~ "teenage",
      Age < 60 ~ "adults",
      TRUE ~ "elderly"
    )
  ) %>%
  # Tokenise: one row per word of each review.
  unnest_tokens(word, Review_Text) %>%
  # Remove stop words; the key is spelled out so dplyr does not have to guess
  # it (and does not print a "Joining with ..." message).
  anti_join(stop_words, by = "word") %>%
  # Keep only words found in the lexicon. The same word occurs on many token
  # rows, and dplyr reported that some token rows match more than one lexicon
  # row, so the many-to-many relationship is declared explicitly. The joined
  # rows are the same as before; only the warning goes away.
  inner_join(bing, by = "word", relationship = "many-to-many")
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., bing): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 22648 of `x` matches multiple rows in `y`.
## ℹ Row 3857 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Print the tokenised, sentiment-tagged table to inspect the result.
tidy_review
## # A tibble: 98,674 × 13
## ...1 Clothing_ID Age Title Rating Recommended_IND Positive_Feedback_Co…¹
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 0 767 33 <NA> 4 1 0
## 2 0 767 33 <NA> 4 1 0
## 3 0 767 33 <NA> 4 1 0
## 4 1 1080 34 <NA> 5 1 4
## 5 1 1080 34 <NA> 5 1 4
## 6 1 1080 34 <NA> 5 1 4
## 7 1 1080 34 <NA> 5 1 4
## 8 2 1077 60 Some m… 3 0 0
## 9 2 1077 60 Some m… 3 0 0
## 10 2 1077 60 Some m… 3 0 0
## # ℹ 98,664 more rows
## # ℹ abbreviated name: ¹Positive_Feedback_Count
## # ℹ 6 more variables: Division_Name <chr>, Department_Name <chr>,
## # Class_Name <chr>, age_group <chr>, word <chr>, sentiment <chr>
# Frequency of every (word, sentiment) pair, most frequent first.
count(tidy_review, word, sentiment, sort = TRUE)
## # A tibble: 1,737 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 love positive 8948
## 2 top positive 7405
## 3 perfect positive 3772
## 4 flattering positive 3517
## 5 soft positive 3343
## 6 comfortable positive 3057
## 7 cute positive 3041
## 8 nice positive 3023
## 9 beautiful positive 2960
## 10 pretty positive 2194
## # ℹ 1,727 more rows
# Bar chart of the ten most frequent words for each sentiment, one panel per
# sentiment with independent axes.
tidy_review %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # Turn `word` into a factor ordered by frequency so bars are sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )
# Word frequencies among tokens tagged as positive, most frequent first.
review_pos <- count(
  filter(tidy_review, sentiment == "positive"),
  word, sentiment,
  sort = TRUE
)
review_pos
## # A tibble: 724 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 love positive 8948
## 2 top positive 7405
## 3 perfect positive 3772
## 4 flattering positive 3517
## 5 soft positive 3343
## 6 comfortable positive 3057
## 7 cute positive 3041
## 8 nice positive 3023
## 9 beautiful positive 2960
## 10 pretty positive 2194
## # ℹ 714 more rows
# Word frequencies among tokens tagged as negative, most frequent first.
review_neg <- count(
  filter(tidy_review, sentiment == "negative"),
  word, sentiment,
  sort = TRUE
)
review_neg
## # A tibble: 1,013 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 fall negative 1200
## 2 loose negative 1181
## 3 worn negative 1137
## 4 bust negative 992
## 5 tank negative 679
## 6 disappointed negative 584
## 7 skinny negative 565
## 8 dark negative 447
## 9 bad negative 392
## 10 issue negative 381
## # ℹ 1,003 more rows
# Word cloud of the most frequent positive words (at most 8 are drawn).
wordcloud(
  words = review_pos$word,
  freq = review_pos$n,
  max.words = 8,
  # brewer.pal() supports a minimum of 3 levels; asking for 1 only raised a
  # warning and returned the 3-level palette anyway, so request it directly.
  colors = brewer.pal(3, "Blues"),
  scale = c(3, 1),
  random.order = FALSE,
  # NOTE(review): `main` is forwarded through `...`; confirm the title is
  # actually rendered, otherwise add it with title() after the call.
  main = "Positive Words"
)
## Warning in brewer.pal(1, "Blues"): minimal value for n is 3, returning requested palette with 3 different levels
# Word cloud of the most frequent negative words (at most 10 are drawn).
wordcloud(
  words = review_neg$word,
  # Frequencies must come from the same table as the words; the positive
  # table was used here before, pairing negative words with unrelated counts.
  freq = review_neg$n,
  max.words = 10,
  colors = brewer.pal(8, "Reds"),
  scale = c(3, 1),
  random.order = FALSE,
  # Title corrected: this cloud shows the negative words.
  # NOTE(review): `main` is forwarded through `...`; confirm the title is
  # actually rendered, otherwise add it with title() after the call.
  main = "Negative Words"
)
Describe and show how you created the first figure. Why did you choose this figure type?

## Analysis and Figure 2
# Co-occurrence network of positive words.
# Pairs are counted with Clothing_ID as the grouping feature, i.e. two words
# co-occur when they appear under the same garment, not necessarily in the
# same review. Only pairs with a count of at least 250 are kept.
graph_pos <- tidy_review %>%
  filter(sentiment == "positive") %>%
  pairwise_count(
    item = word,
    feature = Clothing_ID,
    # TRUE instead of T: `T` is an ordinary variable that can be reassigned.
    sort = TRUE
  ) %>%
  filter(n >= 250) %>%
  # Convert the pair table into a graph object for ggraph.
  as_tbl_graph()
graph_pos
## # A tbl_graph: 11 nodes and 88 edges
## #
## # A directed simple graph with 1 component
## #
## # Node Data: 11 × 1 (active)
## name
## <chr>
## 1 perfect
## 2 love
## 3 soft
## 4 top
## 5 comfortable
## 6 flattering
## 7 cute
## 8 nice
## 9 super
## 10 beautiful
## 11 pretty
## #
## # Edge Data: 88 × 3
## from to n
## <int> <int> <dbl>
## 1 1 2 381
## 2 2 1 381
## 3 3 2 379
## # ℹ 85 more rows
# Draw the positive-word co-occurrence network.
set.seed(99) # fix the RNG seed so the layout is the same on every knit
ggraph(graph_pos, layout = "fr") +
  geom_edge_link(
    color = "gray50", # edge color
    alpha = 0.5       # edge transparency
  ) +
  geom_node_point(
    color = "lightblue", # node color
    size = 5             # node size
  ) +
  geom_node_text(
    aes(label = name), # label each node with its word
    repel = TRUE,      # move labels off the nodes; TRUE, not reassignable `T`
    size = 5           # label font size
  ) +
  theme_graph()
# Co-occurrence network of negative words.
# Pairs are counted with Clothing_ID as the grouping feature, i.e. two words
# co-occur when they appear under the same garment, not necessarily in the
# same review. Only pairs with a count of at least 100 are kept.
graph_neg <- tidy_review %>%
  filter(sentiment == "negative") %>%
  pairwise_count(
    item = word,
    feature = Clothing_ID,
    # TRUE instead of T: `T` is an ordinary variable that can be reassigned.
    sort = TRUE
  ) %>%
  filter(n >= 100) %>%
  # Convert the pair table into a graph object for ggraph.
  as_tbl_graph()
graph_neg
## # A tbl_graph: 15 nodes and 72 edges
## #
## # A directed simple graph with 1 component
## #
## # Node Data: 15 × 1 (active)
## name
## <chr>
## 1 worn
## 2 loose
## 3 fall
## 4 disappointed
## 5 skinny
## 6 bad
## 7 dark
## 8 hard
## 9 sadly
## 10 fell
## 11 issue
## 12 bust
## 13 tank
## 14 cheap
## 15 cold
## #
## # Edge Data: 72 × 3
## from to n
## <int> <int> <dbl>
## 1 1 2 158
## 2 2 1 158
## 3 1 3 152
## # ℹ 69 more rows
# Draw the negative-word co-occurrence network.
set.seed(99) # fix the RNG seed so the layout is the same on every knit
ggraph(graph_neg, layout = "fr") +
  geom_edge_link(
    color = "gray50", # edge color
    alpha = 0.5       # edge transparency
  ) +
  geom_node_point(
    color = "lightcoral", # node color
    size = 5              # node size
  ) +
  geom_node_text(
    aes(label = name), # label each node with its word
    repel = TRUE,      # move labels off the nodes; TRUE, not reassignable `T`
    size = 5           # label font size
  ) +
  theme_graph()
# Split tokens by review rating: a rating of 3.5 or higher counts as a
# positive review, anything below as a negative review.
tidy_pos <- filter(tidy_review, Rating >= 3.5)
tidy_neg <- filter(tidy_review, Rating < 3.5)
# tf-idf per age group for positive reviews: each age group is treated as
# one document, and rows are listed from highest to lowest tf-idf.
frequency_pos <- tidy_pos %>%
  count(age_group, word) %>%
  bind_tf_idf(term = word, document = age_group, n = n) %>%
  arrange(desc(tf_idf))
frequency_pos
## # A tibble: 2,735 × 6
## age_group word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 adults pros 44 0.000759 0.405 0.000308
## 2 adults fussy 16 0.000276 1.10 0.000303
## 3 elderly suitable 7 0.000670 0.405 0.000272
## 4 adults cheerful 14 0.000242 1.10 0.000265
## 5 teenage luck 6 0.000618 0.405 0.000250
## 6 elderly positive 6 0.000574 0.405 0.000233
## 7 elderly roomier 6 0.000574 0.405 0.000233
## 8 teenage godsend 2 0.000206 1.10 0.000226
## 9 teenage impractical 2 0.000206 1.10 0.000226
## 10 teenage mediocre 2 0.000206 1.10 0.000226
## # ℹ 2,725 more rows
# tf-idf per age group for negative reviews: each age group is treated as
# one document, and rows are listed from highest to lowest tf-idf.
frequency_neg <- tidy_neg %>%
  count(age_group, word) %>%
  bind_tf_idf(term = word, document = age_group, n = n) %>%
  arrange(desc(tf_idf))
frequency_neg
## # A tibble: 1,850 × 6
## age_group word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 elderly shock 3 0.00141 1.10 0.00155
## 2 teenage lying 3 0.00109 1.10 0.00120
## 3 elderly elegance 2 0.000940 1.10 0.00103
## 4 elderly trust 2 0.000940 1.10 0.00103
## 5 teenage worried 7 0.00254 0.405 0.00103
## 6 adults boring 14 0.000893 1.10 0.000981
## 7 adults itch 14 0.000893 1.10 0.000981
## 8 adults sweet 14 0.000893 1.10 0.000981
## 9 adults unusual 13 0.000829 1.10 0.000911
## 10 adults negative 11 0.000701 1.10 0.000771
## # ℹ 1,840 more rows
# Visualise tf-idf for positive reviews: ten highest-scoring words per age
# group (exactly ten, ties are cut).
top10 <- frequency_pos %>%
  group_by(age_group) %>%
  slice_max(tf_idf, n = 10, with_ties = FALSE) %>%
  # Drop the grouping once the per-group selection is done so `top10` is a
  # plain tibble.
  ungroup() %>%
  # Fix the panel order instead of the default alphabetical order.
  mutate(
    age_group = factor(age_group, levels = c("teenage", "adults", "elderly"))
  )

# Horizontal bars, one panel per age group; reorder_within() and
# scale_x_reordered() sort the words inside each panel independently.
ggplot(
  top10,
  aes(
    x = reorder_within(word, tf_idf, age_group),
    y = tf_idf,
    fill = age_group
  )
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ age_group, scales = "free", ncol = 2) +
  scale_x_reordered() +
  labs(x = NULL, title = "Top10 TF-IDF score by age group - Positive review")
# Visualise tf-idf for negative reviews: ten highest-scoring words per age
# group (exactly ten, ties are cut).
top10 <- frequency_neg %>%
  group_by(age_group) %>%
  slice_max(tf_idf, n = 10, with_ties = FALSE) %>%
  # Drop the grouping once the per-group selection is done so `top10` is a
  # plain tibble.
  ungroup() %>%
  # Fix the panel order instead of the default alphabetical order.
  mutate(
    age_group = factor(age_group, levels = c("teenage", "adults", "elderly"))
  )

# Horizontal bars, one panel per age group; reorder_within() and
# scale_x_reordered() sort the words inside each panel independently.
ggplot(
  top10,
  aes(
    x = reorder_within(word, tf_idf, age_group),
    y = tf_idf,
    fill = age_group
  )
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ age_group, scales = "free", ncol = 2) +
  scale_x_reordered() +
  labs(x = NULL, title = "Top10 TF-IDF score by age group - Negative review")
In showing the figures that you created, describe why you designed them
the way you did. Why did you choose those colors, fonts, and other
design elements? Do they convey truth?
You can also include images like this: