Executive summary

What is (are) your main question(s)? What is your story? What does the final graphic show?

Data background

Explain where the data came from, what agency or company made it, how it is structured, what it shows, etc.

Data loading, cleaning and preprocessing

Describe and show how you cleaned and reshaped the data

# Read the women's clothing review CSV into a tibble.
raw_review <- read_csv(file = "women_clothes_review.csv")

## New names:
## Rows: 23486 Columns: 11
## ── Column specification
## ────────────────────────────────────────────────────────
## Delimiter: "," chr (5): Title, Review Text, Division Name, Department Name,
## Class Name dbl (6): ...1, Clothing ID, Age, Rating, Recommended IND, Positive
## Feedback ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Replace every space in the column names with an underscore.
names(raw_review) <- gsub(" ", "_", names(raw_review), fixed = TRUE)
# Convert the review text to lower case.
raw_review[["Review_Text"]] <- tolower(raw_review[["Review_Text"]])

# Bing sentiment lexicon (columns: word, sentiment).
bing <- get_sentiments("bing")

# Build a table with one row per sentiment-bearing word occurrence.
tidy_review <- raw_review %>%
  # Derive an age bucket from Age: under 30, 30 to 59, and everything else.
  # NOTE(review): the under-30 bucket is labelled "teenage" although it also
  # covers reviewers in their twenties, and rows with a missing Age fall
  # through to "elderly" -- confirm both are intended.
  mutate(
    age_group = case_when(
      Age < 30 ~ "teenage",
      Age < 60 ~ "adults",
      TRUE ~ "elderly"
    )
  ) %>%
  # Split each review into one word per row.
  unnest_tokens(word, Review_Text) %>%
  # Drop stop words; the join key is stated instead of being guessed by dplyr.
  anti_join(stop_words, by = "word") %>%
  # Keep only words present in the Bing lexicon. Some lexicon words appear on
  # more than one row and words repeat across reviews, so the many-to-many
  # relationship is declared explicitly rather than left to raise a warning.
  inner_join(bing, by = "word", relationship = "many-to-many")

## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`

## Warning in inner_join(., bing): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 22648 of `x` matches multiple rows in `y`.
## ℹ Row 3857 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Print the tokenized review table to inspect the result.
tidy_review

## # A tibble: 98,674 × 13
##     ...1 Clothing_ID   Age Title   Rating Recommended_IND Positive_Feedback_Co…¹
##    <dbl>       <dbl> <dbl> <chr>    <dbl>           <dbl>                  <dbl>
##  1     0         767    33 <NA>         4               1                      0
##  2     0         767    33 <NA>         4               1                      0
##  3     0         767    33 <NA>         4               1                      0
##  4     1        1080    34 <NA>         5               1                      4
##  5     1        1080    34 <NA>         5               1                      4
##  6     1        1080    34 <NA>         5               1                      4
##  7     1        1080    34 <NA>         5               1                      4
##  8     2        1077    60 Some m…      3               0                      0
##  9     2        1077    60 Some m…      3               0                      0
## 10     2        1077    60 Some m…      3               0                      0
## # ℹ 98,664 more rows
## # ℹ abbreviated name: ¹Positive_Feedback_Count
## # ℹ 6 more variables: Division_Name <chr>, Department_Name <chr>,
## #   Class_Name <chr>, age_group <chr>, word <chr>, sentiment <chr>

Text data analysis

Individual analysis and figures

Analysis and Figure 1

# Tally each (word, sentiment) pair, most frequent first.
count(tidy_review, word, sentiment, sort = TRUE)

## # A tibble: 1,737 × 3
##    word        sentiment     n
##    <chr>       <chr>     <int>
##  1 love        positive   8948
##  2 top         positive   7405
##  3 perfect     positive   3772
##  4 flattering  positive   3517
##  5 soft        positive   3343
##  6 comfortable positive   3057
##  7 cute        positive   3041
##  8 nice        positive   3023
##  9 beautiful   positive   2960
## 10 pretty      positive   2194
## # ℹ 1,727 more rows

# Bar chart of the ten most frequent words within each sentiment
# (ties at the cut-off are kept by slice_max()).
tidy_review %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # Order the factor levels by count so the bars come out sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )

# Frequency table of the positive words, most common first.
review_pos <- count(
  filter(tidy_review, sentiment == "positive"),
  word, sentiment,
  sort = TRUE
)
review_pos

## # A tibble: 724 × 3
##    word        sentiment     n
##    <chr>       <chr>     <int>
##  1 love        positive   8948
##  2 top         positive   7405
##  3 perfect     positive   3772
##  4 flattering  positive   3517
##  5 soft        positive   3343
##  6 comfortable positive   3057
##  7 cute        positive   3041
##  8 nice        positive   3023
##  9 beautiful   positive   2960
## 10 pretty      positive   2194
## # ℹ 714 more rows

# Frequency table of the negative words, most common first.
review_neg <- count(
  filter(tidy_review, sentiment == "negative"),
  word, sentiment,
  sort = TRUE
)
review_neg

## # A tibble: 1,013 × 3
##    word         sentiment     n
##    <chr>        <chr>     <int>
##  1 fall         negative   1200
##  2 loose        negative   1181
##  3 worn         negative   1137
##  4 bust         negative    992
##  5 tank         negative    679
##  6 disappointed negative    584
##  7 skinny       negative    565
##  8 dark         negative    447
##  9 bad          negative    392
## 10 issue        negative    381
## # ℹ 1,003 more rows

# Word cloud of the most frequent positive words (at most 8 are drawn).
# NOTE(review): wordcloud() has no `main` parameter of its own; the value is
# forwarded through `...` and no title appears to be drawn -- confirm, and use
# graphics::title() afterwards if a visible title is wanted.
wordcloud(words = review_pos$word,
          freq = review_pos$n,
          max.words = 8,
          # brewer.pal() requires n >= 3; requesting 1 only raised a warning and
          # returned this same 3-level palette.
          colors = brewer.pal(3, "Blues"),
          scale = c(3, 1),
          random.order = FALSE,
          main = "Positive Words")

## Warning in brewer.pal(1, "Blues"): minimal value for n is 3, returning requested palette with 3 different levels

# Word cloud of the most frequent negative words (at most 10 are drawn).
# Words and frequencies must come from the same table: the frequencies are
# taken from review_neg so each negative word is sized by its own count.
# NOTE(review): wordcloud() has no `main` parameter of its own; the value is
# forwarded through `...` and no title appears to be drawn -- confirm, and use
# graphics::title() afterwards if a visible title is wanted.
wordcloud(words = review_neg$word,
          freq = review_neg$n,
          max.words = 10,
          colors = brewer.pal(8, "Reds"),
          scale = c(3, 1),
          random.order = FALSE,
          main = "Negative Words")

Describe and show how you created the first figure. Why did you choose this figure type?

Analysis and Figure 2

# Co-occurrence network of positive words.
# pairwise_count() counts, for every pair of words, how many Clothing_ID values
# they share; only pairs sharing at least 250 are kept as edges.
# NOTE(review): presumably Clothing_ID identifies a product rather than a
# single review, so this is product-level co-occurrence -- confirm that is
# intended rather than per-review co-occurrence.
graph_pos <- tidy_review %>%
  filter(sentiment == "positive") %>%
  pairwise_count(item = word,
                 feature = Clothing_ID,
                 sort = TRUE) %>%
  filter(n >= 250) %>%
  as_tbl_graph()
graph_pos

## # A tbl_graph: 11 nodes and 88 edges
## #
## # A directed simple graph with 1 component
## #
## # Node Data: 11 × 1 (active)
##    name       
##    <chr>      
##  1 perfect    
##  2 love       
##  3 soft       
##  4 top        
##  5 comfortable
##  6 flattering 
##  7 cute       
##  8 nice       
##  9 super      
## 10 beautiful  
## 11 pretty     
## #
## # Edge Data: 88 × 3
##    from    to     n
##   <int> <int> <dbl>
## 1     1     2   381
## 2     2     1   381
## 3     3     2   379
## # ℹ 85 more rows

# Draw the positive-word co-occurrence network.
set.seed(99)                                # fix the RNG so the layout is reproducible
ggraph(graph_pos, layout = "fr") +          # "fr" layout algorithm

  geom_edge_link(color = "gray50",          # edge color
                 alpha = 0.5) +             # edge transparency

  geom_node_point(color = "lightblue",      # node color
                  size = 5) +               # node size

  geom_node_text(aes(label = name),         # label each node with its word
                 repel = TRUE,              # push labels off the nodes
                 size = 5) +                # label font size

  theme_graph()

# Co-occurrence network of negative words.
# pairwise_count() counts, for every pair of words, how many Clothing_ID values
# they share; only pairs sharing at least 100 are kept as edges.
# NOTE(review): presumably Clothing_ID identifies a product rather than a
# single review, so this is product-level co-occurrence -- confirm that is
# intended rather than per-review co-occurrence.
graph_neg <- tidy_review %>%
  filter(sentiment == "negative") %>%
  pairwise_count(item = word,
                 feature = Clothing_ID,
                 sort = TRUE) %>%
  filter(n >= 100) %>%
  as_tbl_graph()
graph_neg

## # A tbl_graph: 15 nodes and 72 edges
## #
## # A directed simple graph with 1 component
## #
## # Node Data: 15 × 1 (active)
##    name        
##    <chr>       
##  1 worn        
##  2 loose       
##  3 fall        
##  4 disappointed
##  5 skinny      
##  6 bad         
##  7 dark        
##  8 hard        
##  9 sadly       
## 10 fell        
## 11 issue       
## 12 bust        
## 13 tank        
## 14 cheap       
## 15 cold        
## #
## # Edge Data: 72 × 3
##    from    to     n
##   <int> <int> <dbl>
## 1     1     2   158
## 2     2     1   158
## 3     1     3   152
## # ℹ 69 more rows

# Draw the negative-word co-occurrence network.
set.seed(99)                                # fix the RNG so the layout is reproducible
ggraph(graph_neg, layout = "fr") +          # "fr" layout algorithm

  geom_edge_link(color = "gray50",          # edge color
                 alpha = 0.5) +             # edge transparency

  geom_node_point(color = "lightcoral",     # node color
                  size = 5) +               # node size

  geom_node_text(aes(label = name),         # label each node with its word
                 repel = TRUE,              # push labels off the nodes
                 size = 5) +                # label font size

  theme_graph()

Analysis and Figure 3

# Split by rating: 3.5 and above counts as a positive review,
# anything below as a negative review.
tidy_pos <- filter(tidy_review, Rating >= 3.5)

tidy_neg <- filter(tidy_review, Rating < 3.5)

# tf-idf of each word in the positive reviews, treating every age group as one
# document; rows are sorted from highest to lowest tf-idf.
frequency_pos <- tidy_pos %>%
  count(age_group, word) %>%
  bind_tf_idf(word, age_group, n) %>%
  arrange(desc(tf_idf))
frequency_pos

## # A tibble: 2,735 × 6
##    age_group word            n       tf   idf   tf_idf
##    <chr>     <chr>       <int>    <dbl> <dbl>    <dbl>
##  1 adults    pros           44 0.000759 0.405 0.000308
##  2 adults    fussy          16 0.000276 1.10  0.000303
##  3 elderly   suitable        7 0.000670 0.405 0.000272
##  4 adults    cheerful       14 0.000242 1.10  0.000265
##  5 teenage   luck            6 0.000618 0.405 0.000250
##  6 elderly   positive        6 0.000574 0.405 0.000233
##  7 elderly   roomier         6 0.000574 0.405 0.000233
##  8 teenage   godsend         2 0.000206 1.10  0.000226
##  9 teenage   impractical     2 0.000206 1.10  0.000226
## 10 teenage   mediocre        2 0.000206 1.10  0.000226
## # ℹ 2,725 more rows

# tf-idf of each word in the negative reviews, treating every age group as one
# document; rows are sorted from highest to lowest tf-idf.
frequency_neg <- tidy_neg %>%
  count(age_group, word) %>%
  bind_tf_idf(word, age_group, n) %>%
  arrange(desc(tf_idf))
frequency_neg

## # A tibble: 1,850 × 6
##    age_group word         n       tf   idf   tf_idf
##    <chr>     <chr>    <int>    <dbl> <dbl>    <dbl>
##  1 elderly   shock        3 0.00141  1.10  0.00155 
##  2 teenage   lying        3 0.00109  1.10  0.00120 
##  3 elderly   elegance     2 0.000940 1.10  0.00103 
##  4 elderly   trust        2 0.000940 1.10  0.00103 
##  5 teenage   worried      7 0.00254  0.405 0.00103 
##  6 adults    boring      14 0.000893 1.10  0.000981
##  7 adults    itch        14 0.000893 1.10  0.000981
##  8 adults    sweet       14 0.000893 1.10  0.000981
##  9 adults    unusual     13 0.000829 1.10  0.000911
## 10 adults    negative    11 0.000701 1.10  0.000771
## # ℹ 1,840 more rows

# Visualize tf-idf for the positive reviews.
# Take the ten highest-tf-idf words per age group (ties broken arbitrarily so
# each group has exactly ten), then drop the grouping so the result is a plain
# tibble, and fix the facet order via the factor levels.
top10 <- frequency_pos %>%
  group_by(age_group) %>%
  slice_max(tf_idf, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    age_group = factor(age_group,
                       levels = c("teenage", "adults", "elderly"))
  )

# Horizontal bar chart, one facet per age group; reorder_within() together
# with scale_x_reordered() sorts the words independently inside each facet.
ggplot(top10, aes(x = reorder_within(word, tf_idf, age_group),
                  y = tf_idf,
                  fill = age_group)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ age_group, scales = "free", ncol = 2) +
  scale_x_reordered() +
  labs(x = NULL, title = 'Top10 TF-IDF score by age group - Positive review')

# Visualize tf-idf for the negative reviews.
# Take the ten highest-tf-idf words per age group (ties broken arbitrarily so
# each group has exactly ten), then drop the grouping so the result is a plain
# tibble, and fix the facet order via the factor levels.
top10 <- frequency_neg %>%
  group_by(age_group) %>%
  slice_max(tf_idf, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    age_group = factor(age_group,
                       levels = c("teenage", "adults", "elderly"))
  )

# Horizontal bar chart, one facet per age group; reorder_within() together
# with scale_x_reordered() sorts the words independently inside each facet.
ggplot(top10, aes(x = reorder_within(word, tf_idf, age_group),
                  y = tf_idf,
                  fill = age_group)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ age_group, scales = "free", ncol = 2) +
  scale_x_reordered() +
  labs(x = NULL, title = 'Top10 TF-IDF score by age group - Negative review')

In showing the figures that you created, describe why you designed it the way you did. Why did you choose those colors, fonts, and other design elements? Does it convey truth?

You can also include images like this:

Your title here

Your name here