About the Document

On October 10, 2024, the Swedish Academy announced that Ms. Han Kang won the Nobel Prize in Literature, becoming the first writer from Korea to receive the award.

The news made me curious about Nobel Prize winners, at least those in the field of literature, and led me to ask, “Who has won the Nobel Prize in Literature?”

I then broke this down into the more specific questions listed below. Each question is answered in the following parts of the document with data analysis and visualization. All data comes from Wikipedia's “List of Nobel laureates in Literature” page.

  1. What language did the Nobel Prize winners use?
  2. How many women were awarded the Nobel Prize in Literature?
  3. Which literary genres did the Nobel Prize winners work in?
  4. Which words were the most frequently used in citations for recognition?

Step 1: Load the required packages

library(tidyverse)
library(tidytext)
library(tm) # text mining
library(rvest)

# Install wordcloud only when it is not already available, so that re-running
# the document does not download the package again (and still works offline
# once the package is installed).

if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud", repos = "http://cran.us.r-project.org")
}
## Console output recorded from a run that performed the installation:
## package 'wordcloud' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\yohan\AppData\Local\Temp\RtmpWy9yfS\downloaded_packages
library(wordcloud)

Step 2: Load the data

# Scrape the laureate list: download the Wikipedia page, parse every HTML
# table found on it, and keep the first table as the working data set.

url1 <- "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Literature"
web_page1 <- read_html(url1)
tables1 <- html_table(web_page1, fill = TRUE)
npl <- tables1[[1]]

Step 3: Clean the data

# Strip footnote markers such as [1], [2], [3]... from each listed column.
# gsub() returns character vectors, so every listed column ends up as text.

citation_mark <- "\\[\\d+\\]"
marked_columns <- c(
  "Year", "Picture", "Laureate", "Country",
  "Language(s)", "Citation", "Genre(s)"
)

npl[marked_columns] <- lapply(
  npl[marked_columns],
  function(column) gsub(citation_mark, "", column)
)

# Drop the Picture column, which the analysis does not use

npl <- select(npl, -Picture)

# Keep only the rows that name a laureate (i.e. not "Not awarded"), then
# remove parenthesised text from 'Laureate' and trim leading/trailing spaces

npl_filtered <- npl %>%
  filter(Laureate != "Not awarded") %>%
  mutate(Laureate = trimws(gsub("\\s*\\(.*\\)", "", Laureate)))

# Tag each laureate with a gender: names listed in female_laureate get "F",
# every other laureate gets "M"

female_laureate <- c(
  "Selma Lagerlöf", "Sigrid Undset", "Pearl Buck", "Gabriela Mistral",
  "Nelly Sachs", "Nadine Gordimer", "Toni Morrison", "Wisława Szymborska",
  "Elfriede Jelinek", "Doris Lessing", "Herta Müller", "Alice Munro",
  "Svetlana Alexievich", "Olga Tokarczuk", "Louise Glück", "Annie Ernaux",
  "Han Kang"
)

npl_gender <- npl_filtered %>%
  mutate(gender = if_else(Laureate %in% female_laureate, "F", "M"))

Step 4: Analyze and visualize the data

# Bar chart: number of laureates for each value of the Language(s) column

ggplot(npl_gender) +
  geom_bar(aes(x = `Language(s)`), fill = "steelblue") +
  labs(
    title = "Number of Laureates by Language(s)",
    x = "Language",
    y = "Count of Laureates"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Alternative view: the same chart with the languages ordered from the most
# laureates to the fewest

language_counts <- count(npl_gender, `Language(s)`, sort = TRUE)

# y = n is mapped at the top level so the text labels can use it as well
ggplot(language_counts, aes(x = reorder(`Language(s)`, -n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Number of Laureates by Language(s)",
    x = "Language",
    y = "Count of Laureates"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  # print the count only above bars with at least five laureates
  geom_text(aes(label = ifelse(n >= 5, n, "")), vjust = -0.5, color = "black")

# First, we group the data into 10-year periods

npl_gender$Year <- as.numeric(npl_gender$Year)

# Derive the decade boundaries from the data instead of hard-coding the last
# one: with fixed breaks ending at 2030, any later year would silently fall
# outside every interval and get an NA period. The floor of 2020 keeps the
# factor levels identical to the previous fixed 1900-2029 range.
last_decade <- max(
  2020,
  floor(max(npl_gender$Year, na.rm = TRUE) / 10) * 10
)
decade_starts <- seq(1900, last_decade, by = 10)

npl_periods <- npl_gender %>%
  mutate(Period = cut(
    Year,
    # one more break than labels: each interval is [start, start + 10)
    breaks = c(decade_starts, last_decade + 10),
    labels = paste(decade_starts, decade_starts + 9, sep = "-"),
    right = FALSE
  ))

# Summarize the number of males and females by 10-year period.
# count() returns an ungrouped result, so no ungroup() is needed and no
# regrouping message is printed.

gender_counts <- npl_periods %>%
  count(Period, gender, name = "Count")

# Stacked bar chart: male and female laureates per 10-year period, with each
# segment labelled by its count at the segment's vertical midpoint

ggplot(gender_counts, aes(x = Period, y = Count, fill = gender)) +
  geom_col(position = "stack") +
  geom_text(aes(label = Count), position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = c("M" = "steelblue", "F" = "pink")) +
  labs(title = "Number of Male and Female Laureates by 10-year Period") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Give every genre its own row by splitting the Genre(s) column on ", "

npl_genres <- separate_rows(npl_filtered, `Genre(s)`, sep = ", ")

# Tally how often each genre occurs, most frequent first

genre_counts <- count(npl_genres, `Genre(s)`, name = "Count", sort = TRUE)

# Bar chart of the genre distribution, bars in descending order of count and
# each bar labelled with its count

ggplot(genre_counts, aes(x = reorder(`Genre(s)`, -Count), y = Count)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = Count), vjust = -0.5, color = "black") +
  labs(
    title = "Distribution of Genres among Nobel Prize Laureates",
    x = "Genres"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Normalise the Citation text: lower-case it, strip punctuation and digits,
# then drop English stop words plus a few extra words

citation_stopwords <- c(
  stopwords("en"), "and", "the", "awarded", "recognition"
)

clean_citations <- removeWords(
  removeNumbers(removePunctuation(tolower(npl_filtered$Citation))),
  citation_stopwords
)

# Tokenize: split every citation on runs of white space and pool the words

citation_words <- unlist(strsplit(clean_citations, "\\s+"))

# Build a word/frequency data frame sorted from most to least frequent

word_freq <- table(citation_words) %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  setNames(c("word", "freq")) %>%
  arrange(desc(freq))

# Discard the empty-string and "-" tokens

filtered_word_freq <- word_freq %>%
  filter(word != "", word != "-")

# Draw the word cloud: at most 50 words, each occurring at least 3 times,
# plotted in decreasing order of frequency with the "Dark2" palette

cloud_palette <- brewer.pal(8, "Dark2")

wordcloud(
  words = filtered_word_freq$word,
  freq = filtered_word_freq$freq,
  min.freq = 3,
  max.words = 50,
  random.order = FALSE,
  colors = cloud_palette
)