# Step 3: Clean the data ----
# Strip numeric citation markers such as [1], [2], [3]... from every scraped
# column. gsub() always returns a character vector, so each listed column is
# character afterwards.
npl <- npl %>%
  mutate(across(
    all_of(c(
      "Year", "Picture", "Laureate", "Country",
      "Language(s)", "Citation", "Genre(s)"
    )),
    ~ gsub("\\[\\d+\\]", "", .x)
  ))
# Drop the Picture column; the analysis does not use it.
npl <- select(npl, -Picture)
# Keep only the rows that name a laureate (rows marked "Not awarded" are
# dropped), then remove the parenthesised part of each name and trim any
# leading or trailing whitespace that is left behind.
npl_filtered <- npl %>%
  filter(Laureate != "Not awarded") %>%
  mutate(Laureate = trimws(gsub("\\s*\\(.*\\)", "", Laureate)))
# Add gender information to each laureate.
# Women are identified by an explicit list of names; everyone else is
# labelled "M".
female_laureate <- c(
  "Selma Lagerlöf", "Grazia Deledda", "Sigrid Undset", "Pearl Buck",
  "Gabriela Mistral", "Nelly Sachs", "Nadine Gordimer", "Toni Morrison",
  "Wisława Szymborska", "Elfriede Jelinek", "Doris Lessing", "Herta Müller",
  "Alice Munro", "Svetlana Alexievich", "Olga Tokarczuk", "Louise Glück",
  "Annie Ernaux", "Han Kang"
)
# Names are matched exactly, so a spelling that differs from the Laureate
# column would silently fall through to "M". Report such names instead of
# hiding the mismatch.
if (!all(female_laureate %in% npl_filtered$Laureate)) {
  warning(
    "Female laureates not found in npl_filtered$Laureate: ",
    paste(setdiff(female_laureate, npl_filtered$Laureate), collapse = ", "),
    call. = FALSE
  )
}
npl_gender <- npl_filtered %>%
  mutate(gender = case_when(
    Laureate %in% female_laureate ~ "F",
    TRUE ~ "M" # every laureate not listed above is treated as male
  ))
# Step 4: Analyze and visualize the data ----
# Bar chart of the number of laureates per language: geom_bar() counts the
# rows for each distinct `Language(s)` value.
ggplot(data = npl_gender, mapping = aes(x = `Language(s)`)) +
  geom_bar(fill = "steelblue") +
  theme_minimal() +
  # rotate the language labels so they stay readable
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Number of Laureates by Language(s)",
    x = "Language",
    y = "Count of Laureates"
  )

# Alternative view: languages ordered from most to fewest laureates.
# The counts are computed up front so that they can also be used as labels.
language_counts <- count(npl_gender, `Language(s)`, sort = TRUE)
ggplot(language_counts, aes(x = reorder(`Language(s)`, -n), y = n)) +
  geom_col(fill = "steelblue") +
  # only bars with at least five laureates get a numeric label
  geom_text(
    aes(label = ifelse(n >= 5, n, "")),
    vjust = -0.5,
    color = "black"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Number of Laureates by Language(s)",
    x = "Language",
    y = "Count of Laureates"
  )

# Group the laureates into 10-year periods.
# cut() needs numeric input, so Year is converted in place first.
npl_gender$Year <- as.numeric(npl_gender$Year)
# Start year of every decade. The range always covers 1900-2029 and grows
# automatically when the data contain a later year, so no laureate silently
# ends up in an NA period. Breaks and labels are both derived from this one
# sequence and therefore cannot drift out of sync.
decade_starts <- seq(
  1900,
  floor(max(c(2020, npl_gender$Year), na.rm = TRUE) / 10) * 10,
  by = 10
)
npl_periods <- npl_gender %>%
  mutate(Period = cut(
    Year,
    # one more break than labels: the last break closes the final decade
    breaks = c(decade_starts, max(decade_starts) + 10),
    labels = paste(decade_starts, decade_starts + 9, sep = "-"),
    right = FALSE # intervals are [start, start + 10), e.g. 1900-1909
  ))
# Summarize the number of males and females by 10-year period.
# `.groups = "drop"` returns an ungrouped result and avoids the regrouping
# message that summarise() prints for multi-column groupings.
gender_counts <- npl_periods %>%
  group_by(Period, gender) %>%
  summarise(Count = n(), .groups = "drop")
# Stacked bar chart showing the number of males and females by 10-year
# period, with each segment labelled by its count.
ggplot(gender_counts, aes(x = Period, y = Count, fill = gender)) +
  geom_col(position = "stack") +
  geom_text(aes(label = Count), position = position_stack(vjust = 0.5)) +
  labs(title = "Number of Male and Female Laureates by 10-year Period") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_manual(values = c("M" = "steelblue", "F" = "pink"))

# One row per laureate/genre pair: split the comma-separated genre lists.
npl_genres <- separate_rows(npl_filtered, `Genre(s)`, sep = ", ")
# Tally how often each genre occurs, most frequent first.
genre_counts <- npl_genres %>%
  group_by(`Genre(s)`) %>%
  summarise(Count = n(), .groups = "drop") %>%
  arrange(desc(Count))
# Bar chart of the genre distribution, each bar labelled with its count.
ggplot(genre_counts, aes(x = reorder(`Genre(s)`, -Count), y = Count)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = Count), vjust = -0.5, color = "black") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Distribution of Genres among Nobel Prize Laureates",
    x = "Genres"
  )

# Normalise the citation text step by step: lower-case it, then strip
# punctuation, digits, English stop words and a few uninformative words.
clean_citations <- tolower(npl_filtered$Citation)
clean_citations <- removePunctuation(clean_citations)
clean_citations <- removeNumbers(clean_citations)
clean_citations <- removeWords(
  clean_citations,
  c(stopwords("en"), "and", "the", "awarded", "recognition")
)
# Tokenize: split every citation on runs of whitespace and pool the words.
citation_words <- unlist(strsplit(clean_citations, "\\s+"))
# Word frequencies as a two-column data frame, most frequent word first.
word_freq <- as.data.frame(table(citation_words), stringsAsFactors = FALSE)
names(word_freq) <- c("word", "freq")
word_freq <- arrange(word_freq, desc(freq))
# Drop the empty tokens left behind by the clean-up and stray dashes.
filtered_word_freq <- filter(word_freq, !word %in% c("", "-"))
# Draw the word cloud from the filtered frequencies.
wordcloud(
  words = filtered_word_freq$word,
  freq = filtered_word_freq$freq,
  max.words = 50,
  min.freq = 3,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
