This is a dataset that I chose from the Harvard Dataverse. It contains a list of awarded Nobel Prizes along with metadata about the winners.
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/HYRJDX
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Location of the Nobel laureate CSV (one file, comma separated).
raw_data_path <- "https://raw.githubusercontent.com/amedina613/Data-607-Project-2/main/complete.csv"

# Download and parse the file, then list the available columns.
raw_data <- read.csv(file = raw_data_path, sep = ",")
print(names(raw_data))
## [1] "awardYear" "category"
## [3] "categoryFullName" "sortOrder"
## [5] "portion" "prizeAmount"
## [7] "prizeAmountAdjusted" "dateAwarded"
## [9] "prizeStatus" "motivation"
## [11] "categoryTopMotivation" "award_link"
## [13] "id" "name"
## [15] "knownName" "givenName"
## [17] "familyName" "fullName"
## [19] "penName" "gender"
## [21] "laureate_link" "birth_date"
## [23] "birth_city" "birth_cityNow"
## [25] "birth_continent" "birth_country"
## [27] "birth_countryNow" "birth_locationString"
## [29] "death_date" "death_city"
## [31] "death_cityNow" "death_continent"
## [33] "death_country" "death_countryNow"
## [35] "death_locationString" "orgName"
## [37] "nativeName" "acronym"
## [39] "org_founded_date" "org_founded_city"
## [41] "org_founded_cityNow" "org_founded_continent"
## [43] "org_founded_country" "org_founded_countryNow"
## [45] "org_founded_locationString" "ind_or_org"
## [47] "residence_1" "residence_2"
## [49] "affiliation_1" "affiliation_2"
## [51] "affiliation_3" "affiliation_4"
# Collapse the numbered residence_* and affiliation_* columns into one
# `residence` and one `affiliation` column, keeping the first value that is
# actually filled in, then drop the numbered originals.
#
# read.csv() with its default `na.strings` leaves blank character fields as ""
# rather than NA, and coalesce() only skips NA. Without converting blanks to NA
# first, a blank residence_1 / affiliation_1 would always win and hide a value
# present in a later column. Rows where every source column is blank end up NA.
blank_to_na <- function(x) {
  # `!is.na(x)` keeps the index free of NA so the assignment is well defined.
  x[!is.na(x) & x == ""] <- NA
  x
}

raw_data <- raw_data %>%
  mutate(
    residence = coalesce(
      blank_to_na(residence_1),
      blank_to_na(residence_2)
    ),
    affiliation = coalesce(
      blank_to_na(affiliation_1),
      blank_to_na(affiliation_2),
      blank_to_na(affiliation_3),
      blank_to_na(affiliation_4)
    )
  ) %>%
  select(-c(
    residence_1, residence_2,
    affiliation_1, affiliation_2, affiliation_3, affiliation_4
  ))
This is quite a wide dataset, and the analyses below are just a few examples of what you can do with it.
# Award-level columns, with rows containing NA removed.
awards <- na.omit(raw_data[c("awardYear", "category", "prizeAmount")])

# Recipient-level columns, with rows containing NA removed.
recipients <- na.omit(raw_data[c("name", "birth_date", "gender")])

# Set aside the rows whose gender is missing or blank so the split can be
# checked, then keep only the rows that have a gender.
recipients_without_gender <- filter(recipients, is.na(gender) | gender == "")
recipients <- filter(recipients, !(is.na(gender) | gender == ""))
I made a data frame of the recipients without a recorded gender to check that the counts add up. They do: 950 - 27 = 923, which is the number of observations in the final filtered data frame.
# Tabulate the gender column and turn the counts into shares of the total.
gender_counts <- table(recipients$gender)
gender_share <- gender_counts / sum(gender_counts)

proportion_women <- gender_share["female"]
proportion_men <- gender_share["male"]

# Two-row data frame used as the plotting input.
gender_data <- data.frame(
  Gender = c("Female", "Male"),
  Proportion = c(proportion_women, proportion_men)
)
# Bar chart of the female and male shares; bar heights are taken directly
# from the Proportion column.
ggplot(
  data = gender_data,
  mapping = aes(x = Gender, y = Proportion, fill = Gender)
) +
  geom_col() +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  labs(
    title = "Proportion of Awards by Gender",
    x = "Gender",
    y = "Proportion"
  ) +
  theme_minimal()
# Total prize money per category, and the categories with the largest and
# smallest totals.
#
# `awards` has one row per laureate, so a prize shared by several laureates
# appears on several rows with the same awardYear, category and prizeAmount.
# Summing every row would count that prize once per laureate and inflate the
# categories that are shared most often, so each (awardYear, category,
# prizeAmount) combination is counted once. Any totals recorded from an
# earlier run of the per-row sum will differ from what this produces.
field_prize_sum <- awards %>%
  distinct(awardYear, category, prizeAmount) %>%
  group_by(category) %>%
  summarise(total_prize_amount = sum(prizeAmount, na.rm = TRUE))

# filter() on equality keeps every category tied for the extreme value.
max_prize_field <- field_prize_sum %>%
  filter(total_prize_amount == max(total_prize_amount))
min_prize_field <- field_prize_sum %>%
  filter(total_prize_amount == min(total_prize_amount))
# Print the category with the largest summed prizeAmount, then the category
# with the smallest. Lines beginning with "##" are console output recorded
# from a previous run, not code; re-run the script to refresh them whenever
# the computation of `field_prize_sum` changes.
print("Field with the largest prize amount:")
## [1] "Field with the largest prize amount:"
print(max_prize_field)
## # A tibble: 1 × 2
## category total_prize_amount
## <chr> <int>
## 1 Physics 725890928
print("Field with the lowest prize amount:")
## [1] "Field with the lowest prize amount:"
print(min_prize_field)
## # A tibble: 1 × 2
## category total_prize_amount
## <chr> <int>
## 1 Literature 289282102
I attempted to make a bar plot showing the number of awards given out each year for each award type, but the plot contained so much information that it was not decipherable. I therefore simplified it by aggregating the awards by decade.
The floor function was used to map each award year to the first year of its decade (for example, 1987 becomes 1980).
# Count the rows of `awards` per decade and category. The decade is the award
# year rounded down to a multiple of ten (e.g. 1987 -> 1980).
awards_by_decade_type <- awards %>%
  mutate(decade = 10 * floor(awardYear / 10)) %>%
  group_by(decade, category) %>%
  summarise(num_awards = n(), .groups = "drop")
# Grouped bar chart: one cluster of bars per decade, one bar per category.
# The x-axis labels are rotated so the decade labels do not overlap.
ggplot(
  data = awards_by_decade_type,
  mapping = aes(x = factor(decade), y = num_awards, fill = category)
) +
  geom_col(position = "dodge") +
  labs(x = "Decade", y = "Number of Awards", fill = "Award Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))