options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(Amelia)
library(grid)
library(stringr)
library(ggmap)
library(plotly)
library(fmsb)
library(knitr)

1 Data preparation

df<-read.csv('archive.csv',sep=',')
#missingmap
#missmap(df)

There are no missing entries, however there are empty dtrings, which corresponds to the Category = Peace. So for the section where I will retrieve the geolocations, I will remove this entry

#df %>% select(Birth.Country,Category) %>% filter(Birth.Country == "")

1.1 Get geoCodes

comments :

  • the code-chunk below is not compiled
  • I have run this code before-hand because we cannot get the geoCodes within Kaggle kernel
  • the code below loops over Birth.Country, get the real name of the country inside parentheses, and then get the Longtitude and Latitude for each
  • append the result in a dataframe
#get geoCodes from google
getGeo<-function(x,coord){
  for(i in 1:length(x)){
    val<-as.numeric(geocode(as.character(x[i])))[coord]
    return(val)
  }
}
#function to keep name of country (hence remove parentheses)
stripMe<-function(x){
  re <- "\\(([^()]+)\\)"
  x<-as.character(x)
  if(grepl("\\(",x) == TRUE){
    y<-gsub(re, "\\1", str_extract_all(x, re)[[1]])
    return(as.character(y))
  }
  else{return(as.character(x))}
}
names<-df %>% filter(Birth.Country != "") %>% select(Birth.Country)
names$name<-sapply(names$Birth.Country,stripMe)
countryTest<-data.frame("val"=unique(names$name))
countryTest$geoLong<-sapply(countryTest$val,getGeo,1)
countryTest$geoLat<-sapply(countryTest$val,getGeo,2)

1.2 Impute Geocodes by hand

copy-paste the following dataframe

#cat(paste0('"',countryCodes$countries,'"'),sep=",")
nameCountry<-c("Netherlands","France","Poland","Switzerland","Germany","India","Sweden","Norway","Denmark","United Kingdom","Scotland","Spain","Russia","Czech Republic","Slovakia","Italy","United States of America","New Zealand","Ukraine","Luxembourg","Latvia","Belgium","Austria","Australia","Slovenia","Ireland","Canada","Indonesia","Argentina","Hungary","Croatia","Finland","Chile","Portugal","Japan","South Africa","Iceland","China","Algeria","Guadeloupe Island","Brazil","Zimbabwe","Bosnia and Herzegovina","Azerbaijan","Turkey","Egypt","Guatemala","Belarus","Vietnam","Romania","Northern Ireland","Lithuania","Saint Lucia","Greece","Republic of Macedonia","Pakistan","Venezuela","Bulgaria","Colombia","Mexico","Madagascar","Taiwan","Nigeria","South Korea","Costa Rica","People's Republic of China","Myanmar","Israel","East Timor","Trinidad","Ghana","Iran","Kenya","Bangladesh","Cyprus","Peru","Liberia","Yemen","Morocco")

#cat(countryCodes$geoLong,sep=",")
geoLong<-c(5.291266,2.213749,19.14514,8.227512,10.45153,78.96288,18.6435,8.468946,9.501785,-3.435973,-4.202646,-3.74922,105.3188,15.47296,19.69902,12.56738,-95.71289,174.886,31.16558,6.129583,24.60319,4.469936,14.55007,133.7751,14.99546,-7.692054,-106.3468,113.9213,-63.61667,19.5033,15.2,25.74815,-71.54297,-8.224454,138.2529,22.93751,-19.02084,104.1954,1.659626,-61.551,-51.92528,29.15486,17.67908,47.57693,35.24332,30.8025,-90.23076,27.95339,108.2772,24.96676,-6.492314,23.88127,-60.97889,21.82431,21.74527,69.34512,-66.58973,25.48583,-74.29733,-102.5528,46.86911,120.9605,8.675277,127.7669,-83.75343,104.1954,95.95597,34.85161,125.7275,-61.2225,-1.023194,53.68805,37.90619,90.35633,33.42986,-75.01515,-9.429499,48.51639,-7.09262)

#cat(countryCodes$geoLat,sep=",")
geoLat<-c(52.13263,46.22764,51.91944,46.81819,51.16569,20.59368,60.12816,60.47202,56.26392,55.37805,56.49067,40.46367,61.52401,49.81749,48.66903,41.87194,37.09024,-40.90056,48.37943,49.81527,56.87964,50.50389,47.51623,-25.2744,46.15124,53.14237,56.13037,-0.789275,-38.4161,47.16249,45.1,61.92411,-35.67515,39.39987,36.20482,-30.55948,64.96305,35.86166,28.03389,16.265,-14.235,-19.01544,43.91589,40.1431,38.96375,26.82055,15.78347,53.70981,14.05832,45.94316,54.78771,55.16944,13.90944,39.07421,41.60863,30.37532,6.42375,42.73388,4.570868,23.6345,-18.76695,23.69781,9.081999,35.90776,9.748917,35.86166,21.91622,31.04605,-8.874217,10.6918,7.946527,32.42791,-0.023559,23.68499,35.12641,-9.189967,6.428055,15.55273,31.7917)

countryCodes<-data.frame("country"=nameCountry,"Longitude"=geoLong,"Latitude"=geoLat)
DT::datatable(countryCodes)

2 By Country

2.1 Map

byCountry<-as.data.frame(df %>% select(Birth.Country) %>% filter(Birth.Country != "") %>% group_by(Birth.Country) %>% summarise(number=n()) %>% arrange(-number))
byCountry$name<-sapply(byCountry$Birth.Country,stripMe)
byCountry<-as.data.frame(byCountry %>% select(number,name) %>% group_by(name) %>% summarise(sum=sum(number)))
colnames(byCountry)<-c('country','number')
byCountry$country<-as.character(byCountry$country)
#left join
res<-merge(x=byCountry,y=countryCodes,by='country',all.x=TRUE)
res<-na.omit(res)
map<-ggplot() + borders("world",colour="grey75",fill="white")
g1<-map + geom_point(data=res,aes(x=Longitude, y=Latitude,size=number,labels=country),alpha=.5) +
  xlab('Longitude') + ylab('Latitude') +
  theme(legend.text=element_text(size=12),legend.position="top") + 
  labs(size="Number of Nobel Prizes") +
  ggtitle('Map of Nobel Prizes')
#g1
py <- ggplotly(g1,width = 800, height = 600)
py

2.2 Histogram

res$country <- reorder(res$country,-res$number)
res$Percentage <- res$number / sum(res$number) * 100
res$Percent <-paste0(round(res$Percentage,1),"%")
g2<-ggplot(res, aes(x=country, y=number,labels=Percent)) + geom_bar(width = 0.9, stat="identity") + 
  xlab('') + ylab('Number of Nobel Prizes') + theme(axis.text.x = element_text(angle=90, hjust=1)) +
  ggtitle('Distribution of Nobel Prizes per country')
py2 <- ggplotly(g2,width = 800, height = 600)
py2
  • The United States of America are on top of the Countries having Nobel Prizes with ~30% of all prizes since their creation.

3 By Year, Gender and Category

3.1 Categories vs. Year, Gender

tt<-as.data.frame(df %>% select(Year,Sex,Category,Birth.Country) %>% 
                    filter(Birth.Country != "") %>% 
                    group_by(Year,Sex,Category) %>% 
                    summarise(number=n()) %>% 
                    arrange(-number))
g1<-ggplot(data=filter(tt,Sex=='Female'),aes(Year,Category,fill=number)) + 
  geom_tile(aes(fill = number),colour = "white") + 
  scale_fill_gradient(low="#3B9AB2",high="black",guide = guide_legend(title = "Female")) +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank()) +xlim(1900,2015)
g2<-ggplot(data=filter(tt,Sex=='Male'),aes(Year,Category,fill=number)) + 
  geom_tile(aes(fill = number),colour = "white") + 
  scale_fill_gradient(low="#EBCC2A",high="black",guide = guide_legend(title = "Male")) +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())+xlim(1900,2015)

#grid.newpage()
grid.draw(rbind(ggplotGrob(g1), ggplotGrob(g2), size = "last"))

#grid.arrange(g1,g2,ncol=1)
#df %>% select(Year,Sex,Category,Birth.Country) %>% 
#  filter(Birth.Country != "") %>% 
#  group_by(Year,Sex,Category) %>% 
#  summarise(number=n()) %>% 
#  arrange(-number) %>% 
#  ggplot(aes(Year,Category)) + geom_tile(aes(fill = number),colour = "white") +
#  scale_fill_gradient(low="steelblue",high="black") + facet_wrap(~Sex,ncol=2) + 
#  theme(legend.position="top") + xlab('') + ylab('')
  • There is a huge disparity between Men and Women, for all Categories
  • It’s interesting to see the gap the WWII period (When WW II broke out in Europe in 1939, it created a new situation for the Nobel Peace Prize. The Norwegian Nobel Committee awarded no prizes before the war ended in 1945) source

3.2 Histogram by Category, Gender

ttt<-df %>% select(Sex,Category,Birth.Country) %>% filter(Birth.Country != "") %>% group_by(Sex,Category) %>% summarise(number=n()) %>% arrange(-number)
ttt$Percentage <- ttt$number / sum(ttt$number) * 100
ttt$Percent <-paste0(round(ttt$Percentage,1),"%")
ggplot(ttt, aes(x=Category, y=number, fill=Sex)) + geom_bar(width = 0.9, stat="identity",position="dodge") + 
  xlab('') + ylab('Number of Nobel Prizes') + theme(axis.text.x = element_text(angle=90, hjust=1)) + 
  geom_text(aes(label=Percent), position=position_dodge(width=0.9), vjust=-0.25,size=4) +
  scale_fill_manual(values = c("#3B9AB2","#EBCC2A"))

  • The disparity is even more obvious in this histogram
  • In all Categories the women are under-represented

3.3 Number of Nobel Prizes per Year, cumulared over Years

tt<-as.data.frame(df %>% filter(Birth.Country != "") %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "") %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4 Number of Nobel Prizes per Year, cumulated over Years, breakdown by Category

4.1 Category : Chemistry

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Chemistry') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Chemistry') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.2 Category : Physics

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Physics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Physics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.3 Category : Medicine

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Medicine') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Medicine') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.4 Category : Economics

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Economics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Economics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.5 Category : Literature

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Literature') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Literature') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.6 Category : Peace

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Peace') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Peace') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.7 Comments

Now that’s a really scary finding :

  • No Women won a Nobel Prize in Physics since the sixties
  • No Women won Nobel Prizes in Economics
  • the ratio (# of Nobel Prizes Men) / (# of Nobel Prizes Women) looks a bit better in Literature, same for Peace category

5 Comparison Men/Women [radarchart]

cat<-c('Chemistry','Economics','Literature','Medicine','Peace','Physics')
chemistry<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Chemistry') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

economics<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Economics') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

literature<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Literature') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

medicine<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Medicine') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

peace<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Peace') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

physics<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Physics') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

women<-c(chemistry$number[1],economics$number[1],literature$number[1],medicine$number[1],peace$number[1],physics$number[1])
men<-c(chemistry$number[2],economics$number[2],literature$number[2],medicine$number[2],peace$number[2],physics$number[2])

min<-rep(0,6)
max<-c(200,100,120,250,100,250)
summary<-data.frame(rbind(max,min,women,men))
colnames(summary)<-cat

womenPercent<-c(chemistry$percentage[1],economics$percentage[1],literature$percentage[1],medicine$percentage[1],peace$percentage[1],physics$percentage[1])
menPercent<-c(chemistry$percentage[2],economics$percentage[2],literature$percentage[2],medicine$percentage[2],peace$percentage[2],physics$percentage[2])

min<-rep(0,6)
max<-rep(100,6)
summaryPercent<-data.frame(rbind(max,min,womenPercent,menPercent))
colnames(summaryPercent)<-cat
op <- par(mar = c(1, 2, 2, 1), mfrow = c(1, 2))

radarchart( summary , axistype=2 , pcol=c("#3B9AB2", "#EBCC2A") , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=1,vlcex=1,palcex=1.2)
legend(x=-1.3, y=1.3, legend = c('Women','Men'), bty = "n", pch=16 ,col=c("#3B9AB2", "#EBCC2A") , text.col = "black", cex=.8, pt.cex=.5,title='Raw number of \n Nobel Prizes')

radarchart( summaryPercent , axistype=2 , pcol=c("#3B9AB2", "#EBCC2A") , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=1,vlcex=1,palcex=1.2)
legend(x=-1.3, y=1.3, legend = c('Women','Men'), bty = "n", pch=16 ,col=c("#3B9AB2", "#EBCC2A") , text.col = "black", cex=.8, pt.cex=.5,title='Percentage of Nobel Prizes \n within Category')

The 2 radar charts show the same information ; left is raw number of nobel Prizes but right panel is the percentage Women/Men with a Category

kable(summary, format = "markdown")
Chemistry Economics Literature Medicine Peace Physics
max 200 100 120 250 100 250
min 0 0 0 0 0 0
women 4 2 14 12 16 2
men 190 81 99 215 88 220

History :