1 Data preparation
- 1.1 Get geoCodes
- 1.2 Impute Geocodes by hand
2 By Country
- 2.1 Map
- 2.2 Histogram
3 By Year, Gender and Category
4 Number of Nobel Prizes per Year, cumulated over Years, breakdown by Category
5 Comparison Men/Women [radarchart]

options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)

#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(Amelia)
library(grid)
library(stringr)
library(ggmap)
library(plotly)
library(fmsb)
library(knitr)

1 Data preparation

df<-read.csv('archive.csv',sep=',')
#missingmap
#missmap(df)

There are no missing entries, however there are empty dtrings, which corresponds to the Category = Peace. So for the section where I will retrieve the geolocations, I will remove this entry

#df %>% select(Birth.Country,Category) %>% filter(Birth.Country == "")

1.1 Get `geoCodes`

comments :

the code-chunk below is not compiled
I have run this code before-hand because we cannot get the geoCodes within Kaggle kernel
the code below loops over Birth.Country, get the real name of the country inside parentheses, and then get the Longtitude and Latitude for each
append the result in a dataframe

#get geoCodes from google
getGeo<-function(x,coord){
  for(i in 1:length(x)){
    val<-as.numeric(geocode(as.character(x[i])))[coord]
    return(val)
  }
}
#function to keep name of country (hence remove parentheses)
stripMe<-function(x){
  re <- "\\(([^()]+)\\)"
  x<-as.character(x)
  if(grepl("\\(",x) == TRUE){
    y<-gsub(re, "\\1", str_extract_all(x, re)[[1]])
    return(as.character(y))
  }
  else{return(as.character(x))}
}

names<-df %>% filter(Birth.Country != "") %>% select(Birth.Country)
names$name<-sapply(names$Birth.Country,stripMe)
countryTest<-data.frame("val"=unique(names$name))
countryTest$geoLong<-sapply(countryTest$val,getGeo,1)
countryTest$geoLat<-sapply(countryTest$val,getGeo,2)

1.2 Impute `Geocodes` by hand

copy-paste the following dataframe

#cat(paste0('"',countryCodes$countries,'"'),sep=",")
nameCountry<-c("Netherlands","France","Poland","Switzerland","Germany","India","Sweden","Norway","Denmark","United Kingdom","Scotland","Spain","Russia","Czech Republic","Slovakia","Italy","United States of America","New Zealand","Ukraine","Luxembourg","Latvia","Belgium","Austria","Australia","Slovenia","Ireland","Canada","Indonesia","Argentina","Hungary","Croatia","Finland","Chile","Portugal","Japan","South Africa","Iceland","China","Algeria","Guadeloupe Island","Brazil","Zimbabwe","Bosnia and Herzegovina","Azerbaijan","Turkey","Egypt","Guatemala","Belarus","Vietnam","Romania","Northern Ireland","Lithuania","Saint Lucia","Greece","Republic of Macedonia","Pakistan","Venezuela","Bulgaria","Colombia","Mexico","Madagascar","Taiwan","Nigeria","South Korea","Costa Rica","People's Republic of China","Myanmar","Israel","East Timor","Trinidad","Ghana","Iran","Kenya","Bangladesh","Cyprus","Peru","Liberia","Yemen","Morocco")

#cat(countryCodes$geoLong,sep=",")
geoLong<-c(5.291266,2.213749,19.14514,8.227512,10.45153,78.96288,18.6435,8.468946,9.501785,-3.435973,-4.202646,-3.74922,105.3188,15.47296,19.69902,12.56738,-95.71289,174.886,31.16558,6.129583,24.60319,4.469936,14.55007,133.7751,14.99546,-7.692054,-106.3468,113.9213,-63.61667,19.5033,15.2,25.74815,-71.54297,-8.224454,138.2529,22.93751,-19.02084,104.1954,1.659626,-61.551,-51.92528,29.15486,17.67908,47.57693,35.24332,30.8025,-90.23076,27.95339,108.2772,24.96676,-6.492314,23.88127,-60.97889,21.82431,21.74527,69.34512,-66.58973,25.48583,-74.29733,-102.5528,46.86911,120.9605,8.675277,127.7669,-83.75343,104.1954,95.95597,34.85161,125.7275,-61.2225,-1.023194,53.68805,37.90619,90.35633,33.42986,-75.01515,-9.429499,48.51639,-7.09262)

#cat(countryCodes$geoLat,sep=",")
geoLat<-c(52.13263,46.22764,51.91944,46.81819,51.16569,20.59368,60.12816,60.47202,56.26392,55.37805,56.49067,40.46367,61.52401,49.81749,48.66903,41.87194,37.09024,-40.90056,48.37943,49.81527,56.87964,50.50389,47.51623,-25.2744,46.15124,53.14237,56.13037,-0.789275,-38.4161,47.16249,45.1,61.92411,-35.67515,39.39987,36.20482,-30.55948,64.96305,35.86166,28.03389,16.265,-14.235,-19.01544,43.91589,40.1431,38.96375,26.82055,15.78347,53.70981,14.05832,45.94316,54.78771,55.16944,13.90944,39.07421,41.60863,30.37532,6.42375,42.73388,4.570868,23.6345,-18.76695,23.69781,9.081999,35.90776,9.748917,35.86166,21.91622,31.04605,-8.874217,10.6918,7.946527,32.42791,-0.023559,23.68499,35.12641,-9.189967,6.428055,15.55273,31.7917)

countryCodes<-data.frame("country"=nameCountry,"Longitude"=geoLong,"Latitude"=geoLat)
DT::datatable(countryCodes)

2 By Country

2.1 Map

byCountry<-as.data.frame(df %>% select(Birth.Country) %>% filter(Birth.Country != "") %>% group_by(Birth.Country) %>% summarise(number=n()) %>% arrange(-number))
byCountry$name<-sapply(byCountry$Birth.Country,stripMe)
byCountry<-as.data.frame(byCountry %>% select(number,name) %>% group_by(name) %>% summarise(sum=sum(number)))
colnames(byCountry)<-c('country','number')
byCountry$country<-as.character(byCountry$country)
#left join
res<-merge(x=byCountry,y=countryCodes,by='country',all.x=TRUE)
res<-na.omit(res)
map<-ggplot() + borders("world",colour="grey75",fill="white")
g1<-map + geom_point(data=res,aes(x=Longitude, y=Latitude,size=number,labels=country),alpha=.5) +
  xlab('Longitude') + ylab('Latitude') +
  theme(legend.text=element_text(size=12),legend.position="top") + 
  labs(size="Number of Nobel Prizes") +
  ggtitle('Map of Nobel Prizes')
#g1
py <- ggplotly(g1,width = 800, height = 600)
py

2.2 Histogram

res$country <- reorder(res$country,-res$number)
res$Percentage <- res$number / sum(res$number) * 100
res$Percent <-paste0(round(res$Percentage,1),"%")
g2<-ggplot(res, aes(x=country, y=number,labels=Percent)) + geom_bar(width = 0.9, stat="identity") + 
  xlab('') + ylab('Number of Nobel Prizes') + theme(axis.text.x = element_text(angle=90, hjust=1)) +
  ggtitle('Distribution of Nobel Prizes per country')
py2 <- ggplotly(g2,width = 800, height = 600)
py2

The United States of America are on top of the Countries having Nobel Prizes with ~30% of all prizes since their creation.

3 By Year, Gender and Category

3.1 Categories vs. Year, Gender

tt<-as.data.frame(df %>% select(Year,Sex,Category,Birth.Country) %>% 
                    filter(Birth.Country != "") %>% 
                    group_by(Year,Sex,Category) %>% 
                    summarise(number=n()) %>% 
                    arrange(-number))
g1<-ggplot(data=filter(tt,Sex=='Female'),aes(Year,Category,fill=number)) + 
  geom_tile(aes(fill = number),colour = "white") + 
  scale_fill_gradient(low="#3B9AB2",high="black",guide = guide_legend(title = "Female")) +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank()) +xlim(1900,2015)
g2<-ggplot(data=filter(tt,Sex=='Male'),aes(Year,Category,fill=number)) + 
  geom_tile(aes(fill = number),colour = "white") + 
  scale_fill_gradient(low="#EBCC2A",high="black",guide = guide_legend(title = "Male")) +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())+xlim(1900,2015)

#grid.newpage()
grid.draw(rbind(ggplotGrob(g1), ggplotGrob(g2), size = "last"))

#grid.arrange(g1,g2,ncol=1)
#df %>% select(Year,Sex,Category,Birth.Country) %>% 
#  filter(Birth.Country != "") %>% 
#  group_by(Year,Sex,Category) %>% 
#  summarise(number=n()) %>% 
#  arrange(-number) %>% 
#  ggplot(aes(Year,Category)) + geom_tile(aes(fill = number),colour = "white") +
#  scale_fill_gradient(low="steelblue",high="black") + facet_wrap(~Sex,ncol=2) + 
#  theme(legend.position="top") + xlab('') + ylab('')

There is a huge disparity between Men and Women, for all Categories
It’s interesting to see the gap the WWII period (When WW II broke out in Europe in 1939, it created a new situation for the Nobel Peace Prize. The Norwegian Nobel Committee awarded no prizes before the war ended in 1945) source

3.2 Histogram by Category, Gender

ttt<-df %>% select(Sex,Category,Birth.Country) %>% filter(Birth.Country != "") %>% group_by(Sex,Category) %>% summarise(number=n()) %>% arrange(-number)
ttt$Percentage <- ttt$number / sum(ttt$number) * 100
ttt$Percent <-paste0(round(ttt$Percentage,1),"%")
ggplot(ttt, aes(x=Category, y=number, fill=Sex)) + geom_bar(width = 0.9, stat="identity",position="dodge") + 
  xlab('') + ylab('Number of Nobel Prizes') + theme(axis.text.x = element_text(angle=90, hjust=1)) + 
  geom_text(aes(label=Percent), position=position_dodge(width=0.9), vjust=-0.25,size=4) +
  scale_fill_manual(values = c("#3B9AB2","#EBCC2A"))

The disparity is even more obvious in this histogram
In all Categories the women are under-represented

3.3 Number of Nobel Prizes per Year, cumulared over Years

tt<-as.data.frame(df %>% filter(Birth.Country != "") %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "") %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4 Number of Nobel Prizes per Year, cumulated over Years, breakdown by Category

4.1 Category : `Chemistry`

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Chemistry') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Chemistry') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.2 Category : `Physics`

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Physics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Physics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.3 Category : `Medicine`

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Medicine') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Medicine') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.4 Category : `Economics`

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Economics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Economics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.5 Category : `Literature`

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Literature') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Literature') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.6 Category : `Peace`

tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Peace') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylim(0,20) + ylab('number of Nobel Prizes') +
  theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())

ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Peace') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
  scale_color_manual(values = c("#3B9AB2","#EBCC2A")) + 
  ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())

grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))

4.7 Comments

Now that’s a really scary finding :

No Women won a Nobel Prize in Physics since the sixties
No Women won Nobel Prizes in Economics
the ratio (# of Nobel Prizes Men) / (# of Nobel Prizes Women) looks a bit better in Literature, same for Peace category

5 Comparison Men/Women [radarchart]

cat<-c('Chemistry','Economics','Literature','Medicine','Peace','Physics')
chemistry<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Chemistry') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

economics<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Economics') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

literature<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Literature') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

medicine<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Medicine') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

peace<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Peace') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

physics<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Physics') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))

women<-c(chemistry$number[1],economics$number[1],literature$number[1],medicine$number[1],peace$number[1],physics$number[1])
men<-c(chemistry$number[2],economics$number[2],literature$number[2],medicine$number[2],peace$number[2],physics$number[2])

min<-rep(0,6)
max<-c(200,100,120,250,100,250)
summary<-data.frame(rbind(max,min,women,men))
colnames(summary)<-cat

womenPercent<-c(chemistry$percentage[1],economics$percentage[1],literature$percentage[1],medicine$percentage[1],peace$percentage[1],physics$percentage[1])
menPercent<-c(chemistry$percentage[2],economics$percentage[2],literature$percentage[2],medicine$percentage[2],peace$percentage[2],physics$percentage[2])

min<-rep(0,6)
max<-rep(100,6)
summaryPercent<-data.frame(rbind(max,min,womenPercent,menPercent))
colnames(summaryPercent)<-cat

op <- par(mar = c(1, 2, 2, 1), mfrow = c(1, 2))

radarchart( summary , axistype=2 , pcol=c("#3B9AB2", "#EBCC2A") , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=1,vlcex=1,palcex=1.2)
legend(x=-1.3, y=1.3, legend = c('Women','Men'), bty = "n", pch=16 ,col=c("#3B9AB2", "#EBCC2A") , text.col = "black", cex=.8, pt.cex=.5,title='Raw number of \n Nobel Prizes')

radarchart( summaryPercent , axistype=2 , pcol=c("#3B9AB2", "#EBCC2A") , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=1,vlcex=1,palcex=1.2)
legend(x=-1.3, y=1.3, legend = c('Women','Men'), bty = "n", pch=16 ,col=c("#3B9AB2", "#EBCC2A") , text.col = "black", cex=.8, pt.cex=.5,title='Percentage of Nobel Prizes \n within Category')

The 2 radar charts show the same information ; left is raw number of nobel Prizes but right panel is the percentage Women/Men with a Category

kable(summary, format = "markdown")

	Chemistry	Economics	Literature	Medicine	Peace	Physics
max	200	100	120	250	100	250
min	0	0	0	0	0	0
women	4	2	14	12	16	2
men	190	81	99	215	88	220

History :

version 1 : initial commit

Nobel Prizes

Jonathan Bouchet

2017-02-27

1 Data preparation

1.1 Get `geoCodes`

1.2 Impute `Geocodes` by hand

2 By Country

2.1 Map

2.2 Histogram

3 By Year, Gender and Category

3.1 Categories vs. Year, Gender

3.2 Histogram by Category, Gender

3.3 Number of Nobel Prizes per Year, cumulared over Years

4 Number of Nobel Prizes per Year, cumulated over Years, breakdown by Category

4.1 Category : `Chemistry`

4.2 Category : `Physics`

4.3 Category : `Medicine`

4.4 Category : `Economics`

4.5 Category : `Literature`

4.6 Category : `Peace`

4.7 Comments

5 Comparison Men/Women [radarchart]

Nobel Prizes

Jonathan Bouchet

2017-02-27

1 Data preparation

1.1 Get geoCodes

1.2 Impute Geocodes by hand

2 By Country

2.1 Map

2.2 Histogram

3 By Year, Gender and Category

3.1 Categories vs. Year, Gender

3.2 Histogram by Category, Gender

3.3 Number of Nobel Prizes per Year, cumulared over Years

4 Number of Nobel Prizes per Year, cumulated over Years, breakdown by Category

4.1 Category : Chemistry

4.2 Category : Physics

4.3 Category : Medicine

4.4 Category : Economics

4.5 Category : Literature

4.6 Category : Peace

4.7 Comments

5 Comparison Men/Women [radarchart]

1.1 Get `geoCodes`

1.2 Impute `Geocodes` by hand

4.1 Category : `Chemistry`

4.2 Category : `Physics`

4.3 Category : `Medicine`

4.4 Category : `Economics`

4.5 Category : `Literature`

4.6 Category : `Peace`