options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(Amelia)
library(grid)
library(stringr)
library(ggmap)
library(plotly)
library(fmsb)
library(knitr)
df<-read.csv('archive.csv',sep=',')
#missingmap
#missmap(df)
There are no missing entries, however there are empty dtrings, which corresponds to the Category
= Peace
. So for the section where I will retrieve the geolocations, I will remove this entry
#df %>% select(Birth.Country,Category) %>% filter(Birth.Country == "")
geoCodes
comments :
Birth.Country
, get the real name of the country inside parentheses, and then get the Longtitude
and Latitude
for each#get geoCodes from google
getGeo<-function(x,coord){
for(i in 1:length(x)){
val<-as.numeric(geocode(as.character(x[i])))[coord]
return(val)
}
}
#function to keep name of country (hence remove parentheses)
stripMe<-function(x){
re <- "\\(([^()]+)\\)"
x<-as.character(x)
if(grepl("\\(",x) == TRUE){
y<-gsub(re, "\\1", str_extract_all(x, re)[[1]])
return(as.character(y))
}
else{return(as.character(x))}
}
names<-df %>% filter(Birth.Country != "") %>% select(Birth.Country)
names$name<-sapply(names$Birth.Country,stripMe)
countryTest<-data.frame("val"=unique(names$name))
countryTest$geoLong<-sapply(countryTest$val,getGeo,1)
countryTest$geoLat<-sapply(countryTest$val,getGeo,2)
Geocodes
by handcopy-paste the following dataframe
#cat(paste0('"',countryCodes$countries,'"'),sep=",")
nameCountry<-c("Netherlands","France","Poland","Switzerland","Germany","India","Sweden","Norway","Denmark","United Kingdom","Scotland","Spain","Russia","Czech Republic","Slovakia","Italy","United States of America","New Zealand","Ukraine","Luxembourg","Latvia","Belgium","Austria","Australia","Slovenia","Ireland","Canada","Indonesia","Argentina","Hungary","Croatia","Finland","Chile","Portugal","Japan","South Africa","Iceland","China","Algeria","Guadeloupe Island","Brazil","Zimbabwe","Bosnia and Herzegovina","Azerbaijan","Turkey","Egypt","Guatemala","Belarus","Vietnam","Romania","Northern Ireland","Lithuania","Saint Lucia","Greece","Republic of Macedonia","Pakistan","Venezuela","Bulgaria","Colombia","Mexico","Madagascar","Taiwan","Nigeria","South Korea","Costa Rica","People's Republic of China","Myanmar","Israel","East Timor","Trinidad","Ghana","Iran","Kenya","Bangladesh","Cyprus","Peru","Liberia","Yemen","Morocco")
#cat(countryCodes$geoLong,sep=",")
geoLong<-c(5.291266,2.213749,19.14514,8.227512,10.45153,78.96288,18.6435,8.468946,9.501785,-3.435973,-4.202646,-3.74922,105.3188,15.47296,19.69902,12.56738,-95.71289,174.886,31.16558,6.129583,24.60319,4.469936,14.55007,133.7751,14.99546,-7.692054,-106.3468,113.9213,-63.61667,19.5033,15.2,25.74815,-71.54297,-8.224454,138.2529,22.93751,-19.02084,104.1954,1.659626,-61.551,-51.92528,29.15486,17.67908,47.57693,35.24332,30.8025,-90.23076,27.95339,108.2772,24.96676,-6.492314,23.88127,-60.97889,21.82431,21.74527,69.34512,-66.58973,25.48583,-74.29733,-102.5528,46.86911,120.9605,8.675277,127.7669,-83.75343,104.1954,95.95597,34.85161,125.7275,-61.2225,-1.023194,53.68805,37.90619,90.35633,33.42986,-75.01515,-9.429499,48.51639,-7.09262)
#cat(countryCodes$geoLat,sep=",")
geoLat<-c(52.13263,46.22764,51.91944,46.81819,51.16569,20.59368,60.12816,60.47202,56.26392,55.37805,56.49067,40.46367,61.52401,49.81749,48.66903,41.87194,37.09024,-40.90056,48.37943,49.81527,56.87964,50.50389,47.51623,-25.2744,46.15124,53.14237,56.13037,-0.789275,-38.4161,47.16249,45.1,61.92411,-35.67515,39.39987,36.20482,-30.55948,64.96305,35.86166,28.03389,16.265,-14.235,-19.01544,43.91589,40.1431,38.96375,26.82055,15.78347,53.70981,14.05832,45.94316,54.78771,55.16944,13.90944,39.07421,41.60863,30.37532,6.42375,42.73388,4.570868,23.6345,-18.76695,23.69781,9.081999,35.90776,9.748917,35.86166,21.91622,31.04605,-8.874217,10.6918,7.946527,32.42791,-0.023559,23.68499,35.12641,-9.189967,6.428055,15.55273,31.7917)
countryCodes<-data.frame("country"=nameCountry,"Longitude"=geoLong,"Latitude"=geoLat)
DT::datatable(countryCodes)
byCountry<-as.data.frame(df %>% select(Birth.Country) %>% filter(Birth.Country != "") %>% group_by(Birth.Country) %>% summarise(number=n()) %>% arrange(-number))
byCountry$name<-sapply(byCountry$Birth.Country,stripMe)
byCountry<-as.data.frame(byCountry %>% select(number,name) %>% group_by(name) %>% summarise(sum=sum(number)))
colnames(byCountry)<-c('country','number')
byCountry$country<-as.character(byCountry$country)
#left join
res<-merge(x=byCountry,y=countryCodes,by='country',all.x=TRUE)
res<-na.omit(res)
map<-ggplot() + borders("world",colour="grey75",fill="white")
g1<-map + geom_point(data=res,aes(x=Longitude, y=Latitude,size=number,labels=country),alpha=.5) +
xlab('Longitude') + ylab('Latitude') +
theme(legend.text=element_text(size=12),legend.position="top") +
labs(size="Number of Nobel Prizes") +
ggtitle('Map of Nobel Prizes')
#g1
py <- ggplotly(g1,width = 800, height = 600)
py
res$country <- reorder(res$country,-res$number)
res$Percentage <- res$number / sum(res$number) * 100
res$Percent <-paste0(round(res$Percentage,1),"%")
g2<-ggplot(res, aes(x=country, y=number,labels=Percent)) + geom_bar(width = 0.9, stat="identity") +
xlab('') + ylab('Number of Nobel Prizes') + theme(axis.text.x = element_text(angle=90, hjust=1)) +
ggtitle('Distribution of Nobel Prizes per country')
py2 <- ggplotly(g2,width = 800, height = 600)
py2
United States of America
are on top of the Countries having Nobel Prizes with ~30% of all prizes since their creation.tt<-as.data.frame(df %>% select(Year,Sex,Category,Birth.Country) %>%
filter(Birth.Country != "") %>%
group_by(Year,Sex,Category) %>%
summarise(number=n()) %>%
arrange(-number))
g1<-ggplot(data=filter(tt,Sex=='Female'),aes(Year,Category,fill=number)) +
geom_tile(aes(fill = number),colour = "white") +
scale_fill_gradient(low="#3B9AB2",high="black",guide = guide_legend(title = "Female")) +
theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank()) +xlim(1900,2015)
g2<-ggplot(data=filter(tt,Sex=='Male'),aes(Year,Category,fill=number)) +
geom_tile(aes(fill = number),colour = "white") +
scale_fill_gradient(low="#EBCC2A",high="black",guide = guide_legend(title = "Male")) +
theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())+xlim(1900,2015)
#grid.newpage()
grid.draw(rbind(ggplotGrob(g1), ggplotGrob(g2), size = "last"))
#grid.arrange(g1,g2,ncol=1)
#df %>% select(Year,Sex,Category,Birth.Country) %>%
# filter(Birth.Country != "") %>%
# group_by(Year,Sex,Category) %>%
# summarise(number=n()) %>%
# arrange(-number) %>%
# ggplot(aes(Year,Category)) + geom_tile(aes(fill = number),colour = "white") +
# scale_fill_gradient(low="steelblue",high="black") + facet_wrap(~Sex,ncol=2) +
# theme(legend.position="top") + xlab('') + ylab('')
When WW II broke out in Europe in 1939, it created a new situation for the Nobel Peace Prize. The Norwegian Nobel Committee awarded no prizes before the war ended in 1945
) sourcettt<-df %>% select(Sex,Category,Birth.Country) %>% filter(Birth.Country != "") %>% group_by(Sex,Category) %>% summarise(number=n()) %>% arrange(-number)
ttt$Percentage <- ttt$number / sum(ttt$number) * 100
ttt$Percent <-paste0(round(ttt$Percentage,1),"%")
ggplot(ttt, aes(x=Category, y=number, fill=Sex)) + geom_bar(width = 0.9, stat="identity",position="dodge") +
xlab('') + ylab('Number of Nobel Prizes') + theme(axis.text.x = element_text(angle=90, hjust=1)) +
geom_text(aes(label=Percent), position=position_dodge(width=0.9), vjust=-0.25,size=4) +
scale_fill_manual(values = c("#3B9AB2","#EBCC2A"))
tt<-as.data.frame(df %>% filter(Birth.Country != "") %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylim(0,20) + ylab('number of Nobel Prizes') +
theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())
ttt<-as.data.frame(df %>% filter(Birth.Country != "") %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())
grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))
Chemistry
tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Chemistry') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylim(0,20) + ylab('number of Nobel Prizes') +
theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())
ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Chemistry') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())
grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))
Physics
tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Physics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylim(0,20) + ylab('number of Nobel Prizes') +
theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())
ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Physics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())
grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))
Medicine
tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Medicine') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylim(0,20) + ylab('number of Nobel Prizes') +
theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())
ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Medicine') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())
grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))
Economics
tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Economics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylim(0,20) + ylab('number of Nobel Prizes') +
theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())
ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Economics') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())
grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))
Literature
tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Literature') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylim(0,20) + ylab('number of Nobel Prizes') +
theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())
ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Literature') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())
grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))
Peace
tt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Peace') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()))
h1<-ggplot(data=tt,aes(x=Year,y=number)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylim(0,20) + ylab('number of Nobel Prizes') +
theme(legend.position="right") + xlab('') + ylab('') + theme(axis.title.x = element_blank())
ttt<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Peace') %>% select(Sex,Year) %>% group_by(Sex,Year) %>% summarise(number=n()) %>% mutate(cs = cumsum(number)))
h2<-ggplot(data=ttt,aes(x=Year,y=cs)) + geom_line(aes(color=Sex),size=2) +
scale_color_manual(values = c("#3B9AB2","#EBCC2A")) +
ylab('Cumulated number of Nobel Prizes') + theme(axis.title.x = element_blank())
grid.draw(rbind(ggplotGrob(h1), ggplotGrob(h2), size = "last"))
cat<-c('Chemistry','Economics','Literature','Medicine','Peace','Physics')
chemistry<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Chemistry') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))
economics<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Economics') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))
literature<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Literature') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))
medicine<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Medicine') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))
peace<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Peace') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))
physics<-as.data.frame(df %>% filter(Birth.Country != "" & Category=='Physics') %>% select(Sex) %>% group_by(Sex) %>% summarise(number=n())) %>% mutate(countT= sum(number)) %>% mutate(percentage=round(100*number/countT,1))
women<-c(chemistry$number[1],economics$number[1],literature$number[1],medicine$number[1],peace$number[1],physics$number[1])
men<-c(chemistry$number[2],economics$number[2],literature$number[2],medicine$number[2],peace$number[2],physics$number[2])
min<-rep(0,6)
max<-c(200,100,120,250,100,250)
summary<-data.frame(rbind(max,min,women,men))
colnames(summary)<-cat
womenPercent<-c(chemistry$percentage[1],economics$percentage[1],literature$percentage[1],medicine$percentage[1],peace$percentage[1],physics$percentage[1])
menPercent<-c(chemistry$percentage[2],economics$percentage[2],literature$percentage[2],medicine$percentage[2],peace$percentage[2],physics$percentage[2])
min<-rep(0,6)
max<-rep(100,6)
summaryPercent<-data.frame(rbind(max,min,womenPercent,menPercent))
colnames(summaryPercent)<-cat
op <- par(mar = c(1, 2, 2, 1), mfrow = c(1, 2))
radarchart( summary , axistype=2 , pcol=c("#3B9AB2", "#EBCC2A") , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=1,vlcex=1,palcex=1.2)
legend(x=-1.3, y=1.3, legend = c('Women','Men'), bty = "n", pch=16 ,col=c("#3B9AB2", "#EBCC2A") , text.col = "black", cex=.8, pt.cex=.5,title='Raw number of \n Nobel Prizes')
radarchart( summaryPercent , axistype=2 , pcol=c("#3B9AB2", "#EBCC2A") , plwd=2 , plty=3,cglcol="grey", cglty=2,axislabcol="grey", cglwd=1,vlcex=1,palcex=1.2)
legend(x=-1.3, y=1.3, legend = c('Women','Men'), bty = "n", pch=16 ,col=c("#3B9AB2", "#EBCC2A") , text.col = "black", cex=.8, pt.cex=.5,title='Percentage of Nobel Prizes \n within Category')
The 2 radar charts show the same information ; left is raw number of nobel Prizes but right panel is the percentage Women/Men with a Category
kable(summary, format = "markdown")
Chemistry | Economics | Literature | Medicine | Peace | Physics | |
---|---|---|---|---|---|---|
max | 200 | 100 | 120 | 250 | 100 | 250 |
min | 0 | 0 | 0 | 0 | 0 | 0 |
women | 4 | 2 | 14 | 12 | 16 | 2 |
men | 190 | 81 | 99 | 215 | 88 | 220 |
History :
4.7 Comments
Now that’s a really scary finding :
Physics
since the sixtiesEconomics
Men
) / (# of Nobel PrizesWomen
) looks a bit better inLiterature
, same forPeace
category