Import data and create data overview

# Load the cleansed Nobel laureate table; text columns stay character vectors.
nobel <- read.csv(
  file = "nobel_cleansed.csv",
  stringsAsFactors = FALSE
)
# Coerce birth year to integer. Entries that cannot be parsed become NA,
# which is what triggers the coercion warning shown below.
nobel[["Birth_Year"]] <- as.integer(nobel[["Birth_Year"]])
## Warning: NAs introduced by coercion
# Show which columns are available for the analysis.
colnames(nobel)
##  [1] "Name"           "Category"       "Birthdate"      "Birth_Day"     
##  [5] "Birth_Month"    "Birth_Year"     "Birth.Place"    "Motivation"    
##  [9] "Prize.Name"     "Year"           "County"         "Residence"     
## [13] "Role.Affiliate" "Field.Language"

Is there a particular time period or year when Nobel Prize winners were born?

library(ggthemes)
## Loading required package: ggplot2
# Attach ggplot2 explicitly. The plotting functions used throughout this
# script (ggplot, theme_set, theme_minimal, ggsave, ...) live in ggplot2, and
# recent ggthemes releases import ggplot2 without attaching it, so relying on
# library(ggthemes) alone can leave those functions unavailable.
library(ggplot2)
# Apply a minimal theme with a base font size of 12 to all subsequent plots.
theme_set(theme_minimal(12))

# Find the range of birth years to set an appropriate plot range
summary(nobel$Birth_Year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1647    1887    1912    1907    1930    1988       8
# Histogram of winners' birth years, one bin per year. Rows whose birth year
# is NA are dropped before plotting.
birth_year_plot <- ggplot(
  data = subset(nobel, !is.na(Birth_Year)),
  mapping = aes(x = Birth_Year)
) +
  geom_histogram(binwidth = 1) +
  # Breaks span the min/max reported by summary() above, every 50 years.
  scale_x_continuous(breaks = seq(1647, 1988, 50)) +
  xlab('Birth Year of Winners') +
  ylab('Counts of Winners') +
  ggtitle('Winner Birth Year Analysis')
# Display the plot.
birth_year_plot
# Save as a separate step with the plot passed explicitly. ggsave() is not a
# plot layer: adding it with `+` fails on current ggplot2 and, on versions
# where it did not fail, could write whichever plot was displayed last.
ggsave('Winner_Birth_Year.png', plot = birth_year_plot)
## Saving 7 x 5 in image

# Insight: 1918 has the highest number of Nobel Prize winners born in a single year (~30% more than the year with the second-highest number of winners born). Most of the winners were born between 1900 and 1940, which plausibly reflects the demand for scientific advances driven by war and economic growth.

Any outlier in birth month?

library(ggthemes)
theme_set(theme_minimal(12))

# Distribution of winners' birth months; rows with an empty birth month are
# excluded before plotting.
# NOTE(review): geom_histogram() needs a continuous x. If Birth_Month is
# stored as text (the comparison with '' suggests it may be), geom_bar() is
# the appropriate geom -- confirm the column type.
birth_month_plot <- ggplot(
  data = subset(nobel, Birth_Month != ''),
  mapping = aes(x = Birth_Month)
) +
  geom_histogram(binwidth = 1) +
  xlab('Birth Month of Winners') +
  ylab('Counts of Winners') +
  ggtitle('Winner Birth Month Analysis')
# Display the plot.
birth_month_plot
# Save as a separate step with the plot passed explicitly. ggsave() is not a
# plot layer: adding it with `+` fails on current ggplot2 and, on versions
# where it did not fail, could write whichever plot was displayed last.
ggsave('Winner_Birth_Month.png', plot = birth_month_plot)
## Saving 7 x 5 in image

When did the winners who were born in 1918 receive their awards?

# Winners born in 1918, the peak birth year. subset() drops rows where the
# condition is NA, so winners with an unknown birth year are excluded.
df_1918 <- subset(nobel, Birth_Year == 1918)

# Histogram of the years in which these winners received their prize,
# one bin per award year.
ggplot(data = df_1918, mapping = aes(x = Year)) +
  geom_histogram(binwidth = 1) +
  xlab('Year of Nobel Prize Awarded') +
  ylab('Counts of Winners') +
  ggtitle('Histogram of Year of Award (winner born in 1918)')

# Number of 1918-born winners per prize category.
table(df_1918$Category)
## 
##  chemistry  economics literature   medicine      peace    physics 
##          8          2          1          4          2          6

How old, on average, are the winners when they receive the Nobel Prize?

# Age at award = award year minus birth year (NA when the birth year is NA).
nobel[["Age_Award"]] <- nobel[["Year"]] - nobel[["Birth_Year"]]

# Histogram of age at award, one bin per year of age, using only the rows
# where the age could be computed.
has_age <- !is.na(nobel[["Age_Award"]])
ggplot(data = nobel[has_age, ], mapping = aes(x = Age_Award)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(from = 0, to = 300, by = 30)) +
  labs(
    x = 'Age when Received Nobel Award',
    y = 'Counts of Winners',
    title = 'Histogram of Age when Received Award'
  )

# Insight: Winners receive the Nobel Prize in their late 50s on average. There are two outliers, with ages of 3 and 300 respectively, which are most likely data-entry errors.

At what age did the winners from the various fields receive their Nobel Prizes?

# Boxplot of age at award for each prize category. Only rows with a known
# category are kept; rows whose age is NA are still passed to the plot.
has_category <- !is.na(nobel[["Category"]])
ggplot(
  data = nobel[has_category, ],
  mapping = aes(x = Category, y = Age_Award)
) +
  geom_boxplot() +
  labs(
    x = 'Field of Study',
    y = 'Age when Receive Nobel Prize',
    title = 'Boxplot of Age when Awarded by Field of Study'
  )
## Warning in loop_apply(n, do.ply): Removed 8 rows containing non-finite
## values (stat_boxplot).

# Five-number summary (plus mean and NA count) of age at award per category.
by(data = nobel$Age_Award, INDICES = nobel$Category, FUN = summary)
## nobel$Category: chemistry
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35.00   49.00   56.00   57.16   64.00   85.00 
## -------------------------------------------------------- 
## nobel$Category: economics
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   48.00   62.00   66.00   66.88   73.00   90.00       1 
## -------------------------------------------------------- 
## nobel$Category: literature
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   56.00   63.00   63.93   72.50   88.00 
## -------------------------------------------------------- 
## nobel$Category: medicine
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   32.00   49.00   56.00   57.28   64.00   87.00       1 
## -------------------------------------------------------- 
## nobel$Category: peace
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    3.00   48.00   61.00   59.68   71.00  300.00       3 
## -------------------------------------------------------- 
## nobel$Category: physics
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   25.00   45.00   54.00   54.59   63.75   88.00       3
# Insight: the boxplot shows that scientists in the hard-science fields, such as Chemistry, Physics, and Medicine, received their awards at a younger age compared to winners in other fields. The age of Nobel Peace Prize winners varies the most.