# NOTE(review): hard-coded, machine-specific working directory. The relative
# CSV paths passed to read.csv() later in this script are resolved against it,
# so the script only runs as-is on a machine that has this folder.
setwd("D:/R/Udacity/EDA_Course_Materials/lesson3")
library(knitr)
library(ggplot2)
library(tidyr)

# Population aged 20-39 years, both sexes (%)

# Read the male and female indicator tables. The first CSV column is used as
# the row names of each data frame (row.names = 1).
male <- read.csv(
  file = "indicator_male 20-39 percen - Data.csv",
  header = TRUE,
  row.names = 1
)
female <- read.csv(
  file = "indicator_female 20-39 percen - Data.csv",
  header = TRUE,
  row.names = 1
)

# Label each table with its sex and copy the row names into a regular
# `country` column.
male["sex"] <- "male"
male["country"] <- rownames(male)
female["sex"] <- "female"
female["country"] <- rownames(female)

# Reshape wide to long: every column other than `sex` and `country` becomes
# one (year, population) row.
tidymale <- gather(male, key = year, value = population, -sex, -country)
tidyfemale <- gather(female, key = year, value = population, -sex, -country)

# Stack both sexes into a single long data frame.
data <- rbind(tidymale, tidyfemale)

# Correct year names: strip every character that is not a digit, dot or minus
# sign from each label (e.g. "X1950" becomes 1950) and convert to numeric.
# This is the base-R equivalent of tidyr::extract_numeric(), which is
# deprecated.
data["year"] <- as.numeric(gsub("[^0-9.-]+", "", as.character(data$year)))
# Plot of male and female population distribution all over the world:
# one scatter panel per sex, year on x and population share on y.
# Built with ggplot() because qplot() is deprecated in current ggplot2. The
# title uses a single "%": plot titles are not format strings, so "%%" would
# be shown literally.
ggplot(data, aes(x = year, y = population)) +
  geom_point() +
  facet_wrap(~sex) +
  ggtitle("World population age 20-39 in %")

#plot of current population divided by genders
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the 2015 rows. `year` is numeric at this point, so it is compared
# against a number rather than the string "2015".
today <- filter(data, year == 2015)

# Box plot of the 2015 population share, one box per sex. Built with ggplot()
# because qplot() is deprecated in current ggplot2; the title uses a single
# "%" since plot titles are not format strings.
ggplot(today, aes(x = sex, y = population)) +
  geom_boxplot() +
  ggtitle("World Male and Female population in 2015 in %")

# Statistical summary (min, quartiles, median, mean, max) of the population
# values in `today`, computed separately for each sex.
by(data = today$population, INDICES = today$sex, FUN = summary)
## today$sex: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.76   27.59   30.06   30.12   32.60   40.80 
## -------------------------------------------------------- 
## today$sex: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   23.16   28.75   31.02   31.23   33.33   51.53
# Draw the yearly population share of one country as one line per sex.
#   country_data: rows of `data` for a single country (the columns year,
#                 population and sex are used).
#   title:        plot title.
# Returns the ggplot object; at top level it is printed automatically.
# geom_line() replaces qplot(geom = "freqpoly", stat = "identity"): current
# ggplot2 ignores qplot()'s deprecated `stat` argument, and the default
# binning stat of a frequency polygon rejects a y aesthetic.
plot_population_by_sex <- function(country_data, title) {
  ggplot(country_data, aes(x = year, y = population, color = sex)) +
    geom_line() +
    ggtitle(title)
}

# Population rate in Russia age 20-39 in %
russia <- filter(data, country == "Russia")
plot_population_by_sex(russia, "Population rate in Russia age 20-39 in %")

# Population rate in Turkey age 20-39 in %
turkey <- filter(data, country == "Turkey")
plot_population_by_sex(turkey, "Population rate in Turkey age 20-39 in %")

# Population rate age 20-39 in Turkey, Russia, UK and USA in %
# Keep only the four countries of interest.
new_set <- filter(
  data,
  country %in% c("Russia", "Turkey", "United States", "United Kingdom")
)

# Average the rows of each (country, year) pair. `new_set` holds one row per
# sex, so this is the unweighted mean of the male and female percentages.
# summarise() replaces the deprecated summarise_each(funs(mean), ...) and
# yields the same `population` column.
by_country_year <- group_by(new_set, country, year)
mean_pop <- summarise(by_country_year, population = mean(population))

# One line per country. geom_line() replaces
# qplot(geom = "freqpoly", stat = "identity"): current ggplot2 ignores
# qplot()'s deprecated `stat` argument, and the default binning stat of a
# frequency polygon rejects a y aesthetic.
ggplot(mean_pop, aes(x = year, y = population, color = country)) +
  geom_line() +
  ggtitle("Population rate age 20-39 in Turkey, Russia, UK and USA in %")