Unemployment in the World

# NOTE(review): absolute, machine-specific path. The CSV files read further
# down use relative names and are resolved against this directory, so adjust
# it before running on another machine.
setwd("D:/R/Udacity/EDA_Course_Materials/lesson5")
library(ggplot2)  # plotting
library(tidyr)    # wide-to-long reshaping (gather)
# NOTE(review): no dplyr function is called by name in the code visible here;
# confirm whether this attach is still needed.
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Read the six unemployment indicator files (2 sexes x 3 age groups), label
# each with its group, stack them and reshape to one row per
# sex/age/country/year observation.

# Read one indicator CSV and tag every row with its sex, age group and country.
#
# sex_code:  file-name fragment for the sex ("f" or "m").
# sex_label: value stored in the `sex` column.
# age_group: age group label; it is both the file-name fragment and the value
#            stored in the `age` column.
# Returns the data frame read from the file (first CSV column used as row
# names) with `sex`, `age` and `country` columns appended, in that order.
read_unemployment <- function(sex_code, sex_label, age_group) {
  path <- paste0(
    "indicator_", sex_code, " ", age_group, " unemploy - Data.csv"
  )
  df <- read.csv(path, header = TRUE, row.names = 1)
  df[["sex"]] <- sex_label
  df[["age"]] <- age_group
  # The first CSV column was consumed as row names; copy it into a regular
  # column so it is carried through the stacking and reshaping below.
  df[["country"]] <- row.names(df)
  df
}

# female
f_15_24 <- read_unemployment("f", "female", "15-24")
f_25_54 <- read_unemployment("f", "female", "25-54")
f_above_55 <- read_unemployment("f", "female", "above 55")

# male
m_15_24 <- read_unemployment("m", "male", "15-24")
m_25_54 <- read_unemployment("m", "male", "25-54")
m_above_55 <- read_unemployment("m", "male", "above 55")

# merge datasets
data <- rbind(f_15_24, f_25_54, f_above_55, m_15_24, m_25_54, m_above_55)

# make tidy data: every non-label column becomes a (year, total) pair
t_data <- gather(data, year, total, -sex, -age, -country)

# correct year names: the labels come from the CSV column headers, so strip
# every character that cannot be part of a number and convert. This is the
# same expression the deprecated tidyr::extract_numeric() applied.
t_data[["year"]] <- as.numeric(
  gsub("[^0-9.-]+", "", as.character(t_data$year))
)

First look at the tidy data (t_data)

# Show the first six rows of the tidy data.
head(t_data, n = 6L)

##      sex   age    country year total
## 1 female 15-24  Australia 1981    NA
## 2 female 15-24     Canada 1981  11.7
## 3 female 15-24 Czech Rep. 1981    NA
## 4 female 15-24    Estonia 1981    NA
## 5 female 15-24    Finland 1981   9.6
## 6 female 15-24     France 1981  23.7

# Number of rows and columns of the tidy data.
c(nrow(t_data), ncol(t_data))

## [1] 4350    5

# Per-column summary of the tidy data.
summary(object = t_data)

##      sex                age              country               year     
##  Length:4350        Length:4350        Length:4350        Min.   :1981  
##  Class :character   Class :character   Class :character   1st Qu.:1987  
##  Mode  :character   Mode  :character   Mode  :character   Median :1993  
##                                                           Mean   :1993  
##                                                           3rd Qu.:1999  
##                                                           Max.   :2005  
##                                                                         
##      total      
##  Min.   : 0.00  
##  1st Qu.: 3.80  
##  Median : 6.50  
##  Mean   : 8.77  
##  3rd Qu.:11.70  
##  Max.   :50.90  
##  NA's   :1265

Unemployment rate according to gender

Notes: The unemployment quartiles and means for males and females are almost the same, but the female boxplot has much higher outliers, some even above 50% unemployment.

# Box plot of the unemployment rate by sex, with each group's mean overlaid
# as a point (shape 2). Rows with a missing rate are dropped before plotting.
ggplot(data = subset(t_data, !is.na(total)),
       mapping = aes(x = sex, y = total)) +
    geom_boxplot() +
    # `fun` replaces the `fun.y` argument deprecated in ggplot2 3.3.0.
    stat_summary(fun = mean, geom = 'point', shape = 2)

Unemployment mean rate according to gender

Notes: The yearly mean lines for 1981-2005 show that, on average, the female unemployment rate is higher than or equal to the male rate.

# Mean unemployment rate per year, one line per sex. Rows with a missing
# rate are dropped before plotting.
ggplot(data = subset(t_data, !is.na(total)),
       mapping = aes(x = year, y = total)) +
    # The summary stat averages `total` at each year; `fun` replaces the
    # `fun.y` argument deprecated in ggplot2 3.3.0.
    geom_line(aes(color = sex), stat = 'summary', fun = mean)

Unemployment rate of each country yearly by age

Notes: Note how much the patterns differ between countries. We can also see that Spain is the most problematic country in our dataset.

# Store the age group as an ordered factor with the levels running from the
# youngest to the oldest group.
t_data$age <- factor(t_data$age,
                     levels = c('15-24', '25-54', 'above 55'),
                     ordered = TRUE)

# One panel per country: unemployment rate by year, coloured by age group.
# Points are semi-transparent and jittered horizontally to reduce overplotting.
ggplot(t_data, aes(x = year, y = total)) +
    # Map the bare column name: `t_data$age` inside aes() hands ggplot the
    # whole vector instead of the rows belonging to each facet panel, which
    # can mis-colour points and labels the legend "t_data$age".
    geom_point(aes(color = age), alpha = (1/2), position = position_jitter(width = 0.4)) +
    facet_wrap( ~ country) +
    scale_color_brewer(type = 'qual')

## Warning: Removed 30 rows containing missing values (geom_point).

## Warning: Removed 72 rows containing missing values (geom_point).

## Warning: Removed 114 rows containing missing values (geom_point).

## Warning: Removed 60 rows containing missing values (geom_point).

## Warning: Removed 24 rows containing missing values (geom_point).

## Warning: Removed 12 rows containing missing values (geom_point).

## Warning: Removed 66 rows containing missing values (geom_point).

## Warning: Removed 90 rows containing missing values (geom_point).

## Warning: Removed 102 rows containing missing values (geom_point).

## Warning: Removed 36 rows containing missing values (geom_point).

## Warning: Removed 30 rows containing missing values (geom_point).

## Warning: Removed 30 rows containing missing values (geom_point).

## Warning: Removed 72 rows containing missing values (geom_point).

## Warning: Removed 30 rows containing missing values (geom_point).

## Warning: Removed 78 rows containing missing values (geom_point).

## Warning: Removed 12 rows containing missing values (geom_point).

## Warning: Removed 102 rows containing missing values (geom_point).

## Warning: Removed 72 rows containing missing values (geom_point).

## Warning: Removed 60 rows containing missing values (geom_point).

## Warning: Removed 53 rows containing missing values (geom_point).

## Warning: Removed 120 rows containing missing values (geom_point).

Unemployment analysis by age

Notes: Unemployment is higher among the younger age groups.

# Scatter plot of every observation: unemployment rate against year, coloured
# by age group with a qualitative brewer palette.
ggplot(data = t_data, mapping = aes(year, total, colour = age)) +
    scale_colour_brewer(type = "qual") +
    geom_point()

## Warning: Removed 1265 rows containing missing values (geom_point).