# NOTE(review): absolute, machine-specific working directory; the relative
# CSV paths read later in this script are resolved against it.
setwd("D:/R/Udacity/EDA_Course_Materials/lesson5")
# Attach the packages used below (ggplot2 for the plots, tidyr for gather()).
library(ggplot2)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the six unemployment indicator files (one per sex and age group) and
# tag every table with the sex, age group and country it describes.

# Read one indicator CSV and append the identifying columns.
#   file: path to the CSV; its first column becomes the row names
#   sex:  value stored in the new `sex` column
#   age:  value stored in the new `age` column
# Returns the data frame with `sex`, `age` and `country` appended in that
# order; `country` is a copy of the row names.
read_unemployment <- function(file, sex, age) {
  indicator <- read.csv(file, header = TRUE, row.names = 1)
  indicator["sex"] <- sex
  indicator["age"] <- age
  indicator["country"] <- row.names(indicator)
  indicator
}

# female
f_15_24 <- read_unemployment(
  "indicator_f 15-24 unemploy - Data.csv", "female", "15-24"
)
f_25_54 <- read_unemployment(
  "indicator_f 25-54 unemploy - Data.csv", "female", "25-54"
)
f_above_55 <- read_unemployment(
  "indicator_f above 55 unemploy - Data.csv", "female", "above 55"
)
# male
m_15_24 <- read_unemployment(
  "indicator_m 15-24 unemploy - Data.csv", "male", "15-24"
)
m_25_54 <- read_unemployment(
  "indicator_m 25-54 unemploy - Data.csv", "male", "25-54"
)
m_above_55 <- read_unemployment(
  "indicator_m above 55 unemploy - Data.csv", "male", "above 55"
)
# merge datasets: stack the six tagged tables into one wide table
data <- rbind(f_15_24, f_25_54, f_above_55, m_15_24, m_25_54, m_above_55)
# making tidy data: every column except sex, age and country is gathered
# into a (year, total) pair
t_data <- gather(data, year, total, -sex, -age, -country)
# correct year names: strip everything except digits, "." and "-" from the
# former column names and convert the remainder to numeric. This is the
# base-R expression that tidyr's deprecated extract_numeric() wraps, so the
# result is unchanged while the deprecated call is avoided.
t_data["year"] <- as.numeric(
  gsub("[^0-9.-]+", "", as.character(t_data$year))
)
# Quick inspection of the tidy table: first rows, dimensions and a per-column
# summary. The `##` lines below each call are the recorded console output.
head(t_data)
## sex age country year total
## 1 female 15-24 Australia 1981 NA
## 2 female 15-24 Canada 1981 11.7
## 3 female 15-24 Czech Rep. 1981 NA
## 4 female 15-24 Estonia 1981 NA
## 5 female 15-24 Finland 1981 9.6
## 6 female 15-24 France 1981 23.7
dim(t_data)
## [1] 4350 5
summary(t_data)
## sex age country year
## Length:4350 Length:4350 Length:4350 Min. :1981
## Class :character Class :character Class :character 1st Qu.:1987
## Mode :character Mode :character Mode :character Median :1993
## Mean :1993
## 3rd Qu.:1999
## Max. :2005
##
## total
## Min. : 0.00
## 1st Qu.: 3.80
## Median : 6.50
## Mean : 8.77
## 3rd Qu.:11.70
## Max. :50.90
## NA's :1265
Notes: The unemployment quantiles and means for males and females are almost the same, but the female boxplot has much higher outliers, some even above 50% unemployment.
# Boxplot of the unemployment rate for each sex, drawn only from rows where
# `total` is present; the mean of each group is overlaid as a point (shape 2).
# NOTE(review): ggplot2 >= 3.3.0 renames `fun.y` to `fun`; the original
# argument name is kept here - confirm the installed version before changing.
ggplot(
  data = t_data[!is.na(t_data$total), ],
  mapping = aes(x = sex, y = total)
) +
  geom_boxplot() +
  stat_summary(geom = "point", fun.y = mean, shape = 2)
Notes: The yearly means for 1981 - 2005 show that, on average, the female unemployment rate is higher than or equal to the male rate.
# Mean unemployment rate per year, one coloured line per sex, computed by the
# summary stat over the rows where `total` is present.
ggplot(
  data = t_data[!is.na(t_data$total), ],
  mapping = aes(x = year, y = total)
) +
  geom_line(mapping = aes(colour = sex), stat = "summary", fun.y = mean)
Notes: Check how the patterns differ between countries. We can also see that Spain is the most problematic country in our dataset.
# Turn age into an ordered factor with explicit levels in a single step
# (the earlier factor() + ordered() pair produced the same result).
t_data$age <- factor(
  t_data$age,
  levels = c("15-24", "25-54", "above 55"),
  ordered = TRUE
)
# One panel per country: jittered, half-transparent points of the
# unemployment rate over time, coloured by age group.
# `age` is mapped by bare column name: writing `t_data$age` inside aes()
# bypasses the layer data, which is unsafe once the data is split into
# facets, and also labels the legend "t_data$age".
ggplot(t_data, aes(x = year, y = total)) +
  geom_point(
    aes(color = age),
    alpha = 1 / 2,
    position = position_jitter(width = 0.4)
  ) +
  facet_wrap(~ country) +
  scale_color_brewer(type = "qual")
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_point).
## Warning: Removed 114 rows containing missing values (geom_point).
## Warning: Removed 60 rows containing missing values (geom_point).
## Warning: Removed 24 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 66 rows containing missing values (geom_point).
## Warning: Removed 90 rows containing missing values (geom_point).
## Warning: Removed 102 rows containing missing values (geom_point).
## Warning: Removed 36 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 78 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 102 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_point).
## Warning: Removed 60 rows containing missing values (geom_point).
## Warning: Removed 53 rows containing missing values (geom_point).
## Warning: Removed 120 rows containing missing values (geom_point).
Notes: Unemployment is higher in the younger age groups.
# Every observation in a single panel: unemployment rate over time, with the
# points coloured by age group using a qualitative Brewer palette.
ggplot(data = t_data, mapping = aes(x = year, y = total)) +
  geom_point(mapping = aes(colour = age)) +
  scale_colour_brewer(type = "qual")
## Warning: Removed 1265 rows containing missing values (geom_point).