This is a part of an introduction to R for PyData Cyprus in Limassol at 17.05.2018 See the meetup https://www.meetup.com/PyData-Cyprus/
To be able to show greek text on the graphs
Sys.setlocale(category = "LC_ALL", locale = "Greek")
## [1] "LC_COLLATE=Greek_Greece.1253;LC_CTYPE=Greek_Greece.1253;LC_MONETARY=Greek_Greece.1253;LC_NUMERIC=C;LC_TIME=Greek_Greece.1253"
Load the necessary libraries
library(tidyverse) #Data wrangling, visualisation
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 2.2.1 <U+221A> purrr 0.2.5
## <U+221A> tibble 1.4.2 <U+221A> dplyr 0.7.6
## <U+221A> tidyr 0.8.1 <U+221A> stringr 1.2.0
## <U+221A> readr 1.1.1 <U+221A> forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl) #read excel files
library(httr) #Working with URLs and HTTP.I need this because the xls file is online
## Warning: package 'httr' was built under R version 3.4.4
library(forcats) #deal wiith factors
library(stringr) #deal with strings
library(hrbrthemes) #theme with focus on typography
## Warning: package 'hrbrthemes' was built under R version 3.4.4
Read the dataset (it is in Excel and can be found in https://bit.ly/2Gugfib)
#Save the url into an object
url<-"http://www.cystat.gov.cy/mof/cystat/statistics.nsf/All/002613A044D463C2C2257FF00052B154/$file/EARNINGS-SURVEY2014_MONTHLY-150716.xls"
GET(url, write_disk(tf <- tempfile(fileext = ".xls"))) #get the dataset and save on disk temporatily
## Response [http://www.cystat.gov.cy/mof/cystat/statistics.nsf/All/002613A044D463C2C2257FF00052B154/$file/EARNINGS-SURVEY2014_MONTHLY-150716.xls]
## Date: 2018-09-01 12:17
## Status: 200
## Content-Type: application/vnd.ms-excel
## Size: 691 kB
## <ON DISK> C:\Users\User\AppData\Local\Temp\Rtmp2tFTmr\file58429fd4de1.xls
Many thanks to these guys here https://bit.ly/2k8j1AU
Read the excel file in R
salarydata <- read_excel(tf,sheet="Î’3-B3",skip=6) #read the sheet we want and skip the first 6 lines
a)drop some columns ~ remove the first 2 columns
b)get the total earnings only
Note that the first blank column in the worksheet is not read by R since it is completely blank
salary=salarydata%>%
select(4:7)%>%
filter(between (row_number(),2, 20 ))
## Warning: package 'bindrcpp' was built under R version 3.4.4
salary=salary%>%
magrittr::set_names(c("nace", "total", "male", "female"))%>%
gather(gender, earnings, total:female) %>%
#separate the nace into gr (greek) and en (english)
separate(nace, into = c("nace_gr", "nace_en"), sep = "\n")
d)earnings are not recognized as a numeric variable (due to the untidiness of the source file)
We coerce it to be numeric:
salary$earnings=as.numeric(salary$earnings)
Plot the total (average) earnings in euro, per economic activity (NACE classification)
salary%>%
filter(gender=="total")%>%
ggplot(aes(x=fct_reorder(nace_en, earnings), y=round(earnings,1), group=1))+
coord_flip()+
geom_point()+
geom_bar(stat="identity", width=0.05)+
geom_text(aes(label=round(earnings,0)), hjust=-0.25)+
theme(axis.text.y = element_text(hjust = 0))+
ylim(0,3000)+
#wrap the labels (too long)
scale_x_discrete(labels = function(x) str_wrap(x, width = 40))+
labs(title="Average earnings by NACE",
x="", y="Average Earnings",
caption="Source: CYSTAT")+
theme_ipsum_rc()
Gender gap by economic by activity
Dot plot
#turn the earnings into thousands
salary_thousands = salary%>%
mutate(earnings=earnings/1000)
salary_thousands%>%
#remove the overall results (keeps the female and male observations)
filter(gender!="total")%>%
#Lets plot
ggplot(aes(x=fct_reorder(nace_gr, earnings), y=earnings))+
#turn the coordinated system (to look better)
coord_flip()+
#line indicating the gap
geom_line(aes(group= nace_gr))+
#and the oints
geom_point(aes(colour=gender), size = 3, alpha=0.89)+
#Now the numbers
#the male's number's on the right
geom_text(data = subset(salary_thousands, gender == "male"),
aes(label=round(earnings,1), colour= gender),
hjust = -0.85)+
#the female's on the left
geom_text(data = subset(salary_thousands, gender == "female"),
aes(label=round(earnings,1), colour= gender),
hjust = +1.85)+
#remove the legend title (gender is obvious)
guides(colour=guide_legend(title=NULL))+
#fix some aesthetic components
scale_y_continuous(expand=c(0.2,0), limits=c(0, 3.5))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 40))+
#the titles
labs(title="Gender pay gap in Cyprus 2014", subtitle="By economic activity",
x="NACE activity -Rev.2", y="Average Earnings('€000)",
caption="Source: CYSTAT")+
#the colour to be used
scale_color_ipsum()+
hrbrthemes::theme_ipsum_rc()+
#fix the above theme a little bit
theme(panel.grid.major.y = element_line(colour="grey90"),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.justification = c(0, 1))