This is a part of an introduction to R for PyData Cyprus in Limassol at 17.05.2018 See the meetup https://www.meetup.com/PyData-Cyprus/

To be able to show greek text on the graphs

Sys.setlocale(category = "LC_ALL", locale = "Greek")
## [1] "LC_COLLATE=Greek_Greece.1253;LC_CTYPE=Greek_Greece.1253;LC_MONETARY=Greek_Greece.1253;LC_NUMERIC=C;LC_TIME=Greek_Greece.1253"

Libraries

Load the necessary libraries

library(tidyverse)  #Data wrangling, visualisation
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 2.2.1     <U+221A> purrr   0.2.5
## <U+221A> tibble  1.4.2     <U+221A> dplyr   0.7.6
## <U+221A> tidyr   0.8.1     <U+221A> stringr 1.2.0
## <U+221A> readr   1.1.1     <U+221A> forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)     #read excel files
library(httr)       #Working with URLs and HTTP.I need this because the xls file is online
## Warning: package 'httr' was built under R version 3.4.4
library(forcats)    #deal wiith factors
library(stringr)    #deal with strings
library(hrbrthemes) #theme with focus on typography
## Warning: package 'hrbrthemes' was built under R version 3.4.4

Read dataset from the CYSTAT page

Read the dataset (it is in Excel and can be found in https://bit.ly/2Gugfib)

#Save the url into an object
url<-"http://www.cystat.gov.cy/mof/cystat/statistics.nsf/All/002613A044D463C2C2257FF00052B154/$file/EARNINGS-SURVEY2014_MONTHLY-150716.xls"

GET(url, write_disk(tf <- tempfile(fileext = ".xls"))) #get the dataset and save on disk temporatily
## Response [http://www.cystat.gov.cy/mof/cystat/statistics.nsf/All/002613A044D463C2C2257FF00052B154/$file/EARNINGS-SURVEY2014_MONTHLY-150716.xls]
##   Date: 2018-09-01 12:17
##   Status: 200
##   Content-Type: application/vnd.ms-excel
##   Size: 691 kB
## <ON DISK>  C:\Users\User\AppData\Local\Temp\Rtmp2tFTmr\file58429fd4de1.xls

Many thanks to these guys here https://bit.ly/2k8j1AU

Read the excel file in R

salarydata <- read_excel(tf,sheet="Î’3-B3",skip=6)      #read the sheet we want and skip the first 6 lines

Now some cleaning:

a)drop some columns ~ remove the first 2 columns
b)get the total earnings only
Note that the first blank column in the worksheet is not read by R since it is completely blank

salary=salarydata%>%
  select(4:7)%>%
  filter(between (row_number(),2, 20 ))
## Warning: package 'bindrcpp' was built under R version 3.4.4
  1. change the column names and tidy a little bit
salary=salary%>%
  magrittr::set_names(c("nace", "total", "male", "female"))%>%
  gather(gender, earnings, total:female) %>% 
  #separate the nace into gr (greek) and en (english)
  separate(nace, into = c("nace_gr", "nace_en"), sep = "\n") 

d)earnings are not recognized as a numeric variable (due to the untidiness of the source file)
We coerce it to be numeric:

salary$earnings=as.numeric(salary$earnings)

Plots

Plot the total (average) earnings in euro, per economic activity (NACE classification)

salary%>%
  filter(gender=="total")%>%
  ggplot(aes(x=fct_reorder(nace_en, earnings), y=round(earnings,1), group=1))+
  coord_flip()+
  geom_point()+
  geom_bar(stat="identity", width=0.05)+
  geom_text(aes(label=round(earnings,0)), hjust=-0.25)+
  theme(axis.text.y = element_text(hjust = 0))+
  ylim(0,3000)+
  #wrap the labels (too long)
  scale_x_discrete(labels = function(x)  str_wrap(x, width = 40))+
  labs(title="Average earnings by NACE",
       x="", y="Average Earnings",
       caption="Source: CYSTAT")+
  theme_ipsum_rc()

Gender gap by economic by activity

Dot plot

#turn the earnings into thousands
salary_thousands = salary%>%
  mutate(earnings=earnings/1000)

salary_thousands%>%
  #remove the overall results (keeps the female and male observations)
  filter(gender!="total")%>%
  #Lets plot
  ggplot(aes(x=fct_reorder(nace_gr, earnings), y=earnings))+
  #turn the coordinated system (to look better)
  coord_flip()+
  #line indicating the gap
  geom_line(aes(group= nace_gr))+
  #and the oints
  geom_point(aes(colour=gender), size = 3, alpha=0.89)+
  #Now the numbers
  #the male's number's on the right
  geom_text(data = subset(salary_thousands, gender == "male"), 
            aes(label=round(earnings,1), colour= gender),
            hjust = -0.85)+
  #the female's on the left
  geom_text(data = subset(salary_thousands, gender == "female"), 
            aes(label=round(earnings,1), colour= gender), 
            hjust = +1.85)+
  #remove the legend title (gender is obvious)
  guides(colour=guide_legend(title=NULL))+
  #fix some aesthetic components
  scale_y_continuous(expand=c(0.2,0), limits=c(0, 3.5))+
  scale_x_discrete(labels = function(x)  str_wrap(x, width = 40))+
  #the titles
  labs(title="Gender pay gap in Cyprus 2014", subtitle="By economic activity",
       x="NACE activity -Rev.2", y="Average Earnings('€000)",
       caption="Source: CYSTAT")+
  #the colour to be used
  scale_color_ipsum()+
hrbrthemes::theme_ipsum_rc()+
  #fix the above theme a little bit
theme(panel.grid.major.y = element_line(colour="grey90"),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank(),
      legend.justification = c(0, 1))