Pakistan Labour Force Data - Exploring in R

This is a small project to explore the LFS data in R using tidyverse packages

library(readxl)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.1
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 3.6.1
## -- Attaching packages ----------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble  2.1.3     v purrr   0.3.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.1
## Warning: package 'tidyr' was built under R version 3.6.1
## Warning: package 'readr' was built under R version 3.6.1
## Warning: package 'purrr' was built under R version 3.6.1
## Warning: package 'forcats' was built under R version 3.6.1
## -- Conflicts -------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gg3D)
library(gtable)
require(gridExtra)
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(foreign)
library(haven)
## Warning: package 'haven' was built under R version 3.6.1
#input_data <- read_excel("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced Econometrics/Data/lfs pak.xlsx")

input_data <- read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced Econometrics/lfs 2017-18.sav1( with labels).sav",use.value.labels = TRUE,to.data.frame = TRUE)
## re-encoding from UTF-8
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 3
## added in variable: S04C04
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 11,
## 12, 13, 14, 15 added in variable: S04C09
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 11,
## 12, 13, 14, 15 added in variable: S04C10
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 42
## added in variable: occupation
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 0
## added in variable: under_employed
#input_data <- read_excel("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced Econometrics/Data/lfs final data.xlsx")

Checking the summary of the data loaded

#summary(input_data)
dim(input_data)
## [1] 272490    189

Selecting fields of interest and renaming them in the same step for ease of use later on.’=

LFS<-input_data %>% select(Gendar = S04C05 ,Age = S04C06, Marital_Status = S04C07, Literacy = S04C08, Edu_Level = S04C09, vocational_training = S04C11, duration_of_training_weeks = S04C13, last_month_earning_kind = S07C042, last_month_earning_cash = S07C043, enterprise_type = S05C11, earning_last_year = S07C053, education_level,occupation, industry,Region,PROVINCE,prov6) 

We create 3 new variables by recoding enterprise_type description, occupation and years of education based on values from the survey description..

We follow this up by identifying factor variables.

LFS <- mutate(LFS, enterprise_type_desc=recode(enterprise_type,'1'="Federal",'2'="Provincial",'3'="Local_body",'4'="Pub_Ent", '5'="Pub_Lim",'6'="Pvt_lim",'7'="Coop_Soc",'8'="Inv_own",'9'="Partnership",'10'="Other", .default = "Unmatched", .missing = "N/A"), Edu_Yrs=recode(education_level,"No formal education" = 0,"Nursery but below KG" = 0.5,"KG but below Primary" = 1, "Primary but below Middle" = 6, "Middle But below Matric" = 9, "Mattic but below intermediate" = 11, "Intermediate below Degree" = 12, "Degree in engineering" = 16, "Degree in Medicine" = 16,"Degree in computer" = 16), Main_occ=recode(occupation, 'Chief Exective, senior officials and legislators'="Managerial",'Admistrative and Commercial Managers'="Managerial",'Production and specialized service Managers'="Managerial",'Hospitality retail and other service Managers'="Managerial",'Science and engineering profesioanls'="Professionals",'Health professionals'="Professionals",'Teaching professionals'="Professionals",'Business and administration professionals'="Professionals", 'Informaion and communication technology professional'="Professionals",'Information and communication technology professional'="Professionals",'legal, social and cultural professional'="Professionals",'Scinnce and engineering associate professional'="Technician", 'Business and adminstration assocaite professionals'="Technician", 'Legal, social cultural and related associate professional'="Technician", 'Health associate professional'="Technician",'Informtion and communication Technician'="Technician",'Informtion and communication technicians'="Clerks" ,'Clerical and keyboard clerks'="Clerks", 'Business and adminstration assocaite professionals'="Clerks", 'Legal, social cultural and related associate professional'="Clerks", 'Informtion and communication Technician'="Clerks",'Clerical and keyboard clerks'="Clerks",'42'="Clerks", 'Numerical and material recording clerks'="Clerks", 'other clerical support workers'="Clerks",'personal service workers'="Services", 'salers workers'="Services", 'personal care workers'="Services",'protective service workers'="Services",'craft and related trade workers'="Craft",'Metal, machinary and related trade workers'="Craft", 'Handicraft and printing workers'="Craft", 'electrical and electronic trade workers'="Craft",'Food processing, wood working, garments and other craft related trade workers'="Craft",'stationery, plant and machine operators'="Operators", 'Assemblers'="Operators", 'Drivers and mobile plant operators'="Operators",'Cleaners and helpers'="Elementary", 'Agricultural, forestery and fishery labourers'="Elementary", 'Labourer in Mining, construction, manufacturing and transport'="Elementary", 'Food preparation Assistant'="Elementary",'Street and related sales and service workers'="Elementary",'Refuse workers and Elementary workers'="Elementary",'subsistence farmers, fishers, hunters a'="Elementary", 'msrlte oriented skilled fishery, forestery and hunting workers'="Elementary",'Market orineted skilled agriculture workers'="Skilled",'Refuse workers and elementary workers'="Elementary",'msrlte oriented skilled fishery, forestery and hunting workers'="Skilled",'subsistence farmers, fishers, hunters and gatherers'="Elementary",'armed forces'="Services",'Hospitality  retail and other service Managers'="Managerial", .default = 'Unmathed')) 

LFS <- LFS %>% mutate(Main_occu=recode(Main_occ,'Managerial'=1, 'Professionals'=2, 'Technician'=3, 'Informtion and communication technicians'=4, 'Clerks'=4, 'Services'=5, 'Skilled'=9, 'Elementary'=6, 'Craft'=7, 'Operators'=8, 'Elementary'=5)) %>% mutate_at(vars(Gendar,Marital_Status,Literacy, Edu_Level, vocational_training,duration_of_training_weeks, education_level,occupation, industry,Region,PROVINCE,enterprise_type_desc,Edu_Yrs,Main_occu),as.factor)

Checking the distribution of Marital Status

LFS %>% group_by(Marital_Status) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100) %>% 
  ggplot(aes(x=reorder(Marital_Status, -percent), y=percent)) + geom_bar(stat = "identity", fill = "brown" )  +  
  geom_text(aes(label=paste(round(percent,2),"%", Sep="")), hjust=-.1 ) +   xlab("Marital Status") + ylim(0, 40) +theme(axis.text.x=element_blank(), axis.title.x=element_blank()) + coord_flip()
## Warning: Factor `Marital_Status` contains implicit NA, consider using
## `forcats::fct_explicit_na`

Checking the distribution of Education Level

ch1 <- LFS %>% group_by(education_level) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100) %>% na.omit() %>%
  ggplot(aes(x=reorder(education_level, -percent), y=percent)) + geom_bar(stat = "identity", fill = "brown" )  +  
  geom_text(aes(label=paste(round(percent,2),"%", Sep="")), hjust=-.1 ) +   xlab("Education Level") + ylim(0, 40) +theme(axis.text.x=element_blank(), axis.title.x=element_blank()) + coord_flip()
## Warning: Factor `education_level` contains implicit NA, consider using
## `forcats::fct_explicit_na`
ch2 <- LFS %>% group_by(Edu_Yrs) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100) %>% na.omit() %>%
  ggplot(aes(x=Edu_Yrs, y=percent)) + geom_bar(stat = "identity", fill = "brown" )  +  
  geom_text(aes(label=paste(round(percent,2),"%", Sep="")), hjust=-.1 ) +   xlab("Education # of Years") + ylim(0, 40) +theme(axis.text.x=element_blank(), axis.title.x=element_blank()) + coord_flip()
## Warning: Factor `Edu_Yrs` contains implicit NA, consider using
## `forcats::fct_explicit_na`
grid.arrange(ch1,ch2, ncol=2)

Lets check how education level is distributed across genders

LFS %>% group_by(Edu_Yrs, Gendar) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100) %>% na.omit() %>%
  ggplot(aes(x=Edu_Yrs, y=count, fill=Gendar)) + geom_bar(stat = "identity" ,position = 'dodge')  +  
  scale_fill_manual("Gendar", values = c("Male" = "blue", "Female" = "red")) +
  geom_text(aes(label=count),position = position_dodge(width = 1), hjust=-.2)+ ylim(0, 60000) +
  xlab("Education # of Years") +theme(axis.text.x=element_blank(), axis.title.x=element_blank()) + coord_flip()
## Warning: Factor `Edu_Yrs` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `Edu_Yrs` contains implicit NA, consider using
## `forcats::fct_explicit_na`

Checking the distribution of occupation level

LFS %>% group_by(occupation) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100)
## Warning: Factor `occupation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 40 x 3
##    occupation                                            count percent
##    <fct>                                                 <int>   <dbl>
##  1 Chief Exective, senior officials and legislators        142  0.0521
##  2 Admistrative and Commercial Managers                    218  0.0800
##  3 Production and specialized service Managers             785  0.288 
##  4 Hospitality  retail and other service Managers          521  0.191 
##  5 Science and engineering profesioanls                    135  0.0495
##  6 Health professionals                                    264  0.0969
##  7 Teaching professionals                                 2773  1.02  
##  8 Business and administration professionals               275  0.101 
##  9 Information and communication technology professional   750  0.275 
## 10 legal, social and cultural professional                 588  0.216 
## # ... with 30 more rows

Checking the distribution of yearly earnings (note that this field has very few observations)

summary(LFS$earning_last_year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0    3000    6000   13708   15000  740000  271165
LFS %>% select(earning_last_year) %>% na.omit() %>% ggplot(aes(x=earning_last_year)) + geom_histogram(bins = 100, color="blue") +  xlab("Earnings last year") + ylab("No of Individuals") 

We check last year yearnings against education level and gendar - note the concentration of reds to the left of each grouping.

LFS %>% select(earning_last_year, Edu_Yrs, Gendar) %>% na.omit() %>% ggplot() + geom_point(aes(x=earning_last_year, y=Edu_Yrs, color=Gendar), alpha = .2) +    scale_color_manual(values = c("Male" = "blue", "Female" = "red")) +
  xlab("Last Year's earning") + ylab("Education Level") 

Regressing Earnings on Years of Education, Province, Ae, Gendar and Occupation

m<-lm(last_month_earning_cash~Edu_Yrs + PROVINCE + Age + Gendar + Main_occu ,data = LFS)

summary(m)
## 
## Call:
## lm(formula = last_month_earning_cash ~ Edu_Yrs + PROVINCE + Age + 
##     Gendar + Main_occu, data = LFS)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53792  -5507   -607   4308 325125 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          20968.445   1140.803  18.380  < 2e-16 ***
## Edu_Yrs0.5            4054.867   1805.861   2.245 0.024758 *  
## Edu_Yrs1              1232.082    577.733   2.133 0.032974 *  
## Edu_Yrs6              1360.528    328.674   4.139 3.50e-05 ***
## Edu_Yrs9              1915.900    333.944   5.737 9.83e-09 ***
## Edu_Yrs11             4449.106    305.186  14.578  < 2e-16 ***
## Edu_Yrs12             7256.473    365.738  19.841  < 2e-16 ***
## Edu_Yrs16            37124.799    690.896  53.734  < 2e-16 ***
## PROVINCEPUNJAB       -2678.899    311.260  -8.607  < 2e-16 ***
## PROVINCESINDH        -2002.747    349.705  -5.727 1.04e-08 ***
## PROVINCEBALOCHISTAN   1393.220    385.112   3.618 0.000298 ***
## Age                    295.933      8.164  36.247  < 2e-16 ***
## GendarFemale         -6205.410    378.013 -16.416  < 2e-16 ***
## Main_occu2          -11235.608   1052.888 -10.671  < 2e-16 ***
## Main_occu3          -10995.818   1094.371 -10.048  < 2e-16 ***
## Main_occu4          -10986.503   1109.222  -9.905  < 2e-16 ***
## Main_occu5          -14953.946   1042.362 -14.346  < 2e-16 ***
## Main_occu6          -15747.527   1049.531 -15.004  < 2e-16 ***
## Main_occu7          -14335.883   1068.668 -13.415  < 2e-16 ***
## Main_occu8          -14623.684   1059.876 -13.798  < 2e-16 ***
## Main_occu9          -13866.231   1493.423  -9.285  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11770 on 13979 degrees of freedom
##   (258490 observations deleted due to missingness)
## Multiple R-squared:  0.3493, Adjusted R-squared:  0.3484 
## F-statistic: 375.2 on 20 and 13979 DF,  p-value: < 2.2e-16