This is a small project to explore the LFS data in R using tidyverse packages
library(readxl)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.1
library("tidyverse")
## Warning: package 'tidyverse' was built under R version 3.6.1
## -- Attaching packages ----------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 2.1.3 v purrr 0.3.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.1
## Warning: package 'tidyr' was built under R version 3.6.1
## Warning: package 'readr' was built under R version 3.6.1
## Warning: package 'purrr' was built under R version 3.6.1
## Warning: package 'forcats' was built under R version 3.6.1
## -- Conflicts -------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gg3D)
library(gtable)
require(gridExtra)
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(foreign)
library(haven)
## Warning: package 'haven' was built under R version 3.6.1
#input_data <- read_excel("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced Econometrics/Data/lfs pak.xlsx")
input_data <- read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced Econometrics/lfs 2017-18.sav1( with labels).sav",use.value.labels = TRUE,to.data.frame = TRUE)
## re-encoding from UTF-8
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 3
## added in variable: S04C04
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 11,
## 12, 13, 14, 15 added in variable: S04C09
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 11,
## 12, 13, 14, 15 added in variable: S04C10
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 42
## added in variable: occupation
## Warning in read.spss("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced
## Econometrics/lfs 2017-18.sav1( with labels).sav", : Undeclared level(s) 0
## added in variable: under_employed
#input_data <- read_excel("C:/MAIN/Personal/P.I.D.E/ETS-771 Topics in Advanced Econometrics/Data/lfs final data.xlsx")
Checking the summary of the data loaded
#summary(input_data)
dim(input_data)
## [1] 272490 189
Selecting fields of interest and renaming them in the same step for ease of use later on.’=
LFS<-input_data %>% select(Gendar = S04C05 ,Age = S04C06, Marital_Status = S04C07, Literacy = S04C08, Edu_Level = S04C09, vocational_training = S04C11, duration_of_training_weeks = S04C13, last_month_earning_kind = S07C042, last_month_earning_cash = S07C043, enterprise_type = S05C11, earning_last_year = S07C053, education_level,occupation, industry,Region,PROVINCE,prov6)
We create 3 new variables by recoding enterprise_type description, occupation and years of education based on values from the survey description..
We follow this up by identifying factor variables.
LFS <- mutate(LFS, enterprise_type_desc=recode(enterprise_type,'1'="Federal",'2'="Provincial",'3'="Local_body",'4'="Pub_Ent", '5'="Pub_Lim",'6'="Pvt_lim",'7'="Coop_Soc",'8'="Inv_own",'9'="Partnership",'10'="Other", .default = "Unmatched", .missing = "N/A"), Edu_Yrs=recode(education_level,"No formal education" = 0,"Nursery but below KG" = 0.5,"KG but below Primary" = 1, "Primary but below Middle" = 6, "Middle But below Matric" = 9, "Mattic but below intermediate" = 11, "Intermediate below Degree" = 12, "Degree in engineering" = 16, "Degree in Medicine" = 16,"Degree in computer" = 16), Main_occ=recode(occupation, 'Chief Exective, senior officials and legislators'="Managerial",'Admistrative and Commercial Managers'="Managerial",'Production and specialized service Managers'="Managerial",'Hospitality retail and other service Managers'="Managerial",'Science and engineering profesioanls'="Professionals",'Health professionals'="Professionals",'Teaching professionals'="Professionals",'Business and administration professionals'="Professionals", 'Informaion and communication technology professional'="Professionals",'Information and communication technology professional'="Professionals",'legal, social and cultural professional'="Professionals",'Scinnce and engineering associate professional'="Technician", 'Business and adminstration assocaite professionals'="Technician", 'Legal, social cultural and related associate professional'="Technician", 'Health associate professional'="Technician",'Informtion and communication Technician'="Technician",'Informtion and communication technicians'="Clerks" ,'Clerical and keyboard clerks'="Clerks", 'Business and adminstration assocaite professionals'="Clerks", 'Legal, social cultural and related associate professional'="Clerks", 'Informtion and communication Technician'="Clerks",'Clerical and keyboard clerks'="Clerks",'42'="Clerks", 'Numerical and material recording clerks'="Clerks", 'other clerical support workers'="Clerks",'personal service workers'="Services", 'salers workers'="Services", 'personal care workers'="Services",'protective service workers'="Services",'craft and related trade workers'="Craft",'Metal, machinary and related trade workers'="Craft", 'Handicraft and printing workers'="Craft", 'electrical and electronic trade workers'="Craft",'Food processing, wood working, garments and other craft related trade workers'="Craft",'stationery, plant and machine operators'="Operators", 'Assemblers'="Operators", 'Drivers and mobile plant operators'="Operators",'Cleaners and helpers'="Elementary", 'Agricultural, forestery and fishery labourers'="Elementary", 'Labourer in Mining, construction, manufacturing and transport'="Elementary", 'Food preparation Assistant'="Elementary",'Street and related sales and service workers'="Elementary",'Refuse workers and Elementary workers'="Elementary",'subsistence farmers, fishers, hunters a'="Elementary", 'msrlte oriented skilled fishery, forestery and hunting workers'="Elementary",'Market orineted skilled agriculture workers'="Skilled",'Refuse workers and elementary workers'="Elementary",'msrlte oriented skilled fishery, forestery and hunting workers'="Skilled",'subsistence farmers, fishers, hunters and gatherers'="Elementary",'armed forces'="Services",'Hospitality retail and other service Managers'="Managerial", .default = 'Unmathed'))
LFS <- LFS %>% mutate(Main_occu=recode(Main_occ,'Managerial'=1, 'Professionals'=2, 'Technician'=3, 'Informtion and communication technicians'=4, 'Clerks'=4, 'Services'=5, 'Skilled'=9, 'Elementary'=6, 'Craft'=7, 'Operators'=8, 'Elementary'=5)) %>% mutate_at(vars(Gendar,Marital_Status,Literacy, Edu_Level, vocational_training,duration_of_training_weeks, education_level,occupation, industry,Region,PROVINCE,enterprise_type_desc,Edu_Yrs,Main_occu),as.factor)
Checking the distribution of Marital Status
LFS %>% group_by(Marital_Status) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100) %>%
ggplot(aes(x=reorder(Marital_Status, -percent), y=percent)) + geom_bar(stat = "identity", fill = "brown" ) +
geom_text(aes(label=paste(round(percent,2),"%", Sep="")), hjust=-.1 ) + xlab("Marital Status") + ylim(0, 40) +theme(axis.text.x=element_blank(), axis.title.x=element_blank()) + coord_flip()
## Warning: Factor `Marital_Status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
Checking the distribution of Education Level
ch1 <- LFS %>% group_by(education_level) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100) %>% na.omit() %>%
ggplot(aes(x=reorder(education_level, -percent), y=percent)) + geom_bar(stat = "identity", fill = "brown" ) +
geom_text(aes(label=paste(round(percent,2),"%", Sep="")), hjust=-.1 ) + xlab("Education Level") + ylim(0, 40) +theme(axis.text.x=element_blank(), axis.title.x=element_blank()) + coord_flip()
## Warning: Factor `education_level` contains implicit NA, consider using
## `forcats::fct_explicit_na`
ch2 <- LFS %>% group_by(Edu_Yrs) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100) %>% na.omit() %>%
ggplot(aes(x=Edu_Yrs, y=percent)) + geom_bar(stat = "identity", fill = "brown" ) +
geom_text(aes(label=paste(round(percent,2),"%", Sep="")), hjust=-.1 ) + xlab("Education # of Years") + ylim(0, 40) +theme(axis.text.x=element_blank(), axis.title.x=element_blank()) + coord_flip()
## Warning: Factor `Edu_Yrs` contains implicit NA, consider using
## `forcats::fct_explicit_na`
grid.arrange(ch1,ch2, ncol=2)
Lets check how education level is distributed across genders
LFS %>% group_by(Edu_Yrs, Gendar) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100) %>% na.omit() %>%
ggplot(aes(x=Edu_Yrs, y=count, fill=Gendar)) + geom_bar(stat = "identity" ,position = 'dodge') +
scale_fill_manual("Gendar", values = c("Male" = "blue", "Female" = "red")) +
geom_text(aes(label=count),position = position_dodge(width = 1), hjust=-.2)+ ylim(0, 60000) +
xlab("Education # of Years") +theme(axis.text.x=element_blank(), axis.title.x=element_blank()) + coord_flip()
## Warning: Factor `Edu_Yrs` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `Edu_Yrs` contains implicit NA, consider using
## `forcats::fct_explicit_na`
Checking the distribution of occupation level
LFS %>% group_by(occupation) %>% summarise(count=n()) %>% mutate(percent = (count/sum(count))*100)
## Warning: Factor `occupation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 40 x 3
## occupation count percent
## <fct> <int> <dbl>
## 1 Chief Exective, senior officials and legislators 142 0.0521
## 2 Admistrative and Commercial Managers 218 0.0800
## 3 Production and specialized service Managers 785 0.288
## 4 Hospitality retail and other service Managers 521 0.191
## 5 Science and engineering profesioanls 135 0.0495
## 6 Health professionals 264 0.0969
## 7 Teaching professionals 2773 1.02
## 8 Business and administration professionals 275 0.101
## 9 Information and communication technology professional 750 0.275
## 10 legal, social and cultural professional 588 0.216
## # ... with 30 more rows
Checking the distribution of yearly earnings (note that this field has very few observations)
summary(LFS$earning_last_year)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 3000 6000 13708 15000 740000 271165
LFS %>% select(earning_last_year) %>% na.omit() %>% ggplot(aes(x=earning_last_year)) + geom_histogram(bins = 100, color="blue") + xlab("Earnings last year") + ylab("No of Individuals")
We check last year yearnings against education level and gendar - note the concentration of reds to the left of each grouping.
LFS %>% select(earning_last_year, Edu_Yrs, Gendar) %>% na.omit() %>% ggplot() + geom_point(aes(x=earning_last_year, y=Edu_Yrs, color=Gendar), alpha = .2) + scale_color_manual(values = c("Male" = "blue", "Female" = "red")) +
xlab("Last Year's earning") + ylab("Education Level")
Regressing Earnings on Years of Education, Province, Ae, Gendar and Occupation
m<-lm(last_month_earning_cash~Edu_Yrs + PROVINCE + Age + Gendar + Main_occu ,data = LFS)
summary(m)
##
## Call:
## lm(formula = last_month_earning_cash ~ Edu_Yrs + PROVINCE + Age +
## Gendar + Main_occu, data = LFS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53792 -5507 -607 4308 325125
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20968.445 1140.803 18.380 < 2e-16 ***
## Edu_Yrs0.5 4054.867 1805.861 2.245 0.024758 *
## Edu_Yrs1 1232.082 577.733 2.133 0.032974 *
## Edu_Yrs6 1360.528 328.674 4.139 3.50e-05 ***
## Edu_Yrs9 1915.900 333.944 5.737 9.83e-09 ***
## Edu_Yrs11 4449.106 305.186 14.578 < 2e-16 ***
## Edu_Yrs12 7256.473 365.738 19.841 < 2e-16 ***
## Edu_Yrs16 37124.799 690.896 53.734 < 2e-16 ***
## PROVINCEPUNJAB -2678.899 311.260 -8.607 < 2e-16 ***
## PROVINCESINDH -2002.747 349.705 -5.727 1.04e-08 ***
## PROVINCEBALOCHISTAN 1393.220 385.112 3.618 0.000298 ***
## Age 295.933 8.164 36.247 < 2e-16 ***
## GendarFemale -6205.410 378.013 -16.416 < 2e-16 ***
## Main_occu2 -11235.608 1052.888 -10.671 < 2e-16 ***
## Main_occu3 -10995.818 1094.371 -10.048 < 2e-16 ***
## Main_occu4 -10986.503 1109.222 -9.905 < 2e-16 ***
## Main_occu5 -14953.946 1042.362 -14.346 < 2e-16 ***
## Main_occu6 -15747.527 1049.531 -15.004 < 2e-16 ***
## Main_occu7 -14335.883 1068.668 -13.415 < 2e-16 ***
## Main_occu8 -14623.684 1059.876 -13.798 < 2e-16 ***
## Main_occu9 -13866.231 1493.423 -9.285 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11770 on 13979 degrees of freedom
## (258490 observations deleted due to missingness)
## Multiple R-squared: 0.3493, Adjusted R-squared: 0.3484
## F-statistic: 375.2 on 20 and 13979 DF, p-value: < 2.2e-16