Data Wrangling

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

rm(list = ls())
setwd("C:/Users/hp/OneDrive - Higher Education Commission")
library(foreign)
library(haven)
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
LFS<-read.spss("LFS_labels.sav",use.value.labels = TRUE,to.data.frame = TRUE)
## re-encoding from CP1252
## Warning in read.spss("LFS_labels.sav", use.value.labels = TRUE,
## to.data.frame = TRUE): Undeclared level(s) 3 added in variable: S04C04
## Warning in read.spss("LFS_labels.sav", use.value.labels = TRUE,
## to.data.frame = TRUE): Undeclared level(s) 11, 12, 13, 14, 15 added in
## variable: S04C09
## Warning in read.spss("LFS_labels.sav", use.value.labels = TRUE,
## to.data.frame = TRUE): Undeclared level(s) 11, 12, 13, 14, 15 added in
## variable: S04C10
## Warning in read.spss("LFS_labels.sav", use.value.labels = TRUE,
## to.data.frame = TRUE): Undeclared level(s) 42 added in variable: occupation
## Warning in read.spss("LFS_labels.sav", use.value.labels = TRUE,
## to.data.frame = TRUE): Undeclared level(s) 0 added in variable:
## under_employed
#View(LFS)
## Select variables which will be used in this analysis
## Select variables which will be used in this analysis
LFS1<-LFS %>% select(S04C05,S04C06,S04C07,S04C08, S04C09, S04C11,S04C13,S07C042, S07C043, S05C11,S07C053, education_level,occupation, industry,Region,PROVINCE,prov6)
## S05C11 variable measures professional categories, we ll drop all values with NA
LFS1<-LFS1 %>% filter(!is.na(S05C11))
#LFS1
dim(LFS1)
## [1] 46440    17
## No we recode Profession categories
LFS1<-LFS1 %>% 
  mutate(Emp_cat=recode(S05C11,'1'="Federal",'2'="Provincial",'3'="Local_body",'4'="Pub_Ent", '5'="Pub_Lim",'6'="Pvt_lim",'7'="Coop_Soc",'8'="Inv_own",'9'="Partnership",'10'="Other"))
dim(LFS1)
## [1] 46440    18
## We declare some variables as factors
LFS1<-mutate_at(LFS1, vars(S04C05,S04C07,S04C08, S04C09, S04C11,S04C13, education_level,occupation, industry,Region,PROVINCE),as.factor)
## To calculate experience we subtract 6 from age

LFS1<-mutate_at(LFS1, vars(S04C06),as.numeric)
LFS1<- mutate(LFS1,Exp=S04C06-6)


LFS1 %>% 
  group_by(education_level) %>% 
  summarise(avg=mean(S07C042)) %>% 
  arrange(avg)
## # A tibble: 15 x 2
##    education_level                 avg
##    <fct>                         <dbl>
##  1 No formal education              NA
##  2 Nursery but below KG             NA
##  3 KG but below Primary             NA
##  4 Primary but below Middle         NA
##  5 Middle But below Matric          NA
##  6 Mattic but below intermediate    NA
##  7 Intermediate below Degree        NA
##  8 Degree in engineering            NA
##  9 Degree in Medicine               NA
## 10 Degree in computer               NA
## 11 Degree in agriculture            NA
## 12 Degree in other subjects         NA
## 13 M.A/ M.SC                        NA
## 14 MPhil                            NA
## 15 Ph.D                             NA
LFS1 %>% 
  group_by(S04C07) %>% 
  summarise(count=n())
## # A tibble: 4 x 2
##   S04C07        count
##   <fct>         <int>
## 1 Never married 12937
## 2 Married       32532
## 3 Widow/Widower   771
## 4 Divorced        200
LFS %>%filter(S04C06>10) %>% 
  group_by(S04C07) %>% 
   summarise(n = n()) %>% 
    mutate(freq = n*100 /sum(n),freq=round(freq,2)) #Round to two decimal points
## # A tibble: 4 x 3
##   S04C07             n  freq
##   <fct>          <int> <dbl>
## 1 Never married  72241 39.6 
## 2 Married       102575 56.3 
## 3 Widow/Widower   6847  3.76
## 4 Divorced         547  0.3
##Sex ratio
LFS  %>% 
  group_by(S04C05) %>% 
   summarise(n = n()) %>% 
    mutate(freq = n*100 /sum(n),freq=round(freq,2)) #Round to two decimal points
## # A tibble: 2 x 3
##   S04C05      n  freq
##   <fct>   <int> <dbl>
## 1 Male   139125  51.1
## 2 Female 133365  48.9
LFS1<- mutate(LFS1,Exp=S04C06-6)

#View(LFS1$Exp)
#logit<-glm(S04C05~Emp_cat+Region,data = LFS1,family =  "binomial")

#summary(logit)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

#Variable S04C09 measure education but some numbers are not labeled properly

## We label missing values as below
LFS1<-LFS1 %>% 
  mutate(Edu_Yrs=recode(S04C09,'12'="Intermediate but below degree",'11'="Matric but below Intermediate",'13'="Intermediate but below degree",'14'="Bachelor", '15'="Degree but below master"))
# We verify whether now all values are labelled as desired
LFS1 %>% 
  group_by(occupation) %>% 
  summarise(count=n())
## Warning: Factor `occupation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 39 x 2
##    occupation                                            count
##    <fct>                                                 <int>
##  1 Chief Exective, senior officials and legislators        141
##  2 Admistrative and Commercial Managers                    217
##  3 Production and specialized service Managers             746
##  4 Hospitality  retail and other service Managers          521
##  5 Science and engineering profesioanls                    135
##  6 Health professionals                                    264
##  7 Teaching professionals                                 2773
##  8 Business and administration professionals               274
##  9 Information and communication technology professional   750
## 10 legal, social and cultural professional                 588
## # ... with 29 more rows
Table1<-LFS1 %>% 
  group_by(occupation) %>% 
  summarise(count=n())
## Warning: Factor `occupation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
LFS1<- LFS1 %>% mutate(Main_occ=recode(occupation, 'Chief Exective, senior officials and legislators'="Managerial",'Admistrative and Commercial Managers'="Managerial",'Production and specialized service Managers'="Managerial",'Hospitality retail and other service Managers'="Managerial",'Science and engineering profesioanls'="Professionals",'Health professionals'="Professionals",'Teaching professionals'="Professionals",'Business and administration professionals'="Professionals", 'Informaion and communication technology professional'="Professionals",'Information and communication technology professional'="Professionals",'legal, social and cultural professional'="Professionals",'Scinnce and engineering associate professional'="Technician", 'Business and adminstration assocaite professionals'="Technician", 'Legal, social cultural and related associate professional'="Technician", 'Health associate professional'="Technician",'Informtion and communication Technician'="Technician",'Informtion and communication technicians'="Clerks" ,'Clerical and keyboard clerks'="Clerks", 'Business and adminstration assocaite professionals'="Clerks", 'Legal, social cultural and related associate professional'="Clerks", 'Informtion and communication Technician'="Clerks",'Clerical and keyboard clerks'="Clerks",'42'="Clerks", 'Numerical and material recording clerks'="Clerks", 'other clerical support workers'="Clerks",'personal service workers'="Services", 'salers workers'="Services", 'personal care workers'="Services",'protective service workers'="Services",'craft and related trade workers'="Craft",'Metal, machinary and related trade workers'="Craft", 'Handicraft and printing workers'="Craft", 'electrical and electronic trade workers'="Craft",'Food processing, wood working, garments and other craft related trade workers'="Craft",'stationery, plant and machine operators'="Operators", 'Assemblers'="Operators", 'Drivers and mobile plant operators'="Operators",'Cleaners and helpers'="Elementary", 'Agricultural, forestery and fishery labourers'="Elementary", 'Labourer in Mining, construction, manufacturing and transport'="Elementary", 'Food preparation Assistant'="Elementary",'Street and related sales and service workers'="Elementary",'Refuse workers and Elementary workers'="Elementary",'subsistence farmers, fishers, hunters a'="Elementary", 'msrlte oriented skilled fishery, forestery and hunting workers'="Elementary",'Market orineted skilled agriculture workers'="Skilled",'Refuse workers and elementary workers'="Elementary",'msrlte oriented skilled fishery, forestery and hunting workers
'="Skilled",'subsistence farmers, fishers, hunters and gatherers'="Elementary",'armed forces'="Services",'Hospitality  retail and other service Managers'="Managerial"))
LFS1<-as_tibble(LFS1)

LFS11<-LFS1 %>% mutate(Main_occu=recode(Main_occ,'Managerial'=1, 'Professionals'=2, 
'Technician'=3, 'Informtion and communication technicians'=4, 'Clerks'=4, 
'Services'=5, 'Skilled'=9, 'Elementary'=6, 'Craft'=7, 'Operators'=8, 'Elementary'=5
))

LFS11<-mutate_at(LFS11, vars(Main_occu),as.factor)
LFS11<-LFS11 %>% filter(!is.na(Main_occu))
tbl1<-LFS11 %>% 
  group_by(Main_occu) %>% 
  summarise(count=n())
tbl1
## # A tibble: 9 x 2
##   Main_occu count
##   <fct>     <int>
## 1 1          1625
## 2 2          4784
## 3 3          1946
## 4 4          1220
## 5 5         12430
## 6 6          8845
## 7 7         10465
## 8 8          5004
## 9 9           120
save(LFS11,file = "LabourForceSurvery.RDATA")
dim(LFS1)
## [1] 46440    21
df1 <- 
  model.matrix(~ -1 + LFS1$Main_occ) %>% 
  tibble::as_tibble() %>% 
  purrr::set_names(levels(LFS1$Main_occ))
library(nnet)
#LFS1$Main_occ <- relevel(LFS1$Main_occ, ref = "academic")
#m1<-multinom(LFS1$Main_occu~LFS1$education_level+LFS1$Exp,data = LFS1)
#summary(m1)
#exp(coef(m1))
wage<-LFS11 %>% filter(!is.na(S07C043)) 
#wage
p1<-wage %>%group_by(education_level,S04C05) %>% 
  summarize(avg=round(mean(S07C043,0))) %>% 
  arrange(avg)
p1
## # A tibble: 30 x 3
## # Groups:   education_level [15]
##    education_level               S04C05   avg
##    <fct>                         <fct>  <dbl>
##  1 KG but below Primary          Female  6221
##  2 No formal education           Female  7889
##  3 Primary but below Middle      Female  8952
##  4 Nursery but below KG          Female  9500
##  5 Middle But below Matric       Female 13843
##  6 KG but below Primary          Male   15271
##  7 Mattic but below intermediate Female 15564
##  8 Primary but below Middle      Male   15955
##  9 Intermediate below Degree     Female 16356
## 10 Middle But below Matric       Male   16558
## # ... with 20 more rows
p2<-ggplot(p1,aes(x=education_level,y=avg, fill=S04C05))+geom_bar(stat = "identity",width=0.7,position = 'dodge') +scale_y_continuous(labels = function(x) format(x, scientific = FALSE))+coord_flip()
p2+ggtitle("Average monthly earnings by education_level and genderwise")

## To display numbers instead of 1+e00 use following
##p2+scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

#Label Bar charts  theme(legend.title = element_blank()) .
p3<-p2+geom_text(aes(label=avg), vjust =1.6, size = 3.0) + theme(legend.title = element_blank())
p3

## Histogram and boxplot
#ggplot(wage)+aes(log(S07C043))+geom_histogram()
m<-lm(S07C043~Edu_Yrs,data = wage)
summary(m)
## 
## Call:
## lm(formula = S07C043 ~ Edu_Yrs, data = wage)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -99842   -8369   -2141    4770 1268778 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)                           15230.0      375.6  40.547  < 2e-16
## Edu_YrsDegree in Computer             19715.3     2603.6   7.572 3.85e-14
## Edu_YrsMatric but below Intermediate   5139.0      524.4   9.800  < 2e-16
## Edu_YrsIntermediate but below degree  15992.4      462.6  34.569  < 2e-16
## Edu_YrsBachelor                       33343.8     2065.3  16.145  < 2e-16
## Edu_YrsDegree but below master        96112.1     4743.3  20.263  < 2e-16
## Edu_YrsNursery but below KG            4351.1     3409.1   1.276   0.2019
## Edu_YrsKG but below Primary            -523.6     1042.7  -0.502   0.6156
## Edu_YrsPrimary but below Middel         411.1      595.9   0.690   0.4903
## Edu_YrsMiddle but below Matric         1224.3      595.7   2.055   0.0399
## Edu_YrsDegree in Engineering          35488.3     1557.2  22.789  < 2e-16
## Edu_YrsDegree in medicine             58594.4     1839.5  31.853  < 2e-16
##                                         
## (Intercept)                          ***
## Edu_YrsDegree in Computer            ***
## Edu_YrsMatric but below Intermediate ***
## Edu_YrsIntermediate but below degree ***
## Edu_YrsBachelor                      ***
## Edu_YrsDegree but below master       ***
## Edu_YrsNursery but below KG             
## Edu_YrsKG but below Primary             
## Edu_YrsPrimary but below Middel         
## Edu_YrsMiddle but below Matric       *  
## Edu_YrsDegree in Engineering         ***
## Edu_YrsDegree in medicine            ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20610 on 16956 degrees of freedom
## Multiple R-squared:  0.1749, Adjusted R-squared:  0.1744 
## F-statistic: 326.9 on 11 and 16956 DF,  p-value: < 2.2e-16
#ggplot(wage)+aes(log(S07C043))+geom_histogram()+facet_wrap(S04C05)
#glimpse(wage)
p<-ggplot(wage,aes(x=S04C05,y=log(S07C043)))+geom_boxplot()
p+ stat_summary(fun.y=mean, geom="point", shape=23, size=4) ## diplay statistics
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).
## Warning: Removed 17 rows containing non-finite values (stat_summary).

p+coord_flip()
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).

# Box plot with mean points
p + stat_summary(fun.y=mean, geom="point", shape=23, size=4)
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).

## Warning: Removed 17 rows containing non-finite values (stat_summary).