This is an R Markdown Notebook. In this notebook, I will show how to use R programming to clean and reshape the data

Step 1: import the data

library(foreign) # package for import the stata file 
library(DT) # packag for table interface
tk1 <- read.dta(file = "/Users/Michael/Library/Mobile Documents/com~apple~CloudDocs/CMSE/R/Data Science/b3a_tk1.dta") # choose the file from any file in the computer 
library(tidyverse) # package for cleaning, reshaping and visualizing the data set. 
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
tk1_tf <- tbl_df(tk1) # convert the data set as table format and also data frame format. 
tk1_tf # view the data 
## # A tibble: 29,055 x 43
##     hhid07_9 pid07 tk01a tk01b tk01c tk01d  tk01  tk02  tk03  tk04  tk05
##  *     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 001220000     1     1     3     3     3     1    NA    NA    NA    NA
##  2 001220000     2     1     3     1     3     1    NA    NA    NA    NA
##  3 001220000     6     1     3     3     3     1    NA    NA    NA    NA
##  4 001220000     7     1     3     1     3     1    NA    NA    NA    NA
##  5 001220000     9     1     1     1     3     3     1    NA    NA    NA
##  6 001220003     1     1     1     3     3     1    NA    NA    NA    NA
##  7 001220008     1     3     1     3     3     3     3     3     3     3
##  8 001250000     3     3     3     3     3    95     3     1    NA    NA
##  9 001250000     8     1     3     1     3     4     1    NA    NA    NA
## 10 001290000     1     1     3     3     3     1    NA    NA    NA    NA
## # ... with 29,045 more rows, and 32 more variables: tk06a <dbl>,
## #   tk07 <dbl>, tk08 <chr>, tk15 <dbl>, tk16ax <dbl>, tk16a1x <dbl>,
## #   tk16a1 <dbl>, tk16b <dbl>, tk16cart <dbl>, tk16c1 <dbl>, tk16d <dbl>,
## #   tk16ex <dbl>, tk16e <dbl>, tk16fa <dbl>, tk16fb <dbl>, tk16fc <dbl>,
## #   tk16fd <dbl>, tk16fe <dbl>, tk16ff <dbl>, tk16fg <dbl>, tk16g <dbl>,
## #   tk16h <dbl>, tk16i <dbl>, tk16ja <dbl>, tk16jb <dbl>, tk16jc <dbl>,
## #   tk16jd <dbl>, hhid07 <chr>, pidlink <chr>, tk16a <dbl>, version <dbl>,
## #   module <chr>

Setp 2.1: glance at the data, know the structure and character of data.

Variabale name: TK08 – Why haven’t you worked again since that year?

Variable elements:

Retirement - A Prolonged sickness - B Handicap - C Marriage - D Too old - E Have a child - F Family responsibilities - N Forbidden - O Other family reason - P Fired - Q Cannot find work - R Do not want to work - S Company closed/moved/bankrupt - T Other - V

glimpse(tk1_tf) # strucutre of data set
## Observations: 29,055
## Variables: 43
## $ hhid07_9 <chr> "001220000", "001220000", "001220000", "001220000", "...
## $ pid07    <dbl> 1, 2, 6, 7, 9, 1, 1, 3, 8, 1, 2, 1, 2, 3, 4, 1, 2, 2,...
## $ tk01a    <dbl> 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1,...
## $ tk01b    <dbl> 3, 3, 3, 3, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk01c    <dbl> 3, 1, 3, 1, 1, 3, 3, 3, 1, 3, 1, 3, 1, 1, 1, 3, 1, 1,...
## $ tk01d    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk01     <dbl> 1, 1, 1, 1, 3, 1, 3, 95, 4, 1, 4, 1, 1, 4, 1, 1, 4, 7...
## $ tk02     <dbl> NA, NA, NA, NA, 1, NA, 3, 3, 1, NA, 1, NA, NA, 1, NA,...
## $ tk03     <dbl> NA, NA, NA, NA, NA, NA, 3, 1, NA, NA, NA, NA, NA, NA,...
## $ tk04     <dbl> NA, NA, NA, NA, NA, NA, 3, NA, NA, NA, NA, NA, NA, NA...
## $ tk05     <dbl> NA, NA, NA, NA, NA, NA, 3, NA, NA, NA, NA, NA, NA, NA...
## $ tk06a    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk07     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk08     <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "...
## $ tk15     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16ax   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16a1x  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16a1   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16b    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16cart <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16c1   <dbl> 2, 2, 3, 3, 3, 2, NA, 2, 2, 3, 2, 2, 2, 2, 3, 3, NA, ...
## $ tk16d    <dbl> 3, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk16ex   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16e    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fa   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fb   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fc   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fd   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fe   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16ff   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fg   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16g    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16h    <dbl> 1, 2, 2, 1, 3, 3, 3, 5, 8, 1, 4, 1, 8, 1, 1, 2, 4, 2,...
## $ tk16i    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk16ja   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jb   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jc   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jd   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ hhid07   <chr> "0012200", "0012200", "0012200", "0012200", "0012200"...
## $ pidlink  <chr> "001220001", "001220002", "001220006", "001220007", "...
## $ tk16a    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ version  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ module   <chr> "B3A_TK1", "B3A_TK1", "B3A_TK1", "B3A_TK1", "B3A_TK1"...
# we have 29055 items and 43 variables, many are numbers, some are character (string)

summary(tk1_tf$tk08)
##    Length     Class      Mode 
##     29055 character character
unique(tk1_tf$tk08) # show the unique element of tk_08
##  [1] ""     "S"    "E"    "F"    "N"    "B"    "A"    "V"    "FN"   "D"   
## [11] "O"    "EO"   "EN"   "P"    "DO"   "T"    "CE"   "EP"   "Q"    "C"   
## [21] "R"    "DE"   "NP"   "AE"   "DENO" "ER"   "FNS"  "DSV"  "ST"   "DF"  
## [31] "AR"   "FO"   "DNO"  "PS"   "ET"   "FNP"  "NPV"  "DN"   "DP"   "BP"  
## [41] "BE"   "BO"   "EV"   "ES"   "PV"   "OP"   "BC"   "NO"   "DFO"  "DFP" 
## [51] "NPS"  "AO"   "FV"   "ORS"  "EOS"  "SV"   "OR"   "OV"   "EOP"  "DFN" 
## [61] "DV"   "FOP"  "FP"
table(tk1_tf$tk08) # show the number of unique element of tk_08. lots of elements are empty string - "";
## 
##           A    AE    AO    AR     B    BC    BE    BO    BP     C    CE 
## 28045   124     4     1     1    49     2     4     1     1     6     2 
##     D    DE  DENO    DF   DFN   DFO   DFP    DN   DNO    DO    DP   DSV 
##   178     1     1     5     1     1     1     4     1     1     3     1 
##    DV     E    EN    EO   EOP   EOS    EP    ER    ES    ET    EV     F 
##     1   138     2    18     2     2     2     4     1     1     3    88 
##    FN   FNP   FNS    FO   FOP    FP    FV     N    NO    NP   NPS   NPV 
##    10     4     2     2     1     1     1    41     2     5     1     1 
##     O    OP    OR   ORS    OV     P    PS    PV     Q     R     S    ST 
##    32     2     1     1     1    58     4     1    10    16    61     1 
##    SV     T     V 
##     1    35    60
table(tk1_tf$tk08 == "")
## 
## FALSE  TRUE 
##  1010 28045

Step 2.2 Clean the data

We can clean the data by two ways:

  1. Filter the data which is not “” in tk08
  2. replace “” with NA(Not Availabe) in tk08

2.2.1 Filter the data which is not “” in tk08

tk1_clean_1 <- filter(tk1_tf, tk08 != "")
library(knitr) # package for showing the data in the neat format
glimpse(tk1_clean_1) # only 1010 obervations are left. 
## Observations: 1,010
## Variables: 43
## $ hhid07_9 <chr> "002040003", "002210000", "002280000", "002280000", "...
## $ pid07    <dbl> 2, 1, 4, 6, 2, 1, 1, 8, 4, 5, 11, 2, 3, 1, 3, 2, 10, ...
## $ tk01a    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3,...
## $ tk01b    <dbl> 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3,...
## $ tk01c    <dbl> 1, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1,...
## $ tk01d    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk01     <dbl> 95, 5, 2, 95, 4, 4, 5, 3, 4, 4, 3, 4, 4, 5, 4, 4, 4, ...
## $ tk02     <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk03     <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk04     <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk05     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ tk06a    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk07     <dbl> 1994, 1995, 2001, 2007, 2007, 1997, 1991, 2007, 1996,...
## $ tk08     <chr> "S", "E", "F", "N", "B", "B", "A", "V", "B", "FN", "V...
## $ tk15     <dbl> 5, 1, 5, 5, 1, 1, 4, 6, 6, 4, 7, 7, 5, 1, 6, 5, 2, 8,...
## $ tk16ax   <dbl> 1, 1, 1, 1, 8, 8, 8, NA, NA, 1, 1, 1, 1, 1, NA, 1, 1,...
## $ tk16a1x  <dbl> NA, NA, NA, NA, 2, 2, 1, NA, NA, NA, NA, NA, NA, NA, ...
## $ tk16a1   <dbl> NA, NA, NA, NA, 28, 21, 12, NA, NA, NA, NA, NA, NA, N...
## $ tk16b    <dbl> 1, 3, 1, 1, 3, 3, 1, NA, NA, 1, 1, 1, 1, 1, NA, 1, 3,...
## $ tk16cart <dbl> NA, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, NA,...
## $ tk16c1   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16d    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk16ex   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16e    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fa   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fb   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fc   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fd   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fe   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16ff   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fg   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16g    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16h    <dbl> 95, 8, 1, 1, 95, 7, 8, 3, 95, 8, 3, 4, 4, 8, 8, 4, 8,...
## $ tk16i    <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk16ja   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jb   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jc   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jd   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ hhid07   <chr> "0020441", "0022100", "0022800", "0022800", "0031500"...
## $ pidlink  <chr> "002044102", "002210001", "002280004", "002280006", "...
## $ tk16a    <dbl> 450000, 200000, 200000, 200000, NA, NA, NA, NA, NA, 5...
## $ version  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ module   <chr> "B3A_TK1", "B3A_TK1", "B3A_TK1", "B3A_TK1", "B3A_TK1"...
# datatable(head(tk1_clean_1)) # only show the first ten rows

# we can also select the columns we want for analyzing

# Example
names(tk1_clean_1) # show all variable names 
##  [1] "hhid07_9" "pid07"    "tk01a"    "tk01b"    "tk01c"    "tk01d"   
##  [7] "tk01"     "tk02"     "tk03"     "tk04"     "tk05"     "tk06a"   
## [13] "tk07"     "tk08"     "tk15"     "tk16ax"   "tk16a1x"  "tk16a1"  
## [19] "tk16b"    "tk16cart" "tk16c1"   "tk16d"    "tk16ex"   "tk16e"   
## [25] "tk16fa"   "tk16fb"   "tk16fc"   "tk16fd"   "tk16fe"   "tk16ff"  
## [31] "tk16fg"   "tk16g"    "tk16h"    "tk16i"    "tk16ja"   "tk16jb"  
## [37] "tk16jc"   "tk16jd"   "hhid07"   "pidlink"  "tk16a"    "version" 
## [43] "module"
column_example <- select(tk1_clean_1, "hhid07_9", contains("tk0"))
datatable(head(column_example)) # only ID and variables with "tk0" were left in the dataset

2.2.2 Replace “” or 0 with NA in tk08, as we might need all the observations to analyze the data.

tk1_tf$tk08[tk1_tf$tk08 == ""] <- NA
head(tk1_tf$tk08) # all "" has been repaced with NA
## [1] NA NA NA NA NA NA
table(tk1_tf$tk08) # R will not take NA as a value, that's why you wont't see it in the table 
## 
##    A   AE   AO   AR    B   BC   BE   BO   BP    C   CE    D   DE DENO   DF 
##  124    4    1    1   49    2    4    1    1    6    2  178    1    1    5 
##  DFN  DFO  DFP   DN  DNO   DO   DP  DSV   DV    E   EN   EO  EOP  EOS   EP 
##    1    1    1    4    1    1    3    1    1  138    2   18    2    2    2 
##   ER   ES   ET   EV    F   FN  FNP  FNS   FO  FOP   FP   FV    N   NO   NP 
##    4    1    1    3   88   10    4    2    2    1    1    1   41    2    5 
##  NPS  NPV    O   OP   OR  ORS   OV    P   PS   PV    Q    R    S   ST   SV 
##    1    1   32    2    1    1    1   58    4    1   10   16   61    1    1 
##    T    V 
##   35   60

There is a disadvante by replacing “” with NA, it might affect the analyse as it will be taken as “Not Availbe”. To solve this problem, we can replace NA with 0.

tk1_tf$tk08[is.na(tk1_tf$tk08)] <- 0
head(tk1_tf$tk08)
## [1] "0" "0" "0" "0" "0" "0"
table((tk1_tf$tk08))
## 
##     0     A    AE    AO    AR     B    BC    BE    BO    BP     C    CE 
## 28045   124     4     1     1    49     2     4     1     1     6     2 
##     D    DE  DENO    DF   DFN   DFO   DFP    DN   DNO    DO    DP   DSV 
##   178     1     1     5     1     1     1     4     1     1     3     1 
##    DV     E    EN    EO   EOP   EOS    EP    ER    ES    ET    EV     F 
##     1   138     2    18     2     2     2     4     1     1     3    88 
##    FN   FNP   FNS    FO   FOP    FP    FV     N    NO    NP   NPS   NPV 
##    10     4     2     2     1     1     1    41     2     5     1     1 
##     O    OP    OR   ORS    OV     P    PS    PV     Q     R     S    ST 
##    32     2     1     1     1    58     4     1    10    16    61     1 
##    SV     T     V 
##     1    35    60
tk1_clean_2 <- tk1_tf # make a new data set as the version 2

Step 3 Reshape the data

Now, we want to reshape the data by adding new varialbe.

I will use the clean data set 1 to reshhape the data as it includes only 1010 obervations. Then the large data set 2 will be reshpaed by the same way.

3.1 Data set 1 reshape

table(tk1_clean_1$tk08) # glimplse the data again 
## 
##    A   AE   AO   AR    B   BC   BE   BO   BP    C   CE    D   DE DENO   DF 
##  124    4    1    1   49    2    4    1    1    6    2  178    1    1    5 
##  DFN  DFO  DFP   DN  DNO   DO   DP  DSV   DV    E   EN   EO  EOP  EOS   EP 
##    1    1    1    4    1    1    3    1    1  138    2   18    2    2    2 
##   ER   ES   ET   EV    F   FN  FNP  FNS   FO  FOP   FP   FV    N   NO   NP 
##    4    1    1    3   88   10    4    2    2    1    1    1   41    2    5 
##  NPS  NPV    O   OP   OR  ORS   OV    P   PS   PV    Q    R    S   ST   SV 
##    1    1   32    2    1    1    1   58    4    1   10   16   61    1    1 
##    T    V 
##   35   60

We want to reshape the data into dummy variables like this:

# as A means 'retirement', the observations should show 1 when it includes A, like this:

dummy_sample <- data.frame(ID = 1:6, Retirement = c(1, 0, 0, 1, 1, 0), Marriage = c(0, 1, 1, 0, 1, 1) )
datatable(dummy_sample)

To simplify the mission, all we need is to separate one column into several

In this section, we will use package called ‘splitstackshape’ for separating one culumn into several columns and fill the 1 and 0 in the corresponding columns.

library(splitstackshape) # the package for splitting the data and change into the dummy variables
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
tk_dummy <- cSplit_e(tk1_clean_1, "tk08", sep = "", type = "character", fill = 0)
names(tk_dummy)
##  [1] "hhid07_9" "pid07"    "tk01a"    "tk01b"    "tk01c"    "tk01d"   
##  [7] "tk01"     "tk02"     "tk03"     "tk04"     "tk05"     "tk06a"   
## [13] "tk07"     "tk08"     "tk15"     "tk16ax"   "tk16a1x"  "tk16a1"  
## [19] "tk16b"    "tk16cart" "tk16c1"   "tk16d"    "tk16ex"   "tk16e"   
## [25] "tk16fa"   "tk16fb"   "tk16fc"   "tk16fd"   "tk16fe"   "tk16ff"  
## [31] "tk16fg"   "tk16g"    "tk16h"    "tk16i"    "tk16ja"   "tk16jb"  
## [37] "tk16jc"   "tk16jd"   "hhid07"   "pidlink"  "tk16a"    "version" 
## [43] "module"   "tk08_A"   "tk08_B"   "tk08_C"   "tk08_D"   "tk08_E"  
## [49] "tk08_F"   "tk08_N"   "tk08_O"   "tk08_P"   "tk08_Q"   "tk08_R"  
## [55] "tk08_S"   "tk08_T"   "tk08_V"
datatable(tk_dummy[c(1:2, 14, 44:57)]) # select column 1(id), 2, 14(tk08), 44-57(new columns with seperated options) and present it as the neat table 
selc_colum <- c(1:2, 14, 44:57)

Now, we have converted the multiple answers in one columns into different dummy variables

3.2 If you want, we can change the names of tk08_A/B..

Now, we can give the name for differnt answers.

Here, we have 14 variables:

Retirement - A: rtm Prolonged sickness - B : pls Handicap - C : hdc Marriage - D : mrg Too old - E : tod Have a child - F : hac Family responsibilities - N : fmrsp Forbidden - O : fbd Other family reason - P : ofr Fired - Q : frd Cannot find work - R : cfw Do not want to work - S : dwtk Company closed/moved/bankrupt - T : ccmb Other - V : otr

names_indit <- c("rtm", "pls", "hdc", "mrg", "tod", "hac", "fmrsp", "fbd", "ofr", "frd", "cfw", "dwtk", "ccmb", "otr")
capital_indit <- c(paste(LETTERS[1:6]), "N", paste(LETTERS[15:20]), "V")
length(names_indit) # check the number of varialbes
## [1] 14
length(capital_indit)
## [1] 14
tk_dummy_1_indit <- tk_dummy
tk_dummy_2_capital <- tk_dummy

# we copy tk_dummy and give them different format and different names

names(tk_dummy_1_indit)[44:57] <- names_indit
datatable(tk_dummy_1_indit[selc_colum])
names(tk_dummy_2_capital)[44:57] <- capital_indit
datatable(tk_dummy_2_capital[selc_colum])

3.2 Data set 2 reshape

Now, we can use the same method to reshape the data set 2 which is the full data set.

tk_dt2_dummy <- cSplit_e(tk1_clean_2, "tk08", sep = "", type = "character", fill = 0)
names(tk_dt2_dummy)
##  [1] "hhid07_9" "pid07"    "tk01a"    "tk01b"    "tk01c"    "tk01d"   
##  [7] "tk01"     "tk02"     "tk03"     "tk04"     "tk05"     "tk06a"   
## [13] "tk07"     "tk08"     "tk15"     "tk16ax"   "tk16a1x"  "tk16a1"  
## [19] "tk16b"    "tk16cart" "tk16c1"   "tk16d"    "tk16ex"   "tk16e"   
## [25] "tk16fa"   "tk16fb"   "tk16fc"   "tk16fd"   "tk16fe"   "tk16ff"  
## [31] "tk16fg"   "tk16g"    "tk16h"    "tk16i"    "tk16ja"   "tk16jb"  
## [37] "tk16jc"   "tk16jd"   "hhid07"   "pidlink"  "tk16a"    "version" 
## [43] "module"   "tk08_0"   "tk08_A"   "tk08_B"   "tk08_C"   "tk08_D"  
## [49] "tk08_E"   "tk08_F"   "tk08_N"   "tk08_O"   "tk08_P"   "tk08_Q"  
## [55] "tk08_R"   "tk08_S"   "tk08_T"   "tk08_V"

You will notice that there is one more vaiable called “tk08_0”, as we replace “” with 0.

we can drop column called “tk08_0”

tk_dt2_dummy <- select(tk_dt2_dummy, -tk08_0)
names(tk_dt2_dummy)
##  [1] "hhid07_9" "pid07"    "tk01a"    "tk01b"    "tk01c"    "tk01d"   
##  [7] "tk01"     "tk02"     "tk03"     "tk04"     "tk05"     "tk06a"   
## [13] "tk07"     "tk08"     "tk15"     "tk16ax"   "tk16a1x"  "tk16a1"  
## [19] "tk16b"    "tk16cart" "tk16c1"   "tk16d"    "tk16ex"   "tk16e"   
## [25] "tk16fa"   "tk16fb"   "tk16fc"   "tk16fd"   "tk16fe"   "tk16ff"  
## [31] "tk16fg"   "tk16g"    "tk16h"    "tk16i"    "tk16ja"   "tk16jb"  
## [37] "tk16jc"   "tk16jd"   "hhid07"   "pidlink"  "tk16a"    "version" 
## [43] "module"   "tk08_A"   "tk08_B"   "tk08_C"   "tk08_D"   "tk08_E"  
## [49] "tk08_F"   "tk08_N"   "tk08_O"   "tk08_P"   "tk08_Q"   "tk08_R"  
## [55] "tk08_S"   "tk08_T"   "tk08_V"
names(tk_dummy) == names(tk_dt2_dummy) # check the names of different data set
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [57] TRUE
# the name should be same !
datatable(tk_dt2_dummy[selc_colum])
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## http://rstudio.github.io/DT/server.html

For tk08, when it is 0, it means that no data for that observations. When it is not 0, the corresponding columns should be 1, like this:

datatable(filter(tk_dt2_dummy, tk08 != 0)[selc_colum]) # show the data set 2 after filtering the none zero elements in tk_08 

4 Explore the data

After cleaning and reshaping the data, we can explore the data set by using ggplot2

4.1 Data visualization

we can use ggplot2 to visualizing the data and explore the distribution of different variables. For example, the variable “tk07” means that the year that participants worked last time.

Before visualzing the data, always chekck the data

range(tk_dt2_dummy$tk07, na.rm = TRUE)
## [1] 1946 9998
summary(tk_dt2_dummy$tk07)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1946    1990    1995    2151    1997    9998   28045

As it includes the 9998, which is not consistent with year (like 2017), this means that we need remove the outlier before visulizating it.

Now, let’s try another variable “tk16a”, which is the monthly income when participants are working.

summary(tk_dt2_dummy$tk16a)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## 0.00e+00 7.00e+04 2.00e+05 2.88e+06 4.50e+05 1.00e+09    28261
range(tk_dt2_dummy$tk16a, na.rm = TRUE)
## [1] 0e+00 1e+09
ggplot(data = tk_dt2_dummy, aes(tk16a)) + geom_density() + xlim(0, 450000) 
## Warning: Removed 28452 rows containing non-finite values (stat_density).

# remove the max, focus on range[0, 3rd Qtile]

5 Analyze the data

Now, we can do regression analysis by using the cleaned data set.

Firstly, let’s write data into the stata format.

Of course, R can do regression by itself

rg1 <- lm(tk16a ~ tk08_A + tk08_E + tk08_N, data = tk_dt2_dummy)
summary(rg1)
## 
## Call:
## lm(formula = tk16a ~ tk08_A + tk08_E + tk08_N, data = tk_dt2_dummy)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -19784408   -421202   -262656     37344 980212590 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   462656    2152315   0.215 0.829856    
## tk08_A          9727    5199624   0.002 0.998508    
## tk08_E      19324751    5323958   3.630 0.000302 ***
## tk08_N      -1313017    7024581  -0.187 0.851773    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49810000 on 790 degrees of freedom
##   (28261 observations deleted due to missingness)
## Multiple R-squared:  0.01694,    Adjusted R-squared:  0.0132 
## F-statistic: 4.537 on 3 and 790 DF,  p-value: 0.003665

Save the data in to stata foramt or csv format

write.dta(tk_dummy, file = "/Users/Michael/Library/Mobile Documents/com~apple~CloudDocs/CMSE/R/Data Science/Ade_1.dta")
write.dta(tk_dt2_dummy, file = "/Users/Michael/Library/Mobile Documents/com~apple~CloudDocs/CMSE/R/Data Science/Ade_2.dta")

Enjoy your data !

Thank You !