This is an R Markdown Notebook. In this notebook, I will show how to use R programming to clean and reshape the data
library(foreign) # package for import the stata file
library(DT) # packag for table interface
tk1 <- read.dta(file = "/Users/Michael/Library/Mobile Documents/com~apple~CloudDocs/CMSE/R/Data Science/b3a_tk1.dta") # choose the file from any file in the computer
library(tidyverse) # package for cleaning, reshaping and visualizing the data set.
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
tk1_tf <- tbl_df(tk1) # convert the data set as table format and also data frame format.
tk1_tf # view the data
## # A tibble: 29,055 x 43
## hhid07_9 pid07 tk01a tk01b tk01c tk01d tk01 tk02 tk03 tk04 tk05
## * <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 001220000 1 1 3 3 3 1 NA NA NA NA
## 2 001220000 2 1 3 1 3 1 NA NA NA NA
## 3 001220000 6 1 3 3 3 1 NA NA NA NA
## 4 001220000 7 1 3 1 3 1 NA NA NA NA
## 5 001220000 9 1 1 1 3 3 1 NA NA NA
## 6 001220003 1 1 1 3 3 1 NA NA NA NA
## 7 001220008 1 3 1 3 3 3 3 3 3 3
## 8 001250000 3 3 3 3 3 95 3 1 NA NA
## 9 001250000 8 1 3 1 3 4 1 NA NA NA
## 10 001290000 1 1 3 3 3 1 NA NA NA NA
## # ... with 29,045 more rows, and 32 more variables: tk06a <dbl>,
## # tk07 <dbl>, tk08 <chr>, tk15 <dbl>, tk16ax <dbl>, tk16a1x <dbl>,
## # tk16a1 <dbl>, tk16b <dbl>, tk16cart <dbl>, tk16c1 <dbl>, tk16d <dbl>,
## # tk16ex <dbl>, tk16e <dbl>, tk16fa <dbl>, tk16fb <dbl>, tk16fc <dbl>,
## # tk16fd <dbl>, tk16fe <dbl>, tk16ff <dbl>, tk16fg <dbl>, tk16g <dbl>,
## # tk16h <dbl>, tk16i <dbl>, tk16ja <dbl>, tk16jb <dbl>, tk16jc <dbl>,
## # tk16jd <dbl>, hhid07 <chr>, pidlink <chr>, tk16a <dbl>, version <dbl>,
## # module <chr>
Variabale name: TK08 – Why haven’t you worked again since that year?
Variable elements:
Retirement - A Prolonged sickness - B Handicap - C Marriage - D Too old - E Have a child - F Family responsibilities - N Forbidden - O Other family reason - P Fired - Q Cannot find work - R Do not want to work - S Company closed/moved/bankrupt - T Other - V
glimpse(tk1_tf) # strucutre of data set
## Observations: 29,055
## Variables: 43
## $ hhid07_9 <chr> "001220000", "001220000", "001220000", "001220000", "...
## $ pid07 <dbl> 1, 2, 6, 7, 9, 1, 1, 3, 8, 1, 2, 1, 2, 3, 4, 1, 2, 2,...
## $ tk01a <dbl> 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1,...
## $ tk01b <dbl> 3, 3, 3, 3, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk01c <dbl> 3, 1, 3, 1, 1, 3, 3, 3, 1, 3, 1, 3, 1, 1, 1, 3, 1, 1,...
## $ tk01d <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk01 <dbl> 1, 1, 1, 1, 3, 1, 3, 95, 4, 1, 4, 1, 1, 4, 1, 1, 4, 7...
## $ tk02 <dbl> NA, NA, NA, NA, 1, NA, 3, 3, 1, NA, 1, NA, NA, 1, NA,...
## $ tk03 <dbl> NA, NA, NA, NA, NA, NA, 3, 1, NA, NA, NA, NA, NA, NA,...
## $ tk04 <dbl> NA, NA, NA, NA, NA, NA, 3, NA, NA, NA, NA, NA, NA, NA...
## $ tk05 <dbl> NA, NA, NA, NA, NA, NA, 3, NA, NA, NA, NA, NA, NA, NA...
## $ tk06a <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk07 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk08 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "...
## $ tk15 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16ax <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16a1x <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16a1 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16b <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16cart <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16c1 <dbl> 2, 2, 3, 3, 3, 2, NA, 2, 2, 3, 2, 2, 2, 2, 3, 3, NA, ...
## $ tk16d <dbl> 3, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk16ex <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16e <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fa <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fb <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fc <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fe <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16ff <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fg <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16g <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16h <dbl> 1, 2, 2, 1, 3, 3, 3, 5, 8, 1, 4, 1, 8, 1, 1, 2, 4, 2,...
## $ tk16i <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk16ja <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jb <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jc <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ hhid07 <chr> "0012200", "0012200", "0012200", "0012200", "0012200"...
## $ pidlink <chr> "001220001", "001220002", "001220006", "001220007", "...
## $ tk16a <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ version <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ module <chr> "B3A_TK1", "B3A_TK1", "B3A_TK1", "B3A_TK1", "B3A_TK1"...
# we have 29055 items and 43 variables, many are numbers, some are character (string)
summary(tk1_tf$tk08)
## Length Class Mode
## 29055 character character
unique(tk1_tf$tk08) # show the unique element of tk_08
## [1] "" "S" "E" "F" "N" "B" "A" "V" "FN" "D"
## [11] "O" "EO" "EN" "P" "DO" "T" "CE" "EP" "Q" "C"
## [21] "R" "DE" "NP" "AE" "DENO" "ER" "FNS" "DSV" "ST" "DF"
## [31] "AR" "FO" "DNO" "PS" "ET" "FNP" "NPV" "DN" "DP" "BP"
## [41] "BE" "BO" "EV" "ES" "PV" "OP" "BC" "NO" "DFO" "DFP"
## [51] "NPS" "AO" "FV" "ORS" "EOS" "SV" "OR" "OV" "EOP" "DFN"
## [61] "DV" "FOP" "FP"
table(tk1_tf$tk08) # show the number of unique element of tk_08. lots of elements are empty string - "";
##
## A AE AO AR B BC BE BO BP C CE
## 28045 124 4 1 1 49 2 4 1 1 6 2
## D DE DENO DF DFN DFO DFP DN DNO DO DP DSV
## 178 1 1 5 1 1 1 4 1 1 3 1
## DV E EN EO EOP EOS EP ER ES ET EV F
## 1 138 2 18 2 2 2 4 1 1 3 88
## FN FNP FNS FO FOP FP FV N NO NP NPS NPV
## 10 4 2 2 1 1 1 41 2 5 1 1
## O OP OR ORS OV P PS PV Q R S ST
## 32 2 1 1 1 58 4 1 10 16 61 1
## SV T V
## 1 35 60
table(tk1_tf$tk08 == "")
##
## FALSE TRUE
## 1010 28045
We can clean the data by two ways:
tk1_clean_1 <- filter(tk1_tf, tk08 != "")
library(knitr) # package for showing the data in the neat format
glimpse(tk1_clean_1) # only 1010 obervations are left.
## Observations: 1,010
## Variables: 43
## $ hhid07_9 <chr> "002040003", "002210000", "002280000", "002280000", "...
## $ pid07 <dbl> 2, 1, 4, 6, 2, 1, 1, 8, 4, 5, 11, 2, 3, 1, 3, 2, 10, ...
## $ tk01a <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3,...
## $ tk01b <dbl> 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3,...
## $ tk01c <dbl> 1, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1,...
## $ tk01d <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk01 <dbl> 95, 5, 2, 95, 4, 4, 5, 3, 4, 4, 3, 4, 4, 5, 4, 4, 4, ...
## $ tk02 <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk03 <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk04 <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk05 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ tk06a <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk07 <dbl> 1994, 1995, 2001, 2007, 2007, 1997, 1991, 2007, 1996,...
## $ tk08 <chr> "S", "E", "F", "N", "B", "B", "A", "V", "B", "FN", "V...
## $ tk15 <dbl> 5, 1, 5, 5, 1, 1, 4, 6, 6, 4, 7, 7, 5, 1, 6, 5, 2, 8,...
## $ tk16ax <dbl> 1, 1, 1, 1, 8, 8, 8, NA, NA, 1, 1, 1, 1, 1, NA, 1, 1,...
## $ tk16a1x <dbl> NA, NA, NA, NA, 2, 2, 1, NA, NA, NA, NA, NA, NA, NA, ...
## $ tk16a1 <dbl> NA, NA, NA, NA, 28, 21, 12, NA, NA, NA, NA, NA, NA, N...
## $ tk16b <dbl> 1, 3, 1, 1, 3, 3, 1, NA, NA, 1, 1, 1, 1, 1, NA, 1, 3,...
## $ tk16cart <dbl> NA, NA, NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, NA, NA,...
## $ tk16c1 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16d <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk16ex <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16e <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fa <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fb <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fc <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fe <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16ff <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16fg <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16g <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16h <dbl> 95, 8, 1, 1, 95, 7, 8, 3, 95, 8, 3, 4, 4, 8, 8, 4, 8,...
## $ tk16i <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## $ tk16ja <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jb <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jc <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ tk16jd <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ hhid07 <chr> "0020441", "0022100", "0022800", "0022800", "0031500"...
## $ pidlink <chr> "002044102", "002210001", "002280004", "002280006", "...
## $ tk16a <dbl> 450000, 200000, 200000, 200000, NA, NA, NA, NA, NA, 5...
## $ version <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ module <chr> "B3A_TK1", "B3A_TK1", "B3A_TK1", "B3A_TK1", "B3A_TK1"...
# datatable(head(tk1_clean_1)) # only show the first ten rows
# we can also select the columns we want for analyzing
# Example
names(tk1_clean_1) # show all variable names
## [1] "hhid07_9" "pid07" "tk01a" "tk01b" "tk01c" "tk01d"
## [7] "tk01" "tk02" "tk03" "tk04" "tk05" "tk06a"
## [13] "tk07" "tk08" "tk15" "tk16ax" "tk16a1x" "tk16a1"
## [19] "tk16b" "tk16cart" "tk16c1" "tk16d" "tk16ex" "tk16e"
## [25] "tk16fa" "tk16fb" "tk16fc" "tk16fd" "tk16fe" "tk16ff"
## [31] "tk16fg" "tk16g" "tk16h" "tk16i" "tk16ja" "tk16jb"
## [37] "tk16jc" "tk16jd" "hhid07" "pidlink" "tk16a" "version"
## [43] "module"
column_example <- select(tk1_clean_1, "hhid07_9", contains("tk0"))
datatable(head(column_example)) # only ID and variables with "tk0" were left in the dataset
tk1_tf$tk08[tk1_tf$tk08 == ""] <- NA
head(tk1_tf$tk08) # all "" has been repaced with NA
## [1] NA NA NA NA NA NA
table(tk1_tf$tk08) # R will not take NA as a value, that's why you wont't see it in the table
##
## A AE AO AR B BC BE BO BP C CE D DE DENO DF
## 124 4 1 1 49 2 4 1 1 6 2 178 1 1 5
## DFN DFO DFP DN DNO DO DP DSV DV E EN EO EOP EOS EP
## 1 1 1 4 1 1 3 1 1 138 2 18 2 2 2
## ER ES ET EV F FN FNP FNS FO FOP FP FV N NO NP
## 4 1 1 3 88 10 4 2 2 1 1 1 41 2 5
## NPS NPV O OP OR ORS OV P PS PV Q R S ST SV
## 1 1 32 2 1 1 1 58 4 1 10 16 61 1 1
## T V
## 35 60
There is a disadvante by replacing “” with NA, it might affect the analyse as it will be taken as “Not Availbe”. To solve this problem, we can replace NA with 0.
tk1_tf$tk08[is.na(tk1_tf$tk08)] <- 0
head(tk1_tf$tk08)
## [1] "0" "0" "0" "0" "0" "0"
table((tk1_tf$tk08))
##
## 0 A AE AO AR B BC BE BO BP C CE
## 28045 124 4 1 1 49 2 4 1 1 6 2
## D DE DENO DF DFN DFO DFP DN DNO DO DP DSV
## 178 1 1 5 1 1 1 4 1 1 3 1
## DV E EN EO EOP EOS EP ER ES ET EV F
## 1 138 2 18 2 2 2 4 1 1 3 88
## FN FNP FNS FO FOP FP FV N NO NP NPS NPV
## 10 4 2 2 1 1 1 41 2 5 1 1
## O OP OR ORS OV P PS PV Q R S ST
## 32 2 1 1 1 58 4 1 10 16 61 1
## SV T V
## 1 35 60
tk1_clean_2 <- tk1_tf # make a new data set as the version 2
Now, we want to reshape the data by adding new varialbe.
I will use the clean data set 1 to reshhape the data as it includes only 1010 obervations. Then the large data set 2 will be reshpaed by the same way.
table(tk1_clean_1$tk08) # glimplse the data again
##
## A AE AO AR B BC BE BO BP C CE D DE DENO DF
## 124 4 1 1 49 2 4 1 1 6 2 178 1 1 5
## DFN DFO DFP DN DNO DO DP DSV DV E EN EO EOP EOS EP
## 1 1 1 4 1 1 3 1 1 138 2 18 2 2 2
## ER ES ET EV F FN FNP FNS FO FOP FP FV N NO NP
## 4 1 1 3 88 10 4 2 2 1 1 1 41 2 5
## NPS NPV O OP OR ORS OV P PS PV Q R S ST SV
## 1 1 32 2 1 1 1 58 4 1 10 16 61 1 1
## T V
## 35 60
We want to reshape the data into dummy variables like this:
# as A means 'retirement', the observations should show 1 when it includes A, like this:
dummy_sample <- data.frame(ID = 1:6, Retirement = c(1, 0, 0, 1, 1, 0), Marriage = c(0, 1, 1, 0, 1, 1) )
datatable(dummy_sample)
To simplify the mission, all we need is to separate one column into several
In this section, we will use package called ‘splitstackshape’ for separating one culumn into several columns and fill the 1 and 0 in the corresponding columns.
library(splitstackshape) # the package for splitting the data and change into the dummy variables
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
tk_dummy <- cSplit_e(tk1_clean_1, "tk08", sep = "", type = "character", fill = 0)
names(tk_dummy)
## [1] "hhid07_9" "pid07" "tk01a" "tk01b" "tk01c" "tk01d"
## [7] "tk01" "tk02" "tk03" "tk04" "tk05" "tk06a"
## [13] "tk07" "tk08" "tk15" "tk16ax" "tk16a1x" "tk16a1"
## [19] "tk16b" "tk16cart" "tk16c1" "tk16d" "tk16ex" "tk16e"
## [25] "tk16fa" "tk16fb" "tk16fc" "tk16fd" "tk16fe" "tk16ff"
## [31] "tk16fg" "tk16g" "tk16h" "tk16i" "tk16ja" "tk16jb"
## [37] "tk16jc" "tk16jd" "hhid07" "pidlink" "tk16a" "version"
## [43] "module" "tk08_A" "tk08_B" "tk08_C" "tk08_D" "tk08_E"
## [49] "tk08_F" "tk08_N" "tk08_O" "tk08_P" "tk08_Q" "tk08_R"
## [55] "tk08_S" "tk08_T" "tk08_V"
datatable(tk_dummy[c(1:2, 14, 44:57)]) # select column 1(id), 2, 14(tk08), 44-57(new columns with seperated options) and present it as the neat table
selc_colum <- c(1:2, 14, 44:57)
Now, we have converted the multiple answers in one columns into different dummy variables
Now, we can give the name for differnt answers.
Here, we have 14 variables:
Retirement - A: rtm Prolonged sickness - B : pls Handicap - C : hdc Marriage - D : mrg Too old - E : tod Have a child - F : hac Family responsibilities - N : fmrsp Forbidden - O : fbd Other family reason - P : ofr Fired - Q : frd Cannot find work - R : cfw Do not want to work - S : dwtk Company closed/moved/bankrupt - T : ccmb Other - V : otr
names_indit <- c("rtm", "pls", "hdc", "mrg", "tod", "hac", "fmrsp", "fbd", "ofr", "frd", "cfw", "dwtk", "ccmb", "otr")
capital_indit <- c(paste(LETTERS[1:6]), "N", paste(LETTERS[15:20]), "V")
length(names_indit) # check the number of varialbes
## [1] 14
length(capital_indit)
## [1] 14
tk_dummy_1_indit <- tk_dummy
tk_dummy_2_capital <- tk_dummy
# we copy tk_dummy and give them different format and different names
names(tk_dummy_1_indit)[44:57] <- names_indit
datatable(tk_dummy_1_indit[selc_colum])
names(tk_dummy_2_capital)[44:57] <- capital_indit
datatable(tk_dummy_2_capital[selc_colum])
Now, we can use the same method to reshape the data set 2 which is the full data set.
tk_dt2_dummy <- cSplit_e(tk1_clean_2, "tk08", sep = "", type = "character", fill = 0)
names(tk_dt2_dummy)
## [1] "hhid07_9" "pid07" "tk01a" "tk01b" "tk01c" "tk01d"
## [7] "tk01" "tk02" "tk03" "tk04" "tk05" "tk06a"
## [13] "tk07" "tk08" "tk15" "tk16ax" "tk16a1x" "tk16a1"
## [19] "tk16b" "tk16cart" "tk16c1" "tk16d" "tk16ex" "tk16e"
## [25] "tk16fa" "tk16fb" "tk16fc" "tk16fd" "tk16fe" "tk16ff"
## [31] "tk16fg" "tk16g" "tk16h" "tk16i" "tk16ja" "tk16jb"
## [37] "tk16jc" "tk16jd" "hhid07" "pidlink" "tk16a" "version"
## [43] "module" "tk08_0" "tk08_A" "tk08_B" "tk08_C" "tk08_D"
## [49] "tk08_E" "tk08_F" "tk08_N" "tk08_O" "tk08_P" "tk08_Q"
## [55] "tk08_R" "tk08_S" "tk08_T" "tk08_V"
You will notice that there is one more vaiable called “tk08_0”, as we replace “” with 0.
we can drop column called “tk08_0”
tk_dt2_dummy <- select(tk_dt2_dummy, -tk08_0)
names(tk_dt2_dummy)
## [1] "hhid07_9" "pid07" "tk01a" "tk01b" "tk01c" "tk01d"
## [7] "tk01" "tk02" "tk03" "tk04" "tk05" "tk06a"
## [13] "tk07" "tk08" "tk15" "tk16ax" "tk16a1x" "tk16a1"
## [19] "tk16b" "tk16cart" "tk16c1" "tk16d" "tk16ex" "tk16e"
## [25] "tk16fa" "tk16fb" "tk16fc" "tk16fd" "tk16fe" "tk16ff"
## [31] "tk16fg" "tk16g" "tk16h" "tk16i" "tk16ja" "tk16jb"
## [37] "tk16jc" "tk16jd" "hhid07" "pidlink" "tk16a" "version"
## [43] "module" "tk08_A" "tk08_B" "tk08_C" "tk08_D" "tk08_E"
## [49] "tk08_F" "tk08_N" "tk08_O" "tk08_P" "tk08_Q" "tk08_R"
## [55] "tk08_S" "tk08_T" "tk08_V"
names(tk_dummy) == names(tk_dt2_dummy) # check the names of different data set
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [57] TRUE
# the name should be same !
datatable(tk_dt2_dummy[selc_colum])
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## http://rstudio.github.io/DT/server.html
For tk08, when it is 0, it means that no data for that observations. When it is not 0, the corresponding columns should be 1, like this:
datatable(filter(tk_dt2_dummy, tk08 != 0)[selc_colum]) # show the data set 2 after filtering the none zero elements in tk_08
After cleaning and reshaping the data, we can explore the data set by using ggplot2
we can use ggplot2 to visualizing the data and explore the distribution of different variables. For example, the variable “tk07” means that the year that participants worked last time.
Before visualzing the data, always chekck the data
range(tk_dt2_dummy$tk07, na.rm = TRUE)
## [1] 1946 9998
summary(tk_dt2_dummy$tk07)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1946 1990 1995 2151 1997 9998 28045
As it includes the 9998, which is not consistent with year (like 2017), this means that we need remove the outlier before visulizating it.
Now, let’s try another variable “tk16a”, which is the monthly income when participants are working.
summary(tk_dt2_dummy$tk16a)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00e+00 7.00e+04 2.00e+05 2.88e+06 4.50e+05 1.00e+09 28261
range(tk_dt2_dummy$tk16a, na.rm = TRUE)
## [1] 0e+00 1e+09
ggplot(data = tk_dt2_dummy, aes(tk16a)) + geom_density() + xlim(0, 450000)
## Warning: Removed 28452 rows containing non-finite values (stat_density).
# remove the max, focus on range[0, 3rd Qtile]
Now, we can do regression analysis by using the cleaned data set.
Firstly, let’s write data into the stata format.
Of course, R can do regression by itself
rg1 <- lm(tk16a ~ tk08_A + tk08_E + tk08_N, data = tk_dt2_dummy)
summary(rg1)
##
## Call:
## lm(formula = tk16a ~ tk08_A + tk08_E + tk08_N, data = tk_dt2_dummy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19784408 -421202 -262656 37344 980212590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 462656 2152315 0.215 0.829856
## tk08_A 9727 5199624 0.002 0.998508
## tk08_E 19324751 5323958 3.630 0.000302 ***
## tk08_N -1313017 7024581 -0.187 0.851773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49810000 on 790 degrees of freedom
## (28261 observations deleted due to missingness)
## Multiple R-squared: 0.01694, Adjusted R-squared: 0.0132
## F-statistic: 4.537 on 3 and 790 DF, p-value: 0.003665
Save the data in to stata foramt or csv format
write.dta(tk_dummy, file = "/Users/Michael/Library/Mobile Documents/com~apple~CloudDocs/CMSE/R/Data Science/Ade_1.dta")
write.dta(tk_dt2_dummy, file = "/Users/Michael/Library/Mobile Documents/com~apple~CloudDocs/CMSE/R/Data Science/Ade_2.dta")