Set the directory to where the files (MABEL waves 1 to 10, inclusive) are stored.
setwd("~/Desktop/MABEL_2019_w1_to_w10/MABEL User DVD v10.0/Files to import")
Import additional R packages for later use.
library(plyr)
library(tidyverse)
[37m── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[37m[32m✔[37m [34mggplot2[37m 3.1.1 [32m✔[37m [34mpurrr [37m 0.3.2
[32m✔[37m [34mtibble [37m 2.1.1 [32m✔[37m [34mdplyr [37m 0.8.0.[31m1[37m
[32m✔[37m [34mtidyr [37m 0.8.3 [32m✔[37m [34mstringr[37m 1.4.0
[32m✔[37m [34mreadr [37m 1.3.1 [32m✔[37m [34mforcats[37m 0.4.0 [39m
[37m── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[37m [34mdplyr[37m::[32marrange()[37m masks [34mplyr[37m::arrange()
[31m✖[37m [34mpurrr[37m::[32mcompact()[37m masks [34mplyr[37m::compact()
[31m✖[37m [34mdplyr[37m::[32mcount()[37m masks [34mplyr[37m::count()
[31m✖[37m [34mdplyr[37m::[32mfailwith()[37m masks [34mplyr[37m::failwith()
[31m✖[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31m✖[37m [34mdplyr[37m::[32mid()[37m masks [34mplyr[37m::id()
[31m✖[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()
[31m✖[37m [34mdplyr[37m::[32mmutate()[37m masks [34mplyr[37m::mutate()
[31m✖[37m [34mdplyr[37m::[32mrename()[37m masks [34mplyr[37m::rename()
[31m✖[37m [34mdplyr[37m::[32msummarise()[37m masks [34mplyr[37m::summarise()
[31m✖[37m [34mdplyr[37m::[32msummarize()[37m masks [34mplyr[37m::summarize()[39m
library(haven)
library(plm)
Loading required package: Formula
Attaching package: ‘plm’
The following objects are masked from ‘package:dplyr’:
between, lag, lead
library(ExPanDaR)
library(lfactors)
lfactors v1.0.4
library(MASS)
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
Read in the data (using a function that can import Stata’s .dta files and include the variable labels).
w1 <- read_dta("w1_publicrelease_v10.0.dta")
w2 <- read_dta("w2_publicrelease_v10.0.dta")
w3 <- read_dta("w3_publicrelease_v10.0.dta")
w4 <- read_dta("w4_publicrelease_v10.0.dta")
w5 <- read_dta("w5_publicrelease_v10.0.dta")
w6 <- read_dta("w6_publicrelease_v10.0.dta")
w7 <- read_dta("w7_publicrelease_v10.0.dta")
w8 <- read_dta("w8_publicrelease_v10.0.dta")
w9 <- read_dta("w9_publicrelease_v10.0.dta")
w10 <- read_dta("w10_publicrelease_v10.0.dta")
The 1st cut of variable selection. The variables are appended with the prefix ‘a’ for wave 1 but the selection is in fact the most general - that is, it is the UNION of all avaiable variables.
selVar_w1 <- c(
# general/filtering variables
"asdtype","acohort","acsclid",
# job satisfaction
"ajsfm","ajsva","ajspw","ajsau","ajscw","ajsrc","ajshw","ajswr","ajsrp","ajsfl","ajsch",
# places of work
"apwtohi","apwpip",
"apwnwmf_gp","apwnwmf_sp","apwnwmp_gp","apwnwmp_sp","apwnwff_gp","apwnwff_sp","apwnwfp_gp","apwnwfp_sp",
"apwnwn_gp","apwnwn_sp","apwnwap_gp","apwnwap_sp","apwnwad_gp","apwnwad_sp","apwnwo_gp","apwnwo_sp",
"apwcl","apwbr","apwoce","apwwh","apwmhpasgc","apwmhmmm","apwwmth","apwwyr","apwpm",
# workload
"awlwhi","awldph","awlidph","awleh","awlmh","awlothh",
"awlana","awlobs","awlsur","awleme","awlnon",
"awlante","awlwom","awlpsych","awlskin","awlchild","awlsport","awlotspe",
"awluan","awluob","awluop","awluem","awluad","awlume","awluin","awlupm","awlupa","awluah","awluge",
"awlure","awlupo","awluot1","awluot1_text","awluot2","awluot2_text","awluna",
"awlnan","awlnob","awlnop","awlnem","awlnad","awlnme","awlnin","awlnpm","awlnpa","awlnah","awlnge",
"awlnre","awlnpo","awlnot1","awlnot1_text","awlnot2","awlnot2_text","awlnna",
"awlprop","awlnppc","awlnpph","awlnprh","awlnprr","awlnph",
"awlwy","awlwod","awlwd","awlww","awlnt","awlna",
"awlcmin","awlcfi","awlbbp","awlbpna","awlah",
"awlocr","awlocna",
"awlwhpy_gp","awlwhpy_sp","awlmlpy","awlsdpy","awlotpy","awlva","awlvadk","awlvana","awlvau","awlvauna",
# Finances
"afigey_gp","afigey_sp","afigey_imp_gp","afigey_imp_sp","afiney_gp","afiney_sp",
"afib","afibv",
"afidme","afimedk","afidp","afidpdk","afidpna","afips","afips2",
"afioti","afispm","afisnpm","afisgi","afishw","afisoth","afisoth_neg",
"afisadd","afiip","afiefr","afighiy_gp","afighiy_sp","afinhiy_gp","afinhiy_sp",
# Geographic location
"aglnl_gp","aglnl_sp",
"agltwwasgc","agltwwmmm",
"aglmth","aglyr",
"agldistgr",
"agltwlasgc","agltwlmmm","aglosi","aglfiw",
"aglbl","aglpfiw","aglgeo","aglacsc",
"aglyrrs","aglrri","aglrl",
"aglrlpv","aglrltv","aglrlrp","aglrlot","aglrlna",
"agltps",
"aglout1asgc","aglout2asgc","aglout3asgc","aglout1mmm","aglout2mmm","aglout3mmm",
"aglngrow","aglndis",
"aglnpers","aglncomp","aglnsupp",
# Family circumstances
"afclp","afcpes","afcpmd","afcpr","afcpr_dk","afcpr_na",
"afcprtasgc","afcprtmmm","afcndc",
"afcccrf","afcccn","afccccw","afcccdc","afcccna",
"afcrwncc","afcpwncc","afcoq",
# Personal
"apigeni","apiagei","apicmdi","apicmda","apicmdo","apicmdoi","apirp",
"apimspi","apimsp6i","apimspx",
"apimspg","apisespg",
"apirs","apimr","apihth","apilfsa",
# Personality
"apertj","aperct","aperrd","aperor","aperwo","aperfr","aperlz","apersoc","aperart","apernev",
"apereff","aperrsv","aperknd","aperimg","aperstr",
"apifirisk","apicarisk","apiclrisk",
# Others/External
"aweight_cs","aweifght_l","aweight_panel",
"ael_metro","ael_mindist","ael_gpratio",
"ael_seifa1","ael_seifa2","ael_seifa3","ael_seifa4","ael_seifa5",
"ael_page5","ael_page65","ael_medhp","adistpubl","adistpriv","adistemer","adws"
)
Create the same ‘Selected Variables’ string vector for waves 2 to 10.
baseSelVar <- stringr::str_sub(selVar_w1, start = 2, end = -1)
selVar_w2 <- paste0("b",baseSelVar)
selVar_w3 <- paste0("c",baseSelVar)
selVar_w4 <- paste0("d",baseSelVar)
selVar_w5 <- paste0("e",baseSelVar)
selVar_w6 <- paste0("f",baseSelVar)
selVar_w7 <- paste0("g",baseSelVar)
selVar_w8 <- paste0("h",baseSelVar)
selVar_w9 <- paste0("i",baseSelVar)
selVar_w10 <- paste0("j",baseSelVar)
rm(baseSelVar)
Add in the ID (medical professonal identifier) at the start of the string vector. This is added separately because the ID does not have a wave-specific prefix.
selVar_w1 <- append("xwaveid",selVar_w1)
selVar_w2 <- append("xwaveid",selVar_w2)
selVar_w3 <- append("xwaveid",selVar_w3)
selVar_w4 <- append("xwaveid",selVar_w4)
selVar_w5 <- append("xwaveid",selVar_w5)
selVar_w6 <- append("xwaveid",selVar_w6)
selVar_w7 <- append("xwaveid",selVar_w7)
selVar_w8 <- append("xwaveid",selVar_w8)
selVar_w9 <- append("xwaveid",selVar_w9)
selVar_w10 <- append("xwaveid",selVar_w10)
Since the ‘Selected Variables’ vector contains variables that may not be available in individual waves (i.e. a variable that is only in wave 1 but not wave 2) the %in% operator selects the INTERSECTION of variables that is in only the given wave as well as string vector and ignore the rest.
w1_new <- w1[,(names(w1) %in% selVar_w1)]
w2_new <- w2[,(names(w2) %in% selVar_w2)]
w3_new <- w3[,(names(w3) %in% selVar_w3)]
w4_new <- w4[,(names(w4) %in% selVar_w4)]
w5_new <- w5[,(names(w5) %in% selVar_w5)]
w6_new <- w6[,(names(w6) %in% selVar_w6)]
w7_new <- w7[,(names(w7) %in% selVar_w7)]
w8_new <- w8[,(names(w8) %in% selVar_w8)]
w9_new <- w9[,(names(w9) %in% selVar_w9)]
w10_new <- w10[,(names(w10) %in% selVar_w10)]
rm(w1,w2,w3,w4,w5,w6,w7,w8,w9,w10)
rm(selVar_w1,selVar_w2,selVar_w3,selVar_w4,selVar_w5,selVar_w6,selVar_w7,selVar_w8,selVar_w9,selVar_w10)
Select only GP and specialist for the analysis.
w1_new <- w1_new[(w1_new$asdtype == 1 | w1_new$asdtype == 2), ]
w2_new <- w2_new[(w2_new$bsdtype == 1 | w2_new$bsdtype == 2), ]
w3_new <- w3_new[(w3_new$csdtype == 1 | w3_new$csdtype == 2), ]
w4_new <- w4_new[(w4_new$dsdtype == 1 | w4_new$dsdtype == 2), ]
w5_new <- w5_new[(w5_new$esdtype == 1 | w5_new$esdtype == 2), ]
w6_new <- w6_new[(w6_new$fsdtype == 1 | w6_new$fsdtype == 2), ]
w7_new <- w7_new[(w7_new$gsdtype == 1 | w7_new$gsdtype == 2), ]
w8_new <- w8_new[(w8_new$hsdtype == 1 | w8_new$hsdtype == 2), ]
w9_new <- w9_new[(w9_new$isdtype == 1 | w9_new$isdtype == 2), ]
w10_new <- w10_new[(w10_new$jsdtype == 1 | w10_new$jsdtype == 2), ]
Select only doctors who are ‘Currently doing clinical work in Australia’ for each wave/survey. Note the variable acsclid is not present in wave 1. This filter variable is then removed.
w2_new <- w2_new[(w2_new$bcsclid == 1),]
w3_new <- w3_new[(w3_new$ccsclid == 1),]
w4_new <- w4_new[(w4_new$dcsclid == 1),]
w5_new <- w5_new[(w5_new$ecsclid == 1),]
w6_new <- w6_new[(w6_new$fcsclid == 1),]
w7_new <- w7_new[(w7_new$gcsclid == 1),]
w8_new <- w8_new[(w8_new$hcsclid == 1),]
w9_new <- w9_new[(w9_new$icsclid == 1),]
w10_new <- w10_new[(w10_new$jcsclid == 1),]
w2_new <- w2_new[,!names(w2_new) %in% "bcsclid"]
w3_new <- w3_new[,!names(w3_new) %in% "ccsclid"]
w4_new <- w4_new[,!names(w4_new) %in% "dcsclid"]
w5_new <- w5_new[,!names(w5_new) %in% "ecsclid"]
w6_new <- w6_new[,!names(w6_new) %in% "fcsclid"]
w7_new <- w7_new[,!names(w7_new) %in% "gcsclid"]
w8_new <- w8_new[,!names(w8_new) %in% "hcsclid"]
w9_new <- w9_new[,!names(w9_new) %in% "icsclid"]
w10_new <- w10_new[,!names(w10_new) %in% "jcsclid"]
Add the wave-specific prefix to the identifier so that every variable in each wave begins with the same prefix (See the next chunk for the rationale behind this)
colnames(w1_new)[colnames(w1_new) == "xwaveid"] <- "axwaveid"
colnames(w2_new)[colnames(w2_new) == "xwaveid"] <- "bxwaveid"
colnames(w3_new)[colnames(w3_new) == "xwaveid"] <- "cxwaveid"
colnames(w4_new)[colnames(w4_new) == "xwaveid"] <- "dxwaveid"
colnames(w5_new)[colnames(w5_new) == "xwaveid"] <- "exwaveid"
colnames(w6_new)[colnames(w6_new) == "xwaveid"] <- "fxwaveid"
colnames(w7_new)[colnames(w7_new) == "xwaveid"] <- "gxwaveid"
colnames(w8_new)[colnames(w8_new) == "xwaveid"] <- "hxwaveid"
colnames(w9_new)[colnames(w9_new) == "xwaveid"] <- "ixwaveid"
colnames(w10_new)[colnames(w10_new) == "xwaveid"] <- "jxwaveid"
In order to construct a panel data, we will need to vertically concatenate each of the cross sectional data file. To do that requires lining up the columns with the same names.
Right now each ‘identical’ column/variable differs only by its wave-specific prefix. We therefore need to remove the prefixes. The identity of the waves are still stored in their naming conventions.
colnames(w1_new) <- substring(colnames(w1_new), 2)
colnames(w2_new) <- substring(colnames(w2_new), 2)
colnames(w3_new) <- substring(colnames(w3_new), 2)
colnames(w4_new) <- substring(colnames(w4_new), 2)
colnames(w5_new) <- substring(colnames(w5_new), 2)
colnames(w6_new) <- substring(colnames(w6_new), 2)
colnames(w7_new) <- substring(colnames(w7_new), 2)
colnames(w8_new) <- substring(colnames(w8_new), 2)
colnames(w9_new) <- substring(colnames(w9_new), 2)
colnames(w10_new) <- substring(colnames(w10_new), 2)
Create a Year column based on the wave names.
w1_new$Year <- "2008"
w2_new$Year <- "2009"
w3_new$Year <- "2010"
w4_new$Year <- "2011"
w5_new$Year <- "2012"
w6_new$Year <- "2013"
w7_new$Year <- "2014"
w8_new$Year <- "2015"
w9_new$Year <- "2016"
w10_new$Year <- "2017"
Create a first-cut panel data in the standard long format. In particular, plyr’s rbind.fill vertically concatenate columns of the same names. For variables that are not common across all waves (e.g. only available in w10), it still create a column for the entire panel but populates the remaining cells in the column as NA (in this example, cells pertaining to the original w1 to w9 would be NA).
The xwaveid and Year variables are then placed at the front.
Finally, the combined dataset is sorted in ascending order for xwaveid followed by Year.
p_first <- plyr::rbind.fill(w1_new,
w2_new,
w3_new,
w4_new,
w5_new,
w6_new,
w7_new,
w8_new,
w9_new,
w10_new)
p_first <- p_first[,c(1,146,2:145,147:241)]
p_first <- p_first %>% dplyr::arrange(xwaveid, Year)
p_first$xwaveid <- as.character(p_first$xwaveid)
p_first
rm(w1_new,w2_new,w3_new,w4_new,w5_new,w6_new,w7_new,w8_new,w9_new,w10_new)
Select variables that will be included as part of any regression analysis and remove their invalid or missing values. These variables will be called upon again when we transformed the current panel dataset into a 10-year balanaced panel.
First I check to see if any row containing NA should be removed. Second any negative value indicate an invalid response (e.g. refusal to answer, unable to determine value, etc) and likewise needs to be removed as they cannot contribute to any statistical estimation.
For this run, I have selected the indispensable variables to be:
No.4, in conjunection with no.3, can be used to impute a proxy for number of years in the medical workforce.
Note:
gltwwmmm is not present in any wavesgltwwasgc is able available for GPs and withheld (i.e. recoded to -3 (NA)) for Specialists# Essential variable 1 - doctor type
sum(is.na(p_first$sdtype))
[1] 0
table(p_first$sdtype)
1 2
33138 37282
# Essential variable 2 - Gender
sum(is.na(p_first$pigeni))
[1] 0
table(p_first$pigeni)
-2 0 1
49 41143 29228
# p_first <- p_first[!(p_first$pigeni == -2),]
# p_first$pigeni <- ifelse(p_first$pigeni < 0, NA, p_first$pigeni)
p_first$Gender <- as_factor(p_first$pigeni)
p_first <- p_first[,!names(p_first) %in% c("pigeni")]
ggplot(data = p_first, aes(x = Year, y = Gender)) + geom_bar(aes(fill = Gender), stat = "identity")
ggplot(data = p_first) + geom_bar(aes(x = Year, fill = Gender), position = "fill")
# Essential variable 3 - Age group
sum(is.na(p_first$piagei))
[1] 0
table(p_first$piagei)
-2 1 2 3 4 5 6 7 8 9
1387 5110 8267 10062 10430 10854 9794 7202 4107 3207
# p_first <- p_first[!(p_first$piagei == -2),]
# p_first$piagei <- ifelse(p_first$piagei < 0, NA, p_first$piagei)
p_first$Age_group <- as_factor(p_first$piagei)
p_first <- p_first[,!names(p_first) %in% c("piagei")]
ggplot(data = p_first, aes(x = Year, y = Age_group)) + geom_bar(aes(fill = Age_group), stat = "identity")
ggplot(data = p_first) + geom_bar(aes(x = Year, fill = Age_group), position = "fill")
# Essential variable 4 - In what year did you complete your basic medical degree?
sum(is.na(p_first$picmdi))
[1] 0
table(p_first$picmdi)
-4 -2 -1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
8 1189 689 49 233 472 1366 2980 5512 8896 10544 9460 8994 9069 6198 3375 1378 7
1198
1
# p_first <- p_first[!(p_first$picmdi == -1 | p_first$picmdi == -2 | p_first$picmdi == -4
# | p_first$picmdi == 15 | p_first$picmdi == 1198),]
# p_first$picmdi <- ifelse((p_first$picmdi < 0 | p_first$picmdi == 15 | p_first$picmdi == 1198), NA, p_first$picmdi)
p_first$Medical_degree_completion <- as_factor(p_first$picmdi)
p_first <- p_first[,!names(p_first) %in% c("picmdi")]
ggplot(data = p_first, aes(x = Year, y = Medical_degree_completion)) + geom_bar(aes(fill = Medical_degree_completion), stat = "identity")
ggplot(data = p_first) + geom_bar(aes(x = Year, fill = Medical_degree_completion), position = "fill")
# Essential variable 5 - Residency status
sum(is.na(p_first$pirs))
[1] 0
table(p_first$pirs)
-5 -4 -3 -2 -1 0 1 2 9
4 2 1 2887 1 60205 4994 1745 581
# p_first <- p_first[!(p_first$pirs < 0),]
attributes(p_first$pirs)
$label
[1] "What is your residency status?"
$labels
[0]australian citizen [1]permanent resident
0 1
[2]temporary resident [9]unchanged since I last completed the MABEL survey
2 9
$class
[1] "haven_labelled"
# many refused to answer - lose too many data points
We want to create a balanced panel in which each doctor has participated in all 10 surveys.
To achieve, we first expand the rows to include all possible combinations of xwaveid and Year (i.e. to create the ideal case) And then based on the ID grouping, we filter in such that that we only keep doctors with no NA values in their essential variable 1 (doctor type). Finally, we need to ungroup to avoid complications later in regression analysis.
Can revisit if the 10-wave requirement is too restrictive
p_second <- p_first %>%
complete(xwaveid, Year) %>%
group_by(xwaveid) %>%
filter(sum(is.na(sdtype)) == 0) %>%
ungroup()
p_second
Separate the panel into GPs and Specialsts. And then remove the sdtype column as it will no longer be required.
GP_0 <- p_second[(p_second$sdtype == 1),]
Specialist_0 <- p_second[(p_second$sdtype == 2),]
GP_0 <- GP_0[,!names(GP_0) %in% "sdtype"]
Specialist_0 <- Specialist_0[,!names(Specialist_0) %in% "sdtype"]
GP_0 for exploratory analysis in ExPanD5 Not applicable from the original variables# GP_0 <- GP_0[!(GP_0$jsfm < 0 | GP_0$jsfm == 5),]
# GP_0 <- GP_0[!(GP_0$jsva < 0 | GP_0$jsva == 5),]
# GP_0 <- GP_0[!(GP_0$jspw < 0 | GP_0$jspw == 5),]
# GP_0 <- GP_0[!(GP_0$jsau < 0 | GP_0$jsau == 5),]
# GP_0 <- GP_0[!(GP_0$jscw < 0 | GP_0$jscw == 5),]
# GP_0 <- GP_0[!(GP_0$jsrc < 0 | GP_0$jsrc == 5),]
# GP_0$jsrc <- ifelse(GP_0$jsrc < 0, NA, GP_0$jsrc)
# GP_0 <- GP_0[!(GP_0$jshw < 0 | GP_0$jshw == 5),]
# GP_0$jshw <- ifelse(GP_0$jshw < 0, NA, GP_0$jshw)
# GP_0 <- GP_0[!(GP_0$jswr < 0 | GP_0$jswr == 5),]
# GP_0$jswr <- ifelse(GP_0$jswr < 0, NA, GP_0$jswr)
# GP_0 <- GP_0[!(GP_0$jsrp < 0 | GP_0$jsrp == 5),]
# GP_0 <- GP_0[!(GP_0$jsfl < 0 | GP_0$jsfl == 5),]
print("Freedom to choose your own method of working")
[1] "Freedom to choose your own method of working"
with(GP_0, table(jsfm,Year))
Year
jsfm 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 3 8 3 11 1 7 7 7 16 24
0 12 12 7 10 9 5 4 8 6 2
1 40 27 26 24 24 21 19 16 15 24
2 18 14 14 14 14 8 10 13 11 6
3 320 302 310 295 291 297 279 284 300 281
4 380 409 409 415 428 430 450 440 418 429
5 0 0 0 0 0 0 0 1 0 0
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jsfm))
# little variability over time
GP_0 <- GP_0[,!names(GP_0) %in% c("jsfm")]
print("Amount of variety in your work")
[1] "Amount of variety in your work"
with(GP_0, table(jsva,Year))
Year
jsva 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 3 5 3 11 0 8 8 5 16 26
0 6 7 2 4 6 2 2 4 3 2
1 36 22 20 22 23 24 21 25 21 28
2 10 16 16 16 12 13 10 13 13 12
3 332 347 360 351 348 348 335 340 332 316
4 386 375 368 364 378 373 393 380 381 382
5 0 0 0 1 0 0 0 2 0 0
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jsva))
# little variability over time
GP_0 <- GP_0[,!names(GP_0) %in% c("jsva")]
print("Physical working conditions")
[1] "Physical working conditions"
with(GP_0, table(jspw,Year))
Year
jspw 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 4 8 3 11 1 7 8 5 17 26
0 7 8 6 6 6 4 5 5 7 5
1 53 48 36 38 31 34 22 28 20 28
2 27 17 16 16 18 13 13 19 19 13
3 319 319 339 302 312 305 284 284 284 275
4 362 372 369 396 399 404 437 427 418 419
5 1 0 0 0 0 1 0 1 1 0
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jspw))
# little variability over time
GP_0 <- GP_0[,!names(GP_0) %in% c("jspw")]
print("Opportunities to use your abilities")
[1] "Opportunities to use your abilities"
with(GP_0, table(jsau,Year))
Year
jsau 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 4 10 3 10 0 10 9 6 16 25
0 12 10 7 7 6 5 4 10 7 6
1 52 49 35 30 39 29 28 27 28 21
2 30 31 26 24 25 24 25 24 17 20
3 336 324 357 356 309 325 313 311 317 306
4 338 348 341 342 388 374 390 391 381 388
5 1 0 0 0 0 1 0 0 0 0
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jsau))
# little variability over time
GP_0 <- GP_0[,!names(GP_0) %in% c("jsau")]
print("Your colleagues and fellow workers")
[1] "Your colleagues and fellow workers"
with(GP_0, table(jscw,Year))
Year
jscw 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 4 6 3 10 0 7 8 6 17 24
0 10 8 7 10 10 11 10 7 7 8
1 32 32 32 29 32 21 28 25 23 22
2 41 42 38 34 25 31 40 34 29 33
3 295 280 309 314 289 290 277 310 278 293
4 388 397 371 362 403 394 396 381 403 380
5 3 7 9 10 8 14 10 6 9 6
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jscw))
# little variability over time
GP_0 <- GP_0[,!names(GP_0) %in% c("jscw")]
print("Recognition you get for good work")
[1] "Recognition you get for good work"
# with(GP_0, table(jsrc,Year))
# ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jsrc))
GP_0$JS_Recognition_for_good_work <- as_factor(GP_0$jsrc, ordered = TRUE)
GP_0$JS_Recognition_for_good_work = droplevels(GP_0$JS_Recognition_for_good_work)
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = as.numeric(JS_Recognition_for_good_work)))
with(GP_0, table(JS_Recognition_for_good_work,Year))
Year
JS_Recognition_for_good_work 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-4 1 0 0 0 0 0 0 0 0 0
-2 4 7 3 10 0 6 12 6 17 25
[0]very dissatisfied 20 26 18 17 17 15 18 16 19 18
[1]moderately dissatisfied 113 81 82 77 67 60 57 71 63 62
[2]not sure 88 84 80 97 88 92 83 79 84 88
[3]moderately satisfied 351 346 383 358 346 350 353 353 342 317
[4]very satisfied 195 223 200 207 246 242 245 241 235 253
[5]not applicable 1 5 3 3 3 3 1 3 6 3
GP_0 <- GP_0[,!names(GP_0) %in% c("jsrc")]
print("Your hours of work")
[1] "Your hours of work"
# with(GP_0, table(jshw,Year))
# ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jshw))
GP_0$JS_Hours_of_work <- as_factor(GP_0$jshw, ordered = TRUE)
GP_0$JS_Hours_of_work <- droplevels(GP_0$JS_Hours_of_work)
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = as.numeric(JS_Hours_of_work)))
with(GP_0, table(JS_Hours_of_work,Year))
Year
JS_Hours_of_work 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 3 4 3 11 0 6 11 5 16 26
[0]very dissatisfied 30 31 26 24 23 15 14 14 11 15
[1]moderately dissatisfied 144 117 96 98 84 90 74 76 72 78
[2]not sure 41 46 53 43 47 38 51 44 39 38
[3]moderately satisfied 322 324 326 321 341 341 316 330 343 310
[4]very satisfied 232 248 265 271 272 277 303 298 285 299
[5]not applicable 1 2 0 1 0 1 0 2 0 0
GP_0 <- GP_0[,!names(GP_0) %in% c("jshw")]
print("Your remuneration")
[1] "Your remuneration"
# with(GP_0, table(jswr,Year))
# ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jswr))
GP_0$JS_Remuneration <- as_factor(GP_0$jswr, ordered = TRUE)
GP_0$JS_Remuneration <- droplevels(GP_0$JS_Remuneration)
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = as.numeric(GP_0$JS_Remuneration)))
with(GP_0, table(JS_Remuneration,Year))
Year
JS_Remuneration 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-4 1 0 0 0 0 0 0 0 0 0
-2 3 5 6 11 1 6 7 5 16 25
[0]very dissatisfied 58 51 38 50 40 35 31 48 60 45
[1]moderately dissatisfied 174 147 135 110 113 112 120 108 121 133
[2]not sure 45 48 64 52 64 55 51 64 59 62
[3]moderately satisfied 332 351 360 370 368 362 364 358 348 334
[4]very satisfied 160 170 166 175 181 198 196 186 162 167
[5]not applicable 0 0 0 1 0 0 0 0 0 0
GP_0 <- GP_0[,!names(GP_0) %in% c("jswr")]
print("Amount of responsibility you are given")
[1] "Amount of responsibility you are given"
with(GP_0, table(jsrp,Year))
Year
jsrp 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 7 8 8 13 2 12 10 7 20 28
0 11 12 6 12 13 8 9 7 6 5
1 61 30 33 26 22 29 27 28 24 23
2 26 43 32 32 29 26 25 33 27 22
3 286 266 283 260 264 254 227 241 255 247
4 380 406 405 424 435 437 468 451 431 435
5 2 7 2 2 2 2 3 2 3 6
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jsrp))
# little variability over time
GP_0 <- GP_0[,!names(GP_0) %in% c("jsrp")]
print("Taking everything into consideration, how do you feel about your work?")
[1] "Taking everything into consideration, how do you feel about your work?"
with(GP_0, table(jsfl,Year))
Year
jsfl 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 5 7 2 12 0 7 8 6 17 24
0 11 10 4 8 11 6 7 6 7 6
1 45 38 45 30 25 29 27 29 23 16
2 33 27 18 34 27 25 22 31 32 31
3 395 391 394 380 360 363 345 347 363 365
4 283 299 306 305 344 338 360 349 324 324
5 1 0 0 0 0 0 0 1 0 0
ggplot(data = GP_0) + geom_boxplot(mapping = aes(x = Year, y = jsfl))
# little variability over time
GP_0 <- GP_0[,!names(GP_0) %in% c("jsfl")]
# GP_0 <- GP_0[!(GP_0$jsch < 0),]
# GP_0$jsch <- ifelse(GP_0$jsch < 0, NA, GP_0$jsch)
# print("Would you like to change your hours of work (incl. day time and after hours)?")
# with(GP_0, table(jsch,Year))
GP_0$JS_Like_to_change_work_hours <- as_factor(GP_0$jsch)
GP_0$JS_Like_to_change_work_hours <- droplevels(GP_0$JS_Like_to_change_work_hours)
with(GP_0, table(JS_Like_to_change_work_hours,Year))
Year
JS_Like_to_change_work_hours 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 5 3 4 13 6 10 12 6 17 26
[0]no 367 371 392 394 400 416 411 454 409 421
[1]yes, I'd like to increase my hours 24 21 16 19 16 20 19 22 20 15
[2]yes, I'd like to decrease my hours 377 377 357 343 345 322 327 287 320 304
ggplot(data = GP_0, mapping = aes(x = Year, y = JS_Like_to_change_work_hours)) + geom_bar(mapping = aes(fill = JS_Like_to_change_work_hours), stat = "identity")
ggplot(data = GP_0) + geom_bar(aes(x = Year, fill = JS_Like_to_change_work_hours), position = "fill")
GP_0 <- GP_0[,!names(GP_0) %in% c("jsch")]
# GP_0 <- GP_0[!(GP_0$figey_gp < 0),]
# GP_0$figey_gp <- ifelse(GP_0$figey_gp < 0, NA, GP_0$figey_gp)
# GP_0 <- GP_0[!(GP_0$figey_imp_gp < 0),]
# GP_0$figey_imp_gp <- ifelse(GP_0$figey_imp_gp < 0, NA, GP_0$figey_imp_gp)
# GP_0 <- GP_0[!(GP_0$finey_gp < 0),]
# GP_0$finey_gp <- ifelse(GP_0$finey_gp < 0, NA, GP_0$finey_gp)
colnames(GP_0)[colnames(GP_0) == "figey_gp"] <- "Annual_earning_before_tax"
with(GP_0, table(Annual_earning_before_tax,Year))
Year
Annual_earning_before_tax 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-5 3 0 0 0 0 0 0 1 0 0
-4 2 12 1 24 0 2 6 5 4 4
-3 0 0 23 0 24 28 30 34 34 25
-2 75 84 71 70 84 82 81 85 79 91
0 0 1 0 0 0 0 0 1 0 1
1000 0 0 0 0 0 0 0 0 0 1
10000 1 0 0 0 0 0 1 0 0 0
10500 1 0 0 0 0 0 0 0 0 0
12000 0 0 0 0 0 0 1 0 0 1
13000 0 1 0 0 0 0 0 0 0 0
13500 1 1 0 0 0 0 0 0 0 0
15000 0 0 0 0 0 1 1 1 0 0
16000 0 0 0 0 0 0 0 1 0 1
18000 0 0 0 0 0 0 1 0 1 1
20000 1 0 2 0 1 0 0 0 1 3
20500 0 1 0 0 0 0 0 0 0 0
21000 0 1 0 0 0 1 1 0 0 0
23000 1 0 0 0 0 0 0 0 0 0
23500 0 2 1 1 0 0 0 0 0 0
24000 0 0 1 1 0 0 0 0 0 0
25000 1 2 1 0 0 2 2 1 1 0
26000 2 2 0 0 1 0 0 2 2 1
28000 1 1 0 0 0 0 0 0 0 0
28500 1 1 0 0 1 0 0 0 0 0
29000 0 1 1 0 0 0 0 0 0 0
30000 5 4 2 0 1 0 1 0 1 1
30500 0 1 0 0 0 0 0 0 0 0
31000 0 1 0 0 1 2 0 2 0 0
32000 1 1 0 0 0 0 0 0 0 0
33000 0 0 1 1 0 0 0 0 0 0
33500 1 0 0 0 0 1 0 0 0 0
34000 1 2 1 0 0 0 0 1 0 1
35000 4 2 1 1 0 2 2 1 1 1
36000 0 2 2 0 1 0 0 2 2 1
36500 2 0 0 0 1 0 0 0 1 0
37000 0 0 1 0 0 0 0 0 0 0
38000 0 0 0 3 1 1 1 1 1 0
39000 1 1 3 0 0 2 0 0 1 0
40000 4 3 4 2 3 3 3 3 1 5
41000 0 0 1 1 0 0 0 0 0 0
41500 0 0 1 0 0 0 0 0 1 0
42000 0 2 0 0 0 0 1 1 0 2
43000 0 0 1 0 0 0 0 0 0 0
43500 0 1 0 0 0 0 0 0 0 0
44000 1 0 1 1 1 1 1 0 0 0
44500 0 0 0 0 0 0 0 0 0 1
45000 3 1 1 3 2 2 1 0 1 1
46000 1 0 0 0 0 0 0 0 0 1
46500 0 1 0 0 1 0 0 0 0 0
47000 0 0 0 2 0 0 0 0 0 0
47500 1 0 1 1 0 0 0 0 0 0
48000 1 1 0 0 1 0 2 0 1 0
49000 0 0 0 0 1 0 2 1 0 0
49500 0 0 0 0 1 0 0 0 0 0
50000 3 6 4 8 3 2 5 5 2 2
50500 0 0 0 1 0 0 1 0 0 0
51000 2 0 0 4 1 1 0 0 0 1
51500 0 1 1 0 0 0 0 0 0 1
52000 1 4 3 3 2 1 2 4 1 1
53000 2 1 0 0 1 0 0 0 0 0
53500 0 0 0 1 0 0 0 0 0 0
54000 2 1 0 1 0 0 0 0 1 0
54500 0 0 0 0 1 2 0 0 1 1
55000 3 4 8 1 2 3 1 1 2 1
55500 0 0 0 0 0 0 0 0 1 1
56000 0 1 2 1 0 0 0 0 0 1
57000 0 1 2 0 0 0 1 0 0 2
57500 0 0 0 0 0 0 1 0 0 0
58000 4 0 1 0 2 0 0 0 2 1
58500 0 0 0 0 0 0 0 0 0 1
59000 0 1 0 0 1 0 0 0 0 1
59500 0 0 1 0 0 0 0 0 0 0
60000 8 6 6 4 8 8 2 5 7 7
60500 0 1 0 0 0 0 0 0 1 0
61000 1 1 0 0 0 0 0 0 0 0
61500 0 0 0 0 0 0 0 0 1 0
62000 0 0 1 1 1 1 0 0 0 0
62500 0 3 0 1 1 0 0 1 1 1
63000 1 2 1 2 1 0 0 2 0 0
63500 0 0 0 0 0 1 0 0 0 0
64000 0 0 0 0 1 0 0 0 0 1
64500 0 1 0 0 0 0 1 0 0 0
65000 9 8 5 6 3 3 5 2 6 2
65500 0 0 0 1 0 0 1 0 0 0
66000 1 2 2 1 0 1 1 1 1 0
66500 0 0 1 0 1 1 0 0 1 0
67000 1 1 0 1 0 1 1 1 3 0
67500 0 0 2 0 0 0 1 1 0 1
68000 0 1 1 1 2 0 1 1 0 0
68500 0 1 0 0 0 1 0 0 0 0
69000 0 0 0 1 0 1 0 1 0 0
69500 0 0 0 0 0 0 0 1 1 1
70000 10 3 7 8 6 3 3 7 7 3
70500 0 0 1 0 0 0 0 0 0 0
71000 0 0 0 0 0 1 1 0 0 1
71500 1 0 0 0 1 0 0 0 0 0
72000 0 0 0 0 0 1 0 1 0 2
72500 0 0 1 0 1 0 0 0 0 0
73000 2 2 1 1 1 1 2 0 2 0
73500 0 0 1 0 0 0 1 0 0 0
[ reached getOption("max.print") -- omitted 528 rows ]
ggplot(data = GP_0, aes(x = Annual_earning_before_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Annual_earning_before_tax), group = Year) + geom_boxplot()
GP_0$Log_Annual_earning_before_tax <- log(GP_0$Annual_earning_before_tax + 1)
NaNs produced
ggplot(data = GP_0, aes(x = Log_Annual_earning_before_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Log_Annual_earning_before_tax), group = Year) + geom_boxplot()
colnames(GP_0)[colnames(GP_0) == "figey_imp_gp"] <- "Annual_imputed_earning_before_tax"
with(GP_0, table(Annual_imputed_earning_before_tax,Year))
Year
Annual_imputed_earning_before_tax 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-3 117 83 77 77 83 83 92 95 88 96
0 0 2 1 0 1 0 0 0 0 0
1000 0 0 0 0 0 0 0 2 0 1
1500 0 0 1 0 0 0 0 0 0 0
5000 0 0 0 0 0 1 0 1 0 0
5500 0 0 1 0 0 0 0 0 0 0
6000 0 1 0 0 1 0 0 0 0 0
9500 0 0 0 1 0 0 0 0 0 0
10000 1 0 0 0 0 1 1 1 0 0
10500 1 0 0 0 0 0 0 0 0 0
11000 0 1 1 0 0 0 1 1 1 0
11500 0 0 0 0 0 0 1 0 1 0
12000 2 3 0 0 0 0 1 0 0 1
12500 0 0 0 0 0 0 0 1 0 0
13000 0 1 0 0 0 1 0 0 0 0
14000 1 0 0 0 0 0 1 0 0 0
14500 0 1 0 0 0 0 0 0 0 0
15000 3 0 0 0 1 0 1 1 0 1
15500 0 1 0 0 0 0 0 0 0 0
16000 0 0 2 0 0 0 0 0 0 0
16500 0 1 0 0 1 0 0 0 0 0
17000 1 0 1 0 1 1 0 0 0 1
17500 0 1 0 0 0 1 0 0 0 1
18000 2 0 0 0 0 0 2 1 1 1
18500 0 1 0 0 0 0 0 0 0 0
19500 0 0 1 0 0 0 0 0 0 0
20000 2 0 4 0 0 0 0 0 0 1
20500 0 1 0 0 0 2 0 1 1 0
21000 0 2 0 0 0 0 0 1 0 0
21500 1 0 0 0 0 0 0 1 1 0
22500 1 0 0 1 0 1 0 0 0 0
23000 0 1 0 0 0 0 1 0 0 0
23500 0 3 2 0 0 0 0 0 0 0
24000 0 0 1 1 0 0 0 0 0 0
24500 0 0 0 0 0 0 0 0 1 0
25000 0 2 0 0 0 0 0 1 0 1
25500 2 0 0 1 0 0 0 0 0 0
26000 2 3 3 0 1 0 2 0 1 0
26500 0 1 0 0 1 0 0 0 0 0
27000 0 2 0 1 1 0 0 0 0 1
27500 3 1 1 0 0 0 1 0 1 0
28000 0 1 1 0 0 0 0 0 0 0
28500 1 0 1 0 0 1 0 0 0 0
29000 1 0 0 2 0 0 0 1 0 0
29500 2 0 0 0 0 0 0 0 0 0
30000 0 1 0 0 1 0 1 0 1 0
30500 0 1 2 0 0 0 1 0 0 0
31000 0 0 1 1 1 0 1 0 0 1
32000 0 1 0 0 0 0 1 0 0 0
32500 0 0 1 1 1 0 0 0 0 0
33000 0 1 0 0 0 0 0 0 0 0
33500 3 4 0 0 4 2 1 0 0 3
34000 0 2 0 0 0 0 0 1 0 0
34500 0 0 1 0 1 0 0 0 0 0
35000 1 1 1 3 0 1 2 0 1 1
35500 0 0 3 1 0 1 1 0 1 0
36000 0 1 0 0 1 0 0 0 0 0
36500 1 0 0 0 0 1 0 1 1 0
37000 1 0 0 0 0 2 0 0 1 0
37500 0 1 1 0 0 0 0 0 0 0
38000 0 0 0 2 0 0 1 1 0 0
38500 0 1 0 1 0 1 1 0 0 1
39000 0 3 0 0 0 0 1 0 1 1
39500 1 0 1 0 0 0 0 0 0 0
40000 1 0 0 0 5 2 1 2 0 4
40500 1 0 1 0 0 0 1 1 1 1
41000 2 2 0 0 1 0 0 1 1 0
41500 0 0 2 1 0 2 0 0 0 0
42000 0 1 0 1 0 0 0 0 0 1
42500 0 1 1 1 0 0 0 1 0 2
43000 1 1 1 3 0 0 0 0 0 0
43500 0 0 1 0 0 0 0 1 0 1
44000 1 3 0 0 0 0 0 1 0 0
44500 0 0 2 1 1 0 0 0 0 0
45000 0 1 1 2 1 1 1 0 0 1
45500 2 1 0 0 0 0 1 0 0 0
46000 1 1 1 0 0 1 2 0 1 0
46500 0 2 0 1 0 1 1 0 0 0
47000 0 0 0 1 0 0 1 0 0 0
47500 1 0 0 1 0 0 1 0 0 0
48000 0 2 3 0 2 1 5 0 3 0
48500 0 1 0 0 0 0 0 0 0 0
49000 4 0 1 1 1 0 0 0 0 0
49500 0 1 1 0 0 0 1 0 0 0
50000 1 2 2 3 0 1 0 1 0 0
50500 0 0 0 1 0 0 1 0 0 0
51000 1 0 0 2 1 0 0 0 0 1
51500 1 1 0 0 1 0 0 0 0 0
52000 2 1 3 0 2 0 1 0 1 0
52500 0 0 0 0 1 0 0 0 0 1
53000 0 0 0 0 0 1 0 0 0 0
53500 2 0 0 0 0 1 0 0 0 0
54000 1 0 1 1 1 0 0 2 1 0
54500 0 0 0 1 0 1 0 0 1 1
55000 5 3 8 0 1 2 0 0 0 0
55500 1 0 0 2 1 2 1 0 0 0
56000 0 3 0 1 0 0 2 1 1 1
56500 4 1 0 3 0 0 1 1 1 0
57000 1 0 3 0 0 0 0 0 1 0
57500 0 0 3 1 0 0 0 0 1 0
[ reached getOption("max.print") -- omitted 771 rows ]
ggplot(data = GP_0, aes(x = Annual_imputed_earning_before_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Annual_imputed_earning_before_tax), group = Year) + geom_boxplot()
GP_0$Log_Annual_imputed_earning_before_tax <- log(GP_0$Annual_imputed_earning_before_tax + 1)
NaNs produced
ggplot(data = GP_0, aes(x = Log_Annual_imputed_earning_before_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Log_Annual_imputed_earning_before_tax), group = Year) + geom_boxplot()
colnames(GP_0)[colnames(GP_0) == "finey_gp"] <- "Annual_earning_after_tax"
with(GP_0, table(Annual_earning_after_tax,Year))
Year
Annual_earning_after_tax 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-4 4 17 1 172 0 1 4 8 7 8
-3 0 0 180 0 193 194 184 172 187 187
-2 210 232 72 69 84 82 81 85 79 91
-1 0 0 0 3 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 0 1
1000 0 0 0 0 0 0 0 1 0 0
1500 0 0 1 0 0 0 0 0 0 0
5000 0 0 0 0 0 0 0 1 0 0
8000 0 0 0 0 0 0 0 0 1 0
8500 0 1 0 0 0 0 0 0 0 0
10000 1 0 1 0 0 1 1 2 0 0
11000 0 0 1 0 0 0 0 0 0 1
11500 0 0 0 0 0 0 1 0 0 0
12000 2 1 0 0 0 0 1 0 0 1
13000 0 0 0 0 0 1 0 0 0 0
14000 0 1 0 0 0 0 0 0 0 0
15000 4 1 0 0 1 0 1 1 0 1
16000 0 1 1 0 0 0 0 0 0 1
17000 0 0 0 0 0 1 1 0 0 1
18000 1 1 0 0 0 0 1 1 2 1
18500 0 0 0 0 1 0 0 0 0 0
19500 0 0 0 1 0 0 0 0 0 0
20000 3 3 1 1 0 2 0 2 2 0
20500 0 1 0 0 0 0 0 0 0 0
21000 0 0 0 0 0 0 0 2 1 0
22000 0 2 1 1 0 0 1 0 0 0
23000 1 0 0 0 0 0 0 0 1 0
23500 1 1 1 0 0 0 0 0 0 0
24000 1 1 3 1 0 0 1 0 0 0
25000 5 4 0 1 0 1 1 0 0 1
25500 0 1 1 0 0 0 0 0 0 0
26000 3 1 3 0 0 2 2 1 0 0
27000 1 0 0 0 0 0 0 0 0 0
28000 0 1 0 1 0 1 0 0 0 1
28500 0 1 0 0 0 0 0 0 0 0
29000 1 0 0 0 0 0 0 0 0 0
30000 3 2 2 1 5 3 1 0 1 3
30500 0 0 1 0 0 0 0 0 0 0
31000 0 1 1 1 0 0 0 0 1 0
31500 0 0 0 0 0 0 1 0 1 0
32000 2 2 3 1 0 0 0 0 0 0
32500 0 0 0 1 0 1 0 0 0 0
33000 0 0 0 0 0 1 0 0 0 0
34000 0 1 2 0 0 1 1 1 1 1
34500 1 0 1 0 0 0 0 0 0 0
35000 4 3 1 1 6 2 2 2 1 0
35500 0 1 0 0 0 0 0 0 1 1
36000 1 0 2 0 0 1 1 0 0 2
36500 1 0 0 1 0 0 0 1 1 0
37000 1 0 2 1 0 1 0 0 0 1
37500 0 0 0 0 0 0 1 0 0 0
38000 0 2 1 1 0 0 1 0 0 1
39000 3 4 1 2 2 1 6 1 1 1
39500 0 1 0 0 0 0 0 0 0 0
40000 6 4 7 3 3 2 6 0 5 1
40500 0 0 0 1 0 0 0 0 0 0
41000 0 2 1 0 0 0 1 1 0 0
41500 1 0 0 2 0 0 0 0 0 0
42000 0 2 0 0 1 1 0 0 0 2
42500 0 0 3 0 0 0 0 0 0 1
43000 0 2 0 0 1 0 0 0 1 0
43500 1 0 0 0 0 1 0 0 0 0
44000 3 0 2 1 2 0 1 0 0 1
44500 0 0 0 2 0 0 0 0 0 1
45000 6 5 3 4 3 2 1 4 2 1
45500 1 0 0 0 0 0 0 1 1 0
46000 0 1 1 1 1 0 0 1 1 0
47000 3 3 1 2 0 0 2 0 3 1
47500 0 1 0 0 0 0 0 1 0 1
48000 2 3 1 0 1 0 2 1 0 0
48500 0 1 1 0 0 0 0 0 0 0
49000 1 1 1 1 2 2 0 0 0 0
49500 0 1 0 1 1 1 0 0 0 0
50000 16 8 9 11 8 6 4 8 5 6
50500 0 0 0 0 0 0 0 0 0 1
51000 1 0 0 2 1 0 0 1 0 0
51500 0 1 0 0 0 0 0 1 0 0
52000 8 2 6 3 4 3 4 5 2 3
52500 0 0 1 1 0 0 0 0 0 1
53000 1 3 2 2 1 0 1 2 0 0
53500 2 1 2 0 0 0 1 1 2 0
54000 1 3 1 0 0 0 1 1 2 3
54500 2 0 0 1 1 0 0 0 0 1
55000 3 3 4 2 2 0 2 2 3 2
55500 0 0 1 0 0 1 2 1 2 0
56000 1 2 0 0 1 1 1 0 5 0
56500 0 0 0 0 0 0 1 0 1 0
57000 4 2 1 1 1 1 1 1 2 1
57500 2 0 0 0 0 0 1 0 0 0
58000 0 0 0 0 2 1 0 0 0 0
58500 1 0 0 1 0 0 0 0 0 0
59000 0 0 0 3 1 1 0 1 0 0
59500 0 0 0 3 0 0 0 1 0 0
60000 19 18 10 10 17 10 7 14 5 14
60500 1 0 0 0 1 0 1 2 1 0
61000 0 3 1 0 0 1 0 0 0 0
61500 0 1 1 1 1 0 1 0 0 0
62000 3 0 0 2 0 1 1 0 2 0
62500 2 2 1 0 3 1 1 0 0 0
63000 0 2 0 4 0 1 0 0 0 0
[ reached getOption("max.print") -- omitted 333 rows ]
ggplot(data = GP_0, aes(x = Annual_earning_after_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Annual_earning_after_tax), group = Year) + geom_boxplot()
GP_0$Log_Annual_earning_after_tax <- log(GP_0$Annual_earning_after_tax + 1)
NaNs produced
ggplot(data = GP_0, aes(x = Log_Annual_earning_after_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Log_Annual_earning_after_tax), group = Year) + geom_boxplot()
GP_0 <- GP_0[,!names(GP_0) %in% c("figey_gp","figey_imp_gp","finey_gp")]
The variables below are already accounted for in Annual_imputed_earning_before_tax
# GP_0 <- GP_0[!(GP_0$fib < 0),]
# GP_0 <- GP_0[!(GP_0$fibv < 0),]
#
# print("In addition, did you receive any ongoing 'in kind' benefits or subsidies as part of your job/s?")
# with(GP_0, table(fib,Year))
#
# print("What is the approximate annual values in $ of these benefits?")
# with(GP_0, table(fibv,Year))
#
# ggplot(data = GP_0, aes(x = fibv)) + geom_histogram() + facet_wrap(~ Year)
# ggplot(data = GP_0, aes(x = Year, y = fibv), group = Year) + geom_boxplot()
# GP_0 <- GP_0[!(GP_0$fidme < 0),]
# GP_0$fidme <- ifelse(GP_0$fidme < 0, NA, GP_0$fidme)
print("Medical education debt in $")
[1] "Medical education debt in $"
with(GP_0, table(fidme,Year))
Year
fidme 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-4 0 26 27 24 22 13 14 22 18 20
-3 2 0 2 2 0 1 1 0 1 1
-2 98 36 30 33 38 36 40 38 41 46
0 651 701 701 696 700 706 706 699 703 695
2 1 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 1 0 0 0 0
12 1 0 0 0 0 0 0 0 0 0
15 0 0 0 0 1 0 0 0 0 0
250 1 0 0 0 0 0 0 0 0 0
1000 1 0 0 1 0 0 0 0 0 1
1400 0 0 0 0 0 0 0 0 0 1
1500 0 0 0 0 1 0 0 0 0 0
1600 0 0 0 0 0 1 0 0 0 0
2000 0 1 0 4 0 0 0 0 0 0
2600 0 0 0 0 1 0 0 0 0 0
3000 2 0 0 1 0 2 0 0 0 0
3112 0 0 1 0 0 0 0 0 0 0
3500 0 0 0 0 0 0 0 1 0 0
4000 0 0 0 1 0 0 1 0 1 0
4300 0 0 0 0 0 1 0 0 0 0
5000 1 1 1 2 0 1 1 1 1 0
6000 0 0 0 0 0 1 1 1 0 0
7000 0 0 1 1 0 0 0 0 0 0
8000 0 1 0 0 0 0 0 1 0 0
8500 0 0 0 0 1 0 0 0 0 0
9000 1 0 0 0 0 0 0 0 0 0
10000 3 0 1 0 0 3 2 0 0 1
11000 0 0 1 0 0 0 0 0 0 0
12000 0 0 0 0 1 0 0 0 0 0
15000 1 0 1 0 0 1 0 0 0 0
16000 1 1 1 1 1 0 0 0 0 0
17000 0 0 0 0 0 0 1 0 0 0
18000 1 0 0 0 0 0 0 0 0 0
20000 3 0 0 0 0 0 0 0 0 0
26000 1 0 0 0 0 0 0 0 0 0
27000 0 1 0 0 0 0 0 0 0 0
30000 0 0 0 1 0 0 0 0 1 0
35000 1 0 0 1 0 0 0 0 0 0
40000 0 0 0 0 0 0 0 1 0 0
43000 0 0 0 0 0 0 1 0 0 0
45000 0 0 1 0 0 0 0 0 0 0
50000 0 2 0 0 0 0 0 1 0 0
80000 0 1 0 0 0 0 0 1 0 0
1e+05 1 1 0 0 1 0 1 1 0 0
150000 1 0 0 0 0 0 0 0 0 0
2e+05 1 0 0 1 0 1 0 0 0 0
4e+05 0 0 0 0 0 0 0 0 0 1
5e+05 0 0 0 0 0 0 0 2 0 0
720000 0 0 1 0 0 0 0 0 0 0
ggplot(data = GP_0, aes(x = fidme)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = fidme), group = Year) + geom_boxplot()
GP_0$Has_medical_education_debt <- ifelse(GP_0$fidme > 0, 1, 0)
GP_0$Has_medical_education_debt <- lfactor(GP_0$Has_medical_education_debt, level = 0:1, labels = c("No debt","Has debt"))
attributes(GP_0$Has_medical_education_debt)
$levels
[1] "No debt" "Has debt"
$class
[1] "lfactor" "factor"
$llevels
[1] 0 1
table(GP_0$fidme, GP_0$Has_medical_education_debt)
No debt Has debt
-4 186 0
-3 10 0
-2 436 0
0 6958 0
2 0 1
6 0 1
12 0 1
15 0 1
250 0 1
1000 0 3
1400 0 1
1500 0 1
1600 0 1
2000 0 5
2600 0 1
3000 0 5
3112 0 1
3500 0 1
4000 0 3
4300 0 1
5000 0 9
6000 0 3
7000 0 2
8000 0 2
8500 0 1
9000 0 1
10000 0 10
11000 0 1
12000 0 1
15000 0 3
16000 0 5
17000 0 1
18000 0 1
20000 0 3
26000 0 1
27000 0 1
30000 0 2
35000 0 2
40000 0 1
43000 0 1
45000 0 1
50000 0 3
80000 0 2
1e+05 0 5
150000 0 1
2e+05 0 3
4e+05 0 1
5e+05 0 2
720000 0 1
ggplot(data = GP_0, aes(x = Year, y = Has_medical_education_debt)) + geom_bar(aes(fill = Has_medical_education_debt), stat = "identity")
ggplot(data = GP_0) + geom_bar(aes(x = Year, fill = Has_medical_education_debt), position = "fill")
GP_0 <- GP_0[,!names(GP_0) %in% c("fidme")]
# GP_0 <- GP_0[!(GP_0$fidp < 0),]
# GP_0$fidp <- ifelse(GP_0$fidp < 0, NA, GP_0$fidp)
print("Financial debt from owning practices/premises in $")
[1] "Financial debt from owning practices/premises in $"
with(GP_0, table(fidp,Year))
Year
fidp 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-4 0 17 17 7 14 13 21 11 10 14
-3 0 227 207 251 236 199 178 204 211 215
-2 291 30 25 29 23 27 39 39 46 45
-1 10 0 0 0 0 0 0 0 0 0
0 316 350 371 331 350 403 408 396 388 390
1 1 0 1 0 0 0 0 0 0 0
2 1 0 0 0 1 1 1 0 0 0
4 0 0 0 0 0 0 0 1 0 0
6 0 0 0 0 1 0 0 0 1 0
60 0 0 0 0 1 0 0 0 0 0
80 0 0 0 0 1 0 0 0 0 0
90 0 0 0 0 0 0 0 0 0 1
100 0 0 1 0 1 0 0 0 0 0
200 1 0 0 0 0 1 0 0 0 0
490 1 0 0 0 0 0 0 0 0 0
500 0 0 0 0 0 0 0 1 0 0
790 0 0 0 0 0 0 0 0 1 0
1000 0 0 1 1 0 0 0 0 0 0
1100 0 0 0 0 0 0 0 1 0 0
1200 0 0 0 0 0 0 0 0 0 1
2000 0 0 0 0 0 0 1 0 1 0
2500 0 0 0 0 0 0 1 0 0 0
4000 0 0 0 0 0 0 1 0 0 0
4400 0 0 0 0 0 0 0 1 0 0
5000 0 0 1 0 1 0 0 0 0 0
7000 1 0 0 0 0 0 0 0 0 0
8000 0 1 0 0 0 0 0 0 0 0
9000 0 1 0 0 0 0 0 0 0 0
10000 4 1 2 3 1 1 4 1 1 0
11000 0 0 0 0 0 0 0 0 1 0
12000 0 0 0 0 0 0 0 0 1 0
12560 0 0 1 0 0 0 0 0 0 0
13000 0 0 0 0 1 0 0 1 0 1
14000 0 0 0 0 0 0 0 0 0 1
14300 0 1 0 0 0 0 0 0 0 0
15000 0 0 2 3 1 2 1 1 0 1
17000 1 0 0 0 0 0 0 0 0 0
18000 0 0 0 0 0 0 0 0 1 0
20000 5 5 2 5 2 2 2 0 1 2
21000 0 0 0 1 0 0 0 0 0 0
22000 0 0 0 0 0 0 0 1 0 0
23000 1 1 0 0 0 0 0 1 0 0
23104 0 0 0 0 0 0 0 0 0 1
25000 2 2 1 1 1 0 2 2 1 1
28000 0 0 0 1 0 0 0 0 0 0
30000 6 6 2 4 2 3 2 3 2 3
31000 1 0 0 0 0 0 1 0 1 0
32090 0 0 0 0 0 0 0 0 1 0
33000 0 0 1 0 0 0 0 0 1 0
33500 0 0 0 0 0 0 0 1 0 0
35000 3 1 2 2 1 0 1 1 1 0
36000 0 0 0 0 0 1 0 0 0 1
37000 0 0 0 0 0 1 0 0 0 0
37500 0 0 0 0 0 0 0 0 0 1
38000 1 1 0 0 0 1 0 0 2 0
40000 4 4 3 3 3 6 0 5 2 1
41082 0 0 0 0 0 0 0 1 0 0
42000 0 0 1 0 0 0 0 0 0 0
45000 2 0 2 0 2 1 0 1 1 0
48000 0 1 0 0 0 0 0 0 0 0
50000 7 4 3 6 6 6 6 3 5 1
54000 0 0 0 0 1 0 0 0 0 0
55000 1 2 1 0 0 0 0 0 0 0
56000 1 0 0 0 1 0 0 0 0 0
57000 1 1 1 1 1 0 1 0 0 0
60000 3 4 4 2 3 4 5 3 4 2
65000 1 1 1 0 1 0 1 0 1 0
68000 0 0 0 0 0 1 0 0 0 0
70000 5 1 3 2 2 2 2 1 2 1
72000 0 0 0 0 0 0 0 0 1 0
75000 1 2 3 0 2 1 0 0 0 1
80000 5 1 2 6 5 1 1 4 3 3
81115 0 0 0 0 0 0 0 0 0 1
82000 0 0 1 0 0 0 0 1 0 0
82134 0 0 1 0 0 0 0 0 0 0
84000 0 0 0 0 0 0 0 1 0 0
85000 1 2 1 3 1 0 0 0 0 1
86000 0 0 0 0 0 0 0 0 0 1
87000 0 0 1 0 0 0 0 0 0 0
88000 1 0 0 1 1 1 1 1 2 1
89000 0 0 0 1 0 0 0 0 0 0
90000 2 1 1 3 0 2 2 3 2 0
91000 0 1 0 0 0 0 0 0 0 0
94000 0 0 0 0 0 0 0 1 0 0
95000 1 0 2 2 0 0 2 1 0 0
96000 0 1 0 0 1 0 0 1 0 0
97000 0 1 0 0 1 0 0 0 0 0
98000 0 1 0 1 0 1 0 0 0 0
99000 0 0 0 0 0 1 1 0 0 0
1e+05 16 9 10 9 9 9 9 5 5 3
104000 0 1 0 0 0 0 0 0 0 0
109000 1 0 0 0 0 0 1 0 0 0
110000 1 2 0 3 1 0 1 2 1 0
111000 0 0 0 0 0 0 0 0 0 1
112800 0 0 0 1 0 0 0 0 0 0
114000 0 0 0 0 0 1 0 0 0 0
115000 0 2 0 0 1 0 0 0 0 0
120000 0 0 2 0 2 1 1 0 1 4
125000 1 0 1 0 0 0 0 1 1 0
130000 1 1 1 1 2 1 2 2 1 1
[ reached getOption("max.print") -- omitted 157 rows ]
ggplot(data = GP_0, aes(x = fidp)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = fidp), group = Year) + geom_boxplot()
GP_0$Has_medical_practice_debt <- ifelse(GP_0$fidp > 0, 1, 0)
GP_0$Has_medical_practice_debt <- lfactor(GP_0$Has_medical_practice_debt, level = 0:1, labels = c("No debt","Has debt"))
attributes(GP_0$Has_medical_practice_debt)
$levels
[1] "No debt" "Has debt"
$class
[1] "lfactor" "factor"
$llevels
[1] 0 1
table(GP_0$fidp, GP_0$Has_medical_practice_debt)
No debt Has debt
-4 124 0
-3 1928 0
-2 594 0
-1 10 0
0 3703 0
1 0 2
2 0 4
4 0 1
6 0 2
60 0 1
80 0 1
90 0 1
100 0 2
200 0 2
490 0 1
500 0 1
790 0 1
1000 0 2
1100 0 1
1200 0 1
2000 0 2
2500 0 1
4000 0 1
4400 0 1
5000 0 2
7000 0 1
8000 0 1
9000 0 1
10000 0 18
11000 0 1
12000 0 1
12560 0 1
13000 0 3
14000 0 1
14300 0 1
15000 0 11
17000 0 1
18000 0 1
20000 0 26
21000 0 1
22000 0 1
23000 0 3
23104 0 1
25000 0 13
28000 0 1
30000 0 33
31000 0 3
32090 0 1
33000 0 2
33500 0 1
35000 0 12
36000 0 2
37000 0 1
37500 0 1
38000 0 5
40000 0 31
41082 0 1
42000 0 1
45000 0 9
48000 0 1
50000 0 47
54000 0 1
55000 0 4
56000 0 2
57000 0 6
60000 0 34
65000 0 6
68000 0 1
70000 0 21
72000 0 1
75000 0 10
80000 0 31
81115 0 1
82000 0 2
82134 0 1
84000 0 1
85000 0 9
86000 0 1
87000 0 1
88000 0 9
89000 0 1
90000 0 16
91000 0 1
94000 0 1
95000 0 8
96000 0 3
97000 0 2
98000 0 3
99000 0 2
1e+05 0 84
104000 0 1
109000 0 2
110000 0 11
111000 0 1
112800 0 1
114000 0 1
115000 0 3
120000 0 11
125000 0 4
130000 0 13
133000 0 1
135000 0 1
140000 0 15
145000 0 2
147000 0 1
150000 0 42
153000 0 1
154266 0 1
160000 0 13
162000 0 3
165000 0 3
170000 0 6
175000 0 1
180000 0 19
185000 0 2
190000 0 6
190059 0 1
191960 0 1
2e+05 0 69
204000 0 1
205000 0 1
206000 0 1
209000 0 1
210000 0 5
215000 0 1
220000 0 13
227000 0 1
230000 0 7
235000 0 1
238000 0 1
238497 0 1
240000 0 8
246000 0 2
248000 0 2
248200 0 1
248500 0 1
250000 0 38
256000 0 10
260000 0 5
264276 0 1
265000 0 2
270000 0 2
273000 0 1
275000 0 3
280000 0 10
284042 0 1
288000 0 3
295000 0 1
3e+05 0 62
312384 0 1
320000 0 7
330000 0 2
333333 0 1
340000 0 2
345000 0 1
350000 0 10
353000 0 1
355000 0 1
360000 0 3
362000 0 1
370000 0 1
375000 0 2
380000 0 2
383000 0 1
395000 0 1
395500 0 1
4e+05 0 35
406406.53125 0 1
410000 0 1
415000 0 2
420000 0 2
430000 0 1
447000 0 2
450000 0 13
460000 0 1
465000 0 3
466000 0 1
470000 0 5
475000 0 2
480000 0 2
485000 0 1
490000 0 1
5e+05 0 31
520000 0 1
530000 0 1
550000 0 7
560000 0 1
567000 0 1
580000 0 1
6e+05 0 29
620000 0 1
650000 0 6
656000 0 1
659000 0 1
670000 0 1
680000 0 1
691000 0 1
692000 0 1
7e+05 0 13
706000 0 1
710000 0 2
715000 0 1
730000 0 1
750000 0 8
776000 0 1
790000 0 1
8e+05 0 13
820000 0 1
830000 0 1
840000 0 1
850000 0 2
870000 0 1
877500 0 1
9e+05 0 8
950000 0 1
1e+06 0 19
1020000 0 1
1030000 0 1
1140000 0 1
1200000 0 8
1250000 0 2
1258000 0 1
1275000 0 1
1300000 0 9
1310000 0 1
1350000 0 1
1400000 0 6
1500000 0 8
1550000 0 1
1590000 0 1
1600000 0 2
1730000 0 1
1750000 0 1
1800000 0 6
1850000 0 1
2e+06 0 8
2200000 0 2
2300000 0 2
2500000 0 11
2600000 0 1
2700000 0 1
2770000 0 1
2800000 0 1
3e+06 0 8
3098000 0 1
3100000 0 1
3500000 0 1
3671000 0 1
3700000 0 1
4e+06 0 2
4900000 0 1
5e+06 0 1
5500000 0 1
5600000 0 1
6e+06 0 1
6500000 0 1
4.2e+07 0 1
ggplot(data = GP_0, aes(x = Year, y = Has_medical_practice_debt)) + geom_bar(aes(fill = Has_medical_practice_debt), stat = "identity")
ggplot(data = GP_0) + geom_bar(aes(x = Year, fill = Has_medical_practice_debt), position = "fill")
GP_0 <- GP_0[,!names(GP_0) %in% c("fidp")]
# with(GP_0, table(fidpdk,Year))
# with(GP_0, table(fidpna,Year))
# GP_0 <- GP_0[!(GP_0$fips < 0 & (GP_0$Year == "2008" | GP_0$Year == "2009" | GP_0$Year == "2010" | GP_0$Year == "2011" | GP_0$Year == "2012" | GP_0$Year == "2013" | GP_0$Year == "2014" | GP_0$Year == "2015" | GP_0$Year == "2016")),]
# GP_0 <- GP_0[!(GP_0$fips2 < 0 & GP_0$Year == "2017"),]
# GP_0$fips <- ifelse((GP_0$fips < 0 & (GP_0$Year == "2008" | GP_0$Year == "2009" | GP_0$Year == "2010" | GP_0$Year == "2011" | GP_0$Year == "2012" | GP_0$Year == "2013" | GP_0$Year == "2014" | GP_0$Year == "2015" | GP_0$Year == "2016")), NA, GP_0$fips)
# GP_0$fips2 <- ifelse(GP_0$fips2 < 0 & GP_0$Year == "2017", NA, GP_0$fips2)
print("What is the status of your private practice for tax purposes? (waves 1-9)")
[1] "What is the status of your private practice for tax purposes? (waves 1-9)"
with(GP_0, table(fips,Year))
Year
fips 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-5 0 2 0 0 0 0 0 0 0 0
-4 0 0 0 0 0 0 0 0 1 0
-3 0 0 0 0 0 0 0 1 1 0
-2 29 22 13 14 20 15 22 32 28 0
0 220 249 242 249 241 255 260 270 268 0
1 74 78 91 72 70 74 72 72 68 0
2 213 221 206 213 228 229 206 209 215 0
3 57 53 58 65 68 64 72 78 61 0
4 50 55 53 53 47 43 48 33 34 0
5 130 92 106 103 93 88 89 74 90 0
attributes(GP_0$fips)
$label
[1] "What is the status of your private practice for tax purpose?"
$labels
[0]sole trader [1]partnership [2]company [3]trust [4]don't know [5]not applicable
0 1 2 3 4 5
$class
[1] "haven_labelled"
print("How would you describe the ownership structure of the main practice in which you work? (wave 10)")
[1] "How would you describe the ownership structure of the main practice in which you work? (wave 10)"
with(GP_0, table(fips2,Year))
Year
fips2 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-2 0 0 0 0 0 0 0 0 0 34
0 0 0 0 0 0 0 0 0 0 111
1 0 0 0 0 0 0 0 0 0 152
2 0 0 0 0 0 0 0 0 0 258
3 0 0 0 0 0 0 0 0 0 96
4 0 0 0 0 0 0 0 0 0 72
5 0 0 0 0 0 0 0 0 0 43
attributes(GP_0$fips2)
$label
[1] "What is the status of your private practice for tax purpose?"
$labels
[0]sole trader [1]partnership [2]company/corporate [3]trust [4]don't know
0 1 2 3 4
[5]not applicable
5
$class
[1] "haven_labelled"
GP_0$Practice_ownership_structure <- ifelse(GP_0$Year == "2017", GP_0$fips2, GP_0$fips)
GP_0$Practice_ownership_structure <- lfactor(GP_0$Practice_ownership_structure, levels = 0:5, labels = c("Sole trader","Partnership","Company/Corporate","Trust","Don't know","Not applicable"))
table(GP_0$Practice_ownership_structure,GP_0$Year)
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
Sole trader 220 249 242 249 241 255 260 270 268 111
Partnership 74 78 91 72 70 74 72 72 68 152
Company/Corporate 213 221 206 213 228 229 206 209 215 258
Trust 57 53 58 65 68 64 72 78 61 96
Don't know 50 55 53 53 47 43 48 33 34 72
Not applicable 130 92 106 103 93 88 89 74 90 43
attributes(GP_0$Practice_ownership_structure)
$levels
[1] "Sole trader" "Partnership" "Company/Corporate" "Trust" "Don't know"
[6] "Not applicable"
$class
[1] "lfactor" "factor"
$llevels
[1] 0 1 2 3 4 5
ggplot(data = GP_0, aes(x = Year, y = Practice_ownership_structure)) + geom_bar(aes(fill = Practice_ownership_structure), stat = "identity")
ggplot(data = GP_0) + geom_bar(aes(x = Year, fill = Practice_ownership_structure), position = "fill")
GP_0 <- GP_0[,!names(GP_0) %in% c("fips","fips2")]
# with(GP_0, table(fioti,Year))
# with(GP_0, table(fisadd,Year))
# with(GP_0, table(fiip,Year))
# ggplot(data = GP_0, aes(x = fiip)) + geom_histogram() + facet_wrap(~ Year)
# ggplot(data = GP_0, aes(x = Year, y = fiip), group = Year) + geom_boxplot()
# GP_0 <- GP_0[!(GP_0$fighiy_gp < 0),]
# GP_0 <- GP_0[!(GP_0$finhiy_gp < 0),]
# GP_0$fighiy_gp <- ifelse(GP_0$fighiy_gp < 0, NA, GP_0$fighiy_gp)
# GP_0$finhiy_gp <- ifelse(GP_0$finhiy_gp < 0, NA, GP_0$finhiy_gp)
colnames(GP_0)[colnames(GP_0) == "fighiy_gp"] <- "Annual_household_income_before_tax"
with(GP_0, table(Annual_household_income_before_tax,Year))
Year
Annual_household_income_before_tax 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-5 4 1 0 0 0 0 0 0 0 0
-4 3 2 1 24 0 1 5 6 7 7
-3 1 0 93 0 24 34 39 37 33 26
-2 132 141 66 106 109 125 107 124 115 125
0 0 2 0 0 0 0 0 1 0 2
1000 0 0 0 0 0 0 0 0 0 1
5000 0 0 0 0 0 0 0 1 0 0
7500 1 0 0 0 0 0 0 0 0 0
12000 0 0 0 0 0 0 1 0 0 0
14000 0 0 0 0 0 0 0 0 0 1
14500 0 0 0 0 0 0 0 0 1 0
16000 0 0 0 0 0 0 0 1 0 1
17000 0 0 0 0 0 1 0 1 0 0
20000 1 0 0 0 0 0 2 1 1 1
22000 0 0 0 0 0 0 1 0 0 0
23000 0 1 1 0 0 0 0 0 0 0
26000 0 0 0 0 0 0 0 1 1 2
27000 0 0 0 0 0 0 0 0 0 1
29000 0 1 0 0 0 0 0 0 0 0
29500 0 0 0 0 0 0 0 0 1 0
30000 0 0 1 0 0 0 0 0 0 2
32000 0 0 0 0 0 0 1 0 0 0
35000 0 1 0 0 0 0 0 0 1 0
37000 0 0 1 0 0 0 0 0 0 0
39000 1 0 1 0 0 0 0 0 0 0
40000 0 2 0 0 1 0 1 1 0 0
40500 0 0 0 0 1 0 0 0 0 0
42000 0 0 0 0 0 0 0 1 0 0
42500 0 0 0 0 0 0 0 0 0 1
44000 0 0 0 0 1 1 0 0 0 0
45000 0 0 0 0 0 1 1 0 0 0
48000 0 0 0 0 0 0 1 0 0 0
49000 0 0 0 0 0 1 0 0 0 0
50000 1 2 2 2 0 1 1 0 0 1
50500 0 0 0 1 0 0 1 0 0 0
51000 0 0 0 1 0 0 0 0 0 1
52000 0 2 1 2 0 0 1 0 0 0
52500 0 0 1 0 0 0 0 0 0 0
55000 0 0 1 0 0 0 0 0 0 0
56000 0 1 0 0 0 0 0 0 0 0
56500 0 0 0 0 0 0 0 0 1 0
57000 0 0 1 0 0 0 0 0 0 1
57500 0 0 0 1 0 0 0 0 0 1
58000 1 0 0 0 0 0 0 0 0 1
60000 4 6 4 2 3 1 0 3 1 1
61500 0 0 0 0 0 0 1 0 0 0
62000 0 1 0 0 0 0 0 0 0 1
62500 0 0 0 0 0 0 0 1 0 0
64000 0 0 0 1 1 0 0 1 0 0
64500 0 1 0 0 0 0 0 0 0 0
65000 3 1 1 0 1 1 1 1 2 0
65500 0 0 0 0 0 0 1 0 0 0
66000 0 2 1 0 0 1 0 0 0 0
66500 0 0 1 0 0 0 0 0 0 1
67000 0 0 0 0 0 1 0 0 1 0
67500 0 0 0 1 0 0 0 0 0 0
68000 0 1 0 0 0 0 0 0 0 0
69500 0 0 0 0 0 0 0 1 0 0
70000 1 2 2 1 0 0 3 0 3 1
71500 1 0 0 0 0 0 0 0 0 0
72000 0 0 0 0 0 1 0 0 0 0
73000 0 0 1 2 0 0 0 0 0 0
74000 0 0 1 1 1 0 0 0 0 0
74500 0 0 0 0 0 0 0 0 1 0
75000 2 0 2 2 0 0 0 0 0 0
75500 0 0 0 0 0 0 0 0 1 0
76000 0 0 0 1 0 1 0 0 0 2
77000 1 0 0 0 0 0 0 0 0 0
78000 0 3 0 2 0 3 1 3 1 1
78500 0 0 0 0 0 0 0 0 0 1
79000 0 1 0 0 0 0 0 0 0 0
80000 4 1 7 0 6 4 2 3 2 1
80500 0 0 1 0 0 0 0 0 0 0
81000 0 0 0 0 0 1 0 0 0 0
81500 0 0 0 0 0 1 1 0 0 1
82000 0 0 0 0 0 0 0 1 0 0
82500 0 0 0 2 0 0 0 0 0 0
83000 0 0 0 2 0 0 2 0 0 0
84000 0 1 0 0 2 1 1 0 1 0
84500 0 0 0 0 1 0 0 0 0 0
85000 1 2 2 2 0 0 0 0 1 0
86000 1 0 1 0 1 1 0 0 1 0
86500 0 0 0 1 0 0 0 0 0 0
87000 1 1 1 0 0 0 1 0 1 0
88000 0 1 0 0 0 1 0 0 1 0
88500 0 1 1 0 0 1 0 0 0 1
89000 0 1 0 0 1 0 1 0 0 0
90000 5 2 3 4 3 3 0 2 2 4
90500 0 0 0 0 0 0 0 0 0 1
91000 2 1 1 0 0 2 0 1 0 0
92000 1 2 0 0 1 2 1 0 1 0
92500 0 1 0 0 1 0 0 0 0 0
93000 0 1 0 0 0 1 0 0 0 0
93500 1 0 0 0 0 0 0 0 1 0
94000 0 0 0 0 1 0 1 0 1 0
94500 0 0 0 0 0 0 0 0 0 1
95000 4 1 1 2 0 1 0 2 1 0
95500 0 1 0 0 0 0 0 1 0 0
96000 1 0 0 0 0 0 0 0 0 1
97000 0 0 0 0 0 0 0 1 0 0
[ reached getOption("max.print") -- omitted 549 rows ]
ggplot(data = GP_0, aes(x = Annual_household_income_before_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Annual_household_income_before_tax), group = Year) + geom_boxplot()
GP_0$Log_Annual_household_income_before_tax <- log(GP_0$Annual_household_income_before_tax + 1)
NaNs produced
ggplot(data = GP_0, aes(x = Log_Annual_household_income_before_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Log_Annual_household_income_before_tax), group = Year) + geom_boxplot()
colnames(GP_0)[colnames(GP_0) == "finhiy_gp"] <- "Annual_household_income_after_tax"
with(GP_0, table(Annual_household_income_after_tax,Year))
Year
Annual_household_income_after_tax 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-5 3 0 0 0 0 0 0 0 0 0
-4 6 2 0 164 0 1 7 12 7 10
-3 0 1 239 0 185 178 189 156 175 180
-2 251 295 69 106 109 125 107 124 115 125
0 1 1 0 0 0 0 0 1 0 1
1000 0 0 0 0 0 0 0 1 0 0
2000 0 0 0 0 0 0 0 1 0 0
10000 1 0 0 0 0 0 1 1 0 0
11000 0 0 0 0 0 1 0 1 0 0
11500 0 0 0 0 0 0 1 0 0 0
12000 0 0 0 0 0 0 0 1 0 0
12500 0 0 0 0 0 0 0 1 0 0
13000 0 0 0 0 0 1 0 0 0 0
14000 0 0 0 0 0 0 1 0 0 1
15000 3 0 1 0 0 0 0 0 0 1
15500 0 0 0 0 1 0 0 0 0 0
16000 0 1 0 0 0 0 0 0 0 0
16500 1 0 0 0 0 0 0 0 0 0
17000 0 0 0 0 0 1 0 0 0 1
17500 0 0 0 0 0 0 0 0 0 1
18000 0 0 0 0 0 0 0 0 1 0
19000 0 0 0 0 0 0 1 0 0 0
20000 0 0 0 0 0 1 0 1 1 0
21000 0 0 0 0 0 0 0 1 0 0
22500 0 0 1 0 0 0 0 0 0 0
25000 0 0 0 0 0 0 0 0 1 1
25500 0 1 0 0 0 0 0 0 0 0
26000 0 0 0 0 0 1 1 0 0 0
27000 1 0 0 0 0 0 0 0 1 1
28000 0 0 0 0 0 0 0 0 0 1
29000 0 1 0 0 0 0 0 0 0 0
30000 0 0 0 1 2 0 0 1 0 0
31000 0 0 1 1 0 0 0 0 0 0
32000 1 0 0 0 0 0 0 0 0 0
32500 0 0 0 0 0 1 0 0 0 0
35000 1 0 0 0 2 0 0 0 0 0
37000 0 0 1 0 0 0 0 0 0 1
38000 0 0 0 0 0 0 0 0 0 1
39000 0 1 0 0 0 1 1 2 0 0
39500 1 0 1 0 0 0 0 0 0 0
40000 3 2 0 0 1 0 1 0 1 1
42000 0 1 0 0 0 0 0 0 0 0
42500 0 0 0 0 0 0 0 0 0 1
43000 0 1 0 0 0 0 0 0 0 0
43500 0 0 0 0 0 1 0 0 0 0
44000 0 1 0 0 0 0 0 0 1 0
44500 0 0 0 1 0 0 0 0 0 0
45000 2 1 0 0 1 0 1 1 2 1
45500 0 0 0 0 0 0 1 0 0 0
47000 1 0 0 0 0 0 1 0 0 1
47500 0 0 1 0 0 0 0 0 0 0
48000 0 0 1 0 0 0 0 1 0 1
49000 0 0 0 0 0 0 1 0 0 0
50000 0 5 3 2 6 0 1 4 2 0
50500 0 0 1 0 0 0 0 0 0 0
51000 0 0 0 1 0 0 0 0 0 0
51500 0 0 0 1 0 0 0 0 0 0
52000 1 1 0 2 0 0 1 2 0 0
53000 0 0 0 0 0 0 0 1 0 0
54000 0 1 0 0 0 0 1 1 0 0
54500 0 0 0 0 0 0 0 1 0 0
55000 2 0 1 1 0 2 1 0 2 1
55500 0 0 0 0 0 0 0 0 1 0
56000 0 0 0 0 0 0 1 0 1 0
56500 0 0 0 0 0 0 1 0 0 0
57000 0 0 1 0 0 0 1 0 0 0
57500 0 0 0 0 1 0 0 0 0 0
58000 0 1 1 0 0 0 0 0 0 0
59000 0 0 0 2 0 0 0 0 0 0
59500 0 0 0 2 0 0 0 0 0 0
60000 7 9 5 2 3 0 1 2 5 2
60500 0 0 1 0 0 0 0 0 0 0
61000 1 2 0 0 0 0 0 0 0 0
62000 0 0 1 2 1 1 0 0 3 0
62500 0 0 0 0 0 0 0 1 0 0
63000 0 0 0 1 0 0 1 0 0 1
64000 0 0 1 0 0 0 0 0 2 1
64500 0 0 0 0 0 0 0 0 0 1
65000 8 5 4 2 2 6 1 2 0 0
66000 0 0 0 0 0 0 0 1 1 0
66500 1 0 0 0 0 0 0 0 0 0
67000 0 0 0 0 0 4 1 1 2 1
67500 1 1 1 1 0 1 0 1 0 1
68000 3 1 0 1 1 2 3 1 1 0
68500 0 0 0 1 0 0 0 0 0 0
69000 2 2 0 1 0 1 0 0 0 1
69500 0 0 0 1 0 0 0 0 0 0
70000 9 5 5 5 9 2 4 1 1 2
70500 0 0 0 0 0 1 0 0 0 0
71000 1 0 0 1 1 2 0 0 0 0
71500 1 0 0 0 0 0 0 0 0 0
72000 1 2 3 0 1 0 0 0 0 0
73000 0 1 0 0 1 1 1 0 0 1
73500 0 0 0 0 1 0 0 0 0 0
74000 1 0 0 0 0 0 1 0 0 0
75000 7 5 0 2 2 2 0 5 1 3
75500 0 1 1 0 0 0 0 0 0 0
76000 1 0 0 0 1 0 0 1 0 0
76500 1 0 0 0 0 0 0 0 1 0
77000 0 0 0 0 0 0 0 0 3 0
[ reached getOption("max.print") -- omitted 396 rows ]
ggplot(data = GP_0, aes(x = Annual_household_income_after_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Annual_household_income_after_tax), group = Year) + geom_boxplot()
GP_0$Log_Annual_household_income_after_tax <- log(GP_0$Annual_household_income_after_tax + 1)
NaNs produced
ggplot(data = GP_0, aes(x = Log_Annual_household_income_after_tax)) + geom_histogram() + facet_wrap(~ Year)
ggplot(data = GP_0, aes(x = Year, y = Log_Annual_household_income_after_tax), group = Year) + geom_boxplot()
GP_0 <- GP_0[,!names(GP_0) %in% c("fighiy_gp","finhiy_gp")]
# with(GP_0, table(glnl_gp,Year))
# GP_0 <- GP_0[!(GP_0$gltwwasgc < 0),]
# GP_0$gltwwasgc <- ifelse(GP_0$gltwwasgc < 0, NA, GP_0$gltwwasgc)
print("ASGC classification of main place of work (based on postcode)")
[1] "ASGC classification of main place of work (based on postcode)"
with(GP_0, table(gltwwasgc,Year))
Year
gltwwasgc 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-3 1 0 2 3 3 2 1 3 1 4
1 522 522 519 522 519 522 524 523 526 523
2 149 153 150 142 145 145 145 146 142 143
3 101 97 98 102 100 99 99 97 97 96
GP_0$ASGC_main_workplace <- as_factor(GP_0$gltwwasgc)
table(GP_0$ASGC_main_workplace, GP_0$gltwwasgc)
-3 1 2 3
-3 20 0 0 0
[1]major city 0 5222 0 0
[2]inner regional 0 0 1460 0
[3]outer regional/remote/very remote 0 0 0 986
ggplot(data = GP_0, aes(x = Year, y = ASGC_main_workplace)) + geom_bar(aes(fill = ASGC_main_workplace), stat = "identity")
ggplot(data = GP_0) + geom_bar(aes(x = Year, fill = ASGC_main_workplace), position = "fill")
table(GP_0$ASGC_main_workplace,GP_0$Year)
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
-3 1 0 2 3 3 2 1 3 1 4
[1]major city 522 522 519 522 519 522 524 523 526 523
[2]inner regional 149 153 150 142 145 145 145 146 142 143
[3]outer regional/remote/very remote 101 97 98 102 100 99 99 97 97 96
attributes(GP_0$ASGC_main_workplace)
$levels
[1] "-3" "[1]major city"
[3] "[2]inner regional" "[3]outer regional/remote/very remote"
$class
[1] "factor"
$label
[1] "ASGC classification of main place of work"
GP_0 <- GP_0[,!names(GP_0) %in% c("gltwwasgc")]
Remove the rare instances in which a GP becomes a Specialist sometime within the survey period. And check for no duplicates.
GP_0 <- make.pbalanced(GP_0, balance.type = "shared.individuals")
# Specialist_0 <- make.pbalanced(Specialist_0, balance.type = "shared.individuals")
any(duplicated(GP_0[,c("xwaveid", "Year")]))
[1] FALSE
# any(duplicated(Specialist_0[,c("xwaveid", "Year")]))
paste0("The number of GPs are ", length(unique(GP_0[["xwaveid"]])))
[1] "The number of GPs are 765"
paste0("The number of Specialists are ", length(unique(Specialist_0[["xwaveid"]])))
[1] "The number of Specialists are 1050"
Send selected variables for analysis in ExPanDaR
select_1 <- c(
"xwaveid",
"Year",
"Gender",
"Age_group",
"Medical_degree_completion",
"ASGC_main_workplace",
"JS_Recognition_for_good_work",
"JS_Hours_of_work",
"JS_Remuneration",
"JS_Like_to_change_work_hours",
"Annual_earning_before_tax",
"Log_Annual_earning_before_tax",
"Annual_imputed_earning_before_tax",
"Log_Annual_imputed_earning_before_tax",
"Annual_earning_after_tax",
"Log_Annual_household_income_after_tax",
"Has_medical_education_debt",
"Has_medical_practice_debt",
"Annual_household_income_before_tax",
"Log_Annual_household_income_before_tax",
"Annual_household_income_after_tax",
"Log_Annual_household_income_after_tax"
)
GP_0.01 <- GP_0[,select_1]
rm(select_1)
glimpse(GP_0.01)
Observations: 7,650
Variables: 22
$ xwaveid [3m[38;5;246m<chr>[39m[23m "1100002", "1100002", "1100002", "1100002", "1100002", "1100002…
$ Year [3m[38;5;246m<chr>[39m[23m "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015",…
$ Gender [3m[38;5;246m<fct>[39m[23m [0]male, [0]male, [0]male, [0]male, [0]male, [0]male, [0]male, …
$ Age_group [3m[38;5;246m<fct>[39m[23m [7]60-64, [8]65-69, [8]65-69, [8]65-69, [8]65-69, [8]65-69, [8]…
$ Medical_degree_completion [3m[38;5;246m<fct>[39m[23m [6]1970-1974, [6]1970-1974, [6]1970-1974, [6]1970-1974, [6]1970…
$ ASGC_main_workplace [3m[38;5;246m<fct>[39m[23m [1]major city, [1]major city, [1]major city, [1]major city, [1]…
$ JS_Recognition_for_good_work [3m[38;5;246m<ord>[39m[23m [2]not sure, [1]moderately dissatisfied, [2]not sure, [3]modera…
$ JS_Hours_of_work [3m[38;5;246m<ord>[39m[23m [4]very satisfied, [4]very satisfied, [4]very satisfied, [4]ver…
$ JS_Remuneration [3m[38;5;246m<ord>[39m[23m [1]moderately dissatisfied, [0]very dissatisfied, [1]moderately…
$ JS_Like_to_change_work_hours [3m[38;5;246m<fct>[39m[23m "[0]no", "[0]no", "[0]no", "[0]no", "[0]no", "[0]no", "[2]yes, …
$ Annual_earning_before_tax [3m[38;5;246m<dbl>[39m[23m 110000, 141000, 135000, 162500, 115000, 104500, 138000, -3, 190…
$ Log_Annual_earning_before_tax [3m[38;5;246m<dbl>[39m[23m 11.60824, 11.85652, 11.81304, 11.99844, 11.65270, 11.55695, 11.…
$ Annual_imputed_earning_before_tax [3m[38;5;246m<dbl>[39m[23m 109000, 151500, 135000, 164500, 112000, 122500, 155500, 161000,…
$ Log_Annual_imputed_earning_before_tax [3m[38;5;246m<dbl>[39m[23m 11.59911, 11.92835, 11.81304, 12.01067, 11.62626, 11.71587, 11.…
$ Annual_earning_after_tax [3m[38;5;246m<dbl>[39m[23m 80000, 98500, -3, 113000, 81000, 87500, 96000, 104000, 142000, …
$ Log_Annual_household_income_after_tax [3m[38;5;246m<dbl>[39m[23m 11.28979, 11.60824, NaN, 11.63515, 11.67845, 11.37941, 11.67845…
$ Has_medical_education_debt [3m[38;5;246m<fct>[39m[23m No debt, No debt, No debt, No debt, No debt, No debt, No debt, …
$ Has_medical_practice_debt [3m[38;5;246m<fct>[39m[23m No debt, No debt, No debt, No debt, No debt, No debt, No debt, …
$ Annual_household_income_before_tax [3m[38;5;246m<dbl>[39m[23m 110000, 153000, 135000, 162500, 152000, 104500, 160000, -3, 200…
$ Log_Annual_household_income_before_tax [3m[38;5;246m<dbl>[39m[23m 11.60824, 11.93820, 11.81304, 11.99844, 11.93164, 11.55695, 11.…
$ Annual_household_income_after_tax [3m[38;5;246m<dbl>[39m[23m 80000, 110000, -3, 113000, 118000, 87500, 118000, 117000, 15200…
$ Log_Annual_household_income_after_tax [3m[38;5;246m<dbl>[39m[23m 11.28979, 11.60824, NaN, 11.63515, 11.67845, 11.37941, 11.67845…
# ExPanD(df = GP_0.01, cs_id = "xwaveid", ts_id = "Year")