About

The R code written for this project was done back in 2015 when I was quite new at R. As a result, it is quite terrible. Lots of DRY violations. I have changed it a little since then to fix bugs introduced by me renaming some functions. There are no serious changes with reflect to the overall code structure.

Startup

Load packages and set options.

options(digits = 2)
library(pacman)
p_load(kirkegaard, jsonlite, stringi, plyr, devtools,
       VIM, scales, readODS)

Data

Load data.

#load name data
names.df = read.csv("names.csv", stringsAsFactors = F, encoding="UTF-8")
rownames(names.df) = names.df$name

EDA

Plot some things of interest.

#Initial explorative plots
#Mere histograms
ggplot(data=names.df, aes(x=number)) +
  geom_histogram() +
  scale_x_continuous(trans=log10_trans(),
                     breaks = trans_breaks("log10", function(x) round(10^x))) +
  xlab("Number of persons") +
  ylab("Number of names")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/number.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

describe(names.df$number)

##    vars    n mean   sd median trimmed mad min   max range skew kurtosis
## X1    1 2358 2185 5714    309     741 277 100 50094 49994  4.6       24
##     se
## X1 118

ggplot(data=names.df, aes(x=age)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/age.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=names.df, aes(x=own.place)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 455 rows containing non-finite values (stat_bin).

GG_save("figures/own_place.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 455 rows containing non-finite values (stat_bin).

ggplot(data=names.df, aes(x=conviction)) +
  geom_histogram() +
  xlab("conviction [%]")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/conviction.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=names.df, aes(x=income)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 526 rows containing non-finite values (stat_bin).

GG_save("figures/income.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 526 rows containing non-finite values (stat_bin).

ggplot(data = names.df, aes(x=age, y=income, color=gender)) +
  geom_point() +
  geom_smooth(aes(color=gender)) +
  ylab("Monthly income in DKK")

## `geom_smooth()` using method = 'gam'

## Warning: Removed 526 rows containing non-finite values (stat_smooth).

## Warning: Removed 526 rows containing missing values (geom_point).

GG_save("figures/age_gender_income.png")

## `geom_smooth()` using method = 'gam'

## Warning: Removed 526 rows containing non-finite values (stat_smooth).

## Warning: Removed 526 rows containing missing values (geom_point).

ggplot(data = names.df, aes(x=age, y=conviction, color=gender)) +
  geom_point() +
  geom_smooth(aes(color=gender))

## `geom_smooth()` using method = 'gam'

GG_save("figures/age_gender_conviction.png")

## `geom_smooth()` using method = 'gam'

ggplot(data = names.df, aes(x=age, y=married, color=gender)) +
  geom_point() +
  geom_smooth(aes(color=gender))

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

GG_save("figures/age_gender_married.png")

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

ggplot(data = names.df, aes(x=age, y=own.place, color=gender)) +
  geom_point() +
  geom_smooth(aes(color=gender))

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

GG_save("figures/age_gender_ownPlace.png")

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

ggplot(data = names.df, aes(x=age, y=no.job, color=gender)) +
  geom_point() +
  geom_smooth(aes(color=gender))

## `geom_smooth()` using method = 'gam'

## Warning: Removed 526 rows containing non-finite values (stat_smooth).

## Warning: Removed 526 rows containing missing values (geom_point).

GG_save("figures/age_gender_noJob.png")

## `geom_smooth()` using method = 'gam'

## Warning: Removed 526 rows containing non-finite values (stat_smooth).

## Warning: Removed 526 rows containing missing values (geom_point).

ggplot(data = names.df, aes(x=age, y=outside.job.market, color=gender)) +
  geom_point() +
  geom_smooth(aes(color=gender))

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

GG_save("figures/age_gender_outsideJobMarket.png")

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

ggplot(data = names.df, aes(x=age, y=student, color=gender)) +
  geom_point() +
  geom_smooth(aes(color=gender))

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

GG_save("figures/age_gender_student.png")

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

ggplot(data = names.df, aes(x=age, y=independent, color=gender)) +
  geom_point() +
  geom_smooth(aes(color=gender))

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

GG_save("figures/age_gender_independent.png")

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

ggplot(data = names.df, aes(x=age, y=executive, color=gender)) +
  geom_point() +
  geom_smooth(aes(color=gender))

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

GG_save("figures/age_gender_executive.png")

## `geom_smooth()` using method = 'gam'

## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

Missing data

.cols = c(4:8,11:22)
matrixplot2(names.df[.cols]) #this is to avoid the character vars

names.df = names.df[order(names.df$number),] #sort by number
miss_by_var(names.df)

##                 id               name             gender 
##                  0                  0                  0 
##             unisex             number                age 
##                  0                  0                  0 
##          own.place         rent.place         work.types 
##                455                455                457 
##            live.in            married             single 
##                  0                455                455 
##      live.together    reg.partnership             no.job 
##                455                455                526 
##           employee outside.job.market            student 
##                455                455                455 
##        independent          executive         conviction 
##                455                455                  0 
##             income           pop.2015           pop.2014 
##                526                  2                  2

names.df$NA.num = miss_by_case(names.df)
table(miss_by_case(names.df[.cols]))

## 
##    0    2   13 
## 1832   71  455

miss_plot(names.df[.cols])

GG_save("figures/miss_histo_all.png")

Main analysis

We carry out a number of analyses using various subgroups.

The sexes together.

#S factor
s.data = subset(names.df, select=c("no.job","own.place","married",
                                   "conviction", "income",
                                   "age"))

#missing data
if (F){ #run manually when needed
  matrixplot2(s.data)
  matrixplot(s.data)
  miss.table(s.data)
  miss.hist(s.data)
  GG_save("figures/missing_histogram.png")
}

#impute data
s.data = s.data[miss_by_case(s.data) <= 2, ] #subset to cases with 0 or 1 NA
s.data = miss_impute(s.data, noise = F) #impute data

#residualize
s.data.r = s.data #copy
s.data.r$age2 = s.data$age^2
s.data.r$age3 = s.data$age^3
s.data.r = df_residualize(s.data.r, resid.var = c("age","age2","age3")) #remove effect of age on indicators
s.data.r$age2 = NULL #remove sq and cube
s.data.r$age3 = NULL

#correlations
cors.B = cor(s.data); round(cors.B,2)

##            no.job own.place married conviction income   age
## no.job       1.00     -0.50    0.18       0.42  -0.21 -0.28
## own.place   -0.50      1.00    0.43      -0.44   0.73  0.58
## married      0.18      0.43    1.00      -0.13   0.45  0.57
## conviction   0.42     -0.44   -0.13       1.00  -0.12 -0.26
## income      -0.21      0.73    0.45      -0.12   1.00  0.48
## age         -0.28      0.58    0.57      -0.26   0.48  1.00

cors2.B = cor(s.data.r); round(cors2.B,2)

##            no.job own.place married conviction income age
## no.job       1.00     -0.43    0.43       0.38  -0.09   0
## own.place   -0.43      1.00    0.15      -0.37   0.63   0
## married      0.43      0.15    1.00       0.02   0.24   0
## conviction   0.38     -0.37    0.02       1.00   0.01   0
## income      -0.09      0.63    0.24       0.01   1.00   0
## age          0.00      0.00    0.00       0.00   0.00   1

#write
cors3.B = combine_upperlower(cors.B, cors2.B)
write_csv(cors3.B %>% as.data.frame, "results/S_both_cors.csv", na = "")

#remove age
s.data$age = NULL
s.data.r$age = NULL

#factor analyze
s.fa.B = fa(s.data);s.fa.B #without removing effect of age

## Factor Analysis using method =  minres
## Call: fa(r = s.data)
## Standardized loadings (pattern matrix) based upon correlation matrix
##              MR1   h2    u2 com
## no.job     -0.50 0.25 0.755   1
## own.place   1.00 1.00 0.005   1
## married     0.43 0.19 0.813   1
## conviction -0.44 0.19 0.809   1
## income      0.73 0.54 0.463   1
## 
##                 MR1
## SS loadings    2.16
## Proportion Var 0.43
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  2 with Chi Square of  3810
## The degrees of freedom for the model are 5  and the objective function was  0.54 
## 
## The root mean square of the residuals (RMSR) is  0.17 
## The df corrected root mean square of the residuals is  0.24 
## 
## The harmonic number of observations is  1903 with the empirical chi square  1083  with prob <  6.6e-232 
## The total number of observations was  1903  with Likelihood Chi Square =  1029  with prob <  3.3e-220 
## 
## Tucker Lewis Index of factoring reliability =  0.46
## RMSEA index =  0.33  and the 90 % confidence intervals are  0.31 0.34
## BIC =  991
## Fit based upon off diagonal values = 0.83
## Measures of factor score adequacy             
##                                                 MR1
## Correlation of scores with factors             1.00
## Multiple R square of scores with factors       1.00
## Minimum correlation of possible factor scores  0.99

s.fa2.B = fa(s.data.r);s.fa2.B #removing effect of age

## Factor Analysis using method =  minres
## Call: fa(r = s.data.r)
## Standardized loadings (pattern matrix) based upon correlation matrix
##              MR1    h2    u2 com
## no.job     -0.43 0.184 0.816   1
## own.place   1.00 0.995 0.005   1
## married     0.15 0.021 0.979   1
## conviction -0.37 0.134 0.866   1
## income      0.63 0.402 0.598   1
## 
##                 MR1
## SS loadings    1.74
## Proportion Var 0.35
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  1.5 with Chi Square of  2819
## The degrees of freedom for the model are 5  and the objective function was  0.6 
## 
## The root mean square of the residuals (RMSR) is  0.2 
## The df corrected root mean square of the residuals is  0.29 
## 
## The harmonic number of observations is  1903 with the empirical chi square  1562  with prob <  0 
## The total number of observations was  1903  with Likelihood Chi Square =  1147  with prob <  7.5e-246 
## 
## Tucker Lewis Index of factoring reliability =  0.19
## RMSEA index =  0.35  and the 90 % confidence intervals are  0.33 0.35
## BIC =  1110
## Fit based upon off diagonal values = 0.64
## Measures of factor score adequacy             
##                                                 MR1
## Correlation of scores with factors             1.00
## Multiple R square of scores with factors       1.00
## Minimum correlation of possible factor scores  0.99

#get scores
scores = data.frame(matrix(nrow=nrow(s.data), ncol=0))
scores$s.scores = s.fa.B$scores
scores$s.scores2 = s.fa2.B$scores
colnames(scores) = c("S.both", "S.both.no.age")
rownames(scores) = rownames(s.fa.B$scores)
dimnames(scores$S) = NULL
dimnames(scores$S.no.age) = NULL
wtd.cors(scores)

##               S.both S.both.no.age
## S.both          1.00          0.81
## S.both.no.age   0.81          1.00

#insert into main df
names.df = merge_datasets(names.df, scores)

Men only.

#S factor
s.data = subset(names.df, select=c("no.job","own.place","married",
                                   "conviction", "income",
                                   "age"))
s.data = subset(s.data, names.df$gender=="Male")

#impute data
s.data = s.data[miss_by_case(s.data)<=2,] #subset to cases with 0 or 1 NA
s.data = miss_impute(s.data, noise = F) #impute data

#residualize
s.data.r = s.data #copy
s.data.r$age2 = s.data$age^2
s.data.r$age3 = s.data$age^3
s.data.r = df_residualize(s.data.r, resid.var = c("age","age2","age3")) #remove effect of age on indicators
s.data.r$age2 = NULL #remove sq and cube
s.data.r$age3 = NULL

#correlations
cors.M = cor(s.data);round(cors.M,2)

##            no.job own.place married conviction income   age
## no.job       1.00     -0.53    0.03       0.63  -0.29 -0.21
## own.place   -0.53      1.00    0.54      -0.59   0.78  0.63
## married      0.03      0.54    1.00      -0.29   0.46  0.71
## conviction   0.63     -0.59   -0.29       1.00  -0.40 -0.35
## income      -0.29      0.78    0.46      -0.40   1.00  0.60
## age         -0.21      0.63    0.71      -0.35   0.60  1.00

cors2.M = cor(s.data.r);round(cors2.M,2)

##            no.job own.place married conviction income age
## no.job       1.00     -0.52    0.26       0.61  -0.22   0
## own.place   -0.52      1.00    0.18      -0.51   0.65   0
## married      0.26      0.18    1.00      -0.06   0.06   0
## conviction   0.61     -0.51   -0.06       1.00  -0.26   0
## income      -0.22      0.65    0.06      -0.26   1.00   0
## age          0.00      0.00    0.00       0.00   0.00   1

#write
cors3.M = combine_upperlower(cors.M, cors2.M)
write_csv(cors3.M %>% as.data.frame, "results/S_men_cors.csv", na = "")

#remove age
s.data$age = NULL
s.data.r$age = NULL

#factor analyze
s.fa.M = fa(s.data);s.fa.M #without removing effect of age

## Factor Analysis using method =  minres
## Call: fa(r = s.data)
## Standardized loadings (pattern matrix) based upon correlation matrix
##              MR1   h2    u2 com
## no.job     -0.53 0.28 0.721   1
## own.place   1.00 1.00 0.005   1
## married     0.55 0.30 0.703   1
## conviction -0.60 0.36 0.645   1
## income      0.78 0.61 0.390   1
## 
##                 MR1
## SS loadings    2.54
## Proportion Var 0.51
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  2.6 with Chi Square of  2273
## The degrees of freedom for the model are 5  and the objective function was  0.55 
## 
## The root mean square of the residuals (RMSR) is  0.15 
## The df corrected root mean square of the residuals is  0.21 
## 
## The harmonic number of observations is  877 with the empirical chi square  390  with prob <  4.3e-82 
## The total number of observations was  877  with Likelihood Chi Square =  484  with prob <  2.2e-102 
## 
## Tucker Lewis Index of factoring reliability =  0.58
## RMSEA index =  0.33  and the 90 % confidence intervals are  0.3 0.34
## BIC =  450
## Fit based upon off diagonal values = 0.91
## Measures of factor score adequacy             
##                                                 MR1
## Correlation of scores with factors             1.00
## Multiple R square of scores with factors       1.00
## Minimum correlation of possible factor scores  0.99

s.fa2.M = fa(s.data.r);s.fa2.M #without removing effect of age

## Factor Analysis using method =  minres
## Call: fa(r = s.data.r)
## Standardized loadings (pattern matrix) based upon correlation matrix
##              MR1    h2   u2 com
## no.job     -0.59 0.347 0.65   1
## own.place   0.91 0.834 0.17   1
## married     0.13 0.016 0.98   1
## conviction -0.59 0.354 0.65   1
## income      0.65 0.418 0.58   1
## 
##                 MR1
## SS loadings    1.97
## Proportion Var 0.39
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  1.8 with Chi Square of  1525
## The degrees of freedom for the model are 5  and the objective function was  0.57 
## 
## The root mean square of the residuals (RMSR) is  0.15 
## The df corrected root mean square of the residuals is  0.21 
## 
## The harmonic number of observations is  877 with the empirical chi square  405  with prob <  3e-85 
## The total number of observations was  877  with Likelihood Chi Square =  501  with prob <  4.6e-106 
## 
## Tucker Lewis Index of factoring reliability =  0.34
## RMSEA index =  0.34  and the 90 % confidence intervals are  0.31 0.35
## BIC =  467
## Fit based upon off diagonal values = 0.85
## Measures of factor score adequacy             
##                                                 MR1
## Correlation of scores with factors             0.93
## Multiple R square of scores with factors       0.87
## Minimum correlation of possible factor scores  0.74

#get scores
scores = data.frame(matrix(nrow=nrow(s.data), ncol=0))
scores$s.scores = s.fa.M$scores
scores$s.scores2 = s.fa2.M$scores
colnames(scores) = c("S.men", "S.men.no.age")
rownames(scores) = rownames(s.fa.M$scores)
dimnames(scores$S) = NULL
dimnames(scores$S.no.age) = NULL
wtd.cors(scores)

##              S.men S.men.no.age
## S.men         1.00         0.76
## S.men.no.age  0.76         1.00

#insert into main df
names.df = merge_datasets(names.df, scores)

Women only.

#S factor
s.data = subset(names.df, select=c("no.job","own.place","married",
                                   "conviction", "income",
                                   "age"))
s.data = subset(s.data, names.df$gender=="Female")

#impute data
s.data = s.data[miss_by_case(s.data)<=2,] #subset to cases with 0 or 1 NA
s.data = miss_impute(s.data, noise = F) #impute data

#residualize
s.data.r = s.data #copy
s.data.r$age2 = s.data$age^2
s.data.r$age3 = s.data$age^3
s.data.r = df_residualize(s.data.r, resid.var = c("age","age2","age3")) #remove effect of age on indicators
s.data.r$age2 = NULL #remove sq and cube
s.data.r$age3 = NULL

#correlations
cors.F = cor(s.data);round(cors.F,2)

##            no.job own.place married conviction income   age
## no.job       1.00     -0.47    0.34       0.42  -0.12 -0.33
## own.place   -0.47      1.00    0.27      -0.41   0.74  0.56
## married      0.34      0.27    1.00      -0.06   0.39  0.46
## conviction   0.42     -0.41   -0.06       1.00  -0.18 -0.28
## income      -0.12      0.74    0.39      -0.18   1.00  0.45
## age         -0.33      0.56    0.46      -0.28   0.45  1.00

cors2.F = cor(s.data.r);round(cors2.F,2)

##            no.job own.place married conviction income age
## no.job       1.00     -0.37    0.59       0.37   0.03   0
## own.place   -0.37      1.00    0.02      -0.32   0.66   0
## married      0.59      0.02    1.00       0.08   0.24   0
## conviction   0.37     -0.32    0.08       1.00  -0.07   0
## income       0.03      0.66    0.24      -0.07   1.00   0
## age          0.00      0.00    0.00       0.00   0.00   1

#write
cors3.F = combine_upperlower(cors.F, cors2.F)
write_csv(cors3.F %>% as.data.frame, "results/S_women_cors.csv", na = "")

#remove age
s.data$age = NULL
s.data.r$age = NULL

#factor analyze
s.fa.F = fa(s.data);s.fa.F #without removing effect of age

## Factor Analysis using method =  minres
## Call: fa(r = s.data)
## Standardized loadings (pattern matrix) based upon correlation matrix
##              MR1    h2    u2 com
## no.job     -0.47 0.221 0.779   1
## own.place   1.00 0.995 0.005   1
## married     0.27 0.072 0.928   1
## conviction -0.41 0.169 0.831   1
## income      0.74 0.544 0.456   1
## 
##                MR1
## SS loadings    2.0
## Proportion Var 0.4
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  1.9 with Chi Square of  1979
## The degrees of freedom for the model are 5  and the objective function was  0.65 
## 
## The root mean square of the residuals (RMSR) is  0.19 
## The df corrected root mean square of the residuals is  0.28 
## 
## The harmonic number of observations is  1026 with the empirical chi square  777  with prob <  1.3e-165 
## The total number of observations was  1026  with Likelihood Chi Square =  660  with prob <  1.8e-140 
## 
## Tucker Lewis Index of factoring reliability =  0.33
## RMSEA index =  0.36  and the 90 % confidence intervals are  0.33 0.37
## BIC =  626
## Fit based upon off diagonal values = 0.75
## Measures of factor score adequacy             
##                                                 MR1
## Correlation of scores with factors             1.00
## Multiple R square of scores with factors       1.00
## Minimum correlation of possible factor scores  0.99

s.fa2.F = fa(s.data.r);s.fa2.F #without removing effect of age

## Factor Analysis using method =  minres
## Call: fa(r = s.data.r)
## Standardized loadings (pattern matrix) based upon correlation matrix
##              MR1      h2    u2 com
## no.job     -0.37 0.13360 0.866   1
## own.place   1.00 0.99501 0.005   1
## married     0.02 0.00024 1.000   1
## conviction -0.32 0.10281 0.897   1
## income      0.66 0.43225 0.568   1
## 
##                 MR1
## SS loadings    1.66
## Proportion Var 0.33
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  10  and the objective function was  1.6 with Chi Square of  1649
## The degrees of freedom for the model are 5  and the objective function was  0.8 
## 
## The root mean square of the residuals (RMSR) is  0.24 
## The df corrected root mean square of the residuals is  0.34 
## 
## The harmonic number of observations is  1026 with the empirical chi square  1159  with prob <  2.1e-248 
## The total number of observations was  1026  with Likelihood Chi Square =  816  with prob <  4.5e-174 
## 
## Tucker Lewis Index of factoring reliability =  0.01
## RMSEA index =  0.4  and the 90 % confidence intervals are  0.37 0.41
## BIC =  781
## Fit based upon off diagonal values = 0.53
## Measures of factor score adequacy             
##                                                 MR1
## Correlation of scores with factors             1.00
## Multiple R square of scores with factors       1.00
## Minimum correlation of possible factor scores  0.99

#get scores
scores = data.frame(matrix(nrow=nrow(s.data), ncol=0))
scores$s.scores = s.fa.F$scores
scores$s.scores2 = s.fa2.F$scores
colnames(scores) = c("S.women", "S.women.no.age")
rownames(scores) = rownames(s.fa.F$scores)
dimnames(scores$S) = NULL
dimnames(scores$S.no.age) = NULL
wtd.cors(scores)

##                S.women S.women.no.age
## S.women           1.00           0.83
## S.women.no.age    0.83           1.00

#insert into main df
names.df = merge_datasets(names.df, scores)

Both together in age bins.

## S FACTOR WITHIN AGE BINS, BOTH GENDERS

#fetch data
s.data = subset(names.df, select=c("no.job","own.place","married",
                                   "conviction", "income",
                                   "age"))

#impute data
s.data = s.data[miss_by_case(s.data)<=2,] #subset to cases with 0 or 1 NA
s.data = miss_impute(s.data, noise = F) #impute data

#the databins
s.data.bins = list()
starts = seq(20, 50, 5)
for (idx in seq_along(starts)) {
  element.name = str_c("age.", starts[idx], ".", (starts[idx]+5))
  print(element.name)
  s.data.bins[[element.name]] = s.data[s.data$age >= starts[idx] & s.data$age <= (starts[idx]+5), -6]
}

## [1] "age.20.25"
## [1] "age.25.30"
## [1] "age.30.35"
## [1] "age.35.40"
## [1] "age.40.45"
## [1] "age.45.50"
## [1] "age.50.55"

#fa objects
fa.bins = llply(.data = s.data.bins, .fun = function(x) {
  print(cor(x, use="pair"))
  print(table(miss_by_case(x)))
  fa(x)
})

##            no.job own.place married conviction  income
## no.job       1.00     -0.51  0.4575      0.244 -0.3066
## own.place   -0.51      1.00 -0.1504     -0.208  0.6476
## married      0.46     -0.15  1.0000     -0.067 -0.0049
## conviction   0.24     -0.21 -0.0670      1.000 -0.0926
## income      -0.31      0.65 -0.0049     -0.093  1.0000
## 
##   0 
## 201 
##            no.job own.place  married conviction   income
## no.job       1.00    -0.362  0.52955      0.349 -0.23665
## own.place   -0.36     1.000 -0.06677     -0.341  0.77357
## married      0.53    -0.067  1.00000      0.028  0.00023
## conviction   0.35    -0.341  0.02815      1.000 -0.11085
## income      -0.24     0.774  0.00023     -0.111  1.00000
## 
##   0 
## 254 
##            no.job own.place married conviction income
## no.job       1.00    -0.377   0.578      0.268 -0.377
## own.place   -0.38     1.000  -0.078     -0.326  0.755
## married      0.58    -0.078   1.000     -0.063 -0.222
## conviction   0.27    -0.326  -0.063      1.000 -0.095
## income      -0.38     0.755  -0.222     -0.095  1.000
## 
##   0 
## 271 
##            no.job own.place married conviction income
## no.job       1.00     -0.49   0.570      0.327  -0.42
## own.place   -0.49      1.00  -0.189     -0.449   0.75
## married      0.57     -0.19   1.000      0.059  -0.25
## conviction   0.33     -0.45   0.059      1.000  -0.17
## income      -0.42      0.75  -0.254     -0.166   1.00
## 
##   0 
## 253 
##            no.job own.place married conviction income
## no.job       1.00     -0.60   0.486      0.281 -0.430
## own.place   -0.60      1.00  -0.185     -0.397  0.746
## married      0.49     -0.19   1.000      0.042 -0.090
## conviction   0.28     -0.40   0.042      1.000 -0.092
## income      -0.43      0.75  -0.090     -0.092  1.000
## 
##   0 
## 194 
##            no.job own.place married conviction income
## no.job       1.00     -0.40   0.161      0.367 -0.270
## own.place   -0.40      1.00   0.211     -0.256  0.671
## married      0.16      0.21   1.000     -0.042  0.203
## conviction   0.37     -0.26  -0.042      1.000  0.071
## income      -0.27      0.67   0.203      0.071  1.000
## 
##   0 
## 161 
##            no.job own.place married conviction income
## no.job      1.000    -0.321  -0.081      0.238 -0.096
## own.place  -0.321     1.000   0.718     -0.014  0.653
## married    -0.081     0.718   1.000      0.134  0.525
## conviction  0.238    -0.014   0.134      1.000  0.240
## income     -0.096     0.653   0.525      0.240  1.000
## 
##   0 
## 139

#plot
fa_plot_loadings(fa.bins, fa_labels = names(fa.bins))

GG_save("figures/S_agebin_both.png")

#factor sizes
fa.sizes.B = ldply(.data = s.data.bins, .fun = function(x) {
  mean(fa(x)$communality)
})


# males only --------------------------------------------------------------

#fetch data
s.data = subset(names.df, select=c("no.job","own.place","married",
                                   "conviction", "income",
                                   "age"))
s.data = s.data[names.df$gender == "Male",] #males only

#impute data
s.data = s.data[miss_by_case(s.data) <= 2,] #subset to cases with 0 or 1 NA
s.data = miss_impute(s.data, noise = F) #impute data

#the databins
s.data.bins = list()
starts = seq(20, 50, 5)
for (idx in seq_along(starts)) {
  element.name = str_c("age.", starts[idx], ".", (starts[idx]+5))
  print(element.name)
  s.data.bins[[element.name]] = s.data[s.data$age >= starts[idx] & s.data$age <= (starts[idx]+5), -6]
}

## [1] "age.20.25"
## [1] "age.25.30"
## [1] "age.30.35"
## [1] "age.35.40"
## [1] "age.40.45"
## [1] "age.45.50"
## [1] "age.50.55"

#fa objects
fa.bins = llply(.data = s.data.bins, .fun = function(x) {
  print(cor(x, use="pair"))
  print(table(miss_by_case(x)))
  fa(x)
})

##            no.job own.place married conviction income
## no.job       1.00    -0.517   0.404       0.39 -0.396
## own.place   -0.52     1.000   0.057      -0.35  0.701
## married      0.40     0.057   1.000      -0.08  0.068
## conviction   0.39    -0.346  -0.080       1.00 -0.554
## income      -0.40     0.701   0.068      -0.55  1.000
## 
##  0 
## 76 
##            no.job own.place married conviction income
## no.job       1.00     -0.38   0.512      0.578  -0.28
## own.place   -0.38      1.00   0.107     -0.348   0.86
## married      0.51      0.11   1.000      0.097   0.12
## conviction   0.58     -0.35   0.097      1.000  -0.33
## income      -0.28      0.86   0.121     -0.334   1.00
## 
##   0 
## 120 
##            no.job own.place married conviction income
## no.job       1.00     -0.41   0.501       0.59 -0.507
## own.place   -0.41      1.00   0.108      -0.37  0.883
## married      0.50      0.11   1.000       0.13 -0.032
## conviction   0.59     -0.37   0.129       1.00 -0.452
## income      -0.51      0.88  -0.032      -0.45  1.000
## 
##   0 
## 126 
##            no.job own.place married conviction income
## no.job       1.00    -0.469   0.503      0.446  -0.49
## own.place   -0.47     1.000  -0.099     -0.471   0.82
## married      0.50    -0.099   1.000      0.077  -0.24
## conviction   0.45    -0.471   0.077      1.000  -0.39
## income      -0.49     0.818  -0.239     -0.389   1.00
## 
##   0 
## 131 
##            no.job own.place married conviction income
## no.job       1.00    -0.604   0.236      0.550  -0.53
## own.place   -0.60     1.000  -0.072     -0.573   0.87
## married      0.24    -0.072   1.000      0.022  -0.11
## conviction   0.55    -0.573   0.022      1.000  -0.46
## income      -0.53     0.873  -0.115     -0.461   1.00
## 
##  0 
## 93 
##            no.job own.place married conviction income
## no.job       1.00     -0.37   -0.12       0.68  -0.29
## own.place   -0.37      1.00    0.28      -0.47   0.78
## married     -0.12      0.28    1.00      -0.30   0.11
## conviction   0.68     -0.47   -0.30       1.00  -0.39
## income      -0.29      0.78    0.11      -0.39   1.00
## 
##  0 
## 73 
##            no.job own.place married conviction income
## no.job       1.00     -0.44   -0.29       0.43  -0.17
## own.place   -0.44      1.00    0.69      -0.16   0.65
## married     -0.29      0.69    1.00      -0.13   0.32
## conviction   0.43     -0.16   -0.13       1.00  -0.07
## income      -0.17      0.65    0.32      -0.07   1.00
## 
##  0 
## 61

#plot
fa_plot_loadings(fa.bins, fa_labels = names(fa.bins))

GG_save("figures/S_agebin_males.png")

#factor sizes
fa.sizes.M = ldply(.data = s.data.bins, .fun = function(x) {
  mean(fa(x)$communality)
})

# females only ------------------------------------------------------------

#fetch data
s.data = subset(names.df, select=c("no.job","own.place","married",
                                   "conviction", "income",
                                   "age"))
s.data = s.data[names.df$gender == "Female",] #males only

#impute data
s.data = s.data[miss_by_case(s.data) <= 2,] #subset to cases with 0 or 1 NA
s.data = miss_impute(s.data, noise = F) #impute data

#the databins
s.data.bins = list()
starts = seq(20, 50, 5)
for (idx in seq_along(starts)) {
  element.name = str_c("age.", starts[idx], ".", (starts[idx]+5))
  print(element.name)
  s.data.bins[[element.name]] = s.data[s.data$age >= starts[idx] & s.data$age <= (starts[idx]+5), -6]
}

## [1] "age.20.25"
## [1] "age.25.30"
## [1] "age.30.35"
## [1] "age.35.40"
## [1] "age.40.45"
## [1] "age.45.50"
## [1] "age.50.55"

#factor analyses for plotting
fa.bins = llply(.data = s.data.bins, .fun = function(x) {
  print(cor(x, use="pair"))
  print(table(miss_by_case(x)))
  fa(x)
})

##            no.job own.place married conviction income
## no.job       1.00     -0.50   0.508      0.372 -0.224
## own.place   -0.50      1.00  -0.306     -0.350  0.661
## married      0.51     -0.31   1.000     -0.073 -0.051
## conviction   0.37     -0.35  -0.073      1.000 -0.358
## income      -0.22      0.66  -0.051     -0.358  1.000
## 
##   0 
## 125 
##            no.job own.place married conviction income
## no.job       1.00     -0.35   0.555      0.328  -0.16
## own.place   -0.35      1.00  -0.357     -0.391   0.77
## married      0.55     -0.36   1.000      0.092  -0.17
## conviction   0.33     -0.39   0.092      1.000  -0.29
## income      -0.16      0.77  -0.171     -0.285   1.00
## 
##   0 
## 134 
##            no.job own.place married conviction income
## no.job       1.00     -0.38  0.6497     0.2177  -0.23
## own.place   -0.38      1.00 -0.3843    -0.2650   0.76
## married      0.65     -0.38  1.0000     0.0067  -0.37
## conviction   0.22     -0.27  0.0067     1.0000  -0.18
## income      -0.23      0.76 -0.3696    -0.1773   1.00
## 
##   0 
## 145 
##            no.job own.place married conviction income
## no.job       1.00     -0.51    0.67       0.28  -0.38
## own.place   -0.51      1.00   -0.34      -0.39   0.83
## married      0.67     -0.34    1.00       0.14  -0.30
## conviction   0.28     -0.39    0.14       1.00  -0.29
## income      -0.38      0.83   -0.30      -0.29   1.00
## 
##   0 
## 122 
##            no.job own.place married conviction income
## no.job       1.00     -0.64   0.690      0.186  -0.39
## own.place   -0.64      1.00  -0.327     -0.191   0.70
## married      0.69     -0.33   1.000      0.037  -0.10
## conviction   0.19     -0.19   0.037      1.000  -0.10
## income      -0.39      0.70  -0.100     -0.104   1.00
## 
##   0 
## 101 
##            no.job own.place married conviction income
## no.job       1.00     -0.43   0.397      0.336  -0.31
## own.place   -0.43      1.00   0.128     -0.214   0.67
## married      0.40      0.13   1.000      0.047   0.19
## conviction   0.34     -0.21   0.047      1.000  -0.23
## income      -0.31      0.67   0.192     -0.230   1.00
## 
##  0 
## 88 
##             no.job own.place married conviction  income
## no.job      1.0000     -0.20   0.132      0.245 -0.0082
## own.place  -0.2023      1.00   0.729     -0.137  0.7070
## married     0.1316      0.73   1.000      0.046  0.6301
## conviction  0.2447     -0.14   0.046      1.000 -0.2142
## income     -0.0082      0.71   0.630     -0.214  1.0000
## 
##  0 
## 78

#plot
fa_plot_loadings(fa.bins, fa_labels =  names(fa.bins), reverse = c(1, 1, 1, 1, -1, 1, 1))

GG_save("figures/S_agebin_females.png")

#factor sizes
fa.sizes.F = ldply(.data = s.data.bins, .fun = function(x) {
  mean(fa(x)$communality)
})


# plot factor size and age bin --------------------------------------------

fa.sizes = rbind(fa.sizes.B,
                 fa.sizes.M,
                 fa.sizes.F)
fa.sizes$group = c(rep("Both", nrow(fa.sizes.B)),
                   rep("Male", nrow(fa.sizes.M)),
                   rep("Female", nrow(fa.sizes.F)))

#plot
ggplot(data = fa.sizes, aes(x = .id, y = V1, color = group, group =  group)) +
  geom_line(stat="identity", position = "dodge" ) +
  xlab("Age bin") + ylab("Factor % of variance")

## Warning: Width not defined. Set with `position_dodge(width = ?)`

GG_save("figures/S_ageBins_factor_sizes.png")

## Warning: Width not defined. Set with `position_dodge(width = ?)`

#loadings plot
fa.list = list(s.fa.B, s.fa.M, s.fa.F, s.fa2.B, s.fa2.M, s.fa2.F)
fa.names = c("S.Both", "S.Male", "S.Female","S.BothNA", "S.MaleNA", "S.FemaleNA")
fa_plot_loadings(fa.list, fa.names)

GG_save("figures/loadings.png")

Data output

For reuse.

#loadings plot
fa.list = list(s.fa.B, s.fa.M, s.fa.F, s.fa2.B, s.fa2.M, s.fa2.F)
fa.names = c("S.Both", "S.Male", "S.Female","S.BothNA", "S.MaleNA", "S.FemaleNA")
fa_plot_loadings(fa.list, fa.names)

GG_save("figures/loadings.png")

#save var%
var.percents = ldply(fa.list, function(x) {
  return(mean(x$communality))
})
colnames(var.percents) = "Var%"
rownames(var.percents) = fa.names
write_csv(var.percents %>% as.data.frame, "results/var_percents.csv")

#delta correlation plot
cors3.delta = cors3.M - cors3.F #male minus female
write_csv(cors3.delta %>% as.data.frame, "results/S_delta_cors.csv" , na="")

#S scores
s.scores = names.df[, str_detect(colnames(names.df), "S.")]
s.scores.cors = wtd.cors(s.scores)
write_csv(round(s.scores.cors,2) %>% as.data.frame, "results/S_score_cors.csv", na = "")


# main data ---------------------------------------------------------------

write_csv(names.df, "data_for_reuse.csv", na = "")
write_rds(names.df, "data_for_reuse.rds")

What is a good name? The S factor in Denmark at the name-level