## Purpose flags: toggle which stages of this notebook are run
purp.gather   <- TRUE
purp.analysis <- FALSE
purp.debug    <- FALSE
purp.mcmc     <- FALSE
library(tidyverse)
library(psych)
library(mascutils)
library(printr)
#library(lavaan)
options(mc.cores = 5)


if(!purp.gather) {load("D_1.Rda")} ## when not gathering anew, reuse the prepared data set


## Functions

## Collapse the Part x Design x Item response cuboid into a psychometric
## Part x Item response matrix by averaging over designs
rm_psycho <- function(Data) 
  Data %>% 
  group_by(Part, Item) %>% 
  summarize(mean_resp = mean(response)) %>% 
  ungroup() %>% 
  arrange(Item) %>% 
  spread(Item, value = mean_resp) %>% 
  select(-Part)

## Collapse the cuboid into a designometric Design x Item response
## matrix by averaging over persons
rm_design <- function(Data) 
  Data %>% 
  group_by(Design, Item) %>%
  summarize(mean_resp = mean(response)) %>% 
  ungroup() %>% 
  spread(Item, value = mean_resp) %>% 
  select(-Design)

## Bootstrapped standardized Cronbach alpha with 95% CI under both
## perspectives; assumes Data holds the responses of a single scale
alpha_ci <- function(Data){
  Scale <- str_c(distinct(Data, Scale)$Scale) 
  model_psych <- 
    psych::alpha(rm_psycho(Data), check.keys = FALSE, n.iter = 100)$boot %>% 
    as_tibble() %>% 
    mutate(Perspective = "psychometric")
  model_design <- 
    psych::alpha(rm_design(Data), check.keys = FALSE, n.iter = 100)$boot %>% 
    as_tibble() %>% 
    mutate(Perspective = "designometric")
  out <- 
    bind_rows(model_psych,
              model_design) %>% 
    select(Perspective, std.alpha) %>% 
    group_by(Perspective) %>% 
    summarize(center = mean(std.alpha),
              lower = quantile(std.alpha, .025),
              upper = quantile(std.alpha, .975)) %>% 
    mutate(Scale = Scale) %>% 
    go_first(Scale, Perspective)
  out
}

# D_1 %>% 
#   filter(Scale == "Attractiveness") %>% 
#   alpha_ci()


## Item-whole statistics (from psych::alpha) under both perspectives;
## assumes Data holds the responses of a single scale
item_rel <- function(Data){
  Scale <- str_c(distinct(Data, Scale)$Scale)
  model_psych <- 
    psych::alpha(rm_psycho(Data), check.keys = FALSE)$item.stats %>% 
    as_tibble(rownames = "Item") %>% 
    mutate(Perspective = "psychometric")
  model_design <- 
    psych::alpha(rm_design(Data), check.keys = FALSE)$item.stats %>% 
    as_tibble(rownames = "Item") %>% 
    mutate(Perspective = "designometric")
  
  out <- 
    bind_rows(model_psych,
              model_design) %>% 
    mutate(Scale = Scale) %>% 
    go_first(Scale, Item, Perspective) %>% 
    arrange(Scale, Item, Perspective)
  out
}

# D_1 %>% 
#   filter(Scale == "Attractiveness") %>% 
#   item_rel()

1 The psychometric fallacy in design research

The ISO 9241-11 standard defines usability by three components: effectiveness, efficiency and satisfaction. The first two are rooted in a well-established human-performance perspective. The third, satisfaction, with its vaguely emotional frame of reference, remained poorly understood by many researchers and practitioners and was often reduced to "this you have to measure with a rating scale". Then the UX age dawned and painted a more detailed picture of elusive concepts, such as users' feelings (e.g. Eeriness), their aesthetic judgments and even their dreams (Hedonic Quality). It was a wonder to observe how the pale definition of user satisfaction had to make space for a big party. And everybody was bringing their own rating scales! (Bargas-Avila & Hornbæk, Old wine in new bottles)

In industrial practice, rating scales have their place as an always-available and cheap method for comparing designs and benchmarking all kinds of systems. In contrast to this convenience, developing a valid and reliable rating scale is a project no smaller than a doctoral thesis. However, most people in design research are primarily busy with learning something about designs, so most of these scales have been designed with minimal effort for a transient purpose. Only a few rating scale inventories pervade industry and research, and these have been designed with best psychometric effort (AttrakDiff, UEQ, TAM).

Psychometrics is the science of assigning meaningful numbers to persons. Traditionally, these numbers served to measure skills, such as mathematical intelligence or comprehension of language. With time, researchers became interested in more elusive properties of persons, such as how much they watch themselves in social situations (self-monitoring). Always devoted to the queen of the social sciences, design researchers adopted more or less sophisticated methods from psychometrics to improve and validate their shiny new rating scales.

Unfortunately, practically all of them failed to recognize that measuring a person with a rating scale is structurally different from measuring a design. The psychometric fallacy is to validate designometric rating scales as if they were psychometric. It comes in two forms, each rooted in this structural difference:

  1. A psychometric measurement is two-dimensional: a flat matrix of persons by items. The designometric perspective is three-dimensional: persons by items by designs. Psychometricians usually employ large person samples, as they must show that their instrument can adequately discern between persons. By analogy, a designometric rating scale claims to discern between designs, and validating such a claim requires a large sample of designs. The level 1 psychometric fallacy is to use just one or a few designs in the validation study.

  2. Designometric observations are three-way encounters that include the psychometric two-way encounter. That means you can create a psychometric response matrix from a designometric response cuboid by averaging over designs. In the same manner, you can create a designometric response matrix from the cuboid by averaging over persons, and that is the correct way (see the sketch below). The level 2 psychometric fallacy is to do the analysis on a psychometric response matrix rather than a designometric matrix.
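
Both collapsing operations take only a few lines of dplyr. The following sketch assumes a long-format tibble D with one row per Part-by-Design-by-Item encounter and a response column; the helper functions rm_psycho() and rm_design() defined above implement the same idea using spread():

## collapse the cuboid to a designometric Design x Item response matrix
RM_design <- D %>%
  group_by(Design, Item) %>%
  summarize(response = mean(response), .groups = "drop") %>%
  pivot_wider(names_from = Item, values_from = response)

## the psychometric Part x Item matrix is the same collapse over persons
RM_psycho <- D %>%
  group_by(Part, Item) %>%
  summarize(response = mean(response), .groups = "drop") %>%
  pivot_wider(names_from = Item, values_from = response)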

The aim of this study is to:

  • elaborate on how the psychometric fallacy can compromise the construction and validation of designometric rating scales,
  • provide examples of published rating scales with level 1 and level 2 psychometric fallacies, together with an informal inquiry into how pervasive the problem is in the literature and in practice, and
  • make a first exploration of the actual consequences of the psychometric fallacy using real data.

1.1 Psychometrics

A typical psychometric measurement situation arises when person attributes are assessed by a set of ordinal-scaled items, such as the following:

1.1.0.1 [HERE]

a set of tasks (i.e. items), where the response \(y_{ij}\) on any encounter between person \(i\) and item \(j\) is either correct (\(y_{ij} = 1\)) or incorrect (\(y_{ij} = 0\)). A test validation study for a Rasch scale therefore results in a dichotomous response matrix.
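
As a minimal sketch with randomly drawn person and item parameters (not data from this study), such a dichotomous response matrix can be simulated from the Rasch model, where the log-odds of a correct response is the difference between person ability \(\theta_i\) and item difficulty \(\beta_j\):

set.seed(1)
theta <- rnorm(6)                         ## person abilities theta_i
beta  <- rnorm(4)                         ## item difficulties beta_j
P     <- plogis(outer(theta, beta, "-"))  ## P(y_ij = 1) = logit^-1(theta_i - beta_j)
Y     <- matrix(rbinom(length(P), 1, P), nrow = 6)  ## dichotomous Person x Item matrix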

1.1.1 Reliability

Recall that in CTT the measurement error is reduced by the law of large numbers. The more items are added to estimate the latent person variable \(\theta_i\), the more the common structure among the items dominates the noise, resulting in a more reliable measure. However, in designometric studies, researchers are interested in a latent design variable, so the law of large numbers has to operate on the design-by-item response matrix.
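
The Spearman-Brown prophecy formula makes the effect of adding items concrete. The snippet below is a plain transcription of the textbook formula, not a result derived from our data:

## Spearman-Brown: reliability of a scale lengthened by factor k,
## given the reliability rel_1 of the original scale
spearman_brown <- function(rel_1, k) k * rel_1 / (1 + (k - 1) * rel_1)
spearman_brown(.6, 2)  ## doubling the number of items yields 0.75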

1.1.2 Validity

1.1.3 Factor structures

1.2 Designometrics

1.2.1 Simulation on Reliability

When a scale validation study in design research falls into the psychometric fallacy by using a psychometric response matrix for reliability analysis, what is shown is that the scale reliably measures a person's tendency to judge websites beautiful or robot faces spine-tingling. This is obviously not the same as measuring a website's perceived beauty or a robot face's eeriness. The following example demonstrates the difference by simulating an extreme situation, where a fictive four-item scale of Coolness is highly reliable for persons, but has no reliability at all for discerning the tested designs. Such a pattern can occur for the trivial reason that the sampled designs have little or no variance with respect to Coolness. In the following simulation, we assume that the Coolness scale is tested on a sample of 20 undertaker company websites, with 20 participants.

set.seed(42)

n_Design = 20
n_Part   = 20
n_Item  =  4
n_Obs = n_Design * n_Part * n_Item

Designs <- tibble(Design      = as.factor(1:n_Design),
                  cool_Design = rnorm(n_Design, 0, .02)) ## little variance in Coolness

Parts   <- tibble(Part        = as.factor(1:n_Part),
                  cool_Part   = rnorm(n_Part, 0, .2)) ## strong variance in tendency to judge sth. cool

Items   <- tibble(Scale       = "Coolness",
                  Item        = as.factor(1:n_Item),
                  cool_Item   = rnorm(n_Item,  0, .2)) ## item strength: understating items get lower values

Coolness     <- expand_grid(Design = Designs$Design,
                       Part   = Parts$Part,
                       Item   = Items$Item) %>% 
  left_join(Designs) %>% 
  left_join(Parts) %>% 
  left_join(Items) %>% 
  mutate(response = mascutils::rescale_zero_one(cool_Design + cool_Part - cool_Item + rnorm(n_Obs, 0, .5)))
## Joining with `by = join_by(Design)`
## Joining with `by = join_by(Part)`
## Joining with `by = join_by(Item)`
Coolness %>% 
  ggplot(aes(y = response, x = Design)) +
  geom_violin()

Coolness %>% 
  ggplot(aes(y = response, x = Part)) +
  geom_violin()
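
The variance pattern behind the two plots can also be read off directly. The following is a quick descriptive check (its values are not reported here), contrasting the spread of the design means with the spread of the person means:

## spread of design means vs. spread of person means
Coolness %>%
  group_by(Design) %>%
  summarize(mean_resp = mean(response)) %>%
  summarize(sd_design_means = sd(mean_resp))

Coolness %>%
  group_by(Part) %>%
  summarize(mean_resp = mean(response)) %>%
  summarize(sd_part_means = sd(mean_resp))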

alpha_ci(Coolness)
## `summarise()` has grouped output by 'Part'. You can override using the
## `.groups` argument.
## Number of categories should be increased in order to count frequencies.
## `summarise()` has grouped output by 'Design'. You can override using the
## `.groups` argument.
## Number of categories should be increased in order to count frequencies.
## Warning in psych::alpha(rm_design(Data), check.keys = FALSE, n.iter = 100): Some items were negatively correlated with the total scale and probably 
## should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( 1 ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
Scale Perspective center lower upper
Coolness designometric 0.1179021 -0.5375240 0.4680096
Coolness psychometric 0.9307102 0.8793905 0.9598878

This simple example demonstrates that a scale can produce excellent reliability when measuring persons, but rather poor reliability on designs. In any study committing the psychometric fallacy, this could have happened to some degree and would go completely unnoticed. The way we constructed this simulation, producing a sample of designs with little difference in Coolness, also highlights the importance of carefully sampling the designs in a designometric validation study. In many classes of designs, we can expect some properties to vary strongly and others to be relatively stable across designs. In our example, undertaker websites probably do not differ much in how much they enthuse users, which makes them a poor sample for a Coolness scale, but they could still vary a lot in visual simplicity.

Still, falling into the psychometric fallacy does not necessarily mean that a scale is unreliable under the designometric perspective. It is not too unlikely that appreciating coolness (psychometric perspective) and discerning coolness (designometric perspective) share some mental processes and therefore result in sufficient reliability (or factor structure) under both perspectives. It is even possible that the real situation is the opposite of the previous simulation: persons vary little in appreciation, whereas designs vary strongly. In the following, we explore on several real designometric data sets how psychometric and designometric scale and item reliabilities compare.

2 Methods

From a theoretical perspective the psychometric fallacy is obvious, and we have demonstrated by simulation that the worst case is possible, but little is known about how the fallacy affects the quality of rating scales. Here, we explore only the basic psychometric qualities: scale consistency and item reliability.

2.1 Data sets

The data used for the analysis originates from six experiments (AH, DK, DN, PS, QB, SP). In all experiments, participants saw pictures of designs, such as websites (QB, SP) or robot faces (AH, DK, PS), and responded to one item at a time. The original experiments tested the influence of presentation times (generally between 17 ms and 5 s). For the analysis here, we only used responses at the longer presentation times (500 ms and above).

As in these experiments only single items were used per presented design, the designometric cuboid is very sparse. However, when collapsing the cuboid to either the psychometric or the designometric response matrix, the result is a completely filled matrix.
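
With the helpers defined above, this completeness can be verified per scale; the following is a quick sanity check, not part of the reported analysis:

## complete response matrices contain no missing cells
D_1 %>% filter(Scale == "HQI") %>% rm_psycho() %>% is.na() %>% sum()  ## expected: 0
D_1 %>% filter(Scale == "HQI") %>% rm_design() %>% is.na() %>% sum()  ## expected: 0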

## Harmonize column types across studies and rescale responses to the unit interval
norm_cols <- 
  function(Data) 
    Data %>% 
    mutate(Part = str_c(Study, as.character(Part)),
           Item = as.character(Item),
           Scale = as.character(Scale),
           Design = as.character(Design)) %>% 
    mutate(response = mascutils::rescale_unit(response)) %>% 
    arrange(Study, Part, Scale, Item, Anchor, Design)

## Reduce a study's data to the common set of columns
mini_cols <- 
  function(Data)
    Data %>% select(Study, Part, Scale, Item, Anchor, Design, response)
    

Items <- readxl::read_excel("Items.xlsx")

load("DK1.Rda")
DK <- 
  DK1 %>% 
  filter(Condition == "long") %>% 
  mutate(Study = "DK",
         Design = str_remove(Stimulus, "^c")) %>% 
  left_join(select(Items, Scale, Item, AnchorLow_EN, AnchorHigh_EN)) %>% 
  mutate(Anchor = str_c(AnchorHigh_EN, AnchorLow_EN, sep = " - ")) %>% 
  norm_cols()  
## Joining with `by = join_by(Item, Scale)`
load("PS.Rda")
PS <- 
  PS_1 %>% 
  filter(Condition == 2) %>% 
  mutate(Study = "PS",
         Design = Stimulus) %>% 
  left_join(select(Items, Scale, Item, AnchorLow_EN, AnchorHigh_EN)) %>% 
  mutate(Anchor = str_c(AnchorHigh_EN, AnchorLow_EN, sep = " - ")) %>% 
  norm_cols()
## Joining with `by = join_by(Item, Scale)`
load("AH.Rda")
## Warning: namespace 'MCMCglmm' is not available and has been replaced
## by .GlobalEnv when processing object 'M'
AH <- D$AH1 %>% 
  rename(Part = Participant,
         trial = Trial,
         Design = Face,
         Gender = sex) %>% 
  mutate(Study = "AH",
         Scale = "nEeriness",
         Item = str_c("n", Item),
         Gender = as.factor(Gender),
         response = 1 - mascutils::rescale_unit(response)) %>% 
  left_join(select(Items, Scale, Item, AnchorLow_EN, AnchorHigh_EN)) %>% 
  mutate(Anchor = str_c(AnchorHigh_EN, AnchorLow_EN, sep = " - ")) %>%  
  norm_cols()
## Joining with `by = join_by(Scale, Item)`
load("Tuch1.Rda")
QB <- 
  Tuch1 %>% 
  mutate(Anchor = str_c(AnchorLow, AnchorHi, sep = "_"),
         Study = "QB") %>%
  group_by(Scale) %>% 
  mutate(Item = str_c(Scale, as.integer(as.factor(Anchor)))) %>% 
  ungroup() %>% 
  filter(Condition == "0.5s") %>% 
  rename(Part = Subj, 
         Design = Stimulus, 
         response = Judgement) %>%  
  norm_cols()


load("Tuch2.Rda")
SP <- 
  Tuch2 %>% 
  rename(Part = Participant, Design = Stimulus, response = Response) %>% 
  mutate(Study = "SP") %>% 
  filter(Inventory == "UEQ" &
         Scale == "Attractiveness" &
         Condition %in% c("unlimited", "500ms")) %>% 
  mutate(response = mascutils::rescale_unit(response)) %>% 
  mutate(response = if_else(Item %in% c("Att3", "Att4"), 
                            1 - response, 
                            response)) %>% 
  left_join(select(Items, Scale, Item, AnchorLow_EN, AnchorHigh_EN)) %>% 
  mutate(Anchor = str_c(AnchorHigh_EN, AnchorLow_EN, sep = " - ")) %>% 
  norm_cols()
## Joining with `by = join_by(Scale, Item)`
load("DN.Rda")
DN <- 
  DN %>% 
  mutate(Study = "DN",
         response = mascutils::rescale_zero_one(Response),
         Anchor = "") %>% 
  rename(Part = subject_nr,
         Design = SSName) %>% 
  norm_cols()



D_1 <- bind_rows(mini_cols(PS), mini_cols(DK), mini_cols(QB), mini_cols(SP), mini_cols(DN), mini_cols(AH)) |> 
  mutate(Scale = if_else(Scale %in% c("beauty", "hedonism", "usability"), str_to_sentence(Scale), Scale))

D_Eer <- 
  D_1 %>% 
  filter(Scale == "nEeriness",
         str_detect(Item, "[[:digit:]]+")) ## only M&R stimuli

distinct(D_1, Scale, Study) |> 
  arrange(Scale)
Scale Study
Attractiveness SP
Beauty DN
Credib QB
HQI QB
HQS QB
Hedonism DN
Usability DN
nEeriness PS
nEeriness DK
nEeriness AH
D_Att <- 
  D_1 %>% 
  filter(Scale %in% c("HQI", "HQS", "Credib") )
  

D_HUB <- 
  D_1 %>% 
  filter(Study == "DN")

save(D_1, D_Eer, PS, DK, AH, QB, SP, DN, file = "DMX_data.Rda")
load("DMX_data.Rda")
D_1 %>% 
  ggplot(aes(x = response)) +
  geom_histogram() +
  facet_wrap(~Study, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

D_Eer %>% 
  ggplot(aes(x = Item, y = response, color = Study)) +
  geom_violin()

## Joining with `by = join_by(Study)`
## Joining with `by = join_by(Study)`
Study n_Design n_Part n_Obs
AH 20 45 10800
DK 80 35 2800
DN 48 42 8064
PS 87 39 2808
QB 76 25 1900
SP 66 40 1440
## Joining with `by = join_by(Scale)`
## Joining with `by = join_by(Scale)`
Scale n_Design n_Part n_Obs
Attractiveness 66 40 1440
Beauty 48 42 2688
Credib 76 25 500
HQI 76 25 700
HQS 76 25 700
Hedonism 48 42 2688
Usability 48 42 2688
nEeriness 127 119 16408

2.1.1 Scales

For the following rating scales, responses have been extracted from the original experimental data:

The Eeriness scale has been developed for measuring negative emotional responses towards robot faces and is primarily used in research on the Uncanny Valley phenomenon. Ho & MacDorman (2017) present an advanced psychometric validation of the scale. Their study made use of 12 animated characters (designs), avoiding the level 1 fallacy to some degree, but the data analysis was done under the psychometric perspective (level 2 fallacy).

The Attractiveness scale is part of the User Experience Questionnaire (UEQ) inventory. It has been validated by Laugwitz, Held, and Schrepp [Bettina Laugwitz, Theo Held, and Martin Schrepp. 2008. Construction and Evaluation of a User Experience Questionnaire. 63–76. https://doi.org/10.1007/978-3-540-89350-9_6]. The UEQ has undergone basic psychometric evaluation in six studies with a single design each (level 1 fallacy).

The two scales Hedonic Quality - Identity (HQI) and Hedonic Quality - Stimulation (HQS) are from the AttrakDiff2 inventory. AttrakDiff2 underwent basic evaluation using only three designs under the psychometric perspective (level 1 fallacy) [Hassenzahl, M., Burmester, M., Koller, F., AttrakDiff: Ein Fragebogen zur Messung wahrgenommener hedonischer und pragmatischer Qualität].

The Credibility scale … [HERE]

The following table gives an overview of the scales and their number of items per study:

## `summarise()` has grouped output by 'Study'. You can override using the
## `.groups` argument.
Study Scale n_Items
AH nEeriness 8
DK nEeriness 8
DN Beauty 4
DN Hedonism 4
DN Usability 4
PS nEeriness 8
QB Credib 5
QB HQI 7
QB HQS 7
SP Attractiveness 6

2.1.2 Data analysis

2.2 Results

2.2.1 Scale consistency

Scale_rel <-
  D_1 %>% 
  #mutate(Scale = str_c(Study, Scale, sep = "_")) %>% 
  split(.$Scale) %>% 
  map_df(alpha_ci)
## Some items ( Att4 Att6 ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
Scale_rel
Scale Perspective center lower upper
Attractiveness designometric 0.6351084 0.4139821 0.7675680
Attractiveness psychometric 0.3607549 0.1025158 0.6041318
Beauty designometric 0.9692454 0.9524973 0.9789909
Beauty psychometric 0.5872196 0.3211394 0.7246443
Credib designometric 0.5717053 0.3137612 0.7250947
Credib psychometric 0.4243274 -0.2559442 0.6789972
Hedonism designometric 0.9681432 0.9519016 0.9783777
Hedonism psychometric 0.6229097 0.4330071 0.7679728
HQI designometric 0.6737763 0.5425565 0.7830014
HQI psychometric 0.6496321 0.2090802 0.8473008
HQS designometric 0.7394482 0.6378606 0.8019733
HQS psychometric 0.6896385 0.2702103 0.8681698
nEeriness designometric 0.8834215 0.8351128 0.9156898
nEeriness psychometric 0.8139584 0.7283786 0.8756691
Usability designometric 0.8685692 0.7983001 0.9208187
Usability psychometric 0.6550002 0.4810185 0.7966900
Scale_rel %>% 
  ggplot(aes(color = Scale,
             label = Scale,
             x = Perspective,
             y = center,
             ymin = lower,
             ymax = upper)) +
  geom_point() +
  geom_line(aes(group = Scale)) +
  ylab("std. Cronbach alpha") +
  geom_label() +
  ylim(0,1)

2.2.2 Item reliability

Item_rel <-
  D_1 %>% 
  split(.$Scale) %>% 
  map_df(item_rel)
## Some items ( Att4 Att6 ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
Item_rel
Scale Item Perspective n raw.r std.r r.cor r.drop mean sd
Attractiveness Att1 designometric 59 0.6935717 0.6221672 0.5374563 0.4152139 0.5578079 0.2357714
Attractiveness Att1 psychometric 40 0.6556074 0.6582254 0.5850407 0.3760495 0.5602917 0.1860536
Attractiveness Att2 designometric 56 0.6803049 0.6949713 0.6711486 0.5044168 0.5051086 0.2363659
Attractiveness Att2 psychometric 40 0.6499161 0.6604626 0.6482586 0.3950330 0.4927083 0.1721726
Attractiveness Att3 designometric 54 0.7224083 0.7110001 0.6214230 0.5291541 0.5478719 0.2477413
Attractiveness Att3 psychometric 40 0.4999924 0.5096504 0.3655711 0.1976182 0.5500833 0.1733970
Attractiveness Att4 designometric 57 0.3314863 0.3210092 0.0856613 0.0477289 0.5078553 0.2118101
Attractiveness Att4 psychometric 40 0.2379467 0.2590127 0.0002670 -0.0557086 0.4937083 0.1556704
Attractiveness Att5 designometric 53 0.7734007 0.7701590 0.7479344 0.6165928 0.6050833 0.2344340
Attractiveness Att5 psychometric 40 0.7220290 0.7143748 0.6884718 0.4761634 0.5999583 0.1851124
Attractiveness Att6 designometric 55 0.5071367 0.4898179 0.3320168 0.2360847 0.5461803 0.2407393
Attractiveness Att6 psychometric 40 0.2038149 0.1716358 -0.1026968 -0.1800056 0.5563333 0.2040698
Beauty 1 designometric 48 0.9715208 0.9705167 0.9620374 0.9469555 0.4817018 0.2405092
Beauty 1 psychometric 42 0.5634238 0.5646013 0.2839047 0.2265541 0.4763787 0.1232866
Beauty 2 designometric 48 0.9512628 0.9509864 0.9304837 0.9141279 0.5581468 0.2252406
Beauty 2 psychometric 42 0.7005367 0.7291817 0.6326197 0.4630836 0.5518076 0.1088177
Beauty 3 designometric 48 0.9697443 0.9684071 0.9595165 0.9437284 0.5264599 0.2403634
Beauty 3 psychometric 42 0.7725085 0.7719752 0.7087504 0.5186978 0.5387326 0.1285998
Beauty 4 designometric 48 0.9328297 0.9355976 0.9008122 0.8865251 0.4910026 0.2133545
Beauty 4 psychometric 42 0.6487324 0.6222997 0.3868669 0.3061015 0.5009746 0.1341732
Credib Credib1 designometric 59 0.6044867 0.5932103 0.4228984 0.3185598 0.5694068 0.2211135
Credib Credib1 psychometric 25 0.6959111 0.6899259 0.6056964 0.4315671 0.5667000 0.1462139
Credib Credib2 designometric 59 0.6551251 0.6087126 0.4511409 0.3364046 0.5215226 0.2560754
Credib Credib2 psychometric 25 0.7337153 0.7095985 0.6784585 0.4579035 0.5105000 0.1592414
Credib Credib3 designometric 59 0.6514878 0.6506570 0.5467255 0.4086193 0.5228249 0.2228835
Credib Credib3 psychometric 25 0.5507187 0.5569437 0.3896841 0.2571936 0.5160000 0.1363455
Credib Credib4 designometric 57 0.7123939 0.7130302 0.6322551 0.4802624 0.5217398 0.2564079
Credib Credib4 psychometric 25 0.4871192 0.4900538 0.2266055 0.1664315 0.5203000 0.1409701
Credib Credib5 designometric 53 0.5370583 0.4951819 0.2553971 0.1843925 0.4953751 0.2240947
Credib Credib5 psychometric 25 0.3886878 0.4118324 0.1330502 0.0676140 0.4957000 0.1353824
Hedonism 1 designometric 48 0.9754944 0.9729531 0.9661373 0.9489504 0.4949129 0.2486562
Hedonism 1 psychometric 42 0.5736808 0.6387250 0.4451207 0.3110343 0.4989629 0.0873503
Hedonism 2 designometric 48 0.9265228 0.9346126 0.9044443 0.8870970 0.5479773 0.1613367
Hedonism 2 psychometric 42 0.6375588 0.6812583 0.5040558 0.3795473 0.5420985 0.0918896
Hedonism 3 designometric 48 0.9539940 0.9513429 0.9322637 0.9167024 0.4675383 0.2110448
Hedonism 3 psychometric 42 0.7611141 0.7352989 0.6478569 0.5326241 0.4721382 0.1008703
Hedonism 4 designometric 48 0.9644170 0.9626921 0.9502177 0.9360589 0.4408734 0.2070048
Hedonism 4 psychometric 42 0.7802483 0.7127237 0.6195312 0.4669670 0.4360133 0.1286006
HQI HQI1 designometric 54 0.5650463 0.5677040 0.5199326 0.3768710 0.4896019 0.2501074
HQI HQI1 psychometric 25 0.8131976 0.7897839 0.7924689 0.6859397 0.4950000 0.2026234
HQI HQI2 designometric 60 0.6225986 0.5693149 0.4603014 0.3755944 0.5781833 0.2724666
HQI HQI2 psychometric 25 0.4140515 0.4199140 0.2907244 0.2108103 0.5604000 0.1667609
HQI HQI3 designometric 58 0.6975315 0.6478708 0.5525440 0.4689579 0.5020259 0.2574881
HQI HQI3 psychometric 25 0.5941852 0.6053038 0.5068226 0.4222744 0.4941000 0.1680958
HQI HQI4 designometric 57 0.6658530 0.6966137 0.6226209 0.5353069 0.5549269 0.2150086
HQI HQI4 psychometric 25 0.5258622 0.5417442 0.4966752 0.3680606 0.5709000 0.1448704
HQI HQI5 designometric 59 0.4338197 0.3596435 0.2005093 0.1251196 0.5912429 0.2374892
HQI HQI5 psychometric 25 0.7795347 0.7634402 0.7937611 0.6220707 0.5843000 0.2158794
HQI HQI6 designometric 63 0.5748955 0.5786184 0.4471956 0.3832471 0.5124471 0.2509687
HQI HQI6 psychometric 25 0.5138991 0.5250801 0.3944244 0.3154123 0.5037000 0.1751995
HQI HQI7 designometric 57 0.7367313 0.7424558 0.7285225 0.6019045 0.4723392 0.2803627
HQI HQI7 psychometric 25 0.5576736 0.5584739 0.4336605 0.3685304 0.4573000 0.1750787
HQS HQS1 designometric 53 0.6810616 0.6611287 0.5768370 0.5024558 0.5541855 0.2449457
HQS HQS1 psychometric 25 0.6168159 0.6070060 0.5477540 0.4332504 0.5197000 0.1599859
HQS HQS2 designometric 55 0.7521635 0.7630558 0.7209064 0.6402145 0.3685152 0.2426397
HQS HQS2 psychometric 25 0.6922924 0.7099388 0.6440410 0.5659545 0.3813000 0.1321243
HQS HQS3 designometric 56 0.5881881 0.5977326 0.5303986 0.4181500 0.4187946 0.2243515
HQS HQS3 psychometric 25 0.7746096 0.7598930 0.7075289 0.6311833 0.4277000 0.1744156
HQS HQS4 designometric 61 0.6705848 0.6308341 0.5702840 0.4614471 0.4811749 0.2632831
HQS HQS4 psychometric 25 0.4887681 0.4886682 0.3808049 0.2910595 0.4654000 0.1498588
HQS HQS5 designometric 56 0.6387364 0.5227271 0.4352969 0.3392942 0.3966518 0.2557008
HQS HQS5 psychometric 25 0.5210489 0.5184179 0.4270064 0.3229098 0.3899000 0.1542091
HQS HQS6 designometric 61 0.7125647 0.6549950 0.5735686 0.4876641 0.4197322 0.2625926
HQS HQS6 psychometric 25 0.5634879 0.5772406 0.4567243 0.3922267 0.4003000 0.1417273
HQS HQS7 designometric 61 0.5840752 0.5848721 0.4958604 0.3960248 0.4128005 0.2460413
HQS HQS7 psychometric 25 0.6966990 0.6955517 0.6761135 0.5369481 0.4267000 0.1611009
nEeriness nE1 designometric 127 0.7408118 0.7334399 0.6848541 0.6440660 0.4678868 0.1740001
nEeriness nE1 psychometric 119 0.5160214 0.5343692 0.4265733 0.3863068 0.4556709 0.0952187
nEeriness nE2 designometric 126 0.7428029 0.7397503 0.6929683 0.6406685 0.4809049 0.1821338
nEeriness nE2 psychometric 119 0.6873760 0.6728223 0.6097037 0.5412618 0.4710938 0.1331084
nEeriness nE3 designometric 125 0.7363558 0.7016361 0.6419461 0.5971234 0.4606730 0.2197593
nEeriness nE3 psychometric 119 0.5663081 0.5477541 0.4419186 0.3869046 0.4442411 0.1334954
nEeriness nE4 designometric 126 0.6349151 0.6613301 0.5886438 0.5480713 0.5499998 0.1425516
nEeriness nE4 psychometric 119 0.6343404 0.6317383 0.5547755 0.4991363 0.5278561 0.1144561
nEeriness nE5 designometric 127 0.7501838 0.7615488 0.7206447 0.6736243 0.5490203 0.1549981
nEeriness nE5 psychometric 119 0.7423577 0.7477601 0.7092518 0.6486944 0.5340442 0.1036929
nEeriness nE6 designometric 127 0.7180501 0.7173765 0.6617658 0.6239836 0.5236777 0.1610552
nEeriness nE6 psychometric 119 0.6432526 0.6575557 0.5929158 0.5329360 0.5006405 0.0974882
nEeriness nE7 designometric 125 0.8272383 0.8246236 0.8079900 0.7584556 0.4967029 0.1799651
nEeriness nE7 psychometric 119 0.7977637 0.7956693 0.7821557 0.7006352 0.4942902 0.1245290
nEeriness nE8 designometric 126 0.7624603 0.7703229 0.7417685 0.6787708 0.5527972 0.1646881
nEeriness nE8 psychometric 119 0.7030982 0.7086295 0.6631192 0.5909101 0.5344227 0.1103392
Usability 1 designometric 48 0.8954695 0.8852177 0.8496008 0.8072696 0.5537115 0.1698535
Usability 1 psychometric 42 0.7059783 0.6963695 0.5421684 0.4300962 0.5353763 0.1208224
Usability 2 designometric 48 0.8949999 0.8727338 0.8517108 0.8006296 0.5615960 0.1771572
Usability 2 psychometric 42 0.6888980 0.7014512 0.5453210 0.4359088 0.5574644 0.1107410
Usability 3 designometric 48 0.6282174 0.6911338 0.5121547 0.4845593 0.5668973 0.1168843
Usability 3 psychometric 42 0.6876127 0.6760119 0.4882153 0.4002469 0.5600007 0.1214538
Usability 4 designometric 48 0.9622228 0.9443958 0.9610464 0.9097243 0.5221123 0.2167186
Usability 4 psychometric 42 0.7434466 0.7530564 0.6456878 0.5171022 0.5242849 0.1112598
G_Item_rel <-
  Item_rel %>% 
  ggplot(aes(color = Scale,
             x = Perspective,
             y = r.cor)) +
  # geom_point() +
  geom_line(aes(group = Item)) +
  ylab("Item-whole correlation")

G_Item_rel

G_Item_rel +
  geom_label(aes( label = Item)) +
  facet_wrap(~Scale, ncol = 2) +
  geom_point(data = rename(Scale_rel, alpha = center),
             aes(x = Perspective, 
                 y = alpha,
                 col = "Whole Cronbach alpha")) +
  geom_line(data = rename(Scale_rel, alpha = center),
             aes(x = Perspective, 
                 y = alpha,
                 group = Scale,
                 col = "Whole Cronbach alpha"))

2.3 Number of factors

Often, different scales are used in combination to create a more complete picture. Usually, the aim is that every scale measures exactly one construct (or latent variable) and that different scales measure different constructs. As a counter-example, Ho & MacDorman found that the Eeriness scale decomposes into two slightly different aspects, summarized as "eerie" and "spine-tingling". In contrast, the AttrakDiff2 questionnaire comprises two scales to capture supposedly different aspects.

Given a response matrix, the number of factors can be estimated using parallel analysis. Ideally, this procedure returns exactly as many factors as there are separate scales. Here, we use parallel analysis to assess whether the two perspectives produce the expected number of factors.

## Run a parallel analysis (psych::fa.parallel) on either the
## designometric ("D") or the psychometric ("P") response matrix
parallel_analysis <- function(data, n, persp, scales){
  if (persp == "D") {
    data <- rm_design(data)
    main <- str_c("Designometric Parallel Analysis of ", scales)
  }
  if (persp == "P") {
    data <- rm_psycho(data)
    main <- str_c("Psychometric Parallel Analysis of ", scales)
  }
  psych::fa.parallel(data,
                   fa = "fa",
                   fm = "minres",
                   nfactors=n,
                   main=main)
    
}

2.4 Eeriness scale

Eeriness is usually considered a one-dimensional construct. Nevertheless, it has been suggested that it comprises two slightly different factors.

parallel_analysis(D_Eer, 2, "D", "Eeriness")
## `summarise()` has grouped output by 'Design'. You can override using the
## `.groups` argument.
## Loading required namespace: GPArotation
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Parallel analysis suggests that the number of factors =  1  and the number of components =  NA
parallel_analysis(D_Eer, 2, "P", "Eeriness")
## `summarise()` has grouped output by 'Part'. You can override using the
## `.groups` argument.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA

The results suggest that under the designometric perspective there is only one latent variable, whereas the psychometric perspective suggests two.

E_psycho <- tibble(Perspective = "psychometric",
                   Item = str_c("nE", 1:8),
                   loading = as.numeric(psych::fa(rm_psycho(D_Eer))$loadings))
## `summarise()` has grouped output by 'Part'. You can override using the
## `.groups` argument.
E_design <- tibble(Perspective = "designometric",
                   Item = str_c("nE", 1:8),
                   loading = as.numeric(psych::fa(rm_design(D_Eer))$loadings))
## `summarise()` has grouped output by 'Design'. You can override using the
## `.groups` argument.
# bind_rows(E_psycho, E_design) %>%
#   ggplot(aes(x = Perspective, y = loading, group = Item)) +
#   geom_line() +
#   geom_point()

2.5 AttrakDiff and Credibility

The AttrakDiff2 inventory splits hedonic quality into two components, Identity (HQI) and Stimulation (HQS), while the Credibility scale is a separate instrument right from the start.

parallel_analysis(D_Att, 3, "P", "AttrakDiff and Credibility")
## `summarise()` has grouped output by 'Part'. You can override using the
## `.groups` argument.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Parallel analysis suggests that the number of factors =  1  and the number of components =  NA
parallel_analysis(D_Att, 3, "D", "AttrakDiff and Credibility")
## `summarise()` has grouped output by 'Design'. You can override using the
## `.groups` argument.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## In smc, smcs > 1 were set to 1.0
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in sqrt(1/diag(V)): NaNs produced
## Warning in cov2cor(t(w) %*% r %*% w): diag(.) had 0 or NA entries; non-finite
## result is doubtful

## Parallel analysis suggests that the number of factors =  5  and the number of components =  NA

Under the psychometric perspective, all items can be grouped under just one latent construct. In contrast, the designometric analysis yields five factors.

2.6 Hedonism, Usability and Beauty

In study DN, three separate scales were used. However, parallel analysis suggests that they capture the same latent variable under both perspectives.

parallel_analysis(D_HUB, 3, "P", "Hedonism, Usability and Beauty")
## `summarise()` has grouped output by 'Part'. You can override using the
## `.groups` argument.
## Warning in GPFoblq(A, Tmat = Tmat, normalize = normalize, eps = eps, maxit =
## maxit, : convergence not obtained in GPFoblq. 1000 iterations used.

## Parallel analysis suggests that the number of factors =  1  and the number of components =  NA
parallel_analysis(D_HUB, 3, "D", "Hedonism, Usability and Beauty")
## `summarise()` has grouped output by 'Design'. You can override using the
## `.groups` argument.
## Warning in GPFoblq(A, Tmat = Tmat, normalize = normalize, eps = eps, maxit =
## maxit, : convergence not obtained in GPFoblq. 1000 iterations used.

## Parallel analysis suggests that the number of factors =  1  and the number of components =  NA

3 Confirmatory Factor Analysis on Inventories

Several of the original studies employed more than one scale (QB, DN, SP). CFA is commonly used on multi-scale inventories to assess advanced psychometric qualities, in particular discriminant validity: if the scales measure genuinely different aspects of a person or a design, the latent factors should be clearly distinguishable, with correlations well below one.
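
As an illustration, the discriminant validity of the two AttrakDiff2 scales could be assessed with a two-factor model under the designometric perspective. The following sketch shows how such a model would be specified with the data at hand; it is an outline, not a model we report results for:

F_att <- "HQI =~ HQI1 + HQI2 + HQI3 + HQI4 + HQI5 + HQI6 + HQI7
          HQS =~ HQS1 + HQS2 + HQS3 + HQS4 + HQS5 + HQS6 + HQS7"

## the posterior of the latent correlation HQI ~~ HQS should stay well below 1
# M_att <- bcfa(model = F_att, data = rm_design(D_Att),
#               n.chains = 5, burnin = 2000, sample = 2000)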

library(blavaan)

## Tidy the standardized posterior of a blavaan model into long format
tbl_post.blavaan <- function(x, model = NA){
  x %>% 
    blavaan::standardizedposterior() %>% 
    coda::as.mcmc() %>% 
    coda::as.mcmc.list() %>% 
    tidybayes::tidy_draws() %>% 
    rename(chain = .chain, iter = .iteration) %>% 
    select(-.draw) %>% 
    gather(parameter, value, -chain, -iter) %>% 
    mutate(type = case_when(str_detect(parameter, "=~") ~ "std.coef",
                            str_detect(parameter, "~~") ~ "std.vcov")) %>% 
    separate(parameter, into = c("lhs", "rhs"), remove = F)
}

F_6 <- "nEeriness =~ nE1 + nE2 + nE3 + nE4 + nE5 + nE6 + nE7 + nE8"

M_6_psycho <- 
  bcfa(model = F_6,
  data = rm_psycho(D_Eer) ,
  n.chains = 5,
  burnin = 12000,
  sample = 2000)

save(M_6_psycho, file = "M_6.Rda")


M_6_design <- 
  bcfa(model = F_6,
       data = rm_design(D_Eer),
       n.chains = 5,
       burnin = 20000,
       sample = 2000)

save(M_6_design, M_6_psycho, file = "M_6.Rda")

P_6 <- 
  bind_rows(
    tbl_post.blavaan(M_6_design) %>% mutate(model = "designometric"),
    tbl_post.blavaan(M_6_psycho) %>% mutate(model = "psychometric")
  ) 

save(M_6_design, M_6_psycho, P_6, file = "M_6.Rda")
load("M_6.Rda")
## Center (median) and 95% credibility limits per parameter
clu <- function(x)
  x %>% 
  group_by(model, parameter, type, lhs, rhs) %>% 
  summarize(center = median(value),
            lower = quantile(value, .025),
            upper = quantile(value, .975)) %>% 
  ungroup()


P_6 %>% 
  mutate(parameter = NA) %>% 
  filter(type == "std.coef") %>% 
  clu() %>% 
  mascutils::discard_redundant() %>% 
  rename(Item = rhs)
## `summarise()` has grouped output by 'model', 'parameter', 'type', 'lhs'. You
## can override using the `.groups` argument.
model Item center lower upper
designometric nE1 0.7448549 0.6717351 0.6717351
designometric nE2 0.6591509 0.5169453 0.5169453
designometric nE3 0.6549655 0.5077850 0.5077850
designometric nE4 0.5179342 0.3501500 0.3501500
designometric nE5 0.6497413 0.5130526 0.5130526
designometric nE6 0.6109226 0.4593454 0.4593454
designometric nE7 0.7407468 0.6282122 0.6282122
designometric nE8 0.6773998 0.5447105 0.5447105
psychometric nE1 0.7004677 0.5704952 0.5704952
psychometric nE2 0.5265502 -0.6398494 -0.6398494
psychometric nE3 0.3910258 -0.5155322 -0.5155322
psychometric nE4 0.4413438 -0.6035649 -0.6035649
psychometric nE5 0.5171714 -0.6336654 -0.6336654
psychometric nE6 0.4205133 -0.5832273 -0.5832273
psychometric nE7 0.6104738 -0.6853271 -0.6853271
psychometric nE8 0.5105753 -0.6295708 -0.6295708
CLU_6 <- 
  P_6 %>% 
  filter(type == "std.coef") %>% 
  clu() %>% 
  rename(Item = rhs)
## `summarise()` has grouped output by 'model', 'parameter', 'type', 'lhs'. You
## can override using the `.groups` argument.
CLU_6
model parameter type lhs Item center lower upper
designometric nEeriness=~nE1 std.coef nEeriness nE1 0.7448549 0.6717351 0.6717351
designometric nEeriness=~nE2 std.coef nEeriness nE2 0.6591509 0.5169453 0.5169453
designometric nEeriness=~nE3 std.coef nEeriness nE3 0.6549655 0.5077850 0.5077850
designometric nEeriness=~nE4 std.coef nEeriness nE4 0.5179342 0.3501500 0.3501500
designometric nEeriness=~nE5 std.coef nEeriness nE5 0.6497413 0.5130526 0.5130526
designometric nEeriness=~nE6 std.coef nEeriness nE6 0.6109226 0.4593454 0.4593454
designometric nEeriness=~nE7 std.coef nEeriness nE7 0.7407468 0.6282122 0.6282122
designometric nEeriness=~nE8 std.coef nEeriness nE8 0.6773998 0.5447105 0.5447105
psychometric nEeriness=~nE1 std.coef nEeriness nE1 0.7004677 0.5704952 0.5704952
psychometric nEeriness=~nE2 std.coef nEeriness nE2 0.5265502 -0.6398494 -0.6398494
psychometric nEeriness=~nE3 std.coef nEeriness nE3 0.3910258 -0.5155322 -0.5155322
psychometric nEeriness=~nE4 std.coef nEeriness nE4 0.4413438 -0.6035649 -0.6035649
psychometric nEeriness=~nE5 std.coef nEeriness nE5 0.5171714 -0.6336654 -0.6336654
psychometric nEeriness=~nE6 std.coef nEeriness nE6 0.4205133 -0.5832273 -0.5832273
psychometric nEeriness=~nE7 std.coef nEeriness nE7 0.6104738 -0.6853271 -0.6853271
psychometric nEeriness=~nE8 std.coef nEeriness nE8 0.5105753 -0.6295708 -0.6295708
P_6 %>% 
  filter(type == "std.coef") %>% 
  rename(Item = rhs) %>% 
  ggplot(aes(x = Item, color = model, fill = model, y = value)) +
  geom_violin() +
  geom_point(data = CLU_6, aes(y = center)) +
  geom_line(data = CLU_6, aes(y = center, group = model))

4 Using designometric scales as psychometrics

Up to this point we have taken a purely designometric stance: such rating scales must primarily discriminate between designs. In some research situations, however, a designometric scale could well be used psychometrically. For example, a common stereotype is that male adolescents expose themselves more to imagery of robots, zombies and humanoid extraterrestrials than young women do. One could assume that the feeling of eeriness wears off, which would then produce weaker responses from male respondents when averaged over designs.

AH %>% 
  group_by(Part, Item, Gender) %>%
  summarize(nEeriness = mean(response)) %>% 
  ggplot(aes(x = Item, color = Gender, y = nEeriness)) +
  geom_boxplot()
## `summarise()` has grouped output by 'Part', 'Item'. You can override using the
## `.groups` argument.

5 Discussion

Scales in HCI research and beyond are commonly used to discriminate between poor and good designs. We have argued that scales for measuring designs must be evaluated on design-by-item response matrices. Most validation studies evaluate their scales on person-by-item response matrices, which we call the psychometric fallacy. To make the case, a simulation showed that, in a realistic scenario, psychometric reliability can be excellent while designometric reliability is poor. Fortunately, when looking at real data from commonly used rating scales, this bias is reversed: designometric reliability is consistently better. At the same time,

5.1 Implications for practical use

5.1.1 AttrakDiff2

The two scales HQI and HQS showed only moderate reliability under both perspectives.

5.2 Implications for scale development

5.3 Limitations

  • The populations in the samples were rather homogeneous (students), possibly leaving too little variance between persons.
  • Participants only saw the stimuli and never used the systems, so we can assume dominance of System 1 processing.
  • The tested conditions used finalized scales, rather than the initial item pools from which scales are constructed.

5.4 The ideal designometric scale

The comparison of item-level reliability suggests that the scales fall into two clusters: Beauty and Hedonism have overall excellent item and scale reliability. Reliability under the psychometric perspective is still good. What is striking is that item reliabilities seem to drop by a constant