# Load the raw survey export.
df <- data.table::fread(
  "data/Data Science Methodologies_complete.csv",
  stringsAsFactors = FALSE
)

# Respondent-level bookkeeping columns, kept apart from the answers.
metadata <- df %>%
  dplyr::select(
    `Respondent ID`, `Collector ID`, `Start Date`,
    `End Date`, `IP Address`, `Email Address`,
    `First Name`, `Last Name`, `Custom Data 1`
  )

# Column positions at which each question's block of columns starts; the last
# entry is only used as the end of the final block.
breaks <- c(10, 205, 213, 218, 221, 225, 229, 231, 234, 309, 312, 329)

# Question texts: the names of the first column of every block.
questions <- df %>%
  dplyr::select(breaks %>% head(-1)) %>%
  colnames()
# Promote the first row of `df` to column names and return the remaining rows.
header_first <- function(df) {
  new_names <- as.character(unlist(df[1, ]))
  stats::setNames(df, new_names)[-1, ]
}
# One data frame per question: the columns from breaks[i] up to (but not
# including) breaks[i + 1], with their first row promoted to column names and a
# running respondent `id` added. The number of questions is derived from
# `breaks` so the two cannot drift apart.
responses <- lapply(seq_len(length(breaks) - 1), function(i) {
  df %>%
    dplyr::select(breaks[i]:(breaks[i + 1] - 1)) %>%
    header_first() %>%
    dplyr::mutate(id = dplyr::row_number())
})
# Positions (in `responses` / `questions`) of the questions collected into qA.
# Defined once so the column renaming below always matches the data.
qa_idx <- c(seq(1, 8), 10)

# For each of these questions keep the non-empty answers as (id, q<i>), then
# full-join all questions on the respondent id.
qA <- lapply(qa_idx, function(i) {
  responses[[i]] %>%
    tidyr::pivot_longer(-id) %>%
    dplyr::filter(value != "") %>%
    dplyr::select(id, value) %>%
    dplyr::rename(!!paste0("q", i) := value)
}) %>%
  Reduce(function(x, y) merge(x = x, y = y, by = "id", all = TRUE), .)

# Replace the q<i> placeholders with the full question texts.
colnames(qA)[-1] <- questions[qa_idx]
# Question 9: one numeric score per respondent and item, in wide form.
i <- 9
qB <- responses[[i]] %>%
  tidyr::pivot_longer(-id) %>%
  dplyr::filter(value != "") %>%
  dplyr::mutate(
    # Strip the answer text from the column name. `value` is free survey text,
    # so it is matched literally rather than interpreted as a regular
    # expression.
    name = stringr::str_remove(name, stringr::fixed(value)),
    name = stringr::str_remove(name, " - "),
    # The score is the first character of the answer text.
    value_num = as.numeric(stringr::str_sub(value, 1, 1))
  ) %>%
  dplyr::select(id, name, value_num) %>%
  tidyr::pivot_wider(names_from = name, values_from = value_num)
# Question 11: per respondent and methodology, whether the "Know about it" and
# "Used it" cells were filled in.
i <- 11

# Long table (id, methodology, know); `know` is TRUE when the cell equals
# "Know about it".
qC_know <- responses[[i]] %>%
  dplyr::select(id, dplyr::contains("Know")) %>%
  tidyr::pivot_longer(-id, names_to = "methodology", values_to = "know") %>%
  dplyr::mutate(
    methodology = stringr::str_remove(methodology, " - Know about it"),
    know = know == "Know about it"
  )

# Long table (id, methodology, used); `used` is TRUE when the cell equals
# "Used it".
qC_used <- responses[[i]] %>%
  dplyr::select(id, dplyr::contains("Used")) %>%
  tidyr::pivot_longer(-id, names_to = "methodology", values_to = "used") %>%
  dplyr::mutate(
    methodology = stringr::str_remove(methodology, " - Used it"),
    used = used == "Used it"
  )

# Joined on the shared columns (id, methodology).
qC <- merge(qC_know, qC_used)
cols <- qA %>%
  dplyr::select(-id) %>%
  colnames()

# One horizontal bar chart per qA question. Answers given more than once are
# drawn as bars; answers given exactly once are listed in a text label instead.
# Each chart is saved as a PDF and printed.
g <- lapply(cols, function(col) {
  answer_counts <- qA %>%
    dplyr::mutate(
      # Look the column up by its string name through the data pronoun.
      x = .data[[col]],
      x = stringr::str_pad(x, 30, side = "left", pad = " ")
    ) %>%
    dplyr::count(x)

  p <- ggplot2::ggplot(answer_counts) +
    ggplot2::geom_col(
      data = . %>% dplyr::filter(n > 1),
      ggplot2::aes(x = reorder(x, n), y = n, fill = is.na(x)),
      width = 0.8, show.legend = FALSE
    ) +
    ggplot2::geom_text(
      data = . %>% dplyr::filter(n > 1),
      ggplot2::aes(x = reorder(x, n), y = n, label = n),
      hjust = -0.2
    ) +
    ggplot2::geom_label(
      hjust = 1, vjust = 0, size = 3, label.size = NA,
      ggplot2::aes(
        x = 0, y = Inf,
        label = paste0(
          c(ifelse(min(n) == 1, "Items with 1 count\n", ""), x[n == 1], "\n"),
          collapse = "\n"
        )
      )
    ) +
    ggplot2::coord_flip() +
    ggplot2::scale_fill_manual(values = c("#324AA8", "#f4f1de")) +
    ggplot2::labs(x = NULL, y = "Survey Respondents") +
    # ggplot2::ggtitle(col)+
    ggpubr::theme_pubclean() +
    ggplot2::theme(
      plot.margin = grid::unit(c(0, 0, 0, 0), "null"),
      plot.title.position = "plot"
    )

  # Pass the plot explicitly instead of relying on ggplot2's "last plot".
  ggplot2::ggsave(
    paste0("plot_questions_", col, ".pdf"),
    plot = p, width = 6, height = 6
  )
  print(p)
})
We are going to explore the factor analysis technique, looking at both principal axis and principal components extraction methods, two different methods of identifying the correct number of factors to extract (scree plot and parallel analysis), and two different methods of rotating factors to facilitate interpretation.
# Item scores only: qB without the respondent id column.
data <- qB %>% dplyr::select(-id)
# Descriptive statistics for every item.
psych::describe(data)
Prior to the factor analysis, the “factorability” of the data should be evaluated. In other words, “are there meaningful latent factors to be found within the data?” We can check two things: (1) Bartlett’s test of sphericity; and (2) the Kaiser-Meyer-Olkin measure of sampling adequacy.
The most liberal test is Bartlett’s test of sphericity - this evaluates whether or not the variables intercorrelate at all, by evaluating the observed correlation matrix against an “identity matrix” (a matrix with ones along the principal diagonal, and zeroes everywhere else). If this test is not statistically significant, we should not employ a factor analysis.
psych::cortest.bartlett(data)
## R was not square, finding R from data
## $chisq
## [1] 828.8527
##
## $p.value
## [1] 6.68845e-113
##
## $df
## [1] 105
Bartlett’s test was statistically significant, suggesting that the observed correlation matrix among the items is not an identity matrix. This really is not a particularly powerful indication that we have a factorable dataset, though - all it really tells us is that at least some of the variables are correlated with each other.
The Kaiser-Meyer-Olkin (KMO) measure of sampling adequacy is a better measure of factorability. The KMO tests to see if the partial correlations within the data are close enough to zero to suggest that there is at least one latent factor underlying the variables. The minimum acceptable value is 0.50, but most authors recommend a value of at least 0.60 before undertaking a factor analysis. The KMO function in the psych package produces an overall Measure of Sampling Adequacy (MSA, as it is labelled in the output), and an MSA for each item. Theoretically, if the overall MSA is too low, you could look at the item MSAs and drop items that are too low. This should be done with caution, of course, as is the case with any atheoretical, empirical method of item selection.
psych::KMO(data)
## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = data)
## Overall MSA = 0.84
## MSA for each item =
## Team collaboration and coordination
## 0.82
## Harness knowledge for future work
## 0.92
## Communicating the results to end-users
## 0.86
## Proactive team communication
## 0.82
## Data augmentation or metadata enrichment
## 0.90
## Data visualization tools
## 0.89
## Version control for code, data & models
## 0.82
## Deployment pipeline to production
## 0.76
## Data security and privacy
## 0.84
## Understanding each team member skills and role
## 0.88
## Precisely describe stakeholders needs
## 0.86
## Identify project potential risks and pitfalls
## 0.88
## Establishing timelines and deliverables
## 0.81
## Define a data lifecycle workflow
## 0.77
## Develop a strategy to meet project requirements
## 0.85
The overall KMO is 0.84 which is excellent - this suggests that we can go ahead with the planned factor analysis.
The first step for the factor analysis is to choose the number of factors to extract, in order to achieve the most parsimonious (but still interpretable) factor structure. There are a number of methods that we could use, but the two most commonly employed methods are the scree plot, and parallel analysis.
Eigenvalues are a measure of the amount of variance accounted for by a factor, and so they can be useful in determining the number of factors that we need to extract. In a scree plot, we simply plot the eigenvalues for all of our factors, and then look to see where they drop off sharply.
psych::scree(data)
The scree plot technique involves drawing a straight line through the plotted eigenvalues, starting with the largest one. The last point to fall on this line represents the last factor that you extract, with the idea being that beyond this, the amount of additional variance explained is non-meaningful. In fact, the word “scree” refers to the loose stone that lies around the base of the mountain. A “scree plot” is effectively looking to help you differentiate between the points that represent “mountain”, and the points that represent “scree.” Regardless of whether you are using a principal components or a principal axis factor extraction, there is a very large first factor in this data.
A better method for evaluating the scree plot is within a parallel analysis. In addition to plotting the eigenvalues from our factor analysis (whether it’s based on principal axis or principal components extraction), a parallel analysis involves generating random correlation matrices and after factor analyzing them, comparing the resulting eigenvalues to the eigenvalues of the observed data. The idea behind this method is that observed eigenvalues that are higher than their corresponding random eigenvalues are more likely to be from “meaningful factors” than observed eigenvalues that are below their corresponding random eigenvalue.
psych::fa.parallel(data)
## Parallel analysis suggests that the number of factors = 3 and the number of components = 1
When looking at the parallel analysis scree plots, there are two places to look depending on which type of factor analysis we are looking to run. The two blue lines show the observed eigenvalues - they should look identical to the scree plots drawn by the scree function. The red dotted lines show the random eigenvalues or the simulated data line. Each point on the blue line that lies above the corresponding simulated data line is a factor or component to extract. In this analysis, 3 factors in the “Factor Analysis” parallel analysis lie above the corresponding simulated data line and 1 component in the “Principal Components” parallel analysis lies above the corresponding simulated data line. In this case, however, the last factor/component lies very close to the line - for both principal components extraction and principal axis extraction. Thus, we should probably compare the 3, 4 and 5 factor solutions.
We already have a good idea as to how many factors (3, 4, or 5) that we should extract in our analysis. Now we need to decide whether we will use “common factor” analysis, or “principal components” analysis. In a very broad sense, “common factor” analysis (or “principal axis factoring”) is used when we want to identify the latent variables that are underlying a set of variables, while “principal components” analysis is used to reduce a set of variables to a smaller set of factors (i.e., the “principal components” of the data). In other words, common factor analysis is used when you want to evaluate a theoretical model with a set of variables, and principal components analysis is used for data reduction.
The fa function takes the following parameters when it is called:
# Lookup table: dimension code -> dimension label.
dimensions <- data.frame(name = c("TEAM", "DATA", "PROJECT"), value = c(1, 2, 3))

# A priori dimension code of every item, in the column order of `data`.
truth <- data.frame(
  aspect = names(data),
  dimension = c(1, 2, 1, 1, 2, 2, 2, 3, 2, 1, 3, 3, 3, 2, 3)
)
# Translate each code into its label, then drop the code column.
truth$name <- dimensions$name[match(truth$dimension, dimensions$value)]
truth$dimension <- NULL

# Prefix every item name with its a priori dimension, e.g. "[TEAM] ...".
names(data) <- paste0("[", truth$name, "] ", truth$aspect)
truth
A quick way to visualize the rotated factor solution, and determine whether it represents an “interpretable” solution is to use the fa.diagram function
# Candidate numbers of factors to compare.
nfactors <- c(3,4,5)
# Fit one factor model per candidate, with oblimin rotation and the minres
# fitting method.
models_fa <- lapply(nfactors, function(n){
psych::fa(data, nfactors = n, rotate = "oblimin", fm="minres")
})
## Loading required namespace: GPArotation
# Draw the fa.diagram of every fitted model.
diagrams <- lapply(models_fa, psych::fa.diagram, rsize = 0.2)
# First entry of models_fa, i.e. the fit for nfactors[1] = 3 factors.
model_fa3 <- models_fa[[1]]
print(model_fa3)
## Factor Analysis using method = minres
## Call: psych::fa(r = data, nfactors = n, rotate = "oblimin", fm = "minres")
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR3 MR1 MR2
## [TEAM] Team collaboration and coordination 0.10 0.68 0.03
## [DATA] Harness knowledge for future work 0.38 0.21 0.10
## [TEAM] Communicating the results to end-users 0.29 0.35 0.01
## [TEAM] Proactive team communication 0.00 0.86 0.02
## [DATA] Data augmentation or metadata enrichment 0.35 -0.06 0.26
## [DATA] Data visualization tools 0.18 0.24 0.14
## [DATA] Version control for code, data & models -0.13 0.24 0.62
## [PROJECT] Deployment pipeline to production 0.20 -0.09 0.54
## [DATA] Data security and privacy 0.15 0.23 0.32
## [TEAM] Understanding each team member skills and role 0.39 0.25 -0.03
## [PROJECT] Precisely describe stakeholders needs 0.43 0.13 0.15
## [PROJECT] Identify project potential risks and pitfalls 0.57 -0.09 0.22
## [PROJECT] Establishing timelines and deliverables 0.50 0.10 0.01
## [DATA] Define a data lifecycle workflow 0.07 -0.05 0.61
## [PROJECT] Develop a strategy to meet project requirements 0.67 0.10 -0.11
## h2 u2 com
## [TEAM] Team collaboration and coordination 0.57 0.43 1.0
## [DATA] Harness knowledge for future work 0.33 0.67 1.7
## [TEAM] Communicating the results to end-users 0.33 0.67 1.9
## [TEAM] Proactive team communication 0.75 0.25 1.0
## [DATA] Data augmentation or metadata enrichment 0.23 0.77 1.9
## [DATA] Data visualization tools 0.21 0.79 2.6
## [DATA] Version control for code, data & models 0.47 0.53 1.4
## [PROJECT] Deployment pipeline to production 0.37 0.63 1.3
## [DATA] Data security and privacy 0.31 0.69 2.3
## [TEAM] Understanding each team member skills and role 0.31 0.69 1.7
## [PROJECT] Precisely describe stakeholders needs 0.35 0.65 1.4
## [PROJECT] Identify project potential risks and pitfalls 0.41 0.59 1.4
## [PROJECT] Establishing timelines and deliverables 0.33 0.67 1.1
## [DATA] Define a data lifecycle workflow 0.38 0.62 1.0
## [PROJECT] Develop a strategy to meet project requirements 0.48 0.52 1.1
##
## MR3 MR1 MR2
## SS loadings 2.26 2.00 1.56
## Proportion Var 0.15 0.13 0.10
## Cumulative Var 0.15 0.28 0.39
## Proportion Explained 0.39 0.34 0.27
## Cumulative Proportion 0.39 0.73 1.00
##
## With factor correlations of
## MR3 MR1 MR2
## MR3 1.00 0.53 0.42
## MR1 0.53 1.00 0.38
## MR2 0.42 0.38 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 105 and the objective function was 4.53 with Chi Square of 828.85
## The degrees of freedom for the model are 63 and the objective function was 0.66
##
## The root mean square of the residuals (RMSR) is 0.05
## The df corrected root mean square of the residuals is 0.06
##
## The harmonic number of observations is 190 with the empirical chi square 92.8 with prob < 0.0086
## The total number of observations was 190 with Likelihood Chi Square = 120.24 with prob < 1.9e-05
##
## Tucker Lewis Index of factoring reliability = 0.867
## RMSEA index = 0.069 and the 90 % confidence intervals are 0.05 0.088
## BIC = -210.32
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy
## MR3 MR1 MR2
## Correlation of (regression) scores with factors 0.88 0.92 0.84
## Multiple R square of scores with factors 0.78 0.84 0.71
## Minimum correlation of possible factor scores 0.55 0.68 0.42
MR: These are the factors; the name merely reflects the fitting method, e.g. minimum residual, maximum likelihood, principal components. The default is minimum residual, so in this case MR. Are the factors out of order? The number assigned is arbitrary; the ordering has to do with the rotated solution.
h2: the amount of variance in the item/variable explained by the (retained) factors. It is the sum of the squared loadings, a.k.a. communality.
u2: 1 - h2. Residual variance, a.k.a. uniqueness.
com: Item complexity. Specifically it is “Hoffman’s index of complexity for each item. This is just \((\sum\lambda_i^2)^2/\sum\lambda_i^4\) where \(\lambda_i\) is the factor loading on the ith factor. From Hofmann (1978), MBR. See also Pettersson and Turkheimer (2010).” It equals one if an item loads only on one factor, 2 if evenly loads on two factors, etc. Basically it tells you how much an item reflects a single construct. It will be lower for relatively lower loadings.
print(model_fa3$loadings, cutoff = 0.3)
##
## Loadings:
## MR3 MR1 MR2
## [TEAM] Team collaboration and coordination 0.682
## [DATA] Harness knowledge for future work 0.383
## [TEAM] Communicating the results to end-users 0.353
## [TEAM] Proactive team communication 0.857
## [DATA] Data augmentation or metadata enrichment 0.347
## [DATA] Data visualization tools
## [DATA] Version control for code, data & models 0.617
## [PROJECT] Deployment pipeline to production 0.538
## [DATA] Data security and privacy 0.318
## [TEAM] Understanding each team member skills and role 0.391
## [PROJECT] Precisely describe stakeholders needs 0.429
## [PROJECT] Identify project potential risks and pitfalls 0.566
## [PROJECT] Establishing timelines and deliverables 0.503
## [DATA] Define a data lifecycle workflow 0.605
## [PROJECT] Develop a strategy to meet project requirements 0.670
##
## MR3 MR1 MR2
## SS loadings 1.844 1.662 1.321
## Proportion Var 0.123 0.111 0.088
## Cumulative Var 0.123 0.234 0.322
# For every fitted model, plot the absolute loading of each variable on each
# factor (one facet per factor), coloured by the signed loading, with a dashed
# reference line at 0.3.
g <- lapply(models_fa, function(model) {
  loadings_long <- as.data.frame(model$loadings[, ]) %>%
    tibble::rownames_to_column() %>%
    tidyr::pivot_longer(-rowname)

  loading_plot <- ggplot2::ggplot(loadings_long) +
    ggplot2::geom_col(
      ggplot2::aes(x = abs(value), y = rowname, fill = value)
    ) +
    ggplot2::geom_vline(xintercept = 0.3, color = "red", linetype = "dashed") +
    ggplot2::facet_grid(cols = dplyr::vars(name)) +
    scico::scale_fill_scico() +
    ggplot2::scale_x_continuous(name = "Loading Strength", breaks = c(0, 1)) +
    ggplot2::scale_y_discrete(name = "Variable") +
    ggplot2::theme_bw()

  print(loading_plot)
})
The communality for each variable is the percentage of variance that can be explained by the retained factors. It’s best if the retained factors explain more of the variance in each variable.
As a point of interest, the primary difference between the way that common factor analysis and principal component analysis are conducted, is that the correlation matrix on which the factor analysis is based has ones along the principal diagonal in principal components analysis, and the communalities along the principal diagonal in principal axis factor analysis.
# Communalities of every fitted model side by side, one column per model,
# labelled with that model's number of factors.
models_fa %>%
  lapply(function(model) {
    stats::setNames(
      data.frame(model$communality),
      paste0("Factors:", model$factors)
    )
  }) %>%
  dplyr::bind_cols()
The eigenvalues of the original correlation matrix are stored within e.values. These are the principal-component eigenvalues that were plotted in the scree plots.
model_fa3$e.values
## [1] 5.0639531 1.3455668 1.0926313 0.9884682 0.8984961 0.8511433 0.8145318
## [8] 0.7301593 0.7028788 0.5416376 0.5014652 0.4397683 0.4212832 0.3342322
## [15] 0.2737850
If you want the eigenvalues of the common factor solution (i.e., of the correlation matrix with the communalities on its diagonal), you would ask for values.
model_fa3$values
## [1] 4.47804859 0.77691874 0.56852636 0.35284918 0.25725216 0.23068597
## [7] 0.11231424 0.09012957 0.01895818 -0.05550875 -0.11517946 -0.16916206
## [13] -0.19864255 -0.23141776 -0.29228545
SS loadings: These are the eigenvalues, the sum of the squared loadings. In this case where we are using a correlation matrix, summing across all factors would equal the number of variables used in the analysis.
Proportion Var: tells us how much of the overall variance the factor accounts for out of all the variables.
Cumulative Var: the cumulative sum of Proportion Var.
Proportion Explained: the relative amount of variance explained - Proportion Var/sum(Proportion Var).
Cumulative Proportion: the cumulative sum of Proportion Explained.
model_fa3$Vaccounted
## MR3 MR1 MR2
## SS loadings 2.2616866 1.9982004 1.5635999
## Proportion Var 0.1507791 0.1332134 0.1042400
## Cumulative Var 0.1507791 0.2839925 0.3882325
## Proportion Explained 0.3883733 0.3431278 0.2684989
## Cumulative Proportion 0.3883733 0.7315011 1.0000000
factor correlations: the correlation matrix for the factors.
Mean item complexity: the mean of com.
model_fa3$Phi
## MR3 MR1 MR2
## MR3 1.0000000 0.5308094 0.4174089
## MR1 0.5308094 1.0000000 0.3836068
## MR2 0.4174089 0.3836068 1.0000000
Test of the hypothesis that 3 factors are sufficient.
The degrees of freedom for the null model are 105 and the objective function was 4.53 with Chi Square of 828.85
The degrees of freedom for the model are 63 and the objective function was 0.66
null model: The degrees of freedom for the null model that assumes no correlation structure.
objective function: The value of the function that is minimized by a specific procedure.
model: The one you’re actually interested in. Where p = number of items and n_f = number of factors, the degrees of freedom are \(p(p-1)/2 - p \cdot n_f + n_f(n_f-1)/2\). For the null model this is \(p(p-1)/2\).
Chi-square: If f is the objective function value, then \(\chi^2=(n_{obs}-1-(2p+5)/6-2n_f/3)f\).
data <- qB %>% dplyr::select(-id)
# Three-factor solution used for the comparison with the a priori assignment.
model <- psych::fa(data, nfactors = 3, rotate = "oblimin", fm = "minres")

# A priori assignment read from factors.csv; every numeric column is divided
# by 100 (presumably percentages - confirm against the file).
proposal <- read.csv(file = "factors.csv", stringsAsFactors = FALSE) %>%
  dplyr::select(-Dimension) %>%
  dplyr::mutate(dplyr::across(where(is.numeric), ~ . / 100))

# Absolute loadings rescaled so that each item's three values sum to 1.
loadings <- model$loadings[, ] %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "Factor") %>%
  dplyr::mutate(
    total = abs(MR1) + abs(MR2) + abs(MR3),
    MR1 = abs(MR1) / total,
    MR2 = abs(MR2) / total,
    MR3 = abs(MR3) / total
  ) %>%
  dplyr::select(-total)

merge(proposal, loadings) %>%
  tidyr::pivot_longer(-Factor) %>%
  dplyr::mutate(
    # Pair each a priori dimension with one extracted factor:
    # TEAM/MR1 -> 0, DATA/MR2 -> 1, PROJECT/MR3 -> 2.
    class = (name %in% c("TEAM", "MR1")) * 0 +
      (name %in% c("DATA", "MR2")) * 1 +
      (name %in% c("PROJECT", "MR3")) * 2,
    class = factor(
      class,
      labels = c("Factor 1 (Team)", "Factor 2 (Data)", "Factor 3 (Project)")
    ),
    model = ifelse(
      name %in% c("TEAM", "DATA", "PROJECT"),
      "A priori assigned", "Factor analysis"
    )
  ) %>%
  ggplot2::ggplot() +
  ggplot2::geom_col(
    ggplot2::aes(y = Factor, x = value, fill = model),
    position = "dodge", width = 0.5
  ) +
  ggplot2::facet_grid(cols = dplyr::vars(class)) +
  ggplot2::scale_x_continuous(
    name = "Normalized loadings", breaks = c(0, 0.5),
    labels = ~ sprintf("%.1f", .), limits = c(0, 1)
  ) +
  ggplot2::scale_fill_manual(
    name = NULL, values = c("#C7A43C", "#324AA8"),
    guide = ggplot2::guide_legend(nrow = 1)
  ) +
  ggpubr::theme_pubclean() +
  ggplot2::theme(
    legend.position = c(-0.41, -0.05),
    plot.margin = grid::unit(c(0, 0, 0.05, 0), "null"),
    strip.background = ggplot2::element_rect(fill = "white"),
    axis.title = ggplot2::element_text(size = 10)
  )
ggplot2::ggsave("plot_factor_analysis.pdf", width = 7, height = 7)
# Per methodology: how many respondents know / have used it, drawn on top of a
# background bar spanning all respondents who answered the question.
qC %>%
  # Keep only respondents who marked at least one know/used cell.
  dplyr::mutate(answered = know | used) %>%
  dplyr::group_by(id) %>%
  dplyr::mutate(answered = any(answered)) %>%
  dplyr::ungroup() %>%
  dplyr::filter(answered) %>%
  dplyr::group_by(methodology) %>%
  dplyr::summarise(
    know = sum(know),
    used = sum(used),
    # Background bars: every answering respondent.
    dont_know = dplyr::n(),
    dont_used = dplyr::n(),
    # Denominator for the percentages and anchor for the label positions;
    # computed from the data instead of being hard-coded.
    respondents = dplyr::n()
  ) %>%
  dplyr::filter(methodology != "None") %>%
  tidyr::pivot_longer(-c(methodology, respondents)) %>%
  dplyr::mutate(
    classA = stringr::str_detect(name, "know"),
    classB = stringr::str_detect(name, "dont"),
    # Numeric y position matching the reversed discrete axis below; relies on
    # the rows being sorted by methodology with the same number of rows each.
    methodology_num = as.numeric(rev(as.factor(methodology)))
  ) %>%
  ggplot2::ggplot() +
  # Background (total) bars first, so the know/used bars are drawn over them.
  ggplot2::geom_col(
    data = . %>% dplyr::filter(classB),
    ggplot2::aes(x = value, y = methodology, fill = name),
    position = ggplot2::position_dodge2(padding = 0.25, reverse = TRUE),
    width = 0.7
  ) +
  ggplot2::geom_col(
    data = . %>% dplyr::filter(!classB),
    ggplot2::aes(x = value, y = methodology, fill = name),
    position = ggplot2::position_dodge2(padding = 0.25, reverse = TRUE),
    width = 0.7
  ) +
  # "know" row: share that knows it, centred on the foreground bar ...
  ggplot2::geom_text(
    data = . %>% dplyr::filter(!classB, classA),
    ggplot2::aes(
      x = value / 2, y = methodology_num + 0.17,
      label = paste0(round(100 * value / respondents, 1), " %")
    ),
    vjust = 0.5, hjust = 0.5, color = "black", fontface = "bold"
  ) +
  # ... and the complement, centred on the remaining background.
  ggplot2::geom_text(
    data = . %>% dplyr::filter(!classB, classA),
    ggplot2::aes(
      x = (value + respondents) / 2, y = methodology_num + 0.17,
      label = paste0(round(100 * (respondents - value) / respondents, 1), " %")
    ),
    vjust = 0.5, hjust = 0.5, color = "white", fontface = "bold"
  ) +
  # "used" row: the foreground label is only drawn for counts above 10.
  ggplot2::geom_text(
    data = . %>% dplyr::filter(!classB, !classA) %>% dplyr::filter(value > 10),
    ggplot2::aes(
      x = value / 2, y = methodology_num - 0.17,
      label = paste0(round(100 * value / respondents, 1), " %")
    ),
    vjust = 0.5, hjust = 0.5, color = "black", fontface = "bold"
  ) +
  ggplot2::geom_text(
    data = . %>% dplyr::filter(!classB, !classA),
    ggplot2::aes(
      x = (value + respondents) / 2, y = methodology_num - 0.17,
      label = paste0(round(100 * (respondents - value) / respondents, 1), " %")
    ),
    vjust = 0.5, hjust = 0.5, color = "white", fontface = "bold"
  ) +
  ggplot2::scale_y_discrete(name = "Methodology", limits = rev) +
  ggplot2::scale_x_continuous(name = "Survey Respondents") +
  ggplot2::scale_fill_manual(
    name = NULL,
    values = c("#F5CF62", "#A88622", "#8099FF", "#324AA8"),
    breaks = c("know", "dont_know", "used", "dont_used"),
    labels = c("Know about it", "Did not know", "Used it", "Did not use")
  ) +
  ggpubr::theme_pubclean() +
  ggplot2::theme(
    plot.margin = grid::unit(c(0, 0, 0, 0), "null"),
    legend.position = "top"
  )
ggplot2::ggsave("plot_methodology_use.pdf", width = 7, height = 7)
# Stacked bar per question for seven of the qA questions (entries 3 to 9 of
# `cols`), with the share and the answer text written on every segment.
cols <- qA %>% dplyr::select(-id) %>% colnames
qA %>%
dplyr::select(cols[c(3, 4, 5, 6, 7, 8, 9)]) %>%
tidyr::pivot_longer(dplyr::everything()) %>%
dplyr::count(name, value) %>%
# Fold the explicit opt-out answer and missing answers into one "_NA_" level.
dplyr::mutate(value = ifelse(value == "I'd prefer not to answer", "_NA_", value)) %>%
dplyr::mutate(value = as.factor(ifelse(is.na(value), "_NA_", value))) %>%
dplyr::arrange(name, value) %>%
dplyr::group_by(name) %>%
dplyr::mutate(
# Reverse running total of n within a question; `total_cumsum - n/2` is used
# below as the x position of each segment's labels.
total_cumsum = rev(cumsum(rev(n))),
total = sum(n),
total_percentage = 100*n/total,
label = paste0(round(total_percentage, 1), "%"),
) %>%
dplyr::ungroup() %>%
# Numeric code of each question, used to place text on the discrete y axis.
dplyr::mutate(name_num = name %>% as.factor %>% as.numeric) %>%
ggplot2::ggplot()+
ggplot2::geom_col(
ggplot2::aes(x=n, y=stringr::str_wrap(name, 20), group=value), color="black", fill="white",
position = ggplot2::position_stack(), width=0.3, show.legend = T)+
# Percentage labels, only for segments above 4 %.
# NOTE(review): `8 - name_num` mirrors the codes to follow the reversed y axis
# (limits = rev); 8 is one more than the seven questions selected above.
ggplot2::geom_text(
data = . %>% dplyr::filter(total_percentage > 4),
ggplot2::aes(x=total_cumsum - n/2, y=8 - name_num - 0, label=label),
hjust = 0.5, vjust=0.5
)+
# Answer text slightly above each segment; "_NA_" is shown as "NA".
ggplot2::geom_label(
data = . %>% dplyr::mutate(value = stringr::str_replace(value, "_NA_", "NA")),
ggplot2::aes(x=total_cumsum - n/2, y=8 - name_num + 0.18, label=stringr::str_wrap(value, 15)),
lineheight=0.8, size=4, hjust = 0.5, vjust=0, label.size = NA, label.padding = grid::unit(0.5, "mm")
)+
ggplot2::scale_x_continuous(name = "Survey Respondents")+
ggplot2::scale_y_discrete(name = "Question", limits=rev)+
ggpubr::theme_pubclean()+
ggplot2::theme(
legend.position = "top"
)
ggplot2::ggsave("plot_questions.pdf", width=7, height = 7)
# Stacked bars of the score distribution of every qB item, ordered by the
# item's weighted average, with "avg ± std" printed to the right of each bar.
qB %>%
  tidyr::pivot_longer(-id) %>%
  dplyr::count(name, value) %>%
  dplyr::group_by(name) %>%
  # `mutate` is exported by dplyr, so it is reached with `::`.
  dplyr::mutate(
    # Reverse running total of n; used to position the segment labels.
    total_cumsum = rev(cumsum(rev(n))),
    total = sum(n),
    total_percentage = 100 * n / total,
    label = paste0(round(total_percentage, 1), "%"),
    top = sum((value == 5) * n),
    # Count-weighted mean and (population) standard deviation of the scores.
    avg = sum(value * n / total),
    std = sqrt(sum((value - avg)^2 * n / total))
  ) %>%
  dplyr::ungroup() %>%
  # Numeric y position of each item after ordering by average.
  dplyr::mutate(
    name_num = reorder(name, avg) %>% as.factor() %>% as.numeric()
  ) %>%
  ggplot2::ggplot() +
  ggplot2::geom_col(
    ggplot2::aes(
      x = n,
      y = reorder(stringr::str_wrap(name, 250), avg),
      fill = factor(value)
    ),
    position = ggplot2::position_stack(), width = 0.6, show.legend = TRUE
  ) +
  # Percentage labels, only for segments above 7 %.
  ggplot2::geom_text(
    data = . %>% dplyr::filter(total_percentage > 7),
    ggplot2::aes(x = total_cumsum - n / 2, y = name_num, label = label),
    hjust = 0.5, vjust = 0.5, size = 3 # , fontface="italic"
  ) +
  ggplot2::geom_label(
    data = . %>% dplyr::distinct(name_num, avg, std),
    ggplot2::aes(
      x = 205, y = name_num,
      label = paste0(sprintf("%.2f", avg), " \u00B1 ", sprintf("%.2f", std))
    ),
    hjust = 0.5, vjust = 0.5, size = 3, color = "black", label.size = NA
  ) +
  # Column header for the avg ± std labels, drawn above the panel (clip off).
  ggplot2::annotate(
    "text", x = 205, y = 16, label = "Weighted\nAvg \u00B1 Std",
    size = 3, color = "black", hjust = 0.5, vjust = 0.5
  ) +
  ggplot2::scale_x_continuous(
    name = "Survey Respondents", breaks = c(0, 50, 100, 150)
  ) +
  ggplot2::scale_y_discrete(name = "Factor") +
  ggplot2::coord_cartesian(ylim = c(1, 15), clip = "off") +
  ggplot2::scale_fill_manual(
    name = NULL,
    labels = c("1-Not important at all", "2-Little importance", "3-Moderately important", "4-Very important", "5-Absolutely essential"),
    values = c("#00BF6F", "#507CB6", "#F9BE00", "#6BC8CD", "#FF8B4F"),
    guide = ggplot2::guide_legend(nrow = 1, byrow = TRUE, reverse = TRUE)
  ) +
  # ggplot2::ggtitle(stringr::str_wrap("Based on your experience, how relevant are these aspects for the development of Data Science projects?", 1000))+
  ggpubr::theme_pubclean() +
  ggplot2::theme(
    plot.margin = grid::unit(c(0.1, 0.02, 0, 0), "null"),
    legend.position = c(0.0, 1.07),
    # legend.position = "top",
    legend.justification = "left",
    legend.direction = "horizontal",
    plot.title.position = "plot",
    legend.text = ggplot2::element_text(size = 9),
    axis.text = ggplot2::element_text(size = 9),
    plot.title = ggplot2::element_text(size = 11)
  )
ggplot2::ggsave("plot_aspects.pdf", width = 12, height = 8)
# Score distribution of every qB item, split by the answer to the
# "follow some Data Science project methodology" question (one facet per item).
qB %>%
  # merge(qA %>% dplyr::select(id, group=`Do you usually work in group or individually?`)) %>%
  merge(qA %>% dplyr::select(id, group = `Do you usually follow some Data Science project methodology?`)) %>%
  # Rows with a missing group are dropped here too: the comparison yields NA
  # and filter() only keeps TRUE.
  dplyr::filter(group != "NA") %>%
  # dplyr::mutate(group = ifelse(group == "Yes", 1, 0)) %>%
  tidyr::pivot_longer(-c(id, group)) %>%
  dplyr::count(name, value, group) %>%
  dplyr::group_by(name, group) %>%
  # `mutate` is exported by dplyr, so it is reached with `::`.
  dplyr::mutate(
    total_cumsum = rev(cumsum(rev(n))),
    total = sum(n),
    total_percentage = 100 * n / total,
    label = paste0(round(total_percentage, 0), "%"),
    top = sum((value == 5) * n),
    # Count-weighted mean and (population) standard deviation per item/group.
    avg = sum(value * n / total),
    std = sqrt(sum((value - avg)^2 * n / total))
  ) %>%
  dplyr::ungroup() %>%
  ggplot2::ggplot() +
  ggplot2::geom_col(
    ggplot2::aes(x = n, y = group, fill = factor(value)),
    position = "fill", width = 1, show.legend = TRUE
  ) +
  # Percentage labels only for scores 4 and 5 with a share above 10 %.
  ggplot2::geom_text(
    data = . %>% dplyr::filter(value %in% c(4, 5), total_percentage > 10),
    ggplot2::aes(x = (total_cumsum) / total, y = group, label = label),
    hjust = 1, vjust = 0.5, size = 3 # , fontface="italic"
  ) +
  ggplot2::geom_label(
    data = . %>% dplyr::distinct(name, group, avg, std),
    ggplot2::aes(
      x = 1.1, y = group,
      label = paste0(sprintf("%.2f", avg), "\u00B1", sprintf("%.2f", std))
    ),
    hjust = 0.5, vjust = 0.5, size = 3, color = "black", label.size = NA
  ) +
  # ggplot2::annotate("text", x = 1.1, y=16, label="Weighted\nAverage", size=3, color="black", hjust=0.5, vjust=0.5)+
  ggplot2::facet_grid(rows = dplyr::vars(name), switch = "y") +
  ggplot2::scale_x_continuous(name = "Survey Respondents") +
  ggplot2::scale_y_discrete(name = NULL, position = "right") +
  # ggplot2::coord_cartesian(ylim=c(1,15), clip = "off")+
  ggplot2::scale_fill_manual(
    name = NULL,
    labels = c("1-Not important at all", "2-Little importance", "3-Moderately important", "4-Very important", "5-Absolutely essential"),
    values = c("#00BF6F", "#507CB6", "#F9BE00", "#6BC8CD", "#FF8B4F"),
    guide = ggplot2::guide_legend(nrow = 1, byrow = TRUE, reverse = TRUE)
  ) +
  ggpubr::theme_pubclean() +
  ggplot2::theme(
    plot.margin = grid::unit(c(0, 0, 0, 0), "null"),
    # legend.position = c(0, 1.07),
    legend.justification = "left",
    legend.direction = "horizontal",
    legend.text = ggplot2::element_text(size = 9),
    axis.text = ggplot2::element_text(size = 9),
    strip.background = ggplot2::element_blank(),
    strip.text.y.left = ggplot2::element_text(angle = 0, hjust = 1)
  )
ggplot2::ggsave("plot_comparison.pdf", width = 12, height = 9)
# Mann-Whitney U-test
# For every qB item, a two-sided unpaired Wilcoxon test of the scores between
# the "No" and "Yes" answers to the methodology question, with significance
# labels appended.
qB %>%
# merge(qA %>% dplyr::select(id, group=`Do you usually work in group or individually?`)) %>%
merge(qA %>% dplyr::select(id, group=`Do you usually follow some Data Science project methodology?`)) %>%
dplyr::filter(group != "NA") %>%
# Only the two groups being compared are kept.
dplyr::filter(group %in% c("No", "Yes")) %>%
tidyr::pivot_longer(-c(id, group)) %>%
# One test per item.
dplyr::group_by(name) %>%
# rstatix::t_test(value ~ group, paired = F, var.equal = F, alternative = "less", conf.level = 0.90, detailed = T) %>%
rstatix::wilcox_test(value ~ group, paired = F, alternative = "two.sided", conf.level = 0.95, detailed = F) %>%
# rstatix::pairwise_t_test(value ~ group, paired = F, detailed = T) %>%
rstatix::add_significance()
# Average score of every qB item per answer to the methodology question, with
# a bar linking the "No" and "Yes" averages of each item.
qB %>%
  # merge(qA %>% dplyr::select(id, group=`Do you usually work in group or individually?`)) %>%
  merge(qA %>% dplyr::select(id, group = `Do you usually follow some Data Science project methodology?`)) %>%
  # Rows with a missing group are dropped here too: the comparison yields NA
  # and filter() only keeps TRUE.
  dplyr::filter(group != "NA") %>%
  tidyr::pivot_longer(-c(id, group)) %>%
  dplyr::count(name, value, group) %>%
  dplyr::group_by(name, group) %>%
  # `summarise` is exported by dplyr, so it is reached with `::`.
  dplyr::summarise(
    total = sum(n),
    # Count-weighted mean and (population) standard deviation per item/group.
    avg = sum(value * n / total),
    std = sqrt(sum((value - avg)^2 * n / total))
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(name = name %>% stringr::str_wrap(40)) %>%
  ggplot2::ggplot() +
  ggplot2::geom_point(
    ggplot2::aes(x = avg, y = reorder(name, avg), color = group)
  ) +
  ggrepel::geom_text_repel(
    ggplot2::aes(x = avg, y = name, label = sprintf("%.2f", avg), color = group),
    size = 3.5, nudge_y = 0.3, hjust = 0.5, vjust = 0.5, direction = "x",
    show.legend = FALSE
  ) +
  # ggplot2::scale_color_viridis_d()+
  ggplot2::scale_color_manual(
    name = "Professionals address to DS methodology",
    values = c("gray", "#DE350D", "#049160"),
    guide = ggplot2::guide_legend(
      title.position = "left", nrow = 1, reverse = TRUE
    )
  ) +
  # A second colour scale is needed for the range bars below.
  ggnewscale::new_scale_color() +
  # Bar from the "No" average to the "Yes" average of each item, coloured by
  # whether "Yes" is the higher of the two.
  ggplot2::geom_linerange(
    data = . %>%
      dplyr::select(-total, -std) %>%
      dplyr::filter(group %in% c("No", "Yes")) %>%
      dplyr::mutate(name = reorder(name, avg)) %>%
      tidyr::pivot_wider(names_from = group, values_from = avg),
    ggplot2::aes(xmin = `No`, y = name, xmax = `Yes`, color = `Yes` > `No`),
    position = ggplot2::position_nudge(y = -0.2),
    size = 2
  ) +
  # NOTE(review): star markers at hard-coded rows 5, 8, 9 and 10 - presumably
  # the items flagged by the significance test; confirm when the data changes.
  ggplot2::annotate(
    "text", x = 4.5, y = c(5, 8, 9, 10), label = "\u2731",
    color = "#F06800", size = 4
  ) +
  ggplot2::scale_color_manual(values = c("#DE350D", "#0EDF96"), guide = "none") +
  # ggplot2::scale_color_gradient2(low="#DE350D", mid="white", high="#0EDF96")+
  ggplot2::scale_x_continuous(name = "Score") +
  ggplot2::scale_y_discrete(name = "Factor") +
  ggpubr::theme_pubclean() +
  ggplot2::theme(
    plot.margin = grid::unit(c(0.06, 0, 0, 0), "null"),
    legend.box = "vertical",
    legend.position = c(0.2, 1.03),
    legend.title = ggplot2::element_text(size = 10),
    axis.text.y = ggplot2::element_text(hjust = 1)
  )
## `summarise()` has grouped output by 'name'. You can override using the `.groups` argument.
ggplot2::ggsave("plot_improvement.pdf", width = 7, height = 7, device = cairo_pdf)
https://www.promptcloud.com/blog/exploratory-factor-analysis-in-r/
https://m-clark.github.io/posts/2020-04-10-psych-explained/
https://www.geo.fu-berlin.de/en/v/soga/Geodata-analysis/factor-analysis/A-simple-example-of-FA/index.html