This file documents a reanalysis, using Principal Components Analysis (PCA), of Franco-Provençal questionnaire data. The aim is to identify questionnaire items that are potentially redundant, i.e. that tap into the same latent variable, and to investigate procedures for collapsing across such questions.
First, we load the data into R and make a correlation matrix to see how the variables are related to each other:

require(readxl)
## Loading required package: readxl
## set working directory
setwd("/Users/christopherstewart/Desktop/FrPr_PCA")
## actually load data into R and take "informants" column out of the dataset
data <- read_excel("RECODED March23 PCA data(march_30).xlsx")
### remove column with informant number
data[1] <- NULL
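As a quick sanity check (an addition, not part of the original script), it is worth confirming that every remaining column is numeric, since cor() and prcomp() below require an all-numeric data frame:

## all recoded questionnaire columns should be numeric
stopifnot(all(sapply(data, is.numeric)))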
## make correlation matrix
suppressPackageStartupMessages(require("Hmisc"))
## Warning: package 'Hmisc' was built under R version 3.2.3
## Warning: package 'ggplot2' was built under R version 3.2.4
data_corrmat <- cor(data)
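Before filtering for the strongest pairs, a base-graphics heatmap offers one quick way to eyeball the overall correlation structure (illustrative only, not part of the original analysis):

## symmetric heatmap of the item-by-item correlation matrix
heatmap(data_corrmat, symm = TRUE, main = "Item correlation structure")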
## make an object that retains highly correlated variables (functions created by Stephen Turner (see https://gist.github.com/stephenturner/3492773))
cor.prob <- function(X, dfr = nrow(X) - 2) {
  ## pairwise-complete correlations
  R <- cor(X, use = "pairwise.complete.obs")
  ## convert each upper-triangle r into an F statistic and then a p-value,
  ## leaving the correlations themselves in the lower triangle
  above <- row(R) < col(R)
  r2 <- R[above]^2
  Fstat <- r2 * dfr / (1 - r2)
  R[above] <- 1 - pf(Fstat, 1, dfr)
  R[row(R) == col(R)] <- NA
  R
}
flattenSquareMatrix <- function(m) {
  ## is.matrix() is safer than comparing class(m) against "matrix"
  if (!is.matrix(m) || (nrow(m) != ncol(m))) stop("Must be a square matrix.")
  if (!identical(rownames(m), colnames(m))) stop("Row and column names must be equal.")
  ## flatten into a long data frame: one row per variable pair, with the
  ## correlation (lower triangle, via t(m)) and its p-value (upper triangle)
  ut <- upper.tri(m)
  data.frame(i = rownames(m)[row(m)[ut]],
             j = rownames(m)[col(m)[ut]],
             cor = t(m)[ut],
             p = m[ut])
}
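As a hypothetical sanity check on these helpers (using R's built-in mtcars data rather than the questionnaire), cor.prob() should reproduce the two-sided p-value that cor.test() reports for the same pair:

## demo on mtcars: flatten three columns and compare against cor.test()
demo <- mtcars[, c("mpg", "wt", "hp")]
flattenSquareMatrix(cor.prob(demo))
cor.test(demo$mpg, demo$wt)$p.value  ## matches the p in the mpg-wt row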
## apply these functions to our correlation matrix to yield a more digestible output, then print out the five strongest correlations (|r| between about .71 and .83, all p < .001)
data_corrmat_1 <- flattenSquareMatrix(cor.prob(data_corrmat))
data_hi_corr <- data_corrmat_1[order(-abs(data_corrmat_1$cor)), ]
print(head(data_hi_corr, n = 5))
## i j cor p
## 6 L.WITH.ELDER.CHILD L.SIBLINGS 0.8311935 5.628766e-06
## 38 L.WITH.PARTNER L.DOCTOR 0.8132550 1.302802e-05
## 66 L.POSTOFFICE L.BANK 0.7841460 4.279938e-05
## 28 L.LOCAL.SHOPS L.AOSTA.SHOPS 0.7590710 1.041643e-04
## 190 L.BOOKS L.INTERNET 0.7074792 4.847592e-04
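Given pairs like these, one simple collapsing procedure, sketched here under the assumption that paired items share the same response scale, is to replace each redundant pair with the mean of its two items (the composite name L.SERVICES is invented for illustration):

## illustrative collapse of one highly correlated pair into a composite
data_collapsed <- as.data.frame(data)  ## plain data frame for easy column surgery
data_collapsed$L.SERVICES <- rowMeans(data_collapsed[c("L.POSTOFFICE", "L.BANK")])
data_collapsed[c("L.POSTOFFICE", "L.BANK")] <- NULL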
Next, we run a PCA on the data. The summary call indicates that the variance is concentrated in the first component (25.3%, versus 10.9% for the second), with subsequent components explaining considerably less, an interpretation reinforced by the accompanying scree plot.

data_pca <- prcomp(data)
summary(data_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.0814 0.7096 0.62607 0.59900 0.56615 0.5404
## Proportion of Variance 0.2526 0.1088 0.08468 0.07752 0.06925 0.0631
## Cumulative Proportion 0.2526 0.3614 0.44611 0.52363 0.59288 0.6560
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.52252 0.48414 0.4477 0.4166 0.40602 0.33481
## Proportion of Variance 0.05899 0.05064 0.0433 0.0375 0.03562 0.02422
## Cumulative Proportion 0.71497 0.76561 0.8089 0.8464 0.88203 0.90625
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.31740 0.30687 0.27997 0.24285 0.21612 0.15710
## Proportion of Variance 0.02177 0.02035 0.01694 0.01274 0.01009 0.00533
## Cumulative Proportion 0.92802 0.94836 0.96530 0.97804 0.98813 0.99346
## PC19 PC20
## Standard deviation 0.14213 0.10024
## Proportion of Variance 0.00436 0.00217
## Cumulative Proportion 0.99783 1.00000
plot(data_pca, type = "l", main = "Variance Explained by Components from PCA")
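For reference, the proportions in the summary above can be recomputed directly from the standard deviations that prcomp() stores; a minimal sketch, not part of the original script:

## variance explained per component, from the sdev slot of the prcomp object
pve <- data_pca$sdev^2 / sum(data_pca$sdev^2)
round(cumsum(pve), 4)  ## cumulative proportions; ~0.2526 for PC1, 1.0 by PC20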
write.csv(data_pca$rotation, "PCA_loadings(FROM MARCH 30 RECODING).csv")
## extract informants' scores on the first five components, to match up with Joe's analysis
data_pca_top_5 <- data_pca$x[, 1:5]
write.csv(data_pca_top_5, "top_5_components_for_informants(FROM MARCH 30 RECODING).csv")
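Finally, to interpret what the retained components measure, the loadings exported above can be inspected directly; as a sketch, here are the questionnaire items with the largest absolute loadings on the first component:

## five items loading most heavily (in absolute value) on PC1
pc1 <- data_pca$rotation[, 1]
head(sort(abs(pc1), decreasing = TRUE), n = 5)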