Load the libraries + functions

Load all the libraries or functions that you will use to for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others can know what they need for the report to compile correctly.

options(Encoding = "UTF-8")
library(Rling)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rgl)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(psych)

The Data

The data is provided as liwc_house_conflict.csv. We collected over 1000 different speeches given on the floor of the US House of Representatives that discussed different war time conflicts with Iraq, Kuwait, Russia, Syria, Iran, and a few others. This data was then processed with the Linguistic Inquiry and Word Count software, which provides a linguistic frequency analysis for many categories.

You should pick 15-20 categories that you think might cluster together and/or be interesting to examine for their register relatedness. You can learn more about the categories by checking out the attached manual starting on page four. Do not use the “total” categories with their subgroups or you might get a singular matrix error. You might also consider running a quick summary on your choosen categories as well, to make sure they are not effectly zero frequency (i.e., most of the informal language ones will be very small percents due to the location of the speech).

Import your data and create a data frame here with only the categories you are interested in.

main_data <- read.csv("liwc_house_conflict.csv")
rownames(main_data) <- main_data[,1]
data <- main_data %>% dplyr::select(Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,posemo,negemo,female,male,see,hear,feel,work,leisure,home,money,relig,death) 

Calculate a MDS

##create distance scores
distances = dist(data, method = "euclidean")

##run the MDS
mds = cmdscale(distances, #distances
               k = 2, #number of dimensions
               eig = T #calculate the eigenvalues
               )
##run the MDS

barplot(mds$eig, #plot the eigenvalues
        xlab = "Dimensions", 
        ylab = "Eigenvalue", 
        main = "Scree plot")

head(sort(mds$eig, decreasing = TRUE),10)
##  [1] 692168.6052 287521.2862 260030.5055 124748.6872  20714.8966
##  [6]  14382.0462   8388.9154   2806.6626   1449.8033    702.8467

Plot the MDS

{
  plot(mds$points, #plot the MDS dimension points
      type = "n", #blank canvas plot
      main = "MDS of US House of Representatives Speeches")
  
  text(mds$points, #plot the dimensions
       labels = rownames(main_data), #label them with the names
       cex = 0.6) #text sizing
}

##run the 3D MDS
mds3 = cmdscale(distances, #distances
               k = 3, #number of dimensions
               eig = T #calculate the eigenvalues
               )

#plot the 3D graph
{
  plot3d(mds3$points, type = "n")
  text3d(mds3$points, texts = rownames(main_data), cex = 0.6)
}

mds4 = cmdscale(distances, #distances
               k = 4, #number of dimensions
               eig = T #calculate the eigenvalues
               )

mds$GOF
## [1] 0.6920615 0.6920615
mds3$GOF
## [1] 0.8757493 0.8757493
mds4$GOF
## [1] 0.9638729 0.9638729
sqrt(sum((distances - dist(mds$points))^2)/sum(distances^2))
## [1] 0.2749429
sqrt(sum((distances - dist(mds3$points))^2)/sum(distances^2))
## [1] 0.1167261
sqrt(sum((distances - dist(mds4$points))^2)/sum(distances^2))
## [1] 0.0336854

Create a Shepard Plot

sh = Shepard(distances, #real Euclidean distances
             mds4$points) #modeled numbers

{
  plot(sh, main = "Shepard Plot", pch = ".")
  lines(sh$x, sh$yf, type = "S")
}

Before you start

correlations = cor(data)
cortest.bartlett(correlations, n = nrow(data))
## $chisq
## [1] 5189.859
## 
## $p.value
## [1] 0
## 
## $df
## [1] 190
KMO(correlations)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = correlations)
## Overall MSA =  0.55
## MSA for each item = 
##  Analytic     Clout Authentic      Tone       WPS    Sixltr       Dic 
##      0.73      0.71      0.41      0.41      0.57      0.72      0.62 
##    posemo    negemo    female      male       see      hear      feel 
##      0.29      0.34      0.62      0.59      0.75      0.76      0.82 
##      work   leisure      home     money     relig     death 
##      0.67      0.57      0.50      0.66      0.74      0.67
psych::describe(data)
##           vars    n  mean    sd median trimmed   mad   min   max range
## Analytic     1 1040 79.84 14.63  83.20   81.69 13.47  9.17 98.90 89.73
## Clout        2 1040 71.87 13.41  72.77   72.30 14.17 17.48 99.00 81.52
## Authentic    3 1040 26.71 15.45  23.59   25.30 14.46  1.00 84.25 83.25
## Tone         4 1040 27.35 25.68  18.92   23.63 22.92  1.00 99.00 98.00
## WPS          5 1040 20.31  4.56  19.82   20.05  4.03 10.39 39.95 29.56
## Sixltr       6 1040 23.92  4.52  23.84   23.92  4.80 11.06 39.18 28.12
## Dic          7 1040 82.46  4.20  82.74   82.69  4.11 66.67 93.30 26.63
## posemo       8 1040  3.16  1.44   2.98    3.06  1.39  0.00  9.94  9.94
## negemo       9 1040  3.62  1.69   3.40    3.52  1.63  0.00 10.10 10.10
## female      10 1040  0.21  0.37   0.00    0.13  0.00  0.00  5.51  5.51
## male        11 1040  0.81  0.84   0.60    0.68  0.70  0.00  6.28  6.28
## see         12 1040  0.44  0.44   0.37    0.38  0.42  0.00  3.61  3.61
## hear        13 1040  0.93  0.62   0.80    0.85  0.50  0.00  4.40  4.40
## feel        14 1040  0.18  0.27   0.10    0.13  0.14  0.00  2.67  2.67
## work        15 1040  3.64  1.67   3.45    3.54  1.57  0.00 11.16 11.16
## leisure     16 1040  0.21  0.30   0.12    0.16  0.18  0.00  2.81  2.81
## home        17 1040  0.27  0.31   0.20    0.22  0.30  0.00  2.63  2.63
## money       18 1040  0.64  0.84   0.42    0.47  0.62  0.00  5.74  5.74
## relig       19 1040  0.33  0.59   0.10    0.19  0.15  0.00  5.39  5.39
## death       20 1040  0.88  0.83   0.64    0.76  0.73  0.00  5.34  5.34
##            skew kurtosis   se
## Analytic  -1.14     1.20 0.45
## Clout     -0.34    -0.10 0.42
## Authentic  0.84     0.52 0.48
## Tone       1.03     0.12 0.80
## WPS        0.75     1.28 0.14
## Sixltr     0.00    -0.31 0.14
## Dic       -0.51     0.19 0.13
## posemo     0.77     1.01 0.04
## negemo     0.58     0.42 0.05
## female     4.58    43.99 0.01
## male       1.71     4.25 0.03
## see        1.74     6.03 0.01
## hear       1.69     4.45 0.02
## feel       2.94    14.50 0.01
## work       0.63     0.61 0.05
## leisure    2.67    12.46 0.01
## home       1.82     5.89 0.01
## money      2.47     7.88 0.03
## relig      3.54    17.46 0.02
## death      1.30     1.93 0.03

How many factors/components?

number_items = fa.parallel(data,
                           fm = "ml", #type of math
                           fa = "both") #look at both efa/pca

## Parallel analysis suggests that the number of factors =  7  and the number of components =  7
sum(number_items$fa.values > 1)
## [1] 2
sum(number_items$fa.values > 0.7)
## [1] 2

Simple structure - run the PCA/EFA

##save it
PCA_fit = principal(data,
                    nfactors = 2, #number of components
                    rotate = "none")

PCA_fit
## Principal Components Analysis
## Call: principal(r = data, nfactors = 2, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##             PC1   PC2     h2   u2 com
## Analytic  -0.72 -0.12 0.5307 0.47 1.1
## Clout      0.56  0.13 0.3277 0.67 1.1
## Authentic  0.17  0.14 0.0486 0.95 1.9
## Tone      -0.04  0.93 0.8619 0.14 1.0
## WPS       -0.28  0.15 0.1018 0.90 1.5
## Sixltr    -0.78 -0.08 0.6151 0.38 1.0
## Dic        0.72  0.20 0.5574 0.44 1.2
## posemo     0.00  0.57 0.3217 0.68 1.0
## negemo     0.05 -0.74 0.5525 0.45 1.0
## female     0.17 -0.01 0.0280 0.97 1.0
## male       0.28 -0.11 0.0899 0.91 1.3
## see        0.27  0.07 0.0758 0.92 1.2
## hear       0.40  0.10 0.1678 0.83 1.1
## feel       0.22 -0.05 0.0493 0.95 1.1
## work      -0.69  0.17 0.5099 0.49 1.1
## leisure    0.10  0.01 0.0102 0.99 1.0
## home      -0.03  0.05 0.0035 1.00 1.5
## money     -0.34  0.23 0.1671 0.83 1.8
## relig      0.25  0.01 0.0639 0.94 1.0
## death      0.20 -0.49 0.2760 0.72 1.3
## 
##                        PC1  PC2
## SS loadings           3.15 2.21
## Proportion Var        0.16 0.11
## Cumulative Var        0.16 0.27
## Proportion Explained  0.59 0.41
## Cumulative Proportion 0.59 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.08 
##  with the empirical chi square  2575.01  with prob <  0 
## 
## Fit based upon off diagonal values = 0.73
fa.plot(PCA_fit, labels = colnames(data))

fa.diagram(PCA_fit)

PCA_fit$rms
## [1] 0.08072
##print it out

Adequate solution

We got 0.08. Although it is higher than 0.06 but still below 0.1. so overall it is good although not great.

  1. Speakers seem to be talking more about negative sentiment than positve sentiment
  2. He uses a lot of emotional tone
  3. I noticed more emphasis on work versus home.