Load the libraries + functions

##r chunk
# Load reticulate, the R-to-Python bridge used by the Python chunks later on.
library(reticulate)
# Print the Python configuration reticulate resolved (interpreter, version, NumPy).
py_config()
## python:         C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/python.exe
## libpython:      C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/python36.dll
## pythonhome:     C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate
## version:        3.6.11 (default, Aug  5 2020, 19:41:03) [MSC v.1916 64 bit (AMD64)]
## Architecture:   64bit
## numpy:          C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/Lib/site-packages/numpy
## numpy_version:  1.19.1
library(FactoMineR)
library(ca)

Simple Correspondence Analysis

##r chunk
# Read the pronoun data set; the CSV is expected in the working directory.
df <- read.csv("GenderPronounsNYT.csv")
# Convert columns 7 through 10 to factors.
df[, 7:10] <- lapply(df[, 7:10], factor)
# Cross-tabulate pronoun (rows) against decade (columns). This table is the
# only value `freq` needs; no intermediate two-column subset is required.
freq <- table(df$DictionaryWord, df$Decade)
freq
##       
##        1990s 2000s 2010s
##   he    1429  1457  1302
##   her    514   525   641
##   hers     4     4     2
##   him    331   348   226
##   his   1384  1350  1333
##   she    338   316   496

The Analysis

Run a simple correspondence analysis on the data.

##r chunk 
# Fit a simple correspondence analysis on the pronoun-by-decade table.
model_sca <- CA(freq)

# Print the eigenvalues along with the row and column tables.
summary(model_sca)
## 
## Call:
## CA(X = freq) 
## 
## The chi square of independence between the two variables is equal to 108.5027 (p-value =  0.000000000000000001068973 ).
## 
## Eigenvalues
##                        Dim.1   Dim.2
## Variance               0.009   0.000
## % of var.             98.942   1.058
## Cumulative % of var.  98.942 100.000
## 
## Rows
##         Iner*1000    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
## he    |     0.815 |  0.048  9.040  0.993 | -0.004  6.086  0.007 |
## her   |     1.474 | -0.101 16.032  0.973 | -0.017 41.027  0.027 |
## hers  |     0.067 |  0.282  0.740  0.993 |  0.024  0.510  0.007 |
## him   |     2.412 |  0.179 26.914  0.998 | -0.008  4.729  0.002 |
## his   |     0.083 |  0.011  0.450  0.486 |  0.011 44.551  0.514 |
## she   |     4.192 | -0.209 46.823  0.999 |  0.006  3.097  0.001 |
## 
## Columns
##         Iner*1000    Dim.1    ctr   cos2    Dim.2    ctr   cos2  
## 1990s |     1.125 |  0.057 11.988  0.954 |  0.013 54.678  0.046 |
## 2000s |     1.996 |  0.077 21.833  0.979 | -0.011 44.833  0.021 |
## 2010s |     5.921 | -0.133 66.178  1.000 | -0.001  0.488  0.000 |
  • What do the inertia values tell you about the dimensionality of the data?
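  • The first dimension captures 98.94% of the variance and the second captures the remaining 1.06%, so the two dimensions together account for all of it; the data are essentially one-dimensional.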

Create a 2D plot of the data.

##r chunk
# Draw the two-dimensional map for the fitted simple correspondence analysis.
plot(model_sca)

  • What can you tell about the pronoun usage from examining this plot?
  • Usage of "she" and "her" increased in the 2010s.

Multiple Correspondence Analysis

##r chunk
# Fit a multiple correspondence analysis on columns 4 through 10 of df,
# with the automatic graphs turned off.
mca_model_1 <- MCA(df[4:10], graph = FALSE)
# Print the eigenvalues and the individual, category, and variable tables.
summary(mca_model_1)
## 
## Call:
## MCA(X = df[4:10], graph = FALSE) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               0.171   0.153   0.145   0.144   0.142   0.142   0.140
## % of var.             13.271  11.912  11.289  11.226  11.061  11.021  10.903
## Cumulative % of var.  13.271  25.183  36.472  47.698  58.759  69.780  80.683
##                        Dim.8   Dim.9
## Variance               0.132   0.116
## % of var.             10.278   9.038
## Cumulative % of var.  90.962 100.000
## 
## Individuals (the 10 first)
##                           Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
## 1                      |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
## 2                      |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
## 3                      |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
## 4                      |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
## 5                      |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
## 6                      |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
## 7                      |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
## 8                      |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
## 9                      |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
## 10                     |  0.059  0.000  0.006 | -0.173  0.002  0.049 |  0.311
##                           ctr   cos2  
## 1                       0.006  0.158 |
## 2                       0.006  0.158 |
## 3                       0.006  0.158 |
## 4                       0.006  0.158 |
## 5                       0.006  0.158 |
## 6                       0.006  0.158 |
## 7                       0.006  0.158 |
## 8                       0.006  0.158 |
## 9                       0.006  0.158 |
## 10                      0.006  0.158 |
## 
## Categories (the 10 first)
##                            Dim.1     ctr    cos2  v.test     Dim.2     ctr
## 1990s                  |  -0.012   0.004   0.000  -0.908 |  -0.020   0.013
## 2000s                  |   0.059   0.096   0.002   4.551 |  -0.305   2.887
## 2010s                  |  -0.047   0.062   0.001  -3.643 |   0.325   3.281
## Arts and Entertainment |   0.745  23.238   0.555  81.611 |   0.025   0.030
## Sports                 |  -0.745  23.238   0.555 -81.611 |  -0.025   0.030
## Object                 |   1.263  28.749   0.438  72.466 |  -0.016   0.005
## Possessive             |  -0.108   0.329   0.006  -8.451 |   0.907  26.078
## Subject                |  -0.529  10.432   0.224 -51.894 |  -0.685  19.470
## Career_FALSE           |   0.040   0.130   0.070  28.966 |  -0.102   0.956
## Career_TRUE            |  -1.756   5.725   0.070 -28.966 |   4.516  42.175
##                           cos2  v.test     Dim.3     ctr    cos2  v.test  
## 1990s                    0.000  -1.562 |  -0.828  22.497   0.343 -64.141 |
## 2000s                    0.046 -23.601 |   0.102   0.341   0.005   7.892 |
## 2010s                    0.053  25.163 |   0.726  17.302   0.264  56.249 |
## Arts and Entertainment   0.001   2.787 |  -0.038   0.072   0.001  -4.179 |
## Sports                   0.001  -2.787 |   0.038   0.072   0.001   4.179 |
## Object                   0.000  -0.928 |   0.314   2.092   0.027  18.031 |
## Possessive               0.423  71.281 |  -0.456   6.957   0.107 -35.842 |
## Subject                  0.376 -67.168 |   0.196   1.686   0.031  19.243 |
## Career_FALSE             0.462 -74.488 |  -0.027   0.069   0.032 -19.480 |
## Career_TRUE              0.462  74.488 |   1.181   3.044   0.032  19.480 |
## 
## Categorical variables (eta2)
##                          Dim.1 Dim.2 Dim.3  
## Decade                 | 0.002 0.066 0.408 |
## ArticleType            | 0.555 0.001 0.001 |
## PronounType            | 0.472 0.488 0.109 |
## Career                 | 0.070 0.462 0.032 |
## Family                 | 0.020 0.032 0.173 |
## Great                  | 0.004 0.020 0.257 |
## Beautiful              | 0.071 0.003 0.036 |
##r chunk
# Plot the MCA map with the individuals hidden (invisible = "ind").
plot(mca_model_1, col.ind = "gray", invisible = "ind")

##r chunk
# Variables associated with dimension 1 (R2 and p-value per variable).
dimdesc(mca_model_1)[["Dim 1"]][["quali"]]
##                      R2
## ArticleType 0.555076817
## PronounType 0.471895900
## Beautiful   0.070990821
## Career      0.069926295
## Family      0.020036773
## Great       0.004484384
## Decade      0.001934022
##                                                                                                                                                                                                                p.value
## ArticleType 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## PronounType 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Beautiful   0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003875365
## Career      0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003760586697
## Family      0.00000000000000000000000000000000000000000000000000000094880385970060989947807983391214747825870290398597717285156250000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Great       0.00000000000020856693443996706140153296615835643024183809757232666015625000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Decade      0.00000905253969254840187667887629885399292106740176677703857421875000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
# Categories associated with dimension 1 (estimate and p-value per category).
dimdesc(mca_model_1)[["Dim 1"]][["category"]]
##                                       Estimate
## PronounType=Object                  0.43534713
## ArticleType=Arts and Entertainment  0.30774601
## Beautiful=Beautiful_TRUE            0.97943255
## Career=Career_FALSE                 0.37095697
## Family=Family_TRUE                  0.29142677
## Great=Great_TRUE                    0.12171325
## Decade=2000s                        0.02426956
## Decade=2010s                       -0.01942983
## Great=Great_FALSE                  -0.12171325
## PronounType=Possessive             -0.13058081
## Family=Family_FALSE                -0.29142677
## Career=Career_TRUE                 -0.37095697
## Beautiful=Beautiful_FALSE          -0.97943255
## PronounType=Subject                -0.30476631
## ArticleType=Sports                 -0.30774601
##                                                                                                                                                                                                                                       p.value
## PronounType=Object                 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## ArticleType=Arts and Entertainment 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Beautiful=Beautiful_TRUE           0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003875365
## Career=Career_FALSE                0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003760586696
## Family=Family_TRUE                 0.00000000000000000000000000000000000000000000000000000094880385969127081398355461061555615742690861225128173828125000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Great=Great_TRUE                   0.00000000000020856693444116522122375873005140078930708114057779312133789062500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Decade=2000s                       0.00000529751946315421301872405912547492334851995110511779785156250000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Decade=2010s                       0.00026821237299664435352677949175870253384346142411231994628906250000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Great=Great_FALSE                  0.00000000000020856693444116522122375873005140078930708114057779312133789062500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## PronounType=Possessive             0.00000000000000002595609492685777930235171973016861102223629131913185119628906250000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Family=Family_FALSE                0.00000000000000000000000000000000000000000000000000000094880385969133862276297941562575033458415418863296508789062500000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Career=Career_TRUE                 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003760586696
## Beautiful=Beautiful_FALSE          0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003875365
## PronounType=Subject                0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## ArticleType=Sports                 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

Simple Categories

##r chunk
# Refit the MCA without columns 1 and 3 of df, treating the first remaining
# column as a supplementary qualitative variable; automatic graphs are off.
mca_model_2 <- MCA(df[, -c(1, 3)], quali.sup = 1, graph = FALSE)

Create a 2D plot of your category analysis.

##r chunk 
# Plot the second MCA map with the individuals hidden (invisible = "ind").
plot(mca_model_2, invisible = "ind", col.ind = "gray")

##r chunk
# Draw ellipses for variable 1 only (keepvar = 1), using the function's
# default `means` setting.
plotellipses(mca_model_2, keepvar = 1, label = "quali")

Create a 95% CI type plot for the category.

##r chunk
# Draw ellipses for variable 1 only (keepvar = 1) with `means` switched off.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently change this call.
plotellipses(mca_model_2, means = FALSE, keepvar = 1, label = "quali")

  • What can you tell about the categories from these plots? Are they distinct or overlapping?
  • The confidence intervals overlap

Run a MCA in Python

##r chunk
# R code: drop the first column of df. The Python chunk below reads this
# object through reticulate as `r.mca_df`.
mca_df <- df[-1]
##python chunk 
import pandas
import prince

# Pull the R data frame `mca_df` into Python through reticulate's `r` object
# and wrap it in a pandas DataFrame.
data = pandas.DataFrame(r.mca_df)
# Preview the first rows to confirm the transfer worked.
data.head()
##       Gender DictionaryWord Decade  ... Family  Great Beautiful
## 0  Masculine             he  2010s  ...  FALSE  FALSE     FALSE
## 1  Masculine             he  2010s  ...  FALSE  FALSE     FALSE
## 2  Masculine             he  2010s  ...  FALSE  FALSE     FALSE
## 3  Masculine             he  2010s  ...  FALSE  FALSE     FALSE
## 4  Masculine             he  2010s  ...  FALSE  FALSE     FALSE
## 
## [5 rows x 9 columns]
# List the column names that will be passed to the MCA.
data.columns
## Index(['Gender', 'DictionaryWord', 'Decade', 'ArticleType', 'PronounType',
##        'Career', 'Family', 'Great', 'Beautiful'],
##       dtype='object')
# Configure a two-component MCA with a fixed random seed and fit it to `data`.
mca = prince.MCA(
    n_components=2,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42,
).fit(data)

# Explained inertia for each fitted component.
mca.explained_inertia_
## [0.17769986688301184, 0.13539975137338306]

Plot the Results

##python chunk
import matplotlib
# Select the non-interactive Agg backend so the figure renders without a display.
matplotlib.use('Agg')

# Keep the Axes returned by plot_coordinates in `ax`. Chaining `.legend()`
# onto this call would bind `ax` to the Legend object instead.
ax = mca.plot_coordinates(
    X=data,
    ax=None,
    figsize=(10, 10),
    show_row_points=True,
    row_points_size=10,
    show_row_labels=False,
    show_column_points=True,
    column_points_size=30,
    show_column_labels=False,
    legend_n_cols=2,
)
# Assign the result so the Legend repr is not echoed into the chunk output.
legend = ax.legend(loc='upper right')

ax.get_figure()

Explore the differences