##r chunk
# Load reticulate (the R-to-Python bridge used by the Python chunks) and
# print which Python installation and numpy build it is bound to.
library(reticulate)
py_config()
## python: C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/python.exe
## libpython: C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/python36.dll
## pythonhome: C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate
## version: 3.6.11 (default, Aug 5 2020, 19:41:03) [MSC v.1916 64 bit (AMD64)]
## Architecture: 64bit
## numpy: C:/Users/raavi/AppData/Local/r-miniconda/envs/r-reticulate/Lib/site-packages/numpy
## numpy_version: 1.19.1
library(FactoMineR)
library(ca)
##r chunk
# Read the NYT gender-pronoun data set from the working directory.
df <- read.csv("GenderPronounsNYT.csv")
# Convert columns 7 to 10 to factors so the later MCA calls treat them as
# categorical variables.
df[, 7:10] <- lapply(df[, 7:10], factor)
# Cross-tabulate dictionary word (rows) against decade (columns). The earlier
# two-column subset assigned to `freq` was overwritten right away, so it is
# dropped here.
freq <- table(df$DictionaryWord, df$Decade)
freq
##
## 1990s 2000s 2010s
## he 1429 1457 1302
## her 514 525 641
## hers 4 4 2
## him 331 348 226
## his 1384 1350 1333
## she 338 316 496
Run a simple correspondence analysis on the data.
##r chunk
# Fit a simple correspondence analysis on the word-by-decade contingency
# table, then print its eigenvalues and row/column contributions.
model_sca <- CA(X = freq)
summary(model_sca)
##
## Call:
## CA(X = freq)
##
## The chi square of independence between the two variables is equal to 108.5027 (p-value = 0.000000000000000001068973 ).
##
## Eigenvalues
## Dim.1 Dim.2
## Variance 0.009 0.000
## % of var. 98.942 1.058
## Cumulative % of var. 98.942 100.000
##
## Rows
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## he | 0.815 | 0.048 9.040 0.993 | -0.004 6.086 0.007 |
## her | 1.474 | -0.101 16.032 0.973 | -0.017 41.027 0.027 |
## hers | 0.067 | 0.282 0.740 0.993 | 0.024 0.510 0.007 |
## him | 2.412 | 0.179 26.914 0.998 | -0.008 4.729 0.002 |
## his | 0.083 | 0.011 0.450 0.486 | 0.011 44.551 0.514 |
## she | 4.192 | -0.209 46.823 0.999 | 0.006 3.097 0.001 |
##
## Columns
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## 1990s | 1.125 | 0.057 11.988 0.954 | 0.013 54.678 0.046 |
## 2000s | 1.996 | 0.077 21.833 0.979 | -0.011 44.833 0.021 |
## 2010s | 5.921 | -0.133 66.178 1.000 | -0.001 0.488 0.000 |
Create a 2D plot of the data.
##r chunk
# Draw the default plot for the fitted correspondence analysis model.
plot(model_sca)
##r chunk
# Multiple correspondence analysis on columns 4 to 10 of the data; the
# built-in graphs are suppressed and plotted separately later.
mca_model_1 <- MCA(X = df[4:10], graph = FALSE)
summary(mca_model_1)
##
## Call:
## MCA(X = df[4:10], graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 0.171 0.153 0.145 0.144 0.142 0.142 0.140
## % of var. 13.271 11.912 11.289 11.226 11.061 11.021 10.903
## Cumulative % of var. 13.271 25.183 36.472 47.698 58.759 69.780 80.683
## Dim.8 Dim.9
## Variance 0.132 0.116
## % of var. 10.278 9.038
## Cumulative % of var. 90.962 100.000
##
## Individuals (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 1 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## 2 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## 3 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## 4 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## 5 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## 6 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## 7 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## 8 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## 9 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## 10 | 0.059 0.000 0.006 | -0.173 0.002 0.049 | 0.311
## ctr cos2
## 1 0.006 0.158 |
## 2 0.006 0.158 |
## 3 0.006 0.158 |
## 4 0.006 0.158 |
## 5 0.006 0.158 |
## 6 0.006 0.158 |
## 7 0.006 0.158 |
## 8 0.006 0.158 |
## 9 0.006 0.158 |
## 10 0.006 0.158 |
##
## Categories (the 10 first)
## Dim.1 ctr cos2 v.test Dim.2 ctr
## 1990s | -0.012 0.004 0.000 -0.908 | -0.020 0.013
## 2000s | 0.059 0.096 0.002 4.551 | -0.305 2.887
## 2010s | -0.047 0.062 0.001 -3.643 | 0.325 3.281
## Arts and Entertainment | 0.745 23.238 0.555 81.611 | 0.025 0.030
## Sports | -0.745 23.238 0.555 -81.611 | -0.025 0.030
## Object | 1.263 28.749 0.438 72.466 | -0.016 0.005
## Possessive | -0.108 0.329 0.006 -8.451 | 0.907 26.078
## Subject | -0.529 10.432 0.224 -51.894 | -0.685 19.470
## Career_FALSE | 0.040 0.130 0.070 28.966 | -0.102 0.956
## Career_TRUE | -1.756 5.725 0.070 -28.966 | 4.516 42.175
## cos2 v.test Dim.3 ctr cos2 v.test
## 1990s 0.000 -1.562 | -0.828 22.497 0.343 -64.141 |
## 2000s 0.046 -23.601 | 0.102 0.341 0.005 7.892 |
## 2010s 0.053 25.163 | 0.726 17.302 0.264 56.249 |
## Arts and Entertainment 0.001 2.787 | -0.038 0.072 0.001 -4.179 |
## Sports 0.001 -2.787 | 0.038 0.072 0.001 4.179 |
## Object 0.000 -0.928 | 0.314 2.092 0.027 18.031 |
## Possessive 0.423 71.281 | -0.456 6.957 0.107 -35.842 |
## Subject 0.376 -67.168 | 0.196 1.686 0.031 19.243 |
## Career_FALSE 0.462 -74.488 | -0.027 0.069 0.032 -19.480 |
## Career_TRUE 0.462 74.488 | 1.181 3.044 0.032 19.480 |
##
## Categorical variables (eta2)
## Dim.1 Dim.2 Dim.3
## Decade | 0.002 0.066 0.408 |
## ArticleType | 0.555 0.001 0.001 |
## PronounType | 0.472 0.488 0.109 |
## Career | 0.070 0.462 0.032 |
## Family | 0.020 0.032 0.173 |
## Great | 0.004 0.020 0.257 |
## Beautiful | 0.071 0.003 0.036 |
##r chunk
# Plot the first MCA model with the individuals hidden so only the
# categories are drawn.
plot(mca_model_1, invisible = "ind", col.ind = "gray")
##r chunk
# Variable-level description of the first MCA dimension (R2 and p-value per
# categorical variable).
dimdesc(mca_model_1)[["Dim 1"]]$quali
## R2
## ArticleType 0.555076817
## PronounType 0.471895900
## Beautiful 0.070990821
## Career 0.069926295
## Family 0.020036773
## Great 0.004484384
## Decade 0.001934022
## p.value
## ArticleType 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## PronounType 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Beautiful 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003875365
## Career 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003760586697
## Family 0.00000000000000000000000000000000000000000000000000000094880385970060989947807983391214747825870290398597717285156250000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Great 0.00000000000020856693443996706140153296615835643024183809757232666015625000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Decade 0.00000905253969254840187667887629885399292106740176677703857421875000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
# Category-level description of the first MCA dimension (estimate and
# p-value per category).
dimdesc(mca_model_1)[["Dim 1"]]$category
## Estimate
## PronounType=Object 0.43534713
## ArticleType=Arts and Entertainment 0.30774601
## Beautiful=Beautiful_TRUE 0.97943255
## Career=Career_FALSE 0.37095697
## Family=Family_TRUE 0.29142677
## Great=Great_TRUE 0.12171325
## Decade=2000s 0.02426956
## Decade=2010s -0.01942983
## Great=Great_FALSE -0.12171325
## PronounType=Possessive -0.13058081
## Family=Family_FALSE -0.29142677
## Career=Career_TRUE -0.37095697
## Beautiful=Beautiful_FALSE -0.97943255
## PronounType=Subject -0.30476631
## ArticleType=Sports -0.30774601
## p.value
## PronounType=Object 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## ArticleType=Arts and Entertainment 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Beautiful=Beautiful_TRUE 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003875365
## Career=Career_FALSE 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003760586696
## Family=Family_TRUE 0.00000000000000000000000000000000000000000000000000000094880385969127081398355461061555615742690861225128173828125000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Great=Great_TRUE 0.00000000000020856693444116522122375873005140078930708114057779312133789062500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Decade=2000s 0.00000529751946315421301872405912547492334851995110511779785156250000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Decade=2010s 0.00026821237299664435352677949175870253384346142411231994628906250000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Great=Great_FALSE 0.00000000000020856693444116522122375873005140078930708114057779312133789062500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## PronounType=Possessive 0.00000000000000002595609492685777930235171973016861102223629131913185119628906250000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Family=Family_FALSE 0.00000000000000000000000000000000000000000000000000000094880385969133862276297941562575033458415418863296508789062500000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## Career=Career_TRUE 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003760586696
## Beautiful=Beautiful_FALSE 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000003875365
## PronounType=Subject 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
## ArticleType=Sports 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
##r chunk
# Second MCA: drop columns 1 and 3, and declare the first remaining column
# a supplementary qualitative variable (quali.sup = 1).
# NOTE(review): that first remaining column appears to be Gender; confirm
# against the column order of the CSV.
mca_model_2 <- MCA(X = df[, -c(1, 3)], quali.sup = 1, graph = FALSE)
Create a 2D plot of your category analysis.
##r chunk
# Plot the second MCA model with the individuals hidden (invisible = "ind").
plot(mca_model_2, invisible = "ind", col.ind = "gray")
##r chunk
# Draw ellipses for the first variable only (keepvar = 1), labelling the
# categories (label = "quali").
plotellipses(mca_model_2, keepvar = 1, label = "quali")
Create a 95% CI type plot for the category.
##r chunk
# Ellipse plot for the first variable with means = FALSE.
# NOTE(review): per ?plotellipses this should draw the ellipses around the
# individuals of each category rather than around the category means; confirm.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
plotellipses(mca_model_2, means = FALSE, keepvar = 1, label = "quali")
# Copy of the data without its first column; the Python chunk reads it
# through reticulate as `r.mca_df`.
mca_df <- df[-1]
##python chunk
import pandas
import prince

# Build a pandas DataFrame from the R data frame `mca_df`, exposed to Python
# by reticulate through the `r` object, and preview its first rows.
data = pandas.DataFrame(r.mca_df)
data.head()
## Gender DictionaryWord Decade ... Family Great Beautiful
## 0 Masculine he 2010s ... FALSE FALSE FALSE
## 1 Masculine he 2010s ... FALSE FALSE FALSE
## 2 Masculine he 2010s ... FALSE FALSE FALSE
## 3 Masculine he 2010s ... FALSE FALSE FALSE
## 4 Masculine he 2010s ... FALSE FALSE FALSE
##
## [5 rows x 9 columns]
# List the column names that will be passed to the Python MCA.
data.columns
## Index(['Gender', 'DictionaryWord', 'Decade', 'ArticleType', 'PronounType',
## 'Career', 'Family', 'Great', 'Beautiful'],
## dtype='object')
# Configure a two-component MCA with a fixed random seed, fit it on the
# full DataFrame, and show the inertia explained by each component.
mca = prince.MCA(
    n_components=2,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42,
)
mca = mca.fit(data)
mca.explained_inertia_
## [0.17769986688301184, 0.13539975137338306]
##python chunk
import matplotlib
# Select the non-interactive Agg backend so the figure is rendered off-screen.
matplotlib.use('Agg')

# Plot row and column coordinates of the fitted MCA. `ax` keeps the object
# returned by plot_coordinates (expected to be the matplotlib Axes) instead
# of the Legend returned by `.legend(...)`, which the chained original bound
# to `ax`.
ax = mca.plot_coordinates(
    X=data,
    ax=None,
    figsize=(10, 10),
    show_row_points=True,
    row_points_size=10,
    show_row_labels=False,
    show_column_points=True,
    column_points_size=30,
    show_column_labels=False,
    legend_n_cols=2,
)
# Assign the Legend to a name so its repr is not echoed into the chunk output.
legend = ax.legend(loc='upper right')
ax.get_figure()
Would you make different conclusions from the R output versus the Python output? What are some of the differences between the R and Python models?
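The Python model looks slightly better than the R model: its first two components explain about 17.8% and 13.5% of the inertia, compared with 13.3% and 11.9% for the first two dimensions of the first R MCA model. However, the two models were fit on different sets of columns, so these percentages are not directly comparable.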
What other variables might be missing in our data set that might lead to more distinct categories?
Possible missing variables include age and location.