\usepackage{float} \usepackage[utf8]{inputenc}

Algo de análisis factorial en Python

#!pip install factor_analyzer
import factor_analyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
import pandas as pd
# Load the US arrests dataset from a CSV file in the working directory and
# preview its first 10 rows.
data = pd.read_csv('usarrests.csv')
print(data.head(10))
    Unnamed: 0  Murder  Assault  UrbanPop  Rape
0      Alabama    13.2      236        58  21.2
1       Alaska    10.0      263        48  44.5
2      Arizona     8.1      294        80  31.0
3     Arkansas     8.8      190        50  19.5
4   California     9.0      276        91  40.6
5     Colorado     7.9      204        78  38.7
6  Connecticut     3.3      110        77  11.1
7     Delaware     5.9      238        72  15.8
8      Florida    15.4      335        80  31.9
9      Georgia    17.4      211        60  25.8
from factor_analyzer import FactorAnalyzer
# Keep only the four numeric variables; the 'Unnamed: 0' column (state names
# in the preview of `data`) is left out.
df = pd.DataFrame(data, columns=['Murder', 'Assault', 'UrbanPop', 'Rape'])
print(df.head(10))
   Murder  Assault  UrbanPop  Rape
0    13.2      236        58  21.2
1    10.0      263        48  44.5
2     8.1      294        80  31.0
3     8.8      190        50  19.5
4     9.0      276        91  40.6
5     7.9      204        78  38.7
6     3.3      110        77  11.1
7     5.9      238        72  15.8
8    15.4      335        80  31.9
9    17.4      211        60  25.8
# Pairwise correlation matrix of the four variables.
print(df.corr())
            Murder   Assault  UrbanPop      Rape
Murder    1.000000  0.801873  0.069573  0.563579
Assault   0.801873  1.000000  0.258872  0.665241
UrbanPop  0.069573  0.258872  1.000000  0.411341
Rape      0.563579  0.665241  0.411341  1.000000
from sklearn.preprocessing import StandardScaler
# Fit a standardizer on the four variables; it is applied later in the file
# to produce `scaled_data`.
scaler = StandardScaler()
scaler.fit(df)
StandardScaler()
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
# Bartlett's test of sphericity on the (unscaled) variables. The trailing
# tuple expression only displays the result in an interactive session.
chi_square_value,p_value=calculate_bartlett_sphericity(df)
chi_square_value, p_value
(88.28814686595526, 6.868423073358666e-17)
from factor_analyzer.factor_analyzer import calculate_kmo
# Kaiser-Meyer-Olkin (KMO) measure: `kmo_all` holds the per-variable values,
# `kmo_model` the overall score that is displayed.
kmo_all,kmo_model=calculate_kmo(df)
kmo_model
0.6538150062740201
# Apply the previously fitted scaler to every column; the result is a NumPy
# array with one row per observation and one column per variable.
scaled_data = scaler.transform(df)
scaled_data
array([[ 1.25517927,  0.79078716, -0.52619514, -0.00345116],
       [ 0.51301858,  1.11805959, -1.22406668,  2.50942392],
       [ 0.07236067,  1.49381682,  1.00912225,  1.05346626],
       [ 0.23470832,  0.23321191, -1.08449238, -0.18679398],
       [ 0.28109336,  1.2756352 ,  1.77678094,  2.08881393],
       [ 0.02597562,  0.40290872,  0.86954794,  1.88390137],
       [-1.04088037, -0.73648418,  0.79976079, -1.09272319],
       [-0.43787481,  0.81502956,  0.45082502, -0.58583422],
       [ 1.76541475,  1.99078607,  1.00912225,  1.1505301 ],
       [ 2.22926518,  0.48775713, -0.38662083,  0.49265293],
       [-0.57702994, -1.51224105,  1.21848371, -0.11129987],
       [-1.20322802, -0.61527217, -0.80534376, -0.75839217],
       [ 0.60578867,  0.94836277,  1.21848371,  0.29852525],
       [-0.13637203, -0.70012057, -0.03768506, -0.0250209 ],
       [-1.29599811, -1.39102904, -0.5959823 , -1.07115345],
       [-0.41468229, -0.67587817,  0.03210209, -0.34856705],
       [ 0.44344101, -0.74860538, -0.94491807, -0.53190987],
       [ 1.76541475,  0.94836277,  0.03210209,  0.10439756],
       [-1.31919063, -1.06375661, -1.01470522, -1.44862395],
       [ 0.81452136,  1.56654403,  0.10188925,  0.70835037],
       [-0.78576263, -0.26375734,  1.35805802, -0.53190987],
       [ 1.00006153,  1.02108998,  0.59039932,  1.49564599],
       [-1.1800355 , -1.19708982,  0.03210209, -0.68289807],
       [ 1.9277624 ,  1.06957478, -1.5032153 , -0.44563089],
       [ 0.28109336,  0.0877575 ,  0.31125071,  0.75148985],
       [-0.41468229, -0.74860538, -0.87513091, -0.521125  ],
       [-0.80895515, -0.83345379, -0.24704653, -0.51034012],
       [ 1.02325405,  0.98472638,  1.0789094 ,  2.671197  ],
       [-1.31919063, -1.37890783, -0.66576945, -1.26528114],
       [-0.08998698, -0.14254532,  1.63720664, -0.26228808],
       [ 0.83771388,  1.38472601,  0.31125071,  1.17209984],
       [ 0.76813632,  1.00896878,  1.42784517,  0.52500755],
       [ 1.20879423,  2.01502847, -1.43342815, -0.55347961],
       [-1.62069341, -1.52436225, -1.5032153 , -1.50254831],
       [-0.11317951, -0.61527217,  0.66018648,  0.01811858],
       [-0.27552716, -0.23951493,  0.1716764 , -0.13286962],
       [-0.66980002, -0.14254532,  0.10188925,  0.87012344],
       [-0.34510472, -0.78496898,  0.45082502, -0.68289807],
       [-1.01768785,  0.03927269,  1.49763233, -1.39469959],
       [ 1.53348953,  1.3119988 , -1.22406668,  0.13675217],
       [-0.92491776, -1.027393  , -1.43342815, -0.90938037],
       [ 1.25517927,  0.20896951, -0.45640799,  0.61128652],
       [ 1.13921666,  0.36654512,  1.00912225,  0.46029832],
       [-1.06407289, -0.61527217,  1.00912225,  0.17989166],
       [-1.29599811, -1.48799864, -2.34066115, -1.08193832],
       [ 0.16513075, -0.17890893, -0.17725937, -0.05737552],
       [-0.87853272, -0.31224214,  0.52061217,  0.53579242],
       [-0.48425985, -1.08799901, -1.85215107, -1.28685088],
       [-1.20322802, -1.42739264,  0.03210209, -1.1250778 ],
       [-0.22914211, -0.11830292, -0.38662083, -0.60740397]])
# Unrotated factor analysis with as many factors as observed variables (4).
# The options go straight to the constructor; fit() returns the estimator,
# which is why its repr is echoed in an interactive session.
fa = FactorAnalyzer(n_factors=4, rotation=None)
fa.fit(scaled_data)
FactorAnalyzer(n_factors=4, rotation=None, rotation_kwargs={})
# Returns a pair of arrays: the eigenvalues of the original correlation
# matrix and the eigenvalues of the common-factor solution.
fa.get_eigenvalues()
(array([2.48024158, 0.98976515, 0.35656318, 0.17343009]),
 array([ 2.28299696e+00,  5.37887219e-01,  6.17387175e-02, -2.20915632e-07]))
# Check eigenvalues: keep the first returned array as `ev`; `v` (the second
# array) is not used anywhere in the visible code.
ev, v = fa.get_eigenvalues()
ev
array([2.48024158, 0.98976515, 0.35656318, 0.17343009])
# Eigenvalues in ascending order, as a plain Python list.
sorted(ev)
[0.17343008772983554,
 0.35656318058083014,
 0.9897651525398421,
 2.480241579149494]
# Communality of each variable (one value per input column).
fa.get_communalities()
array([0.85762759, 0.88586043, 0.41517638, 0.72395827])
import matplotlib.pyplot as plt
# Scree plot: eigenvalue against factor number, drawn both as markers and as
# a connecting line. The x positions 1..k (k = number of variables) are
# computed once and shared by both calls.
factor_numbers = range(1, len(df.columns) + 1)
plt.scatter(factor_numbers, ev)
plt.plot(factor_numbers, ev)

# Titles and axis labels.
plt.title('Screeplot')
plt.xlabel('Factor')
plt.ylabel('Eigenvalores')

plt.grid()
plt.show()

import pandas as pd
# Loadings table: one row per observed variable, one column per factor.
# The column labels (F1, F2, ...) are generated from the actual number of
# factors in `fa.loadings_`, so the table still builds if n_factors changes.
factor_labels = ['F{}'.format(i) for i in range(1, fa.loadings_.shape[1] + 1)]
factores = pd.DataFrame(fa.loadings_, columns=factor_labels, index=df.columns)

print(factores)
                F1        F2        F3   F4
Murder    0.843704 -0.374741 -0.073213  0.0
Assault   0.919370 -0.103672  0.172831  0.0
UrbanPop  0.331679  0.548848  0.062696  0.0
Rape      0.784798  0.292359 -0.150257  0.0
# Correlation matrix stored on the fitted model; its values match the
# df.corr() output printed earlier (scaling does not change correlations).
fa.corr_
array([[1.        , 0.80187331, 0.06957262, 0.56357883],
       [0.80187331, 1.        , 0.2588717 , 0.66524123],
       [0.06957262, 0.2588717 , 1.        , 0.41134124],
       [0.56357883, 0.66524123, 0.41134124, 1.        ]])
# Number of factors the model was configured with.
fa.n_factors
4
# Tabulate the three arrays returned by get_factor_variance(): one labelled
# row per array, one column per factor.
print(pd.DataFrame(fa.get_factor_variance(),index=['Variance','Proportional Var','Cumulative Var']))
                         0         1         2         3
Variance          2.282997  0.537887  0.061739  0.000000
Proportional Var  0.570749  0.134472  0.015435  0.000000
Cumulative Var    0.570749  0.705221  0.720656  0.720656
# Same quantities as the table printed just before, shown as the raw tuple
# of three arrays.
fa.get_factor_variance()
(array([2.28299689, 0.53788712, 0.06173866, 0.        ]),
 array([0.57074922, 0.13447178, 0.01543467, 0.        ]),
 array([0.57074922, 0.705221  , 0.72065567, 0.72065567]))
# Raw loading matrix (rows: variables, columns: factors). The fourth column
# is all zeros in the displayed result.
fa.loadings_
array([[ 0.84370394, -0.37474146, -0.07321271,  0.        ],
       [ 0.91937036, -0.10367227,  0.17283121,  0.        ],
       [ 0.33167926,  0.54884825,  0.06269644,  0.        ],
       [ 0.78479779,  0.29235872, -0.15025674,  0.        ]])
# Re-fetch the eigenvalues (same call as earlier in the file; the displayed
# values are unchanged).
ev, v = fa.get_eigenvalues()
ev
array([2.48024158, 0.98976515, 0.35656318, 0.17343009])
# Manual check: the sum of squared loadings in the first row of fa.loadings_
# (Murder) reproduces that variable's communality (~0.8576).
(0.84370394)**2 + (-0.37474146)**2 + (-0.07321271)**2
0.8576276011199994
# Refit with a varimax rotation. n_factors is not passed, so the library
# default applies (the loadings displayed afterwards have three columns).
fa_varimax = FactorAnalyzer(rotation='varimax')
fa_varimax.fit(scaled_data)
FactorAnalyzer(rotation='varimax', rotation_kwargs={})
# Rotated loadings (rows: variables, columns: factors).
fa_varimax.loadings_
array([[ 0.91516486,  0.02762848,  0.13905951],
       [ 0.88036791,  0.32020407, -0.09100618],
       [ 0.05909459,  0.64142576, -0.0160376 ],
       [ 0.56292624,  0.59517677,  0.22986286]])
# Communalities of the rotated model; the displayed values equal those of
# the unrotated model `fa`.
fa_varimax.get_communalities()
array([0.85762759, 0.88586043, 0.41517638, 0.72395827])
# Communalities labelled by variable name. NOTE(review): this reads from the
# unrotated model `fa`, not `fa_varimax`; the displayed values are the same
# for both.
print(pd.DataFrame(fa.get_communalities(),index=df.columns,columns=['Communalities']))
          Communalities
Murder         0.857628
Assault        0.885860
UrbanPop       0.415176
Rape           0.723958
# Get the variance figures for each factor. NOTE(review): this is called on
# the unrotated model `fa`; confirm whether `fa_varimax` was intended here.
fa.get_factor_variance()
(array([2.28299689, 0.53788712, 0.06173866, 0.        ]),
 array([0.57074922, 0.13447178, 0.01543467, 0.        ]),
 array([0.57074922, 0.705221  , 0.72065567, 0.72065567]))
 import prince
pca = prince.PCA(
n_components=2,
n_iter=3,
rescale_with_mean=True,
rescale_with_std=True,
copy=True,
check_input=True,
engine='auto',
random_state=42
)
pca = pca.fit(df)
pca.transform(df)
0 1
0 0.985566 1.133392
1 1.950138 1.073213
2 1.763164 -0.745957
3 -0.141420 1.119797
4 2.523980 -1.542934
5 1.514563 -0.987555
6 -1.358647 -1.088928
7 0.047709 -0.325359
8 3.013042 0.039229
9 1.639283 1.278942
10 -0.912657 -1.570460
11 -1.639800 0.210973
12 1.378911 -0.681841
13 -0.505461 -0.151563
14 -2.253646 -0.104054
15 -0.796881 -0.270165
16 -0.750859 0.958440
17 1.564818 0.871055
18 -2.396829 0.376392
19 1.763369 0.427655
20 -0.486166 -1.474496
21 2.108441 -0.155397
22 -1.692682 -0.632261
23 0.996494 2.393796
24 0.696787 -0.263355
25 -1.185452 0.536874
26 -1.265637 -0.193954
27 2.874395 -0.775600
28 -2.383915 -0.018082
29 0.181566 -1.449506
30 1.980024 0.142849
31 1.682577 -0.823184
32 1.123379 2.228003
33 -2.992226 0.599119
34 -0.225965 -0.742238
35 -0.311783 -0.287854
36 0.059122 -0.541411
37 -0.888416 -0.571100
38 -0.863772 -1.491978
39 1.320724 1.933405
40 -1.987775 0.823343
41 0.999742 0.860251
42 1.355138 -0.412481
43 -0.550565 -1.471505
44 -2.801412 1.402288
45 -0.096335 0.199735
46 -0.216903 -0.970124
47 -2.108585 1.424847
48 -2.079714 -0.611269
49 -0.629427 0.321013
# Scatter the observations on the first two principal components.
# NOTE(review): a `color_labels=data['Unnamed: 0']` argument was commented
# out of this call; with no labels supplied, the legend warning shown in the
# output ("No artists with labels found ...") presumably comes from here.
ax = pca.plot_row_coordinates(
    df,
    # Which components go on which axis.
    x_component=0,
    y_component=1,
    # Figure / axes setup.
    ax=None,
    figsize=(6, 6),
    # Point and label rendering.
    show_points=True,
    labels=None,
    # Ellipse rendering.
    ellipse_fill=True,
    ellipse_outline=False,
)
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

# Retrieve the figure that owns the axes (only echoed when it is the last
# expression of an interactive cell), then display the plot.
ax.get_figure()
plt.show()