#!pip install factor_analyzer
Algo de análisis factorial en Python
import factor_analyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
import pandas as pd
# Load the arrests dataset from CSV and preview its first 10 rows.
data = pd.read_csv('usarrests.csv')
print(data.head(10))
Unnamed: 0 Murder Assault UrbanPop Rape
0 Alabama 13.2 236 58 21.2
1 Alaska 10.0 263 48 44.5
2 Arizona 8.1 294 80 31.0
3 Arkansas 8.8 190 50 19.5
4 California 9.0 276 91 40.6
5 Colorado 7.9 204 78 38.7
6 Connecticut 3.3 110 77 11.1
7 Delaware 5.9 238 72 15.8
8 Florida 15.4 335 80 31.9
9 Georgia 17.4 211 60 25.8
from factor_analyzer import FactorAnalyzer
# Keep only the four numeric variables; the state-name column is dropped.
df = pd.DataFrame(data, columns=['Murder', 'Assault', 'UrbanPop', 'Rape'])
print(df.head(10))
Murder Assault UrbanPop Rape
0 13.2 236 58 21.2
1 10.0 263 48 44.5
2 8.1 294 80 31.0
3 8.8 190 50 19.5
4 9.0 276 91 40.6
5 7.9 204 78 38.7
6 3.3 110 77 11.1
7 5.9 238 72 15.8
8 15.4 335 80 31.9
9 17.4 211 60 25.8
# Pairwise correlation matrix of the columns of df.
print(df.corr())
Murder Assault UrbanPop Rape
Murder 1.000000 0.801873 0.069573 0.563579
Assault 0.801873 1.000000 0.258872 0.665241
UrbanPop 0.069573 0.258872 1.000000 0.411341
Rape 0.563579 0.665241 0.411341 1.000000
from sklearn.preprocessing import StandardScaler
# Fit a standardizer on df; the transform itself is applied in a later cell.
scaler = StandardScaler()
scaler.fit(df)
StandardScaler()
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
# Bartlett's sphericity test on df: unpack the chi-square statistic and its
# p-value, then display both.
chi_square_value, p_value = calculate_bartlett_sphericity(df)
chi_square_value, p_value
(88.28814686595526, 6.868423073358666e-17)
from factor_analyzer.factor_analyzer import calculate_kmo
# KMO test on df: calculate_kmo returns two values; only the second
# (kmo_model) is displayed.
kmo_all, kmo_model = calculate_kmo(df)
kmo_model
0.6538150062740201
# Apply the previously fitted scaler to df and display the result.
scaled_data = scaler.transform(df)
scaled_data
array([[ 1.25517927, 0.79078716, -0.52619514, -0.00345116],
[ 0.51301858, 1.11805959, -1.22406668, 2.50942392],
[ 0.07236067, 1.49381682, 1.00912225, 1.05346626],
[ 0.23470832, 0.23321191, -1.08449238, -0.18679398],
[ 0.28109336, 1.2756352 , 1.77678094, 2.08881393],
[ 0.02597562, 0.40290872, 0.86954794, 1.88390137],
[-1.04088037, -0.73648418, 0.79976079, -1.09272319],
[-0.43787481, 0.81502956, 0.45082502, -0.58583422],
[ 1.76541475, 1.99078607, 1.00912225, 1.1505301 ],
[ 2.22926518, 0.48775713, -0.38662083, 0.49265293],
[-0.57702994, -1.51224105, 1.21848371, -0.11129987],
[-1.20322802, -0.61527217, -0.80534376, -0.75839217],
[ 0.60578867, 0.94836277, 1.21848371, 0.29852525],
[-0.13637203, -0.70012057, -0.03768506, -0.0250209 ],
[-1.29599811, -1.39102904, -0.5959823 , -1.07115345],
[-0.41468229, -0.67587817, 0.03210209, -0.34856705],
[ 0.44344101, -0.74860538, -0.94491807, -0.53190987],
[ 1.76541475, 0.94836277, 0.03210209, 0.10439756],
[-1.31919063, -1.06375661, -1.01470522, -1.44862395],
[ 0.81452136, 1.56654403, 0.10188925, 0.70835037],
[-0.78576263, -0.26375734, 1.35805802, -0.53190987],
[ 1.00006153, 1.02108998, 0.59039932, 1.49564599],
[-1.1800355 , -1.19708982, 0.03210209, -0.68289807],
[ 1.9277624 , 1.06957478, -1.5032153 , -0.44563089],
[ 0.28109336, 0.0877575 , 0.31125071, 0.75148985],
[-0.41468229, -0.74860538, -0.87513091, -0.521125 ],
[-0.80895515, -0.83345379, -0.24704653, -0.51034012],
[ 1.02325405, 0.98472638, 1.0789094 , 2.671197 ],
[-1.31919063, -1.37890783, -0.66576945, -1.26528114],
[-0.08998698, -0.14254532, 1.63720664, -0.26228808],
[ 0.83771388, 1.38472601, 0.31125071, 1.17209984],
[ 0.76813632, 1.00896878, 1.42784517, 0.52500755],
[ 1.20879423, 2.01502847, -1.43342815, -0.55347961],
[-1.62069341, -1.52436225, -1.5032153 , -1.50254831],
[-0.11317951, -0.61527217, 0.66018648, 0.01811858],
[-0.27552716, -0.23951493, 0.1716764 , -0.13286962],
[-0.66980002, -0.14254532, 0.10188925, 0.87012344],
[-0.34510472, -0.78496898, 0.45082502, -0.68289807],
[-1.01768785, 0.03927269, 1.49763233, -1.39469959],
[ 1.53348953, 1.3119988 , -1.22406668, 0.13675217],
[-0.92491776, -1.027393 , -1.43342815, -0.90938037],
[ 1.25517927, 0.20896951, -0.45640799, 0.61128652],
[ 1.13921666, 0.36654512, 1.00912225, 0.46029832],
[-1.06407289, -0.61527217, 1.00912225, 0.17989166],
[-1.29599811, -1.48799864, -2.34066115, -1.08193832],
[ 0.16513075, -0.17890893, -0.17725937, -0.05737552],
[-0.87853272, -0.31224214, 0.52061217, 0.53579242],
[-0.48425985, -1.08799901, -1.85215107, -1.28685088],
[-1.20322802, -1.42739264, 0.03210209, -1.1250778 ],
[-0.22914211, -0.11830292, -0.38662083, -0.60740397]])
# Un-rotated factor analysis on the standardized data, requesting four
# factors (the same number as the columns selected into df).
fa = FactorAnalyzer()
fa.set_params(n_factors=4, rotation=None)
fa.fit(scaled_data)
FactorAnalyzer(n_factors=4, rotation=None, rotation_kwargs={})
# Display the eigenvalue arrays reported by the fitted analyzer.
fa.get_eigenvalues()
(array([2.48024158, 0.98976515, 0.35656318, 0.17343009]),
array([ 2.28299696e+00, 5.37887219e-01, 6.17387175e-02, -2.20915632e-07]))
# Check eigenvalues: unpack the two arrays returned by get_eigenvalues()
# and display the first one (ev).
ev, v = fa.get_eigenvalues()
ev
array([2.48024158, 0.98976515, 0.35656318, 0.17343009])
# Eigenvalues as a list in ascending order.
sorted(ev)
[0.17343008772983554,
0.35656318058083014,
0.9897651525398421,
2.480241579149494]
# Communalities reported by the fitted analyzer `fa`.
fa.get_communalities()
array([0.85762759, 0.88586043, 0.41517638, 0.72395827])
import matplotlib.pyplot as plt
# Scree plot: eigenvalues in `ev` against factor number (1..number of
# columns in df), drawn as points joined by a line.
plt.scatter(range(1, df.shape[1] + 1), ev)
plt.plot(range(1, df.shape[1] + 1), ev)
plt.title('Screeplot')
plt.xlabel('Factor')
plt.ylabel('Eigenvalores')
plt.grid()
plt.show()
import pandas as pd
# Loadings as a labelled table: rows indexed by the columns of df,
# one column per factor (F1..F4).
factores = pd.DataFrame(fa.loadings_, columns=['F1', 'F2', 'F3', 'F4'], index=df.columns)
print(factores)
F1 F2 F3 F4
Murder 0.843704 -0.374741 -0.073213 0.0
Assault 0.919370 -0.103672 0.172831 0.0
UrbanPop 0.331679 0.548848 0.062696 0.0
Rape 0.784798 0.292359 -0.150257 0.0
# Correlation matrix stored on the fitted analyzer.
fa.corr_
array([[1. , 0.80187331, 0.06957262, 0.56357883],
[0.80187331, 1. , 0.2588717 , 0.66524123],
[0.06957262, 0.2588717 , 1. , 0.41134124],
[0.56357883, 0.66524123, 0.41134124, 1. ]])
# Number of factors the analyzer is configured with.
fa.n_factors
4
# Tabulate the three arrays from get_factor_variance(), one labelled row each.
print(pd.DataFrame(fa.get_factor_variance(),index=['Variance','Proportional Var','Cumulative Var']))
0 1 2 3
Variance 2.282997 0.537887 0.061739 0.000000
Proportional Var 0.570749 0.134472 0.015435 0.000000
Cumulative Var 0.570749 0.705221 0.720656 0.720656
# Same factor-variance information as the raw tuple of arrays.
fa.get_factor_variance()
(array([2.28299689, 0.53788712, 0.06173866, 0. ]),
array([0.57074922, 0.13447178, 0.01543467, 0. ]),
array([0.57074922, 0.705221 , 0.72065567, 0.72065567]))
# Loading matrix of the fitted analyzer `fa`.
fa.loadings_
array([[ 0.84370394, -0.37474146, -0.07321271, 0. ],
[ 0.91937036, -0.10367227, 0.17283121, 0. ],
[ 0.33167926, 0.54884825, 0.06269644, 0. ],
[ 0.78479779, 0.29235872, -0.15025674, 0. ]])
# Re-read the eigenvalue arrays from `fa` and display the first one.
ev, v = fa.get_eigenvalues()
ev
array([2.48024158, 0.98976515, 0.35656318, 0.17343009])
# Manual check: sum of the squared non-zero loadings in the first row of
# fa.loadings_, for comparison with the first communality value.
(0.84370394)**2 + (-0.37474146)**2 + (-0.07321271)**2
0.8576276011199994
# Second model: same standardized data, with varimax rotation. The number
# of factors is not passed, so the library default applies.
fa_varimax = FactorAnalyzer(rotation='varimax')
fa_varimax.fit(scaled_data)
FactorAnalyzer(rotation='varimax', rotation_kwargs={})
# Loading matrix of the varimax-rotated model.
fa_varimax.loadings_
array([[ 0.91516486, 0.02762848, 0.13905951],
[ 0.88036791, 0.32020407, -0.09100618],
[ 0.05909459, 0.64142576, -0.0160376 ],
[ 0.56292624, 0.59517677, 0.22986286]])
# Communalities reported by the varimax-rotated model.
fa_varimax.get_communalities()
array([0.85762759, 0.88586043, 0.41517638, 0.72395827])
# Communalities as a one-column table indexed by the columns of df.
# NOTE(review): this reads from `fa`, not `fa_varimax` — confirm that is intended.
print(pd.DataFrame(fa.get_communalities(),index=df.columns,columns=['Communalities']))
Communalities
Murder 0.857628
Assault 0.885860
UrbanPop 0.415176
Rape 0.723958
# Get the variance information for each factor of `fa`.
fa.get_factor_variance()
(array([2.28299689, 0.53788712, 0.06173866, 0. ]),
array([0.57074922, 0.13447178, 0.01543467, 0. ]),
array([0.57074922, 0.705221 , 0.72065567, 0.72065567]))
import prince
# Two-component PCA of df using prince. Centering and scaling are both
# enabled, and random_state is fixed.
pca = prince.PCA(
    n_components=2,
    n_iter=3,
    rescale_with_mean=True,
    rescale_with_std=True,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
)
pca = pca.fit(df)
# Display the result of transforming df with the fitted PCA.
pca.transform(df)
0 | 1 | |
---|---|---|
0 | 0.985566 | 1.133392 |
1 | 1.950138 | 1.073213 |
2 | 1.763164 | -0.745957 |
3 | -0.141420 | 1.119797 |
4 | 2.523980 | -1.542934 |
5 | 1.514563 | -0.987555 |
6 | -1.358647 | -1.088928 |
7 | 0.047709 | -0.325359 |
8 | 3.013042 | 0.039229 |
9 | 1.639283 | 1.278942 |
10 | -0.912657 | -1.570460 |
11 | -1.639800 | 0.210973 |
12 | 1.378911 | -0.681841 |
13 | -0.505461 | -0.151563 |
14 | -2.253646 | -0.104054 |
15 | -0.796881 | -0.270165 |
16 | -0.750859 | 0.958440 |
17 | 1.564818 | 0.871055 |
18 | -2.396829 | 0.376392 |
19 | 1.763369 | 0.427655 |
20 | -0.486166 | -1.474496 |
21 | 2.108441 | -0.155397 |
22 | -1.692682 | -0.632261 |
23 | 0.996494 | 2.393796 |
24 | 0.696787 | -0.263355 |
25 | -1.185452 | 0.536874 |
26 | -1.265637 | -0.193954 |
27 | 2.874395 | -0.775600 |
28 | -2.383915 | -0.018082 |
29 | 0.181566 | -1.449506 |
30 | 1.980024 | 0.142849 |
31 | 1.682577 | -0.823184 |
32 | 1.123379 | 2.228003 |
33 | -2.992226 | 0.599119 |
34 | -0.225965 | -0.742238 |
35 | -0.311783 | -0.287854 |
36 | 0.059122 | -0.541411 |
37 | -0.888416 | -0.571100 |
38 | -0.863772 | -1.491978 |
39 | 1.320724 | 1.933405 |
40 | -1.987775 | 0.823343 |
41 | 0.999742 | 0.860251 |
42 | 1.355138 | -0.412481 |
43 | -0.550565 | -1.471505 |
44 | -2.801412 | 1.402288 |
45 | -0.096335 | 0.199735 |
46 | -0.216903 | -0.970124 |
47 | -2.108585 | 1.424847 |
48 | -2.079714 | -0.611269 |
49 | -0.629427 | 0.321013 |
# Plot the rows of df on components 0 (x) and 1 (y) of the fitted PCA.
# No labels are passed; the colour-by-state option is left disabled.
ax = pca.plot_row_coordinates(
    df,
    ax=None,
    figsize=(6, 6),
    x_component=0,
    y_component=1,
    labels=None,
    #color_labels=data['Unnamed: 0'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True
)
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
# Fetch the figure that owns the axes, then show the plot.
ax.get_figure()
plt.show()