In [94]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import prince
In [2]:
# READ DATABASE
# Load the recs2015 CSV extract into `dfa` and take a first look at it:
# dimensions, column dtypes, summary statistics and the first rows.
dfa = pd.read_csv("recs2015_public_v4_3C.csv")
print(dfa.shape, "\n")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
dfa.info()
print()
print(dfa.describe(), "\n")
dfa.head()
(209, 759) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Columns: 759 entries, DOEID to ZLPAMOUNT
dtypes: float64(245), int64(510), object(4)
memory usage: 1.2+ MB
None 

              DOEID  REGIONC  DIVISION     TYPEHUQ  ZTYPEHUQ      CELLAR  \
count    209.000000    209.0     209.0  209.000000     209.0  209.000000   
mean   12978.894737      4.0      10.0    2.578947       0.0   -0.430622   
std     1565.479955      0.0       0.0    1.226603       0.0    1.067987   
min    10021.000000      4.0      10.0    1.000000       0.0   -2.000000   
25%    11744.000000      4.0      10.0    2.000000       0.0   -2.000000   
50%    12998.000000      4.0      10.0    2.000000       0.0    0.000000   
75%    14362.000000      4.0      10.0    3.000000       0.0    0.000000   
max    15660.000000      4.0      10.0    5.000000       0.0    1.000000   

          ZCELLAR     BASEFIN    ZBASEFIN       ATTIC  ...   ZELAMOUNT  \
count  209.000000  209.000000  209.000000  209.000000  ...  209.000000   
mean    -0.574163   -1.636364   -1.684211   -0.296651  ...    0.138756   
std      0.922738    0.878054    0.750506    1.171799  ...    0.346522   
min     -2.000000   -2.000000   -2.000000   -2.000000  ...    0.000000   
25%     -2.000000   -2.000000   -2.000000   -2.000000  ...    0.000000   
50%      0.000000   -2.000000   -2.000000    0.000000  ...    0.000000   
75%      0.000000   -2.000000   -2.000000    1.000000  ...    0.000000   
max      1.000000    1.000000    1.000000    1.000000  ...    1.000000   

           NGXBTU    PERIODNG   ZNGAMOUNT        FOXBTU  PERIODFO  ZFOAMOUNT  \
count  183.000000  209.000000  209.000000  2.090000e+02     209.0      209.0   
mean   102.824426    1.287081   -0.086124  1.374500e+02      -2.0       -2.0   
std      0.539991    1.917499    0.809943  4.843291e-13       0.0        0.0   
min     98.170000   -2.000000   -2.000000  1.374500e+02      -2.0       -2.0   
25%    102.770000    1.000000    0.000000  1.374500e+02      -2.0       -2.0   
50%    102.960000    1.000000    0.000000  1.374500e+02      -2.0       -2.0   
75%    103.120000    1.000000    0.000000  1.374500e+02      -2.0       -2.0   
max    103.380000    5.000000    1.000000  1.374500e+02      -2.0       -2.0   

             LPXBTU    PERIODLP   ZLPAMOUNT  
count  2.090000e+02  209.000000  209.000000  
mean   9.133000e+01   -1.684211   -1.832536  
std    1.994296e-13    1.242589    0.616826  
min    9.133000e+01   -2.000000   -2.000000  
25%    9.133000e+01   -2.000000   -2.000000  
50%    9.133000e+01   -2.000000   -2.000000  
75%    9.133000e+01   -2.000000   -2.000000  
max    9.133000e+01    5.000000    1.000000  

[8 rows x 755 columns] 

Out[2]:
DOEID REGIONC DIVISION METROMICRO UATYP10 TYPEHUQ ZTYPEHUQ CELLAR ZCELLAR BASEFIN ... ZELAMOUNT NGXBTU PERIODNG ZNGAMOUNT FOXBTU PERIODFO ZFOAMOUNT LPXBTU PERIODLP ZLPAMOUNT
0 10021 4 10 METRO U 2 0 1 0 0 ... 0 102.88 1 0 137.45 -2 -2 91.33 -2 -2
1 10045 4 10 METRO U 2 0 0 0 -2 ... 0 101.65 1 0 137.45 -2 -2 91.33 -2 -2
2 10086 4 10 METRO U 2 0 0 0 -2 ... 0 103.31 1 0 137.45 -2 -2 91.33 -2 -2
3 10125 4 10 METRO U 2 0 0 0 -2 ... 1 103.12 1 0 137.45 -2 -2 91.33 -2 -2
4 10126 4 10 METRO U 2 0 1 0 0 ... 0 101.87 1 0 137.45 -2 -2 91.33 -2 -2

5 rows × 759 columns

In [3]:
# Frequency of each TYPEHUQ code, shown two ways:
#  - a group-by size, listed in code order;
#  - value_counts(), listed from most to least frequent (the shorter idiom).
print(dfa.groupby('TYPEHUQ').size(), "\n")
dfa['TYPEHUQ'].value_counts()
TYPEHUQ
1     17
2    133
3     15
4      9
5     35
dtype: int64 

Out[3]:
2    133
5     35
1     17
3     15
4      9
Name: TYPEHUQ, dtype: int64
In [4]:
# Trim leading/trailing whitespace in every text (object-dtype) column;
# columns of any other dtype are passed through unchanged.
dfa = dfa.apply(lambda col: col if col.dtype != 'object' else col.str.strip())
# Re-check the TYPEHUQ frequencies after the clean-up.
dfa['TYPEHUQ'].value_counts()
Out[4]:
2    133
5     35
1     17
3     15
4      9
Name: TYPEHUQ, dtype: int64
In [5]:
# Share of missing values per column (0.0 = complete, 1.0 = entirely missing),
# printed and drawn as a horizontal bar chart.
na_share = dfa.isna().mean()
print(na_share)
na_share.plot(kind='barh')

# Keep only the columns whose missing share is below 30%.
# This cut-off is arbitrary, choose carefully.
dfa = dfa.loc[:, na_share < .3]
DOEID         0.0
REGIONC       0.0
DIVISION      0.0
METROMICRO    0.0
UATYP10       0.0
             ... 
PERIODFO      0.0
ZFOAMOUNT     0.0
LPXBTU        0.0
PERIODLP      0.0
ZLPAMOUNT     0.0
Length: 759, dtype: float64
In [6]:
# Turn literal 'NA' strings into real NaN values, then discard every row that
# still contains at least one missing value.
dfa = dfa.replace('NA', np.nan).dropna()
dfa.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 208
Columns: 759 entries, DOEID to ZLPAMOUNT
dtypes: float64(245), int64(510), object(4)
memory usage: 1.1+ MB
In [7]:
# Report how many rows are exact duplicates of an earlier row, then keep only
# the first occurrence of each.
print(dfa.duplicated().sum())
dfa = dfa.drop_duplicates(keep='first')
dfa.info()
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 208
Columns: 759 entries, DOEID to ZLPAMOUNT
dtypes: float64(245), int64(510), object(4)
memory usage: 1.1+ MB
In [11]:
# Task 4 - find and visualize correlations
# Pairwise plots for a hand-picked set of columns.
# `palette` only chooses colours for the levels of a `hue` variable; the former
# palette='RegionC' was not a palette name and, with no `hue` given, had no
# effect, so it is omitted.
sns.pairplot(
    dfa,
    vars=[
        'TYPEHUQ', 'STORIES', 'KOWNRENT', 'YEARMADERANGE', 'BEDROOMS',
        'NHSLDMEM', 'NUMADULT', 'NUMCHILD', 'ATHOME',
        'SMARTTHERM', 'SMARTMETER', 'TOTSQFT_EN', 'KWH',
        'CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65',
    ],
)
Out[11]:
<seaborn.axisgrid.PairGrid at 0x7fce21e62c50>
In [95]:
# Task 4.1 - find and visualize correlations
# Alternative column set that showed few correlations (also 'MONEYPY', 'EDUCATION'):
#   ['TYPEHUQ','YEARMADERANGE','NHSLDMEM','TOTSQFT_EN','KWH','SMARTTHERM','SMARTMETER']
# Column set used below (more correlation):
numeric_cols = ['TOTSQFT_EN', 'KWH', 'CDD30YR', 'CDD65', 'CDD80',
                'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65']
dfa_num = dfa[numeric_cols]
dfa_corr = dfa_num.corr()

# Correlation matrix, its signed and absolute column means, and the summary
# statistics of the selected columns - each followed by a blank line.
for report in (dfa_corr, dfa_corr.mean(), dfa_corr.abs().mean(), dfa_num.describe()):
    print(report, "\n")

dfa_corr.info()
dfa_num.info()

# Number of distinct values per selected column.
for col in dfa_num.columns:
    print(f"{col} {dfa_num[col].nunique()}")
            TOTSQFT_EN       KWH   CDD30YR     CDD65     CDD80   HDD30YR  \
TOTSQFT_EN    1.000000  0.439078 -0.009450 -0.018924  0.146573  0.033599   
KWH           0.439078  1.000000  0.026331  0.048174  0.067882  0.038801   
CDD30YR      -0.009450  0.026331  1.000000  0.920036 -0.001715 -0.781968   
CDD65        -0.018924  0.048174  0.920036  1.000000 -0.028997 -0.729947   
CDD80         0.146573  0.067882 -0.001715 -0.028997  1.000000 -0.032843   
HDD30YR       0.033599  0.038801 -0.781968 -0.729947 -0.032843  1.000000   
HDD65         0.003570  0.120317 -0.398221 -0.450321 -0.028528  0.821595   
HDD50        -0.046673  0.093151 -0.166447 -0.162598 -0.355521  0.646493   
GNDHDD65      0.008922 -0.024840  0.734877  0.635186  0.036618 -0.659633   

               HDD65     HDD50  GNDHDD65  
TOTSQFT_EN  0.003570 -0.046673  0.008922  
KWH         0.120317  0.093151 -0.024840  
CDD30YR    -0.398221 -0.166447  0.734877  
CDD65      -0.450321 -0.162598  0.635186  
CDD80      -0.028528 -0.355521  0.036618  
HDD30YR     0.821595  0.646493 -0.659633  
HDD65       1.000000  0.879847 -0.449919  
HDD50       0.879847  1.000000 -0.360940  
GNDHDD65   -0.449919 -0.360940  1.000000   

TOTSQFT_EN    0.172966
KWH           0.200988
CDD30YR       0.147049
CDD65         0.134734
CDD80         0.089274
HDD30YR       0.037344
HDD65         0.166482
HDD50         0.169701
GNDHDD65      0.102252
dtype: float64 

TOTSQFT_EN    0.189643
KWH           0.206508
CDD30YR       0.448783
CDD65         0.443798
CDD80         0.188742
HDD30YR       0.527209
HDD65         0.461369
HDD50         0.412408
GNDHDD65      0.434548
dtype: float64 

        TOTSQFT_EN           KWH     CDD30YR       CDD65       CDD80  \
count   183.000000    183.000000  183.000000  183.000000  183.000000   
mean   1811.136612   5658.203891  394.639344  502.699454   12.032787   
std    1054.321639   3197.166901  236.442593  175.903484   17.014829   
min     462.000000   1172.333000   86.000000  245.000000    0.000000   
25%    1040.500000   3310.382000  217.500000  364.000000    0.000000   
50%    1585.000000   5130.085000  294.000000  461.000000    2.000000   
75%    2219.500000   6837.900000  684.500000  669.000000   21.000000   
max    5480.000000  22633.000000  918.000000  917.000000   69.000000   

           HDD30YR        HDD65       HDD50     GNDHDD65  
count   183.000000   183.000000  183.000000   183.000000  
mean   2538.289617  2129.267760  152.907104  2128.244699  
std     504.415839   351.602625   84.570714   138.001538  
min    1741.000000  1837.000000   19.000000  1935.710000  
25%    2064.000000  1945.000000  101.500000  2001.760000  
50%    2529.000000  1983.000000  128.000000  2068.700000  
75%    2656.000000  2033.500000  170.000000  2278.075000  
max    4541.000000  2976.000000  372.000000  2325.030000   

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, TOTSQFT_EN to GNDHDD65
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TOTSQFT_EN  9 non-null      float64
 1   KWH         9 non-null      float64
 2   CDD30YR     9 non-null      float64
 3   CDD65       9 non-null      float64
 4   CDD80       9 non-null      float64
 5   HDD30YR     9 non-null      float64
 6   HDD65       9 non-null      float64
 7   HDD50       9 non-null      float64
 8   GNDHDD65    9 non-null      float64
dtypes: float64(9)
memory usage: 720.0+ bytes
<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 208
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TOTSQFT_EN  183 non-null    int64  
 1   KWH         183 non-null    float64
 2   CDD30YR     183 non-null    int64  
 3   CDD65       183 non-null    int64  
 4   CDD80       183 non-null    int64  
 5   HDD30YR     183 non-null    int64  
 6   HDD65       183 non-null    int64  
 7   HDD50       183 non-null    int64  
 8   GNDHDD65    183 non-null    float64
dtypes: float64(2), int64(7)
memory usage: 19.3 KB
TOTSQFT_EN 169
KWH 183
CDD30YR 135
CDD65 160
CDD80 45
HDD30YR 159
HDD65 135
HDD50 123
GNDHDD65 139
In [96]:
# pairplot - only numeric columns
numeric_cols = dfa_num.select_dtypes(include=np.number).columns.tolist()

# Pairplot of the first 7 numeric columns. No `hue` is set, so the points are
# not coloured by any variable; the former palette='TOTSQFT_EN' was a column
# name rather than a colour palette and had no effect, so it is omitted.
sns.pairplot(dfa_num, vars=numeric_cols[:7])
Out[96]:
<seaborn.axisgrid.PairGrid at 0x7fce03a4fd90>
In [97]:
# Task 4.2 - Patterns, outliers, correlations
# (When starting from this task, first run all previous cells:
#  Kernel > Restart and Run All.)
# Scatter of TOTSQFT_EN (y) against KWH (x), drawn on an explicit 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=dfa, x="KWH", y="TOTSQFT_EN", ax=ax)
plt.xticks(rotation=0)
Out[97]:
(array([    0.,  5000., 10000., 15000., 20000., 25000.]),
 <a list of 6 Text xticklabel objects>)
In [98]:
# Task 4.2 - Patterns, outliers, correlations
# (When starting from this task, first run all previous cells:
#  Kernel > Restart and Run All.)
# Scatter of KWH (y) against the EDUCATION code (x), drawn on an explicit 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=dfa, x="EDUCATION", y="KWH", ax=ax)
plt.xticks(rotation=0)
Out[98]:
(array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5]),
 <a list of 11 Text xticklabel objects>)
In [99]:
# Count plot: number of rows per HOUSEHOLDER_RACE code, split by NHSLDMEM.
# HOUSEHOLDER_RACE codes:
#   1 = "White alone"
#   2 = "Black or African/American Alone"
#   3 = "American Indian or Alaska Native Alone"
#   4 = "Asian Alone"
#   5 = "Native Hawaiian or Other Pacific Islander Alone"
#   6 = "Some other race alone"
#   7 = "2 or more Races selected"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfa, x="HOUSEHOLDER_RACE", hue="NHSLDMEM", ax=ax)
plt.xticks(rotation=45)
Out[99]:
(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)
In [102]:
# Count plot: number of rows per HOUSEHOLDER_RACE code, split by NHSLDMEM.
# NOTE(review): this cell draws the same HOUSEHOLDER_RACE / NHSLDMEM count plot
# as the preceding cell - confirm whether both are needed.
# HOUSEHOLDER_RACE codes:
#   1 = "White alone"
#   2 = "Black or African/American Alone"
#   3 = "American Indian or Alaska Native Alone"
#   4 = "Asian Alone"
#   5 = "Native Hawaiian or Other Pacific Islander Alone"
#   6 = "Some other race alone"
#   7 = "2 or more Races selected"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfa, x="HOUSEHOLDER_RACE", hue="NHSLDMEM", ax=ax)
plt.xticks(rotation=45)
Out[102]:
(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)
In [103]:
# Count plot: number of rows per MONEYPY code, split by EDUCATION code.
# MONEYPY codes:
#   1 = "Less than $20,000"
#   2 = "$20,000 - $39,999"
#   3 = "$40,000 - $59,999"
#   4 = "$60,000 to $79,999"
#   5 = "$80,000 to $99,999"
#   6 = "$100,000 to $119,999"
#   7 = "$120,000 to $139,999"
#   8 = "$140,000 or more"
# EDUCATION codes:
#   1 = "Less than high school diploma or GED"
#   2 = "High school diploma or GED"
#   3 = "Some college or Associate's degree"
#   4 = "Bachelor's degree (for example BA, BS)"
#   5 = "Master's, Professional or Doctorate degree (for example MA, MS, MBA, MD, JD, PhD)"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfa, x="MONEYPY", hue="EDUCATION", ax=ax)
plt.xticks(rotation=45)
Out[103]:
(array([0, 1, 2, 3, 4, 5, 6, 7]), <a list of 8 Text xticklabel objects>)
In [104]:
# Count plot: number of rows per ATHOME value, split by SMARTTHERM code.
# ATHOME = Number of weekdays someone is at home
# SMARTTHERM codes:
#   -9 = "Don't know"
#   -2 = "Not applicable"
#    0 = "No"
#    1 = "Yes"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfa, x="ATHOME", hue="SMARTTHERM", ax=ax)
plt.xticks(rotation=45)
Out[104]:
(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)
In [105]:
# Scatter of ATHOME (y) against the SMARTTHERM code (x).
# ATHOME = Number of weekdays someone is at home
# SMARTTHERM codes:
#   -9 = "Don't know"
#   -2 = "Not applicable"
#    0 = "No"
#    1 = "Yes"
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=dfa, x="SMARTTHERM", y="ATHOME", ax=ax)
plt.xticks(rotation=0)
Out[105]:
(array([-10.,  -8.,  -6.,  -4.,  -2.,   0.,   2.]),
 <a list of 7 Text xticklabel objects>)
In [106]:
# Box plot: distribution of KWH within each NHSLDMEM value.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=dfa, x='NHSLDMEM', y='KWH', ax=ax)
plt.xticks(rotation=45)
Out[106]:
(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), <a list of 9 Text xticklabel objects>)
In [107]:
# Box plot: distribution of KWH within each YEARMADERANGE code.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=dfa, x='YEARMADERANGE', y='KWH', ax=ax)
plt.xticks(rotation=45)
Out[107]:
(array([0, 1, 2, 3, 4, 5, 6, 7]), <a list of 8 Text xticklabel objects>)
In [108]:
# Box plot: distribution of KWH within each TYPEHUQ code.
# TYPEHUQ codes:
#   1 = "Mobile home"
#   2 = "Single-family detached house"
#   3 = "Single-family attached house"
#   4 = "Apartment in a building with 2 to 4 units"
#   5 = "Apartment in a building with 5 or more units"
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=dfa, x='TYPEHUQ', y='KWH', ax=ax)
plt.xticks(rotation=45)
Out[108]:
(array([0, 1, 2, 3, 4]), <a list of 5 Text xticklabel objects>)
In [ ]:
 
In [109]:
# Task 5 - cluster your data to identify similar groups
# If you are starting from this task, you can run cells from all previous tasks in
# the kernel by going to Kernel > Restart and Run All
print(dfa_num.columns.tolist())

# Standardise the selected columns before clustering so that no column
# dominates the distance computation purely because of its scale.
Xa = StandardScaler().fit_transform(dfa_num)

# With init='random' the starting centroids are drawn at random, so without a
# fixed seed the cluster labels (and every plot coloured by them) change on each
# run. random_state pins the result; n_init is spelled out so the number of
# restarts does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=5, init='random', n_init=10, random_state=42)  # initialization
kmeans.fit(Xa)  # actual execution
pred = kmeans.predict(Xa)
np.unique(pred)
['TOTSQFT_EN', 'KWH', 'CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65']
Out[109]:
array([0, 1, 2, 3, 4], dtype=int32)
In [110]:
# Standardised columns 1 and 2 of Xa (KWH and CDD30YR in dfa_num's column
# order), coloured by k-means label, with the cluster centres overlaid in grey.
fig, ax = plt.subplots(figsize=(12, 8))
centers = kmeans.cluster_centers_
ax.scatter(Xa[:, 1], Xa[:, 2], c=pred, cmap='viridis')
ax.scatter(centers[:, 1], centers[:, 2], c='grey', s=50)
Out[110]:
<matplotlib.collections.PathCollection at 0x7fcde3fa1b90>
In [111]:
# Task 6 - PCA for dimensionality reduction
# (When starting from this task, first run all previous cells:
#  Kernel > Restart and Run All.)
# A fractional n_components asks for as many leading components as are needed
# to explain at least that share (here 95%) of the variance.
pca = PCA(n_components=0.95).fit(Xa)
pcad = pca.transform(Xa)
print(pca.explained_variance_ratio_)
[0.44328177 0.17037288 0.16592559 0.0967905  0.06115848 0.04417003]
In [112]:
# Scatter of the first two principal-component scores from the sklearn PCA.
fig, ax = plt.subplots(figsize=(12, 8))
# Pass the coordinates by keyword: seaborn deprecated positional x/y arguments
# in 0.11 and rejects them from 0.12 on (the first positional slot is `data`).
sns.scatterplot(x=pcad[:, 0], y=pcad[:, 1], ax=ax)
Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcde3fd9d10>
In [113]:
# PCA with prince on the unscaled dfa_num; centring and scaling are delegated
# to prince via rescale_with_mean / rescale_with_std.
# Ask for one component per input column. The former n_components=759 was the
# column count of the full table, not of the 9-column dfa_num, and so requested
# far more components than the input can yield.
pca2 = prince.PCA(n_components=dfa_num.shape[1], n_iter=3, rescale_with_mean=True,
    rescale_with_std=True, copy=True, engine='auto')
pca2 = pca2.fit(dfa_num)
pca2.explained_inertia_
Out[113]:
[0.4432817729064176,
 0.17037288147667498,
 0.1659255947235978,
 0.09679050040117156,
 0.061158481687759225,
 0.04417002719324449,
 0.012991845373342333,
 0.003503050324278712,
 0.0018058459135133465]
In [115]:
# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by TYPEHUQ,
# with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['TYPEHUQ'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [116]:
# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by STORIES,
# with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['STORIES'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [117]:
# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by
# YEARMADERANGE, with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['YEARMADERANGE'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [118]:
# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by
# IECC_CLIMATE_PUB, with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['IECC_CLIMATE_PUB'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [119]:
# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by
# SMARTMETER, with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['SMARTMETER'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [120]:
# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by ATHOME,
# with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['ATHOME'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [121]:
# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by THERMAIN,
# with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['THERMAIN'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [122]:
# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by
# SMARTTHERM, with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['SMARTTHERM'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [123]:
# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by
# YEARMADERANGE, with a filled ellipse per colour group.
# NOTE(review): an earlier cell already draws this YEARMADERANGE plot - confirm
# whether both are needed.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['YEARMADERANGE'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [124]:
# Candidate colour columns noted while exploring:
#   'TYPEHUQ','YEARMADERANGE','NHSLDMEM','TOTSQFT_EN','KWH','SMARTTHERM','SMARTMETER'

# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by
# SMARTTHERM, with a filled ellipse per colour group.
# NOTE(review): an earlier cell already draws this SMARTTHERM plot - confirm
# whether both are needed.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['SMARTTHERM'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [125]:
# Candidate colour columns noted while exploring:
#   'TYPEHUQ','STORIES','KOWNRENT','YEARMADERANGE','BEDROOMS','NHSLDMEM','NUMADULT','NUMCHILD','ATHOME',
#   'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR','CDD65','CDD80','HDD30YR','HDD65','HDD50','GNDHDD65'

# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by BEDROOMS,
# with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['BEDROOMS'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [126]:
# Candidate colour columns noted while exploring:
#   'NUMADULT','NUMCHILD','ATHOME',
#   'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR','CDD65','CDD80','HDD30YR','HDD65','HDD50','GNDHDD65'

# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by KOWNRENT,
# with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['KOWNRENT'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)
In [127]:
# Candidate colour columns noted while exploring:
#   'NUMADULT','NUMCHILD','ATHOME',
#   'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR','CDD65','CDD80','HDD30YR','HDD65','HDD50','GNDHDD65'

# Rows of dfa_num on principal components 0 (x) and 1 (y), coloured by DRYER,
# with a filled ellipse per colour group.
ax = pca2.plot_row_coordinates(
    dfa_num,
    color_labels=dfa['DRYER'],
    x_component=0,
    y_component=1,
    figsize=(12, 8),
    ax=None,
    labels=None,
    show_points=True,
    ellipse_fill=True,
    ellipse_outline=False,
)