In [94]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import prince
In [2]:
# READ DATABASE
# Load the CSV extract, then show its shape, schema and summary statistics.
dfa = pd.read_csv("recs2015_public_v4_3C.csv")
print(dfa.shape, "\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() emits a stray "None" line.
dfa.info()
print()
print(dfa.describe(), "\n")
dfa.head()
(209, 759) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Columns: 759 entries, DOEID to ZLPAMOUNT
dtypes: float64(245), int64(510), object(4)
memory usage: 1.2+ MB
None 

              DOEID  REGIONC  DIVISION     TYPEHUQ  ZTYPEHUQ      CELLAR  \
count    209.000000    209.0     209.0  209.000000     209.0  209.000000   
mean   12978.894737      4.0      10.0    2.578947       0.0   -0.430622   
std     1565.479955      0.0       0.0    1.226603       0.0    1.067987   
min    10021.000000      4.0      10.0    1.000000       0.0   -2.000000   
25%    11744.000000      4.0      10.0    2.000000       0.0   -2.000000   
50%    12998.000000      4.0      10.0    2.000000       0.0    0.000000   
75%    14362.000000      4.0      10.0    3.000000       0.0    0.000000   
max    15660.000000      4.0      10.0    5.000000       0.0    1.000000   

          ZCELLAR     BASEFIN    ZBASEFIN       ATTIC  ...   ZELAMOUNT  \
count  209.000000  209.000000  209.000000  209.000000  ...  209.000000   
mean    -0.574163   -1.636364   -1.684211   -0.296651  ...    0.138756   
std      0.922738    0.878054    0.750506    1.171799  ...    0.346522   
min     -2.000000   -2.000000   -2.000000   -2.000000  ...    0.000000   
25%     -2.000000   -2.000000   -2.000000   -2.000000  ...    0.000000   
50%      0.000000   -2.000000   -2.000000    0.000000  ...    0.000000   
75%      0.000000   -2.000000   -2.000000    1.000000  ...    0.000000   
max      1.000000    1.000000    1.000000    1.000000  ...    1.000000   

           NGXBTU    PERIODNG   ZNGAMOUNT        FOXBTU  PERIODFO  ZFOAMOUNT  \
count  183.000000  209.000000  209.000000  2.090000e+02     209.0      209.0   
mean   102.824426    1.287081   -0.086124  1.374500e+02      -2.0       -2.0   
std      0.539991    1.917499    0.809943  4.843291e-13       0.0        0.0   
min     98.170000   -2.000000   -2.000000  1.374500e+02      -2.0       -2.0   
25%    102.770000    1.000000    0.000000  1.374500e+02      -2.0       -2.0   
50%    102.960000    1.000000    0.000000  1.374500e+02      -2.0       -2.0   
75%    103.120000    1.000000    0.000000  1.374500e+02      -2.0       -2.0   
max    103.380000    5.000000    1.000000  1.374500e+02      -2.0       -2.0   

             LPXBTU    PERIODLP   ZLPAMOUNT  
count  2.090000e+02  209.000000  209.000000  
mean   9.133000e+01   -1.684211   -1.832536  
std    1.994296e-13    1.242589    0.616826  
min    9.133000e+01   -2.000000   -2.000000  
25%    9.133000e+01   -2.000000   -2.000000  
50%    9.133000e+01   -2.000000   -2.000000  
75%    9.133000e+01   -2.000000   -2.000000  
max    9.133000e+01    5.000000    1.000000  

[8 rows x 755 columns] 

Out[2]:
DOEID REGIONC DIVISION METROMICRO UATYP10 TYPEHUQ ZTYPEHUQ CELLAR ZCELLAR BASEFIN ... ZELAMOUNT NGXBTU PERIODNG ZNGAMOUNT FOXBTU PERIODFO ZFOAMOUNT LPXBTU PERIODLP ZLPAMOUNT
0 10021 4 10 METRO U 2 0 1 0 0 ... 0 102.88 1 0 137.45 -2 -2 91.33 -2 -2
1 10045 4 10 METRO U 2 0 0 0 -2 ... 0 101.65 1 0 137.45 -2 -2 91.33 -2 -2
2 10086 4 10 METRO U 2 0 0 0 -2 ... 0 103.31 1 0 137.45 -2 -2 91.33 -2 -2
3 10125 4 10 METRO U 2 0 0 0 -2 ... 1 103.12 1 0 137.45 -2 -2 91.33 -2 -2
4 10126 4 10 METRO U 2 0 1 0 0 ... 0 101.87 1 0 137.45 -2 -2 91.33 -2 -2

5 rows × 759 columns

In [3]:
# Frequency of each TYPEHUQ code, obtained two ways:
# 1) group sizes (ordered by code) ...
print(dfa.groupby('TYPEHUQ').size(), "\n")
# 2) ... and value_counts (ordered by frequency), the more direct route.
dfa.TYPEHUQ.value_counts()
TYPEHUQ
1     17
2    133
3     15
4      9
5     35
dtype: int64 

Out[3]:
2    133
5     35
1     17
3     15
4      9
Name: TYPEHUQ, dtype: int64
In [4]:
# Trim leading/trailing whitespace in every object-dtype (string) column;
# columns of any other dtype are passed through untouched.
dfa = dfa.apply(
    lambda column: column.str.strip() if column.dtype == 'object' else column
)
dfa.TYPEHUQ.value_counts()
Out[4]:
2    133
5     35
1     17
3     15
4      9
Name: TYPEHUQ, dtype: int64
In [5]:
# Fraction of missing values in each column (computed once and reused below).
na_share = dfa.isna().mean()
print(na_share)

# Chart only the columns that actually contain missing values: one bar for every
# column of this wide frame is unreadable. Skip the chart when nothing is missing.
cols_with_na = na_share[na_share > 0]
if not cols_with_na.empty:
    cols_with_na.sort_values().plot(kind='barh')

# Keep the columns whose missing share is strictly below 30%. This cut-off is
# arbitrary, choose carefully.
# NOTE(review): in the recorded run the frame has 759 columns both before and
# after this step, i.e. no column reached the cut-off.
dfa = dfa.loc[:, na_share < .3]
DOEID         0.0
REGIONC       0.0
DIVISION      0.0
METROMICRO    0.0
UATYP10       0.0
             ... 
PERIODFO      0.0
ZFOAMOUNT     0.0
LPXBTU        0.0
PERIODLP      0.0
ZLPAMOUNT     0.0
Length: 759, dtype: float64
In [6]:
# Turn the literal string 'NA' into NaN, then keep only the rows that have
# no missing value in any column.
dfa = dfa.replace('NA', np.nan).dropna()
dfa.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 208
Columns: 759 entries, DOEID to ZLPAMOUNT
dtypes: float64(245), int64(510), object(4)
memory usage: 1.1+ MB
In [7]:
# Report how many rows are exact duplicates of an earlier row, then drop them.
duplicate_mask = dfa.duplicated()
print(duplicate_mask.sum())
dfa = dfa.drop_duplicates()
dfa.info()
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 208
Columns: 759 entries, DOEID to ZLPAMOUNT
dtypes: float64(245), int64(510), object(4)
memory usage: 1.1+ MB
In [11]:
# Task 4 - find and visualize correlations
# Pairwise plots for the candidate variables. No `hue` is set, so no `palette`
# is passed either: a palette only colours hue levels, and 'RegionC' is a column
# name rather than a palette name.
sns.pairplot(
    dfa,
    vars=['TYPEHUQ', 'STORIES', 'KOWNRENT', 'YEARMADERANGE', 'BEDROOMS',
          'NHSLDMEM', 'NUMADULT', 'NUMCHILD', 'ATHOME',
          'SMARTTHERM', 'SMARTMETER', 'TOTSQFT_EN', 'KWH',
          'CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65'],
)
Out[11]:
<seaborn.axisgrid.PairGrid at 0x7fce21e62c50>
In [80]:
# Task 4.1 - find and visualize correlations
# Low-correlation variable set (the original note also lists 'MONEYPY', 'EDUCATION').
numeric_cols = ['TYPEHUQ', 'YEARMADERANGE', 'NHSLDMEM', 'TOTSQFT_EN', 'KWH',
                'SMARTTHERM', 'SMARTMETER']
# Alternative set with MORE CORRELATION:
#numeric_cols = ['TOTSQFT_EN','KWH','CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65']
dfa_num = dfa[numeric_cols]

# NOTE(review): SMARTTHERM carries negative survey codes (-9 "Don't know",
# -2 "Not applicable" per the code list later in this notebook) and SMARTMETER
# also has a minimum of -9; they enter the correlations below as ordinary
# numbers. Confirm that this is intended.
dfa_corr = dfa_num.corr()

# Correlation matrix, its column means (signed and absolute), and the summary
# statistics of the selected variables.
for report in (dfa_corr, dfa_corr.mean(), dfa_corr.abs().mean(), dfa_num.describe()):
    print(report, "\n")

dfa_corr.info()
dfa_num.info()

# Number of distinct values per selected column.
for col in dfa_num.columns:
    print(f"{col} {dfa_num[col].nunique()}")
                TYPEHUQ  YEARMADERANGE  NHSLDMEM  TOTSQFT_EN       KWH  \
TYPEHUQ        1.000000       0.062120 -0.087615   -0.423996 -0.400076   
YEARMADERANGE  0.062120       1.000000  0.064196    0.203962  0.098226   
NHSLDMEM      -0.087615       0.064196  1.000000    0.106434  0.279585   
TOTSQFT_EN    -0.423996       0.203962  0.106434    1.000000  0.439078   
KWH           -0.400076       0.098226  0.279585    0.439078  1.000000   
SMARTTHERM    -0.168430      -0.016859  0.029209    0.064825 -0.044998   
SMARTMETER    -0.226400      -0.115799  0.068627    0.147296  0.108584   

               SMARTTHERM  SMARTMETER  
TYPEHUQ         -0.168430   -0.226400  
YEARMADERANGE   -0.016859   -0.115799  
NHSLDMEM         0.029209    0.068627  
TOTSQFT_EN       0.064825    0.147296  
KWH             -0.044998    0.108584  
SMARTTHERM       1.000000    0.171654  
SMARTMETER       0.171654    1.000000   

TYPEHUQ         -0.034914
YEARMADERANGE    0.185121
NHSLDMEM         0.208634
TOTSQFT_EN       0.219657
KWH              0.211485
SMARTTHERM       0.147914
SMARTMETER       0.164852
dtype: float64 

TYPEHUQ          0.338377
YEARMADERANGE    0.223023
NHSLDMEM         0.233667
TOTSQFT_EN       0.340799
KWH              0.338650
SMARTTHERM       0.213711
SMARTMETER       0.262623
dtype: float64 

          TYPEHUQ  YEARMADERANGE    NHSLDMEM   TOTSQFT_EN           KWH  \
count  183.000000     183.000000  183.000000   183.000000    183.000000   
mean     2.568306       3.387978    2.989071  1811.136612   5658.203891   
std      1.197287       1.968786    1.511818  1054.321639   3197.166901   
min      1.000000       1.000000    1.000000   462.000000   1172.333000   
25%      2.000000       2.000000    2.000000  1040.500000   3310.382000   
50%      2.000000       3.000000    3.000000  1585.000000   5130.085000   
75%      3.000000       5.000000    4.000000  2219.500000   6837.900000   
max      5.000000       8.000000   10.000000  5480.000000  22633.000000   

       SMARTTHERM  SMARTMETER  
count  183.000000  183.000000  
mean    -0.683060   -1.213115  
std      2.069689    3.949328  
min     -9.000000   -9.000000  
25%      0.000000    0.000000  
50%      0.000000    1.000000  
75%      0.000000    1.000000  
max      1.000000    1.000000   

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, TYPEHUQ to SMARTMETER
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TYPEHUQ        7 non-null      float64
 1   YEARMADERANGE  7 non-null      float64
 2   NHSLDMEM       7 non-null      float64
 3   TOTSQFT_EN     7 non-null      float64
 4   KWH            7 non-null      float64
 5   SMARTTHERM     7 non-null      float64
 6   SMARTMETER     7 non-null      float64
dtypes: float64(7)
memory usage: 448.0+ bytes
<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 208
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TYPEHUQ        183 non-null    int64  
 1   YEARMADERANGE  183 non-null    int64  
 2   NHSLDMEM       183 non-null    int64  
 3   TOTSQFT_EN     183 non-null    int64  
 4   KWH            183 non-null    float64
 5   SMARTTHERM     183 non-null    int64  
 6   SMARTMETER     183 non-null    int64  
dtypes: float64(1), int64(6)
memory usage: 16.4 KB
TYPEHUQ 5
YEARMADERANGE 8
NHSLDMEM 9
TOTSQFT_EN 169
KWH 183
SMARTTHERM 4
SMARTMETER 3
In [87]:
# pairplot - only numeric columns
numeric_cols = dfa_num.select_dtypes(include=np.number).columns.tolist()

# Pairplot of the first 7 numeric columns. No `hue` is set, so no `palette` is
# passed either: a palette only colours hue levels, and 'TOTSQFT_EN' is a column
# name rather than a palette name.
sns.pairplot(dfa_num, vars=numeric_cols[:7])
Out[87]:
<seaborn.axisgrid.PairGrid at 0x7fcdffa99a10>
In [13]:
# Task 4.2 - Patterns, outliers, correlations
# Starting from this task? Run every earlier cell first via
# Kernel > Restart and Run All.
# Scatter of TOTSQFT_EN (y) against KWH (x), drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=dfa, x="KWH", y="TOTSQFT_EN", ax=ax)
plt.xticks(rotation=0)
Out[13]:
(array([    0.,  5000., 10000., 15000., 20000., 25000.]),
 <a list of 6 Text xticklabel objects>)
In [75]:
# Task 4.2 - Patterns, outliers, correlations
# Starting from this task? Run every earlier cell first via
# Kernel > Restart and Run All.
# Scatter of KWH (y) against the EDUCATION code (x), drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=dfa, x="EDUCATION", y="KWH", ax=ax)
plt.xticks(rotation=0)
Out[75]:
(array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5]),
 <a list of 11 Text xticklabel objects>)
In [76]:
# countplot: row counts per HOUSEHOLDER_RACE code, split by NHSLDMEM.
#
# HOUSEHOLDER_RACE codes:
#   1 = "White alone"
#   2 = "Black or African/American Alone"
#   3 = "American Indian or Alaska Native Alone"
#   4 = "Asian Alone"
#   5 = "Native Hawaiian or Other Pacific Islander Alone"
#   6 = "Some other race alone"
#   7 = "2 or more Races selected"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfa, x="HOUSEHOLDER_RACE", hue="NHSLDMEM", ax=ax)
plt.xticks(rotation=45)
Out[76]:
(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)
In [ ]:
# countplot: row counts per HOUSEHOLDER_RACE code, split by NHSLDMEM.
# NOTE(review): this unexecuted cell has the same code as the cell directly
# before it; consider deleting one of the two.
#
# HOUSEHOLDER_RACE codes:
#   1 = "White alone"
#   2 = "Black or African/American Alone"
#   3 = "American Indian or Alaska Native Alone"
#   4 = "Asian Alone"
#   5 = "Native Hawaiian or Other Pacific Islander Alone"
#   6 = "Some other race alone"
#   7 = "2 or more Races selected"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfa, x="HOUSEHOLDER_RACE", hue="NHSLDMEM", ax=ax)
plt.xticks(rotation=45)
In [78]:
# countplot: row counts per MONEYPY code, split by EDUCATION code.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="MONEYPY", hue="EDUCATION", data=dfa, ax=ax)
plt.xticks(rotation=45)

# Code lists for the two variables. Every line here must stay a comment: a bare
# "EDUCATION:" line is a SyntaxError and stops the cell from running.
#
# MONEYPY:
#   1 = "Less than $20,000"
#   2 = "$20,000 - $39,999"
#   3 = "$40,000 - $59,999"
#   4 = "$60,000 to $79,999"
#   5 = "$80,000 to $99,999"
#   6 = "$100,000 to $119,999"
#   7 = "$120,000 to $139,999"
#   8 = "$140,000 or more"
# EDUCATION:
#   1 = "Less than high school diploma or GED"
#   2 = "High school diploma or GED"
#   3 = "Some college or Associate's degree"
#   4 = "Bachelor's degree (for example BA, BS)"
#   5 = "Master's, Professional or Doctorate degree (for example MA, MS, MBA, MD, JD, PhD)"
Out[78]:
(array([0, 1, 2, 3, 4, 5, 6, 7]), <a list of 8 Text xticklabel objects>)
In [83]:
# countplot: row counts per ATHOME value, split by SMARTTHERM code.
#
# ATHOME = Number of weekdays someone is at home
# SMARTTHERM:
#   -9 = "Don't know"
#   -2 = "Not applicable"
#    0 = "No"
#    1 = "Yes"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfa, x="ATHOME", hue="SMARTTHERM", ax=ax)
plt.xticks(rotation=45)
Out[83]:
(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)
In [93]:
# Scatter of ATHOME (y) against the SMARTTHERM code (x).
#
# ATHOME = Number of weekdays someone is at home
# SMARTTHERM:
#   -9 = "Don't know"
#   -2 = "Not applicable"
#    0 = "No"
#    1 = "Yes"
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=dfa, x="SMARTTHERM", y="ATHOME", ax=ax)
plt.xticks(rotation=0)
Out[93]:
(array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5]),
 <a list of 11 Text xticklabel objects>)
In [89]:
# boxplot: distribution of KWH for each NHSLDMEM value.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=dfa, x='NHSLDMEM', y='KWH', ax=ax)
plt.xticks(rotation=45)
Out[89]:
(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), <a list of 9 Text xticklabel objects>)
In [90]:
# boxplot: distribution of KWH for each YEARMADERANGE code.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=dfa, x='YEARMADERANGE', y='KWH', ax=ax)
plt.xticks(rotation=45)
Out[90]:
(array([0, 1, 2, 3, 4, 5, 6, 7]), <a list of 8 Text xticklabel objects>)
In [91]:
# boxplot: distribution of KWH for each TYPEHUQ code.
#
# TYPEHUQ:
#   1 = "Mobile home"
#   2 = "Single-family detached house"
#   3 = "Single-family attached house"
#   4 = "Apartment in a building with 2 to 4 units"
#   5 = "Apartment in a building with 5 or more units"
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=dfa, x='TYPEHUQ', y='KWH', ax=ax)
plt.xticks(rotation=45)
Out[91]:
(array([0, 1, 2, 3, 4]), <a list of 5 Text xticklabel objects>)
In [ ]:
 
In [43]:
# Task 5 - cluster your data to identify similar groups
# If you are starting from this task, you can run cells from all previous tasks in
# the kernel by going to Kernel > Restart and Run All
print(dfa_num.columns.tolist())

# Standardise the selected columns before clustering.
Xa = StandardScaler().fit_transform(dfa_num)

# init='random' starts from randomly chosen centres, so the seed is pinned (and
# the number of restarts made explicit) to give the same clusters on every run.
kmeans = KMeans(n_clusters=5, init='random', n_init=10, random_state=42)

# Fit the model and label every row in a single step.
pred = kmeans.fit_predict(Xa)
np.unique(pred)
['TYPEHUQ', 'YEARMADERANGE', 'NHSLDMEM', 'TOTSQFT_EN', 'KWH', 'SMARTTHERM', 'SMARTMETER']
Out[43]:
array([0, 1, 2, 3, 4], dtype=int32)
In [44]:
# Show the clusters in the plane of two of the standardised features, with the
# cluster centres overlaid in grey.
fig, ax = plt.subplots(figsize=(12, 8))

# Positions (within dfa_num / Xa) of the two features drawn on the axes; the
# axis labels below are derived from them so they cannot drift out of sync.
x_idx, y_idx = 1, 2

ax.scatter(Xa[:, x_idx], Xa[:, y_idx], c=pred, cmap='viridis')
centers = kmeans.cluster_centers_
ax.scatter(centers[:, x_idx], centers[:, y_idx], c='grey', s=50)

ax.set_xlabel(f"{dfa_num.columns[x_idx]} (standardised)")
ax.set_ylabel(f"{dfa_num.columns[y_idx]} (standardised)")
ax.set_title("KMeans clusters (grey = cluster centres)");
Out[44]:
<matplotlib.collections.PathCollection at 0x7fcdfd2e2910>
In [45]:
# Task 6 - PCA for dimensionality reduction
# Starting from this task? Run every earlier cell first via
# Kernel > Restart and Run All.
# Fit a PCA on the standardised matrix (n_components given as the fraction 0.95),
# project the rows onto the retained components, and print each component's
# share of the variance.
pca = PCA(n_components=0.95).fit(Xa)
pcad = pca.transform(Xa)
print(pca.explained_variance_ratio_)
[0.2914217  0.18036113 0.14077383 0.13535373 0.11064364 0.07090507
 0.0705409 ]
In [86]:
# Rows projected onto the first two PCA components.
fig, ax = plt.subplots(figsize=(12, 8))
# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments. The axes created above is passed explicitly.
sns.scatterplot(x=pcad[:, 0], y=pcad[:, 1], ax=ax)
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcdff8cc6d0>
In [47]:
# Same PCA with the prince library, on the selected columns of dfa_num.
# The number of components follows the number of input columns instead of a
# hard-coded 759, which exceeded the columns available in dfa_num.
pca2 = prince.PCA(n_components=dfa_num.shape[1], n_iter=3, rescale_with_mean=True,
    rescale_with_std=True, copy=True, engine='auto')
pca2 = pca2.fit(dfa_num)
pca2.explained_inertia_
Out[47]:
[0.2914216954661282,
 0.1803611250031431,
 0.1407738331701052,
 0.13535373054132957,
 0.11064364390921308,
 0.0709050699151458,
 0.07054090199493465]
In [48]:
# Row coordinates on components 0 and 1, coloured by TYPEHUQ.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['TYPEHUQ'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [49]:
# Row coordinates on components 0 and 1, coloured by STORIES.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['STORIES'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [50]:
# Row coordinates on components 0 and 1, coloured by YEARMADERANGE.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['YEARMADERANGE'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [51]:
# Row coordinates on components 0 and 1, coloured by IECC_CLIMATE_PUB.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['IECC_CLIMATE_PUB'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [52]:
# Row coordinates on components 0 and 1, coloured by SMARTMETER.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['SMARTMETER'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [53]:
# Row coordinates on components 0 and 1, coloured by ATHOME.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['ATHOME'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [54]:
# Row coordinates on components 0 and 1, coloured by THERMAIN.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['THERMAIN'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [55]:
# Row coordinates on components 0 and 1, coloured by SMARTTHERM.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['SMARTTHERM'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [56]:
# Row coordinates on components 0 and 1, coloured by YEARMADERANGE.
# NOTE(review): an earlier cell in this notebook already draws this exact plot;
# consider keeping only one of them.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['YEARMADERANGE'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [59]:
# Columns in dfa_num, for reference:
# ['TYPEHUQ','YEARMADERANGE','NHSLDMEM','TOTSQFT_EN','KWH','SMARTTHERM','SMARTMETER']
#
# Row coordinates on components 0 and 1, coloured by SMARTTHERM.
# NOTE(review): an earlier cell in this notebook already draws this exact plot;
# consider keeping only one of them.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['SMARTTHERM'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [60]:
# Scratch list of variable names (the same names as in the Task 4 pairplot):
# 'TYPEHUQ','STORIES','KOWNRENT','YEARMADERANGE','BEDROOMS','NHSLDMEM','NUMADULT', 'NUMCHILD', 'ATHOME',
# 'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65'
#
# Row coordinates on components 0 and 1, coloured by BEDROOMS.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['BEDROOMS'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [61]:
# Scratch list of variable names (a subset of those in the Task 4 pairplot):
# 'NUMADULT', 'NUMCHILD', 'ATHOME',
# 'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65'
#
# Row coordinates on components 0 and 1, coloured by KOWNRENT.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['KOWNRENT'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [70]:
# Scratch list of variable names (a subset of those in the Task 4 pairplot):
# 'NUMADULT', 'NUMCHILD', 'ATHOME',
# 'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65'
#
# Row coordinates on components 0 and 1, coloured by DRYER.
ax = pca2.plot_row_coordinates(
    dfa_num,
    ax=None,
    figsize=(12, 8),
    x_component=0,
    y_component=1,
    labels=None,
    color_labels=dfa['DRYER'],
    ellipse_outline=False,
    ellipse_fill=True,
    show_points=True,
)
In [ ]: