import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import prince

# READ DATABASE
# Load the survey extract and print a quick structural overview of it.
dfa = pd.read_csv("recs2015_public_v4_3C.csv")
print(dfa.shape, "\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line.
dfa.info()
print("\n")
print(dfa.describe(), "\n")
dfa.head()

(209, 759) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Columns: 759 entries, DOEID to ZLPAMOUNT
dtypes: float64(245), int64(510), object(4)
memory usage: 1.2+ MB
None 

              DOEID  REGIONC  DIVISION     TYPEHUQ  ZTYPEHUQ      CELLAR  \
count    209.000000    209.0     209.0  209.000000     209.0  209.000000   
mean   12978.894737      4.0      10.0    2.578947       0.0   -0.430622   
std     1565.479955      0.0       0.0    1.226603       0.0    1.067987   
min    10021.000000      4.0      10.0    1.000000       0.0   -2.000000   
25%    11744.000000      4.0      10.0    2.000000       0.0   -2.000000   
50%    12998.000000      4.0      10.0    2.000000       0.0    0.000000   
75%    14362.000000      4.0      10.0    3.000000       0.0    0.000000   
max    15660.000000      4.0      10.0    5.000000       0.0    1.000000   

          ZCELLAR     BASEFIN    ZBASEFIN       ATTIC  ...   ZELAMOUNT  \
count  209.000000  209.000000  209.000000  209.000000  ...  209.000000   
mean    -0.574163   -1.636364   -1.684211   -0.296651  ...    0.138756   
std      0.922738    0.878054    0.750506    1.171799  ...    0.346522   
min     -2.000000   -2.000000   -2.000000   -2.000000  ...    0.000000   
25%     -2.000000   -2.000000   -2.000000   -2.000000  ...    0.000000   
50%      0.000000   -2.000000   -2.000000    0.000000  ...    0.000000   
75%      0.000000   -2.000000   -2.000000    1.000000  ...    0.000000   
max      1.000000    1.000000    1.000000    1.000000  ...    1.000000   

           NGXBTU    PERIODNG   ZNGAMOUNT        FOXBTU  PERIODFO  ZFOAMOUNT  \
count  183.000000  209.000000  209.000000  2.090000e+02     209.0      209.0   
mean   102.824426    1.287081   -0.086124  1.374500e+02      -2.0       -2.0   
std      0.539991    1.917499    0.809943  4.843291e-13       0.0        0.0   
min     98.170000   -2.000000   -2.000000  1.374500e+02      -2.0       -2.0   
25%    102.770000    1.000000    0.000000  1.374500e+02      -2.0       -2.0   
50%    102.960000    1.000000    0.000000  1.374500e+02      -2.0       -2.0   
75%    103.120000    1.000000    0.000000  1.374500e+02      -2.0       -2.0   
max    103.380000    5.000000    1.000000  1.374500e+02      -2.0       -2.0   

             LPXBTU    PERIODLP   ZLPAMOUNT  
count  2.090000e+02  209.000000  209.000000  
mean   9.133000e+01   -1.684211   -1.832536  
std    1.994296e-13    1.242589    0.616826  
min    9.133000e+01   -2.000000   -2.000000  
25%    9.133000e+01   -2.000000   -2.000000  
50%    9.133000e+01   -2.000000   -2.000000  
75%    9.133000e+01   -2.000000   -2.000000  
max    9.133000e+01    5.000000    1.000000  

[8 rows x 755 columns]

# Frequency of each TYPEHUQ label, obtained two ways.
typehuq_group_sizes = dfa.groupby(['TYPEHUQ']).size()  # ordered by label
print(typehuq_group_sizes, "\n")
dfa['TYPEHUQ'].value_counts()  # ordered by count; the more direct way

TYPEHUQ
1     17
2    133
3     15
4      9
5     35
dtype: int64

2    133
5     35
1     17
3     15
4      9
Name: TYPEHUQ, dtype: int64

def _strip_text_column(column):
    """Return *column* with surrounding whitespace removed if it is object-dtype."""
    if column.dtype == 'object':
        return column.str.strip()
    return column


# Trim leading/trailing whitespace cell by cell; non-string columns pass through unchanged.
dfa = dfa.apply(_strip_text_column)
dfa['TYPEHUQ'].value_counts()

2    133
5     35
1     17
3     15
4      9
Name: TYPEHUQ, dtype: int64

# Share of missing values per column.
# Literal 'NA' strings are converted to NaN first so that they are counted as
# missing by the column filter below (otherwise only real NaNs are counted).
dfa = dfa.replace('NA', np.nan)
na_share = dfa.isna().mean()  # computed once, reused for print, plot and filter
print(na_share)
na_share.plot(kind='barh')

# Keep only columns whose missing share is below 30%.
# This cut-off is arbitrary, choose carefully.
dfa = dfa.loc[:, na_share < .3]

DOEID         0.0
REGIONC       0.0
DIVISION      0.0
METROMICRO    0.0
UATYP10       0.0
             ... 
PERIODFO      0.0
ZFOAMOUNT     0.0
LPXBTU        0.0
PERIODLP      0.0
ZLPAMOUNT     0.0
Length: 759, dtype: float64

# Turn literal 'NA' strings into NaN, then keep only rows without any missing value.
dfa = dfa.replace('NA', np.nan).dropna()
dfa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 208
Columns: 759 entries, DOEID to ZLPAMOUNT
dtypes: float64(245), int64(510), object(4)
memory usage: 1.1+ MB

# Report how many fully duplicated rows exist, then remove them.
n_duplicate_rows = dfa.duplicated().sum()
print(n_duplicate_rows)
dfa = dfa.drop_duplicates()
dfa.info()

0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 208
Columns: 759 entries, DOEID to ZLPAMOUNT
dtypes: float64(245), int64(510), object(4)
memory usage: 1.1+ MB

# Task 4 - find and visualize correlations
# Pairwise scatter/histogram grid over a hand-picked set of columns.
pairplot_columns = [
    'TYPEHUQ', 'STORIES', 'KOWNRENT', 'YEARMADERANGE', 'BEDROOMS',
    'NHSLDMEM', 'NUMADULT', 'NUMCHILD', 'ATHOME',
    'SMARTTHERM', 'SMARTMETER', 'TOTSQFT_EN', 'KWH',
    'CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65',
]
# `palette` takes a colour palette and only matters together with `hue`.
# The former palette='RegionC' was neither a palette nor paired with a hue,
# so it had no effect and is dropped.
sns.pairplot(dfa, vars=pairplot_columns)

<seaborn.axisgrid.PairGrid at 0x7fce21e62c50>

# Task 4.1 - find and visualize correlations
# Working subset used for the correlation summary here and reused by later cells.
# This set shows few correlations; 'MONEYPY' and 'EDUCATION' were also considered.
numeric_cols = ['TYPEHUQ', 'YEARMADERANGE', 'NHSLDMEM', 'TOTSQFT_EN', 'KWH', 'SMARTTHERM', 'SMARTMETER']
# Alternative set with more correlation:
# numeric_cols = ['TOTSQFT_EN', 'KWH', 'CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65']
dfa_num = dfa[numeric_cols]

# Negative entries are survey response codes, not measurements (SMARTTHERM:
# -9 = "Don't know", -2 = "Not applicable"; SMARTMETER presumably uses the same
# coding - confirm against the codebook). They are masked for the correlation
# only; dfa_num itself is left unchanged for the later steps.
dfa_corr = dfa_num.where(dfa_num >= 0).corr()
print(dfa_corr, "\n")
print(dfa_corr.mean(), "\n")        # mean signed correlation per column
print(dfa_corr.abs().mean(), "\n")  # mean absolute correlation per column
print(dfa_num.describe(), "\n")
dfa_corr.info()
dfa_num.info()
# Number of distinct values per column.
for col in dfa_num:
    print(f"{col} {dfa_num[col].nunique()}")

                TYPEHUQ  YEARMADERANGE  NHSLDMEM  TOTSQFT_EN       KWH  \
TYPEHUQ        1.000000       0.062120 -0.087615   -0.423996 -0.400076   
YEARMADERANGE  0.062120       1.000000  0.064196    0.203962  0.098226   
NHSLDMEM      -0.087615       0.064196  1.000000    0.106434  0.279585   
TOTSQFT_EN    -0.423996       0.203962  0.106434    1.000000  0.439078   
KWH           -0.400076       0.098226  0.279585    0.439078  1.000000   
SMARTTHERM    -0.168430      -0.016859  0.029209    0.064825 -0.044998   
SMARTMETER    -0.226400      -0.115799  0.068627    0.147296  0.108584   

               SMARTTHERM  SMARTMETER  
TYPEHUQ         -0.168430   -0.226400  
YEARMADERANGE   -0.016859   -0.115799  
NHSLDMEM         0.029209    0.068627  
TOTSQFT_EN       0.064825    0.147296  
KWH             -0.044998    0.108584  
SMARTTHERM       1.000000    0.171654  
SMARTMETER       0.171654    1.000000   

TYPEHUQ         -0.034914
YEARMADERANGE    0.185121
NHSLDMEM         0.208634
TOTSQFT_EN       0.219657
KWH              0.211485
SMARTTHERM       0.147914
SMARTMETER       0.164852
dtype: float64 

TYPEHUQ          0.338377
YEARMADERANGE    0.223023
NHSLDMEM         0.233667
TOTSQFT_EN       0.340799
KWH              0.338650
SMARTTHERM       0.213711
SMARTMETER       0.262623
dtype: float64 

          TYPEHUQ  YEARMADERANGE    NHSLDMEM   TOTSQFT_EN           KWH  \
count  183.000000     183.000000  183.000000   183.000000    183.000000   
mean     2.568306       3.387978    2.989071  1811.136612   5658.203891   
std      1.197287       1.968786    1.511818  1054.321639   3197.166901   
min      1.000000       1.000000    1.000000   462.000000   1172.333000   
25%      2.000000       2.000000    2.000000  1040.500000   3310.382000   
50%      2.000000       3.000000    3.000000  1585.000000   5130.085000   
75%      3.000000       5.000000    4.000000  2219.500000   6837.900000   
max      5.000000       8.000000   10.000000  5480.000000  22633.000000   

       SMARTTHERM  SMARTMETER  
count  183.000000  183.000000  
mean    -0.683060   -1.213115  
std      2.069689    3.949328  
min     -9.000000   -9.000000  
25%      0.000000    0.000000  
50%      0.000000    1.000000  
75%      0.000000    1.000000  
max      1.000000    1.000000   

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, TYPEHUQ to SMARTMETER
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TYPEHUQ        7 non-null      float64
 1   YEARMADERANGE  7 non-null      float64
 2   NHSLDMEM       7 non-null      float64
 3   TOTSQFT_EN     7 non-null      float64
 4   KWH            7 non-null      float64
 5   SMARTTHERM     7 non-null      float64
 6   SMARTMETER     7 non-null      float64
dtypes: float64(7)
memory usage: 448.0+ bytes
<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 0 to 208
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TYPEHUQ        183 non-null    int64  
 1   YEARMADERANGE  183 non-null    int64  
 2   NHSLDMEM       183 non-null    int64  
 3   TOTSQFT_EN     183 non-null    int64  
 4   KWH            183 non-null    float64
 5   SMARTTHERM     183 non-null    int64  
 6   SMARTMETER     183 non-null    int64  
dtypes: float64(1), int64(6)
memory usage: 16.4 KB
TYPEHUQ 5
YEARMADERANGE 8
NHSLDMEM 9
TOTSQFT_EN 169
KWH 183
SMARTTHERM 4
SMARTMETER 3

# pairplot - only numeric columns
numeric_cols = dfa_num.select_dtypes(include=np.number).columns.tolist()

# Pairplot of the first 7 numeric columns.
# No `hue` is set, so the former palette='TOTSQFT_EN' (a column name, not a
# colour palette) had no effect and is removed.
sns.pairplot(dfa_num, vars=numeric_cols[:7])

<seaborn.axisgrid.PairGrid at 0x7fcdffa99a10>

# Task 4.2 - Patterns, outliers, correlations
# Starting from this task? Run all previous cells first via
# Kernel > Restart and Run All.
fig, ax = plt.subplots(figsize=(12, 8))
# TOTSQFT_EN plotted against KWH, drawn on the axes created above.
sns.scatterplot(data=dfa, x="KWH", y="TOTSQFT_EN", ax=ax)
plt.xticks(rotation=0)

(array([    0.,  5000., 10000., 15000., 20000., 25000.]),
 <a list of 6 Text xticklabel objects>)

# Task 4.2 - Patterns, outliers, correlations
# Starting from this task? Run all previous cells first via
# Kernel > Restart and Run All.
fig, ax = plt.subplots(figsize=(12, 8))
# KWH plotted against the EDUCATION code, drawn on the axes created above.
sns.scatterplot(data=dfa, x="EDUCATION", y="KWH", ax=ax)
plt.xticks(rotation=0)

(array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5]),
 <a list of 11 Text xticklabel objects>)

# countplot: rows per HOUSEHOLDER_RACE code, split by NHSLDMEM
#
# HOUSEHOLDER_RACE codes:
#   1 = "White alone"
#   2 = "Black or African/American Alone"
#   3 = "American Indian or Alaska Native Alone"
#   4 = "Asian Alone"
#   5 = "Native Hawaiian or Other Pacific Islander Alone"
#   6 = "Some other race alone"
#   7 = "2 or more Races selected"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfa, x="HOUSEHOLDER_RACE", hue="NHSLDMEM", ax=ax)
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)

# countplot: HOUSEHOLDER_RACE on the x axis, bars split by NHSLDMEM
# NOTE(review): this cell repeats the HOUSEHOLDER_RACE countplot directly above.
#
# HOUSEHOLDER_RACE codes:
#   1 = "White alone"
#   2 = "Black or African/American Alone"
#   3 = "American Indian or Alaska Native Alone"
#   4 = "Asian Alone"
#   5 = "Native Hawaiian or Other Pacific Islander Alone"
#   6 = "Some other race alone"
#   7 = "2 or more Races selected"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(hue="NHSLDMEM", x="HOUSEHOLDER_RACE", ax=ax, data=dfa)
plt.xticks(rotation=45)

# countplot: rows per MONEYPY code, split by EDUCATION code
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="MONEYPY", hue="EDUCATION", data=dfa, ax=ax)
plt.xticks(rotation=45)

# MONEYPY:
#   1 = "Less than $20,000"
#   2 = "$20,000 - $39,999"
#   3 = "$40,000 - $59,999"
#   4 = "$60,000 to $79,999"
#   5 = "$80,000 to $99,999"
#   6 = "$100,000 to $119,999"
#   7 = "$120,000 to $139,999"
#   8 = "$140,000 or more"
# EDUCATION:  (this heading was previously uncommented, which is a syntax error)
#   1 = "Less than high school diploma or GED"
#   2 = "High school diploma or GED"
#   3 = "Some college or Associate's degree"
#   4 = "Bachelor's degree (for example BA, BS)"
#   5 = "Master's, Professional or Doctorate degree (for example MA, MS, MBA, MD, JD, PhD)"

(array([0, 1, 2, 3, 4, 5, 6, 7]), <a list of 8 Text xticklabel objects>)

# countplot: rows per ATHOME value, split by SMARTTHERM code
#
# ATHOME = Number of weekdays someone is at home
# SMARTTHERM:
#   -9 = "Don't know"
#   -2 = "Not applicable"
#    0 = "No"
#    1 = "Yes"
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=dfa, x="ATHOME", hue="SMARTTHERM", ax=ax)
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4, 5]), <a list of 6 Text xticklabel objects>)

# Scatter of ATHOME against the SMARTTHERM code.
#
# ATHOME = Number of weekdays someone is at home
# SMARTTHERM:
#   -9 = "Don't know"
#   -2 = "Not applicable"
#    0 = "No"
#    1 = "Yes"
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=dfa, x="SMARTTHERM", y="ATHOME", ax=ax)
plt.xticks(rotation=0)

(array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5]),
 <a list of 11 Text xticklabel objects>)

# boxplot: distribution of KWH for each NHSLDMEM value
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=dfa, x='NHSLDMEM', y='KWH', ax=ax)
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), <a list of 9 Text xticklabel objects>)

# boxplot: distribution of KWH for each YEARMADERANGE code
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=dfa, x='YEARMADERANGE', y='KWH', ax=ax)
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4, 5, 6, 7]), <a list of 8 Text xticklabel objects>)

# boxplot: distribution of KWH for each TYPEHUQ code
#
# TYPEHUQ:
#   1 = "Mobile home"
#   2 = "Single-family detached house"
#   3 = "Single-family attached house"
#   4 = "Apartment in a building with 2 to 4 units"
#   5 = "Apartment in a building with 5 or more units"
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=dfa, x='TYPEHUQ', y='KWH', ax=ax)
plt.xticks(rotation=45)

(array([0, 1, 2, 3, 4]), <a list of 5 Text xticklabel objects>)

# Task 5 - cluster your data to identify similar groups
# If you are starting from this task, you can run cells from all previous tasks in
# the kernel by going to Kernel > Restart and Run All
print(dfa_num.columns.tolist())
# Standardize every column so that no single feature dominates the distances.
Xa = StandardScaler().fit_transform(dfa_num)
# random_state pins the random centroid initialisation so reruns give the same
# clusters; n_init is spelled out because its default differs between
# scikit-learn versions.
kmeans = KMeans(n_clusters=5, init='random', n_init=10, random_state=42)
pred = kmeans.fit_predict(Xa)  # fit the model and label every row in one step
np.unique(pred)

['TYPEHUQ', 'YEARMADERANGE', 'NHSLDMEM', 'TOTSQFT_EN', 'KWH', 'SMARTTHERM', 'SMARTMETER']

array([0, 1, 2, 3, 4], dtype=int32)

fig, ax = plt.subplots(figsize=(12, 8))
centers = kmeans.cluster_centers_
# Rows in standardized feature columns 1 and 2, coloured by cluster label.
ax.scatter(Xa[:, 1], Xa[:, 2], c=pred, cmap='viridis')
# Cluster centres in the same two columns.
ax.scatter(centers[:, 1], centers[:, 2], c='grey', s=50)

<matplotlib.collections.PathCollection at 0x7fcdfd2e2910>

# Task 6 - PCA for dimensionality reduction
# Starting from this task? Run all previous cells first via
# Kernel > Restart and Run All.
# A fractional n_components asks PCA to keep enough components to explain
# that share (here 95%) of the variance.
pca = PCA(n_components=0.95).fit(Xa)
pcad = pca.transform(Xa)
print(pca.explained_variance_ratio_)

[0.2914217  0.18036113 0.14077383 0.13535373 0.11064364 0.07090507
 0.0705409 ]

# Rows projected onto the first two principal components.
fig, ax = plt.subplots(figsize=(12, 8))
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.scatterplot(x=pcad[:, 0], y=pcad[:, 1], ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x7fcdff8cc6d0>

# Fit prince's PCA on the same subset, keeping every available component.
# The component count is taken from the data: the previous hard-coded 759
# exceeded the number of columns in dfa_num, which caps how many components
# can exist.
pca2 = prince.PCA(n_components=dfa_num.shape[1], n_iter=3, rescale_with_mean=True,
    rescale_with_std=True, copy=True, engine='auto')
pca2 = pca2.fit(dfa_num)
pca2.explained_inertia_

[0.2914216954661282,
 0.1803611250031431,
 0.1407738331701052,
 0.13535373054132957,
 0.11064364390921308,
 0.0709050699151458,
 0.07054090199493465]

# Plot the rows on the first two principal components, once per colouring column.
# This call used to be copy-pasted cell by cell (with YEARMADERANGE and
# SMARTTHERM plotted more than once); one loop now draws each colouring once,
# in the original order.
#
# Columns the PCA was fitted on:
#   'TYPEHUQ','YEARMADERANGE','NHSLDMEM','TOTSQFT_EN','KWH','SMARTTHERM','SMARTMETER'
# Further candidate colouring columns noted during exploration:
#   'NHSLDMEM','NUMADULT','NUMCHILD','TOTSQFT_EN','KWH',
#   'CDD30YR','CDD65','CDD80','HDD30YR','HDD65','HDD50','GNDHDD65'
color_columns = [
    'TYPEHUQ', 'STORIES', 'YEARMADERANGE', 'IECC_CLIMATE_PUB', 'SMARTMETER',
    'ATHOME', 'THERMAIN', 'SMARTTHERM', 'BEDROOMS', 'KOWNRENT', 'DRYER',
]
for color_column in color_columns:
    # ax=None makes each call draw on a new figure of the given size.
    ax = pca2.plot_row_coordinates(
        dfa_num, ax=None, figsize=(12, 8),
        x_component=0, y_component=1, labels=None,
        color_labels=dfa[color_column], ellipse_outline=False,
        ellipse_fill=True, show_points=True,
    )
    # Name the colouring column so the figures can be told apart.
    ax.set_title(f"Row principal coordinates coloured by {color_column}")

	DOEID	REGIONC	DIVISION	METROMICRO	UATYP10	TYPEHUQ	CELLAR	BASEFIN	...	ZELAMOUNT	NGXBTU	PERIODNG	FOXBTU	PERIODFO	ZFOAMOUNT	LPXBTU	PERIODLP	ZLPAMOUNT
0	10021	4	10	METRO	U	2	1	0	...	0	102.88	1	137.45	-2	-2	91.33	-2	-2
1	10045	4	10	METRO	U	2	0	-2	...	0	101.65	1	137.45	-2	-2	91.33	-2	-2
2	10086	4	10	METRO	U	2	0	-2	...	0	103.31	1	137.45	-2	-2	91.33	-2	-2
3	10125	4	10	METRO	U	2	0	-2	...	1	103.12	1	137.45	-2	-2	91.33	-2	-2
4	10126	4	10	METRO	U	2	1	0	...	0	101.87	1	137.45	-2	-2	91.33	-2	-2