import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import prince
# READ DATABASE
dfa = pd.read_csv("recs2015_public_v4_3C.csv")
print(dfa.shape, "\n")
print(dfa.info(), "\n")
print(dfa.describe(), "\n")
dfa.head()
print(dfa.groupby(['TYPEHUQ']).size(), "\n") # frequency of each unique label in column
dfa['TYPEHUQ'].value_counts() # alternate and better way
# remove leading and trailing whitespaces from each cell, only works on string columns
dfa = dfa.apply(lambda x: x.str.strip() if x.dtype=='object' else x)
dfa['TYPEHUQ'].value_counts()
# check average NAs in each column
print(dfa.isna().mean())
dfa.isna().mean().plot(kind='barh')
# drop columns with >30% missing values (1 column). This cut-off is arbitrary, choose carefully
dfa = dfa.loc[:, dfa.isna().mean() < .3]
# replace missing values with NaN, and drop rows with missing values
dfa = dfa.replace('NA',np.nan)
dfa = dfa.dropna()
dfa.info()
print(dfa.duplicated().sum())
dfa = dfa.drop_duplicates()
dfa.info()
# Task 4 - find and visualize correlations
sns.pairplot(dfa, vars=('TYPEHUQ','STORIES','KOWNRENT','YEARMADERANGE','BEDROOMS','NHSLDMEM','NUMADULT', 'NUMCHILD', 'ATHOME',
'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65'), palette='RegionC')
# Task 4.1 - find and visualize correlations
#dfa_num = dfa[['TYPEHUQ','YEARMADERANGE','NHSLDMEM','TOTSQFT_EN','KWH','SMARTTHERM','SMARTMETER']] #FEW CORRELATION 'MONEYPY', 'EDUCATION'
dfa_num = dfa[['TOTSQFT_EN','KWH','CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65']] #MORE CORRELATION
numeric_cols = dfa_num.columns.tolist()
dfa_num = dfa[numeric_cols]
dfa_corr = dfa_num.corr()
print(dfa_corr, "\n")
print(dfa_corr.mean(), "\n")
print(dfa_corr.abs().mean(), "\n")
print(dfa_num.describe(), "\n")
dfa_corr.info()
dfa_num.info()
for col in dfa_num:
print(col + ' ' + str(dfa_num[col].nunique()))
# pairplot - only numeric columns
numeric_cols = dfa_num.select_dtypes(include=np.number).columns.tolist()
#pairplot of first 5 numeric columns, coloured by Number of Doors
sns.pairplot(dfa_num, vars=numeric_cols[:7], palette='TOTSQFT_EN')
# Task 4.2 - Patterns, outliers, correlations
# If you are starting from this task, you can run cells from all previous tasks in
# the kernel by going to Kernel > Restart and Run All
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x="KWH", y="TOTSQFT_EN", data=dfa)
plt.xticks(rotation=0)
# Task 4.2 - Patterns, outliers, correlations
# If you are starting from this task, you can run cells from all previous tasks in
# the kernel by going to Kernel > Restart and Run All
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x="EDUCATION", y="KWH", data=dfa)
plt.xticks(rotation=0)
# countplot
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="HOUSEHOLDER_RACE", hue="NHSLDMEM", data=dfa, ax=ax)
plt.xticks(rotation=45)
#1 = "White alone"
#2 = "Black or African/American Alone"
#3 = "American Indian or Alaska Native Alone"
#4 = "Asian Alone"
#5 = "Native Hawaiian or Other Pacific Islander Alone"
#6 = "Some other race alone"
#7 = "2 or more Races selected"
# countplot
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="HOUSEHOLDER_RACE", hue="NHSLDMEM", data=dfa, ax=ax)
plt.xticks(rotation=45)
#1 = "White alone"
#2 = "Black or African/American Alone"
#3 = "American Indian or Alaska Native Alone"
#4 = "Asian Alone"
#5 = "Native Hawaiian or Other Pacific Islander Alone"
#6 = "Some other race alone"
#7 = "2 or more Races selected"
# countplot
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="MONEYPY", hue="EDUCATION", data=dfa, ax=ax)
plt.xticks(rotation=45)
#MONEYPY:
# 1 = "Less than $20,000"
# 2 = "$20,000 - $39,999"
# 3 = "$40,000 - $59,999"
# 4 = "$60,000 to $79,999"
# 5 = "$80,000 to $99,999"
# 6 = "$100,000 to $119,999"
# 7 = "$120,000 to $139,999"
# 8 = "$140,000 or more"
#EDUCATION:
# 1 = "Less than high school diploma or GED"
# 2 = "High school diploma or GED"
# 3 = "Some college or Associate´s degree"
# 4 = "Bachelor´s degree (for example BA, BS)"
# 5 = "Master´s, Professional or Doctorate degree (for example MA, MS, MBA, MD, Jd, PhD)"
# countplot
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="ATHOME", hue="SMARTTHERM", data=dfa, ax=ax)
plt.xticks(rotation=45)
#ATHOME = Number of weekdays someone is at home
#SMARTTHERM:
# -9 = "Don´t know"
# -2 = "Not applicable"
# 0 = "No"
# 1 = "Yes"
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x="SMARTTHERM", y="ATHOME", data=dfa)
plt.xticks(rotation=0)
#ATHOME = Number of weekdays someone is at home
#SMARTTHERM:
# -9 = "Don´t know"
# -2 = "Not applicable"
# 0 = "No"
# 1 = "Yes"
#boxplot
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='NHSLDMEM', y='KWH', data=dfa)
plt.xticks(rotation=45)
#boxplot
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='YEARMADERANGE', y='KWH', data=dfa)
plt.xticks(rotation=45)
#boxplot
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='TYPEHUQ', y='KWH', data=dfa)
plt.xticks(rotation=45)
#TYPEHUQ:
# 1 = "Mobile home"
# 2 = "Single-family detached house"
# 3 = "Single-family attached house"
# 4 = "Apartment in a building with 2 to 4 units"
# 5 = "Apartment in a building with 5 or more units"
# Task 5 - cluster your data to identify similar groups
# If you are starting from this task, you can run cells from all previous tasks in
# the kernel by going to Kernel > Restart and Run All
print(dfa_num.columns.tolist())
Xa = StandardScaler().fit_transform(dfa_num)
kmeans = KMeans(n_clusters=5, init='random') # initialization
kmeans.fit(Xa) # actual execution
pred = kmeans.predict(Xa)
np.unique(pred)
fig, ax = plt.subplots(figsize=(12, 8))
plt.scatter(Xa[:, 1], Xa[:, 2], c=pred, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 1], centers[:, 2], c='grey', s=50)
# Task 6 - PCA for dimensionality reduction
# If you are starting from this task, you can run cells from all previous tasks in
# the kernel by going to Kernel > Restart and Run All
pca = PCA(n_components=0.95)
pca.fit(Xa)
pcad = pca.transform(Xa)
print(pca.explained_variance_ratio_)
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(pcad[:,0], pcad[:,1])
pca2 = prince.PCA(n_components=759, n_iter=3, rescale_with_mean=True,
rescale_with_std=True, copy=True, engine='auto')
pca2 = pca2.fit(dfa_num)
pca2.explained_inertia_
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['TYPEHUQ'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['STORIES'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['YEARMADERANGE'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['IECC_CLIMATE_PUB'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['SMARTMETER'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['ATHOME'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['THERMAIN'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['SMARTTHERM'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['YEARMADERANGE'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
#[['TYPEHUQ','YEARMADERANGE','NHSLDMEM','TOTSQFT_EN','KWH','SMARTTHERM','SMARTMETER']
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['SMARTTHERM'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
#TYPEHUQ','STORIES','KOWNRENT','YEARMADERANGE','BEDROOMS','NHSLDMEM','NUMADULT', 'NUMCHILD', 'ATHOME',
# 'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65'
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['BEDROOMS'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
#'NUMADULT', 'NUMCHILD', 'ATHOME',
# 'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65'
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['KOWNRENT'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)
#'NUMADULT', 'NUMCHILD', 'ATHOME',
# 'SMARTTHERM','SMARTMETER','TOTSQFT_EN','KWH','CDD30YR', 'CDD65', 'CDD80', 'HDD30YR', 'HDD65', 'HDD50', 'GNDHDD65'
ax = pca2.plot_row_coordinates(dfa_num, ax=None, figsize=(12, 8),
x_component=0, y_component=1, labels=None,
color_labels=dfa['DRYER'], ellipse_outline=False,
ellipse_fill=True, show_points=True
)