import pandas as pd
import numpy as np
# Load the credit-card dataset from the working directory and print its
# column summary (names, non-null counts, dtypes).
cc = pd.read_csv("AER_credit_card_data.csv")
cc.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 1319 entries, 0 to 1318
## Data columns (total 12 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 card 1319 non-null object
## 1 reports 1319 non-null int64
## 2 age 1319 non-null float64
## 3 income 1319 non-null float64
## 4 share 1319 non-null float64
## 5 expenditure 1319 non-null float64
## 6 owner 1319 non-null object
## 7 selfemp 1319 non-null object
## 8 dependents 1319 non-null int64
## 9 months 1319 non-null int64
## 10 majorcards 1319 non-null int64
## 11 active 1319 non-null int64
## dtypes: float64(4), int64(5), object(3)
## memory usage: 123.8+ KB
cc.head()
## card reports age income ... dependents months majorcards active
## 0 yes 0 37.66667 4.5200 ... 3 54 1 12
## 1 yes 0 33.25000 2.4200 ... 3 34 1 13
## 2 yes 0 33.66667 4.5000 ... 4 58 1 5
## 3 yes 0 30.50000 2.5400 ... 0 25 1 7
## 4 yes 0 32.16667 9.7867 ... 2 64 1 5
##
## [5 rows x 12 columns]
# Encode the target as an integer flag: 1 where card == "yes", 0 for every
# other value.
cc["card"] = (cc["card"] == "yes").astype(int)
cc["card"][:20]
## 0 1
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
## 9 1
## 10 1
## 11 0
## 12 0
## 13 1
## 14 1
## 15 1
## 16 1
## 17 0
## 18 1
## 19 0
## Name: card, dtype: int32
Split the data into train/validation/test sets with a 60%/20%/20% distribution.
len(cc)
## 1319
# Row counts for the 60/20/20 train/validation/test split.
# The two 20% parts are rounded to the nearest whole row and the training part
# takes whatever is left, so the three sizes add up to len(cc) for any dataset
# size instead of relying on hand-tuned rounding offsets.
n_val = round(len(cc) * 0.2)
n_test = round(len(cc) * 0.2)
n_train = len(cc) - n_val - n_test
n_train
## 791
n_val
## 264
n_test
## 264
# Sanity check: must equal len(cc).
tot = n_train + n_val + n_test
tot
## 1319
from sklearn.model_selection import train_test_split
# Two-stage split with a fixed seed: hold out the test rows first, then carve
# the validation rows out of the remainder.
ftrain, test = train_test_split(cc, test_size=n_test, random_state=1)
len(ftrain)
## 1055
len(test)
## 264
train, val = train_test_split(ftrain, test_size=n_val, random_state=1)
len(train)
## 791
len(val)
## 264
# Target vectors as plain arrays, one per partition.
y_train = train["card"].to_numpy()
y_val = val["card"].to_numpy()
y_test = test["card"].to_numpy()
ROC AUC can also be used to evaluate the feature importance of numerical variables.
Let’s do that.
If your AUC is < 0.5, invert the variable by putting “-” in front of it
(e.g. `-df_train['expenditure']`).
AUC can go below 0.5 if the variable is negatively correlated with the target variable. You can change the direction of the correlation by negating the variable; the negative correlation then becomes positive.
Which numerical variable (among the following 4) has the highest AUC?
from sklearn.metrics import roc_auc_score
# Names of the numeric (non-object) columns, excluding the target.
# "card" is removed by name rather than by position so the result does not
# depend on the target being the first numeric column; a missing "card"
# column raises KeyError instead of silently dropping a feature.
num_col = train.dtypes[train.dtypes != object].index.drop("card").values
num_col
## array(['reports', 'age', 'income', 'share', 'expenditure', 'dependents',
## 'months', 'majorcards', 'active'], dtype=object)
from IPython.display import display
# ROC AUC of each numeric feature used directly as a score for the target.
# The raw AUC is recorded as-is; a value below 0.5 means the feature is
# negatively related to the target.
col = []
score = []
for name in num_col:
    auc = roc_auc_score(train.card.values, train[name].values)
    # print() returns None, so it must not be wrapped in display(): doing so
    # emits an extra "None" after every printed line.
    print(name)
    print(auc)
    col.append(name)
    score.append(auc)
## reports
## 0.28333701393106236
## age
## 0.4759979020592945
## income
## 0.5908049467233478
## share
## 0.989183643423692
## expenditure
## 0.991042345276873
## dependents
## 0.46722427722262094
## months
## 0.470578221903237
## majorcards
## 0.5343859842838476
## active
## 0.6043173411362006
# Column-oriented mapping of feature names to their AUC values.
roc_auc = {"variable": col, "value": score}
roc_auc
## {'variable': ['reports', 'age', 'income', 'share', 'expenditure', 'dependents', 'months', 'majorcards', 'active'], 'value': [0.28333701393106236, 0.4759979020592945, 0.5908049467233478, 0.989183643423692, 0.991042345276873, 0.46722427722262094, 0.470578221903237, 0.5343859842838476, 0.6043173411362006]}
# Tabulate the scores, one row per feature, on an explicit 0..n-1 index.
ind = np.arange(len(col))
auc_table = pd.DataFrame(roc_auc, index=ind)
auc_table
## variable value
## 0 reports 0.283337
## 1 age 0.475998
## 2 income 0.590805
## 3 share 0.989184
## 4 expenditure 0.991042
## 5 dependents 0.467224
## 6 months 0.470578
## 7 majorcards 0.534386
## 8 active 0.604317
Which numerical variable (among the following 4) has the highest AUC?
From now on, use these columns only:
["reports", "age", "income", "share", "expenditure", "dependents", "months", "majorcards", "active", "owner", "selfemp"]
Apply one-hot encoding using DictVectorizer and train the logistic regression with these parameters:
LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
# Restrict all three feature frames to exactly the model inputs.
# "card" is not in the list, so selecting by name also removes the target from
# each frame. Applying the same selection to val and test keeps their feature
# set explicitly in sync with train instead of relying on whatever columns
# happen to remain after dropping the target.
selected = [
    "reports", "age", "income", "share", "expenditure", "dependents",
    "months", "majorcards", "active", "owner", "selfemp",
]
train = train[selected]
val = val[selected]
test = test[selected]
train.columns
## Index(['reports', 'age', 'income', 'share', 'expenditure', 'dependents',
## 'months', 'majorcards', 'active', 'owner', 'selfemp'],
## dtype='object')
train
## reports age income share ... majorcards active owner selfemp
## 1105 3 40.50000 4.0128 0.000299 ... 1 17 no no
## 431 1 32.33333 6.0000 0.000200 ... 1 4 yes no
## 407 1 29.16667 2.2000 0.038205 ... 1 7 no no
## 1217 1 54.66667 7.2900 0.106536 ... 1 9 yes no
## 1133 0 25.00000 3.3984 0.000353 ... 0 4 yes no
## ... ... ... ... ... ... ... ... ... ...
## 416 0 53.00000 2.4500 0.017718 ... 1 11 yes no
## 1162 2 30.58333 2.5000 0.000480 ... 1 18 no no
## 128 0 24.75000 1.8750 0.080708 ... 0 1 no no
## 413 1 56.91667 3.4838 0.062895 ... 1 7 yes no
## 1203 2 24.58333 2.2000 0.000545 ... 1 8 no no
##
## [791 rows x 11 columns]
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
# Vectorise the training rows (as record dicts, dense output) and fit the
# logistic regression on the resulting matrix.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train.to_dict(orient="records"))
model = LogisticRegression(C=1.0, solver="liblinear", max_iter=1000)
model.fit(X_train, y_train)
## LogisticRegression(max_iter=1000, solver='liblinear')
# Encode the validation rows with the vectoriser fitted on the training data,
# then keep the predicted probability of the positive class (column 1).
X_val = dv.transform(val.to_dict(orient="records"))
y_pred = model.predict_proba(X_val)[:, 1]
What’s the AUC of this model on the validation dataset? (round to 3 digits)
# Validation ROC AUC of the predicted probabilities, rounded to 3 decimals.
roc_auc_score(y_val, y_pred).round(3)
## 0.995
Now let’s compute precision and recall for our model.
At which threshold do the precision and recall curves intersect?
# Candidate decision thresholds, evenly spaced over [0, 1].
# NOTE(review): 100 points gives a spacing of 1/99 (~0.0101, as the printed
# array shows); the 0.01 increment asked for in the F1 question needs 101
# points — confirm which is intended before changing, since every recorded
# output below depends on it.
thresholds = np.linspace(0.0, 1.0, 100)
thresholds
## array([0. , 0.01010101, 0.02020202, 0.03030303, 0.04040404,
## 0.05050505, 0.06060606, 0.07070707, 0.08080808, 0.09090909,
## 0.1010101 , 0.11111111, 0.12121212, 0.13131313, 0.14141414,
## 0.15151515, 0.16161616, 0.17171717, 0.18181818, 0.19191919,
## 0.2020202 , 0.21212121, 0.22222222, 0.23232323, 0.24242424,
## 0.25252525, 0.26262626, 0.27272727, 0.28282828, 0.29292929,
## 0.3030303 , 0.31313131, 0.32323232, 0.33333333, 0.34343434,
## 0.35353535, 0.36363636, 0.37373737, 0.38383838, 0.39393939,
## 0.4040404 , 0.41414141, 0.42424242, 0.43434343, 0.44444444,
## 0.45454545, 0.46464646, 0.47474747, 0.48484848, 0.49494949,
## 0.50505051, 0.51515152, 0.52525253, 0.53535354, 0.54545455,
## 0.55555556, 0.56565657, 0.57575758, 0.58585859, 0.5959596 ,
## 0.60606061, 0.61616162, 0.62626263, 0.63636364, 0.64646465,
## 0.65656566, 0.66666667, 0.67676768, 0.68686869, 0.6969697 ,
## 0.70707071, 0.71717172, 0.72727273, 0.73737374, 0.74747475,
## 0.75757576, 0.76767677, 0.77777778, 0.78787879, 0.7979798 ,
## 0.80808081, 0.81818182, 0.82828283, 0.83838384, 0.84848485,
## 0.85858586, 0.86868687, 0.87878788, 0.88888889, 0.8989899 ,
## 0.90909091, 0.91919192, 0.92929293, 0.93939394, 0.94949495,
## 0.95959596, 0.96969697, 0.97979798, 0.98989899, 1. ])
thresholds.shape
## (100,)
# Ground-truth masks for the validation set; they do not depend on the
# threshold, so they are built once.
is_positive = (y_val == 1)
is_negative = (y_val == 0)


def _confusion_counts(threshold):
    """Return (threshold, tp, fp, fn, tn) for predictions cut at *threshold*."""
    flagged = (y_pred >= threshold)
    cleared = (y_pred < threshold)
    return (
        threshold,
        (flagged & is_positive).sum(),  # tp
        (flagged & is_negative).sum(),  # fp
        (cleared & is_positive).sum(),  # fn
        (cleared & is_negative).sum(),  # tn
    )


# One confusion-matrix tuple per threshold, in threshold order.
scores = [_confusion_counts(t) for t in thresholds]
import matplotlib.pyplot as plt
scores
## [(0.0, 211, 53, 0, 0), (0.010101010101010102, 211, 34, 0, 19), (0.020202020202020204, 211, 31, 0, 22), (0.030303030303030304, 211, 24, 0, 29), (0.04040404040404041, 211, 22, 0, 31), (0.05050505050505051, 211, 22, 0, 31), (0.06060606060606061, 211, 19, 0, 34), (0.07070707070707072, 211, 18, 0, 35), (0.08080808080808081, 210, 18, 1, 35), (0.09090909090909091, 210, 18, 1, 35), (0.10101010101010102, 210, 17, 1, 36), (0.11111111111111112, 210, 17, 1, 36), (0.12121212121212122, 210, 17, 1, 36), (0.13131313131313133, 210, 10, 1, 43), (0.14141414141414144, 208, 9, 3, 44), (0.15151515151515152, 208, 9, 3, 44), (0.16161616161616163, 208, 7, 3, 46), (0.17171717171717174, 207, 6, 4, 47), (0.18181818181818182, 207, 6, 4, 47), (0.19191919191919193, 207, 5, 4, 48), (0.20202020202020204, 207, 5, 4, 48), (0.21212121212121213, 207, 5, 4, 48), (0.22222222222222224, 207, 5, 4, 48), (0.23232323232323235, 206, 5, 5, 48), (0.24242424242424243, 206, 5, 5, 48), (0.25252525252525254, 206, 5, 5, 48), (0.26262626262626265, 206, 5, 5, 48), (0.27272727272727276, 206, 5, 5, 48), (0.2828282828282829, 206, 5, 5, 48), (0.29292929292929293, 206, 5, 5, 48), (0.30303030303030304, 205, 4, 6, 49), (0.31313131313131315, 205, 4, 6, 49), (0.32323232323232326, 205, 4, 6, 49), (0.33333333333333337, 205, 4, 6, 49), (0.3434343434343435, 205, 1, 6, 52), (0.3535353535353536, 205, 1, 6, 52), (0.36363636363636365, 205, 1, 6, 52), (0.37373737373737376, 205, 1, 6, 52), (0.38383838383838387, 205, 1, 6, 52), (0.393939393939394, 205, 1, 6, 52), (0.4040404040404041, 205, 1, 6, 52), (0.4141414141414142, 205, 1, 6, 52), (0.42424242424242425, 204, 1, 7, 52), (0.43434343434343436, 204, 1, 7, 52), (0.4444444444444445, 204, 1, 7, 52), (0.4545454545454546, 204, 1, 7, 52), (0.4646464646464647, 204, 1, 7, 52), (0.4747474747474748, 204, 1, 7, 52), (0.48484848484848486, 204, 1, 7, 52), (0.494949494949495, 204, 1, 7, 52), (0.5050505050505051, 204, 1, 7, 52), (0.5151515151515152, 204, 1, 7, 52), (0.5252525252525253, 204, 1, 7, 
52), (0.5353535353535354, 204, 1, 7, 52), (0.5454545454545455, 204, 1, 7, 52), (0.5555555555555556, 204, 1, 7, 52), (0.5656565656565657, 204, 1, 7, 52), (0.5757575757575758, 204, 1, 7, 52), (0.5858585858585859, 204, 1, 7, 52), (0.595959595959596, 204, 1, 7, 52), (0.6060606060606061, 204, 1, 7, 52), (0.6161616161616162, 204, 1, 7, 52), (0.6262626262626263, 204, 1, 7, 52), (0.6363636363636365, 204, 1, 7, 52), (0.6464646464646465, 204, 1, 7, 52), (0.6565656565656566, 204, 1, 7, 52), (0.6666666666666667, 204, 1, 7, 52), (0.6767676767676768, 204, 1, 7, 52), (0.686868686868687, 204, 1, 7, 52), (0.696969696969697, 204, 1, 7, 52), (0.7070707070707072, 204, 1, 7, 52), (0.7171717171717172, 204, 1, 7, 52), (0.7272727272727273, 204, 1, 7, 52), (0.7373737373737375, 204, 1, 7, 52), (0.7474747474747475, 204, 1, 7, 52), (0.7575757575757577, 204, 1, 7, 52), (0.7676767676767677, 204, 1, 7, 52), (0.7777777777777778, 204, 1, 7, 52), (0.787878787878788, 204, 1, 7, 52), (0.797979797979798, 204, 1, 7, 52), (0.8080808080808082, 204, 1, 7, 52), (0.8181818181818182, 204, 1, 7, 52), (0.8282828282828284, 204, 1, 7, 52), (0.8383838383838385, 204, 1, 7, 52), (0.8484848484848485, 204, 0, 7, 53), (0.8585858585858587, 204, 0, 7, 53), (0.8686868686868687, 204, 0, 7, 53), (0.8787878787878789, 204, 0, 7, 53), (0.888888888888889, 204, 0, 7, 53), (0.8989898989898991, 204, 0, 7, 53), (0.9090909090909092, 204, 0, 7, 53), (0.9191919191919192, 204, 0, 7, 53), (0.9292929292929294, 204, 0, 7, 53), (0.9393939393939394, 204, 0, 7, 53), (0.9494949494949496, 204, 0, 7, 53), (0.9595959595959597, 204, 0, 7, 53), (0.9696969696969697, 203, 0, 8, 53), (0.9797979797979799, 203, 0, 8, 53), (0.98989898989899, 202, 0, 9, 53), (1.0, 179, 0, 32, 53)]
# Per-threshold confusion counts as a table, plus the derived metrics.
columns = ['threshold', 'tp', 'fp', 'fn', 'tn']
df_scores = pd.DataFrame(scores, columns=columns)
# precision = TP / (TP + FP): share of predicted positives that are correct.
df_scores['precision'] = df_scores.tp / (df_scores.tp + df_scores.fp)
# recall = TP / (TP + FN): share of actual positives that are found.
# The denominator is the number of actual positives (tp + fn), not tp + tn.
# NOTE: the outputs recorded below in this document (recall, F1 and the
# answers derived from them) were produced with tp / (tp + tn) and need to be
# regenerated.
df_scores['recall'] = df_scores.tp / (df_scores.tp + df_scores.fn)
df_scores
## threshold tp fp fn tn precision recall
## 0 0.000000 211 53 0 0 0.799242 1.000000
## 1 0.010101 211 34 0 19 0.861224 0.917391
## 2 0.020202 211 31 0 22 0.871901 0.905579
## 3 0.030303 211 24 0 29 0.897872 0.879167
## 4 0.040404 211 22 0 31 0.905579 0.871901
## .. ... ... .. .. .. ... ...
## 95 0.959596 204 0 7 53 1.000000 0.793774
## 96 0.969697 203 0 8 53 1.000000 0.792969
## 97 0.979798 203 0 8 53 1.000000 0.792969
## 98 0.989899 202 0 9 53 1.000000 0.792157
## 99 1.000000 179 0 32 53 1.000000 0.771552
##
## [100 rows x 7 columns]
# Precision and recall as functions of the decision threshold, drawn on one
# set of axes; afterwards show the first ten rows of the plotted data.
df_plot = df_scores[["threshold", "precision", "recall"]]
for metric in ("precision", "recall"):
    plt.plot(df_plot["threshold"], df_plot[metric], label=metric)
plt.xlabel("Threshold")
plt.ylabel("Metric")
plt.legend()
plt.show()
df_plot.iloc[:10]
## threshold precision recall
## 0 0.000000 0.799242 1.000000
## 1 0.010101 0.861224 0.917391
## 2 0.020202 0.871901 0.905579
## 3 0.030303 0.897872 0.879167
## 4 0.040404 0.905579 0.871901
## 5 0.050505 0.905579 0.871901
## 6 0.060606 0.917391 0.861224
## 7 0.070707 0.921397 0.857724
## 8 0.080808 0.921053 0.857143
## 9 0.090909 0.921053 0.857143
closest to 0.1
Precision and recall are conflicting: when one grows, the other goes down. That’s why they are often combined into the F1 score, a metric that takes both into account.
\[F_1 = 2 \cdot \cfrac{P \cdot R}{P + R}\]
Where P is precision and R is recall.
Let’s compute F1 for all thresholds from 0.0 to 1.0 with increment 0.01 using the validation set
At which threshold is F1 maximal?
# F1 = 2 * P * R / (P + R), computed row by row from the existing precision
# and recall columns.
precision = df_scores["precision"]
recall = df_scores["recall"]
harmonic_half = (precision * recall) / (precision + recall)
df_scores["F1"] = 2 * harmonic_half
df_scores
## threshold tp fp fn tn precision recall F1
## 0 0.000000 211 53 0 0 0.799242 1.000000 0.888421
## 1 0.010101 211 34 0 19 0.861224 0.917391 0.888421
## 2 0.020202 211 31 0 22 0.871901 0.905579 0.888421
## 3 0.030303 211 24 0 29 0.897872 0.879167 0.888421
## 4 0.040404 211 22 0 31 0.905579 0.871901 0.888421
## .. ... ... .. .. .. ... ... ...
## 95 0.959596 204 0 7 53 1.000000 0.793774 0.885033
## 96 0.969697 203 0 8 53 1.000000 0.792969 0.884532
## 97 0.979798 203 0 8 53 1.000000 0.792969 0.884532
## 98 0.989899 202 0 9 53 1.000000 0.792157 0.884026
## 99 1.000000 179 0 32 53 1.000000 0.771552 0.871046
##
## [100 rows x 8 columns]
# All rows whose F1 equals the maximum (ties are kept).
best_f1 = df_scores["F1"].max()
df_scores.loc[df_scores["F1"] == best_f1]
## threshold tp fp fn tn precision recall F1
## 1 0.010101 211 34 0 19 0.861224 0.917391 0.888421
## 6 0.060606 211 19 0 34 0.917391 0.861224 0.888421
closest to 0.1
For various reasons, I’m exhausted. If possible, I will try to catch up next week; for now I’ll skip the next two questions.