Excel có 2 công cụ tuyệt vời là solver và goal seek. Với những dự án
nhỏ, hoặc dữ liệu nhỏ, mô hình ít thì 2 công cụ này cũng đủ dùng. Nhưng
khi dữ liệu nhiều, số lượng mô hình tăng, excel không còn phù hợp. Có
nhiều công cụ thay thế excel giải quyết những vẫn đề này, trong đó có
python là 1 công cụ rất mạnh.
Trong phần này tôi sử dụng hàm optimize của gói
scipy. Các bước như sau:
class sử dụng cho mục đích tối ưu. Class
này bao gồm 4 hàm:func: Hàm cần tối ưu.
sse: Tổng sai số bình phương. Càng gần 0 càng
tốt.
solver: Tối ưu tham số cho hàm
func.
predict: Xuất kết quả nội suy dựa trên các tham số
đã tối ưu từ hàm solver
gamma
exponential
cauchy
log-logistic
log-normal
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import scipy.optimize as optimize
class opt_gamma():
def __init__(self, actual_pd):
self.actual_pd = actual_pd
self.x_input = range(1,5)
def func(self, x, scale, a, b):
predict = stats.gamma.pdf(x, a = a, scale = b) * scale
return predict
def sse(self, params, xobs, yobs):
ynew = self.func(xobs, *params)
sse = np.sum((ynew - yobs) ** 2)
return sse
def solver(self):
p0 = [1,1,1]
bounds = [(0.0001, 2), (0.0001, 10), (0.0001, 10)]
res = scipy.optimize.minimize(self.sse, p0, args=(self.x_input, self.actual_pd), bounds= bounds)
# res = scipy.optimize.minimize(self.sse, p0, args=(self.x_input, self.actual_pd), method='Nelder-Mead')
return res
def predict(self, t):
res = self.solver()
ypred = self.func(range(1, t+1), *res.x)
return ypred
class opt_exponential:
def __init__(self, actual_pd):
self.actual_pd = actual_pd
self.x_input = range(1,5)
def func(self, x, scale, a):
# change function here
pred = stats.expon.pdf(x, scale = 1/a) * scale
return pred
def sse(self, params, xobs, yobs):
ynew = self.func(xobs, *params)
mse = np.sum((ynew - yobs) ** 2)
return mse
def solver(self):
# change initial guess here
p0 = [1,2]
bounds = [(0.0001, 2), (0.0001, 0.5)]
res = scipy.optimize.minimize(self.sse, p0, args=(self.x_input, self.actual_pd), bounds = bounds)
return res
def predict(self, t=30):
res = self.solver()
ypred = self.func(range(1, t+1), *res.x)
return ypred
class opt_cauchy:
def __init__(self, actual_pd):
self.actual_pd = actual_pd
self.x_input = range(1,5)
def func(self, x, scale, a, b):
# change function here
predict = stats.cauchy.pdf(x, loc = b, scale = a) * scale
return predict
def sse(self, params, xobs, yobs):
ynew = self.func(xobs, *params)
mse = np.sum((ynew - yobs) ** 2)
return mse
def solver(self):
# change initial guess here
p0 = [1,1,1]
bounds = [(0.0001, 2), (0.0001, 15), (-30, 30)]
res = scipy.optimize.minimize(self.sse, p0, args=(self.x_input, self.actual_pd), bounds = bounds)
return res
def predict(self, t=30):
res = self.solver()
ypred = self.func(range(1, t+1), *res.x)
return ypred
class opt_log_logistic:
def __init__(self, actual_pd):
self.actual_pd = actual_pd
self.x_input = range(1,5)
def func(self, x, scale, a, b):
# change function here
predict = scale*(a/b)*(x/b)**(a-1)/(1+(x/b)**a)**2
return predict
def sse(self, params, xobs, yobs):
ynew = self.func(xobs, *params)
mse = np.sum((ynew - yobs) ** 2)
return mse
def solver(self):
# change initial guess here
p0 = [2,3,1]
bounds = [(0.0001, 2), (0.0001, 10), (0.0001, 10)]
res = scipy.optimize.minimize(self.sse, p0, args=(self.x_input, self.actual_pd), bounds=bounds)
return res
def predict(self, t=30):
res = self.solver()
ypred = self.func(range(1, t+1), *res.x)
return ypred
class opt_log_normal:
def __init__(self, actual_pd):
self.actual_pd = actual_pd
self.x_input = range(1,5)
def func(self, x, scale, a, b):
# change function here
predict = (np.exp(-(np.log(x) - a)**2 / (2 * b**2)) / (x * b * np.sqrt(2 * np.pi)))*scale
return predict
def sse(self, params, xobs, yobs):
ynew = self.func(xobs, *params)
mse = np.sum((ynew - yobs) ** 2)
return mse
def solver(self):
# change initial guess here
p0 = [1,1,1]
bounds = [(0.0001, 2), (0.0001, 0.5), (0.0001, 0.5)]
res = scipy.optimize.minimize(self.sse, p0, args=(self.x_input, self.actual_pd), bounds=bounds)
return res
def predict(self, t=30):
res = self.solver()
ypred = self.func(range(1, t+1), *res.x)
return ypred
Gamma = 1,1,1
Cauchy = 1,1,1
Expo = 1,2
Log Logistic = 2,3,1
Log Normal = 1,1,1
data = pd.DataFrame({
'segment_pool_group': ['A','B'],
'Y1':[0.015243, 0.511198],
'Y2':[0.025249, 0.058478],
'Y3':[0.015204, 0.013153],
'Y4':[0.006288, 0.001244]
})
data
## segment_pool_group Y1 Y2 Y3 Y4
## 0 A 0.015243 0.025249 0.015204 0.006288
## 1 B 0.511198 0.058478 0.013153 0.001244
fitted = opt_exponential(data.iloc[0,1:])
fitted.predict(10)
## array([0.02023038, 0.01675521, 0.013877 , 0.0114932 , 0.0095189 ,
## 0.00788374, 0.00652947, 0.00540784, 0.00447888, 0.0037095 ])
def fn_export_by_segment(segment, period):
segment = segment.upper()
actual_pd = data[data.segment_pool_group.isin([segment])]
actual_pd = actual_pd.iloc[0, 1:]
actual_pd = actual_pd.values
res_gamma = opt_gamma(actual_pd)
res_exponential = opt_exponential(actual_pd)
res_cauchy = opt_cauchy(actual_pd)
res_log_logistic = opt_log_logistic(actual_pd)
res_log_normal = opt_log_normal(actual_pd)
params = pd.DataFrame({
'Segment': segment,
'Distribution': ['Gamma', 'Exponential', 'Cauchy', 'Log-logistic', 'Log-normal'],
'SSE': [res_gamma.solver().fun, res_exponential.solver().fun, res_cauchy.solver().fun, res_log_logistic.solver().fun, res_log_normal.solver().fun],
'params': [res_gamma.solver().x, res_exponential.solver().x, res_cauchy.solver().x, res_log_logistic.solver().x, res_log_normal.solver().x]
})
params[['scale', 'a', 'b']] = pd.DataFrame(params.params.to_list())
del params['params']
dict_predict = {
'Segment': segment,
'Period': range(1, period+1),
'Gamma': res_gamma.predict(period),
'Exponential': res_exponential.predict(period),
'Cauchy': res_cauchy.predict(period),
'Log-logistic': res_log_logistic.predict(period),
'Log-normal': res_log_normal.predict(period)
}
best_distribution = params.sort_values('SSE').head(1)
best_distribution = best_distribution['Distribution'].item()
df_best_fit = pd.DataFrame(
{'Segment': segment,
'Period': range(1, period+1),
'PIT PD': dict_predict[best_distribution]
}
)
print('Best distribution of ' + segment + ' is '+ best_distribution)
return params, df_best_fit
cohort_frame = []
params_frame = []
for seg in data.segment_pool_group:
params, df = fn_export_by_segment(seg, 10)
cohort_frame.append(df)
params_frame.append(params)
## Best distribution of A is Gamma
## Best distribution of B is Log-logistic
cohort_frame = pd.concat(cohort_frame, axis=0)
params_frame = pd.concat(params_frame, axis=0)
cohort_frame
## Segment Period PIT PD
## 0 A 1 0.015258
## 1 A 2 0.025186
## 2 A 3 0.015317
## 3 A 4 0.006189
## 4 A 5 0.001999
## 5 A 6 0.000560
## 6 A 7 0.000142
## 7 A 8 0.000034
## 8 A 9 0.000008
## 9 A 10 0.000002
## 0 B 1 0.511253
## 1 B 2 0.057734
## 2 B 3 0.013715
## 3 B 4 0.004842
## 4 B 5 0.002148
## 5 B 6 0.001104
## 6 B 7 0.000628
## 7 B 8 0.000385
## 8 B 9 0.000250
## 9 B 10 0.000170
params_frame
## Segment Distribution SSE scale a b
## 0 A Gamma 2.670094e-08 0.064236 4.470855 0.525033
## 1 A Exponential 1.258736e-04 0.129599 0.188477 NaN
## 2 A Cauchy 2.894647e-07 0.096248 1.209731 1.989756
## 3 A Log-logistic 5.116131e-07 0.069833 3.016979 2.266344
## 4 A Log-normal 1.762030e-04 0.048216 0.500000 0.341430
## 0 B Gamma 4.475289e-05 1.714616 1.194551 0.439128
## 1 B Exponential 6.886861e-02 1.171584 0.500000 NaN
## 2 B Cauchy 1.492210e-05 0.736379 0.158899 1.218210
## 3 B Log-logistic 1.382140e-05 1.217354 2.666813 0.588791
## 4 B Log-normal 7.844108e-05 0.521912 0.000100 0.407397