R異常検知本python化warningsの処理options(warn=-1)
import warnings
warnings.simplefilter('ignore')
Rのデータを.csv化# install.packages("car")
library(car)
data(Davis)
Davis
write.csv(Davis,"Davis.csv",row.names=FALSE)
# install.packages("car")
library(MASS)
data(road)
road
write.csv(road,"road.csv",row.names=FALSE)
python版1章のコードは存在しないのでなし
python版# 実行例2.1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("Davis.csv")
plt.hist(df["weight"],
bins = 14)
## (array([10., 58., 55., 32., 26., 11., 5., 1., 1., 0., 0., 0., 0.,
## 1.]), array([ 39. , 48.07142857, 57.14285714, 66.21428571,
## 75.28571429, 84.35714286, 93.42857143, 102.5 ,
## 111.57142857, 120.64285714, 129.71428571, 138.78571429,
## 147.85714286, 156.92857143, 166. ]), <BarContainer object of 14 artists>)
plt.xlim(35 ,105)
## (35.0, 105.0)
plt.show()
plt.clf()
# 実行例2.2
mu = df["weight"].mean()
s2 = df["weight"].var(ddof=0)
print(mu, s2)
## 65.8 226.71999999999977
# 実行例2.3
from scipy.stats import chi2
diff = df["weight"] - df["weight"].mean()
a = diff * diff / s2
th = chi2.ppf(q = 0.99, df = 1)
plt.scatter(df.index,
a,
color = "blue")
plt.hlines(th, 0, 199, color="red")
plt.xlabel("index")
plt.ylabel("anomaly score")
plt.show()
plt.clf()
# 実行例2.4
X = df[["weight", "height"]]
plt.scatter(X["weight"],
X["height"])
plt.xlabel("weight")
plt.ylabel("height")
plt.show()
plt.clf()
# 実行例2.5
X = df[["weight", "height"]].values
mx = X.mean(axis = 0)
Xc = X - mx
Sx = Xc.T @ Xc / X.shape[0]
a0 = Xc @ np.linalg.inv(Sx) * Xc
a = a0.sum(axis = 1)
plt.scatter(df.index,
a)
plt.xlabel("index")
plt.xlabel("anomaly score")
plt.hlines(th, 0, 199, color = "red")
plt.show()
plt.clf()
# 実行例2.6
df0 = pd.read_csv("road.csv")
df1 = df0.div(df0["drivers"], axis=0).drop("drivers", axis = 1)
df2 = np.log(df1 + 1)
df = df2.values
mx = df.mean(axis = 0)
Xc = df - mx
Sx = Xc.T @ Xc / df.shape[0]
a0 = Xc @ np.linalg.inv(Sx) * Xc
a = a0.sum(axis = 1) / df.shape[1]
plt.scatter(np.arange(df.shape[0]),
a)
plt.xlabel("index")
plt.xlabel("anomaly score")
plt.ylim(-1 / df.shape[1], 30 / df.shape[1])
## (-0.2, 6.0)
plt.hlines(1, 0, 30, color = "red")
plt.show()
plt.clf()
# 実行例2.7
# 書籍のRコードではrownameがCalifの行を取り出している。
# Califは5行目(pythonのインデックス的には4)なので、それを抜き出す。
xc_prime = Xc[4, :]
SN1 = 10 * np.log10(xc_prime**2 / np.diag(Sx))
plt.bar(df2.columns ,height = SN1)
## <BarContainer object of 5 artists>
plt.show()
plt.clf()