1 準備

1.1 warningsの処理

options(warn=-1)
import warnings
warnings.simplefilter('ignore')

1.2 書籍で利用しているRのデータを.csv

# install.packages("car")
library(car)
data(Davis)
Davis
write.csv(Davis,"Davis.csv",row.names=FALSE)
# install.packages("car")
library(MASS)
data(road)
road
write.csv(road,"road.csv",row.names=FALSE)

2 1章コードのpython

1章のコードは存在しないのでなし

3 2章コードのpython

# 実行例2.1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("Davis.csv")
plt.hist(df["weight"],
         bins = 14)
## (array([10., 58., 55., 32., 26., 11.,  5.,  1.,  1.,  0.,  0.,  0.,  0.,
##         1.]), array([ 39.        ,  48.07142857,  57.14285714,  66.21428571,
##         75.28571429,  84.35714286,  93.42857143, 102.5       ,
##        111.57142857, 120.64285714, 129.71428571, 138.78571429,
##        147.85714286, 156.92857143, 166.        ]), <BarContainer object of 14 artists>)
plt.xlim(35 ,105)
## (35.0, 105.0)
plt.show()

plt.clf()
# 実行例2.2
mu = df["weight"].mean()
s2 = df["weight"].var(ddof=0)
print(mu, s2)
## 65.8 226.71999999999977
# 実行例2.3
from scipy.stats import chi2
diff = df["weight"] - df["weight"].mean()
a = diff * diff / s2
th = chi2.ppf(q = 0.99, df = 1)
plt.scatter(df.index,
            a,
            color = "blue")
plt.hlines(th, 0, 199, color="red")
plt.xlabel("index")
plt.ylabel("anomaly score")
plt.show()

plt.clf()
# 実行例2.4
X = df[["weight", "height"]]
plt.scatter(X["weight"],
            X["height"])
plt.xlabel("weight")
plt.ylabel("height")
plt.show()

plt.clf()
# 実行例2.5
X = df[["weight", "height"]].values
mx = X.mean(axis = 0)
Xc = X - mx
Sx = Xc.T @ Xc / X.shape[0]
a0 = Xc @ np.linalg.inv(Sx) * Xc
a = a0.sum(axis = 1)
plt.scatter(df.index,
            a)
plt.xlabel("index")
plt.xlabel("anomaly score")
plt.hlines(th, 0, 199, color = "red")
plt.show()

plt.clf()
# 実行例2.6
df0 = pd.read_csv("road.csv")
df1 = df0.div(df0["drivers"], axis=0).drop("drivers", axis = 1)
df2 = np.log(df1 + 1)
df = df2.values
mx = df.mean(axis = 0)
Xc = df - mx
Sx = Xc.T @ Xc / df.shape[0]
a0 = Xc @ np.linalg.inv(Sx) * Xc
a = a0.sum(axis = 1) / df.shape[1]
plt.scatter(np.arange(df.shape[0]),
            a)
plt.xlabel("index")
plt.xlabel("anomaly score")
plt.ylim(-1 / df.shape[1], 30 / df.shape[1])
## (-0.2, 6.0)
plt.hlines(1, 0, 30, color = "red")
plt.show()

plt.clf()
# 実行例2.7
# 書籍のRコードではrownameがCalifの行を取り出している。
# Califは5行目(pythonのインデックス的には4)なので、それを抜き出す。
xc_prime = Xc[4, :]
SN1 = 10 * np.log10(xc_prime**2 / np.diag(Sx))
plt.bar(df2.columns ,height = SN1)
## <BarContainer object of 5 artists>
plt.show()

plt.clf()