機械学習

# Load the grades data set and preview it.
library(DT)

d <- read.csv(file = "https://stats.dip.jp/01_ds/data/seiseki_jp.csv")
head(d) # first rows only

# Interactive table of the complete data set.
datatable(data = d, caption = "成績データ")

# Principal component analysis of the subject scores.
# The first column (student ID) is removed with d[, -1] so that only the
# subject columns enter the analysis.
# scale. = TRUE standardises every variable (analysis based on the
# correlation matrix); scale. = FALSE would use the variance-covariance
# matrix instead.
# The argument name is written in full ("scale.", with the trailing dot)
# rather than relying on partial matching of "scale".
r <- prcomp(d[, -1], scale. = TRUE)

# Standard deviation, proportion of variance and cumulative proportion of
# each principal component.
summary(r)

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.4508 1.0479 0.70060 0.63795 0.54796 0.47059 0.42754
## Proportion of Variance 0.6674 0.1220 0.05454 0.04522 0.03336 0.02461 0.02031
## Cumulative Proportion  0.6674 0.7894 0.84394 0.88916 0.92252 0.94713 0.96744
##                            PC8     PC9
## Standard deviation     0.41376 0.34909
## Proportion of Variance 0.01902 0.01354
## Cumulative Proportion  0.98646 1.00000

# Reference: rebuild the figures reported by summary(r) by hand.
# digits = 1 asks for a minimum of 1 significant digit when printing.
# NOTE: options() is global and is not reset anywhere below.
options(digits = 1)

# Variance of each component (the eigenvalues).
variance <- r$sdev^2
print(variance)

## [1] 6.0 1.1 0.5 0.4 0.3 0.2 0.2 0.2 0.1

# Share of the total variance carried by each component.
proportion_variance <- variance / sum(variance)
print(proportion_variance)

## [1] 0.67 0.12 0.05 0.05 0.03 0.02 0.02 0.02 0.01

# Running total of those shares.
cumulative_propotion <- cumsum(proportion_variance)
print(cumulative_propotion)

## [1] 0.7 0.8 0.8 0.9 0.9 0.9 1.0 1.0 1.0

# Alternative left disabled by the author: eigenvalue table via factoextra.
#library(factoextra)
#get_eigenvalue(r)



# Eigenvectors (variable loadings), one column per principal component.
evec <- r$rotation

datatable(data = round(evec, digits = 2))

# Use the student IDs as row names of the score matrix; if this is skipped,
# the score table and the biplot show plain record numbers instead.
rownames(r$x) <- d$学籍番号

# Principal component scores of every record, rounded to 2 decimals.
datatable(data = round(r$x, digits = 2))

library(factoextra)
library(corrplot)

# Scree plot with the value printed on each bar.
fviz_screeplot(r, addlabels = TRUE)

# Contribution of the variables to the 1st principal component (top 5).
fviz_contrib(r, choice = "var", axes = 1, top = 5)

# Contribution of the variables to the 2nd principal component (top 5).
fviz_contrib(r, choice = "var", axes = 2, top = 5)

# Per-variable PCA results; the `cor` element is drawn as a matrix plot
# with the coefficients printed in gray.
# NOTE(review): the object is called `var`, the same name as stats::var();
# the name is kept because later code may refer to it.
var <- get_pca_var(r)
corrplot(var$cor, is.corr = TRUE, addCoef.col = "gray")

#corrplot(var$contrib, is.corr = FALSE, addCoef.col = "gray")


# Variable vectors.
fviz_pca_var(r,
             col.var = "contrib", # colour by contribution
             repel = TRUE)        # repel: keep text labels from overlapping

# Biplot of records and variables, records coloured by contribution.
fviz_pca_biplot(r, col.ind = "contrib", repel = TRUE)

# Handwritten digit data: a `number` column followed by the image columns
# (presumably the first column is `number` -- confirm against the CSV).
library(DT)

d0 <- read.csv("https://stats.dip.jp/01_ds/data/hand_writing_numbers0-9.csv")
datatable(data = d0)

# Keep the labels separately and drop the first column from the data that
# goes into the analysis.
number <- d0$number
d <- d0[, -1]

# Draw rows i.fr..i.to of `img` as greyscale images on a 5 x 5 grid.
#
# Args:
#   img:  data frame with one image per row. Each row is reshaped row by
#         row into a matrix with DY (8) rows; presumably it holds 64
#         intensities between 0 and BIT (16) -- rasterImage() needs the
#         scaled values to lie within [0, 1].
#   i.fr: index of the first row to draw.
#   i.to: index of the last row to draw.
#
# Returns: NULL, invisibly. The function is called for its plots.
#
# The graphical parameters changed here (mfrow, mar, cex.main) are put back
# when the function exits, so later base-graphics plots are not squeezed
# into the 5 x 5 layout.
draw.images <- function(img, i.fr, i.to)
{
  old_par <- par(mfrow = c(5, 5),
                 mar = c(0, 0, 1, 0) + 0.1,
                 cex.main = 0.9)
  on.exit(par(old_par), add = TRUE)

  DX <- 8   # image width in pixels
  DY <- 8   # image height in pixels
  BIT <- 16 # divisor that scales the stored intensities

  for (i in i.fr:i.to)
  {
    # Empty frame for one image; the title uses a zero-based figure number.
    plot(NA, type = "n", axes = FALSE,
         xlim = c(0, DX),
         ylim = c(0, DY),
         xlab = "",
         ylab = "",
         main = paste("Fig.", i - 1))

    # Reshape the row into a DY-row matrix, filled row by row.
    m <- matrix(unlist(img[i, ]) / BIT, nrow = DY, byrow = TRUE)

    rasterImage(m, 0, 0, DX, DY)
  }

  invisible(NULL)
}

# Draw the first 20 images.
draw.images(img = d, i.fr = 1, i.to = 20)

# PCA keeping only the first two principal components.
r <- prcomp(d, rank. = 2)

fviz_screeplot(r, addlabels = TRUE)

# Contribution of each variable to the 1st principal component (top 5).
fviz_contrib(r, choice = "var", axes = 1, top = 5)

# Contribution of each variable to the 2nd principal component (top 5).
fviz_contrib(r, choice = "var", axes = 2, top = 5)

# Variable vectors.
fviz_pca_var(r,
             col.var = "contrib", # colour by contribution
             repel = TRUE)        # repel: keep text labels from overlapping

# Individuals plot (the digits fall into rough groups).
# `label` selects which kinds of elements get text labels ("all", "none",
# "ind", ...); it is not a vector of per-point labels, so the documented
# value "none" is used. The digits are distinguished through `habillage`.
# NOTE(review): the original passed d0$number here, which matches none of
# the keywords -- confirm the plot is unchanged.
fviz_pca_ind(r,
             label = "none",
             habillage = number,
             addEllipses = TRUE,
             ellipse.level = 0.95)

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import matplotlib as mpl
import seaborn as sns
import numpy as np
import matplotlib.font_manager as fm

sns.set()

# Look for a Japanese-capable font among the system fonts by matching known
# substrings in the font file path; sorting makes the pick deterministic.
jpn_fonts = sorted(
    ttf for ttf in fm.findSystemFonts()
    if 'ipaexg' in ttf or 'msgothic' in ttf or 'japan' in ttf or 'ipafont' in ttf
)

if jpn_fonts:
    jpn_font = jpn_fonts[0]
    prop = fm.FontProperties(fname=jpn_font)
    print(jpn_font)

    ## C:\Windows\Fonts\msgothic.ttc

    # Set after sns.set() so the font choice is not overwritten by it.
    plt.rcParams['font.family'] = prop.get_name()
else:
    # Without this branch jpn_fonts[0] raised IndexError on machines that
    # have none of the fonts above; keep the default font instead.
    jpn_font = None
    prop = None
    print("No Japanese font found; axis labels may not render correctly.")


d0 = pd.read_csv("https://stats.dip.jp/01_ds/data/hand_writing_numbers0-9.csv")

# Every column except the label column "number".
d1 = d0.loc[:, d0.columns != "number"]

# PCA object configured to compute the first two principal components.
pca = PCA(n_components=2)

r = pca.fit_transform(d1)  # fit the PCA and get the component scores

d = pd.DataFrame(data=r, columns=['pc1', 'pc2'])  # scores as a data frame

# Scatter plot of the scores, coloured by the "number" column.
# plt.get_cmap is used because matplotlib.cm.get_cmap has been removed from
# recent Matplotlib releases.
plt.scatter(d["pc1"], d["pc2"],
            c=d0["number"],
            edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('Spectral', 10))

plt.xlabel('第１主成分')
plt.ylabel('第２主成分')

plt.colorbar()

## <matplotlib.colorbar.Colorbar object at 0x00000201F4F6BE90>

plt.show()

機械学習

演習課題

23150124

2024-09-24