機械学習

d <- read.csv(file = 'https://stats.dip.jp/01_ds/data/gender.csv')

library(DT)
datatable(d, caption = '性別データ')

d$性別 <- ifelse(d$性別 == '男性', 1, 0)



fit <- glm(性別 ~ 身長, data = d, family = 'binomial')
summary(fit)

## 
## Call:
## glm(formula = 性別 ~ 身長, family = "binomial", data = d)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -105.1286    18.8522  -5.576 2.45e-08 ***
## 身長           0.6092     0.1092   5.577 2.45e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 186.965  on 134  degrees of freedom
## Residual deviance:  41.452  on 133  degrees of freedom
## AIC: 45.452
## 
## Number of Fisher Scoring iterations: 7

#library(sjPlot)
#tab_model(fit, show.aic = T)



xp <- seq(150, 200, 1)
yp <- predict(fit, type = 'response', newdata = data.frame(身長 = xp))

matplot(x = d$身長, y = d$性別, pch = 1, ylim = c(0, 1.1), xlim = c(150, 200), main = 'ロジスティック回帰分析', xlab = '身長', ylab = '性別')
grid()

matlines(x = xp, y = yp, lty = 1, col = 2, lwd = 2)

library(latex2exp)
text(x = 175, y = 0.6, adj = 0, labels = TeX('$\\leftarrow \\hat{p}=\\frac{1}{1+\\exp\\{-(b_0 + b_1 x)\\}}$'))
legend('bottomright', pch = c(1, NA), lty = c(NA, 1), col = c(1, 2), legend = c('男性(1)，女性(0)', 'ロジスティック回帰モデル'))



library(plotly)

plot_ly(type = 'scatter') |>
  add_trace(x = d$身長, y = d$性別, mode = 'markers', name = '男性(1)，女性(0)') |>
  add_trace(x = xp, y = yp, mode = 'lines', name = 'ロジスティック回帰モデル') |>
  layout(title = 'ロジスティック回帰分析', xaxis = list(title = '身長', range = c(150, 200)), yaxis = list(title = '性別')) |>
  add_annotations(x = 172, y = 0.4, ax = 80, text = '$\\hat{p}=\\frac{1}{1+\\exp\\{-(b_0 + b_1 x)\\}}$') |>
  config(mathjax = 'cdn')

yp <- predict(fit, type = 'response', newdata = data.frame(身長 = 170))
sprintf('男性の確率：%2.1f％', yp * 100)

## [1] "男性の確率：17.2％"

if (yp > 0.5) {
  cat("推定結果：男性\n")
} else {
    cat("推定結果：女性\n")
  }

## 推定結果：女性

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

# データの読み込み
d = pd.read_csv('https://stats.dip.jp/01_ds/data/gender.csv')
x = d['身長'].to_numpy().reshape(-1, 1)
y = d['性別']

# ダミー変数に変換し、1次元配列に変換
y = pd.get_dummies(y)
y = y.iloc[:, 1]  # 男性の列を選択（0: 女性、1: 男性）

# ロジスティック回帰モデルの学習
model = LogisticRegression(random_state=1)
model.fit(x, y)

LogisticRegression(random_state=1)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# 予測
xp = np.linspace(150, 200, 500).reshape(-1, 1)  # より細かい範囲で予測
yp = model.predict_proba(xp)

# グラフの描画
plt.title('ロジスティック回帰分析', fontdict={'family': 'MS Gothic', 'fontsize': 16})
plt.xlabel('身長［cm］', fontdict={'family': 'MS Gothic', 'fontsize': 14})
plt.ylabel('男性（１），女性（０）', fontdict={'family': 'MS Gothic', 'fontsize': 14})

# 格子線（grid lines）
plt.grid(linestyle='--', color=(0.9, 0.9, 0.9, 0.25))

# データポイントと予測曲線のプロット
plt.plot(x, y, 'o', label='Data points')
plt.plot(xp, yp[:, 1], linestyle='solid', 
         label='$\hat{y}_i=\\frac{1}{1-\\exp(b_0 + b_1 x_i)}$')

# 凡例（はんれい）
plt.legend(loc='lower right')

plt.show()

xp1 = np.array([[170]])  # 身長170cmを2D配列として指定
yp1 = model.predict_proba(xp1)  # 確率の予測

# 男性である確率を取得し、パーセント形式で表示
male_prob = yp1[0, 1]  # 男性の確率
male_prob_percent = round(male_prob * 100, 1)  # パーセント形式に変換
print(f'男性の確率：{male_prob_percent}%')

## 男性の確率：17.5%

機械学習

演習課題

23150124

2024-09-17