機械学習

d <- read.csv(file = 'https://stats.dip.jp/01_ds/data/univ_exam_data.csv')

library(DT)
datatable(d, caption = '勉強時間と大学合否')

fit <- glm(合格 ~ 勉強時間, data = d, family = 'binomial')
summary(fit)

## 
## Call:
## glm(formula = 合格 ~ 勉強時間, family = "binomial", data = d)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -2.7611     1.2649  -2.183   0.0291 *
## 勉強時間      0.6635     0.2736   2.425   0.0153 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27.726  on 19  degrees of freedom
## Residual deviance: 17.662  on 18  degrees of freedom
## AIC: 21.662
## 
## Number of Fisher Scoring iterations: 5

xp <- seq(0, 10, 0.1) 
yp <- predict(fit, type = 'response',
              newdata = data.frame(勉強時間 = xp))

matplot(x = d$勉強時間, y = d$合格, pch = 1,
        ylim = c(0, 1.1), xlim = c(0, 10),
        main = 'ロジスティック回帰分析',
        xlab = '平均勉強時間',
        ylab = '合格確率')
grid()

matlines(x = xp, y = yp, lty = 1, col = 2, lwd = 2)

library(latex2exp)
text(x = 3.8, y = 0.4, adj = 0, 
     labels = TeX('$\\leftarrow \\hat{p}=\\frac{1}{1+\\exp\\{-(b_0 + b_1 x)\\}}$'))

legend('topleft', pch = c(1, NA), lty = c(NA, 1), col = c(1, 2),
       legend = c('合格(1)，不合格(0)', 'ロジスティック回帰モデル'))

library(plotly)

##  要求されたパッケージ ggplot2 をロード中です

## 
##  次のパッケージを付け加えます: 'plotly'

##  以下のオブジェクトは 'package:ggplot2' からマスクされています:
## 
##     last_plot

##  以下のオブジェクトは 'package:latex2exp' からマスクされています:
## 
##     TeX

##  以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     filter

##  以下のオブジェクトは 'package:graphics' からマスクされています:
## 
##     layout

plot_ly(type = 'scatter') |>
  add_trace(x = d$勉強時間, y = d$合格,
            mode = 'markers', name = '合格(1)，不合格(0)') |>
  add_trace(x = xp, y = yp, 
            mode = 'lines',   name = 'ロジスティック回帰モデル') |> 
  layout(title = 'ロジスティック回帰分析', 
         xaxis = list(title = '平均勉強時間'),
         yaxis = list(title = '合格確率')) |>
  add_annotations(x = 3.6, y = 0.4, ax = 150, 
                  text = '$\\hat{p}=\\frac{1}{1+\\exp\\{-(b_0 + b_1 x)\\}}$') |> 
  config(mathjax = 'cdn')

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

yp <- predict(fit, type = 'response', newdata = data.frame(勉強時間 = 5.5))
sprintf('合格確率：%2.1f％', yp * 100)

## [1] "合格確率：70.9％"

d <- read.csv(file = 'https://stats.dip.jp/01_ds/data/gender.csv')

library(DT)
datatable(d, caption = '性別データ')

library(ggplot2)
library(DT)

d <- read.csv(file = 'https://stats.dip.jp/01_ds/data/gender.csv')


datatable(d, caption = '性別データ')

d$性別 <- ifelse(d$性別 == '男性', 1, 0)


fit <- glm(性別 ~ 身長, data = d, family = binomial)


height_seq <- seq(min(d$身長), max(d$身長, na.rm = TRUE), length.out = 100)

predicted_probs <- predict(fit, newdata = data.frame(身長 = height_seq), type = 'response')

plot_data <- data.frame(身長 = height_seq, 男性の確率 = predicted_probs)

ggplot(plot_data, aes(x = 身長, y = 男性の確率)) +
  geom_line(color = 'blue', size = 1) +
  geom_point(data = d, aes(x = 身長, y = 性別), color = 'red', alpha = 0.5) +
  labs(title = '身長ごとの男性である確率',
       x = '身長 (cm)',
       y = '男性である確率') +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

height_to_predict <- data.frame(身長 = 170)
predicted_probability <- predict(fit, newdata = height_to_predict, type = 'response')


predicted_probability

##         1 
## 0.1721242

if (predicted_probability > 0.5) {
  gender_estimation <- "男性 (Male)"
} else {
  gender_estimation <- "女性 (Female)"
}

gender_estimation

## [1] "女性 (Female)"

機械学習

演習課題

23150005

2024-09-17