機械学習.knit

title:機械学習

output: 田中ツヨキ

date: “2023-12-08”

d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/titanic_data_jp.csv')
library(DT)
datatable(d0)

d0$生死 <- ifelse (d0$生死== '生存',1 ,0 )

summary(d0)

##    乗船番号              生死          客室等級             名前          
##  Length:714         Min.   :0.0000   Length:714         Length:714        
##  Class :character   1st Qu.:0.0000   Class :character   Class :character  
##  Mode  :character   Median :0.0000   Mode  :character   Mode  :character  
##                     Mean   :0.4062                                        
##                     3rd Qu.:1.0000                                        
##                     Max.   :1.0000                                        
##      性別                年齢        兄弟配偶者数        親子数      
##  Length:714         Min.   : 0.42   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.0000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.5126   Mean   :0.4314  
##                     3rd Qu.:38.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                     Max.   :80.00   Max.   :5.0000   Max.   :6.0000  
##       運賃           乗船地         
##  Min.   :  0.00   Length:714        
##  1st Qu.:  8.05   Class :character  
##  Median : 15.74   Mode  :character  
##  Mean   : 34.69                     
##  3rd Qu.: 33.38                     
##  Max.   :512.33

head(d0)

##   乗船番号 生死 客室等級      名前 性別 年齢 兄弟配偶者数 親子数    運賃 乗船地
## 1     No.1    0      3等    Braund 男性   22            1      0  7.2500      S
## 2     No.2    1      1等   Cumings 女性   38            1      0 71.2833      C
## 3     No.3    1      3等 Heikkinen 女性   26            0      0  7.9250      S
## 4     No.4    1      1等  Futrelle 女性   35            1      0 53.1000      S
## 5     No.5    0      3等     Allen 男性   35            0      0  8.0500      S
## 6     No.7    0      1等  McCarthy 男性   54            0      0 51.8625      S

tail(d0)

##     乗船番号 生死 客室等級     名前 性別 年齢 兄弟配偶者数 親子数   運賃 乗船地
## 709   No.885    0      3等 Sutehall 男性   25            0      0  7.050      S
## 710   No.886    0      3等     Rice 女性   39            0      5 29.125      Q
## 711   No.887    0      2等 Montvila 男性   27            0      0 13.000      S
## 712   No.888    1      1等   Graham 女性   19            0      0 30.000      S
## 713   No.890    1      1等     Behr 男性   26            0      0 30.000      C
## 714   No.891    0      3等   Dooley 男性   32            0      0  7.750      Q

(n <- nrow(d0))

## [1] 714

set.seed(5)
ii <- sample(x = 1:n, size = floor(0.8 * n))
d.tr <- d0[ ii, ] 
d.te <- d0[-ii, ] 
(n.tr <- nrow(d.tr))

## [1] 571

(n.te <- nrow(d.te))

## [1] 143

fit.all <- glm(生死~客室等級+  性別+ 年齢+ 兄弟配偶者数+ 親子数+ 運賃+ 乗船地, data = d.tr, family = 'binomial')
summary(fit.all)

## 
## Call:
## glm(formula = 生死 ~ 客室等級 + 性別 + 年齢 + 兄弟配偶者数 + 
##     親子数 + 運賃 + 乗船地, family = "binomial", data = d.tr)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6437  -0.6306  -0.3804   0.6662   2.4121  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   17.215715 605.555989   0.028   0.9773    
## 客室等級2等   -1.540699   0.381403  -4.040 5.36e-05 ***
## 客室等級3等   -2.721618   0.398577  -6.828 8.59e-12 ***
## 性別男性      -2.512823   0.245347 -10.242  < 2e-16 ***
## 年齢          -0.046152   0.009290  -4.968 6.76e-07 ***
## 兄弟配偶者数  -0.353650   0.138036  -2.562   0.0104 *  
## 親子数        -0.044062   0.139460  -0.316   0.7520    
## 運賃          -0.003207   0.003342  -0.960   0.3372    
## 乗船地C      -12.272960 605.555797  -0.020   0.9838    
## 乗船地Q      -13.069235 605.556026  -0.022   0.9828    
## 乗船地S      -12.731745 605.555772  -0.021   0.9832    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 774.32  on 570  degrees of freedom
## Residual deviance: 519.84  on 560  degrees of freedom
## AIC: 541.84
## 
## Number of Fisher Scoring iterations: 13

fit <- glm(生死 ~ 客室等級 + 性別 + 年齢 + 兄弟配偶者数, data = d.tr, family = 'binomial')
summary(fit)

## 
## Call:
## glm(formula = 生死 ~ 客室等級 + 性別 + 年齢 + 兄弟配偶者数, 
##     family = "binomial", data = d.tr)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7973  -0.6277  -0.3904   0.6724   2.3959  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   4.389219   0.504021   8.708  < 2e-16 ***
## 客室等級2等  -1.508696   0.318108  -4.743 2.11e-06 ***
## 客室等級3等  -2.651199   0.313919  -8.445  < 2e-16 ***
## 性別男性     -2.480583   0.235670 -10.526  < 2e-16 ***
## 年齢         -0.045982   0.009104  -5.051 4.40e-07 ***
## 兄弟配偶者数 -0.405085   0.129990  -3.116  0.00183 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 774.32  on 570  degrees of freedom
## Residual deviance: 523.85  on 565  degrees of freedom
## AIC: 535.85
## 
## Number of Fisher Scoring iterations: 5

fit <- glm(生死 ~ 客室等級 + 性別 + 年齢, data = d.tr, family = 'binomial')
summary(fit)

## 
## Call:
## glm(formula = 生死 ~ 客室等級 + 性別 + 年齢, family = "binomial", 
##     data = d.tr)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7089  -0.6917  -0.4177   0.7013   2.4047  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.716434   0.437531   8.494  < 2e-16 ***
## 客室等級2等 -1.360400   0.307460  -4.425 9.66e-06 ***
## 客室等級3等 -2.557113   0.307238  -8.323  < 2e-16 ***
## 性別男性    -2.347847   0.225497 -10.412  < 2e-16 ***
## 年齢        -0.036569   0.008345  -4.382 1.18e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 774.32  on 570  degrees of freedom
## Residual deviance: 534.59  on 566  degrees of freedom
## AIC: 544.59
## 
## Number of Fisher Scoring iterations: 4

d.new <- data.frame(客室等級 = '2等', 年齢 = 45, 性別 = '女性')
p.hat <- predict(fit, type = 'response', newdata = d.new)
sprintf('生存確率：%2.1f％', p.hat * 100)

## [1] "生存確率：67.0％"

p.hat <- predict(fit, type = 'response', newdata = d.te)

threshold <- 0.5 
is.pred <- p.hat > threshold
is.ref <- d.te$生死 == 1

table(予測値 = is.pred, 真値 = is.ref)

##        真値
## 予測値 FALSE TRUE
##   FALSE    77    9
##   TRUE     12   45

is.ok <- is.pred == is.ref
n.ok <- sum(is.ok)

sprintf('生存予測精度：%2.1f％', n.ok / nrow(d.te) * 100)

## [1] "生存予測精度：85.3％"

library(caret)

##  要求されたパッケージ ggplot2 をロード中です

##  要求されたパッケージ lattice をロード中です

confusionMatrix(data = as.factor(is.pred), 
                reference = as.factor(is.ref))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE    77    9
##      TRUE     12   45
##                                           
##                Accuracy : 0.8531          
##                  95% CI : (0.7843, 0.9067)
##     No Information Rate : 0.6224          
##     P-Value [Acc > NIR] : 1.037e-09       
##                                           
##                   Kappa : 0.691           
##                                           
##  Mcnemar's Test P-Value : 0.6625          
##                                           
##             Sensitivity : 0.8652          
##             Specificity : 0.8333          
##          Pos Pred Value : 0.8953          
##          Neg Pred Value : 0.7895          
##              Prevalence : 0.6224          
##          Detection Rate : 0.5385          
##    Detection Prevalence : 0.6014          
##       Balanced Accuracy : 0.8493          
##                                           
##        'Positive' Class : FALSE           
##

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
##  次のパッケージを付け加えます: 'pROC'

##  以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     cov, smooth, var

roc1 <- roc(response = d.te$生死, predict = p.hat,
            plot = T, print.auc = T, grid = T, ci = T, auc.polygon=T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

coords(roc1, 'best')

##   threshold specificity sensitivity
## 1 0.4505387   0.8539326   0.8518519

c <- coords(roc1) 

library(DT)
datatable(round(c, 3))

d <- read.csv(file = 'https://stats.dip.jp/01_ds/data/univ_exam_data.csv')

library(DT)
datatable(d, caption = '勉強時間と大学合否')

fit <- glm(合格 ~ 勉強時間, data = d, family = 'binomial')
#summary(fit)
library(sjPlot)
tab_model(fit, show.aic = T)

	合格
Predictors	Odds Ratios	CI	p
(Intercept)	0.06	0.00 – 0.51	0.029
勉強時間	1.94	1.25 – 3.86	0.015
Observations	20
R² Tjur	0.446
AIC	21.662

x <- d$勉強時間
y <- d$合格

matplot(x = x, y = y, pch = 1,
        ylim = c(0, 1.1), xlim = c(0, 10),
        main = 'ロジスティック回帰分析',
        xlab = '平均勉強時間',
        ylab = '合格(1)，不合格(0)')
grid()

xp <- seq(0, 10, 0.1)
yp <- predict(fit, type = 'response', newdata = data.frame(勉強時間 = xp))

matlines(x = xp, y = yp, lty = 1, col = 2, lwd = 2)

library(latex2exp)
text(x = 3.8, y = 0.4, adj = 0, 
     TeX('$\\leftarrow \\hat{p}=\\frac{1}{1+exp\\{-(b_0 + b_1 x)\\}}$'))
legend('topleft', pch = c(1, NA), lty = c(NA, 1), col = c(1, 2),legend = c('合格(1)，不合格(0)', 'ロジスティック回帰モデル'))

yp <- predict(fit, type = 'response', newdata = data.frame(勉強時間 = 5.5))
sprintf('合格確率：%2.1f％', yp * 100)

## [1] "合格確率：70.9％"

d <- read.csv(file = 'https://stats.dip.jp/01_ds/data/gender.csv')

library(DT)
datatable(d, caption = '性別データ')

d$性別 <- ifelse(d$性別 == '男性', 1, 0)

set.seed(1) # 乱数シード
n <- 100000 # データサイズ

# 正常値
rv <- runif(n = n, min = 0, max = 1) # 一様乱数

# 異常値１（負の値）
size <- 3   # ランダムサンプルサイズ
ii <- sample(n, size) # インデックスランダムサンプル
rv[ii] <- runif(n = size, min = -1, max = 0)

# 異常値２（とても大きな／小さな値）
rv[39] <- 22
rv[52] <- -21
rv[91] <- 11

# 小数点第２位に丸め行列（列数：100）に変換
m <- matrix(round(rv, 2), ncol = 100)

write.csv(as.data.frame(m), 
       file = 'data_cleansing2_dirty_data.csv', row.names = F, quote = F)

boxplot(m)

any(m < 0)

## [1] TRUE

any(abs(m) > 3*sd(m))

## [1] TRUE

jj <- 36:46
boxplot(m[, jj])

j <- 37
y <- m[, j]

barplot(height    = y,         # 棒グラフの高さ
        names.arg = 1:nrow(m)) # 横軸名

ii <- 140:231
y <- m[ii, j]

barplot(height    = y,                        # 棒グラフの高さ
        cex.names = 0.4,                      # 横軸ラベルの大きさ
        names.arg = paste(ii, '\n(', y, ')')) # 横軸名 \nで改行して値も（）内に表示

ii <- 194:206
y <- m[ii, j]

barplot(height    = y,                        # 棒グラフの高さ
        cex.names = 0.4,                      # 横軸ラベルの大きさ
        names.arg = paste(ii, '\n(', y, ')')) # 横軸名 \nで改行して値も表示

m[200, j]

## [1] -0.78

m[m < 0]

## [1] -21.00  -0.78  -0.99  -0.68

sigma <- sd(m) # 標準偏差

#m[abs(m) > 3*sigma] # 数が多いためコメントアウトした。

# 4シグマ（標準偏差の4倍）を超える値
m[abs(m) > 4*sigma]

## [1]  22 -21  11

m.a <- NULL # 結果格納用の空のオブジェクトを作成

for (i in 1:nrow(m))
{
  for (j in 1:ncol(m))
  {
    if ( m[i, j] < 0 | m[i, j] > 4*sigma)
    {
      cat('Row:', i, ', Col:', j, ', Value: ', m[i, j], fill = T) 
      m.a <- rbind(m.a, t(c(i, j, m[i, j]))) # rbind：行ベクトルを追加
    }
  }
}

## Row: 39 , Col: 1 , Value:  22
## Row: 52 , Col: 1 , Value:  -21
## Row: 91 , Col: 1 , Value:  11
## Row: 200 , Col: 37 , Value:  -0.78
## Row: 242 , Col: 100 , Value:  -0.68
## Row: 521 , Col: 59 , Value:  -0.99

colnames(m.a) <- c('Row', 'Col', 'Value')
m.a

##      Row Col  Value
## [1,]  39   1  22.00
## [2,]  52   1 -21.00
## [3,]  91   1  11.00
## [4,] 200  37  -0.78
## [5,] 242 100  -0.68
## [6,] 521  59  -0.99

d <- read.csv(file = 'https://stats.dip.jp/01_ds/data/data_cleansing2_dirty_data_exercise.csv')

library(DT)
datatable(d)

options(digits = 2) # 表示の有効桁数２桁に設定

d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/car_data_jp.csv')

summary(d0)

##    お客様番号       性別                年齢         年収           購入判断  
##  Min.   :   1   Length:1000        Min.   :18   Min.   : 15000   Min.   :0.0  
##  1st Qu.: 251   Class :character   1st Qu.:32   1st Qu.: 46375   1st Qu.:0.0  
##  Median : 500   Mode  :character   Median :40   Median : 72000   Median :0.0  
##  Mean   : 500                      Mean   :40   Mean   : 72689   Mean   :0.4  
##  3rd Qu.: 750                      3rd Qu.:48   3rd Qu.: 90000   3rd Qu.:1.0  
##  Max.   :1000                      Max.   :63   Max.   :152500   Max.   :1.0

head(d0)

##   お客様番号 性別 年齢   年収 購入判断
## 1        385 男性   35  20000        0
## 2        681 男性   40  43500        0
## 3        353 男性   49  74000        0
## 4        895 男性   40 107500        1
## 5        661 男性   25  79000        0
## 6        846 女性   47  33500        1

tail(d0)

##      お客様番号 性別 年齢   年収 購入判断
## 995         951 女性   53 104000        1
## 996         863 男性   38  59000        0
## 997         800 女性   47  23500        0
## 998         407 女性   28 138500        1
## 999         299 女性   48 134000        1
## 1000        687 女性   44  73500        0

(n <- nrow(d0)) # 全レコード数

## [1] 1000

# 乱数シード（乱数の種）：
# 無作為標本抽出でプログラム実行のたびに同じ標本が抽出される。
# 数字を変えると異なる組み合わせで毎回同じ標本が抽出される。
set.seed(5)

## インデックス番号1～n（1000）をサイズ900で無作為標本抽出
#ii <- sample(x = 1:n, size = 900)

# インデックス番号1～nを全体の80％のサイズで無作為標本抽出
ii <- sample(x = 1:n, size = floor(0.8 * n))

# 抽出されたインデックス番号行のレコードを訓練データ
# 抽出されなかったインデックス番号行のレコードをテストデータとする。
d.tr <- d0[ ii, ] # 訓練データ
d.te <- d0[-ii, ] # テストデータ

(n.tr <- nrow(d.tr)) # 訓練データのレコード数

## [1] 800

(n.te <- nrow(d.te)) # テストデータのレコード数

## [1] 200

fit.all <- glm(購入判断 ~ ., data = d.tr, family = 'binomial')

summary(fit.all)

## 
## Call:
## glm(formula = 購入判断 ~ ., family = "binomial", data = d.tr)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.931  -0.590  -0.138   0.475   2.446  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.22e+01   9.22e-01   -13.3   <2e-16 ***
## お客様番号   2.16e-04   3.60e-04     0.6     0.55    
## 性別男性     1.67e-01   2.08e-01     0.8     0.42    
## 年齢         2.19e-01   1.68e-02    13.0   <2e-16 ***
## 年収         3.38e-05   3.67e-06     9.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1076.00  on 799  degrees of freedom
## Residual deviance:  589.56  on 795  degrees of freedom
## AIC: 599.6
## 
## Number of Fisher Scoring iterations: 6

fit <- glm(購入判断 ~ 年齢 + 年収, data = d.tr, family = 'binomial')

summary(fit)

## 
## Call:
## glm(formula = 購入判断 ~ 年齢 + 年収, family = "binomial", 
##     data = d.tr)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.948  -0.595  -0.140   0.476   2.421  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.19e+01   8.63e-01  -13.85   <2e-16 ***
## 年齢         2.17e-01   1.66e-02   13.05   <2e-16 ***
## 年収         3.35e-05   3.65e-06    9.18   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1076.00  on 799  degrees of freedom
## Residual deviance:  590.54  on 797  degrees of freedom
## AIC: 596.5
## 
## Number of Fisher Scoring iterations: 6

d.new <- data.frame(年齢 = 45, 年収 = 80000)
p.hat <- predict(fit, type = 'response', newdata = d.new)

sprintf('新車購入確率：%2.1f％', p.hat * 100)

## [1] "新車購入確率：62.0％"

p.hat <- predict(fit, type = 'response', newdata = d.te)

threshold <- 0.5 # 閾値（しきいち），カットオフ値（cut-off）とも呼ばれる。

is.pred <- p.hat > threshold
is.ref <- d.te$購入判断 == 1

table(予測値 = is.pred, 真値 = is.ref)

##        真値
## 予測値 FALSE TRUE
##   FALSE   102   27
##   TRUE     15   56

is.ok <- is.pred == is.ref
n.ok <- sum(is.ok)

sprintf('新車購入予測精度：%2.1f％', n.ok / nrow(d.te) * 100)

## [1] "新車購入予測精度：79.0％"

library(caret)

confusionMatrix(data = as.factor(is.pred), 
                reference = as.factor(is.ref))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   102   27
##      TRUE     15   56
##                                         
##                Accuracy : 0.79          
##                  95% CI : (0.727, 0.844)
##     No Information Rate : 0.585         
##     P-Value [Acc > NIR] : 7.05e-10      
##                                         
##                   Kappa : 0.558         
##                                         
##  Mcnemar's Test P-Value : 0.0896        
##                                         
##             Sensitivity : 0.872         
##             Specificity : 0.675         
##          Pos Pred Value : 0.791         
##          Neg Pred Value : 0.789         
##              Prevalence : 0.585         
##          Detection Rate : 0.510         
##    Detection Prevalence : 0.645         
##       Balanced Accuracy : 0.773         
##                                         
##        'Positive' Class : FALSE         
##

library(pROC)
roc1 <- roc(response = d.te$購入判断, predict = p.hat,
            plot = T, print.auc = T, grid = T, ci = T, auc.polygon=T)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

coords(roc1, 'best') # 最適閾値 (optimal threshold)

##   threshold specificity sensitivity
## 1      0.27        0.77        0.89

c <- coords(roc1) # 閾値 (threshold)ごとの値 (sensitivity, specificity)

library(DT)
datatable(round(c, 3))

d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/titanic_data_jp.csv')
datatable(d0)

d0 <- read.csv(file = 'https://stats.dip.jp/01_ds/data/dirty_data1.csv')

summary(d0) # 要約統計値　NA's:1 （NAが1つ含まれている）

##     会社名               気温         湿度          風速     
##  Length:22          Min.   :12   Min.   :-50   Min.   :-1.0  
##  Class :character   1st Qu.:20   1st Qu.: 30   1st Qu.: 1.0  
##  Mode  :character   Median :32   Median : 50   Median : 2.0  
##                     Mean   :28   Mean   : 40   Mean   : 2.1  
##                     3rd Qu.:34   3rd Qu.: 51   3rd Qu.: 4.0  
##                     Max.   :50   Max.   : 60   Max.   : 5.0  
##                     NA's   :1                  NA's   :1

library(DT)
datatable(d0)

d1 <- na.omit(d0)

summary(d1) # 要約統計値　NAがなくなった。

##     会社名               気温         湿度          風速     
##  Length:20          Min.   :12   Min.   :-50   Min.   :-1.0  
##  Class :character   1st Qu.:18   1st Qu.: 28   1st Qu.: 0.8  
##  Mode  :character   Median :32   Median : 50   Median : 2.0  
##                     Mean   :28   Mean   : 39   Mean   : 2.1  
##                     3rd Qu.:34   3rd Qu.: 51   3rd Qu.: 4.0  
##                     Max.   :50   Max.   : 60   Max.   : 5.0

datatable(d1)

# 各気象変数の許容範囲に入っているか否かを示す論理式を作成
is.tp <- -20 <= d1$気温 & d1$気温 <=  50
is.hm <-   0 <= d1$湿度 & d1$湿度 <= 100
is.ws <-   0 <= d1$風速 & d1$風速 <=  70

summary(is.tp)

##    Mode   FALSE    TRUE 
## logical       2      18

summary(is.hm)

##    Mode   FALSE    TRUE 
## logical       1      19

summary(is.ws)

##    Mode   FALSE    TRUE 
## logical       1      19

d2 <- d1[is.tp & is.hm & is.ws, ]

datatable(d2)

d3 <- d2
d3$気温 <- round(d2$気温, 1)
d3$湿度 <- round(d2$湿度, 0)
d3$風速 <- round(d2$風速, 0)

datatable(d3)

d3$会社名

##  [1] "ソニー"         "Sony"           "SONY"           "sony"          
##  [5] "ソニー株式会社" "Sony（株）"     "ソニー(株)"     "sony"          
##  [9] "富士通"         "Fujitsu"        "Fujitu"         "FUJITSU"       
## [13] "富士通株式会社" "富士通(株)"     "fujitsu"        "FUJITSU"

LAB.SONY <- c("ソニー", "Sony（株）", "ソニー(株)", "Sony", "SONY", "sony")

# 富士通株式会社を意味する表記
LAB.FUJITSU <- c("富士通", "富士通(株)", "Fujitsu", "FUJITSU", "fujitsu", "Fujitu", "fujitu")

# %in%の記号は右辺の集合に含まれるか否かを示す論理記号
is.sony    <- d3$会社名 %in% LAB.SONY
is.fujitsu <- d3$会社名 %in% LAB.FUJITSU

d4 <- d3

# 各メーカーの正式名称に統一
d4$会社名[is.sony]    <- 'ソニー株式会社'
d4$会社名[is.fujitsu] <- '富士通株式会社'

d4$会社名

##  [1] "ソニー株式会社" "ソニー株式会社" "ソニー株式会社" "ソニー株式会社"
##  [5] "ソニー株式会社" "ソニー株式会社" "ソニー株式会社" "ソニー株式会社"
##  [9] "富士通株式会社" "富士通株式会社" "富士通株式会社" "富士通株式会社"
## [13] "富士通株式会社" "富士通株式会社" "富士通株式会社" "富士通株式会社"

datatable(d4)

d <- read.csv(file = 'https://stats.dip.jp/01_ds/data/dirty_data2.csv')

datatable(d)

y <- c(AirPassengers) # 月間航空旅客数
n <- length(y)        # 月数
n.tr <- 120           # 訓練月数
n.te <- n - n.tr      # テスト月数
t <- 1:n              # 月
ii.tr <- 1:120        # 訓練期間インデックス
ii.te <- 121:n        # テスト期間インデックス
matplot(y, type = 'o', pch = 1, col = 2)

fq <- 12

library(MASS)
bc <- boxcox(y ~ t, data = data.frame(y = y[ii.tr], t = t[ii.tr]))

lambda <- round(bc$x[which.max(bc$y)], 1)
lambda

## [1] 0

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

y.bc <- BoxCox(y[ii.tr], lambda)

matplot(y.bc, type = 'o', pch = 1, col = 2, ylab = 'y')

d <- 1

dy <- diff(y.bc, diff = d)

matplot(dy, type = 'o', pch = 1, col = 2, ylab = 'y')

ys <- ts(10 * sin(2 * pi * t / 10) + rnorm(n))

dir.create('acf_lag', showWarnings = F)

for (k in 1:25)
{
  png(paste0('acf_lag/acf_lag', k, '.png'))

  ys.lag <- lag(ys, k = -k)
  ts2 <- cbind(ys, ys.lag)

  par(mfrow = c(3, 1), mar = c(2, 4, 1, 1))
  matplot(ts2, type = 'o', pch = 1, col = c(1, 4), 
          xlab = '', ylab = '', xlim = c(0, 120))
  abline(v = c(0, k), col = 2)
  arrows( 0, 0, k, 0, length = 0.1, col = 2)
  mtext(paste(k), side = 1, at = k, line = 1, col = 2)
  legend('topright', col = c(1, 4), bg = 'white', lty = 1,
        legend = c('原系列（周期10の正弦波＋ノイズ）', paste0(k, '次ラグ系列')))
  grid()
   acf(ys, lag.max = k, xlim = c(0, 120), ylim = c(-1, 1), main = '')
  pacf(ys, lag.max = k, xlim = c(0, 120), ylim = c(-1, 1), main = '')

  dev.off()
}
#system('bash -c 'convert -delay 100 -loop 0 ./acf_lag/acf_lag{1..25}.png ./acf_lag/acf_lag.gif'')

a <- acf(dy, lag.max = 50) # MA(q)
grid()
abline(v = fq, col = 3, lty = 3)

q <- 12

pa <- pacf(dy, lag.max = 50) # AR(p)【注意】acfと違いグラフが1次ラグから始まる。
grid()
abline(v = fq, col = 3, lty = 3)

p <- 12

fit.arima <- Arima(ts(y[ii.tr], frequency = fq), lambda = 'auto',
                   order    = c(p, d, q),
                   seasonal = c(1, 1, 0)) # SARIMA

# c.f.
#fit.arima <- auto.arima(ts(y[ii.tr], frequency = fq), lambda = 'auto')
fit.arima

## Series: ts(y[ii.tr], frequency = fq) 
## ARIMA(12,1,12)(1,1,0)[12] 
## Box Cox transformation: lambda= -0.31 
## 
## Coefficients:
##         ar1    ar2     ar3    ar4     ar5    ar6   ar7     ar8   ar9   ar10
##       -0.25  0.094  -0.054  0.057  -0.059  -0.03  0.11  -0.003  0.14  0.073
## s.e.   0.14  0.144   0.137  0.135   0.133   0.13  0.14   0.133  0.13  0.131
##        ar11   ar12     ma1    ma2     ma3    ma4   ma5   ma6    ma7     ma8
##       -0.26  -0.28  -0.074  -0.14  -0.096  -0.22  0.19  0.21  -0.18  -0.037
## s.e.   0.13   0.18   0.147   0.14   0.126   0.15  0.15  0.15   0.16   0.139
##         ma9  ma10  ma11   ma12  sar1
##       -0.14  0.01  0.36  -0.63  0.12
## s.e.   0.14  0.15  0.15   0.15  0.23
## 
## sigma^2 = 5.45e-05:  log likelihood = 381
## AIC=-711   AICc=-693   BIC=-641

checkresiduals(fit.arima) # 残差分析グラフ

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(12,1,12)(1,1,0)[12]
## Q* = 6, df = 3, p-value = 0.1
## 
## Model df: 25.   Total lags used: 28

pred <- forecast(fit.arima, h = n.te, level = 95)

matplot(t, y, type = 'o', pch = 16, col = 2, main = pred$method)
grid()
abline(v = n.tr + 1, lty  = 3, col = 3)
matlines(t[ii.tr], fit.arima$fitted, type = 'o', pch = 16, lty = 2, col = 4)
matlines(t[ii.te], pred$mean,        type = 'o', pch =  1, lty = 2, col = 4)
matlines(t[ii.te], pred$lower, lty = 3, col = 'gray')
matlines(t[ii.te], pred$upper, lty = 3, col = 'gray')
legend('topleft', bg = 'white', lty = c(1, 2, 2, 3),
       col = c(2, 4, 4, 'gray'), pch = c(16, 16, 1, NA),
       legend = c('原系列', '1期先予測値', '予測値', '95%予測区間'))

get.accuracy <- function(yhat, y)
{
  data.frame(MBE  = mean(yhat - y),
             MAE  = mean(abs(yhat - y)),
             MAPE = mean(abs((yhat - y) / y)) * 100,
             RMSE = sqrt(mean((yhat - y)^2))
  )
}

get.accuracy(yhat = pred$mean, y = y[ii.te])

##   MBE MAE MAPE RMSE
## 1 -14  15  3.4   17

n <- 24*7*2            # 時間数
t <- 1:n               # 時間
n.tr <- floor(n * 0.8) # 訓練期間数
n.te <- n - n.tr       # テスト期間数
ii.tr <- 1:n.tr        # 訓練期間インデックス
ii.te <- (n.tr+1):n    # テスト期間インデックス

set.seed(5)

y <- 100 + 0.1 * t + 0.02 * t^2 + 100 * sin(2 * pi * t / 24) + rnorm(n, sd = 5)

matplot(y, type = 'o', pch = 1, col = 2)

options(digits = 2)

year  <- 2011:2017
sales <- c(1, 3, 2, 7, 3, 8, 7)
n <- length(year)
d <- data.frame(year, sales)

library(kableExtra)
kable(d,caption = 'ロボット販売', col.names = c('会計年', '台数'))%>%
  kable_classic('striped', full_width = F)

ロボット販売
会計年	台数
2011	1
2012	3
2013	2
2014	7
2015	3
2016	8
2017	7

matplot(x = year, y = sales, type = 'o', pch = 1,
        main = 'ロボット販売',
        xlab = '会計年',
        ylab = '台数')
grid()

d$ma3 <- NA # 値がNAの移動平均のカラム（ma3）を追加

for (i in 2:(n-1))
{
  d$ma3[i] <- (sales[i-1] + sales[i] + sales[i+1]) / 3
}

ma3 <- stats::filter(sales, rep(1/3, 3))
ma3

## Time Series:
## Start = 1 
## End = 7 
## Frequency = 1 
## [1] NA  2  4  4  6  6 NA

library(quantmod)

##  要求されたパッケージ xts をロード中です

##  要求されたパッケージ zoo をロード中です

## 
##  次のパッケージを付け加えます: 'zoo'

##  以下のオブジェクトは 'package:base' からマスクされています:
## 
##     as.Date, as.Date.numeric

##  要求されたパッケージ TTR をロード中です

ma3 <- rollmean(sales, 3, na.pad = T)
ma3

## [1] NA  2  4  4  6  6 NA

d$ma4 <- NA # 値がNAの移動平均のカラム（ma4）を追加

for (i in 3:(n-2)) 
{
  d$ma4[i] <-
    (0.5 * sales[i-2] + sales[i-1] + sales[i] + sales[i+1] + 0.5 * sales[i+2]) / 4
}

 ma4 <- stats::filter(sales, c(0.5/4, 1/4, 1/4, 1/4, 0.5/4))
  ma4

## Time Series:
## Start = 1 
## End = 7 
## Frequency = 1 
## [1]  NA  NA 3.5 4.4 5.6  NA  NA

ma4 <- rollmean(sales, k = 4, na.pad = T)
  ma4

## [1]  NA 3.2 3.8 5.0 6.2  NA  NA

kable(d, caption = 'ロボット販売',
      col.names = c('会計年', '台数', '3項移動平均', '4項移動平均'))%>%
kable_classic('striped', full_width = F)

ロボット販売
会計年	台数	3項移動平均	4項移動平均
2011	1	NA	NA
2012	3	2	NA
2013	2	4	3.5
2014	7	4	4.4
2015	3	6	5.6
2016	8	6	NA
2017	7	NA	NA

matplot(x = year, y = sales, type = 'o', pch = 1,
        main = 'ロボット販売', xlab = '会計年', ylab = '台数')
grid()

matlines(x = year, y = d$ma3, type = 'o', pch = 2, col = 2)
matlines(x = year, y = d$ma4, type = 'o', pch = 4, col = 4)

legend('topleft', col = c(1, 2, 4), pch = c(1, 2, 4), lty = 1,
       legend = c('台数', '3項移動平均', '4項移動平均'))

library(quantmod)

z <- getSymbols('9984', src='yahooj',
                        from = '2022-01-01',
                        to   = '2022-12-31',
                        auto.assign = F)

# データの保存
write.zoo(z, file = 'softbank.csv', sep = ',', quote = F)

x <- as.POSIXct(index(z), format = '%Y-%m-%d')
y <- z[, 'YJ9984.Close']

# 1日分データをずらすことで当日の日付の株価が１日前のものになる。
ylag1 <- lag(y, k = 1)

# align = 'right'で右寄せ移動平均となる。
yhat5   <- rollmean(ylag1,  5, fill = NA, align = 'right')
yhat25  <- rollmean(ylag1, 25, fill = NA, align = 'right')
yhat25c <- rollmean(y, 25, fill = NA) # 参考（通常の移動平均：中心化移動平均）

matplot(x, y, type = 'l', col = 3,
main = 'ソフトバンク（9984）', xlab = '日', ylab = '円')
matlines(x, yhat5,   col = 2)
matlines(x, yhat25,  col = 4)
matlines(x, yhat25c, col = 1)
grid()
legend('topleft', col = c(3, 2, 4, 1), lty = 1,
       legend = c('株価',
                  '5日移動平均',
                  '25日移動平均',
                  '25日中心化移動平均（参考）'))

chartSeries(z, TA = c(addBBands(), addMACD()), type='bar',
            subset='2022-06-01/2022-12-31')

x <- seq_along(y)
fit10 <- loess(y ~ x, data = data.frame(x, y), span = 0.1) # 10%
fit30 <- loess(y ~ x, data = data.frame(x, y), span = 0.3) # 30%

matplot(x, y = y, type = 'l', col = 3,
main = 'ソフトバンク（9984）', xlab = '日', ylab = '円')
matlines(x, fit10$fitted, col = 1, lwd = 2)
matlines(x, fit30$fitted, col = 2, lwd = 2)
grid()
legend('topleft', col = c(3, 1, 2), lty = 1,
       legend = c('株価',
                  'LOESS（平滑化窓幅10%）',
                  'LOESS（平滑化窓幅30%）'))

n <- 100                         # 全時間
x <- 1:n                         # 時刻
t <- 2 * x                       # トレンド（係数：2）
e <- rnorm(n, mean = 0, sd = 10) # 不規則変動 
y <- t + e                       # 合成変動 

matplot(x, y, type = 'o', pch = 16, col = 2)

n.tr <- n * 0.8   # 訓練時間（80%）
n.te <- n - n.tr  # テスト時間（20%）

ii.tr <- 1:n.tr 
ii.te <- (n.tr + 1):n

fit <- lm(y ~ x, data = data.frame(x = x[ii.tr], y = y[ii.tr]))
#summary(fit)
pred <- predict(fit, newdata = data.frame(x = x[ii.te]), interval = 'prediction')

myplot <- function(x, y, pred)
{
  matplot(x, y, type = 'o', pch = 16, col = 2, cex = 0.5)
  abline(v = n.tr+1, col = 3, lty = 2)
  matlines(x[ii.te], pred[, 'fit'], type = 'o', pch = 1, lty = 2, col = 4)
  matlines(x[ii.te], pred[, 'lwr'], lty = 3, col = 'lightgray')
  matlines(x[ii.te], pred[, 'upr'], lty = 3, col = 'lightgray')
  legend('topleft', bg = 0,
        lty = c(1, 2, 3), pch = c(16, 1, NA), col = c(2, 4, 'lightgray'),
        legend = c('原系列', '予測値', '95%予測区間'))
}

myplot(x, y, pred)

t <- 0.05 * x^2 - 2 * x          # 2次のトレンド
e <- rnorm(n, mean = 0, sd = 10) # 不規則変動 
y <- t + e                       # 合成変動 

fit <- lm(y ~ poly(x, 2, raw = T), data = data.frame(x = x[ii.tr], y = y[ii.tr]))
#summary(fit)
pred <- predict(fit, newdata = data.frame(x = x[ii.te]), interval = 'prediction')

myplot(x, y, pred)

T <- 10                          # 周期
c <- 30 * sin(2 * pi * x / T)    # 周期変動（振幅：30，周期：10）
t <- 2 * x                       # トレンド（係数：2）
e <- rnorm(n, mean = 0, sd = 10) # 不規則変動 
y <- t + c + e                   # 合成変動 

fit <- lm(y ~  poly(x, 2, raw = T), data = data.frame(x = x[ii.tr], y = y[ii.tr]))
#summary(fit)
pred <- predict(fit, newdata = data.frame(x = x[ii.te]), interval = 'prediction')

myplot(x, y, pred)

dir.create('./ts_reg_periodic', showWarnings = F)

yhat <- rep(NA, n)

for (j in 1:T)
{
  # j <- 1
  ii <- seq(j, n.tr, T)
  ii.pred <- ii[length(ii)] + seq(T, T*2, T)

  fit <- lm(y ~  x, data = data.frame(x = x[ii], y = y[ii]))
  pred <- predict(fit, newdata = data.frame(x = x[ii.pred]), interval = 'prediction')
  yhat[ii.pred] <- pred[, 'fit']

  png(paste0('./ts_reg_periodic/ts_reg_periodic', j, '.png'))
  matplot(x, y, type = 'o', pch = 16, col = 2, cex = 0.5, ylim = c(0, 250))
  abline(v = n.tr+1, col = 3, lty = 2)
  matlines(x[ii], y[ii], type = 'o', pch = 1, col = 1, cex = 1.5)
  matlines(x, yhat, type = 'o', pch = 1, col = 4)
  matpoints(x[ii.pred], yhat[ii.pred], pch = 1, col = 1)
  dev.off()
  #Sys.sleep(2)
}

#system('bash -c "convert -delay 100 -loop 0 ./ts_reg_periodic/ts_reg_periodic{1..10}.png ./ts_reg_periodic/ts_reg_periodic.gif"')

d0 <- read.csv('https://stats.dip.jp/01_ds/data/load_amedas_y2017-2022_tokyo.csv')

head(d0)

##         date            datetime dow       name    event    md gr year month
## 1 2017-01-01 2017-01-01 00:00:00  日 祝日：元日 年末年始 01-01  6 2017     1
## 2 2017-01-01 2017-01-01 01:00:00  日 祝日：元日 年末年始 01-01  6 2017     1
## 3 2017-01-01 2017-01-01 02:00:00  日 祝日：元日 年末年始 01-01  6 2017     1
## 4 2017-01-01 2017-01-01 03:00:00  日 祝日：元日 年末年始 01-01  6 2017     1
## 5 2017-01-01 2017-01-01 04:00:00  日 祝日：元日 年末年始 01-01  6 2017     1
## 6 2017-01-01 2017-01-01 05:00:00  日 祝日：元日 年末年始 01-01  6 2017     1
##   day su mo tu we th fr sa wd ho bw af sp ab    mw     time hr mi hm rm  ws  tp
## 1   1  1  0  0  0  0  0  0  0  1  0  0  1  0 27830 00:00:00  0  0 77  0 2.3 4.9
## 2   1  1  0  0  0  0  0  0  0  1  0  0  1  0 26340 01:00:00  1  0 71  0 2.5 4.5
## 3   1  1  0  0  0  0  0  0  0  1  0  0  1  0 25200 02:00:00  2  0 69  0 1.5 4.1
## 4   1  1  0  0  0  0  0  0  0  1  0  0  1  0 24380 03:00:00  3  0 71  0 1.4 3.9
## 5   1  1  0  0  0  0  0  0  0  1  0  0  1  0 23890 04:00:00  4  0 75  0 2.0 3.2
## 6   1  1  0  0  0  0  0  0  0  1  0  0  1  0 23940 05:00:00  5  0 74  0 2.2 3.1
##   tp3h tp5h tp3d tp7d
## 1  5.3  5.5  5.5  6.7
## 2  5.0  5.2  5.5  6.7
## 3  4.5  4.9  5.5  6.7
## 4  4.2  4.6  5.6  6.7
## 5  3.7  4.1  5.6  6.7
## 6  3.4  3.8  5.6  6.7

d0$px <- as.POSIXct(d0$datetime)
d0$gw <- d0$mw / 1000 # 測定値 (単位変換：MW -> GW)
d0$gwfit <- NA
d0$gwlwr <- NA
d0$gwupr <- NA

# Suffixとしてtr: training term, te: test term, al: all term を使用する。

# 訓練データの取得期間フラグ（ここでは予測日の前後１ヶ月間をとった）
is.67 <- '06-01' <= d0$md & d0$md <= '07-31'

# 訓練期間フラグ
is.tr <- '2017-06-01' <= d0$date & d0$date <= '2022-06-30'

# テスト期間フラグ
is.te <- '2022-07-01' <= d0$date & d0$date <= '2022-07-03'

# グラフ表示期間フラグ
is.plot <- '2022-06-28' <= d0$date & d0$date <= '2022-07-03'
d.plot <- d0[is.plot, ] # グラフ表用データ

matplot(d0$px, d0$gw, 
        main = '電力需要',
        xlab = '年', ylab = '電力需要［GW］',
        type = 'l', col = 2, ylim = c(20, 70))

for (hr in 0:23)
{
# 予測時間
#hr <- 10
is.hr <- d0$hr == hr

# 訓練データ
d.tr <- d0[is.hr & is.67 & is.tr, ]

# テストデータ（予測区間の正解データとして精度評価に利用）
d.te <- d0[is.hr & is.te, ]

# フィッティング
# （ここでは，トレンドがなく気温の2次多項式と曜日ダミーのモデルとした）
fit <- lm(gw ~ tp + I(tp^2) + dow, data = d.tr)
summary(fit)

# 予測（予測区間付き）
pred <- predict(fit, newdata = d.te, interval = 'prediction')

# 毎時の予測値を保存
d0$gwfit[d0$px %in% d.te$px] <- pred[, 'fit']
d0$gwlwr[d0$px %in% d.te$px] <- pred[, 'lwr']
d0$gwupr[d0$px %in% d.te$px] <- pred[, 'upr']

# 時間別グラフ
dir.create('./load', showWarnings = F)
png(paste0('./load/load_at', hr, '.png'))

matplot(d0$px[is.hr & is.plot], d0$gw[is.hr & is.plot], 
        main = paste0(hr, '時の電力需要'),
        xlab = format(d0$px[1], '%Y年'), ylab = '電力需要［GW］',
        type = 'n', pch = 16, col = 2, xaxt = 'n', 
        xlim = range(d0$px[is.plot]), ylim = c(20, 70))

px.ax <- seq(d0$px[1], d0$px[nrow(d0)], by = 60*60*24)
abline(v = px.ax, col = 'lightgray', lty = 2)
abline(v = as.POSIXct('2022-07-01'), col = 3, lty = 2) # 予測開始線
axis(side = 1, at = px.ax, labels = format(px.ax, '%-m/%-d(%a)')) 

matlines(d0$px[is.plot], d0$gw[is.plot], type = 'o', pch = 1, col = 'lightgray') 
matlines(d.te$px, pred[, 'fit'], type = 'o', pch = 1, lty = 2, col = 4)
matlines(d.te$px, pred[, 'lwr'], lty = 3, col = 'lightgray')
matlines(d.te$px, pred[, 'upr'], lty = 3, col = 'lightgray')
matlines(d0$px[is.hr & is.plot], d0$gw[is.hr & is.plot], type = 'o', pch = 16, col = 2) 

# 凡例
legend('topleft', bg = 0,
       lty = c(1, 2, 3), pch = c(16, 1, NA), col = c(2, 4, 'lightgray'),
       legend = c('原系列', '予測値', '95%予測区間'))
legend('topright', bg = 0,
       legend = sprintf('MAPE: %2.1f%%', 
                        mean(abs((pred[, 'fit'] - d.te$gw)/d.te$gw))*100))
dev.off()
}

#system('bash -c "convert -delay 100 -loop 0 ./load/load_at{0..23}.png ./load/load_at.gif"')

#cairo_pdf('load_0-23.pdf')

matplot(d0$px[is.plot], d0$gw[is.plot], 
        main = '電力需要',
        xlab = format(d0$px[1], '%Y年'), ylab = '電力需要［GW］',
        type = 'o', pch = 16, col = 2, xaxt = 'n',
        xlim = range(d0$px[is.plot]), ylim = c(20, 70))

px.ax <- seq(d0$px[1], d0$px[nrow(d0)], by = 60*60*24)
axis(side = 1, at = px.ax, labels = format(px.ax, '%-m/%-d(%a)')) 
abline(v = px.ax, col = 'lightgray', lty = 2)
abline(v = as.POSIXct('2022-07-01'), col = 3, lty = 2) # 予測開始線

matlines(d0$px[is.te], d0$gwlwr[is.te], lty = 3, col = 'lightgray')
matlines(d0$px[is.te], d0$gwupr[is.te], lty = 3, col = 'lightgray')
matlines(d0$px[is.te], d0$gwfit[is.te], type = 'o', pch = 1, lty = 2, col = 4)

# 凡例
legend('topleft', bg = 0,
       lty = c(1, 2, 3), pch = c(16, 1, NA), col = c(2, 4, 'lightgray'),
       legend = c('原系列', '予測値', '95%予測区間'))
legend('topright', bg = 0,
       legend = sprintf('MAPE: %2.1f%%', 
                        mean(abs((d0$gwfit[is.te] - d0$gw[is.te])/d0$gw[is.te]))*100))

d0 <- read.csv('https://stats.dip.jp/01_ds/data/e-stat_expenditure.csv', skip = 11)

# タイムスタンプと支出金額のカラムを探しデータを抽出
j.time <- grep('時間軸.月次..コード', colnames(d0))
j.ice  <- grep('アイスクリーム',      colnames(d0))

d1 <- d0[, c(j.time, j.ice)]

# 時間コード（10桁）からPOSIX準拠時間を作成
yyyy <- substr(paste(d1[, 1]), 1, 4)
mm   <- substr(paste(d1[, 1]), 7, 8)
px <- as.POSIXct(paste(yyyy, mm, '01 12:00:00'), format = '%Y%m%d %H:%M:%S')

d <- data.frame(px, date = format(px, '%Y-%m-%d'), y = d1[, 2])

# 暦，気象とマージ
d <- merge(d, read.csv('https://stats.dip.jp/01_ds/data/calendar.csv'))
d <- merge(d, read.csv('https://stats.dip.jp/01_ds/data/amedas_tokyo.csv')) 
#View(d)

# グラフ
MAIN <- 'アイスクリーム月間世帯支出'
matplot(d$px, d$y, type = 'o', pch = 16, lty =1, col = 2,
        main = MAIN, xlab = '年', ylab = '円')

```