nankai <- data.frame(month = 1:12, frequency=c(0,2,0,0,0,0,0,2,1,1,1,5))
barplot(height = nankai$frequency,
names.arg = nankai$month, xlab = "month", ylab = "frequency")
chisq.test(nankai$frequency,p=rep(1/12,length=12))
## Warning in chisq.test(nankai$frequency, p = rep(1/12, length = 12)): Chi-squared
## approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: nankai$frequency
## X-squared = 24, df = 11, p-value = 0.01273
\[結論 帰無仮説H_0:各々の月に地震が起きる確率が12分の1は棄却される。\]
# accident_count 事故回数
# year_count 該当する年の数
df <- data.frame(accident_count = 0:8, year_count=c(1,6,6,8,5,7,0,1,0))
m <- sum((0:8)*df$year_count)/sum(df$year_count)
print(paste("最尤推定量λ:",m))
## [1] "最尤推定量λ: 3.05882352941176"
\[事故回数を8回以上と解釈した場合(自由度8)の検定\]
poisprob<-dpois(0:7,lambda=m)
theory<-c(poisprob,1-sum(poisprob))
chisq.test(df$year_count,p=theory)
## Warning in chisq.test(df$year_count, p = theory): Chi-squared approximation may
## be incorrect
##
## Chi-squared test for given probabilities
##
## data: df$year_count
## X-squared = 6.5411, df = 8, p-value = 0.5869
\[9回以上の実現値を0と解釈した場合(自由度9)の検定\]
poisprob2<-dpois(0:8,lambda=m)
theory2<-c(poisprob2,1-sum(poisprob2))
years2<-c(df$year_count,0)
chisq.test(years2,p=theory2)
## Warning in chisq.test(years2, p = theory2): Chi-squared approximation may be
## incorrect
##
## Chi-squared test for given probabilities
##
## data: years2
## X-squared = 6.5411, df = 9, p-value = 0.6848
\[χ二乗統計量の分布\]
N <- 1000
p <- c(4,3,2,1)/10
x <- rmultinom(N, size=10, prob=p)
expmatrix <- rep(N*p,N)
z <- x - expmatrix
chisqval <- apply(z*z/expmatrix, 2, sum)
hist(chisqval)
library(MASS)
## Warning: パッケージ 'MASS' はバージョン 4.1.3 の R の下で造られました
SmokeEx <- table(survey$Smoke, survey$Exer)
chisq.test(SmokeEx)
## Warning in chisq.test(SmokeEx): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: SmokeEx
## X-squared = 5.4885, df = 6, p-value = 0.4828
sb<-matrix(c(1085,703,55623,441239),2,2)
chisq.test(sb,correct=FALSE)
##
## Pearson's Chi-squared test
##
## data: sb
## X-squared = 4328.9, df = 1, p-value < 2.2e-16
# correct=TRUE
chisq.test(sb)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: sb
## X-squared = 4324, df = 1, p-value < 2.2e-16
prop.test(c(8,13),c(20,20))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(8, 13) out of c(20, 20)
## X-squared = 1.604, df = 1, p-value = 0.2053
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.59965664 0.09965664
## sample estimates:
## prop 1 prop 2
## 0.40 0.65
bluepen<-matrix(c(8,12,13,7),2,2)
chisq.test(bluepen)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: bluepen
## X-squared = 1.604, df = 1, p-value = 0.2053
fisher.test(bluepen)
##
## Fisher's Exact Test for Count Data
##
## data: bluepen
## p-value = 0.2049
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.08194956 1.53121239
## sample estimates:
## odds ratio
## 0.3687025
mrecall<-matrix(c(14,17,6,2),2,2)
chisq.test(mrecall)
## Warning in chisq.test(mrecall): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mrecall
## X-squared = 1.2292, df = 1, p-value = 0.2676
chisq.test(mrecall,correct=FALSE) # イエーツの補正なし
## Warning in chisq.test(mrecall, correct = FALSE): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: mrecall
## X-squared = 2.2662, df = 1, p-value = 0.1322
fisher.test(mrecall)
##
## Fisher's Exact Test for Count Data
##
## data: mrecall
## p-value = 0.2351
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.02431935 1.91518631
## sample estimates:
## odds ratio
## 0.2835388
メモ
MF <- matrix(c(482, 532, 303, 390, 30, 38, 29, 62), 4, 2,
dimnames = list(c("機械", "電気", "電子", "環境"), c("男性", "女性"))
)
res <- chisq.test(MF)
res$stdres
## 男性 女性
## 機械 2.5322605 -2.5322605
## 電気 1.9026177 -1.9026177
## 電子 -0.1540644 0.1540644
## 環境 -4.5452585 4.5452585
# 正規分布の上側累積確率を計算
p.value.matrix <- pnorm(abs(res$stdres),lower.tail=FALSE)*2
round(p.value.matrix, 4)*100
## 男性 女性
## 機械 1.13 1.13
## 電気 5.71 5.71
## 電子 87.76 87.76
## 環境 0.00 0.00
# speed(時速)、dist(停止までに走行する距離、ft)
res<-lm(formula = dist~speed, data=cars)
plot(cars)
abline(res)
summary(res)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
summary(res)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
# Call:
# lm(formula = dist ~ speed, data = cars)
#
# Residuals:
# Min 1Q Median 3Q Max
# -29.069 -9.525 -2.272 9.215 43.201
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -17.5791 6.7584 -2.601 0.0123 *
# speed 3.9324 0.4155 9.464 1.49e-12 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 15.38 on 48 degrees of freedom
# Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
# F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
# -1とすることで切片を0とする回帰になる
res<-lm(formula = dist~speed - 1, data=cars)
plot(cars,xlim=c(0,25))
abline(res)
summary(res)
##
## Call:
## lm(formula = dist ~ speed - 1, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.183 -12.637 -5.455 4.590 50.181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## speed 2.9091 0.1414 20.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.26 on 49 degrees of freedom
## Multiple R-squared: 0.8963, Adjusted R-squared: 0.8942
## F-statistic: 423.5 on 1 and 49 DF, p-value: < 2.2e-16
メモ X⇒YとY⇒Xは等しくないし、フィッティングさせた直線の解釈も異なる。
# sampledata2.xlsxのTIMSS2011シートのV列
likemath <- read.csv("likemath.csv",header=FALSE)
scoremath <- read.csv("scoremath.csv",header=FALSE)
plot(likemath$V1,scoremath$V1)
res<-lm(scoremath$V1~likemath$V1)
abline(res)
summary(res) # y = 679.1611 -3.1604x
##
## Call:
## lm(formula = scoremath$V1 ~ likemath$V1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.609 -41.289 -7.856 29.476 172.032
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 679.1611 42.7252 15.896 < 2e-16 ***
## likemath$V1 -3.1604 0.6045 -5.228 4.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.62 on 43 degrees of freedom
## Multiple R-squared: 0.3887, Adjusted R-squared: 0.3744
## F-statistic: 27.34 on 1 and 43 DF, p-value: 4.775e-06
# 平均点が上がるほど数学が嫌いになる。と考え説明変数と独立変数を入れ替えた場合:
res2<-lm(likemath$V1~scoremath$V1)
summary(res2)
##
## Call:
## lm(formula = likemath$V1 ~ scoremath$V1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.777 -8.427 1.854 7.745 25.258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 125.87889 10.95082 11.495 1.07e-14 ***
## scoremath$V1 -0.12297 0.02352 -5.228 4.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.17 on 43 degrees of freedom
## Multiple R-squared: 0.3887, Adjusted R-squared: 0.3744
## F-statistic: 27.34 on 1 and 43 DF, p-value: 4.775e-06
# summaryより、
# x = 125.87889 - 0.12297y
# ⇔ y = (125.87889-X)/0.12297 = 1023.655 - 8.132065X
abline(a=1023.655,b=-8.132065,lty="dotted")
# ↑で引かれた直線は横方向の残差平方和を最小化する
メモ: - 決定係数: 説明力のみ - +汎用性の判断→AIC
res<-lm(formula = dist~I(speed^2), data=cars)
plot(cars)
lines(cars$speed,fitted(res))
summary(res)
##
## Call:
## lm(formula = dist ~ I(speed^2), data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.448 -9.211 -3.594 5.076 45.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.86005 4.08633 2.168 0.0351 *
## I(speed^2) 0.12897 0.01319 9.781 5.2e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.05 on 48 degrees of freedom
## Multiple R-squared: 0.6659, Adjusted R-squared: 0.6589
## F-statistic: 95.67 on 1 and 48 DF, p-value: 5.2e-13
# 2次式
res <- lm(formula = dist ~ speed + I(speed^2), data = cars)
plot(cars)
lines(cars$speed, fitted(res), lty = 2) # 線のスタイルを点線に設定
summary(res)
##
## Call:
## lm(formula = dist ~ speed + I(speed^2), data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.720 -9.184 -3.188 4.628 45.152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.47014 14.81716 0.167 0.868
## speed 0.91329 2.03422 0.449 0.656
## I(speed^2) 0.09996 0.06597 1.515 0.136
##
## Residual standard error: 15.18 on 47 degrees of freedom
## Multiple R-squared: 0.6673, Adjusted R-squared: 0.6532
## F-statistic: 47.14 on 2 and 47 DF, p-value: 5.852e-12
# 3次式
res <- lm(formula = dist ~ speed + I(speed^2) + I(speed^3), data = cars)
lines(cars$speed, fitted(res), col = "red") # 線のスタイルを点線に設定
summary(res)
##
## Call:
## lm(formula = dist ~ speed + I(speed^2) + I(speed^3), data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.670 -9.601 -2.231 7.075 44.691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -19.50505 28.40530 -0.687 0.496
## speed 6.80111 6.80113 1.000 0.323
## I(speed^2) -0.34966 0.49988 -0.699 0.488
## I(speed^3) 0.01025 0.01130 0.907 0.369
##
## Residual standard error: 15.2 on 46 degrees of freedom
## Multiple R-squared: 0.6732, Adjusted R-squared: 0.6519
## F-statistic: 31.58 on 3 and 46 DF, p-value: 3.074e-11
# 4次式
res <- lm(formula = dist ~ speed + I(speed^2) + I(speed^3) + I(speed^4), data = cars)
lines(cars$speed, fitted(res), col = "blue") # 線のスタイルを点線に設定
legend("topleft",legend = c("2次式", "3次式", "4次式"),
col = c("black", "red", "blue"), lty = c(1, 2)
)
summary(res)
##
## Call:
## lm(formula = dist ~ speed + I(speed^2) + I(speed^3) + I(speed^4),
## data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.701 -8.766 -2.861 7.158 42.186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.845412 60.849115 0.753 0.455
## speed -18.962244 22.296088 -0.850 0.400
## I(speed^2) 2.892190 2.719103 1.064 0.293
## I(speed^3) -0.151951 0.134225 -1.132 0.264
## I(speed^4) 0.002799 0.002308 1.213 0.232
##
## Residual standard error: 15.13 on 45 degrees of freedom
## Multiple R-squared: 0.6835, Adjusted R-squared: 0.6554
## F-statistic: 24.3 on 4 and 45 DF, p-value: 9.375e-11
res1 <- lm(formula = dist ~ speed, data = cars)
res2 <- lm(formula = dist ~ poly(speed, 2, raw = TRUE), data = cars)
res3 <- lm(formula = dist ~ poly(speed, 3, raw = TRUE), data = cars)
res4 <- lm(formula = dist ~ poly(speed, 4, raw = TRUE), data = cars)
summary(res1)$r.squared; summary(res1)$adj.r.squared
## [1] 0.6510794
## [1] 0.6438102
summary(res2)$r.squared; summary(res2)$adj.r.squared
## [1] 0.6673308
## [1] 0.6531747
summary(res3)$r.squared; summary(res3)$adj.r.squared
## [1] 0.6731808
## [1] 0.6518666
summary(res4)$r.squared; summary(res4)$adj.r.squared
## [1] 0.6835237
## [1] 0.6553925
次数をあげて推定パラメタを増やすと決定係数は改善できるがOverfittingに繋がる。
→どのように適切な推定パラメタを選ぶか?(モデル選択問題)
→AICを使う(AICが最小のものを選択する)
MLL: 最大対数尤度
モデルの当てはまりの良さ→AICを小さくする
パラメータ数Kの多さ→AICを大きくする
\[\begin{align*} AIC = -2MLL+2k \\ \end{align*}\]
AIC(res1,res2,res3,res4)
## df AIC
## res1 3 419.1569
## res2 4 418.7721
## res3 5 419.8850
## res4 6 420.2771
res02 <- lm(formula = dist ~ poly(speed, 2, raw = TRUE)-1, data = cars)
summary(res02)
##
## Call:
## lm(formula = dist ~ poly(speed, 2, raw = TRUE) - 1, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.836 -9.071 -3.152 4.570 44.986
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## poly(speed, 2, raw = TRUE)1 1.23903 0.55997 2.213 0.03171 *
## poly(speed, 2, raw = TRUE)2 0.09014 0.02939 3.067 0.00355 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.02 on 48 degrees of freedom
## Multiple R-squared: 0.9133, Adjusted R-squared: 0.9097
## F-statistic: 252.8 on 2 and 48 DF, p-value: < 2.2e-16
AIC(res02)
## [1] 416.8016
これは直線の方程式 \(y = mx + b\) です。
次のように表せると仮定 \[ y_j = α + βx_j + ε_j,\quad j=1,2,...,n\\ \] また、\(E[ε_j]=0,V[ε_j]=σ^2\)を満たすものとする。このようなモデルを線形単回帰モデルという。 定理7.
\[y=1+2sinx+3cos2x+ε,\quad ε \sim N(0,0.2)\]
x <- seq(0, 2 * pi, length = 100)
y <- 1 + 2 * sin(x) + 3 * cos(2 * x) + rnorm(length(x), 0, 0.2)
# データフレームを作成する
data <- data.frame(x = x, y = y)
plot(data$x, data$y)
res <- lm(y ~ I(sin(x)) + I(cos(2 * x)))
summary(res)
##
## Call:
## lm(formula = y ~ I(sin(x)) + I(cos(2 * x)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54133 -0.12554 0.01487 0.15706 0.51187
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03116 0.02128 48.46 <2e-16 ***
## I(sin(x)) 2.00412 0.03024 66.28 <2e-16 ***
## I(cos(2 * x)) 3.04184 0.02994 101.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2127 on 97 degrees of freedom
## Multiple R-squared: 0.9935, Adjusted R-squared: 0.9933
## F-statistic: 7357 on 2 and 97 DF, p-value: < 2.2e-16
lines(x, fitted(res))
\[y=f(x)=\frac{a}{1+b e^{-c x}}\]
diffusion_rate <- read.csv("PC普及率.csv", header = FALSE)
colnames(diffusion_rate) <- c("year", "rate")
plot(diffusion_rate$year, diffusion_rate$rate)
year <- diffusion_rate$year
rate <- diffusion_rate$rate
year0 <- year - 1987
res <- nls(rate ~ a / (1 + b * exp(-c * year0)), start = c(a = 80, b = 1, c = 1), trace = TRUE)
## 50139.17 (1.55e+00): par = (80 1 1)
## 29739.03 (2.37e+00): par = (72.51707 1.034615 0.3826346)
## 11711.30 (2.91e+00): par = (70.47085 1.067224 0.07099902)
## 6664.014 (2.19e+00): par = (104.8972 3.047831 0.0650449)
## 5063.070 (2.03e+00): par = (84.22802 3.25924 0.09463527)
## 2410.674 (1.50e+00): par = (83.88331 5.251473 0.1400597)
## 1394.916 (1.26e+00): par = (86.58721 10.59881 0.199742)
## 641.6215 (5.63e-01): par = (84.33738 16.97232 0.2257461)
## 493.8297 (2.31e-01): par = (82.2809 23.18385 0.2477569)
## 467.4292 (7.20e-02): par = (81.62738 27.33159 0.2584358)
## 464.6063 (2.08e-02): par = (81.29687 29.28643 0.2635173)
## 464.3555 (6.05e-03): par = (81.18931 29.99237 0.2653881)
## 464.3336 (1.77e-03): par = (81.15613 30.21454 0.2659927)
## 464.3318 (5.15e-04): par = (81.14652 30.28061 0.2661735)
## 464.3316 (1.49e-04): par = (81.14373 30.29992 0.2662265)
## 464.3316 (4.33e-05): par = (81.14292 30.30553 0.2662419)
## 464.3316 (1.26e-05): par = (81.14268 30.30716 0.2662464)
## 464.3316 (3.63e-06): par = (81.14261 30.30763 0.2662477)
summary(res)
##
## Formula: rate ~ a/(1 + b * exp(-c * year0))
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## a 81.14261 2.54320 31.906 < 2e-16 ***
## b 30.30763 7.53606 4.022 0.000469 ***
## c 0.26625 0.02304 11.557 1.6e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.31 on 25 degrees of freedom
##
## Number of iterations to convergence: 17
## Achieved convergence tolerance: 3.628e-06
lines(year, fitted(res))
memo algorithm=Newton法。誤差0のデータや二乗和の最小化以外のタスクには適用不可
### 1.素直にyear0を使うと上手くいかない
# res <- nls(rate ~ a / (1 + b * exp(-c * year)), start = c(a = 80, b = 1, c = 1), trace = TRUE)
# Error in nlsModel(formula, mf, start, wts, scaleOffset = scOff, nDcentral = nDcntr) :
# singular gradient matrix at initial parameter estimates
### 2.初期値によっては収束しない
# res <- nls(rate ~ a / (1 + b * exp(-c * year0)), start = c(a = 80, b = 0.1, c = 0.1), trace = TRUE)
# Error in nls(rate ~ a/(1 + b * exp(-c * year0)), start = c(a = 80, b = 0.1, :
# step 因子 0.000488281 は 0.000976562 の 'minFactor' 以下に縮小しました
city <- read.csv("population.csv", header = FALSE)
city <- city$V1
ord<-order(city,decreasing=TRUE)
city_popl<-city[ord]
rank<-1:21
plot(rank, city_popl)