人工データの発生

library(ggplot2)

正規分布

# Draw 100 values from a normal distribution with mean 50 and standard
# deviation 10, then show their histogram.
# The seed is fixed so the figure is reproducible from run to run, in line
# with the later sections of this document that call set.seed().
set.seed(1)
norm.data <- rnorm(n = 100, mean = 50, sd = 10)
hist(norm.data)

plot of chunk unnamed-chunk-2

一様分布

# Draw 100 values from a uniform distribution on [0, 100], then show their
# histogram.
# The seed is fixed so the figure is reproducible from run to run, in line
# with the later sections of this document that call set.seed().
set.seed(2)
unif.data <- runif(n = 100, min = 0, max = 100)
hist(unif.data)

plot of chunk unnamed-chunk-3

任意の離散的確率分布

# Ten uniform(0, 1) draws; these are the raw material for a two-outcome
# discrete distribution.
p <- runif(10)
print(p)
##  [1] 0.64341 0.88315 0.26553 0.41053 0.18931 0.73869 0.60737 0.51224
##  [9] 0.77452 0.06837

# Bin each draw at 0.4: a uniform draw lands in (0, 0.4] with probability 0.4
# and in (0.4, 1] with probability 0.6, so x is a factor following that
# discrete distribution.
x <- cut(p, breaks = c(0, 0.4, 1))
print(x)
##  [1] (0.4,1] (0.4,1] (0,0.4] (0.4,1] (0,0.4] (0.4,1] (0.4,1] (0.4,1]
##  [9] (0.4,1] (0,0.4]
## Levels: (0,0.4] (0.4,1]

大数の法則: 正規分布

# Law of large numbers with normal data: print the mean of k draws from a
# normal distribution (mean 50, sd 20) for growing k. The recorded means move
# toward the true mean of 50 as k increases.
set.seed(1)
# Mean of k fresh draws; every call advances the shared RNG stream, so the
# calls below must stay in this order to reproduce the recorded output.
normal_sample_mean <- function(k) {
  mean(rnorm(n = k, mean = 50, sd = 20))
}
normal_sample_mean(3)
## [1] 41.48
normal_sample_mean(10)
## [1] 57.76
normal_sample_mean(100)
## [1] 52.42
normal_sample_mean(1000)
## [1] 49.46
normal_sample_mean(10000)
## [1] 49.87

大数の法則: 二項分布

# Law of large numbers with Bernoulli data: print the proportion of successes
# among k Bernoulli(0.3) trials for growing k. The recorded proportions settle
# near the true probability 0.3 as k increases.
set.seed(3)
# Proportion of successes in k fresh trials; every call advances the shared
# RNG stream, so the calls below must stay in this order to reproduce the
# recorded output.
bernoulli_sample_mean <- function(k) {
  mean(rbinom(n = k, size = 1, prob = 0.3))
}
bernoulli_sample_mean(3)
## [1] 0.3333
bernoulli_sample_mean(10)
## [1] 0
bernoulli_sample_mean(100)
## [1] 0.32
bernoulli_sample_mean(1000)
## [1] 0.303
bernoulli_sample_mean(10000)
## [1] 0.2935

中心極限定理: 正規分布

# Population: one million draws from a normal distribution with mean 100 and
# standard deviation 20. The samples in the following chunks are taken from it.
P <- rnorm(1e6, mean = 100, sd = 20)
hist(x = P)

plot of chunk unnamed-chunk-7


# 100 samples of 100 observations each, drawn from P with replacement.
# The matrix has 100 rows, one sample per row, so rowMeans() yields the 100
# sample means whose distribution is plotted.
s100 <- matrix(
  sample(P, size = 100 * 100, replace = TRUE),
  nrow = 100
)
hist(rowMeans(s100))

plot of chunk unnamed-chunk-7



# 1000 samples of 100 observations each, drawn from P with replacement.
# The matrix has 1000 rows, one sample per row, so rowMeans() yields the 1000
# sample means whose distribution is plotted.
s1000 <- matrix(
  sample(P, size = 100 * 1000, replace = TRUE),
  nrow = 1000
)
hist(rowMeans(s1000))

plot of chunk unnamed-chunk-7


# 10000 samples of 100 observations each, drawn from P with replacement.
# The matrix has 10000 rows, one sample per row, so rowMeans() yields the
# 10000 sample means whose distribution is plotted.
s10000 <- matrix(
  sample(P, size = 100 * 10000, replace = TRUE),
  nrow = 10000
)
hist(rowMeans(s10000))

plot of chunk unnamed-chunk-7

中心極限定理: 二項分布

# Population: one million Bernoulli trials (0/1 outcomes) with success
# probability 0.3. Its mean is the observed proportion of successes.
P <- rbinom(1e6, size = 1, prob = 0.3)
print(mean(P))
## [1] 0.3007

# 100 samples of 100 observations each, drawn from P with replacement.
# The matrix has 100 rows, one sample per row, so rowMeans() yields the 100
# sample proportions whose distribution is plotted.
s100 <- matrix(
  sample(P, size = 100 * 100, replace = TRUE),
  nrow = 100
)
hist(rowMeans(s100))

plot of chunk unnamed-chunk-8



# 1000 samples of 100 observations each, drawn from P with replacement.
# The matrix has 1000 rows, one sample per row, so rowMeans() yields the 1000
# sample proportions whose distribution is plotted.
s1000 <- matrix(
  sample(P, size = 100 * 1000, replace = TRUE),
  nrow = 1000
)
hist(rowMeans(s1000))

plot of chunk unnamed-chunk-8


# 10000 samples of 100 observations each, drawn from P with replacement.
# The matrix has 10000 rows, one sample per row, so rowMeans() yields the
# 10000 sample proportions whose distribution is plotted.
s10000 <- matrix(
  sample(P, size = 100 * 10000, replace = TRUE),
  nrow = 10000
)
hist(rowMeans(s10000))

plot of chunk unnamed-chunk-8

任意の相関係数を持つ2変数の人工データ

# Two variables with a chosen correlation p.
# x and e are independent uniform(0, 1) draws, so they share the same
# variance; mixing them with weights p and sqrt(1 - p^2) gives y the same
# variance as x and a population correlation of p with x.
set.seed(1)
p <- 0.6
x <- runif(100)
e <- runif(100)
y <- sqrt(1 - p^2) * e + p * x

# Sample correlation; close to, but not exactly, the target p.
print(cor(x, y))
## [1] 0.6028

plot(x = x, y = y)

plot of chunk unnamed-chunk-9

回帰分析モデルの人工データ

# Simulate data from the linear model y = a + b1 * x1 + b2 * x2 + e.
n <- 100   # number of observations
b1 <- 3    # true coefficient of x1
b2 <- 0.5  # true coefficient of x2
a <- 10    # true intercept

# The three draws below consume the RNG stream in this order; keep it.
x1 <- rnorm(n)           # standard normal predictor
x2 <- runif(n)           # uniform(0, 1) predictor
e <- rnorm(n, sd = 10)   # noise with mean 0 and sd 10

# Parentheses keep the same floating-point evaluation order as
# b1 * x1 + b2 * x2 + a + e.
y <- a + (b1 * x1 + b2 * x2) + e

# Scatterplot matrix of the predictors and the response.
plot(data.frame(x1 = x1, x2 = x2, y = y))

plot of chunk unnamed-chunk-10



# Fit the linear model of y on x1 and x2 by least squares and print the
# coefficient table; the estimates can be compared with the coefficients used
# to generate y.
fit <- lm(formula = y ~ x1 + x2)
print(summary(fit))
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25.74  -5.94   0.69   6.30  26.25 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    10.23       1.91    5.37  5.4e-07 ***
## x1              4.22       1.03    4.11  8.3e-05 ***
## x2              2.11       3.24    0.65     0.52    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.76 on 97 degrees of freedom
## Multiple R-squared:  0.155,  Adjusted R-squared:  0.138 
## F-statistic: 8.93 on 2 and 97 DF,  p-value: 0.000276