library(slidify)
# Scaffold a new slide deck called "Topic02".
# NOTE(review): git = TRUE presumably also initialises a git repository
# for the deck -- confirm against the installed slidify version.
# The deck name must be wrapped in plain ASCII double quotes; typographic
# quotes are not valid R string delimiters.
create_deck("Topic02", git = TRUE)

R 統計軟體初階課程

劉正山副教授
國立中山大學政治學研究所
2012年8月23日、24日
Twitter: frankcsliu
email: csliu@mail.nsysu.edu.tw
Blogger: frankcsliu.blogspot.tw

Topic 2: 資料描述與抽樣分配

暖身操
使用R初步檢視資料
資料分割
抽樣分配

1. 暖身操：

用R來玩玩Sudoku

# Install the sudoku package from the configured repository.
install.packages("sudoku")
# Show the package's help index (list of documented topics).
library(help = sudoku)
# Attach the package so its functions can be called directly.
library(sudoku)
# Generate a puzzle using the default arguments.
generateSudoku()
# Run the examples from the generateSudoku help page.
example(generateSudoku)

外觀比較漂亮的Sudoku

# Build a puzzle with 20 blank cells without printing it, then render it
# with printSudoku(). Spell out FALSE: the shorthand F is an ordinary
# variable that can be reassigned.
myPuzzle <- generateSudoku(Nblank = 20, print.it = FALSE)
printSudoku(myPuzzle)

互動式的玩法

# Play interactively on a puzzle returned by fetchSudokuUK().
# NOTE(review): fetchSudokuUK() presumably downloads the puzzle, so network
# access is likely required -- confirm.
# Spell out TRUE: the shorthand T is an ordinary variable that can be
# reassigned.
playSudoku(fetchSudokuUK(), solve = TRUE)
# Print a fetched puzzle as a static grid.
printSudoku(fetchSudokuUK())

2. 使用R初步檢視資料

進行正式的分析之前要先瞭解資料的次數分配
參考：How to visualize and compare distributions.
參考：Even simple charts can tell a story.

檢視變數的資料（summarize a variable）

# Restore the objects stored in wgcoll.rda into the workspace
# (presumably this is where the `wgc` data frame used below comes from -- confirm).
load("wgcoll.rda")
# Minimum, quartiles, median, mean and maximum of the aa column.
summary(wgc$aa)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    29.0    59.0    72.5    71.4    85.8    99.0

head(wgc)  # shows the first 6 rows by default

##   id aa pe sm ae r g c
## 1  1 93 19  1  2 0 0 1
## 2  2 46 12  0  0 0 0 0
## 3  3 57 15  1  1 0 0 0
## 4  4 94 18  2  2 1 1 1
## 5  5 82 13  2  1 1 1 1
## 6  6 59 12  0  0 2 0 0

head(wgc, 20)  # show the first 20 rows

##    id aa pe sm ae r g c
## 1   1 93 19  1  2 0 0 1
## 2   2 46 12  0  0 0 0 0
## 3   3 57 15  1  1 0 0 0
## 4   4 94 18  2  2 1 1 1
## 5   5 82 13  2  1 1 1 1
## 6   6 59 12  0  0 2 0 0
## 7   7 61 12  1  2 0 0 0
## 8   8 29  9  0  0 1 1 0
## 9   9 36 13  1  1 0 0 0
## 10 10 91 16  2  2 1 1 0
## 11 11 55 10  0  0 1 0 0
## 12 12 58 11  0  1 0 0 0
## 13 13 67 14  1  1 0 1 1
## 14 14 77 14  1  2 2 1 0
## 15 15 71 12  0  0 2 1 0
## 16 16 83 16  2  2 1 0 1
## 17 17 96 15  2  2 2 0 1
## 18 18 87 12  1  1 0 0 1
## 19 19 62 11  0  0 0 0 0
## 20 20 52  9  0  1 2 1 0

# The separator before the comment must be an ASCII space; an ideographic
# (full-width) space is not accepted as whitespace by every R parser/locale.
tail(wgc, 20)  # show the last 20 rows

##    id aa pe sm ae r g c
## 31 31 64 13  1  1 0 0 0
## 32 32 77 13  1  0 1 1 1
## 33 33 88 16  2  2 0 1 0
## 34 34 54  9  0  1 1 0 0
## 35 35 86 17  1  2 1 0 1
## 36 36 73 15  1  1 0 1 0
## 37 37 79 15  2  1 0 0 1
## 38 38 85 14  2  1 2 1 1
## 39 39 96 16  0  1 1 0 1
## 40 40 59 12  1  0 0 1 0
## 41 41 84 14  1  0 1 0 1
## 42 42 71 15  2  1 1 0 0
## 43 43 89 15  0  1 0 1 1
## 44 44 38 12  1  0 1 1 0
## 45 45 62 11  1  1 2 0 1
## 46 46 93 16  1  0 1 0 1
## 47 47 71 13  2  1 1 0 0
## 48 48 55 11  0  1 0 0 0
## 49 49 74 15  1  2 0 1 0
## 50 50 88 18  1  1 0 1 0

盒狀圖（box plot）

# Draw a box plot of the aa column.
boxplot(wgc$aa)

plot of chunk unnamed-chunk-7

# Send the current plot straight to the printer
dev.print(width = 6, height = 6, horizontal = FALSE)

# "Print" the current plot to a PDF file instead
dev.print(file = "test_boxplot.pdf", device = pdf)

長條圖（histogram）

# Draw a histogram of the aa column.
hist(wgc$aa)

plot of chunk unnamed-chunk-9

補充:

-Summarizing data using bar charts.

柱狀圖（bar chart）

barplot(wgc$g)  # wrong approach: draws one bar per raw observation

plot of chunk unnamed-chunk-10

# Correct approach: tabulate the values first so that each bar is a
# category count; the y axis is fixed to the range 0-30.
barplot(table(wgc$g), ylim = c(0, 30))

plot of chunk unnamed-chunk-11

圓餅圖（pie chart）

pie(wgc$g)  # wrong approach: treats every raw observation as its own slice

plot of chunk unnamed-chunk-12

# Tabulate the values first so that each slice is a category count; draw the
# pie with radius 0.8 and start the first slice at 270 degrees.
pie(table(wgc$g), radius = 0.8, init.angle = 270)

plot of chunk unnamed-chunk-13

還有不少製圖套件可以試試

-例如ggplot2和googleVis：

# Install and attach googleVis. The function is install.packages() and the
# package name has to be passed as a quoted string.
install.packages("googleVis")
library(googleVis)
# Frequency table of g converted to a data frame (one row per category).
gender <- as.data.frame(table(wgc$g))
# Label the two categories; assumes the table is ordered 0 = female, 1 = male
# -- TODO confirm against the codebook.
rownames(gender) <- c("女", "男")
# NOTE(review): labelvar normally names a column of the data frame; confirm
# that "row.names" is accepted by the installed googleVis version.
pie <- gvisPieChart(gender, labelvar = "row.names")
# Render the chart (googleVis charts are displayed in a browser).
plot(pie)

進階繪圖說明 (Intermediate Boxplots and Histograms)

3. 分割資料為子集(subset)

準備１：製作兩個資料物件x與y

# x: 30 random normal draws (mean 1) arranged in 5 columns, i.e. a 6 x 5 matrix.
x <- matrix(rnorm(30, 1), ncol = 5)
# y: the vector 1, 1, 2, 3, 4, 5 (one value per row of x).
y <- c(1, seq(5))

準備２：把物件x與物件y合併為一個矩陣z（matrix）

# Append y to x as an additional column; the result is still a matrix.
z <- cbind(x, y)

準備３：把物件z轉為資料框格式（data frame）

# Convert the matrix z into a data frame, then print it.
z.df <- data.frame(z)
z.df

##        V1       V2     V3      V4       V5 y
## 1 1.23351 -0.14503 1.4441  1.4516  1.60937 1
## 2 0.04472  0.18228 3.1658  1.6901  1.75227 1
## 3 0.18440  1.21110 1.5945  3.0075  1.63307 2
## 4 1.05453  1.65460 1.4643 -1.1552 -0.09981 3
## 5 3.26648  0.91051 1.4718  1.8692  1.06903 4
## 6 1.85547 -0.08774 0.3348  0.5796 -0.37785 5

使用`subset()`指令來切割資料框

# List the column names that can be used inside subset().
names(z.df)

## [1] "V1" "V2" "V3" "V4" "V5" "y"

# Keep only the rows where y is greater than 2 (all columns), then print.
z.sub <- subset(z.df, y > 2)
z.sub

##      V1       V2     V3      V4       V5 y
## 4 1.055  1.65460 1.4643 -1.1552 -0.09981 3
## 5 3.266  0.91051 1.4718  1.8692  1.06903 4
## 6 1.855 -0.08774 0.3348  0.5796 -0.37785 5

使用多個條件篩選觀察值（列）

# Keep rows that satisfy both conditions: y > 2 and V1 > 0.6.
z.sub1 <- subset(z.df, y > 2 & V1 > 0.6)
z.sub1

##      V1       V2     V3      V4       V5 y
## 4 1.055  1.65460 1.4643 -1.1552 -0.09981 3
## 5 3.266  0.91051 1.4718  1.8692  1.06903 4
## 6 1.855 -0.08774 0.3348  0.5796 -0.37785 5

使用多個條件同時篩選變數與觀察值

# Filter rows (y > 2 and V2 > 0.4) and keep only the columns V1 and V4.
z.sub2 <- subset(z.df, y > 2 & V2 > 0.4, select = c(V1, V4))
z.sub2

##      V1     V4
## 4 1.055 -1.155
## 5 3.266  1.869

# Filter rows (y > 3) and keep the consecutive columns V2 through V5.
z.sub3 <- subset(z.df, y > 3, select = V2:V5)
z.sub3

##         V2     V3     V4      V5
## 5  0.91051 1.4718 1.8692  1.0690
## 6 -0.08774 0.3348 0.5796 -0.3779

[進階]

用索引（index）的方式挑選觀察值

# Logical row index: keep the rows where y equals 1; the empty column slot
# keeps every column.
z.sub4 <- z.df[z.df$y == 1, ]
z.sub4

##        V1      V2    V3    V4    V5 y
## 1 1.23351 -0.1450 1.444 1.452 1.609 1
## 2 0.04472  0.1823 3.166 1.690 1.752 1

使用`%in%`來選取在一個變數有不同值的觀察值

# Keep only the rows whose y value is 1 or 4.
# The separator before a trailing comment must be an ASCII space; an
# ideographic (full-width) space is not accepted as whitespace by every R
# parser/locale, so the explanation lives on its own comment lines here.
z.sub5 <- z.df[z.df$y %in% c(1, 4), ]
z.sub5

##        V1      V2    V3    V4    V5 y
## 1 1.23351 -0.1450 1.444 1.452 1.609 1
## 2 0.04472  0.1823 3.166 1.690 1.752 1
## 5 3.26648  0.9105 1.472 1.869 1.069 4

使用索引來挑選特定欄位的觀察值

# Column index: keep every row but only the first two columns.
z.sub6 <- z.df[, 1:2]
z.sub6

##        V1       V2
## 1 1.23351 -0.14503
## 2 0.04472  0.18228
## 3 0.18440  1.21110
## 4 1.05453  1.65460
## 5 3.26648  0.91051
## 6 1.85547 -0.08774


# Pick the non-adjacent columns 1, 3 and 5 by position.
z.sub7 <- z.df[, c(1, 3, 5)]
z.sub7

##        V1     V3       V5
## 1 1.23351 1.4441  1.60937
## 2 0.04472 3.1658  1.75227
## 3 0.18440 1.5945  1.63307
## 4 1.05453 1.4643 -0.09981
## 5 3.26648 1.4718  1.06903
## 6 1.85547 0.3348 -0.37785

舉一反三：用索引的方式同時篩選欄與列

# Index rows and columns at once: rows 1 and 3, columns 3 through 6.
z.sub8 <- z.df[c(1, 3), 3:6]
z.sub8

##      V3    V4    V5 y
## 1 1.444 1.452 1.609 1
## 3 1.595 3.007 1.633 2

4. 抽樣分配（Sampling distribution）

從描述統計要進入推論統計，抽樣分配的概念是很重要的一環。
請使用這個網站來把概念「玩清楚」

中央極限定理（ the central limit theorem, CLT）

When all of the possible sample means are computed, then the following properties are true:

The mean of the sample means will be the mean of the population.
The variance of the sample means will be the variance of the population divided by the sample size.

The standard deviation of the sample means (known as the standard error of the mean) will be smaller than the population standard deviation and will be equal to the standard deviation of the population divided by the square root of the sample size.
If the population has a normal distribution, then the sample means will have a normal distribution.
If the population is not normally distributed, but the sample size is sufficiently large, then the sample means will have an approximately normal distribution. Some books define sufficiently large as at least 30 and others as at least 31. “ —

中文大意：

無論母體分佈為何，若每次樣本足夠大，那麼抽樣夠多次之後，這些樣本的平均數會呈現常態分佈。
把這些樣本平均數取平均之後，這個數字將會非常接近母體的平均數。

# Normal parent population
# Each curve is a normal density with mean 0 and standard deviation
# 1/sqrt(n), drawn for increasing n.
n <- 1
# First curve (n = 1) sets up the axes: x from -3 to 3, y from 0 to 5.
curve(dnorm(x, mean = 0, sd = 1/sqrt(n)), -3, 3, ylim = c(0, 5), 
    xlab = "x", ylab = "Densities of sample mean", bty = "l")
n <- 25
# Overlay the n = 25 density on the same axes (line type 2).
curve(dnorm(x, mean = 0, sd = 1/sqrt(n)), add = TRUE, lty = 2)
n <- 100
# Overlay the n = 100 density on the same axes (line type 3).
curve(dnorm(x, mean = 0, sd = 1/sqrt(n)), add = TRUE, lty = 3)

plot of chunk unnamed-chunk-25

# Nonnormal parent population
# m: number of binomial draws; p: success probability; n: binomial size.
m <- 200
p <- 1/2
n <- 5
# Draw m values from Binomial(n, p).
res <- rbinom(m, n, p)
# Histogram on the density scale so the curve below is comparable.
hist(res, prob = TRUE, main = "n=5")
# Overlay a normal density with mean n*p and sd sqrt(n*p*(1-p)).
curve(dnorm(x, n * p, sqrt(n * p * (1 - p))), add = TRUE)

plot of chunk unnamed-chunk-26

# Repeat the binomial simulation with size n = 10 (m and p are reused from
# the earlier snippet).
n <- 10
res <- rbinom(m, n, p)
hist(res, prob = TRUE, main = "n=10")
curve(dnorm(x, n * p, sqrt(n * p * (1 - p))), add = TRUE)  # already close to a normal distribution

plot of chunk unnamed-chunk-27

# Repeat the binomial simulation with size n = 25 (m and p are reused from
# the earlier snippet).
n <- 25
res <- rbinom(m, n, p)
hist(res, prob = TRUE, main = "n=25")
# Overlay a normal density with mean n*p and sd sqrt(n*p*(1-p)).
curve(dnorm(x, n * p, sqrt(n * p * (1 - p))), add = TRUE)

plot of chunk unnamed-chunk-28

# Empty plotting frame on which the density estimates are overlaid.
plot(0, 0, type = "n", xlim = c(0, 1), ylim = c(0, 13.5), xlab = "Density estimate",
    ylab = "f(x)")
m <- 500  # number of simulated sample means per curve
a <- 0    # lower bound of the uniform parent population
b <- 1    # upper bound of the uniform parent population

# For each sample size n, simulate m sample means of n Uniform(a, b) draws and
# add exactly one density curve for that n.
# `res` is rebuilt from scratch for every n, so a curve never mixes means
# computed for a different sample size, and the curve is drawn once after all
# m means exist rather than once per simulated mean.
for (n in c(2, 10, 100)) {
    res <- vapply(seq_len(m), function(i) mean(runif(n, a, b)), numeric(1))
    lines(density(res), lwd = 2)
}

plot of chunk unnamed-chunk-30