library(slidify)
# Create the deck for "Topic02" with git enabled. The name must be wrapped in
# plain ASCII double quotes; the curly quotes used before are a syntax error.
create_deck("Topic02", git = TRUE)


R 統計軟體初階課程


Topic 2: 資料描述與抽樣分配

  1. 暖身操
  2. 使用R初步檢視資料
  3. 資料分割
  4. 抽樣分配

1. 暖身操:

用R來玩玩Sudoku

# Install the sudoku package only when it is missing, so re-running the
# script does not reinstall it every time.
if (!requireNamespace("sudoku", quietly = TRUE)) {
  install.packages("sudoku")
}
library(help = sudoku)  # show the package's information and help index
library(sudoku)
generateSudoku()  # generate a puzzle with the default settings
example(generateSudoku)  # run the examples from the help page

外觀比較漂亮的Sudoku

# Generate a puzzle with 20 blank cells without printing it, then display it
# with printSudoku(). FALSE is spelled out because F is an ordinary variable
# that can be reassigned.
myPuzzle <- generateSudoku(Nblank = 20, print.it = FALSE)
printSudoku(myPuzzle)

互動式的玩法

# Play a fetched puzzle interactively, then print a fetched puzzle.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
playSudoku(fetchSudokuUK(), solve = TRUE)
printSudoku(fetchSudokuUK())

2. 使用R初步檢視資料


檢視變數的資料(summarize a variable)

# Load the objects saved in wgcoll.rda (the code below expects a data frame
# named wgc), then print the quartiles, mean, min and max of column aa.
load(file = "wgcoll.rda")
summary(wgc[["aa"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    29.0    59.0    72.5    71.4    85.8    99.0
head(wgc, n = 6L)  # 6 rows is also head()'s default
##   id aa pe sm ae r g c
## 1  1 93 19  1  2 0 0 1
## 2  2 46 12  0  0 0 0 0
## 3  3 57 15  1  1 0 0 0
## 4  4 94 18  2  2 1 1 1
## 5  5 82 13  2  1 1 1 1
## 6  6 59 12  0  0 2 0 0

head(wgc, n = 20)  # show the first 20 rows
##    id aa pe sm ae r g c
## 1   1 93 19  1  2 0 0 1
## 2   2 46 12  0  0 0 0 0
## 3   3 57 15  1  1 0 0 0
## 4   4 94 18  2  2 1 1 1
## 5   5 82 13  2  1 1 1 1
## 6   6 59 12  0  0 2 0 0
## 7   7 61 12  1  2 0 0 0
## 8   8 29  9  0  0 1 1 0
## 9   9 36 13  1  1 0 0 0
## 10 10 91 16  2  2 1 1 0
## 11 11 55 10  0  0 1 0 0
## 12 12 58 11  0  1 0 0 0
## 13 13 67 14  1  1 0 1 1
## 14 14 77 14  1  2 2 1 0
## 15 15 71 12  0  0 2 1 0
## 16 16 83 16  2  2 1 0 1
## 17 17 96 15  2  2 2 0 1
## 18 18 87 12  1  1 0 0 1
## 19 19 62 11  0  0 0 0 0
## 20 20 52  9  0  1 2 1 0

tail(wgc, n = 20)  # show the last 20 rows
##    id aa pe sm ae r g c
## 31 31 64 13  1  1 0 0 0
## 32 32 77 13  1  0 1 1 1
## 33 33 88 16  2  2 0 1 0
## 34 34 54  9  0  1 1 0 0
## 35 35 86 17  1  2 1 0 1
## 36 36 73 15  1  1 0 1 0
## 37 37 79 15  2  1 0 0 1
## 38 38 85 14  2  1 2 1 1
## 39 39 96 16  0  1 1 0 1
## 40 40 59 12  1  0 0 1 0
## 41 41 84 14  1  0 1 0 1
## 42 42 71 15  2  1 1 0 0
## 43 43 89 15  0  1 0 1 1
## 44 44 38 12  1  0 1 1 0
## 45 45 62 11  1  1 2 0 1
## 46 46 93 16  1  0 1 0 1
## 47 47 71 13  2  1 1 0 0
## 48 48 55 11  0  1 0 0 0
## 49 49 74 15  1  2 0 1 0
## 50 50 88 18  1  1 0 1 0

盒狀圖 (box plot)

# Box plot of column aa.
boxplot(x = wgc$aa)

plot of chunk unnamed-chunk-7


# Send the current plot straight to the printer (dev.print's default device
# is postscript).
dev.print(horizontal = FALSE, width = 6, height = 6)

# "Print" the current plot to a PDF file instead.
dev.print(device = pdf, file = "test_boxplot.pdf")

直方圖(histogram)

# Histogram of column aa. The argument is kept exactly as written because
# hist() builds its default title and x-axis label from that expression.
hist(wgc$aa)

plot of chunk unnamed-chunk-9


補充:

-Summarizing data using bar charts.


柱狀圖(bar chart)

# Wrong approach: the raw column is passed, so every observation gets a bar.
barplot(height = wgc$g)

plot of chunk unnamed-chunk-10


# Correct approach: tabulate g first so each bar is a category count.
barplot(height = table(wgc$g), ylim = c(0, 30))

plot of chunk unnamed-chunk-11


圓餅圖(pie chart)

# Wrong approach: the raw column is passed, so every observation gets a slice.
pie(x = wgc$g)

plot of chunk unnamed-chunk-12


# Pie chart of the category counts of g, with a smaller radius and the first
# slice starting at 270 degrees.
pie(x = table(wgc$g), init.angle = 270, radius = 0.8)

plot of chunk unnamed-chunk-13


還有不少製圖套件可以試試

-例如ggplot2和googleVis:

# install.packages() takes the package name as a string; the original called
# a misspelled function with an unquoted name. Install only when missing.
if (!requireNamespace("googleVis", quietly = TRUE)) {
  install.packages("googleVis")
}
library(googleVis)
gender <- as.data.frame(table(wgc$g))
rownames(gender) <- c("女", "男")
# Stored as pie_chart so the variable does not shadow base R's pie() function.
# NOTE(review): labelvar = "row.names" is kept from the original; confirm that
# gvisPieChart accepts it, since labelvar normally names a column of `gender`.
pie_chart <- gvisPieChart(gender, labelvar = "row.names")
plot(pie_chart)

進階繪圖說明 (Intermediate Boxplots and Histograms)


3. 分割資料為子集(subset)

準備1:製作兩個資料物件x與y

# x: 6 x 5 matrix filled with 30 normal draws (mean 1).
# y: the values 1, 1, 2, 3, 4, 5.
x <- matrix(data = rnorm(n = 30, mean = 1), ncol = 5)
y <- c(1, seq_len(5))

準備2:把物件x與物件y合併為一個矩陣z(matrix)

# Append y to x as an extra column named "y".
z <- cbind(x, y = y)

準備3:把物件z轉為資料框格式(data frame)

# Convert the matrix z into a data frame and display it.
z.df <- data.frame(z)
print(z.df)
##        V1       V2     V3      V4       V5 y
## 1 1.23351 -0.14503 1.4441  1.4516  1.60937 1
## 2 0.04472  0.18228 3.1658  1.6901  1.75227 1
## 3 0.18440  1.21110 1.5945  3.0075  1.63307 2
## 4 1.05453  1.65460 1.4643 -1.1552 -0.09981 3
## 5 3.26648  0.91051 1.4718  1.8692  1.06903 4
## 6 1.85547 -0.08774 0.3348  0.5796 -0.37785 5

使用subset()指令來切割資料框

# List the column names, then keep only the rows where y exceeds 2.
colnames(z.df)
## [1] "V1" "V2" "V3" "V4" "V5" "y"
z.sub <- subset(z.df, subset = y > 2)
print(z.sub)
##      V1       V2     V3      V4       V5 y
## 4 1.055  1.65460 1.4643 -1.1552 -0.09981 3
## 5 3.266  0.91051 1.4718  1.8692  1.06903 4
## 6 1.855 -0.08774 0.3348  0.5796 -0.37785 5

使用多個條件篩選觀察值(列)

# Keep the rows that satisfy both conditions at once.
z.sub1 <- subset(z.df, subset = y > 2 & V1 > 0.6)
print(z.sub1)
##      V1       V2     V3      V4       V5 y
## 4 1.055  1.65460 1.4643 -1.1552 -0.09981 3
## 5 3.266  0.91051 1.4718  1.8692  1.06903 4
## 6 1.855 -0.08774 0.3348  0.5796 -0.37785 5

使用多個條件同時篩選變數與觀察值

# Filter the rows and keep only columns V1 and V4.
z.sub2 <- subset(z.df, subset = y > 2 & V2 > 0.4, select = c(V1, V4))
print(z.sub2)
##      V1     V4
## 4 1.055 -1.155
## 5 3.266  1.869
# Filter the rows and keep the run of columns from V2 through V5.
z.sub3 <- subset(z.df, subset = y > 3, select = V2:V5)
print(z.sub3)
##         V2     V3     V4      V5
## 5  0.91051 1.4718 1.8692  1.0690
## 6 -0.08774 0.3348 0.5796 -0.3779

[進階]

用索引(index)的方式挑選觀察值

# Row indexing with a logical mask: keep the rows where y equals 1.
z.sub4 <- z.df[z.df[["y"]] == 1, ]
print(z.sub4)
##        V1      V2    V3    V4    V5 y
## 1 1.23351 -0.1450 1.444 1.452 1.609 1
## 2 0.04472  0.1823 3.166 1.690 1.752 1

使用%in%來選取在一個變數有不同值的觀察值

# Keep only the observations where y is 1 or 4.
z.sub5 <- z.df[z.df[["y"]] %in% c(1, 4), ]
print(z.sub5)
##        V1      V2    V3    V4    V5 y
## 1 1.23351 -0.1450 1.444 1.452 1.609 1
## 2 0.04472  0.1823 3.166 1.690 1.752 1
## 5 3.26648  0.9105 1.472 1.869 1.069 4

使用索引來挑選特定欄位的觀察值

# Column indexing: keep the first two columns of every row.
z.sub6 <- z.df[, seq_len(2)]
print(z.sub6)
##        V1       V2
## 1 1.23351 -0.14503
## 2 0.04472  0.18228
## 3 0.18440  1.21110
## 4 1.05453  1.65460
## 5 3.26648  0.91051
## 6 1.85547 -0.08774

# Column indexing: keep columns 1, 3 and 5 of every row.
z.sub7 <- z.df[, seq(from = 1, to = 5, by = 2)]
print(z.sub7)
##        V1     V3       V5
## 1 1.23351 1.4441  1.60937
## 2 0.04472 3.1658  1.75227
## 3 0.18440 1.5945  1.63307
## 4 1.05453 1.4643 -0.09981
## 5 3.26648 1.4718  1.06903
## 6 1.85547 0.3348 -0.37785

舉一反三:用索引的方式同時篩選欄與列

# Index rows and columns together: rows 1 and 3, columns 3 through 6.
z.sub8 <- z.df[c(1, 3), seq(from = 3, to = 6)]
print(z.sub8)
##      V3    V4    V5 y
## 1 1.444 1.452 1.609 1
## 3 1.595 3.007 1.633 2

4. 抽樣分配(Sampling distribution)


中央極限定理( the central limit theorem, CLT)

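When all of the possible sample means are computed, then the following properties are true:

  1. The mean of the sample means equals the population mean.
  2. The standard deviation of the sample means equals the population standard deviation divided by the square root of the sample size (σ/√n), so it is smaller than the population standard deviation.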


中文大意:


# Normal parent population: density of the sample mean for n = 1, 25, 100.
# The first curve() call creates the plot; the loop overlays the remaining
# sample sizes with line types 2 and 3.
n <- 1
curve(dnorm(x, mean = 0, sd = 1/sqrt(n)), from = -3, to = 3, ylim = c(0, 5),
    xlab = "x", ylab = "Densities of sample mean", bty = "l")
overlay_n <- c(25, 100)
for (k in seq_along(overlay_n)) {
    n <- overlay_n[k]
    curve(dnorm(x, mean = 0, sd = 1/sqrt(n)), add = TRUE, lty = k + 1)
}

plot of chunk unnamed-chunk-25


# Nonnormal parent population: m binomial(n, p) draws, shown as a density
# histogram with the matching normal curve (mean n*p, sd sqrt(n*p*(1-p)))
# drawn on top.
m <- 200
p <- 1/2
n <- 5
res <- rbinom(m, n, p)
# freq = FALSE replaces prob = TRUE, which only worked through partial
# matching of hist()'s `probability` argument.
hist(res, freq = FALSE, main = "n=5")
curve(dnorm(x, n * p, sqrt(n * p * (1 - p))), add = TRUE)

plot of chunk unnamed-chunk-26


# Same simulation with n = 10; m and p are reused from the previous chunk.
n <- 10
res <- rbinom(m, n, p)
# freq = FALSE replaces prob = TRUE, which only worked through partial
# matching of hist()'s `probability` argument.
hist(res, freq = FALSE, main = "n=10")
curve(dnorm(x, n * p, sqrt(n * p * (1 - p))), add = TRUE)  # already close to normal

plot of chunk unnamed-chunk-27


# Same simulation with n = 25; m and p are reused from the earlier chunk.
n <- 25
res <- rbinom(m, n, p)
# freq = FALSE replaces prob = TRUE, which only worked through partial
# matching of hist()'s `probability` argument.
hist(res, freq = FALSE, main = "n=25")
curve(dnorm(x, n * p, sqrt(n * p * (1 - p))), add = TRUE)

plot of chunk unnamed-chunk-28


# Sampling distribution of the mean for a Uniform(a, b) parent population:
# overlay density estimates of m simulated sample means for n = 2, 10, 100
# on an empty plotting region.
plot(0, 0, type = "n", xlim = c(0, 1), ylim = c(0, 13.5), xlab = "Density estimate",
    ylab = "f(x)")
m <- 500
a <- 0
b <- 1

# res is rebuilt from scratch for every n, so it always holds exactly m
# sample means and never depends on a vector left over from an earlier chunk.
n <- 2
res <- replicate(m, mean(runif(n, a, b)))  # m sample means of size n
lines(density(res), lwd = 2)

n <- 10
res <- replicate(m, mean(runif(n, a, b)))
lines(density(res), lwd = 2)

n <- 100
res <- replicate(m, mean(runif(n, a, b)))
# Draw the curve once, after all m means are simulated. The original called
# lines() inside the loop, overplotting 500 partial estimates that still
# contained values from the n = 10 run.
lines(density(res), lwd = 2)

plot of chunk unnamed-chunk-30