Q1

Explain what this statement does: lapply(lapply(search(), ls), length)

search() # returns the search path: the global environment plus every attached package/environment
## [1] ".GlobalEnv"        "package:stats"     "package:graphics" 
## [4] "package:grDevices" "package:utils"     "package:datasets" 
## [7] "package:methods"   "Autoloads"         "package:base"
# lapply(X, FUN, ...) applies FUN to each element of a list or vector and returns a list
#lapply(search(), ls) # inner call: lists the names of the objects found in each entry of the search path
lapply(lapply(search(), ls), length) # outer call: length of each name vector, i.e. the number of objects in each entry
## [[1]]
## [1] 0
## 
## [[2]]
## [1] 449
## 
## [[3]]
## [1] 87
## 
## [[4]]
## [1] 113
## 
## [[5]]
## [1] 247
## 
## [[6]]
## [1] 104
## 
## [[7]]
## [1] 203
## 
## [[8]]
## [1] 0
## 
## [[9]]
## [1] 1255

Q2

Convert the R script in the NZ schools example into an R Markdown file and provide comments for each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.

The New Zealand Ministry of Education provides basic information for all primary and secondary schools in the country.

Source: Ministry of education - New Zealand

Column 1: School ID; Column 2: School name; Column 3: City where the school is located; Column 4: The authority of the school; Column 5: The socio-economic status of the families of the students of the school; Column 6: The number of students enrolled at the school as of July 2007

## Read the schools table. as.is = 2 applies to column 2 only, so the school
## names stay character strings instead of being converted to a factor.
dta2 <- read.csv(file = "nzSchools.csv", as.is = 2)
head(dta2, n = 6L)
##     ID                  Name      City  Auth Dec Roll
## 1 1015      Hora Hora School Whangarei State   2  318
## 2 1052    Morningside School Whangarei State   3  200
## 3 1062        Onerahi School Whangarei State   4  455
## 4 1092 Raurimu Avenue School Whangarei State   2   86
## 5 1130      Whangarei School Whangarei State   4  577
## 6 1018       Hurupaki School Whangarei State   8  329
## Data structure: the type and the first few values of every column
str(dta2)
## 'data.frame':    2571 obs. of  6 variables:
##  $ ID  : int  1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
##  $ Name: chr  "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
##  $ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
##  $ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
##  $ Dec : int  2 3 4 2 4 8 5 5 6 1 ...
##  $ Roll: int  318 200 455 86 577 329 637 395 438 201 ...
## dim() gives the dimensions of the data frame: the number of rows first, then the number of columns
dim(dta2) 
## [1] 2571    6

2571 obs. of 6 variables

binning

## Recode school size from the roll: schools whose Roll exceeds the median are
## "Large", all others are "Small"; the result is stored in the new column Size
dta2$Size <- ifelse(dta2$Roll > median(dta2$Roll), "Large", "Small")

# The same split with cut(). Intervals are right-closed by default, so
# include.lowest = TRUE is needed: without it the school(s) with the minimum
# Roll fall outside (min, median] and are recoded as NA.
dta2$Size2 <- cut(dta2$Roll,
                  breaks = c(min(dta2$Roll), median(dta2$Roll), max(dta2$Roll)),
                  labels = c("Small", "Large"),
                  include.lowest = TRUE)
## Drop both columns again
dta2$Size <- NULL
dta2$Size2 <- NULL
## Show the first six rows of the data frame
head(dta2)
##     ID                  Name      City  Auth Dec Roll
## 1 1015      Hora Hora School Whangarei State   2  318
## 2 1052    Morningside School Whangarei State   3  200
## 3 1062        Onerahi School Whangarei State   4  455
## 4 1092 Raurimu Avenue School Whangarei State   2   86
## 5 1130      Whangarei School Whangarei State   4  577
## 6 1018       Hurupaki School Whangarei State   8  329
## cut() recodes a numeric vector into intervals. breaks = 3 divides the range
## of Roll into three equal-width intervals, labelled in increasing order.
## NOTE(review): "Mediam" looks like a typo for "Medium"; it is kept because
## the recorded output in this document uses that label.
dta2$Size <- cut(dta2$Roll, breaks = 3, labels = c("Small", "Mediam", "Large"))


str(dta2$Size)
##  Factor w/ 3 levels "Small","Mediam",..: 1 1 1 1 1 1 1 1 1 1 ...
## table() builds a contingency table of counts from its argument
table(dta2$Size) # number of schools in each of the three Size classes
## 
##  Small Mediam  Large 
##   2555     15      1

sorting

## order() returns the permutation of indices that sorts its argument;
## decreasing = TRUE sorts from largest to smallest.
## RollOrd[i] is therefore the row number of the school with the i-th largest
## Roll. It is NOT the rank of row i, so it is only meaningful as a row index.
dta2$RollOrd <- order(dta2$Roll, decreasing = TRUE)

head(dta2$RollOrd)
## [1] 1726  301  376 2307  615  199
tail(dta2$RollOrd)
## [1] 2401 1590 1996 2112 1514 1575

數值最大的元素為原數列第1726個,然後是第301個、第376個…如此類推,最小值六個是在原數列位置的第2401、1590、1996、2112、1514、1575個。

## Look at the first six rows of the data reordered by RollOrd, to check that order() is being read correctly
head(dta2[dta2$RollOrd, ])
##       ID                  Name         City  Auth Dec Roll   Size RollOrd
## 1726 498 Correspondence School   Wellington State  NA 5546  Large     753
## 301   28     Rangitoto College     Auckland State  10 3022 Mediam     353
## 376   78      Avondale College     Auckland State   4 2613 Mediam     712
## 2307 319  Burnside High School Christchurch State   8 2588 Mediam     709
## 615   41      Macleans College     Auckland State  10 2476 Mediam    1915
## 199   43    Massey High School     Auckland State   5 2452 Mediam    1683
## Last six rows of the reordered data: the schools with the smallest Roll
tail(dta2[dta2$RollOrd, ])
##        ID                    Name                  City    Auth Dec Roll  Size
## 2401 1641  Amana Christian School               Dunedin Private   9    7 Small
## 1590 2461       Tangimoana School              Manawatu   State   4    6 Small
## 1996 3598         Woodbank School              Kaikoura   State   4    6 Small
## 2112 3386     Jacobs River School          Jacobs River   State   5    6 Small
## 1514 2407     Ngamatapouri School Sth Taranaki District   State   9    5 Small
## 1575 2420 Papanui Junction School               Taihape   State   5    5 Small
##      RollOrd
## 2401    2562
## 1590     266
## 1996    2478
## 2112    1501
## 1514    2377
## 1575    1542

學生註冊人數(Roll)最多的學校為Correspondence School,有5546個學生,在原數列的位置是第1726個。

## First six schools when sorting by city name from Z to A and, within a city,
## by Roll from largest to smallest
head(dta2[order(dta2$City, dta2$Roll, decreasing = TRUE), ])
##        ID                      Name      City  Auth Dec Roll  Size RollOrd
## 2548  401           Menzies College   Wyndham State   4  356 Small     859
## 2549 4054            Wyndham School   Wyndham State   5   94 Small    1163
## 1611 2742          Woodville School Woodville State   3  147 Small     726
## 1630 2640           Papatawa School Woodville State   7   27 Small    2273
## 2041 3600            Woodend School   Woodend State   9  375 Small    1401
## 1601  399 Central Southland College    Winton State   7  549 Small     450
# Roll does act as the tie-breaker in the call above: within a city the larger
# school comes first (e.g. Menzies College, 356, before Wyndham School, 94).
# The first six rows merely happen to coincide with the City-only ordering
# below, presumably because the tied rows already appear in that order.

head(dta2[order(dta2$City, decreasing = TRUE), ]) # sort by city name only, Z to A
##        ID                      Name      City  Auth Dec Roll  Size RollOrd
## 2548  401           Menzies College   Wyndham State   4  356 Small     859
## 2549 4054            Wyndham School   Wyndham State   5   94 Small    1163
## 1611 2742          Woodville School Woodville State   3  147 Small     726
## 1630 2640           Papatawa School Woodville State   7   27 Small    2273
## 2041 3600            Woodend School   Woodend State   9  375 Small    1401
## 1601  399 Central Southland College    Winton State   7  549 Small     450
## Last six schools under the same ordering: city name from Z to A and, within
## a city, Roll from largest to smallest
tail(dta2[order(dta2$City, dta2$Roll, decreasing = TRUE), ])
##        ID                         Name    City  Auth Dec Roll  Size RollOrd
## 2169 3273                Albury School  Albury State   8   30 Small    1010
## 2018  350           Akaroa Area School  Akaroa State   8  125 Small    1051
## 2023 3332           Duvauchelle School  Akaroa State   9   41 Small     749
## 335  1200                Ahuroa School  Ahuroa State   7   22 Small     193
## 99   1000               Ahipara School Ahipara State   3  241 Small    1963
## 2117 2105 Awahono School - Grey Valley  Ahaura State   4  119 Small     364

counting

## table() builds a contingency table of counts
table(dta2$Auth) # number of schools of each authority type
## 
##            Other          Private            State State Integrated 
##                1               99             2144              327

私立學校(Private)有99所,公立學校(State)2144所,公立整合學校(State Integrated,即併入公立體系的前私立學校)有327所,其他類(Other)1所。

## Keep the table of authority counts as authtbl, then print it
authtbl <- table(dta2$Auth)
authtbl
## 
##            Other          Private            State State Integrated 
##                1               99             2144              327
## class() reports the class of an object
class(authtbl)
## [1] "table"

authtbl這個object是table

## Pull out the school record(s) whose Auth is "Other"
dta2[dta2$Auth == "Other", ]
##       ID            Name         City  Auth Dec Roll  Size RollOrd
## 2315 518 Kingslea School Christchurch Other   1   51 Small    1579
## xtabs() builds a contingency table from a data frame using a model formula
xtabs(~ Auth + Dec, data=dta2) # the first variable forms the rows, the second the columns
##                   Dec
## Auth                 1   2   3   4   5   6   7   8   9  10
##   Other              1   0   0   0   0   0   0   0   0   0
##   Private            0   0   2   6   2   2   6  11  12  38
##   State            259 230 208 219 214 215 188 200 205 205
##   State Integrated  12  22  35  28  38  34  45  45  37  31
xtabs(~ Dec + Auth, data=dta2) # swapping the variables swaps the rows and the columns
##     Auth
## Dec  Other Private State State Integrated
##   1      1       0   259               12
##   2      0       0   230               22
##   3      0       2   208               35
##   4      0       6   219               28
##   5      0       2   214               38
##   6      0       2   215               34
##   7      0       6   188               45
##   8      0      11   200               45
##   9      0      12   205               37
##   10     0      38   205               31

aggregating

## Mean roll over all schools
with(dta2, mean(Roll))
## [1] 295.4737
## Mean roll of the private schools only
with(dta2, mean(Roll[Auth == "Private"]))
## [1] 308.798
## aggregate(): compute summary statistics of data subsets
# Mean roll within each school authority type
aggregate(x = dta2["Roll"], by = list(dta2$Auth), FUN = mean)
##            Group.1     Roll
## 1            Other  51.0000
## 2          Private 308.7980
## 3            State 300.6301
## 4 State Integrated 258.3792

私立學校(Private)的平均註冊人數(308.8)略高於公立學校(State,300.6)。

## Dec is the socio-economic status of the students' families; flag the schools
## with Dec > 5 in the new logical column Rich
dta2$Rich <- dta2$Dec > 5
head(dta2$Rich)
## [1] FALSE FALSE FALSE FALSE FALSE  TRUE
## Mean roll by authority type (Auth) and by the Rich flag
aggregate(x = dta2["Roll"], by = list(dta2$Auth, dta2$Rich), FUN = mean)
##            Group.1 Group.2     Roll
## 1            Other   FALSE  51.0000
## 2          Private   FALSE 151.4000
## 3            State   FALSE 261.7487
## 4 State Integrated   FALSE 183.2370
## 5          Private    TRUE 402.5362
## 6            State    TRUE 338.8243
## 7 State Integrated    TRUE 311.2135
## by(data, INDICES, FUN, ..., simplify = TRUE)
# Splits a data frame into subsets according to the factor(s) in INDICES and
# applies FUN to each subset.

# Range of the roll within each school authority type
by(data = dta2["Roll"], INDICES = list(dta2$Auth), FUN = range)
## : Other
## [1] 51 51
## ------------------------------------------------------------ 
## : Private
## [1]    7 1663
## ------------------------------------------------------------ 
## : State
## [1]    5 5546
## ------------------------------------------------------------ 
## : State Integrated
## [1]   18 1475
# Alternative: tapply() on the Roll vector, grouped by Auth
tapply(X = dta2$Roll, INDEX = list(dta2$Auth), FUN = range)
## $Other
## [1] 51 51
## 
## $Private
## [1]    7 1663
## 
## $State
## [1]    5 5546
## 
## $`State Integrated`
## [1]   18 1475

State Integrated(公立整合學校)註冊人數範圍在18至1475人;State(公立學校)在5至5546人;Private(私立學校)在7至1663人;Other(其他類)只有一所學校,註冊人數為51人。

Q3

Split the ChickWeight{datasets} data by individual chicks to extract separate slope estimates of regressing weight onto Time for each chick.

# Take a look at the dataset
# ?ChickWeight 

head(ChickWeight)
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1
# Last six rows
tail(ChickWeight)
##     weight Time Chick Diet
## 573    155   12    50    4
## 574    175   14    50    4
## 575    205   16    50    4
## 576    234   18    50    4
## 577    264   20    50    4
## 578    264   21    50    4
# Column names
names(ChickWeight)
## [1] "weight" "Time"   "Chick"  "Diet"
library(magrittr)

# Fit weight ~ Time separately for every chick and report the coefficients of
# each fit; the "Time" coefficient is that chick's slope estimate.
with(
  ChickWeight,
  by(ChickWeight, Chick, function(chick_rows) {
    chick_fit <- lm(weight ~ Time, data = chick_rows)
    coef(chick_fit)
  })
)
## Chick: 18
## (Intercept)        Time 
##          39          -2 
## ------------------------------------------------------------ 
## Chick: 16
## (Intercept)        Time 
##   43.392857    1.053571 
## ------------------------------------------------------------ 
## Chick: 15
## (Intercept)        Time 
##    46.83333     1.89881 
## ------------------------------------------------------------ 
## Chick: 13
## (Intercept)        Time 
##   43.384359    2.239601 
## ------------------------------------------------------------ 
## Chick: 9
## (Intercept)        Time 
##   52.094086    2.663137 
## ------------------------------------------------------------ 
## Chick: 20
## (Intercept)        Time 
##   37.667826    3.732718 
## ------------------------------------------------------------ 
## Chick: 10
## (Intercept)        Time 
##   38.695054    4.066102 
## ------------------------------------------------------------ 
## Chick: 8
## (Intercept)        Time 
##   43.727273    4.827273 
## ------------------------------------------------------------ 
## Chick: 17
## (Intercept)        Time 
##   43.030706    4.531538 
## ------------------------------------------------------------ 
## Chick: 19
## (Intercept)        Time 
##    31.21222     5.08743 
## ------------------------------------------------------------ 
## Chick: 4
## (Intercept)        Time 
##    32.86568     6.08864 
## ------------------------------------------------------------ 
## Chick: 6
## (Intercept)        Time 
##   44.123431    6.378006 
## ------------------------------------------------------------ 
## Chick: 11
## (Intercept)        Time 
##   47.921948    7.510967 
## ------------------------------------------------------------ 
## Chick: 3
## (Intercept)        Time 
##    23.17955     8.48737 
## ------------------------------------------------------------ 
## Chick: 1
## (Intercept)        Time 
##   24.465436    7.987899 
## ------------------------------------------------------------ 
## Chick: 12
## (Intercept)        Time 
##   21.939797    8.440629 
## ------------------------------------------------------------ 
## Chick: 2
## (Intercept)        Time 
##   24.724853    8.719861 
## ------------------------------------------------------------ 
## Chick: 5
## (Intercept)        Time 
##    16.89563    10.05536 
## ------------------------------------------------------------ 
## Chick: 14
## (Intercept)        Time 
##    20.52488    11.98245 
## ------------------------------------------------------------ 
## Chick: 7
## (Intercept)        Time 
##    5.842535   13.205264 
## ------------------------------------------------------------ 
## Chick: 24
## (Intercept)        Time 
##   53.067766    1.207533 
## ------------------------------------------------------------ 
## Chick: 30
## (Intercept)        Time 
##   39.109666    5.898351 
## ------------------------------------------------------------ 
## Chick: 22
## (Intercept)        Time 
##   40.082590    5.877931 
## ------------------------------------------------------------ 
## Chick: 23
## (Intercept)        Time 
##   38.428074    6.685978 
## ------------------------------------------------------------ 
## Chick: 27
## (Intercept)        Time 
##   29.858569    7.379368 
## ------------------------------------------------------------ 
## Chick: 28
## (Intercept)        Time 
##   23.984874    9.703676 
## ------------------------------------------------------------ 
## Chick: 26
## (Intercept)        Time 
##    20.70715    10.10316 
## ------------------------------------------------------------ 
## Chick: 25
## (Intercept)        Time 
##    19.65119    11.30676 
## ------------------------------------------------------------ 
## Chick: 29
## (Intercept)        Time 
##    5.882771   12.453487 
## ------------------------------------------------------------ 
## Chick: 21
## (Intercept)        Time 
##    15.56330    15.47512 
## ------------------------------------------------------------ 
## Chick: 33
## (Intercept)        Time 
##   45.830283    5.855241 
## ------------------------------------------------------------ 
## Chick: 37
## (Intercept)        Time 
##   29.608834    6.677053 
## ------------------------------------------------------------ 
## Chick: 36
## (Intercept)        Time 
##    25.85403     9.99047 
## ------------------------------------------------------------ 
## Chick: 31
## (Intercept)        Time 
##    19.13099    10.02617 
## ------------------------------------------------------------ 
## Chick: 39
## (Intercept)        Time 
##    17.03661    10.73710 
## ------------------------------------------------------------ 
## Chick: 38
## (Intercept)        Time 
##    10.67282    12.06051 
## ------------------------------------------------------------ 
## Chick: 32
## (Intercept)        Time 
##    13.69173    13.18091 
## ------------------------------------------------------------ 
## Chick: 40
## (Intercept)        Time 
##    10.83830    13.44229 
## ------------------------------------------------------------ 
## Chick: 34
## (Intercept)        Time 
##    5.081682   15.000151 
## ------------------------------------------------------------ 
## Chick: 35
## (Intercept)        Time 
##    4.757979   17.258811 
## ------------------------------------------------------------ 
## Chick: 44
## (Intercept)        Time 
##   44.909091    6.354545 
## ------------------------------------------------------------ 
## Chick: 45
## (Intercept)        Time 
##   35.673121    7.686432 
## ------------------------------------------------------------ 
## Chick: 43
## (Intercept)        Time 
##   52.185751    8.318863 
## ------------------------------------------------------------ 
## Chick: 41
## (Intercept)        Time 
##   39.337922    8.159885 
## ------------------------------------------------------------ 
## Chick: 47
## (Intercept)        Time 
##   36.489790    8.374981 
## ------------------------------------------------------------ 
## Chick: 49
## (Intercept)        Time 
##   31.662986    9.717894 
## ------------------------------------------------------------ 
## Chick: 46
## (Intercept)        Time 
##   27.771744    9.738466 
## ------------------------------------------------------------ 
## Chick: 50
## (Intercept)        Time 
##    23.78218    11.33293 
## ------------------------------------------------------------ 
## Chick: 42
## (Intercept)        Time 
##    19.86507    11.83679 
## ------------------------------------------------------------ 
## Chick: 48
## (Intercept)        Time 
##    7.947663   13.714718
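    # as.table()

# Piping the result into as.table() keeps only the numeric values, which I find less readable, so that step is left disabled.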

Q4

Convert the script in the NCEA 2007 example into an R Markdown file and provide comments for each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.

Students’ learning in secondary schools is measured by the National Certificates of Educational Achievement (NCEA) in New Zealand. Students usually try to attain NCEA Level 1 in their third year of secondary schooling, Level 2 in their fourth year, and Level 3 in their fifth and final year of secondary school. The percentage of students who achieved each NCEA level is reported annually for all New Zealand secondary schools. The data set contains NCEA achievement percentages for 2007.

Source: Ministry of education - New Zealand

Column 1: School name; Column 2: Achievement percentage for Level 1; Column 3: Achievement percentage for Level 2; Column 4: Achievement percentage for Level 3

## input data
# read.table() reads a delimited text file into a data frame
# sep = ":"      fields are separated by colons
# quote = ""     disables quoting altogether, so a quote character inside a
#                field (e.g. the apostrophe in "Auckland Girls' Grammar School")
#                is read literally
# header = TRUE  the first line holds the column names
# as.is = TRUE   character columns stay character instead of becoming factors
dta4 <- read.table("NCEA2007.txt", sep = ":", quote = "", header = TRUE,
                   as.is = TRUE)
## Dimensions of the data: the first value is the number of rows, the second the number of columns
dim(dta4)
## [1] 88  4
## Structure of the data frame
str(dta4)
## 'data.frame':    88 obs. of  4 variables:
##  $ Name  : chr  "Al-Madinah School" "Alfriston College" "Ambury Park Centre for Riding Therapy" "Aorere College" ...
##  $ Level1: num  61.5 53.9 33.3 39.5 71.2 22.1 50.8 57.3 89.3 59.8 ...
##  $ Level2: num  75 44.1 20 50.2 78.9 30.8 34.8 49.8 89.7 65.7 ...
##  $ Level3: num  0 0 0 30.6 55.5 26.3 48.9 44.6 88.6 50.4 ...
## First six rows
head(dta4)
##                                    Name Level1 Level2 Level3
## 1                     Al-Madinah School   61.5   75.0    0.0
## 2                     Alfriston College   53.9   44.1    0.0
## 3 Ambury Park Centre for Riding Therapy   33.3   20.0    0.0
## 4                        Aorere College   39.5   50.2   30.6
## 5        Auckland Girls' Grammar School   71.2   78.9   55.5
## 6                      Auckland Grammar   22.1   30.8   26.3

apply

## apply(X, MARGIN, FUN)
## apply() calls FUN on slices of X: MARGIN = 1 works row by row,
## MARGIN = 2 works column by column.


# Means of Level1 to Level3 in a single call
apply(X = dta4[, -1], MARGIN = 2, FUN = mean)
##   Level1   Level2   Level3 
## 62.26705 61.06818 47.97614
# Alternative: colMeans()
colMeans(dta4[, -1])
##   Level1   Level2   Level3 
## 62.26705 61.06818 47.97614

lapply

## list apply
# lapply() applies FUN to each element of a list (here, each column of the
# data frame) and returns a list of the same length.

lapply(dta4[, -1], FUN=mean)
## $Level1
## [1] 62.26705
## 
## $Level2
## [1] 61.06818
## 
## $Level3
## [1] 47.97614

sapply

## sapply() is a wrapper around lapply() that simplifies the result to a vector or matrix where possible instead of returning a list
## i.e. a simplifying list apply
sapply(dta4[, -1], FUN=mean)
##   Level1   Level2   Level3 
## 62.26705 61.06818 47.97614
## Range of Level1 to Level3
apply(dta4[, -1], MARGIN=2, FUN=range)
##      Level1 Level2 Level3
## [1,]    2.8    0.0    0.0
## [2,]   97.4   95.7   95.7
## The same ranges with lapply(): returned as a list
lapply(dta4[, -1], FUN=range)
## $Level1
## [1]  2.8 97.4
## 
## $Level2
## [1]  0.0 95.7
## 
## $Level3
## [1]  0.0 95.7
## 
## The same ranges with sapply(): simplified to a matrix
sapply(dta4[, -1], FUN=range)
##      Level1 Level2 Level3
## [1,]    2.8    0.0    0.0
## [2,]   97.4   95.7   95.7

splitting (for Q2)

## split(x, f) divides x (a vector or data frame) into the groups defined by f
## (a factor, or something coercible to one) and returns them as a list

# Split the rolls by school authority type and keep the pieces as rollsByAuth
rollsByAuth <- with(dta2, split(Roll, Auth))
## data structure
str(rollsByAuth)
## List of 4
##  $ Other           : int 51
##  $ Private         : int [1:99] 255 39 154 73 83 25 95 85 94 729 ...
##  $ State           : int [1:2144] 318 200 455 86 577 329 637 395 201 267 ...
##  $ State Integrated: int [1:327] 438 26 191 560 151 114 126 171 211 57 ...
## 
# Class of the object returned by split()
class(rollsByAuth)
## [1] "list"
## Mean roll per authority type: lapply() applies mean to each group of rolls
## produced by split() and returns the results as a list
lapply(X = split(x = dta2$Roll, f = dta2$Auth), FUN = mean)
## $Other
## [1] 51
## 
## $Private
## [1] 308.798
## 
## $State
## [1] 300.6301
## 
## $`State Integrated`
## [1] 258.3792
# Mean roll per authority type again: sapply() applies mean to each group of
# rolls produced by split() and simplifies the results to a named vector

sapply(X = split(x = dta2$Roll, f = dta2$Auth), FUN = mean)
##            Other          Private            State State Integrated 
##          51.0000         308.7980         300.6301         258.3792