Explain what this statement does: lapply(lapply(search(), ls), length)
search() # lists the environments and attached packages on the current search path
## [1] ".GlobalEnv" "package:stats" "package:graphics"
## [4] "package:grDevices" "package:utils" "package:datasets"
## [7] "package:methods" "Autoloads" "package:base"
# lapply(X, FUN, ...) applies a function over a list or vector and returns a list
#lapply(search(), ls) # lists the names of the objects in each search-path entry
lapply(lapply(search(), ls), length) # counts the objects in each search-path entry (length() is the number of elements)
## [[1]]
## [1] 0
##
## [[2]]
## [1] 449
##
## [[3]]
## [1] 87
##
## [[4]]
## [1] 113
##
## [[5]]
## [1] 247
##
## [[6]]
## [1] 104
##
## [[7]]
## [1] 203
##
## [[8]]
## [1] 0
##
## [[9]]
## [1] 1255
Convert the R script in the NZ schools example into an R Markdown file and provide comments for each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.
The New Zealand Ministry of Education provides basic information for all primary and secondary schools in the country.
Source: Ministry of education - New Zealand
Column 1: School ID; Column 2: School name; Column 3: City where the school is located; Column 4: The authority of the school; Column 5: The socio-economic status of the families of the students of the school; Column 6: The number of students enrolled at the school as of July 2007
## Read the school data. as.is = 2 keeps column 2 (Name) as character,
## so the school names (white space included) are not turned into a factor.
dta2 <- read.csv("nzSchools.csv", as.is = 2) # as.is is specified per column
head(dta2)
## ID Name City Auth Dec Roll
## 1 1015 Hora Hora School Whangarei State 2 318
## 2 1052 Morningside School Whangarei State 3 200
## 3 1062 Onerahi School Whangarei State 4 455
## 4 1092 Raurimu Avenue School Whangarei State 2 86
## 5 1130 Whangarei School Whangarei State 4 577
## 6 1018 Hurupaki School Whangarei State 8 329
## data structure
str(dta2)
## 'data.frame': 2571 obs. of 6 variables:
## $ ID : int 1015 1052 1062 1092 1130 1018 1029 1030 1588 1154 ...
## $ Name: chr "Hora Hora School" "Morningside School" "Onerahi School" "Raurimu Avenue School" ...
## $ City: Factor w/ 541 levels "Ahaura","Ahipara",..: 533 533 533 533 533 533 533 533 533 533 ...
## $ Auth: Factor w/ 4 levels "Other","Private",..: 3 3 3 3 3 3 3 3 4 3 ...
## $ Dec : int 2 3 4 2 4 8 5 5 6 1 ...
## $ Roll: int 318 200 455 86 577 329 637 395 438 201 ...
## dim() returns the dimensions of the data.frame: rows first, then columns
dim(dta2)
## [1] 2571 6
2571 obs. of 6 variables
## Recode enrolment into a new variable Size: "Large" when Roll is above
## the median, "Small" otherwise
dta2$Size <- ifelse(dta2$Roll > median(dta2$Roll), "Large", "Small")
# Alternative: cut(). The intervals are (min, median] and (median, max].
# include.lowest = TRUE closes the first interval on the left so that the
# school with the minimum Roll is coded "Small" instead of NA.
dta2$Size2 <- cut(dta2$Roll,
                  breaks = c(min(dta2$Roll), median(dta2$Roll), max(dta2$Roll)),
                  labels = c("Small", "Large"),
                  include.lowest = TRUE)
## Drop the variables Size and Size2 again
dta2$Size <- NULL
dta2$Size2 <- NULL
## Show the first six rows of the data.frame
head(dta2)
## ID Name City Auth Dec Roll
## 1 1015 Hora Hora School Whangarei State 2 318
## 2 1052 Morningside School Whangarei State 3 200
## 3 1062 Onerahi School Whangarei State 4 455
## 4 1092 Raurimu Avenue School Whangarei State 2 86
## 5 1130 Whangarei School Whangarei State 4 577
## 6 1018 Hurupaki School Whangarei State 8 329
## cut() recodes a numeric variable into intervals
# Split the range of Roll into 3 equal-width intervals and label them.
# NOTE: the label "Mediam" (sic) is kept as is because the recorded output uses it.
dta2$Size <- cut(dta2$Roll, 3, labels = c("Small", "Mediam", "Large"))
str(dta2$Size)
## Factor w/ 3 levels "Small","Mediam",..: 1 1 1 1 1 1 1 1 1 1 ...
## table() builds a contingency table of counts for the categories it is given
table(dta2$Size) # count the schools in each of the three Size categories
##
## Small Mediam Large
## 2555 15 1
## order() returns the permutation that sorts a vector; decreasing = TRUE
## sorts from largest to smallest
## Store the ordering of the schools by enrolment in a new variable RollOrd
dta2$RollOrd <- order(dta2$Roll, decreasing = TRUE) # element positions, not ranks
head(dta2$RollOrd)
## [1] 1726 301 376 2307 615 199
tail(dta2$RollOrd)
## [1] 2401 1590 1996 2112 1514 1575
數值最大的元素為原數列第1726個,然後是第301個、第376個…如此類推,最小值六個是在原數列位置的第2401、1590、1996、2112、1514、1575個。
## Inspect the first six rows of the reordered data (the last six follow below) to check the interpretation of order()
head(dta2[dta2$RollOrd, ])
## ID Name City Auth Dec Roll Size RollOrd
## 1726 498 Correspondence School Wellington State NA 5546 Large 753
## 301 28 Rangitoto College Auckland State 10 3022 Mediam 353
## 376 78 Avondale College Auckland State 4 2613 Mediam 712
## 2307 319 Burnside High School Christchurch State 8 2588 Mediam 709
## 615 41 Macleans College Auckland State 10 2476 Mediam 1915
## 199 43 Massey High School Auckland State 5 2452 Mediam 1683
tail(dta2[dta2$RollOrd, ])
## ID Name City Auth Dec Roll Size
## 2401 1641 Amana Christian School Dunedin Private 9 7 Small
## 1590 2461 Tangimoana School Manawatu State 4 6 Small
## 1996 3598 Woodbank School Kaikoura State 4 6 Small
## 2112 3386 Jacobs River School Jacobs River State 5 6 Small
## 1514 2407 Ngamatapouri School Sth Taranaki District State 9 5 Small
## 1575 2420 Papanui Junction School Taihape State 5 5 Small
## RollOrd
## 2401 2562
## 1590 266
## 1996 2478
## 2112 1501
## 1514 2377
## 1575 1542
學生註冊人數(Roll)最多的學校為Correspondence School,有5546個學生,在原數列的位置是第1726個。
## First six schools after sorting by city name (Z to A) and, within a city,
## by enrolment (largest first)
head(dta2[order(dta2$City, dta2$Roll, decreasing = TRUE), ])
## ID Name City Auth Dec Roll Size RollOrd
## 2548 401 Menzies College Wyndham State 4 356 Small 859
## 2549 4054 Wyndham School Wyndham State 5 94 Small 1163
## 1611 2742 Woodville School Woodville State 3 147 Small 726
## 1630 2640 Papatawa School Woodville State 7 27 Small 2273
## 2041 3600 Woodend School Woodend State 9 375 Small 1401
## 1601 399 Central Southland College Winton State 7 549 Small 450
# The rows shown above are identical to those obtained by sorting on City alone.
# Roll is only used to break ties within a city; for these rows the tie order
# happens to match, so the output is the same.
head(dta2[order(dta2$City, decreasing = TRUE), ]) # sort by city name, Z to A
## ID Name City Auth Dec Roll Size RollOrd
## 2548 401 Menzies College Wyndham State 4 356 Small 859
## 2549 4054 Wyndham School Wyndham State 5 94 Small 1163
## 1611 2742 Woodville School Woodville State 3 147 Small 726
## 1630 2640 Papatawa School Woodville State 7 27 Small 2273
## 2041 3600 Woodend School Woodend State 9 375 Small 1401
## 1601 399 Central Southland College Winton State 7 549 Small 450
## Last six schools after sorting by city name (Z to A) and, within a city,
## by enrolment (largest first)
tail(dta2[order(dta2$City, dta2$Roll, decreasing = TRUE), ])
## ID Name City Auth Dec Roll Size RollOrd
## 2169 3273 Albury School Albury State 8 30 Small 1010
## 2018 350 Akaroa Area School Akaroa State 8 125 Small 1051
## 2023 3332 Duvauchelle School Akaroa State 9 41 Small 749
## 335 1200 Ahuroa School Ahuroa State 7 22 Small 193
## 99 1000 Ahipara School Ahipara State 3 241 Small 1963
## 2117 2105 Awahono School - Grey Valley Ahaura State 4 119 Small 364
## table() produces a contingency table
table(dta2$Auth) # count the schools of each authority type
##
## Other Private State State Integrated
## 1 99 2144 327
私立學校有99所,州立學校2144所,State Integrated(州立整合型?)有327所,其他類1所
## Store the table in an object named authtbl and print it
authtbl <- table(dta2$Auth)
authtbl
##
## Other Private State State Integrated
## 1 99 2144 327
## class() reports the class of an object
class(authtbl)
## [1] "table"
authtbl這個object是table
## Find the single school record whose Auth is "Other"
dta2[dta2$Auth == "Other", ]
## ID Name City Auth Dec Roll Size RollOrd
## 2315 518 Kingslea School Christchurch Other 1 51 Small 1579
## xtabs() builds a contingency table from a data.frame using a model formula
xtabs(~ Auth + Dec, data=dta2) # the first variable forms the rows, the second the columns
## Dec
## Auth 1 2 3 4 5 6 7 8 9 10
## Other 1 0 0 0 0 0 0 0 0 0
## Private 0 0 2 6 2 2 6 11 12 38
## State 259 230 208 219 214 215 188 200 205 205
## State Integrated 12 22 35 28 38 34 45 45 37 31
xtabs(~ Dec + Auth, data=dta2) # swapping the variables swaps rows and columns
## Auth
## Dec Other Private State State Integrated
## 1 1 0 259 12
## 2 0 0 230 22
## 3 0 2 208 35
## 4 0 6 219 28
## 5 0 2 214 38
## 6 0 2 215 34
## 7 0 6 188 45
## 8 0 11 200 45
## 9 0 12 205 37
## 10 0 38 205 31
## Mean enrolment over all schools
mean(dta2$Roll)
## [1] 295.4737
## Mean enrolment of the private schools only
with(dta2, mean(Roll[Auth == "Private"]))
## [1] 308.798
## aggregate(): Compute Summary Statistics of Data Subsets
# Mean enrolment computed separately for each school authority type (Auth)
aggregate(dta2["Roll"], by=list(dta2$Auth), FUN=mean)
## Group.1 Roll
## 1 Other 51.0000
## 2 Private 308.7980
## 3 State 300.6301
## 4 State Integrated 258.3792
私立學校平均註冊人數略高於州立學校。
## Dec is the socio-economic status of the students' families; add a logical
## variable Rich that is TRUE when Dec > 5
dta2$Rich <- dta2$Dec > 5
head(dta2$Rich)
## [1] FALSE FALSE FALSE FALSE FALSE TRUE
## Mean enrolment by school authority type (Auth) and family socio-economic group (Rich)
aggregate(dta2["Roll"], by=list(dta2$Auth, dta2$Rich), FUN=mean)
## Group.1 Group.2 Roll
## 1 Other FALSE 51.0000
## 2 Private FALSE 151.4000
## 3 State FALSE 261.7487
## 4 State Integrated FALSE 183.2370
## 5 Private TRUE 402.5362
## 6 State TRUE 338.8243
## 7 State Integrated TRUE 311.2135
## by(data, INDICES, FUN, ..., simplify = TRUE)
# by() splits the data frame into subsets according to the INDICES factor(s) and applies FUN to each subset.
# Range of enrolment for each school authority type
by(dta2["Roll"], INDICES=list(dta2$Auth), FUN=range)
## : Other
## [1] 51 51
## ------------------------------------------------------------
## : Private
## [1] 7 1663
## ------------------------------------------------------------
## : State
## [1] 5 5546
## ------------------------------------------------------------
## : State Integrated
## [1] 18 1475
# Second approach: with() and tapply()
with(dta2,tapply(Roll,list(Auth),range))
## $Other
## [1] 51 51
##
## $Private
## [1] 7 1663
##
## $State
## [1] 5 5546
##
## $`State Integrated`
## [1] 18 1475
State Integrated州立整合型學校註冊人數範圍在18至1475人; State州立學校註冊人數範圍在5至5546人; Private私立學校註冊人數範圍在7至1663人; Other其他類學校註冊人數範圍在51人
Split the ChickWeight{datasets} data by individual chicks to extract separate slope estimates of regressing weight onto Time for each chick.
# take a look at the dataset
# ?ChickWeight
head(ChickWeight)
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
tail(ChickWeight)
## weight Time Chick Diet
## 573 155 12 50 4
## 574 175 14 50 4
## 575 205 16 50 4
## 576 234 18 50 4
## 577 264 20 50 4
## 578 264 21 50 4
names(ChickWeight)
## [1] "weight" "Time" "Chick" "Diet"
library(magrittr)
# Fit weight ~ Time separately for each chick and extract the coefficients;
# the Time coefficient is the slope estimate for that chick
with(ChickWeight, by(ChickWeight, Chick,
  function(x) {
    lm(weight ~ Time, data = x) %>%
      coef()
    # %>% as.table()  (disabled)
  }))
## Chick: 18
## (Intercept) Time
## 39 -2
## ------------------------------------------------------------
## Chick: 16
## (Intercept) Time
## 43.392857 1.053571
## ------------------------------------------------------------
## Chick: 15
## (Intercept) Time
## 46.83333 1.89881
## ------------------------------------------------------------
## Chick: 13
## (Intercept) Time
## 43.384359 2.239601
## ------------------------------------------------------------
## Chick: 9
## (Intercept) Time
## 52.094086 2.663137
## ------------------------------------------------------------
## Chick: 20
## (Intercept) Time
## 37.667826 3.732718
## ------------------------------------------------------------
## Chick: 10
## (Intercept) Time
## 38.695054 4.066102
## ------------------------------------------------------------
## Chick: 8
## (Intercept) Time
## 43.727273 4.827273
## ------------------------------------------------------------
## Chick: 17
## (Intercept) Time
## 43.030706 4.531538
## ------------------------------------------------------------
## Chick: 19
## (Intercept) Time
## 31.21222 5.08743
## ------------------------------------------------------------
## Chick: 4
## (Intercept) Time
## 32.86568 6.08864
## ------------------------------------------------------------
## Chick: 6
## (Intercept) Time
## 44.123431 6.378006
## ------------------------------------------------------------
## Chick: 11
## (Intercept) Time
## 47.921948 7.510967
## ------------------------------------------------------------
## Chick: 3
## (Intercept) Time
## 23.17955 8.48737
## ------------------------------------------------------------
## Chick: 1
## (Intercept) Time
## 24.465436 7.987899
## ------------------------------------------------------------
## Chick: 12
## (Intercept) Time
## 21.939797 8.440629
## ------------------------------------------------------------
## Chick: 2
## (Intercept) Time
## 24.724853 8.719861
## ------------------------------------------------------------
## Chick: 5
## (Intercept) Time
## 16.89563 10.05536
## ------------------------------------------------------------
## Chick: 14
## (Intercept) Time
## 20.52488 11.98245
## ------------------------------------------------------------
## Chick: 7
## (Intercept) Time
## 5.842535 13.205264
## ------------------------------------------------------------
## Chick: 24
## (Intercept) Time
## 53.067766 1.207533
## ------------------------------------------------------------
## Chick: 30
## (Intercept) Time
## 39.109666 5.898351
## ------------------------------------------------------------
## Chick: 22
## (Intercept) Time
## 40.082590 5.877931
## ------------------------------------------------------------
## Chick: 23
## (Intercept) Time
## 38.428074 6.685978
## ------------------------------------------------------------
## Chick: 27
## (Intercept) Time
## 29.858569 7.379368
## ------------------------------------------------------------
## Chick: 28
## (Intercept) Time
## 23.984874 9.703676
## ------------------------------------------------------------
## Chick: 26
## (Intercept) Time
## 20.70715 10.10316
## ------------------------------------------------------------
## Chick: 25
## (Intercept) Time
## 19.65119 11.30676
## ------------------------------------------------------------
## Chick: 29
## (Intercept) Time
## 5.882771 12.453487
## ------------------------------------------------------------
## Chick: 21
## (Intercept) Time
## 15.56330 15.47512
## ------------------------------------------------------------
## Chick: 33
## (Intercept) Time
## 45.830283 5.855241
## ------------------------------------------------------------
## Chick: 37
## (Intercept) Time
## 29.608834 6.677053
## ------------------------------------------------------------
## Chick: 36
## (Intercept) Time
## 25.85403 9.99047
## ------------------------------------------------------------
## Chick: 31
## (Intercept) Time
## 19.13099 10.02617
## ------------------------------------------------------------
## Chick: 39
## (Intercept) Time
## 17.03661 10.73710
## ------------------------------------------------------------
## Chick: 38
## (Intercept) Time
## 10.67282 12.06051
## ------------------------------------------------------------
## Chick: 32
## (Intercept) Time
## 13.69173 13.18091
## ------------------------------------------------------------
## Chick: 40
## (Intercept) Time
## 10.83830 13.44229
## ------------------------------------------------------------
## Chick: 34
## (Intercept) Time
## 5.081682 15.000151
## ------------------------------------------------------------
## Chick: 35
## (Intercept) Time
## 4.757979 17.258811
## ------------------------------------------------------------
## Chick: 44
## (Intercept) Time
## 44.909091 6.354545
## ------------------------------------------------------------
## Chick: 45
## (Intercept) Time
## 35.673121 7.686432
## ------------------------------------------------------------
## Chick: 43
## (Intercept) Time
## 52.185751 8.318863
## ------------------------------------------------------------
## Chick: 41
## (Intercept) Time
## 39.337922 8.159885
## ------------------------------------------------------------
## Chick: 47
## (Intercept) Time
## 36.489790 8.374981
## ------------------------------------------------------------
## Chick: 49
## (Intercept) Time
## 31.662986 9.717894
## ------------------------------------------------------------
## Chick: 46
## (Intercept) Time
## 27.771744 9.738466
## ------------------------------------------------------------
## Chick: 50
## (Intercept) Time
## 23.78218 11.33293
## ------------------------------------------------------------
## Chick: 42
## (Intercept) Time
## 19.86507 11.83679
## ------------------------------------------------------------
## Chick: 48
## (Intercept) Time
## 7.947663 13.714718
# Note on the disabled as.table() step: it keeps only the numeric values, which I find less clear
Convert the script in the NCEA 2007 example into an R Markdown file and provide comments for each code chunk indicated by ‘##’. Give alternative code to perform the same calculation where appropriate.
Students’ learning in secondary schools is measured by the National Certificates of Educational Achievement (NCEA) in New Zealand. Students usually try to attain NCEA Level 1 in their third year of secondary schooling, Level 2 in their fourth year, and Level 3 in their fifth and final year of secondary school. The percentage of students who achieved each NCEA level is reported annually for all New Zealand secondary schools. The data set contains NCEA achievement percentages for 2007.
Source: Ministry of education - New Zealand
Column 1: School name; Column 2: Achievement percentages for Level 1; Column 3: Achievement percentages for Level 2; Column 4: Achievement percentages for Level 3
## input data
# read.table() reads most delimited ASCII text files
# sep = ":" sets the colon as the field separator
# quote = "" disables quoting altogether, so the apostrophe in names such as
#   "Auckland Girls' Grammar School" is read as ordinary text
# header = TRUE takes the column names from the first line;
# as.is = TRUE keeps character columns as character
dta4 <- read.table("NCEA2007.txt", sep = ":", quote = "",
                   header = TRUE, as.is = TRUE)
## Dimensions of the data: the first value is the number of rows, the second the number of columns
dim(dta4)
## [1] 88 4
##
str(dta4)
## 'data.frame': 88 obs. of 4 variables:
## $ Name : chr "Al-Madinah School" "Alfriston College" "Ambury Park Centre for Riding Therapy" "Aorere College" ...
## $ Level1: num 61.5 53.9 33.3 39.5 71.2 22.1 50.8 57.3 89.3 59.8 ...
## $ Level2: num 75 44.1 20 50.2 78.9 30.8 34.8 49.8 89.7 65.7 ...
## $ Level3: num 0 0 0 30.6 55.5 26.3 48.9 44.6 88.6 50.4 ...
##
head(dta4)
## Name Level1 Level2 Level3
## 1 Al-Madinah School 61.5 75.0 0.0
## 2 Alfriston College 53.9 44.1 0.0
## 3 Ambury Park Centre for Riding Therapy 33.3 20.0 0.0
## 4 Aorere College 39.5 50.2 30.6
## 5 Auckland Girls' Grammar School 71.2 78.9 55.5
## 6 Auckland Grammar 22.1 30.8 26.3
## apply(data matrix, MARGIN, FUN)
## apply() calls FUN over one margin of the data set X: MARGIN = 1 works by row, MARGIN = 2 by column.
# Means of Level 1-3 in a single call
apply(dta4[, -1], MARGIN=2, FUN=mean)
## Level1 Level2 Level3
## 62.26705 61.06818 47.97614
# Second approach
colMeans(dta4[, -1])
## Level1 Level2 Level3
## 62.26705 61.06818 47.97614
## list apply
# lapply() applies a function to each element of a list and returns a list of the same length as the input.
lapply(dta4[, -1], FUN=mean)
## $Level1
## [1] 62.26705
##
## $Level2
## [1] 61.06818
##
## $Level3
## [1] 47.97614
## sapply() is a wrapper around lapply() that simplifies the result to a vector or matrix instead of a list where possible.
## simplify the list apply
sapply(dta4[, -1], FUN=mean)
## Level1 Level2 Level3
## 62.26705 61.06818 47.97614
## Range of Level 1-3
apply(dta4[, -1], MARGIN=2, FUN=range)
## Level1 Level2 Level3
## [1,] 2.8 0.0 0.0
## [2,] 97.4 95.7 95.7
##
lapply(dta4[, -1], FUN=range)
## $Level1
## [1] 2.8 97.4
##
## $Level2
## [1] 0.0 95.7
##
## $Level3
## [1] 0.0 95.7
##
sapply(dta4[, -1], FUN=range)
## Level1 Level2 Level3
## [1,] 2.8 0.0 0.0
## [2,] 97.4 95.7 95.7
## split(x, f) divides x (a vector or data frame) into the groups defined by f
## (a factor, vector or list)
# Split dta2$Roll by school authority type (Auth) and name the result rollsByAuth
rollsByAuth <- split(dta2$Roll, dta2$Auth)
## data structure
str(rollsByAuth)
## List of 4
## $ Other : int 51
## $ Private : int [1:99] 255 39 154 73 83 25 95 85 94 729 ...
## $ State : int [1:2144] 318 200 455 86 577 329 637 395 201 267 ...
## $ State Integrated: int [1:327] 438 26 191 560 151 114 126 171 211 57 ...
##
class(rollsByAuth)
## [1] "list"
## lapply() applies mean to each element of split(dta2$Roll, dta2$Auth), i.e. the mean enrolment for each authority type, returned as a list
lapply(split(dta2$Roll, dta2$Auth), mean)
## $Other
## [1] 51
##
## $Private
## [1] 308.798
##
## $State
## [1] 300.6301
##
## $`State Integrated`
## [1] 258.3792
# sapply() applies mean to each element of split(dta2$Roll, dta2$Auth), i.e. the mean enrolment for each authority type, simplified to a named vector
sapply(split(dta2$Roll, dta2$Auth), mean)
## Other Private State State Integrated
## 51.0000 308.7980 300.6301 258.3792