The math attainment page has a dataset and a script of R code chunks. Generate a markdown file from the script to push the output in HTML for posting to course Moodle site.
# first R session using math attainment data set
# read in a plain text file with variable names and assign a name to it
<- read.table("math_attainment.txt", header = T) dta
# structure of data
str(dta)
## 'data.frame': 39 obs. of 3 variables:
## $ math2: int 28 56 51 13 39 41 30 13 17 32 ...
## $ math1: int 18 22 44 8 20 12 16 5 9 18 ...
## $ cc : num 328 406 387 167 328 ...
The structure of data is data.frame
# first 6 rows
head(dta)
## math2 math1 cc
## 1 28 18 328.20
## 2 56 22 406.03
## 3 51 44 386.94
## 4 13 8 166.91
## 5 39 20 328.20
## 6 41 12 328.20
view first 6 rows
# variable mean
colMeans(dta) #column mean
## math2 math1 cc
## 28.76923 15.35897 188.83667
rowMeans(dta) #row mean
## [1] 124.73333 161.34333 160.64667 62.63667 129.06667 127.06667 68.76000
## [8] 37.64667 41.40000 127.20667 126.73333 89.12000 79.45333 81.12000
## [15] 84.03667 56.08333 54.41667 90.51333 78.70333 24.24000 54.02667
## [22] 63.46333 37.75333 53.94000 85.14333 51.60667 64.75000 59.71667
## [29] 47.15667 57.60667 63.22333 79.47667 62.71667 74.38000 74.14333
## [36] 84.47667 81.89333 72.87000 55.27333
# variable sd
apply(dta, 2, sd) #2是column, 1是row
## math2 math1 cc
## 10.720029 7.744224 84.842513
apply(dta, 1, sd) #1是row
## [1] 176.27823 212.58569 196.00703 90.33795 172.71599 174.78905 79.56716
## [8] 49.77845 49.35261 177.16548 174.57839 88.90311 97.84977 95.99293
## [15] 110.24053 65.25160 66.90681 98.39345 114.75580 19.88901 70.28145
## [22] 75.60545 41.73323 65.74444 89.53168 68.26598 69.43477 70.86989
## [29] 59.23710 62.77997 80.28568 93.97176 68.33806 99.43034 98.26318
## [36] 89.24988 93.39419 95.91538 64.62897
# correlation matrix
cor(dta)
## math2 math1 cc
## math2 1.0000000 0.7443604 0.6570098
## math1 0.7443604 1.0000000 0.5956771
## cc 0.6570098 0.5956771 1.0000000
# specify square plot region
par(pty="m")
pty(plot type)設置作圖類型。= “m”:最大化作圖;= “s”:方形圖。
# scatter plot of math2 by math1
plot(math2 ~ math1, data=dta, xlim=c(0, 60), ylim=c(0, 60),
xlab="Math score at Year 1", ylab="Math score at Year 2")
# add grid lines
grid()
# regression math2 by math1
<- lm(math2 ~ math1, data=dta)
dta.lm
# show results
summary(dta.lm)
##
## Call:
## lm(formula = math2 ~ math1, data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.430 -5.521 -0.369 4.253 20.388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.944 2.607 4.965 1.57e-05 ***
## math1 1.030 0.152 6.780 5.57e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.255 on 37 degrees of freedom
## Multiple R-squared: 0.5541, Adjusted R-squared: 0.542
## F-statistic: 45.97 on 1 and 37 DF, p-value: 5.571e-08
# show anova table
anova(dta.lm)
## Analysis of Variance Table
##
## Response: math2
## Df Sum Sq Mean Sq F value Pr(>F)
## math1 1 2419.6 2419.59 45.973 5.571e-08 ***
## Residuals 37 1947.3 52.63
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
why we need to show both regression and anova results??
如何讓p值可以不要是科學符號
# add regression line and title
plot(math2 ~ math1, data=dta, xlim=c(0, 60), ylim=c(0, 60),
xlab="Math score at Year 1", ylab="Math score at Year 2")+abline(dta.lm, lty=2)+title("Mathematics Attainment")
## integer(0)
# specify maximum plot region
par(pty="m")
# only setup plot region
plot(scale(resid(dta.lm)) ~ fitted(dta.lm),
ylim=c(-3.5, 3.5), type="n",
xlab="Fitted values", ylab="Standardized residuals")
# give the value and lable
text(fitted(dta.lm), scale(resid(dta.lm)), labels=rownames(dta), cex=0.5)
# 格線
grid()
# add a horizontal red dash line
abline(h=0, lty=2, col="red")
scale 為centers and/or scales the columns of a numeric matrix,將殘差變為標準化殘差 lty 為linentype (0=blank, 1=solid (default), 2=dashed, 3=dotted, 4=dotdash, 5=longdash, 6=twodash)
# normal QQ plot of the values
qqnorm(scale(resid(dta.lm)))
# adds a line to a “theoretical”, by default normal
qqline(scale(resid(dta.lm)))
grid()
why we need to show both regression and anova results??
如何讓p值可以不要是科學符號
The notation, women{datasets}, indicates that a data object by the name women is in the datasets package. This package is preloaded when R is invoked. Explain the difference between c(women) and c(as.matrix(women)) using the women{datasets}
# first 6 rows
head(women)
## height weight
## 1 58 115
## 2 59 117
## 3 60 120
## 4 61 123
## 5 62 126
## 6 63 129
# 資料結構
class(women)
## [1] "data.frame"
# type of object(數據的細節)
typeof(women)
## [1] "list"
#
str(women)
## 'data.frame': 15 obs. of 2 variables:
## $ height: num 58 59 60 61 62 63 64 65 66 67 ...
## $ weight: num 115 117 120 123 126 129 132 135 139 142 ...
women是一個data.frame
c(women)
## $height
## [1] 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
##
## $weight
## [1] 115 117 120 123 126 129 132 135 139 142 146 150 154 159 164
class(c(women))
## [1] "list"
typeof(c(women))
## [1] "list"
c(women)意味著,women這個資料由兩個list組成,分別是height與weight
c(as.matrix(women))
## [1] 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 115 117 120 123
## [20] 126 129 132 135 139 142 146 150 154 159 164
class(c(as.matrix(women)))
## [1] "numeric"
typeof(c(as.matrix(women)))
## [1] "double"
as.matrix(women)是將women從data.frame轉成matrix的型態
c(as.matrix(women))把matrix中的所有數值取出來變成一串vector,因此可以看到身高和體重的數值被合併起來,變成30個elements in vector
what’s different between list and vector?
# list example
<- list(gender="female", age=18, hobby=c("eat", "sleep")) Jenny
list 允許不同typeof的資料放一起
vector only allows all elements of vector containing single data type
# vector example
<- c(5,10,15,20,25) # 建立一個number vector
a <- c("Tom", "Henry", "John") # 建立一個character vector
b
<- c(1, "john", 3) # 若是把number和character同時放入vector裡,R會自動將所有element的型態,轉變成character c
different typeof of Matrix depends on vector
Matrix是vector的一種新的排列形式,雖然它的class自成一格為matrix,但它的typeof依然會是它架構根源的atomic vector形式。
<- matrix(
Mat1 c("a","b","c","d"), 2, 2
) Mat1
## [,1] [,2]
## [1,] "a" "c"
## [2,] "b" "d"
class(Mat1)
## [1] "matrix" "array"
typeof(Mat1)
## [1] "character"
在這個範例中,Mat1是一個matrix但是他的type是character
<- matrix(
Mat2 c(1,1,1,4), 2, 2
) Mat2
## [,1] [,2]
## [1,] 1 1
## [2,] 1 4
class(Mat2)
## [1] "matrix" "array"
typeof(Mat2)
## [1] "double"
在這個範例中,Mat2是一個matrix但是他的type是double
What’s different between matrix and data.frame?
#
<-matrix(letters, 2, 13)
aa aa
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] "a" "c" "e" "g" "i" "k" "m" "o" "q" "s" "u" "w" "y"
## [2,] "b" "d" "f" "h" "j" "l" "n" "p" "r" "t" "v" "x" "z"
typeof(aa)
## [1] "character"
rownames(aa)
## NULL
colnames(aa)
## NULL
#
<-data.frame(letters, 2, 13)
bb bb
## letters X2 X13
## 1 a 2 13
## 2 b 2 13
## 3 c 2 13
## 4 d 2 13
## 5 e 2 13
## 6 f 2 13
## 7 g 2 13
## 8 h 2 13
## 9 i 2 13
## 10 j 2 13
## 11 k 2 13
## 12 l 2 13
## 13 m 2 13
## 14 n 2 13
## 15 o 2 13
## 16 p 2 13
## 17 q 2 13
## 18 r 2 13
## 19 s 2 13
## 20 t 2 13
## 21 u 2 13
## 22 v 2 13
## 23 w 2 13
## 24 x 2 13
## 25 y 2 13
## 26 z 2 13
c(bb)
## $letters
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
## [20] "t" "u" "v" "w" "x" "y" "z"
##
## $X2
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## $X13
## [1] 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13
## [26] 13
typeof(bb)
## [1] "list"
rownames(bb)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26"
colnames(bb)
## [1] "letters" "X2" "X13"
根據結果,matrix是character type,而且他沒有rownames與colnames(NULL) data.frame是3個list,他有rownames與colnames
Use help to examine the coding scheme for the mother’s race variable in the birthwt{MASS} dataset. The MASS comes with the base R installation but is not automatically loaded when R is invoked.
How many black mothers are there in this data frame?
What does the following R command do? > c(“White”, “Black”, “Other”)[birthwt$race]
讀取MASS package中的birthwt資料
data(birthwt, package = "MASS")
str(birthwt)
## 'data.frame': 189 obs. of 10 variables:
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
class(birthwt$race)
## [1] "integer"
race是integer
factor(birthwt$race)
## [1] 2 3 1 1 1 3 1 3 1 1 3 3 3 3 1 1 2 1 3 1 3 1 1 3 3 1 1 1 2 2 2 1 2 1 2 1 1
## [38] 1 1 1 2 1 2 1 1 1 1 3 1 3 1 3 1 1 3 3 3 3 3 3 3 3 3 1 3 3 3 3 1 2 1 3 3 2
## [75] 1 2 1 1 2 1 1 1 3 3 3 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 3 1 3 2 1 1 1 2 1
## [112] 3 1 1 1 3 1 3 1 3 1 3 1 1 1 1 1 1 1 1 3 1 2 3 3 3 3 2 3 1 1 1 3 3 1 1 2 1
## [149] 3 3 3 1 1 1 1 3 2 1 2 3 1 3 3 3 2 1 3 3 1 1 2 2 2 3 3 1 1 1 1 2 3 3 1 3 1
## [186] 3 3 2 1
## Levels: 1 2 3
there are three levels.
#
$race_c<-c("White", "Black", "Other")[birthwt$race]
birthwtclass(birthwt$race_c)
## [1] "character"
typeof(birthwt$race_c)
## [1] "character"
將interger 1,2,3 依序lable 為 “White”, “Black”, “Other”
此時class與type是character
#
$race_f<-factor(birthwt$race, levels = 1:3, labels = c("White", "Black", "Other"))
birthwtclass(birthwt$race_f)
## [1] "factor"
typeof(birthwt$race_f)
## [1] "integer"
使用factor的方式去lable,race_f的class為factor, type是integer(代表的是其levels)
str(birthwt)
## 'data.frame': 189 obs. of 12 variables:
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke : int 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
## $ race_c: chr "Black" "Other" "White" "White" ...
## $ race_f: Factor w/ 3 levels "White","Black",..: 2 3 1 1 1 3 1 3 1 1 ...
table(birthwt$race_c) #character
##
## Black Other White
## 26 67 96
table(birthwt$race_f) #factor
##
## White Black Other
## 96 26 67
There are 26 black mothers in the birthwt dataset.
可以看到如果用factor,其table種族順序會依照我們給的level 與 lable排列, 但如果是character,其table種族順序會按照字母排序~B-O-W
Regarding UCBAdmissions{datasets} data object, what does the output > UCBAdmissions[,1,] > UCBAdmissions[,1,1] > UCBAdmissions[1,1,]
UCBAdmissions is an array.陣列(array)是矩陣的維度擴展 (矩陣是陣列的一個特例) 其元素(elements)一樣也都是同質的,意即資料類型(typeof)必須相同
UCBAdmissions
## , , Dept = A
##
## Gender
## Admit Male Female
## Admitted 512 89
## Rejected 313 19
##
## , , Dept = B
##
## Gender
## Admit Male Female
## Admitted 353 17
## Rejected 207 8
##
## , , Dept = C
##
## Gender
## Admit Male Female
## Admitted 120 202
## Rejected 205 391
##
## , , Dept = D
##
## Gender
## Admit Male Female
## Admitted 138 131
## Rejected 279 244
##
## , , Dept = E
##
## Gender
## Admit Male Female
## Admitted 53 94
## Rejected 138 299
##
## , , Dept = F
##
## Gender
## Admit Male Female
## Admitted 22 24
## Rejected 351 317
dim(UCBAdmissions) #[2*2*6]矩陣
## [1] 2 2 6
class(UCBAdmissions) #table
## [1] "table"
typeof(UCBAdmissions) #double
## [1] "double"
str(UCBAdmissions)
## 'table' num [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
## - attr(*, "dimnames")=List of 3
## ..$ Admit : chr [1:2] "Admitted" "Rejected"
## ..$ Gender: chr [1:2] "Male" "Female"
## ..$ Dept : chr [1:6] "A" "B" "C" "D" ...
為什麼class是table不是array?
1,,] # Admit(Admitted) x Gender x Dept UCBAdmissions[
## Dept
## Gender A B C D E F
## Male 512 353 120 138 53 22
## Female 89 17 202 131 94 24
1,] # Admit x Gender(Male) x Dept UCBAdmissions[,
## Dept
## Admit A B C D E F
## Admitted 512 353 120 138 53 22
## Rejected 313 207 205 279 138 351
2] # Admit x Gender x Dept (B) UCBAdmissions[,,
## Gender
## Admit Male Female
## Admitted 353 17
## Rejected 207 8
class(UCBAdmissions[,1,]) #table
## [1] "table"
typeof(UCBAdmissions[,1,]) #double
## [1] "double"
依然是二維矩陣
1,1] # Admit x Gender(Male) x Dept(A) UCBAdmissions[,
## Admitted Rejected
## 512 313
class(UCBAdmissions[,1,1]) #numeric
## [1] "numeric"
typeof(UCBAdmissions[,1,1])
## [1] "double"
變為vector
1,1,] # Admit(Admitted) x Gender(Male) x Dept UCBAdmissions[
## A B C D E F
## 512 353 120 138 53 22
1,,3] # Admit(Admitted) x Gender x Dept(C) UCBAdmissions[
## Male Female
## 120 202
What happens when the following command is entered? > help(ls(“package:MASS”)[92])
Use the observation to find out how many items there are in the package MASS. Note: Solve the following problems with only the course materials presented thus far.
data(package = "MASS")
attachNamespace("MASS")
ls("package:MASS")
## [1] "abbey" "accdeaths" "addterm"
## [4] "Aids2" "Animals" "anorexia"
## [7] "area" "as.fractions" "bacteria"
## [10] "bandwidth.nrd" "bcv" "beav1"
## [13] "beav2" "biopsy" "birthwt"
## [16] "Boston" "boxcox" "cabbages"
## [19] "caith" "Cars93" "cats"
## [22] "cement" "chem" "con2tr"
## [25] "contr.sdif" "coop" "corresp"
## [28] "cov.mcd" "cov.mve" "cov.rob"
## [31] "cov.trob" "cpus" "crabs"
## [34] "Cushings" "DDT" "deaths"
## [37] "denumerate" "dose.p" "drivers"
## [40] "dropterm" "eagles" "enlist"
## [43] "epil" "eqscplot" "farms"
## [46] "fbeta" "fgl" "fitdistr"
## [49] "forbes" "fractions" "frequency.polygon"
## [52] "GAGurine" "galaxies" "gamma.dispersion"
## [55] "gamma.shape" "gehan" "genotype"
## [58] "geyser" "gilgais" "ginv"
## [61] "glm.convert" "glm.nb" "glmmPQL"
## [64] "hills" "hist.FD" "hist.scott"
## [67] "housing" "huber" "hubers"
## [70] "immer" "Insurance" "is.fractions"
## [73] "isoMDS" "kde2d" "lda"
## [76] "ldahist" "leuk" "lm.gls"
## [79] "lm.ridge" "lmsreg" "lmwork"
## [82] "loglm" "loglm1" "logtrans"
## [85] "lqs" "lqs.formula" "ltsreg"
## [88] "mammals" "mca" "mcycle"
## [91] "Melanoma" "menarche" "michelson"
## [94] "minn38" "motors" "muscle"
## [97] "mvrnorm" "nclass.freq" "neg.bin"
## [100] "negative.binomial" "negexp.SSival" "newcomb"
## [103] "nlschools" "npk" "npr1"
## [106] "Null" "oats" "OME"
## [109] "painters" "parcoord" "petrol"
## [112] "phones" "Pima.te" "Pima.tr"
## [115] "Pima.tr2" "polr" "psi.bisquare"
## [118] "psi.hampel" "psi.huber" "qda"
## [121] "quine" "Rabbit" "rational"
## [124] "renumerate" "rlm" "rms.curv"
## [127] "rnegbin" "road" "rotifer"
## [130] "Rubber" "sammon" "select"
## [133] "Shepard" "ships" "shoes"
## [136] "shrimp" "shuttle" "Sitka"
## [139] "Sitka89" "Skye" "snails"
## [142] "SP500" "stdres" "steam"
## [145] "stepAIC" "stormer" "studres"
## [148] "survey" "synth.te" "synth.tr"
## [151] "theta.md" "theta.ml" "theta.mm"
## [154] "topo" "Traffic" "truehist"
## [157] "ucv" "UScereal" "UScrime"
## [160] "VA" "waders" "whiteside"
## [163] "width.SJ" "write.matrix" "wtloss"
在MASS中共有166個datasets
ls("package:MASS")[92]
## [1] "menarche"
在MASS中第92個datasets的名字是menarche
help(ls("package:MASS")[92])
可以讓R主動搜尋R Documentation中menarche這個資料集的資訊