inclass1- Linear regression

Question1

The math attainment page has a dataset and a script of R code chunks. Generate a markdown file from the script to push the output in HTML for posting to course Moodle site.

input data

# first R session using math attainment data set

# read in a plain text file with variable names and assign a name to it
dta <- read.table("math_attainment.txt", header = T)

checking data

# structure of data
str(dta)
## 'data.frame':    39 obs. of  3 variables:
##  $ math2: int  28 56 51 13 39 41 30 13 17 32 ...
##  $ math1: int  18 22 44 8 20 12 16 5 9 18 ...
##  $ cc   : num  328 406 387 167 328 ...

The structure of data is data.frame

# first 6 rows
head(dta)
##   math2 math1     cc
## 1    28    18 328.20
## 2    56    22 406.03
## 3    51    44 386.94
## 4    13     8 166.91
## 5    39    20 328.20
## 6    41    12 328.20

view first 6 rows

descriptive statistics

# variable mean
colMeans(dta) #column mean
##     math2     math1        cc 
##  28.76923  15.35897 188.83667
rowMeans(dta) #row mean
##  [1] 124.73333 161.34333 160.64667  62.63667 129.06667 127.06667  68.76000
##  [8]  37.64667  41.40000 127.20667 126.73333  89.12000  79.45333  81.12000
## [15]  84.03667  56.08333  54.41667  90.51333  78.70333  24.24000  54.02667
## [22]  63.46333  37.75333  53.94000  85.14333  51.60667  64.75000  59.71667
## [29]  47.15667  57.60667  63.22333  79.47667  62.71667  74.38000  74.14333
## [36]  84.47667  81.89333  72.87000  55.27333
# variable sd
apply(dta, 2, sd) #2是column, 1是row
##     math2     math1        cc 
## 10.720029  7.744224 84.842513
apply(dta, 1, sd) #1是row
##  [1] 176.27823 212.58569 196.00703  90.33795 172.71599 174.78905  79.56716
##  [8]  49.77845  49.35261 177.16548 174.57839  88.90311  97.84977  95.99293
## [15] 110.24053  65.25160  66.90681  98.39345 114.75580  19.88901  70.28145
## [22]  75.60545  41.73323  65.74444  89.53168  68.26598  69.43477  70.86989
## [29]  59.23710  62.77997  80.28568  93.97176  68.33806  99.43034  98.26318
## [36]  89.24988  93.39419  95.91538  64.62897
# correlation matrix
cor(dta)
##           math2     math1        cc
## math2 1.0000000 0.7443604 0.6570098
## math1 0.7443604 1.0000000 0.5956771
## cc    0.6570098 0.5956771 1.0000000

plot data

# specify square plot region
par(pty="m")

pty(plot type)設置作圖類型。= “m”:最大化作圖;= “s”:方形圖。

# scatter plot of math2 by math1
plot(math2 ~ math1, data=dta, xlim=c(0, 60), ylim=c(0, 60),
     xlab="Math score at Year 1", ylab="Math score at Year 2")

# add grid lines
grid()

regression analysis

# regression math2 by math1
dta.lm <- lm(math2 ~ math1, data=dta)

# show results
summary(dta.lm)
## 
## Call:
## lm(formula = math2 ~ math1, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.430  -5.521  -0.369   4.253  20.388 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   12.944      2.607   4.965 1.57e-05 ***
## math1          1.030      0.152   6.780 5.57e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.255 on 37 degrees of freedom
## Multiple R-squared:  0.5541, Adjusted R-squared:  0.542 
## F-statistic: 45.97 on 1 and 37 DF,  p-value: 5.571e-08
# show anova table
anova(dta.lm)
## Analysis of Variance Table
## 
## Response: math2
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## math1      1 2419.6 2419.59  45.973 5.571e-08 ***
## Residuals 37 1947.3   52.63                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

why we need to show both regression and anova results??

如何讓p值可以不要是科學符號

# add regression line and title
plot(math2 ~ math1, data=dta, xlim=c(0, 60), ylim=c(0, 60),
     xlab="Math score at Year 1", ylab="Math score at Year 2")+abline(dta.lm, lty=2)+title("Mathematics Attainment")

## integer(0)

diagnostics

# specify maximum plot region
par(pty="m")

# only setup plot region 
plot(scale(resid(dta.lm)) ~ fitted(dta.lm), 
     ylim=c(-3.5, 3.5), type="n",
     xlab="Fitted values", ylab="Standardized residuals")

# give the value and lable
text(fitted(dta.lm), scale(resid(dta.lm)), labels=rownames(dta), cex=0.5) 

# 格線
grid()

# add a horizontal red dash line
abline(h=0, lty=2, col="red") 

scale 為centers and/or scales the columns of a numeric matrix,將殘差變為標準化殘差 lty 為linentype (0=blank, 1=solid (default), 2=dashed, 3=dotted, 4=dotdash, 5=longdash, 6=twodash)

normality check

# normal QQ plot of the values 
qqnorm(scale(resid(dta.lm)))

# adds a line to a “theoretical”, by default normal
qqline(scale(resid(dta.lm)))

grid()

Questions needed to be addressed

why we need to show both regression and anova results??

如何讓p值可以不要是科學符號

inclass2- Data structure:class, typeof

Question

The notation, women{datasets}, indicates that a data object by the name women is in the datasets package. This package is preloaded when R is invoked. Explain the difference between c(women) and c(as.matrix(women)) using the women{datasets}

Attributes of dataset

# first 6 rows
head(women)
##   height weight
## 1     58    115
## 2     59    117
## 3     60    120
## 4     61    123
## 5     62    126
## 6     63    129
# 資料結構
class(women)
## [1] "data.frame"
# type of object(數據的細節)
typeof(women)
## [1] "list"
#
str(women)
## 'data.frame':    15 obs. of  2 variables:
##  $ height: num  58 59 60 61 62 63 64 65 66 67 ...
##  $ weight: num  115 117 120 123 126 129 132 135 139 142 ...

women是一個data.frame

c(women)

c(women)
## $height
##  [1] 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 
## $weight
##  [1] 115 117 120 123 126 129 132 135 139 142 146 150 154 159 164
class(c(women))
## [1] "list"
typeof(c(women))
## [1] "list"

c(women)意味著,women這個資料由兩個list組成,分別是height與weight

c(as.matrix(women))

c(as.matrix(women))
##  [1]  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 115 117 120 123
## [20] 126 129 132 135 139 142 146 150 154 159 164
class(c(as.matrix(women)))
## [1] "numeric"
typeof(c(as.matrix(women)))
## [1] "double"

as.matrix(women)是將women從data.frame轉成matrix的型態

c(as.matrix(women))把matrix中的所有數值取出來變成一串vector,因此可以看到身高和體重的數值被合併起來,變成30個elements in vector

心得

what’s different between list and vector?

# list example
Jenny <- list(gender="female", age=18, hobby=c("eat", "sleep"))

list 允許不同typeof的資料放一起

vector only allows all elements of vector containing single data type

# vector example
a <- c(5,10,15,20,25)           # 建立一個number vector
b <- c("Tom", "Henry", "John")  # 建立一個character vector

c<- c(1, "john", 3) # 若是把number和character同時放入vector裡,R會自動將所有element的型態,轉變成character

different typeof of Matrix depends on vector

Matrix是vector的一種新的排列形式,雖然它的class自成一格為matrix,但它的typeof依然會是它架構根源的atomic vector形式。

Mat1 <- matrix(
  c("a","b","c","d"), 2, 2
)
Mat1
##      [,1] [,2]
## [1,] "a"  "c" 
## [2,] "b"  "d"
class(Mat1)
## [1] "matrix" "array"
typeof(Mat1)
## [1] "character"

在這個範例中,Mat1是一個matrix但是他的type是character

Mat2 <- matrix(
  c(1,1,1,4), 2, 2
)
Mat2
##      [,1] [,2]
## [1,]    1    1
## [2,]    1    4
class(Mat2)
## [1] "matrix" "array"
typeof(Mat2)
## [1] "double"

在這個範例中,Mat2是一個matrix但是他的type是double

What’s different between matrix and data.frame?

#
aa<-matrix(letters, 2, 13)
aa
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] "a"  "c"  "e"  "g"  "i"  "k"  "m"  "o"  "q"  "s"   "u"   "w"   "y"  
## [2,] "b"  "d"  "f"  "h"  "j"  "l"  "n"  "p"  "r"  "t"   "v"   "x"   "z"
typeof(aa)
## [1] "character"
rownames(aa)
## NULL
colnames(aa)
## NULL
#
bb<-data.frame(letters, 2, 13)
bb
##    letters X2 X13
## 1        a  2  13
## 2        b  2  13
## 3        c  2  13
## 4        d  2  13
## 5        e  2  13
## 6        f  2  13
## 7        g  2  13
## 8        h  2  13
## 9        i  2  13
## 10       j  2  13
## 11       k  2  13
## 12       l  2  13
## 13       m  2  13
## 14       n  2  13
## 15       o  2  13
## 16       p  2  13
## 17       q  2  13
## 18       r  2  13
## 19       s  2  13
## 20       t  2  13
## 21       u  2  13
## 22       v  2  13
## 23       w  2  13
## 24       x  2  13
## 25       y  2  13
## 26       z  2  13
c(bb)
## $letters
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
## [20] "t" "u" "v" "w" "x" "y" "z"
## 
## $X2
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 
## $X13
##  [1] 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13
## [26] 13
typeof(bb)
## [1] "list"
rownames(bb)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26"
colnames(bb)
## [1] "letters" "X2"      "X13"

根據結果,matrix是character type,而且他沒有rownames與colnames(NULL) data.frame是3個list,他有rownames與colnames

inclass3- how to lable the variables

Question

Use help to examine the coding scheme for the mother’s race variable in the birthwt{MASS} dataset. The MASS comes with the base R installation but is not automatically loaded when R is invoked.

How many black mothers are there in this data frame?

What does the following R command do? > c(“White”, “Black”, “Other”)[birthwt$race]

input data

讀取MASS package中的birthwt資料

data(birthwt, package = "MASS")
str(birthwt)
## 'data.frame':    189 obs. of  10 variables:
##  $ low  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ race : int  2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: int  0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ui   : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv  : int  0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
class(birthwt$race)
## [1] "integer"

race是integer

explore how many levels in the race variable

factor(birthwt$race)
##   [1] 2 3 1 1 1 3 1 3 1 1 3 3 3 3 1 1 2 1 3 1 3 1 1 3 3 1 1 1 2 2 2 1 2 1 2 1 1
##  [38] 1 1 1 2 1 2 1 1 1 1 3 1 3 1 3 1 1 3 3 3 3 3 3 3 3 3 1 3 3 3 3 1 2 1 3 3 2
##  [75] 1 2 1 1 2 1 1 1 3 3 3 3 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 3 1 3 2 1 1 1 2 1
## [112] 3 1 1 1 3 1 3 1 3 1 3 1 1 1 1 1 1 1 1 3 1 2 3 3 3 3 2 3 1 1 1 3 3 1 1 2 1
## [149] 3 3 3 1 1 1 1 3 2 1 2 3 1 3 3 3 2 1 3 3 1 1 2 2 2 3 3 1 1 1 1 2 3 3 1 3 1
## [186] 3 3 2 1
## Levels: 1 2 3

there are three levels.

lables

#
birthwt$race_c<-c("White", "Black", "Other")[birthwt$race]
class(birthwt$race_c)
## [1] "character"
typeof(birthwt$race_c)
## [1] "character"

將interger 1,2,3 依序lable 為 “White”, “Black”, “Other”

此時class與type是character

#
birthwt$race_f<-factor(birthwt$race, levels = 1:3, labels = c("White", "Black", "Other"))
class(birthwt$race_f)
## [1] "factor"
typeof(birthwt$race_f)
## [1] "integer"

使用factor的方式去lable,race_f的class為factor, type是integer(代表的是其levels)

str(birthwt)
## 'data.frame':    189 obs. of  12 variables:
##  $ low   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age   : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt   : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ race  : int  2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke : int  0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ui    : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv   : int  0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt   : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
##  $ race_c: chr  "Black" "Other" "White" "White" ...
##  $ race_f: Factor w/ 3 levels "White","Black",..: 2 3 1 1 1 3 1 3 1 1 ...
table(birthwt$race_c) #character
## 
## Black Other White 
##    26    67    96
table(birthwt$race_f) #factor
## 
## White Black Other 
##    96    26    67

There are 26 black mothers in the birthwt dataset.

可以看到如果用factor,其table種族順序會依照我們給的level 與 lable排列, 但如果是character,其table種族順序會按照字母排序~B-O-W

inclass4- what is array

Question

Regarding UCBAdmissions{datasets} data object, what does the output > UCBAdmissions[,1,] > UCBAdmissions[,1,1] > UCBAdmissions[1,1,]

input data

UCBAdmissions is an array.陣列(array)是矩陣的維度擴展 (矩陣是陣列的一個特例) 其元素(elements)一樣也都是同質的,意即資料類型(typeof)必須相同

UCBAdmissions
## , , Dept = A
## 
##           Gender
## Admit      Male Female
##   Admitted  512     89
##   Rejected  313     19
## 
## , , Dept = B
## 
##           Gender
## Admit      Male Female
##   Admitted  353     17
##   Rejected  207      8
## 
## , , Dept = C
## 
##           Gender
## Admit      Male Female
##   Admitted  120    202
##   Rejected  205    391
## 
## , , Dept = D
## 
##           Gender
## Admit      Male Female
##   Admitted  138    131
##   Rejected  279    244
## 
## , , Dept = E
## 
##           Gender
## Admit      Male Female
##   Admitted   53     94
##   Rejected  138    299
## 
## , , Dept = F
## 
##           Gender
## Admit      Male Female
##   Admitted   22     24
##   Rejected  351    317
dim(UCBAdmissions) #[2*2*6]矩陣
## [1] 2 2 6
class(UCBAdmissions) #table
## [1] "table"
typeof(UCBAdmissions) #double
## [1] "double"
str(UCBAdmissions)
##  'table' num [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
##  - attr(*, "dimnames")=List of 3
##   ..$ Admit : chr [1:2] "Admitted" "Rejected"
##   ..$ Gender: chr [1:2] "Male" "Female"
##   ..$ Dept  : chr [1:6] "A" "B" "C" "D" ...

為什麼class是table不是array?

UCBAdmissions[,1,]

UCBAdmissions[1,,] # Admit(Admitted) x Gender x Dept
##         Dept
## Gender     A   B   C   D   E   F
##   Male   512 353 120 138  53  22
##   Female  89  17 202 131  94  24
UCBAdmissions[,1,] # Admit x Gender(Male) x Dept
##           Dept
## Admit        A   B   C   D   E   F
##   Admitted 512 353 120 138  53  22
##   Rejected 313 207 205 279 138 351
UCBAdmissions[,,2] # Admit x Gender x Dept (B)
##           Gender
## Admit      Male Female
##   Admitted  353     17
##   Rejected  207      8
class(UCBAdmissions[,1,]) #table
## [1] "table"
typeof(UCBAdmissions[,1,]) #double
## [1] "double"

依然是二維矩陣

UCBAdmissions[,1,1]

UCBAdmissions[,1,1] # Admit x Gender(Male) x Dept(A)
## Admitted Rejected 
##      512      313
class(UCBAdmissions[,1,1]) #numeric
## [1] "numeric"
typeof(UCBAdmissions[,1,1])
## [1] "double"

變為vector

UCBAdmissions[1,1,]

UCBAdmissions[1,1,] # Admit(Admitted) x Gender(Male) x Dept
##   A   B   C   D   E   F 
## 512 353 120 138  53  22

UCBAdmissions[1,,3]

UCBAdmissions[1,,3] # Admit(Admitted) x Gender x Dept(C)
##   Male Female 
##    120    202

inclass5- how to search the datasets in package

Question

What happens when the following command is entered? > help(ls(“package:MASS”)[92])

Use the observation to find out how many items there are in the package MASS. Note: Solve the following problems with only the course materials presented thus far.

show all of dataset in “MASS” package

data(package = "MASS")
attachNamespace("MASS")

get a list of built-in “MASS” package datasets

ls("package:MASS")
##   [1] "abbey"             "accdeaths"         "addterm"          
##   [4] "Aids2"             "Animals"           "anorexia"         
##   [7] "area"              "as.fractions"      "bacteria"         
##  [10] "bandwidth.nrd"     "bcv"               "beav1"            
##  [13] "beav2"             "biopsy"            "birthwt"          
##  [16] "Boston"            "boxcox"            "cabbages"         
##  [19] "caith"             "Cars93"            "cats"             
##  [22] "cement"            "chem"              "con2tr"           
##  [25] "contr.sdif"        "coop"              "corresp"          
##  [28] "cov.mcd"           "cov.mve"           "cov.rob"          
##  [31] "cov.trob"          "cpus"              "crabs"            
##  [34] "Cushings"          "DDT"               "deaths"           
##  [37] "denumerate"        "dose.p"            "drivers"          
##  [40] "dropterm"          "eagles"            "enlist"           
##  [43] "epil"              "eqscplot"          "farms"            
##  [46] "fbeta"             "fgl"               "fitdistr"         
##  [49] "forbes"            "fractions"         "frequency.polygon"
##  [52] "GAGurine"          "galaxies"          "gamma.dispersion" 
##  [55] "gamma.shape"       "gehan"             "genotype"         
##  [58] "geyser"            "gilgais"           "ginv"             
##  [61] "glm.convert"       "glm.nb"            "glmmPQL"          
##  [64] "hills"             "hist.FD"           "hist.scott"       
##  [67] "housing"           "huber"             "hubers"           
##  [70] "immer"             "Insurance"         "is.fractions"     
##  [73] "isoMDS"            "kde2d"             "lda"              
##  [76] "ldahist"           "leuk"              "lm.gls"           
##  [79] "lm.ridge"          "lmsreg"            "lmwork"           
##  [82] "loglm"             "loglm1"            "logtrans"         
##  [85] "lqs"               "lqs.formula"       "ltsreg"           
##  [88] "mammals"           "mca"               "mcycle"           
##  [91] "Melanoma"          "menarche"          "michelson"        
##  [94] "minn38"            "motors"            "muscle"           
##  [97] "mvrnorm"           "nclass.freq"       "neg.bin"          
## [100] "negative.binomial" "negexp.SSival"     "newcomb"          
## [103] "nlschools"         "npk"               "npr1"             
## [106] "Null"              "oats"              "OME"              
## [109] "painters"          "parcoord"          "petrol"           
## [112] "phones"            "Pima.te"           "Pima.tr"          
## [115] "Pima.tr2"          "polr"              "psi.bisquare"     
## [118] "psi.hampel"        "psi.huber"         "qda"              
## [121] "quine"             "Rabbit"            "rational"         
## [124] "renumerate"        "rlm"               "rms.curv"         
## [127] "rnegbin"           "road"              "rotifer"          
## [130] "Rubber"            "sammon"            "select"           
## [133] "Shepard"           "ships"             "shoes"            
## [136] "shrimp"            "shuttle"           "Sitka"            
## [139] "Sitka89"           "Skye"              "snails"           
## [142] "SP500"             "stdres"            "steam"            
## [145] "stepAIC"           "stormer"           "studres"          
## [148] "survey"            "synth.te"          "synth.tr"         
## [151] "theta.md"          "theta.ml"          "theta.mm"         
## [154] "topo"              "Traffic"           "truehist"         
## [157] "ucv"               "UScereal"          "UScrime"          
## [160] "VA"                "waders"            "whiteside"        
## [163] "width.SJ"          "write.matrix"      "wtloss"

在MASS中共有166個datasets

ls("package:MASS")[92]
## [1] "menarche"

在MASS中第92個datasets的名字是menarche

help(ls("package:MASS")[92]) 

可以讓R主動搜尋R Documentation中menarche這個資料集的資訊