Exercise 2

Read in the data and do the linear regression

dta<-cats
knitr::kable(head(cats, 10))

Sex	Bwt	Hwt
F	2.0	7.0
F	2.0	7.4
F	2.0	9.5
F	2.1	7.2
F	2.1	7.3
F	2.1	7.6
F	2.1	8.1
F	2.1	8.2
F	2.1	8.3
F	2.1	8.5

coef(lm(Bwt ~ Sex, data = cats))

## (Intercept)        SexM 
##   2.3595745   0.5404255

coef(lm(Bwt ~ Sex, data = cats, contrasts = list(Sex = "contr.sum")))

## (Intercept)        Sex1 
##   2.6297872  -0.2702128

在第一條回歸式中，R默認使用Dummy Coding，但若加上contr.sum則表示Deviation Coding，This coding system compares the mean of the dependent variable for a given level to the overall mean of the dependent variable.

Reference : R Library Contrast Coding Systems for categorical variables

Exercise 3

Data

Read the data

readingtimes <- read.csv("C:/Users/user/Dropbox/1062-Data_manage/0312/readingtimes.txt", sep="")
knitr::kable(head(readingtimes, 10))

Snt	Sp	Wrds	New	S01	S02	S03	S04	S05	S06	S07	S08	S09	S10
1	1	13	1	3.429	2.795	4.161	3.071	3.625	3.161	3.232	7.161	1.536	4.063
2	2	16	3	6.482	5.411	4.491	5.063	9.295	5.643	8.357	4.313	2.946	6.652
3	3	9	2	1.714	2.339	3.018	2.464	6.045	2.455	4.920	3.366	1.375	2.179
4	4	9	2	3.679	3.714	2.866	2.732	4.205	6.241	3.723	6.330	1.152	3.661
5	5	10	3	4.000	2.902	2.991	2.670	3.884	3.223	3.143	6.143	2.759	3.330
6	6	18	4	6.973	8.018	6.625	7.571	8.795	13.188	11.170	6.071	7.964	7.866
7	7	6	1	2.634	1.750	2.268	2.884	3.491	3.688	2.054	1.696	1.455	3.705

Rank subjects by their reading speeed

rank(apply(readingtimes[,5:14],2,mean))

## S01 S02 S03 S04 S05 S06 S07 S08 S09 S10 
##   5   4   2   3  10   9   8   7   1   6

Estimate, on average, how long does it take to read a word

先計算每個學生的平均閱讀速度，以及平均字數，再計算平均一個字要花多少時間

matrixmean<-apply(readingtimes[,c(3,5:14)],2,mean) 
mean(matrixmean[2:11])/matrixmean[1]

##      Wrds 
## 0.3802173

Exercise 4

source code of the t.test function

getAnywhere(t.test.default)

## A single object matching 't.test.default' was found
## It was found in the following places
##   registered S3 method for t.test from namespace stats
##   namespace:stats
## with value
## 
## function (x, y = NULL, alternative = c("two.sided", "less", "greater"), 
##     mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95, 
##     ...) 
## {
##     alternative <- match.arg(alternative)
##     if (!missing(mu) && (length(mu) != 1 || is.na(mu))) 
##         stop("'mu' must be a single number")
##     if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) || 
##         conf.level < 0 || conf.level > 1)) 
##         stop("'conf.level' must be a single number between 0 and 1")
##     if (!is.null(y)) {
##         dname <- paste(deparse(substitute(x)), "and", deparse(substitute(y)))
##         if (paired) 
##             xok <- yok <- complete.cases(x, y)
##         else {
##             yok <- !is.na(y)
##             xok <- !is.na(x)
##         }
##         y <- y[yok]
##     }
##     else {
##         dname <- deparse(substitute(x))
##         if (paired) 
##             stop("'y' is missing for paired test")
##         xok <- !is.na(x)
##         yok <- NULL
##     }
##     x <- x[xok]
##     if (paired) {
##         x <- x - y
##         y <- NULL
##     }
##     nx <- length(x)
##     mx <- mean(x)
##     vx <- var(x)
##     if (is.null(y)) {
##         if (nx < 2) 
##             stop("not enough 'x' observations")
##         df <- nx - 1
##         stderr <- sqrt(vx/nx)
##         if (stderr < 10 * .Machine$double.eps * abs(mx)) 
##             stop("data are essentially constant")
##         tstat <- (mx - mu)/stderr
##         method <- if (paired) 
##             "Paired t-test"
##         else "One Sample t-test"
##         estimate <- setNames(mx, if (paired) 
##             "mean of the differences"
##         else "mean of x")
##     }
##     else {
##         ny <- length(y)
##         if (nx < 1 || (!var.equal && nx < 2)) 
##             stop("not enough 'x' observations")
##         if (ny < 1 || (!var.equal && ny < 2)) 
##             stop("not enough 'y' observations")
##         if (var.equal && nx + ny < 3) 
##             stop("not enough observations")
##         my <- mean(y)
##         vy <- var(y)
##         method <- paste(if (!var.equal) 
##             "Welch", "Two Sample t-test")
##         estimate <- c(mx, my)
##         names(estimate) <- c("mean of x", "mean of y")
##         if (var.equal) {
##             df <- nx + ny - 2
##             v <- 0
##             if (nx > 1) 
##                 v <- v + (nx - 1) * vx
##             if (ny > 1) 
##                 v <- v + (ny - 1) * vy
##             v <- v/df
##             stderr <- sqrt(v * (1/nx + 1/ny))
##         }
##         else {
##             stderrx <- sqrt(vx/nx)
##             stderry <- sqrt(vy/ny)
##             stderr <- sqrt(stderrx^2 + stderry^2)
##             df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 
##                 1))
##         }
##         if (stderr < 10 * .Machine$double.eps * max(abs(mx), 
##             abs(my))) 
##             stop("data are essentially constant")
##         tstat <- (mx - my - mu)/stderr
##     }
##     if (alternative == "less") {
##         pval <- pt(tstat, df)
##         cint <- c(-Inf, tstat + qt(conf.level, df))
##     }
##     else if (alternative == "greater") {
##         pval <- pt(tstat, df, lower.tail = FALSE)
##         cint <- c(tstat - qt(conf.level, df), Inf)
##     }
##     else {
##         pval <- 2 * pt(-abs(tstat), df)
##         alpha <- 1 - conf.level
##         cint <- qt(1 - alpha/2, df)
##         cint <- tstat + c(-cint, cint)
##     }
##     cint <- mu + cint * stderr
##     names(tstat) <- "t"
##     names(df) <- "df"
##     names(mu) <- if (paired || !is.null(y)) 
##         "difference in means"
##     else "mean"
##     attr(cint, "conf.level") <- conf.level
##     rval <- list(statistic = tstat, parameter = df, p.value = pval, 
##         conf.int = cint, estimate = estimate, null.value = mu, 
##         alternative = alternative, method = method, data.name = dname)
##     class(rval) <- "htest"
##     return(rval)
## }
## <bytecode: 0x00000000145918f0>
## <environment: namespace:stats>

Exercise 5

Use R to list all possible sums from rolling three dice

d1 <- round(runif(10000)*5)+1
d2 <- round(runif(10000)*5)+1
d3 <- round(runif(10000)*5)+1
sumdice<- d1+d2+d3
sumdice %>% table

## .
##    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
##    4   73  179  370  669 1016 1306 1411 1429 1291  944  654  403  177   68 
##   18 
##    6

Exercise 6

An R script for IQ_Beh data set

dta <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0312/IQ_Beh.txt", header = T, row.names = 1)

str(dta)

## 'data.frame':    94 obs. of  3 variables:
##  $ Dep: Factor w/ 2 levels "D","N": 2 2 2 2 1 2 2 2 2 2 ...
##  $ IQ : int  103 124 124 104 96 92 124 99 92 116 ...
##  $ BP : int  4 12 9 3 3 3 6 4 3 9 ...

head(dta)

##   Dep  IQ BP
## 1   N 103  4
## 2   N 124 12
## 3   N 124  9
## 4   N 104  3
## 5   D  96  3
## 6   N  92  3

class(dta)

## [1] "data.frame"

dim(dta)

## [1] 94  3

names(dta)

## [1] "Dep" "IQ"  "BP"

is.vector(dta$BP)

## [1] TRUE

dta[1, ]

##   Dep  IQ BP
## 1   N 103  4

dta[1:3, "IQ"]

## [1] 103 124 124

tail(dta[order(dta$BP), ])

##    Dep  IQ BP
## 16   N  89 11
## 58   N 117 11
## 66   N 126 11
## 2    N 124 12
## 73   D  99 13
## 12   D  22 17

tail(dta[order(-dta$BP), ], 4)

##    Dep  IQ BP
## 77   N 124  1
## 80   N 121  1
## 24   N 106  0
## 75   N 122  0

histogram of IQ

with(dta, hist(IQ, xlab = "IQ", main = ""))

boxplot of behavior problem by depression status

boxplot(BP ~ Dep, data = dta, 
        xlab = "Depression", 
        ylab = "Behavior problem score")

scatter plot

plot(IQ ~ BP, data = dta, pch = 20, col = dta$Dep, 
     xlab = "Behavior problem score", ylab = "IQ")
grid()

two regression lines

plot(BP ~ IQ, data = dta, type = "n",
     ylab = "Behavior problem score", xlab = "IQ")
text(dta$IQ, dta$BP, labels = dta$Dep, cex = 0.5)
abline(lm(BP ~ IQ, data = dta, subset = Dep == "D"))
abline(lm(BP ~ IQ, data = dta, subset = Dep == "N"), lty = 2)

0312 Data management homework

Yifang Yu

2018-03-19