0312 Data management homework

Yifang Yu

2018-03-19

Exercise 2

Read in the data and do the linear regression

dta<-cats
knitr::kable(head(cats, 10)) 
Sex Bwt Hwt
F 2.0 7.0
F 2.0 7.4
F 2.0 9.5
F 2.1 7.2
F 2.1 7.3
F 2.1 7.6
F 2.1 8.1
F 2.1 8.2
F 2.1 8.3
F 2.1 8.5
coef(lm(Bwt ~ Sex, data = cats))
## (Intercept)        SexM 
##   2.3595745   0.5404255
coef(lm(Bwt ~ Sex, data = cats, contrasts = list(Sex = "contr.sum")))
## (Intercept)        Sex1 
##   2.6297872  -0.2702128

在第一條回歸式中,R默認使用Dummy Coding,但若加上contr.sum則表示Deviation Coding,This coding system compares the mean of the dependent variable for a given level to the overall mean of the dependent variable.

Reference : R Library Contrast Coding Systems for categorical variables

Exercise 3

Data

Read the data

readingtimes <- read.csv("C:/Users/user/Dropbox/1062-Data_manage/0312/readingtimes.txt", sep="")
knitr::kable(head(readingtimes, 10))   
Snt Sp Wrds New S01 S02 S03 S04 S05 S06 S07 S08 S09 S10
1 1 13 1 3.429 2.795 4.161 3.071 3.625 3.161 3.232 7.161 1.536 4.063
2 2 16 3 6.482 5.411 4.491 5.063 9.295 5.643 8.357 4.313 2.946 6.652
3 3 9 2 1.714 2.339 3.018 2.464 6.045 2.455 4.920 3.366 1.375 2.179
4 4 9 2 3.679 3.714 2.866 2.732 4.205 6.241 3.723 6.330 1.152 3.661
5 5 10 3 4.000 2.902 2.991 2.670 3.884 3.223 3.143 6.143 2.759 3.330
6 6 18 4 6.973 8.018 6.625 7.571 8.795 13.188 11.170 6.071 7.964 7.866
7 7 6 1 2.634 1.750 2.268 2.884 3.491 3.688 2.054 1.696 1.455 3.705

Rank subjects by their reading speeed

rank(apply(readingtimes[,5:14],2,mean))
## S01 S02 S03 S04 S05 S06 S07 S08 S09 S10 
##   5   4   2   3  10   9   8   7   1   6

Estimate, on average, how long does it take to read a word

先計算每個學生的平均閱讀速度,以及平均字數,再計算平均一個字要花多少時間

matrixmean<-apply(readingtimes[,c(3,5:14)],2,mean) 
mean(matrixmean[2:11])/matrixmean[1]
##      Wrds 
## 0.3802173

Exercise 4

source code of the t.test function

getAnywhere(t.test.default)
## A single object matching 't.test.default' was found
## It was found in the following places
##   registered S3 method for t.test from namespace stats
##   namespace:stats
## with value
## 
## function (x, y = NULL, alternative = c("two.sided", "less", "greater"), 
##     mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95, 
##     ...) 
## {
##     alternative <- match.arg(alternative)
##     if (!missing(mu) && (length(mu) != 1 || is.na(mu))) 
##         stop("'mu' must be a single number")
##     if (!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) || 
##         conf.level < 0 || conf.level > 1)) 
##         stop("'conf.level' must be a single number between 0 and 1")
##     if (!is.null(y)) {
##         dname <- paste(deparse(substitute(x)), "and", deparse(substitute(y)))
##         if (paired) 
##             xok <- yok <- complete.cases(x, y)
##         else {
##             yok <- !is.na(y)
##             xok <- !is.na(x)
##         }
##         y <- y[yok]
##     }
##     else {
##         dname <- deparse(substitute(x))
##         if (paired) 
##             stop("'y' is missing for paired test")
##         xok <- !is.na(x)
##         yok <- NULL
##     }
##     x <- x[xok]
##     if (paired) {
##         x <- x - y
##         y <- NULL
##     }
##     nx <- length(x)
##     mx <- mean(x)
##     vx <- var(x)
##     if (is.null(y)) {
##         if (nx < 2) 
##             stop("not enough 'x' observations")
##         df <- nx - 1
##         stderr <- sqrt(vx/nx)
##         if (stderr < 10 * .Machine$double.eps * abs(mx)) 
##             stop("data are essentially constant")
##         tstat <- (mx - mu)/stderr
##         method <- if (paired) 
##             "Paired t-test"
##         else "One Sample t-test"
##         estimate <- setNames(mx, if (paired) 
##             "mean of the differences"
##         else "mean of x")
##     }
##     else {
##         ny <- length(y)
##         if (nx < 1 || (!var.equal && nx < 2)) 
##             stop("not enough 'x' observations")
##         if (ny < 1 || (!var.equal && ny < 2)) 
##             stop("not enough 'y' observations")
##         if (var.equal && nx + ny < 3) 
##             stop("not enough observations")
##         my <- mean(y)
##         vy <- var(y)
##         method <- paste(if (!var.equal) 
##             "Welch", "Two Sample t-test")
##         estimate <- c(mx, my)
##         names(estimate) <- c("mean of x", "mean of y")
##         if (var.equal) {
##             df <- nx + ny - 2
##             v <- 0
##             if (nx > 1) 
##                 v <- v + (nx - 1) * vx
##             if (ny > 1) 
##                 v <- v + (ny - 1) * vy
##             v <- v/df
##             stderr <- sqrt(v * (1/nx + 1/ny))
##         }
##         else {
##             stderrx <- sqrt(vx/nx)
##             stderry <- sqrt(vy/ny)
##             stderr <- sqrt(stderrx^2 + stderry^2)
##             df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 
##                 1))
##         }
##         if (stderr < 10 * .Machine$double.eps * max(abs(mx), 
##             abs(my))) 
##             stop("data are essentially constant")
##         tstat <- (mx - my - mu)/stderr
##     }
##     if (alternative == "less") {
##         pval <- pt(tstat, df)
##         cint <- c(-Inf, tstat + qt(conf.level, df))
##     }
##     else if (alternative == "greater") {
##         pval <- pt(tstat, df, lower.tail = FALSE)
##         cint <- c(tstat - qt(conf.level, df), Inf)
##     }
##     else {
##         pval <- 2 * pt(-abs(tstat), df)
##         alpha <- 1 - conf.level
##         cint <- qt(1 - alpha/2, df)
##         cint <- tstat + c(-cint, cint)
##     }
##     cint <- mu + cint * stderr
##     names(tstat) <- "t"
##     names(df) <- "df"
##     names(mu) <- if (paired || !is.null(y)) 
##         "difference in means"
##     else "mean"
##     attr(cint, "conf.level") <- conf.level
##     rval <- list(statistic = tstat, parameter = df, p.value = pval, 
##         conf.int = cint, estimate = estimate, null.value = mu, 
##         alternative = alternative, method = method, data.name = dname)
##     class(rval) <- "htest"
##     return(rval)
## }
## <bytecode: 0x00000000145918f0>
## <environment: namespace:stats>

Exercise 5

Use R to list all possible sums from rolling three dice

d1 <- round(runif(10000)*5)+1
d2 <- round(runif(10000)*5)+1
d3 <- round(runif(10000)*5)+1
sumdice<- d1+d2+d3
sumdice %>% table
## .
##    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17 
##    4   73  179  370  669 1016 1306 1411 1429 1291  944  654  403  177   68 
##   18 
##    6

Exercise 6

An R script for IQ_Beh data set

dta <- read.table("C:/Users/user/Dropbox/1062-Data_manage/0312/IQ_Beh.txt", header = T, row.names = 1)

str(dta)
## 'data.frame':    94 obs. of  3 variables:
##  $ Dep: Factor w/ 2 levels "D","N": 2 2 2 2 1 2 2 2 2 2 ...
##  $ IQ : int  103 124 124 104 96 92 124 99 92 116 ...
##  $ BP : int  4 12 9 3 3 3 6 4 3 9 ...

head(dta)
##   Dep  IQ BP
## 1   N 103  4
## 2   N 124 12
## 3   N 124  9
## 4   N 104  3
## 5   D  96  3
## 6   N  92  3

class(dta)
## [1] "data.frame"

dim(dta)
## [1] 94  3

names(dta)
## [1] "Dep" "IQ"  "BP"

is.vector(dta$BP)
## [1] TRUE

dta[1, ]
##   Dep  IQ BP
## 1   N 103  4

dta[1:3, "IQ"]
## [1] 103 124 124

tail(dta[order(dta$BP), ])
##    Dep  IQ BP
## 16   N  89 11
## 58   N 117 11
## 66   N 126 11
## 2    N 124 12
## 73   D  99 13
## 12   D  22 17

tail(dta[order(-dta$BP), ], 4)
##    Dep  IQ BP
## 77   N 124  1
## 80   N 121  1
## 24   N 106  0
## 75   N 122  0

histogram of IQ

with(dta, hist(IQ, xlab = "IQ", main = ""))

boxplot of behavior problem by depression status

boxplot(BP ~ Dep, data = dta, 
        xlab = "Depression", 
        ylab = "Behavior problem score")

scatter plot

plot(IQ ~ BP, data = dta, pch = 20, col = dta$Dep, 
     xlab = "Behavior problem score", ylab = "IQ")
grid()

two regression lines

plot(BP ~ IQ, data = dta, type = "n",
     ylab = "Behavior problem score", xlab = "IQ")
text(dta$IQ, dta$BP, labels = dta$Dep, cex = 0.5)
abline(lm(BP ~ IQ, data = dta, subset = Dep == "D"))
abline(lm(BP ~ IQ, data = dta, subset = Dep == "N"), lty = 2)

end