Homework 3

Jiananduan

Chapter 3

Q4.

a)

y <- rnorm(100)  # generate 100 random numbers form normal distr. (0,1)
mean(y)  # calculate the mean
## [1] 0.009775
sd(y)  # calculate the standard deviation
## [1] 1.08

b)

av <- numeric()  # generate a numeric vector
for (i in 1:24) {
    # since 24 will be 24+1 in the last loop
    y <- rnorm(100)
    i <- i + 1
    av[i] <- mean(y)
    av <- c(av, av[i])  # endow the numbers into av
}
av <- av[2:26]  # since the first value is na
sd(av)
## [1] 0.1005

c)

meanloop <- function(n, m) {
    # n is the random numbers generated from N(0,1) m is the times of
    # calculating the mean
    mv <- numeric()
    for (i in 1:m - 1) {
        y <- rnorm(n)
        i <- i + 1
        mv[i] <- mean(y)
        mv <- c(mv, mv[i])
    }
    mv <- mv[2:m]
    plot(density(mv), main = "density plot")
}
par(mfrow = c(2, 2))
for (i in 1:4) {
    i = i + 1
    meanloop(100 * i, 25)
}

plot of chunk unnamed-chunk-3

Figure 1. Density plots of various sample sizes

Q10.

For the chi-squared distribution with one degree of freedom

par(mfrow = c(1, 5))
a <- rchisq(10, 1)
qqnorm(a)
qqline(a)
a <- rchisq(30, 1)
qqnorm(a)
qqline(a)
a <- rchisq(50, 1)
qqnorm(a)
qqline(a)
a <- rchisq(100, 1)
qqnorm(a)
qqline(a)
a <- rchisq(1000, 1)
qqnorm(a)
qqline(a)

plot of chunk unnamed-chunk-4

Figure 2. Normal Porbability plots for various sample sizes from chi-squared distribution
The shape will be consistent once the sample size is bigger than 50.

For the t-distribution with one degree of freedom

par(mfrow = c(1, 5))
x <- rt(10, 1)
qqnorm(x)
qqline(x)
x <- rt(30, 1)
qqnorm(x)
qqline(x)
x <- rt(60, 1)
qqnorm(x)
qqline(x)
x <- rt(100, 1)
qqnorm(x)
qqline(x)
x <- rt(300, 1)
qqnorm(x)
qqline(x)

plot of chunk unnamed-chunk-5

Figure 3. Normal Probability plots for samples from t distribution
The shape will be consistent once the sample size is bigger than 30.

Q13.

a) Simulate 1000 values from the Markov chain.

# function that simulate from a Markov chain
simula <- function(trans, N) {
    transita <- function(char, trans) {
        sample(colnames(trans), 1, prob = trans[char, ])
    }

    sim <- character(N)
    sim[1] <- sample(colnames(trans), 1)
    for (i in 2:N) {
        sim[i] <- transita(sim[i - 1], trans)
    }

    sim
}
# simulate data from the Markov chain
Pb <- matrix(c(0.6, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.3, 0.3), ncol = 3, byrow = T)
colnames(Pb) <- c("Sun", "Cloud", "Rain")
row.names(Pb) <- c("Sun", "Cloud", "Rain")
N = 1000
sample <- simula(Pb, N)
# calculate the proportions and compare it with the theoritical ones
sunprob <- length(sample[sample == "Sun"])/N
cloudprob <- length(sample[sample == "Cloud"])/N
rainprob <- length(sample[sample == "Rain"])/N
# output
sunprob
## [1] 0.449
cloudprob
## [1] 0.278
rainprob
## [1] 0.273

The outputs are different with the theoretical proportions, however, the result is consistent with the theoretical proportions since the Sun proportion is bigger than the Cloud proportion and is bigger than Rain proportion.

b)

library(zoo)
## Attaching package: 'zoo'
## The following object(s) are masked from 'package:base':
## 
## as.Date, as.Date.numeric
Markov <- function(N = 100, initial.value = 1, P) {
    X <- numeric(N)
    X[1] <- initial.value + 1  # States 0:5; subscripts 1:6
    n <- nrow(P)
    for (i in 2:N) {
        X[i] <- sample(1:n, size = 1, prob = P[X[i - 1], ])
    }
    X - 1
}

plotmarkov <- function(n = 10000, start = 0, window = 100, transition = Pb, 
    npanels = 5) {
    xc2 <- Markov(n, start, transition)
    mav0 <- rollmean(as.integer(xc2 == 0), window)
    mav1 <- rollmean(as.integer(xc2 == 1), window)
    npanel <- cut(1:length(mav0), breaks = seq(from = 1, to = length(mav0), 
        length = npanels + 1), include.lowest = TRUE)
    df <- data.frame(av0 = mav0, av1 = mav1, x = 1:length(mav0), gp = npanel)
    print(xyplot(av0 + av1 ~ x | gp, data = df, layout = c(1, npanels), type = "l", 
        par.strip.text = list(cex = 0.65), scales = list(x = list(relation = "free"))))
}
N = 10000
winodw = 100
Pb <- matrix(c(0.6, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.3, 0.3), ncol = 3, byrow = T)
plotmarkov(N, start = 0, window)
## Error: comparison (4) is possible only for atomic and list types

There is an error that I can't solve.

Chapter 4

Q1

library(DAAG)
## Warning: package 'DAAG' was built under R version 2.15.1
## Loading required package: MASS
## Loading required package: lattice
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
## Loading required package: rpart
## Loading required package: randomForest
## randomForest 4.6-6
## Type rfNews() to see new features/changes/bug fixes.
## Loading required package: boot
## Attaching package: 'boot'
## The following object(s) are masked from 'package:lattice':
## 
## melanoma
## Loading required package: survival
## Loading required package: splines
## Attaching package: 'survival'
## The following object(s) are masked from 'package:boot':
## 
## aml
## Attaching package: 'DAAG'
## The following object(s) are masked from 'package:survival':
## 
## lung
## The following object(s) are masked from 'package:MASS':
## 
## hills
dim(nswdemo)
## [1] 722  10
d = with(nswdemo, mean(re75))
sem = with(nswdemo, d/sqrt(length(re75)))

The 95% confidence interval is
(d-2.31*sem, d+2.31*sem)=(2781.301,3304.492)

d = with(nswdemo, mean(re78))
sem = with(nswdemo, d/sqrt(length(re78)))

The 95% confidence interval is
(d-2.31*sem, d+2.31*sem)=(4985.705,5923.567)

Q2

par(mfrow = c(1, 2))
rn <- 1:100
plot(rn, qt(0.975, rn), type = "l")
plot(log(rn), qt(0.975, rn), type = "l", xaxt = "n")
axis(1, at = log(rn), labels = paste(rn))

plot of chunk unnamed-chunk-10

Figure 3.Plot of two-sided 95% critical values
The second graphs is better since itgives more information on how it changes with low degree of freedoms.

Q4

pvalue <- function(n, mean, sd) {
    x <- rnorm(n, mean, sd)
    t.test(x, mu = 0, alternative = "two.sided", conf.level = 0.95)$p.value
}
# generate 50 p-values
pv <- numeric()
for (i in 1:49) {
    pv[i] <- pvalue(10, 0, 1)
    pv <- c(pv, pv[i])
}
qqplot(x = qunif(ppoints(50)), pv, plot.it = T)

plot of chunk unnamed-chunk-11

Figure 5. The QQplot of p-values and a uniform random variables.
The graph shows consisitency between the generated p-values and the uniform random variables.

The article and EDA of Genes

1.

By saying 61 tests, the author didn't really mean 61 tests; it is a deduction that even there are 5% chance that the error may occur, the total number of errors wil accumulate thus increase the

# generate random numbers from a normal distribution with mean=0 sd=1
set.seed(123)
N = 200
rn = rnorm(N, 0, 1)
plot(density(rn))

plot of chunk unnamed-chunk-12

a = quantile(rn, 0.95)

Figure 6. Density plot of rnorm(200,0,1)
Now we get the 95% quantile of the random numbers, suppose the error happens beyond a=1.718657, in this case, the errors happened 200*5%=10 times; when N becomes larger, the frequency of the error will increase.

N = 20000
rn = rnorm(N, 0, 1)
plot(density(rn))

plot of chunk unnamed-chunk-13

a = quantile(rn, 0.95)

Figure 7. Density plot of rnorm(20000,0,1)
This case, the error happens when it overides 1.643612.

The statement could be rephrased as: By the time you repeated a tests N times, you cannot reject the null hypothesis by observing the significant result at certain confidence level.

2. EDA of Genes

9-tumors

library('R.matlab') 
## Warning: package 'R.matlab' was built under R version 2.15.1
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 2.15.1
## Loading required package: R.methodsS3
## Warning: package 'R.methodsS3' was built under R version 2.15.1
## R.methodsS3 v1.4.2 (2012-06-22) successfully loaded. See ?R.methodsS3 for
## help.
## R.oo v1.9.9 (2012-09-11) successfully loaded. See ?R.oo for help.
## Attaching package: 'R.oo'
## The following object(s) are masked from 'package:methods':
## 
## getClasses, getMethods
## The following object(s) are masked from 'package:base':
## 
## attach, detach, gc, load, save
## R.matlab v1.6.1 (2012-06-30) successfully loaded. See ?R.matlab for help.
## Attaching package: 'R.matlab'
## The following object(s) are masked from 'package:base':
## 
## getOption, isOpen
NINET=readMat('http://www.gems-system.org/datasets/9_Tumors.mat');
## Loading required package: R.utils
## Warning: package 'R.utils' was built under R version 2.15.1
## R.utils v1.16.2 (2012-09-12) successfully loaded. See ?R.utils for help.
## Attaching package: 'R.utils'
## The following object(s) are masked from 'package:R.matlab':
## 
## evaluate, getOption, isOpen, isOpen.default, setOption
## The following object(s) are masked from 'package:utils':
## 
## timestamp
## The following object(s) are masked from 'package:base':
## 
## cat, commandArgs, getOption, inherits, isOpen, lapply, parse, warnings
NINET=NINET[[1]]
dim(NINET)
## [1]   60 5727
class.label=NINET[,1];#extract the class label
NINET=NINET[,-1];
dim(NINET)
## [1]   60 5726
save("class.label","NINET",file="NINET.RData");

a=c(2,4,6,8); # collumns 
summary(NINET[,a])#the summary of the 2st, 4th, 6th, and 8th genes
##        V1              V2              V3             V4       
##  Min.   :-77.0   Min.   :-76.0   Min.   : -18   Min.   :  -42  
##  1st Qu.:-21.8   1st Qu.:-17.0   1st Qu.: 846   1st Qu.: 6611  
##  Median : -4.0   Median :  4.5   Median :1288   Median : 9814  
##  Mean   : -0.4   Mean   : 28.5   Mean   :1572   Mean   : 9027  
##  3rd Qu.: 17.0   3rd Qu.: 43.0   3rd Qu.:2142   3rd Qu.:11810  
##  Max.   :119.0   Max.   :528.0   Max.   :3682   Max.   :18919  
summary(c(NINET))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1320       7      41     236     131   19800 
b=c(100:150)
boxplot(NINET[,b],main='Boxplot for genes from 100:150 in column')

plot of chunk unnamed-chunk-14

Var <- var(NINET[,b]) # variance matrix
std <- apply(NINET[,b], 2, sd) # standard deviation of each column
plot(std, main='standard deviation of 100:150 of columns')

plot of chunk unnamed-chunk-14

theta = quantile(std,0.8) # output the 80% quantile of the std
# use theta as threthold to select genes
boxplot((NINET[,b[which(std<=theta)]])) # plot the 80% normal columns

plot of chunk unnamed-chunk-14

boxplot(c(NINET))#summary of the whole data

plot of chunk unnamed-chunk-14

plot(density(c(NINET)))

plot of chunk unnamed-chunk-14

plot(density(c(NINET)),xlim=c(-2000,20000))

plot of chunk unnamed-chunk-14


plot(apply(NINET,1,mean),type='b',pch='*')

plot of chunk unnamed-chunk-14

plot(apply(NINET[,1:500],2,mean),type='b',pch='*')

plot of chunk unnamed-chunk-14

Brain_Tumor1

BRAINT=readMat('http://www.gems-system.org/datasets/Brain_Tumor1.mat');
BRAINT=BRAINT[[1]]
dim(BRAINT)
## [1]   90 5921
class.label=BRAINT[,1];#extract the class label
BRAINT=BRAINT[,-1];
dim(BRAINT)
## [1]   90 5920
save("class.label","BRAINT",file="BRAINT.RData");

a=c(15,18,21,23); # collumns 
summary(BRAINT[,a])#the summary of the 15st, 18th, 21th, and 23th genes
##        V1              V2              V3               V4       
##  Min.   : 1133   Min.   :  502   Min.   :-455.0   Min.   : -204  
##  1st Qu.:10100   1st Qu.:15494   1st Qu.: -66.0   1st Qu.:   88  
##  Median :12530   Median :21103   Median : -12.5   Median :  274  
##  Mean   :13281   Mean   :22444   Mean   :  24.5   Mean   : 1302  
##  3rd Qu.:15996   3rd Qu.:28728   3rd Qu.:  46.2   3rd Qu.: 1678  
##  Max.   :28138   Max.   :45913   Max.   :1903.0   Max.   :11377  
summary(c(BRAINT))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -37800      29     198     728     611  207000 
b=c(300:350)
boxplot(BRAINT[,b])

plot of chunk unnamed-chunk-15

Var <- var(NINET[,b]) # variance matrix
std <- apply(NINET[,b], 2, sd) # standard deviation of each column
plot(std, main='standard deviation of 300:350 of columns')

plot of chunk unnamed-chunk-15

theta = quantile(std,0.8) # output the 80% quantile of the std
# use theta as threthold to select genes
boxplot((BRAINT[,b[which(std<=theta)]])) # plot the 80% normal columns

plot of chunk unnamed-chunk-15


boxplot(c(BRAINT))#summary of the whole data

plot of chunk unnamed-chunk-15

plot(density(c(BRAINT)))

plot of chunk unnamed-chunk-15

plot(density(c(BRAINT)),xlim=c(-2000,20000))

plot of chunk unnamed-chunk-15

plot(apply(BRAINT,1,mean),type='b',pch='*')

plot of chunk unnamed-chunk-15

plot(apply(BRAINT[,1:500],2,mean),type='b',pch='*')

plot of chunk unnamed-chunk-15