Homework 3

Jiananduan

Chapter 3

Q4.

y <- rnorm(100)  # generate 100 random numbers form normal distr. (0,1)
mean(y)  # calculate the mean

## [1] 0.009775

sd(y)  # calculate the standard deviation

## [1] 1.08

av <- numeric()  # generate a numeric vector
for (i in 1:24) {
    # since 24 will be 24+1 in the last loop
    y <- rnorm(100)
    i <- i + 1
    av[i] <- mean(y)
    av <- c(av, av[i])  # endow the numbers into av
}
av <- av[2:26]  # since the first value is na
sd(av)

## [1] 0.1005

meanloop <- function(n, m) {
    # n is the random numbers generated from N(0,1) m is the times of
    # calculating the mean
    mv <- numeric()
    for (i in 1:m - 1) {
        y <- rnorm(n)
        i <- i + 1
        mv[i] <- mean(y)
        mv <- c(mv, mv[i])
    }
    mv <- mv[2:m]
    plot(density(mv), main = "density plot")
}
par(mfrow = c(2, 2))
for (i in 1:4) {
    i = i + 1
    meanloop(100 * i, 25)
}

plot of chunk unnamed-chunk-3

Figure 1. Density plots of various sample sizes

Q10.

For the chi-squared distribution with one degree of freedom

par(mfrow = c(1, 5))
a <- rchisq(10, 1)
qqnorm(a)
qqline(a)
a <- rchisq(30, 1)
qqnorm(a)
qqline(a)
a <- rchisq(50, 1)
qqnorm(a)
qqline(a)
a <- rchisq(100, 1)
qqnorm(a)
qqline(a)
a <- rchisq(1000, 1)
qqnorm(a)
qqline(a)

plot of chunk unnamed-chunk-4

Figure 2. Normal Porbability plots for various sample sizes from chi-squared distribution
The shape will be consistent once the sample size is bigger than 50.

For the t-distribution with one degree of freedom

par(mfrow = c(1, 5))
x <- rt(10, 1)
qqnorm(x)
qqline(x)
x <- rt(30, 1)
qqnorm(x)
qqline(x)
x <- rt(60, 1)
qqnorm(x)
qqline(x)
x <- rt(100, 1)
qqnorm(x)
qqline(x)
x <- rt(300, 1)
qqnorm(x)
qqline(x)

plot of chunk unnamed-chunk-5

Figure 3. Normal Probability plots for samples from t distribution
The shape will be consistent once the sample size is bigger than 30.

Q13.

a) Simulate 1000 values from the Markov chain.

# function that simulate from a Markov chain
simula <- function(trans, N) {
    transita <- function(char, trans) {
        sample(colnames(trans), 1, prob = trans[char, ])
    }

    sim <- character(N)
    sim[1] <- sample(colnames(trans), 1)
    for (i in 2:N) {
        sim[i] <- transita(sim[i - 1], trans)
    }

    sim
}
# simulate data from the Markov chain
Pb <- matrix(c(0.6, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.3, 0.3), ncol = 3, byrow = T)
colnames(Pb) <- c("Sun", "Cloud", "Rain")
row.names(Pb) <- c("Sun", "Cloud", "Rain")
N = 1000
sample <- simula(Pb, N)
# calculate the proportions and compare it with the theoritical ones
sunprob <- length(sample[sample == "Sun"])/N
cloudprob <- length(sample[sample == "Cloud"])/N
rainprob <- length(sample[sample == "Rain"])/N
# output
sunprob

## [1] 0.449

cloudprob

## [1] 0.278

rainprob

## [1] 0.273

The outputs are different with the theoretical proportions, however, the result is consistent with the theoretical proportions since the Sun proportion is bigger than the Cloud proportion and is bigger than Rain proportion.

library(zoo)

## Attaching package: 'zoo'

## The following object(s) are masked from 'package:base':
## 
## as.Date, as.Date.numeric

Markov <- function(N = 100, initial.value = 1, P) {
    X <- numeric(N)
    X[1] <- initial.value + 1  # States 0:5; subscripts 1:6
    n <- nrow(P)
    for (i in 2:N) {
        X[i] <- sample(1:n, size = 1, prob = P[X[i - 1], ])
    }
    X - 1
}

plotmarkov <- function(n = 10000, start = 0, window = 100, transition = Pb, 
    npanels = 5) {
    xc2 <- Markov(n, start, transition)
    mav0 <- rollmean(as.integer(xc2 == 0), window)
    mav1 <- rollmean(as.integer(xc2 == 1), window)
    npanel <- cut(1:length(mav0), breaks = seq(from = 1, to = length(mav0), 
        length = npanels + 1), include.lowest = TRUE)
    df <- data.frame(av0 = mav0, av1 = mav1, x = 1:length(mav0), gp = npanel)
    print(xyplot(av0 + av1 ~ x | gp, data = df, layout = c(1, npanels), type = "l", 
        par.strip.text = list(cex = 0.65), scales = list(x = list(relation = "free"))))
}
N = 10000
winodw = 100
Pb <- matrix(c(0.6, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.3, 0.3), ncol = 3, byrow = T)
plotmarkov(N, start = 0, window)

## Error: comparison (4) is possible only for atomic and list types

There is an error that I can't solve.

Chapter 4

Q1

library(DAAG)

## Warning: package 'DAAG' was built under R version 2.15.1

## Loading required package: MASS

## Loading required package: lattice

## Loading required package: latticeExtra

## Loading required package: RColorBrewer

## Loading required package: rpart

## Loading required package: randomForest

## randomForest 4.6-6

## Type rfNews() to see new features/changes/bug fixes.

## Loading required package: boot

## Attaching package: 'boot'

## The following object(s) are masked from 'package:lattice':
## 
## melanoma

## Loading required package: survival

## Loading required package: splines

## Attaching package: 'survival'

## The following object(s) are masked from 'package:boot':
## 
## aml

## Attaching package: 'DAAG'

## The following object(s) are masked from 'package:survival':
## 
## lung

## The following object(s) are masked from 'package:MASS':
## 
## hills

dim(nswdemo)

## [1] 722  10

d = with(nswdemo, mean(re75))
sem = with(nswdemo, d/sqrt(length(re75)))

The 95% confidence interval is
(d-2.31*sem, d+2.31*sem)=(2781.301,3304.492)

d = with(nswdemo, mean(re78))
sem = with(nswdemo, d/sqrt(length(re78)))

The 95% confidence interval is
(d-2.31*sem, d+2.31*sem)=(4985.705,5923.567)

Q2

par(mfrow = c(1, 2))
rn <- 1:100
plot(rn, qt(0.975, rn), type = "l")
plot(log(rn), qt(0.975, rn), type = "l", xaxt = "n")
axis(1, at = log(rn), labels = paste(rn))

plot of chunk unnamed-chunk-10

Figure 3.Plot of two-sided 95% critical values
The second graphs is better since itgives more information on how it changes with low degree of freedoms.

Q4

pvalue <- function(n, mean, sd) {
    x <- rnorm(n, mean, sd)
    t.test(x, mu = 0, alternative = "two.sided", conf.level = 0.95)$p.value
}
# generate 50 p-values
pv <- numeric()
for (i in 1:49) {
    pv[i] <- pvalue(10, 0, 1)
    pv <- c(pv, pv[i])
}
qqplot(x = qunif(ppoints(50)), pv, plot.it = T)

plot of chunk unnamed-chunk-11

Figure 5. The QQplot of p-values and a uniform random variables.
The graph shows consisitency between the generated p-values and the uniform random variables.

The article and EDA of Genes

1.

By saying 61 tests, the author didn't really mean 61 tests; it is a deduction that even there are 5% chance that the error may occur, the total number of errors wil accumulate thus increase the

# generate random numbers from a normal distribution with mean=0 sd=1
set.seed(123)
N = 200
rn = rnorm(N, 0, 1)
plot(density(rn))

plot of chunk unnamed-chunk-12

a = quantile(rn, 0.95)

Figure 6. Density plot of rnorm(200,0,1)
Now we get the 95% quantile of the random numbers, suppose the error happens beyond a=1.718657, in this case, the errors happened 200*5%=10 times; when N becomes larger, the frequency of the error will increase.

N = 20000
rn = rnorm(N, 0, 1)
plot(density(rn))

plot of chunk unnamed-chunk-13

a = quantile(rn, 0.95)

Figure 7. Density plot of rnorm(20000,0,1)
This case, the error happens when it overides 1.643612.

The statement could be rephrased as: By the time you repeated a tests N times, you cannot reject the null hypothesis by observing the significant result at certain confidence level.

2. EDA of Genes

9-tumors

library('R.matlab')

## Warning: package 'R.matlab' was built under R version 2.15.1

## Loading required package: R.oo

## Warning: package 'R.oo' was built under R version 2.15.1

## Loading required package: R.methodsS3

## Warning: package 'R.methodsS3' was built under R version 2.15.1

## R.methodsS3 v1.4.2 (2012-06-22) successfully loaded. See ?R.methodsS3 for
## help.

## R.oo v1.9.9 (2012-09-11) successfully loaded. See ?R.oo for help.

## Attaching package: 'R.oo'

## The following object(s) are masked from 'package:methods':
## 
## getClasses, getMethods

## The following object(s) are masked from 'package:base':
## 
## attach, detach, gc, load, save

## R.matlab v1.6.1 (2012-06-30) successfully loaded. See ?R.matlab for help.

## Attaching package: 'R.matlab'

## The following object(s) are masked from 'package:base':
## 
## getOption, isOpen

NINET=readMat('http://www.gems-system.org/datasets/9_Tumors.mat');

## Loading required package: R.utils

## Warning: package 'R.utils' was built under R version 2.15.1

## R.utils v1.16.2 (2012-09-12) successfully loaded. See ?R.utils for help.

## Attaching package: 'R.utils'

## The following object(s) are masked from 'package:R.matlab':
## 
## evaluate, getOption, isOpen, isOpen.default, setOption

## The following object(s) are masked from 'package:utils':
## 
## timestamp

## The following object(s) are masked from 'package:base':
## 
## cat, commandArgs, getOption, inherits, isOpen, lapply, parse, warnings

NINET=NINET[[1]]
dim(NINET)

## [1]   60 5727

class.label=NINET[,1];#extract the class label
NINET=NINET[,-1];
dim(NINET)

## [1]   60 5726

save("class.label","NINET",file="NINET.RData");

a=c(2,4,6,8); # collumns 
summary(NINET[,a])#the summary of the 2st, 4th, 6th, and 8th genes

##        V1              V2              V3             V4       
##  Min.   :-77.0   Min.   :-76.0   Min.   : -18   Min.   :  -42  
##  1st Qu.:-21.8   1st Qu.:-17.0   1st Qu.: 846   1st Qu.: 6611  
##  Median : -4.0   Median :  4.5   Median :1288   Median : 9814  
##  Mean   : -0.4   Mean   : 28.5   Mean   :1572   Mean   : 9027  
##  3rd Qu.: 17.0   3rd Qu.: 43.0   3rd Qu.:2142   3rd Qu.:11810  
##  Max.   :119.0   Max.   :528.0   Max.   :3682   Max.   :18919

summary(c(NINET))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -1320       7      41     236     131   19800

b=c(100:150)
boxplot(NINET[,b],main='Boxplot for genes from 100:150 in column')