Data Science (Part I)

The Data Scientist's Toolbox

链接

R Programming

Overview and History of R

获得帮助

# Look up documentation; called with no topic, help() shows the help page for help itself.
help()
# Functional form of the `?` operator; `command` stands for the topic to look up.
`?`(command)
# When asking a question, include the following information
# R version details:
version
# Platform details of the running R session:
str(.Platform)

数据类型及基本运算

# Reserved words (cannot be used as variable names): FALSE Inf NA NaN NULL TRUE break else for function if in next repeat while
# Demonstrates how rep() recycles a vector depending on its `times` argument.
# x holds the integer sequence 1, 2, 3, 4
x <- seq_len(4)
# i holds four copies of 2: c(2,2,2,2)
i <- rep(2, times = 4)
# a scalar `times` repeats the whole vector: c(1,2,3,4,1,2,3,4)
y <- rep(x, times = 2)
# a vector `times` repeats element by element: c(1,1,2,2,3,3,4,4)
z <- rep(x, times = i)
# each element repeated as many times as its own value: c(1,2,2,3,3,3,4,4,4,4)
w <- rep(x, times = x)

截取数据

读取数据

# Show the current working directory.
getwd()
# Change the working directory.
# NOTE(review): shown without an argument; a real call needs a path, e.g. setwd("path/to/dir").
setwd()
# Read only the first 100 rows as a sample of the file.
initial <- read.table("datatable.txt", nrows = 100)
# Record the class of every column in that sample.
classes <- sapply(initial, class)
# Read the whole file, supplying the column classes found from the sample.
tabAll <- read.table("datatable.txt", colClasses = classes)

控制结构

## Template: if / else. <condition> is a placeholder for the test expression.
if(<condition>) {
        ## do something
} else {
        ## do something else
}
## Template: if / else if / else, for choosing among more than two branches.
## <condition1> and <condition2> are placeholders.
if(<condition1>) {
        ## do something
} else if(<condition2>)  {
        ## do something different
} else {
        ## do something different
}
# for loop: print each value of i from 1 to 10.
for (i in 1:10) {
    print(i)
}
# while loop: start at 0, print count and add 1 while count is below 10
# (prints 0 through 9).
count <- 0
while (count < 10) {
    print(count)
    count <- count + 1
}
# next: abandon the current iteration and move on to the following one.
for (i in 1:100) {
    if (i <= 20) {
        ## Skip the first 20 iterations
        next
    }
    ## Do something here
}

函数

## Template: create a function and bind it to the name f.
## <arguments> is a placeholder for the formal argument list.
f <- function(<arguments>) {
        ## Do something interesting
}

编程标准

范围规则

向量化操作

日期与时间

循环

lapply

# lapply() calls mean() on each element of the list and returns a list
# with the same names. Element b is random, so its mean varies between runs.
x <- list(a = 1:5, b = rnorm(10))
lapply(x, mean)
## $a
## [1] 3
##
## $b
## [1] 0.09485

# Arguments given after the function (min, max) are passed on to every
# runif() call. Each element of x becomes runif's first argument, so the
# k-th result holds k uniform draws between 0 and 10.
x <- 1:4
lapply(x, runif, min = 0, max = 10)
## [[1]]
## [1] 8.89
##
## [[2]]
## [1] 5.639 1.004
##
## [[3]]
## [1] 3.834 9.982 1.325
##
## [[4]]
## [1] 2.674 3.279 1.805 5.022

# An anonymous function can be supplied inline; this one extracts the
# first column of each matrix in the list.
x <- list(a = matrix(1:4, 2, 2), b = matrix(1:6, 3, 2))
lapply(x, function(elt) elt[, 1])
## $a
## [1] 1 2
##
## $b
## [1] 1 2 3

sapply

# sapply() works like lapply() but tries to simplify the result; every
# mean here is a single number, so a named numeric vector is returned.
x <- list(a = 1:4, b = rnorm(10), c = rnorm(20, 1), d = rnorm(100, 5))
sapply(x, mean)
##      a      b      c      d
## 2.5000 0.3861 1.0057 4.9518

vapply

replicate

rapply

apply

# The margin 1 applies quantile() to each of the 10 rows; probs is passed
# through to quantile(). Each call returns two values, so the result is a
# 2 x 10 matrix with one column per row of x.
x <- matrix(rnorm(50), 10, 5)
apply(x, 1, quantile, probs = c(0.25, 0.75))
##        [,1]    [,2]    [,3]    [,4]    [,5]   [,6]    [,7]     [,8]
## 25% -0.4016 -0.3194 -1.4145 -1.3172 -0.9657 -1.339 -0.9349 -0.07081
## 75%  0.5756  0.8038 -0.1331 -0.8333  0.3405 -0.174 -0.5182  1.01415
##        [,9]   [,10]
## 25% -0.1032 -0.1605
## 75%  0.1423  0.4605

# The margin c(1, 2) keeps the first two dimensions of the 2 x 2 x 10 array
# and averages over the third, giving a 2 x 2 matrix of means.
a <- array(rnorm(2 * 2 * 10), c(2, 2, 10))
apply(a, c(1, 2), mean)
##         [,1]     [,2]
## [1,] -0.1631  0.02116
## [2,]  0.2227 -0.02247

tapply

# x is three batches of 10 random values; gl(3, 10) builds a factor with
# 3 levels, each repeated 10 times, labelling the batches.
# tapply() then computes the mean of x within each level.
x <- c(rnorm(10), runif(10), rnorm(10, 1))
f <- gl(3, 10)
tapply(x, f, mean)
##       1       2       3
## -0.1923  0.5339  0.2723

by

split

# split() divides x into a list with one element per level of f;
# lapply() then takes the mean of each piece.
x <- c(rnorm(10), runif(10), rnorm(10, 1))
f <- gl(3, 10)
lapply(split(x, f), mean)
## $`1`
## [1] -0.3494
##
## $`2`
## [1] 0.3945
##
## $`3`
## [1] 1.113

# Splitting on a list of two factors groups by their combinations
# (named "<f1 level>.<f2 level>"). drop = TRUE omits combinations that
# contain no observations, leaving 6 groups here.
x <- rnorm(10)
f1 <- gl(2, 5)
f2 <- gl(5, 2)
str(split(x, list(f1, f2), drop = TRUE))
## List of 6
##  $ 1.1: num [1:2] -0.356 -0.175
##  $ 1.2: num [1:2] -0.527 0.394
##  $ 1.3: num 0.233
##  $ 2.3: num -0.166
##  $ 2.4: num [1:2] -0.96 -0.735
##  $ 2.5: num [1:2] -0.321 0.208

mapply

# Draw `n` normally distributed values with the given `mean` and `sd`.
# Thin wrapper that forwards its three arguments to rnorm() and returns
# the resulting numeric vector.
noise <- function(n, mean, sd) {
    draws <- rnorm(n = n, mean = mean, sd = sd)
    draws
}
# mapply() calls noise() once per position, taking n and mean from the two
# 1:5 vectors in parallel and recycling the single sd value 2.
mapply(noise, 1:5, 1:5, 2)
## [[1]]
## [1] 0.5623
##
## [[2]]
## [1] 3.762 1.500
##
## [[3]]
## [1] 2.163 5.055 2.814
##
## [[4]]
## [1] 0.1819 2.3478 6.1354 5.4761
##
## [[5]]
## [1]  5.61986  5.20469  3.63678 -0.06725  6.53935

# Equivalent to writing out the calls explicitly:

# list(noise(1, 1, 2), noise(2, 2, 2), noise(3, 3, 2), noise(4, 4, 2),
# noise(5, 5, 2))

eapply

模拟

# The four normal-distribution functions, shown as signatures with their
# default arguments (x, q, p and n stand for the caller's input).
# dnorm: density at x
dnorm(x, mean = 0, sd = 1, log = FALSE)
# pnorm: cumulative distribution function at q
pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
# qnorm: quantile function (inverse of pnorm) at probability p
qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
# rnorm: n random draws
rnorm(n, mean = 0, sd = 1)

调试

分析代码