R

  1. 這部分來探討R最基礎的資料結構- Atomic Vector ,再討論以原子向量為延伸,加入 dim 屬性的 Array 。接著討論屬於異質資料的 List 與 Data Frame。

  2. 原子向量(atomic vector)又稱為 「同質性向量(homogeneous)」

v <- c(TRUE, FALSE)
class(v)
## [1] "logical"
v
## [1]  TRUE FALSE
v <- c(TRUE, FALSE, 100L)
class(v)
## [1] "integer"
v
## [1]   1   0 100
v <- c(TRUE, FALSE, 100L, 100)
class(v)
## [1] "numeric"
v
## [1]   1   0 100 100
v <- c(TRUE, FALSE, 100L, 100, 100 + 0i)
class(v)
## [1] "complex"
v
## [1]   1+0i   0+0i 100+0i 100+0i 100+0i
v <- c(TRUE, FALSE, 100L, 100, "100")
class(v)
## [1] "character"
v
## [1] "TRUE"  "FALSE" "100"   "100"   "100"
  1. R沒有 純量(scalar),純量只不過是長度為1的向量
?vector # 查詢vector函數
vector() # 空向量(empty vector)
## logical(0)
vector(mode = "logical", length = 5)  # 各個不同類別型態的向量初始化
## [1] FALSE FALSE FALSE FALSE FALSE
vector(mode = "numeric", length = 5)  # 注意向量元素的內容
## [1] 0 0 0 0 0
vector(mode = "integer", length = 5)
## [1] 0 0 0 0 0
vector(mode = "character", length = 5)
## [1] "" "" "" "" ""
vector(mode = "complex", length = 5)
## [1] 0+0i 0+0i 0+0i 0+0i 0+0i
  1. R裡面所有的操作背後都有一個函數
  1. 物件命名規則:名稱不可以數字開頭
# NOT RUN
# 1x <- 100
# 錯誤: unexpected symbol in "1x"
  1. 名稱不可以『_』開頭
# _x <- 100
# 錯誤: unexpected input in "_"
  1. 名稱不可含空白字元
# x y <- 100
# 錯誤: unexpected symbol in "x y"
  1. 名稱不可為保留字(reserved words)。可用?reserved查詢所有R的保留字。

另外,如用dot開頭(ex:.xx)命名時,變數會被創建,但物件名不會出現在Global Environment中;可將 ls() 的參數 all.names 設定為 TRUE 來列出這類物件。

.x <- 100
ls() # 看不到.x
## [1] "v"
ls(all.names = TRUE) # 可看見.x
## [1] ".x" "v"

如要打破上述命名規則,可使用成對『``』將名字放在其中:

`1x` <- 100
`1x`
## [1] 100
`_x` <- 100
`_x`
## [1] 100
`:)` <- 100
`:)`
## [1] 100
`x y` <- 100
`x y`
## [1] 100

常見的R指令(如二元運算子),背後也都有一個函數作支援:

10 > 2
## [1] TRUE
`>`(10, 2) # 指令與上行相同
## [1] TRUE
x <- 100
x
## [1] 100
`<-`(x, 100) # 指令與上行相同
x
## [1] 100
  1. R的 向量 沒有 維度(dimension) 這個屬性
dim(x = 1:10)  ## NULL: 代表『不存在』意義的一個R物件
## NULL
dim(x = vector()) # 即使空向量也是
## NULL
  1. R裡面所有的東西都是 物件,包含函數。
class(x = rnorm)   # function物件
## [1] "function"

Atomic Vector

numeric

100 # 數字100
## [1] 100
class(100) # 查詢物件的類別型態
## [1] "numeric"
rnorm(n = 10, mean = 5, sd = 2) # 產生常態分配的亂數
##  [1] 5.4524949 5.4489767 0.4888329 4.1245115 5.7619111 4.5372758 1.6190812
##  [8] 4.8013371 7.4468476 5.8080104
1e-3
## [1] 0.001
1E-3 # 大寫E與小寫e皆可
## [1] 0.001
class(x = 2.78e-3)
## [1] "numeric"

integer

x <- 1:10
length(x = x)
## [1] 10
dim(x = x)
## NULL
class(x = x)
## [1] "integer"
1L
## [1] 1
class(x = 1L)
## [1] "integer"
1:10; 10:1  # 加;號,簡短程式可寫在同一行
##  [1]  1  2  3  4  5  6  7  8  9 10
##  [1] 10  9  8  7  6  5  4  3  2  1

logical

TRUE
## [1] TRUE
FALSE
## [1] FALSE
c(T, T, F, F) # 縮寫
## [1]  TRUE  TRUE FALSE FALSE

character (character string)

"A"
## [1] "A"
'A'
## [1] "A"
c("A", "B", "C")
## [1] "A" "B" "C"
"" # 空字串
## [1] ""
# 常用的字元字串向量
letters
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
## [20] "t" "u" "v" "w" "x" "y" "z"
LETTERS
##  [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S"
## [20] "T" "U" "V" "W" "X" "Y" "Z"
month.abb
##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
month.name
##  [1] "January"   "February"  "March"     "April"     "May"       "June"     
##  [7] "July"      "August"    "September" "October"   "November"  "December"

complex

clx1 <- 1 + 1i
class(clx1)
## [1] "complex"
clx2 <- 1 + 0i
class(clx2)
## [1] "complex"
# 1 + 1*i
# not run

\(\sqrt{-1}\) :

sqrt(-1 + 0i)  # 可得虛數i
## [1] 0+1i

NA

not available 的縮寫,代表『遺失值』意義。

class(NA)
## [1] "logical"
class(NA_character_); class(NA_complex_); class(NA_integer_); class(NA_real_)
## [1] "character"
## [1] "complex"
## [1] "integer"
## [1] "numeric"
length(NA)
## [1] 1
x <- c(1, 2, NA, 4, NA)
x == NA
## [1] NA NA NA NA NA
# 判斷是否為NA,請用is.na()
is.na(x)
## [1] FALSE FALSE  TRUE FALSE  TRUE

NULL

代表『不存在』意義的特殊物件。

class(NULL)
## [1] "NULL"
length(NULL)
## [1] 0
NULL == NULL
## logical(0)
NULL == NA
## logical(0)
NULL == 0
## logical(0)
NULL == NULL
## logical(0)
is.null(NULL)
## [1] TRUE

Inf

class(Inf)
## [1] "numeric"
print(1/0)
## [1] Inf
print(log(0))
## [1] -Inf
1 + 1/0
## [1] Inf
Inf == Inf
## [1] TRUE
Inf == -Inf
## [1] FALSE
is.infinite(-Inf)
## [1] TRUE
is.infinite(Inf)
## [1] TRUE
is.finite(Inf)
## [1] FALSE

NaN

  • Not a Number(NaN)
NaN
## [1] NaN
1/0 - 1/0
## [1] NaN
sqrt(-1)
## Warning in sqrt(-1): 產生了 NaNs
## [1] NaN
log(-1)
## Warning in log(-1): 產生了 NaNs
## [1] NaN
class(NaN)
## [1] "numeric"
x <- c(1, 3, 1, NaN, 5, 4)
is.nan(x)
## [1] FALSE FALSE FALSE  TRUE FALSE FALSE

常見的屬性(attributes)與索引操作(subsetting)

names

x <- 1:5
x
## [1] 1 2 3 4 5
names(x)
## NULL
names(x) <- c("A", "B", "C", "D", "E")
x
## A B C D E 
## 1 2 3 4 5

dim

由之前的範例可知,R的向量不具有維度(dim)屬性

dim(x)
## NULL

值得注意的是,長度(length())並不算屬性

subsetting for [

positive vector
v <- c(1, 4, 5, 2, 8)
length(v)
## [1] 5
v[1]
## [1] 1
v[3]
## [1] 5
v[1:3]
## [1] 1 4 5
# v[1,3, 3]
v[c(1, 3, 5)]
## [1] 1 5 8
v[7]
## [1] NA
v[4:8]
## [1]  2  8 NA NA NA
character vector
v <- c(1, 4, 5, 2, 8)
names(v) <- letters[1:5]
v
## a b c d e 
## 1 4 5 2 8
v["a"]
## a 
## 1
v[c("a", "c", "e")]
## a c e 
## 1 5 8
negative vector
v <- 1:5
length(v)
## [1] 5
v[-1]
## [1] 2 3 4 5
v[-c(1, 3, 5)]
## [1] 2 4
v[-(1:3)]
## [1] 4 5
v[-8]   # 欲刪除之index不存在
## [1] 1 2 3 4 5
logical
v <- 1:5
v[c(T, T, F, F, T)]
## [1] 1 2 5
NULL
v <- 1:5
v[NULL]
## integer(0)
nothing
v <- 1:5
v[]
## [1] 1 2 3 4 5
zero
v <- 1:4
v[0]
## integer(0)
w <- letters[1:5]
w[0]
## character(0)
NA
v <- 1:5
v[NA]
## [1] NA NA NA NA NA
positive + negative
# v <- 1:4
# v[c(1, -3, 4)]
# Error in v[c(1, -3, 4)] : 只有負數下標中才能有 0
positive + NA
v <- 1:5
v[c(1, 2, NA, 4)]
## [1]  1  2 NA  4
negative + zero
v <- 1:5
v[c(-1, -2, 0, -4)]
## [1] 3 5
positive + zero
v <- 1:5
v[c(0, 1, 2)]
## [1] 1 2
v[c(1, 0, 2)]
## [1] 1 2
v[c(1, 2, 0)]
## [1] 1 2
negative + zero
v <- 1:5
v[c(0, -1, -2)]
## [1] 3 4 5
positive + NULL
v <- 1:5
v[NULL]
## integer(0)
v[c(NULL, 1, 3)]
## [1] 1 3
negative + NA
# v[c(-1, -2, NA)]
# Error in v[c(-1, -2, NA)] : 只有負數下標中才能有 0

案例補充 – 集合操作

set.seed(seed = 1000) # 設定亂數種子
d <- rnorm(n = 100, mean = 10, sd = 5)
head(d) # 觀察前6筆資料
## [1]  7.771109  3.970717 10.205632 13.196942  6.067228  8.072554
tail(d) # 觀察後6筆資料
## [1]  9.478942 12.339197 12.219604 14.142764  8.064749 20.094691
tail(d, n = 10)  # 可透過參數n設定,觀察更多筆資料
##  [1] 19.857662  0.395024 12.310630  9.196380  9.478942 12.339197 12.219604
##  [8] 14.142764  8.064749 20.094691
names(d)
## NULL
names(d) <- sample(x = c(letters, 1:9), size = 100, replace = TRUE) # 設定names屬性

透過R集合(set)相關的函數操作,可將向量視為集合,如集合的交集、聯集與差集等。

?setdiff  # 查詢相關集合函數
setequal(x = c(1, 1, 2, 3, 3, 3), y = c(1, 3, 2)) # 集合的比較
## [1] TRUE
x <- names(d)  # 抓出每個樣本點的名稱
x              # 注意:名稱會有重複的現象。
##   [1] "6" "t" "w" "1" "w" "b" "p" "r" "e" "7" "p" "f" "2" "h" "s" "x" "5" "g"
##  [19] "y" "3" "9" "t" "s" "r" "e" "f" "u" "2" "t" "d" "9" "e" "9" "w" "a" "a"
##  [37] "m" "v" "2" "u" "i" "8" "h" "a" "a" "b" "a" "7" "9" "u" "g" "z" "8" "z"
##  [55] "s" "1" "o" "h" "s" "o" "v" "h" "v" "f" "z" "x" "7" "n" "c" "d" "7" "7"
##  [73] "e" "5" "p" "y" "y" "v" "d" "o" "r" "r" "p" "z" "c" "e" "r" "g" "m" "6"
##  [91] "i" "c" "f" "6" "q" "t" "2" "o" "l" "n"
setdiff(x = x, y = as.character(1:9)) # 以集合的方式扣除掉數字名後得出所有英文字母的名稱,不會有重複
##  [1] "t" "w" "b" "p" "r" "e" "f" "h" "s" "x" "g" "y" "u" "d" "a" "m" "v" "i" "z"
## [20] "o" "n" "c" "q" "l"
d[setdiff(x = x, y = as.character(1:9))]
##          t          w          b          p          r          e          f 
##  3.9707172 10.2056316  8.0725535  7.6206606 13.5987535  9.9074719  7.2275565 
##          h          s          x          g          y          u          d 
##  9.3956384  3.3197948 10.8502874 10.1246593 -0.2329271  1.0807793 16.1046783 
##          a          m          v          i          z          o          n 
##  1.1690069  8.1690966 15.2880059  7.4134678  4.5265315  1.0483853 18.0960436 
##          c          q          l 
##  5.2153616  9.4789423  8.0647494

注意:以上做法會有問題。因名稱會有重複的現象,所以當你用character vector subsetting的方式按名稱取值時,只會抓出具相同名稱的樣本點中的第一個值,因此上述結果中每個名稱只會對應到一個數值。建議:雖然R允許使用重複的名稱,但我們不建議這麼做。

nms <- setdiff(x = x, y = as.character(1:9)) # 這是我們要抓的名單
# 用is.element()一個個去檢查每一個樣本點是否在我們的名單(set)中(採logical vector subsetting)
d[is.element(el = names(d), set = nms)]     
##          t          w          w          b          p          r          e 
##  3.9707172 10.2056316  6.0672282  8.0725535  7.6206606 13.5987535  9.9074719 
##          p          f          h          s          x          g          y 
##  5.0878609  7.2275565  9.3956384  3.3197948 10.8502874 10.1246593 -0.2329271 
##          t          s          r          e          f          u          t 
##  3.8649200 14.1712367 12.6628587  6.7658752 13.0158063  1.0807793 12.8048786 
##          d          e          w          a          a          m          v 
## 16.1046783 13.4971476  7.6742453  1.1690069 10.9464430  8.1690966 15.2880059 
##          u          i          h          a          a          b          a 
##  3.2582047  7.4134678 10.9273251  9.7815428  8.9204331 17.3188767 11.1483332 
##          u          g          z          z          s          o          h 
##  5.1590856 11.2585569  4.5265315  5.0184900 10.5028901  1.0483853 11.5585061 
##          s          o          v          h          v          f          z 
## 22.7699400  5.6958112 12.7196422  8.0383098 16.1772095 15.9804322  7.5212655 
##          x          n          c          d          e          p          y 
##  8.5282939 18.0960436  5.2153616 10.2061856 11.4272881  9.2032820  7.6954055 
##          y          v          d          o          r          r          p 
## 10.8421904 16.9774651 13.6421313 11.6754497 15.8463825 11.2398341  8.2092526 
##          z          c          e          r          g          m          i 
## 16.9174666 12.0603458  9.3849607  9.6688534 -1.6124544  4.7717175 19.8576619 
##          c          f          q          t          o          l          n 
##  0.3950240 12.3106304  9.4789423 12.3391970 14.1427640  8.0647494 20.0946908
d[is.element(el = names(d), set = c("A", "B", "100"))]    # 會取出空向量
## named numeric(0)
d[is.element(el = names(d), set = c(nms, "1", "2", "3"))] # 名單添加1號, 2號, 3號   
##          t          w          1          w          b          p          r 
##  3.9707172 10.2056316 13.1969420  6.0672282  8.0725535  7.6206606 13.5987535 
##          e          p          f          2          h          s          x 
##  9.9074719  5.0878609  7.2275565 10.6069059  9.3956384  3.3197948 10.8502874 
##          g          y          3          t          s          r          e 
## 10.1246593 -0.2329271 11.0657705  3.8649200 14.1712367 12.6628587  6.7658752 
##          f          u          2          t          d          e          w 
## 13.0158063  1.0807793 11.6747108 12.8048786 16.1046783 13.4971476  7.6742453 
##          a          a          m          v          2          u          i 
##  1.1690069 10.9464430  8.1690966 15.2880059  6.2918927  3.2582047  7.4134678 
##          h          a          a          b          a          u          g 
## 10.9273251  9.7815428  8.9204331 17.3188767 11.1483332  5.1590856 11.2585569 
##          z          z          s          1          o          h          s 
##  4.5265315  5.0184900 10.5028901 14.7684014  1.0483853 11.5585061 22.7699400 
##          o          v          h          v          f          z          x 
##  5.6958112 12.7196422  8.0383098 16.1772095 15.9804322  7.5212655  8.5282939 
##          n          c          d          e          p          y          y 
## 18.0960436  5.2153616 10.2061856 11.4272881  9.2032820  7.6954055 10.8421904 
##          v          d          o          r          r          p          z 
## 16.9774651 13.6421313 11.6754497 15.8463825 11.2398341  8.2092526 16.9174666 
##          c          e          r          g          m          i          c 
## 12.0603458  9.3849607  9.6688534 -1.6124544  4.7717175 19.8576619  0.3950240 
##          f          q          t          2          o          l          n 
## 12.3106304  9.4789423 12.3391970 12.2196041 14.1427640  8.0647494 20.0946908

Array

當Vector擁有維度(dimension)之後,其類別型態變為 『陣列(Array)』。當其維度為『1維』時,其為『一維陣列』。維度為『2維』時,其 特稱『矩陣(Matrix)』 ,其他情況,如『3維以上』亦皆為『陣列』。

因為陣列(含矩陣)其來源為同質性的Atomic Vector,故陣列(含矩陣)亦為 同質性 的資料型態。

v <- 1:30
dim(v)
## NULL
class(v)
## [1] "integer"
dim(v) <- 30
class(v) # 為『1維陣列』
## [1] "array"
v
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30
dim(v) <- c(5, 6)
class(v) # 為矩陣
## [1] "matrix" "array"
v
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    6   11   16   21   26
## [2,]    2    7   12   17   22   27
## [3,]    3    8   13   18   23   28
## [4,]    4    9   14   19   24   29
## [5,]    5   10   15   20   25   30
dim(v) <- c(5, 3, 2)
class(v) # 3維陣列
## [1] "array"
dim(v) <- NULL # 失去維度後,會變為原來同質性的integer vector
v 
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30

Matrix

v1 <- 1:20
dim(v1) <- c(5, 4)
v1
##      [,1] [,2] [,3] [,4]
## [1,]    1    6   11   16
## [2,]    2    7   12   17
## [3,]    3    8   13   18
## [4,]    4    9   14   19
## [5,]    5   10   15   20
v2 <- matrix(data = 1:20, nrow = 5, ncol = 4) # by column-order
v2 # v1與v2內容相同,但做法不同。
##      [,1] [,2] [,3] [,4]
## [1,]    1    6   11   16
## [2,]    2    7   12   17
## [3,]    3    8   13   18
## [4,]    4    9   14   19
## [5,]    5   10   15   20
v3 <- matrix(data = 1:20, nrow = 5, ncol = 4, byrow = TRUE) # NOT by column-order
v3
##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    5    6    7    8
## [3,]    9   10   11   12
## [4,]   13   14   15   16
## [5,]   17   18   19   20

值得注意的是,針對一個矩陣求取長度(length):

length(v3) # 回傳向量的長度
## [1] 20

由此可知,向量與矩陣的差別在於是否具有維度屬性。

List

『異質性(heterogeneous)向量』,亦不具『維度』。

list建立

# ?list
l1 <- list(TRUE, 1L, 1, "1", list(100, 200), mean, median, sd)
class(l1)
## [1] "list"
length(l1)
## [1] 8
dim(l1)
## NULL
print(l1)
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] 1
## 
## [[3]]
## [1] 1
## 
## [[4]]
## [1] "1"
## 
## [[5]]
## [[5]][[1]]
## [1] 100
## 
## [[5]][[2]]
## [1] 200
## 
## 
## [[6]]
## function (x, ...) 
## UseMethod("mean")
## <bytecode: 0x7fe86618d588>
## <environment: namespace:base>
## 
## [[7]]
## function (x, na.rm = FALSE, ...) 
## UseMethod("median")
## <bytecode: 0x7fe8624b1fb8>
## <environment: namespace:stats>
## 
## [[8]]
## function (x, na.rm = FALSE) 
## sqrt(var(if (is.vector(x) || is.factor(x)) x else as.double(x), 
##     na.rm = na.rm))
## <bytecode: 0x7fe8606c96e8>
## <environment: namespace:stats>

list subsetting

[

回傳 (子清單)sub-list

l <-  list(TRUE, 1L, 1, "1", list(100, 200), matrix(1:20, nrow = 5, ncol = 4), mean)
l[5]
## [[1]]
## [[1]][[1]]
## [1] 100
## 
## [[1]][[2]]
## [1] 200
class(l[6])
## [1] "list"

[[

回傳list中元素的內容物,內容物是什麼,提領出來就是什麼。

l <-  list(TRUE, 1L, 1, "1", list(100, 200), matrix(1:20, nrow = 5, ncol = 4), mean)
l[6]
## [[1]]
##      [,1] [,2] [,3] [,4]
## [1,]    1    6   11   16
## [2,]    2    7   12   17
## [3,]    3    8   13   18
## [4,]    4    9   14   19
## [5,]    5   10   15   20
class(l[[6]])
## [1] "matrix" "array"
l[[7]]
## function (x, ...) 
## UseMethod("mean")
## <bytecode: 0x7fe86618d588>
## <environment: namespace:base>
class(l[[7]])
## [1] "function"
l[[7]](1:20)
## [1] 10.5

$

當list具備 『names』 屬性時即可使用$。功能類似[[,但不完全相同。且在互動模式下使用便利。

l <- list(A = 100, B = 200, E = 300)
l[["A"]]
## [1] 100
l$A
## [1] 100
l <- list(A = 100, B = 200, E = 300)
x <- "A"
l[[x]]
## [1] 100
l$x
## NULL

因為R會將程式 l$x 視為 l[["x"]] (以字面名稱 "x" 取值,而非變數 x 的內容 "A"),因 l 內並無元素命名為 x ,故回傳NULL

Data Frame

# ?data.frame
df <- data.frame(A = 10:1, B = rnorm(n = 10), C = runif(n = 10))
df
##     A          B          C
## 1  10 -0.3615950 0.82076792
## 2   9 -1.1619680 0.20936290
## 3   8 -0.7114164 0.76743166
## 4   7  0.3489709 0.67099722
## 5   6  0.4273664 0.04625896
## 6   5  1.6608471 0.75360079
## 7   4  0.5816135 0.40778331
## 8   3  0.1434655 0.37252915
## 9   2  0.3433031 0.25478193
## 10  1 -0.9314912 0.39000806
class(df)
## [1] "data.frame"
typeof(df) # 可確認data frame的內部型態本質為list
## [1] "list"

data frame的特性有些來自matrix,有些來自list。

from Matrix

dim(df)
## [1] 10  3
dimnames(df)
## [[1]]
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
## 
## [[2]]
## [1] "A" "B" "C"
colnames(df)
## [1] "A" "B" "C"
rownames(df)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
df[, "A"]
##  [1] 10  9  8  7  6  5  4  3  2  1
df[2, ]
##   A         B         C
## 2 9 -1.161968 0.2093629
df["2", ]
##   A         B         C
## 2 9 -1.161968 0.2093629
df[8, 1]
## [1] 3
df["8", 1]
## [1] 3
df["8", "A"]
## [1] 3
df[8, "A"]
## [1] 3

from List

length(df)
## [1] 3
names(df)
## [1] "A" "B" "C"
df$A
##  [1] 10  9  8  7  6  5  4  3  2  1
df$A[8]
## [1] 3
df[["A"]][8]
## [1] 3
df[[c(1, 8)]] # recursive subsetting。分兩層提取資料,第一層先抓第1個元素,進去第二層資料後提領第8個。
## [1] 3

Tibble

library(tidyverse)   # library(tibble)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
tibble(x = letters)
## # A tibble: 26 × 1
##    x    
##    <chr>
##  1 a    
##  2 b    
##  3 c    
##  4 d    
##  5 e    
##  6 f    
##  7 g    
##  8 h    
##  9 i    
## 10 j    
## # … with 16 more rows
tibble(x = 1:3, y = list(1:5, 1:10, 1:20))
## # A tibble: 3 × 2
##       x y         
##   <int> <list>    
## 1     1 <int [5]> 
## 2     2 <int [10]>
## 3     3 <int [20]>
tribble(
  ~x, ~y, ~z,
  #--|--|----
  "a", 2, 3.6,
  "b", 1, 8.5
)
## # A tibble: 2 × 3
##   x         y     z
##   <chr> <dbl> <dbl>
## 1 a         2   3.6
## 2 b         1   8.5
as_tibble(iris)
## # A tibble: 150 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows
  • It evaluates its arguments lazily and sequentially:
tibble(x = 1:5, y = x ^ 2)
## # A tibble: 5 × 2
##       x     y
##   <int> <dbl>
## 1     1     1
## 2     2     4
## 3     3     9
## 4     4    16
## 5     5    25
tibble(
  x = 1:5, 
  y = 1, 
  z = x ^ 2 + y
)
## # A tibble: 5 × 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     1     1     2
## 2     2     1     5
## 3     3     1    10
## 4     4     1    17
## 5     5     1    26

Data Import and Export

titanic <- read_csv("titanic.csv")
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
titanic
## # A tibble: 891 × 12
##    PassengerId Survived Pclass Name   Sex     Age SibSp Parch Ticket  Fare Cabin
##          <dbl>    <dbl>  <dbl> <chr>  <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
##  1           1        0      3 Braun… male     22     1     0 A/5 2…  7.25 <NA> 
##  2           2        1      1 Cumin… fema…    38     1     0 PC 17… 71.3  C85  
##  3           3        1      3 Heikk… fema…    26     0     0 STON/…  7.92 <NA> 
##  4           4        1      1 Futre… fema…    35     1     0 113803 53.1  C123 
##  5           5        0      3 Allen… male     35     0     0 373450  8.05 <NA> 
##  6           6        0      3 Moran… male     NA     0     0 330877  8.46 <NA> 
##  7           7        0      1 McCar… male     54     0     0 17463  51.9  E46  
##  8           8        0      3 Palss… male      2     3     1 349909 21.1  <NA> 
##  9           9        1      3 Johns… fema…    27     0     2 347742 11.1  <NA> 
## 10          10        1      2 Nasse… fema…    14     1     0 237736 30.1  <NA> 
## # … with 881 more rows, and 1 more variable: Embarked <chr>
write_csv(titanic, file = "titanic_R.csv")

Compared to base R

  • They are typically much faster (~10x) than their base equivalents. Long running jobs have a progress bar, so you can see what’s happening. If you’re looking for raw speed, try data.table::fread(). It doesn’t fit quite so well into the tidyverse, but it can be quite a bit faster.

  • They produce tibbles, they don’t convert character vectors to factors, use row names, or munge the column names. These are common sources of frustration with the base R functions.

  • They are more reproducible. Base R functions inherit some behaviour from your operating system and environment variables, so import code that works on your computer might not work on someone else’s.

Tibbles vs Data Frames

  • Printing

    • When you print a tibble, it only shows the first ten rows and all the columns that fit on one screen.

    • It also prints an abbreviated description of the column type, and uses font styles and color for highlighting:

tibble(x = -5:100, y = 123.456 * (3 ^ x))
## # A tibble: 106 × 2
##        x         y
##    <int>     <dbl>
##  1    -5     0.508
##  2    -4     1.52 
##  3    -3     4.57 
##  4    -2    13.7  
##  5    -1    41.2  
##  6     0   123.   
##  7     1   370.   
##  8     2  1111.   
##  9     3  3333.   
## 10     4 10000.   
## # … with 96 more rows
  • Subsetting

    • [ always returns another tibble. Contrast this with a data frame: sometimes [ returns a data frame and sometimes it just returns a vector:
df1 <- data.frame(x = 1:3, y = 3:1)
class(df1[, 1:2])
## [1] "data.frame"
class(df1[, 1])
## [1] "integer"
df2 <- tibble(x = 1:3, y = 3:1)
class(df2[, 1:2])
## [1] "tbl_df"     "tbl"        "data.frame"
class(df2[, 1])
## [1] "tbl_df"     "tbl"        "data.frame"

To extract a single column use [[ or $:

class(df2[[1]])
## [1] "integer"
class(df2$x)
## [1] "integer"

Tibbles are also stricter with $. Tibbles never do partial matching, and will throw a warning and return NULL if the column does not exist:

df <- data.frame(abc = 1)
df$a
## [1] 1
df2 <- tibble(abc = 1)
df2$a
## Warning: Unknown or uninitialised column: `a`.
## NULL

However, tibbles respect the drop argument if it is provided:

data.frame(a = 1:3)[, "a", drop = TRUE]
## [1] 1 2 3
tibble(a = 1:3)[, "a", drop = TRUE]
## [1] 1 2 3

Tibbles do not support row names. They are removed when converting to a tibble or when subsetting:

df <- data.frame(a = 1:3, row.names = letters[1:3])
rownames(df)
## [1] "a" "b" "c"
rownames(as_tibble(df))
## [1] "1" "2" "3"
tbl <- tibble(a = 1:3)
rownames(tbl) <- letters[1:3]
## Warning: Setting row names on a tibble is deprecated.
rownames(tbl)
## [1] "a" "b" "c"
rownames(tbl[1, ])
## [1] "1"

See vignette("invariants") for a detailed comparison between tibbles and data frames.

  • Recycling

    • When constructing a tibble, only values of length 1 are recycled.

    • The first column with length different to one determines the number of rows in the tibble, conflicts lead to an error:

tibble(a = 1, b = 1:3)
## # A tibble: 3 × 2
##       a     b
##   <dbl> <int>
## 1     1     1
## 2     1     2
## 3     1     3
tibble(a = 1:3, b = 1)
## # A tibble: 3 × 2
##       a     b
##   <int> <dbl>
## 1     1     1
## 2     2     1
## 3     3     1
# tibble(a = 1:3, c = 1:2)
# Error:
# ! Tibble columns must have compatible sizes.
# • Size 3: Existing data.
# • Size 2: Column `c`.
# ℹ Only values of size one are recycled.


This also extends to tibbles with zero rows, which is sometimes important for programming:

tibble(a = 1, b = integer())
## # A tibble: 0 × 2
## # … with 2 variables: a <dbl>, b <int>
tibble(a = integer(), b = 1)
## # A tibble: 0 × 2
## # … with 2 variables: a <int>, b <dbl>
  • Arithmetic operations

    • Unlike data frames, tibbles don’t support arithmetic operations on all columns.

    • The result is silently coerced to a data frame. Do not rely on this behavior, it may become an error in a forthcoming version.

tbl <- tibble(a = 1:3, b = 4:6)
tbl * 2
##   a  b
## 1 2  8
## 2 4 10
## 3 6 12



Python

Data Type

float

  • 浮點數型態
1e-3
## 0.001
1E-3 # 大寫E也可以
## 0.001
type(1e3)
## <class 'float'>
import math
type(math.e ** 2 )
## <class 'float'>

int

  • 整數型態
x = 100
x
## 100
type(x)
## <class 'int'>

bool

  • True 與 False

  • 本質上為int的子類別,True為1,False為0

  • 擁有不同於int型態的列印(print)方式

True
## True
type(False)
## <class 'bool'>
float(True)
## 1.0
float(False)
## 0.0
int(True)
## 1
int(False)
## 0
True is 1
## False
True == 1 
## True

str

  • 為字元序列(sequence)型別。
  • 為不可變(immutable)的型別。
# help(str)
'This is a string'
## 'This is a string'
"This is a string"
## 'This is a string'
'''This is a string'''
## 'This is a string'
"""This is a string"""
## 'This is a string'
  • 反斜線(\)可用於『脫逸字元(escape character)』,可賦予特殊意義:

    • \n:代表換行

    • \t:代表Tab

    • \\:代表正常的反斜線\

    • \":代表”雙引號字元

x = "\tThis string starts with a \"tab\"." 
x
## '\tThis string starts with a "tab".'
print(x)
##  This string starts with a "tab".
x = "This string contains a single backslash(\\)."
x
## 'This string contains a single backslash(\\).'
print(x)
## This string contains a single backslash(\).
x = 'Can\'t get by without a backslash'
x
## "Can't get by without a backslash"
print(x)
## Can't get by without a backslash
x = "Can't get by without a backslash"
x
## "Can't get by without a backslash"
print(x)
## Can't get by without a backslash
x = "Backslash your \"character\" !"
x
## 'Backslash your "character" !'
print(x)
## Backslash your "character" !
x = 'You can leave the " alone'
x
## 'You can leave the " alone'
print(x)
## You can leave the " alone

Python提供『三引號』,可建立『跨行字串』,且字串中可包含『單引號'』與『雙引號"』

x = """Starting and ending a string with triple " characters
permits embedded newlines, and the use of " and ' without
backslashes"""
x
## 'Starting and ending a string with triple " characters\npermits embedded newlines, and the use of " and \' without\nbackslashes'
print(x)
## Starting and ending a string with triple " characters
## permits embedded newlines, and the use of " and ' without
## backslashes
# name = input("Name? ") # 從使用者鍵入值取得資料
# type(name)
# print(name)

字串(str)補充

  • 字串為不可變「字元」序列:字串為字元依照順序組成

  • 故可進行取值與切片

x = 'Goodbye\n!'   # \n:換行跳脫字元(escape character)
x
## 'Goodbye\n!'
len(x)
## 9
print(x)           # print函數自動在字串尾端添加換行跳脫字元 
## Goodbye
## !
z = 'a\n\tb'
z
## 'a\n\tb'
print(z)
## a
##  b
print("abc\n")    # 2次換行
## abc
print("abc\n", end='')    # 1次換行
## abc
x = "Hello" + "World"
x
## 'HelloWorld'
x = "Hello" "World"    # Python會將空白相隔的字串連接在一起
x
## 'HelloWorld'

split() 與 join():切割與連結字串

  • join()函數用於連結字串

  • +也可以用於連結字串,但+用於連結字串時,會建立新字串。故建立大量字串時,會產生很多無用的字串物件,程式碼效率會變差

" ".join(["join", "puts", "spaces", "between", "elements"])
## 'join puts spaces between elements'
"::".join(["Separated", "with", "colons"])
## 'Separated::with::colons'
"".join(["Separated", "by", "nothing"])
## 'Separatedbynothing'
  • split()會將字串分割為字串list,其預設以空白字元(whitespace)切割字串

  • 空白字元包含空格、換行、定位(Tab)等字元

x = "You\t\t can have tabs\t\n \t and newlines \n\n mixed in"
x.split()
## ['You', 'can', 'have', 'tabs', 'and', 'newlines', 'mixed', 'in']
x = "Mississippi"
x.split("ss")
## ['Mi', 'i', 'ippi']
x = 'a b c d'
x.split(' ', 1)
## ['a', 'b c d']
x.split(' ', 2)
## ['a', 'b', 'c d']
x.split(' ', 100)
## ['a', 'b', 'c', 'd']

轉換字串為數字

float('123.456')

# float('xxyy') 
# ValueError: could not convert string to float: 'xxyy'
## 123.456
int('3333')

# int('123.456')
# ValueError: invalid literal for int() with base 10: '123.456'
## 3333
int('10000', 8)
## 4096
int('101', 2)
## 5
int('ff', 16)

# int('123456', 6)
# ValueError: invalid literal for int() with base 6: '123456'
## 255

strip()、lstrip()、rstrip():移除開頭或結尾處的多餘空白

x = "  Hello,    World\t\t "
x.strip()
## 'Hello,    World'
x.lstrip()            # 移除左邊的空白
## 'Hello,    World\t\t '
'Hello,    World\t\t '
## 'Hello,    World\t\t '
x.rstrip()            # 移除右邊的空白
## '  Hello,    World'
import string
string.whitespace     # 查詢被Python視為空白的字元
## ' \t\n\r\x0b\x0c'
x = "www.python.org"
x.strip("w")                       
## '.python.org'
x.strip("gor")        # 移除開頭與結尾處的 g, o, r 字元(字串中間的不受影響)
## 'www.python.'
x.strip(".gorw")      # 移除開頭與結尾處的 ., g, o, r, w 字元
## 'python'
demo = " Demo  Example  "  # 移除所有的空白
demo.replace(" ", "") 
## 'DemoExample'

其他字串相關函數

x = "123"
x.isdigit() 
## True
x.isalpha() 
## False
x = "MM"
x.islower() 
## False
x.isupper() 
## True
  • 一些有用的字串常數
import string
string.ascii_letters
## 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
string.ascii_uppercase
## 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
string.ascii_lowercase
## 'abcdefghijklmnopqrstuvwxyz'
string.digits
## '0123456789'

字串的搜尋

  • in operator
x = "The string"
"str" in x
## True
"sTr" in x
## False
"e s" in x
## True
  • find(string, start, end)
x = "Mississippi"
x.find("ss")
## 2
x.find("zz")    # 找不到傳回-1
## -1
x = "Mississippi"
x.find("s")
## 2
x.find("s",2)    # 從索引2開始尋找
## 2
x.find("s",4)    # 從索引4開始尋找
## 5
x.find("s",4,5)  ## 從索引4開始尋找,並在索引5之前結束
## -1
x.find("ss", 3)
## 5
x.find("ss", 0, 3)
## -1
  • rfind():從字串的結尾向開頭進行搜尋,並傳回搜尋文字最後出現的索引位置
x = "Mississippi"
x.rfind("ss")
## 5
  • index() 與 rindex():分別與 find()、rfind() 相同,但 index()、rindex() 找不到文字時,不會回傳-1而是引發ValueError例外錯誤
x = "Mississippi"
# x.index("zz")
# ValueError: substring not found
  • count()
x = "Mississippi"
x.count("ss")
## 2
x.count("s")
## 4
  • startswith() 與 endswith()
x = "Mississippi"
x.startswith("Miss")
## True
x.startswith("Mist")
## False
x.endswith("pi")
## True
x.endswith("p")
## False

字串的修改

  • 雖然字串為不可變資料型態,但字串物件仍提供幾個method可對該字串進行操作,並回傳一個修改後的新字串

  • replace()

x = "Mississippi"
x.replace("ss", "+++")
## 'Mi+++i+++ippi'
  • maketrans() 與 translate()
x = "~x ^ (y % z)"
table = x.maketrans("~^()", "!&[]")   # 組成一個字元參照表
type(table)                           # 參照表的型別為dict
## <class 'dict'>
table                                 # 字元參照表
## {126: 33, 94: 38, 40: 91, 41: 93}
x.translate(table)                    # 按照參照表table來替換字元
## '!x & [y % z]'
  • 透過list來修改字串
text = "Hello, World"
wordList = list(text)
wordList[6:] = []       
wordList.reverse()
text = "".join(wordList)
print(text)      
## ,olleH

透過 repr() 與 str() 將物件轉換為字串表示

  • repr()傳回的字串是給Python程式讀取(formal string representation),可透過此字串重建原始物件

  • str()傳回的字串是給人看的(informal string representation),可讀性比較高

  • 許多情況之下,repr() 與 str() 傳回的內容並無不同

repr([1,2,3,4])
## '[1, 2, 3, 4]'
x = [1]
x.append(2)
x
## [1, 2]
x.append([3, 4])
x
## [1, 2, [3, 4]]
"The list x is " + repr(x)
## 'The list x is [1, 2, [3, 4]]'
repr(len)
## '<built-in function len>'
repr(list)
## "<class 'list'>"
from datetime import datetime
now = datetime.now()
str(now)
## '2022-04-25 16:37:08.496266'
print(now)
## 2022-04-25 16:37:08.496266
repr(now)
## 'datetime.datetime(2022, 4, 25, 16, 37, 8, 496266)'

使用%格式化字串

  • 舊式做法

  • 常用的格式化規範

    字串 意義
    %s 字串
    %c 字元
    %b 二進位(僅適用於format()與f-string的格式規範b;字串的%運算子不支援%b)
    %d 十進位
    %x 十六進位
    %f 浮點數
    %e 指數
    不指名 與d相同
errno = 12345
name = "Bob"

"Hello, %s" % name   # %s 告訴Python此處要替換成字串,他會到第二個 % 運算子後面找到變數
## 'Hello, Bob'
"%x" % errno         # %x 把整數轉換成字串並以16進位數字表示
## '3039'
"Hi, %s, 錯誤:0x%x 發生了" % (name, errno)
## 'Hi, Bob, 錯誤:0x3039 發生了'
"Hi, %(Name)s, 錯誤:0x%(errNo)x 發生了" % {"errNo":errno, "Name":name} 
## 'Hi, Bob, 錯誤:0x3039 發生了'

使用format()格式化字串

  • 使用位置參數
"{} is the {} of {}".format("Ambrosia", "food", "the gods")
## 'Ambrosia is the food of the gods'
"{{Ambrosia}} is the {} of {}".format("food", "the gods")
# 若格式化字串內需顯示{與}字元,則需重複寫兩次{{與}}
## '{Ambrosia} is the food of the gods'
"{} + {} = {}".format(1, 2, 1+2)
## '1 + 2 = 3'
x = [1, 2, "three"]
"The {} contains: {}".format("list", x)
## "The list contains: [1, 2, 'three']"
  • 使用編號參數
"{2} is the {0} of {1}".format("food", "the gods", "Ambrosia")
## 'Ambrosia is the food of the gods'
'{0}{1}{0}'.format('abc', 'def')
## 'abcdefabc'
  • 使用具名參數(named parameter)
"{food} is the food of {user}".format(food="Ambrosia", user="the gods") 
## 'Ambrosia is the food of the gods'
"{0} is the food of {user[1]}".format("Ambrosia", user=["men", "the gods", "others"]) 


# "{0} is the food of {user}".format(user="the gods", "Ambrosia")
# SyntaxError: non-keyword arg after keyword arg
## 'Ambrosia is the food of the gods'
import math
"{}為 {}".format("圓周率", math.pi)
## '圓周率為 3.141592653589793'
"{:10s}為 {:10.5f}".format("圓周率", math.pi)
# 寬度為10, 類型為字串,字串預設靠左
# 寬度為10, 取5位小數點, 類型為浮點數(f), 數字預設靠右
## '圓周率       為    3.14159'
"{:>10s}為 {:+10.5f}".format("圓周率", math.pi)
# 寬度為10, 類型為字串,字串靠右
# 寬度為10, 取5位小數點, 類型為浮點數(f), 數字預設靠右, 並顯示+-號
## '       圓周率為   +3.14159'
"{0:*<10s}為 {1:#> 10.5f}".format("圓周率", math.pi)
# 寬度為10, 靠左(<), 多餘空格用*字元填滿, 並以編號0來取得format()內第0個參數
# 寬度為10, 取5位小數點, 類型為浮點數(f), 空格代表若為正數就留空格,若為負數則加負號, 剩下空格用#字元填滿, 並用編號1來取得format()內第1個參數
## '圓周率*******為 ## 3.14159'
"{0:*<10s}為 {1:#>+10.5f}".format("圓周率", math.pi)
## '圓周率*******為 ##+3.14159'
"{name:P^10s}為 {value:=+10.2f}".format(name = "圓周率", value = math.pi)
# 寬度為10, 置中(^), 多餘空格用P字元填滿, 並以名稱name來取得format()內參數
# 寬度為10, 取2位小數點, 類型為浮點數(f), +號代表強制加上+-號, =號代表把正負號放到最左邊
## 'PPP圓周率PPPP為 +     3.14'

f-string(Python 3.6+)

  • 可直接將Python運算式嵌入字串中

  • 速度較快

name = "Bob"
f'你好, {name}!'
## '你好, Bob!'
a = 5
b = 10
name = "Peter"
No = 100
f'5 加 10 等於 {a + b}, 而非 {2 * (a + b)}.'
## '5 加 10 等於 15, 而非 30.'
f'Hi~ {name:s}, 有錯誤 0x{errno:x} 發生了!'
# :s代表字串
# :x代表十六進位數
## 'Hi~ Peter, 有錯誤 0x3039 發生了!'
f'Hello, {name:s}'
## 'Hello, Peter'
f'Hello, {name=}'   # 變數名稱後面加『=』, 就會把變數名稱與內容一起印出來
## "Hello, name='Peter'"
f'Hello, {No=}'     # 變數名稱後面加『=』, 就會把變數名稱與內容一起印出來
## 'Hello, No=100'

complex

import math
import cmath
1 + 1j
## (1+1j)
1 - 2J # 大寫J也可以

# NOT RUN
# math.sqrt(-1) 
## (1-2j)
cmath.sqrt(-1 + 0j)
## 1j
cmath.sqrt(-1)
## 1j

None

  • 特殊的基本型態,代表『不存在』或是『空值』的意義(類似R中的 NULL )。

  • None在Python亦代表 佔位符號(place holder),用來表示資料中某一個欄位目前尚未得知具體之值,先保留該位置,之後再填值(類似R中 NA 的功能)。

type(None)
## <class 'NoneType'>
None == False
## False
None == 0
## False
None == None # None只會等於自己
## True
False == 0  

# 與R比較 (Not Run):
# len(None)
# TypeError: object of type 'NoneType' has no len()
## True

Inf

float('Inf')
## inf
float('inf')
## inf
float('INF')
## inf
import math
math.inf
## inf
math.isinf(float("-inf")) # OUTPUT:True. Return True if x is a positive or negative infinity, and False otherwise.
## True
math.isinf(float("inf"))  # OUTPUT:True
# by comparing to infinity
## True
float("inf") == float("inf") # OUTPUT:True
## True
float("-inf") == float("-inf") # OUTPUT:True
## True
float("inf") == float("-inf")
## False

NaN

import math
math.nan
## nan
math.isnan(math.nan)
## True
import numpy as np
np.nan
## nan
type(np.nan)
## <class 'float'>
x = np.array([1, 2, 3, np.nan, math.nan])
x
## array([ 1.,  2.,  3., nan, nan])
np.isnan(x)
## array([False, False, False,  True,  True])

list

  • Python中的list是由『有序(ordered)』的元素(element)構成。
  • 與R的list相同,皆可包含不同型別的元素在裡頭(異質性)。
  • 為『可變(mutable)』的資料型態,支持『原地修改(modify in place)』
  • list長度可變,可嵌套(nested)
  • []建立list
  • 可視為『物件參考的陣列(array of object references)』

list基本操作

l0 = [] # 空list
l1 = [True, 1, 1.0, '1', ['a', 'b', 'c']]
l1
## [True, 1, 1.0, '1', ['a', 'b', 'c']]
type(l1)
## <class 'list'>
len(l1)
## 5
list('NCCU')
## ['N', 'C', 'C', 'U']
list(range(-4, 4))
## [-4, -3, -2, -1, 0, 1, 2, 3]
l2 = [1, 2, [3, 4]]
l3 = [5, 6, [7, 8]]
l2 + l3                                        # list的拼接
## [1, 2, [3, 4], 5, 6, [7, 8]]
l2 * 3                                         # list的重複
## [1, 2, [3, 4], 1, 2, [3, 4], 1, 2, [3, 4]]
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
matrix
## [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
matrix[1]
## [4, 5, 6]
matrix[1][1]
## 5

索引(index)

  • Python索引從0開始,並採[n]取值
x = ["first", "second", "third", "fourth"]
x[0]
## 'first'
x[2]
## 'third'
  • 索引值如為負數,則代表從list尾端開始計數
x[-1]  # 取最後一個位置
## 'fourth'
x[-2]  # 取得倒數第二個位置
## 'third'

list slicing

  • 提取多個元素時採切片(slicing)

  • [index1:index2]:代表指定從index1至index2之間(不包括index2)的元素

L = ['NCCU', 'MoneyBanking', 'QF']
L[1] = 'MONEY_BANKING'
L
## ['NCCU', 'MONEY_BANKING', 'QF']
L[0:2] = ['School', 'Department']
L
## ['School', 'Department', 'QF']
L = [1, 2, 3]
L
## [1, 2, 3]
L[1:2] = [8, 9]  # 長度可變:insertion
L
## [1, 8, 9, 3]
L[1:1] = [6, 7]  # 長度可變:insertion, replace nothing
L
## [1, 6, 7, 8, 9, 3]
L[1:3] = []
L
## [1, 8, 9, 3]
# R code:
# L <- list(1, 2, 3)   
# L[1] <- list(8, 9)  
# 被替換的項目不是替換值長度的倍數
  • 可省略index1或index2

  • 如兩個索引都省略,則會複製整個list

  • 省略index1代表從list開頭取起;省略index2代表取到list結尾

  • 如第2個索引值在第1個索引值之前,則回傳empty list
L = ['NCCU', 'MoneyBanking', 'QF', 'students', 'class']
L[:2]   # 只️取前2個元素
## ['NCCU', 'MoneyBanking']
L[2:]   # 不取前2個元素
## ['QF', 'students', 'class']
x = L[:]
x
## ['NCCU', 'MoneyBanking', 'QF', 'students', 'class']
L[-1:2]
## []
  • 更改list的元素
x = [1, 2, 3]
x[1:2] = [8, 9]  # 長度可變:insertion
x
## [1, 8, 9, 3]
x = [1, 2, 3]
x[1:1] = [8, 9]  # 長度可變:insertion, replace nothing
x
## [1, 8, 9, 2, 3]
x = [1, 2, 3, 4]
x[len(x):] = [5, 6, 7] 
x
## [1, 2, 3, 4, 5, 6, 7]
x[:0] = [-1, 0]
x
## [-1, 0, 1, 2, 3, 4, 5, 6, 7]
x[1:-1] = []   # 移除list內多個元素
x
## [-1, 7]

list更多的操作

L = ['eat', 'more', 'SPAM']
L.append('please')
L
## ['eat', 'more', 'SPAM', 'please']
L.sort()
L
## ['SPAM', 'eat', 'more', 'please']
L.append(['Chen'])           # 請與L.extend() method 比較
L
# L.sort()
# TypeError: '<' not supported between instances of 'list' and 'str'
# 
# Detailed traceback: 
#   File "<string>", line 1, in <module>
## ['SPAM', 'eat', 'more', 'please', ['Chen']]
L = ['abc', 'ABD', 'aBe']
L.sort()
L
## ['ABD', 'aBe', 'abc']
L = ['abc', 'ABD', 'aBe']
L.sort(key = str.lower)
L
## ['abc', 'ABD', 'aBe']
L = ['abc', 'ABD', 'aBe']
L.sort(key = str.lower, reverse = True)
L
## ['aBe', 'ABD', 'abc']
def compare_num_of_chars(string1):
    """Return the number of characters in ``string1``.

    Intended as a ``key`` function for ``list.sort`` / ``sorted`` so that
    strings are ordered by length instead of lexicographically.
    """
    return len(string1)
word_list = ['Python', 'is', 'better', 'than', 'C']
word_list.sort()
print(word_list)
## ['C', 'Python', 'better', 'is', 'than']
word_list = ['Python', 'is', 'better', 'than', 'C']
word_list.sort(key=compare_num_of_chars)
print(word_list)
## ['C', 'is', 'than', 'Python', 'better']
L = ['spam', 'eggs', 'ham']
L.index('eggs')

# L.index('egg')
# ValueError: 'egg' is not in list
# 
# Detailed traceback: 
#   File "<string>", line 1, in <module>
## 1
L.insert(1, 'toast')
L
## ['spam', 'toast', 'eggs', 'ham']
L.remove('eggs')
L
## ['spam', 'toast', 'ham']
L.pop(1)                              # delete by position
## 'toast'
L
## ['spam', 'ham']
['1', '2', '1', '1', '3'].count('1')  # number of occurences
## 3
L = ['spam', 'eggs', 'ham', 'toast']
del L[0]
L
## ['eggs', 'ham', 'toast']
del L[1:3]
L
## ['eggs']

tuple

  • 與list類似,但tuple只能被建立而不能被修改(不可變,immutable),故可當dict的鍵值(key)

  • 與list類似,在Python中被視為有順序(ordered)的『序列(sequence)』

  • 其中,str、list、tuple在Python中皆視為有順序的『序列(sequence)』

  • 也可採[]進行取值與切片

  • +亦可使用

import random
t1 = (1, 2, 2, random.gauss(10, 2))
t1
## (1, 2, 2, 10.798491592666888)
type(t1)
## <class 'tuple'>
t2 = 2, 4, 8, 1
t2
## (2, 4, 8, 1)
type(t2)
## <class 'tuple'>
max(t2)
## 8
t3 = 3,
type(t3)
## <class 'tuple'>
3 in [3, 4, 7, 9, 1]
## True
one, two, three, four = 1, 2, 3, 4  # 自動打包(packing)後自動解包(unpacking),同時指定4個變數值
one
## 1
two
## 2
  • 自動打包與自動解包不只適用於tuple,只要是序列型別都適用。
v1, v2, v3 = [1, 2, 3]
v1
## 1
w1, w2, w3 = 'abc'
w2

# q1, q2 = 'ABC' # 多重指定變數值時,兩邊數量要一樣多
# ValueError: too many values to unpack (expected 2)
# 
# Detailed traceback: 
#   File "<string>", line 1, in <module>
## 'b'
  • *號版本自動解包

  • 以*標記的變數會將所有多餘的項目當作list來接收

a, b, *c = (1, 2, 3, 4)
a
## 1
b
## 2
c
## [3, 4]
a, *b, c = (1, 2, 3, 4)
a
## 1
b
## [2, 3]
c
## 4
*a, b, c = (1, 2, 3, 4)
a
## [1, 2]
b
## 3
c
## 4
a,b,c
## ([1, 2], 3, 4)
a, b, c, d, *e = (1, 2, 3, 4)
a
## 1
b
## 2
c
## 3
e
## []
x = [1, 2, 3, 4, 5]
a, b, *_ = x
a
## 1
b
## 2
_
## [3, 4, 5]
  • 可使用list()函數將任何序列型資料轉為list。而tuple()函數則轉為tuple
list((1, 2, 3, 4))
## [1, 2, 3, 4]
tuple([1, 2, 3, 4])
## (1, 2, 3, 4)
list('NCCU')
## ['N', 'C', 'C', 'U']
tuple('Money and Banking')
## ('M', 'o', 'n', 'e', 'y', ' ', 'a', 'n', 'd', ' ', 'B', 'a', 'n', 'k', 'i', 'n', 'g')
  • 用於交換變數時相當方便
a = 100
b = 200

temp = a
a = b
b = temp
a, b
## (200, 100)
a = 100
b = 200

a, b = b, a
a, b
## (200, 100)
  • 可使用list()函數將任何序列型資料轉為list。而tuple()函數則轉為tuple。
list((1, 2, 3, 4))
## [1, 2, 3, 4]
tuple([1, 2, 3, 4])
## (1, 2, 3, 4)
list('NCCU')
## ['N', 'C', 'C', 'U']
tuple('Money and Banking')
## ('M', 'o', 'n', 'e', 'y', ' ', 'a', 'n', 'd', ' ', 'B', 'a', 'n', 'k', 'i', 'n', 'g')

set

  • set代表無順序的資料所構成(list與tuple之元素則有順序意義)

  • set中重複的資料會被自動刪除不計,可保持元素唯一性

  • 值得注意的是,set中的元素須為『不可變的資料(immutable)』,故int、float、str與tuple可作為set的元素。而list、dict與set本身則不行。

  • 當任務的重點為判斷一群資料是否包含某個物件,而不重視順序時,則可使用set型別

  • set中的元素沒有順序,故無法使用索引[n]或切片來存取,且+與*也無法使用

x = {1, 2, 1, 3, 3, 1, 2, 4}
x
## {1, 2, 3, 4}
type(x)
## <class 'set'>
x = set([1, 2, 1, 3, 3, 1, 2, 4])
x
## {1, 2, 3, 4}
type(x)
## <class 'set'>
x.add(6)
x
## {1, 2, 3, 4, 6}
x.remove(2)
x
## {1, 3, 4, 6}
3 in x
## True
5 in x
## False
x = {1, 2, 1, 2, 1, 2}
y = set([1, 7, 7, 8, 9])
x
## {1, 2}
y
## {8, 1, 9, 7}
x | y  # 聯集
## {1, 2, 7, 8, 9}
x & y  # 交集
## {1}
x ^ y  # Symmetric Difference (XOR): 只屬於其中一個集合,且不屬於另一個集合之元素所形成的集合
## {2, 7, 8, 9}
x - y  # 差集
## {2}
x1 = {'foo', 'bar', 'baz'}
x1.issubset({'foo', 'bar', 'baz', 'qux', 'quux'})
## True
x1 <= {'foo', 'bar', 'baz', 'qux', 'quux'}
## True
x2 = {'baz', 'qux', 'quux'}
x1 <= x2    
## False
v = {"a", "e", "i", "o", "u"}
v.add("x")
v
## {'e', 'x', 'o', 'a', 'u', 'i'}
v.discard("z")     # 與v.remove()不同,當欲移除之元素不存在時,則不會出現錯誤訊息
v
## {'e', 'x', 'o', 'a', 'u', 'i'}
letters = set("alice")
letters
## {'e', 'l', 'i', 'c', 'a'}
letters.intersection(v)
## {'e', 'a', 'i'}
letters.union(v)
## {'e', 'l', 'i', 'o', 'x', 'c', 'a', 'u'}
letters.difference(v)
## {'c', 'l'}
letters.symmetric_difference(v)
## {'x', 'c', 'o', 'u', 'l'}
s = {"a", "e"}
s.issubset(letters)
## True
letters.issuperset(s)
## True
letters.isdisjoint(s)
## False
  • set無法成為另一個set的元素(因set為可變的型別),故Python提供frozenset型別解決上述問題

  • frozenset為不可變型別

x = set([1, 2, 3, 1, 3, 5])
z = frozenset(x)
type(x)
## <class 'set'>
type(z)
## <class 'frozenset'>
# z.add(6)
# AttributeError: 'frozenset' object has no attribute 'add'
# 
# Detailed traceback: 
#   File "<string>", line 1, in <module>
x.add(z)
x
## {1, 2, 3, 5, frozenset({1, 2, 3, 5})}
len(x)
## 5

dict

  • 以鍵(key):值(value)的對應方式建立的資料結構。鍵可為 int、str、或其他Python不可變物件,且必須為唯一的
  • 但字典內的資料並不是依位置順序來索引(Python 3.7起雖會保留插入順序,仍須透過key取值)
  • 字典可以作為紀錄、搜尋表及其他 key、value 對應更為重要之用途使用。而字典索引是一種非常有效率的搜索方式
  • Python語言的底層很多地方都以dict來實作

建立dict

ages = {'Mary':13, 'John': 14, 'Tony':13}
type(ages)
## <class 'dict'>
ages
## {'Mary': 13, 'John': 14, 'Tony': 13}
'Mary' in ages
## True
x = {}
x
## {}
type(x)
## <class 'dict'>
x[0] = 'NCCU'  # 此0是當作key,並非當作索引用
x[1] = 'Money and Banking'
x
## {0: 'NCCU', 1: 'Money and Banking'}
x[1]
## 'Money and Banking'
len(x)
## 2
# empty list
y = []
type(y)
# y[0] = 'NCCU'  # 指定一個不存在的索引值0,值得注意的是:R允許這個操作
# IndexError: list assignment index out of range
# 
# Detailed traceback: 
#   File "<string>", line 1, in <module>
## <class 'list'>
l <- list()
class(l)
## [1] "list"
l[1] <- "NCCU"
l
## [[1]]
## [1] "NCCU"

dict的其他操作

english_to_french = {'red': 'rouge', 'blue': 'bleu', 'green': 'vert'}
len(english_to_french)
## 3
list(english_to_french.keys())
## ['red', 'blue', 'green']
list(english_to_french.values())
## ['rouge', 'bleu', 'vert']
list(english_to_french.items())
## [('red', 'rouge'), ('blue', 'bleu'), ('green', 'vert')]
del english_to_french['green']
list(english_to_french.items())
## [('red', 'rouge'), ('blue', 'bleu')]
'red' in english_to_french
## True
'orange' in english_to_french
## False
english_to_french.get('blue', 'No translation')
## 'bleu'
english_to_french.get('chartreuse', 'No translation')
## 'No translation'
english_to_french.setdefault('chartreuse', 'No translation') # 找不到該值時,會新增 鍵:值
## 'No translation'
x = {0: 'zero', 1: 'one'}
y = x.copy()
y
## {0: 'zero', 1: 'one'}
z = {1: 'One', 2: 'Two'}
x = {0: 'zero', 1: 'one'}
x.update(z)
x
## {0: 'zero', 1: 'One', 2: 'Two'}

Numpy

  • NumPy’s main object is the homogeneous multidimensional array.

  • It is a table of elements (usually numbers), all of the same type, indexed by a
    tuple of non-negative integers.

  • In NumPy dimensions are called axes.

  • Def: dimension為每一軸所含的元素個數。

    Ex: 2個軸,每個軸有3個元素(3維)。

  • NumPy’s array class is called ndarray. It is also known by the alias
    array.

import time
import numpy as np

def benchmark(n):
    """Time summing the integers 1..n with a Python list versus a NumPy array.

    Prints one elapsed-time line (in seconds) per approach and returns None.
    Only the summation itself is timed; building the list/array is excluded.
    """
    # using list: built-in sum() over a plain Python list
    a_list = list(range(1, n + 1))
    t1 = time.perf_counter()  # monotonic, high-resolution clock for short intervals
    sum(a_list)
    t2 = time.perf_counter()
    print('Time taken by Python is', t2 - t1)

    # using numpy: vectorized np.sum() over an ndarray
    a = np.arange(1, n + 1)
    t1 = time.perf_counter()
    np.sum(a)
    t2 = time.perf_counter()
    # Label this line as NumPy so the two timings can be told apart.
    print('Time taken by NumPy is', t2 - t1)

benchmark(1000000)
## Time taken by Python is 0.0062713623046875
## Time taken by NumPy is 0.0008161067962646484
import numpy as np
a = np.arange(15).reshape(3, 5)
a
## array([[ 0,  1,  2,  3,  4],
##        [ 5,  6,  7,  8,  9],
##        [10, 11, 12, 13, 14]])
a.shape
## (3, 5)
a.ndim
## 2
a.dtype.name
## 'int64'
a.itemsize
## 8
a.size
## 15
type(a)
## <class 'numpy.ndarray'>
b = np.array([6, 7, 8])
b
## array([6, 7, 8])
type(b)
## <class 'numpy.ndarray'>
# a = np.array(1, 2, 3, 4)    # WRONG
# TypeError: array() takes from 1 to 2 positional arguments but 4 were given
c = np.array([[1, 2], [3, 4]], dtype=complex)
a = np.array([[1, 2, 3], [10, 20, 30, 40]])
## <string>:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
a
## array([list([1, 2, 3]), list([10, 20, 30, 40])], dtype=object)
a.shape
## (2,)
np.zeros((3, 4))
## array([[0., 0., 0., 0.],
##        [0., 0., 0., 0.],
##        [0., 0., 0., 0.]])
np.ones((2, 3, 4), dtype=np.int16)
## array([[[1, 1, 1, 1],
##         [1, 1, 1, 1],
##         [1, 1, 1, 1]],
## 
##        [[1, 1, 1, 1],
##         [1, 1, 1, 1],
##         [1, 1, 1, 1]]], dtype=int16)
np.empty((2, 3))
## array([[1.39069238e-309, 1.39069238e-309, 1.39069238e-309],
##        [1.39069238e-309, 1.39069238e-309, 1.39069238e-309]])
from numpy import pi
np.linspace(0, 2, 9)     
## array([0.  , 0.25, 0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  ])
x = np.linspace(0, 2 * pi, 100)  
f = np.sin(x)
f
## array([ 0.00000000e+00,  6.34239197e-02,  1.26592454e-01,  1.89251244e-01,
##         2.51147987e-01,  3.12033446e-01,  3.71662456e-01,  4.29794912e-01,
##         4.86196736e-01,  5.40640817e-01,  5.92907929e-01,  6.42787610e-01,
##         6.90079011e-01,  7.34591709e-01,  7.76146464e-01,  8.14575952e-01,
##         8.49725430e-01,  8.81453363e-01,  9.09631995e-01,  9.34147860e-01,
##         9.54902241e-01,  9.71811568e-01,  9.84807753e-01,  9.93838464e-01,
##         9.98867339e-01,  9.99874128e-01,  9.96854776e-01,  9.89821442e-01,
##         9.78802446e-01,  9.63842159e-01,  9.45000819e-01,  9.22354294e-01,
##         8.95993774e-01,  8.66025404e-01,  8.32569855e-01,  7.95761841e-01,
##         7.55749574e-01,  7.12694171e-01,  6.66769001e-01,  6.18158986e-01,
##         5.67059864e-01,  5.13677392e-01,  4.58226522e-01,  4.00930535e-01,
##         3.42020143e-01,  2.81732557e-01,  2.20310533e-01,  1.58001396e-01,
##         9.50560433e-02,  3.17279335e-02, -3.17279335e-02, -9.50560433e-02,
##        -1.58001396e-01, -2.20310533e-01, -2.81732557e-01, -3.42020143e-01,
##        -4.00930535e-01, -4.58226522e-01, -5.13677392e-01, -5.67059864e-01,
##        -6.18158986e-01, -6.66769001e-01, -7.12694171e-01, -7.55749574e-01,
##        -7.95761841e-01, -8.32569855e-01, -8.66025404e-01, -8.95993774e-01,
##        -9.22354294e-01, -9.45000819e-01, -9.63842159e-01, -9.78802446e-01,
##        -9.89821442e-01, -9.96854776e-01, -9.99874128e-01, -9.98867339e-01,
##        -9.93838464e-01, -9.84807753e-01, -9.71811568e-01, -9.54902241e-01,
##        -9.34147860e-01, -9.09631995e-01, -8.81453363e-01, -8.49725430e-01,
##        -8.14575952e-01, -7.76146464e-01, -7.34591709e-01, -6.90079011e-01,
##        -6.42787610e-01, -5.92907929e-01, -5.40640817e-01, -4.86196736e-01,
##        -4.29794912e-01, -3.71662456e-01, -3.12033446e-01, -2.51147987e-01,
##        -1.89251244e-01, -1.26592454e-01, -6.34239197e-02, -2.44929360e-16])
a = np.arange(6)
np.reshape(a, newshape=(2, 3), order='C')
## array([[0, 1, 2],
##        [3, 4, 5]])
  • order:

    • C means to read/write the elements using C-like index order.

    • F means to read/write the elements using Fortran-like index order.

Basic Operation

  • Arithmetic operators on arrays apply elementwise. A new array is
    created and filled with the result.
a = np.array([20, 30, 40, 50])
b = np.arange(4)
b
## array([0, 1, 2, 3])
c = a - b
c
## array([20, 29, 38, 47])
b ** 2
## array([0, 1, 4, 9])
10 * np.sin(a)
## array([ 9.12945251, -9.88031624,  7.4511316 , -2.62374854])
a < 35
## array([ True,  True, False, False])
  • Unlike in many matrix languages, the product operator * operates elementwise in NumPy arrays.

  • The matrix product can be performed using the @ operator (in python >=3.5) or the dot function or method:

A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])
A * B     # elementwise product
## array([[2, 0],
##        [0, 4]])
A @ B     # matrix product
## array([[5, 4],
##        [3, 4]])
A.dot(B)  # another matrix product
## array([[5, 4],
##        [3, 4]])
  • Some operations, such as += and *=, act in place to modify an existing array rather than create a new one.
rg = np.random.default_rng(1)  # create instance of default random number generator
a = np.ones((2, 3), dtype=int)
b = rg.random((2, 3))
a *= 3
a
## array([[3, 3, 3],
##        [3, 3, 3]])
b += a
b

# a += b  # b is not automatically converted to integer type
# UFuncTypeError: Cannot cast ufunc 'add' output from dtype('float64') to dtype('int64') with casting rule 'same_kind'
## array([[3.51182162, 3.9504637 , 3.14415961],
##        [3.94864945, 3.31183145, 3.42332645]])
x = np.arange(6)
x = x.reshape((2, 3))
x
## array([[0, 1, 2],
##        [3, 4, 5]])
np.zeros_like(x)
## array([[0, 0, 0],
##        [0, 0, 0]])
np.ones_like(x)
## array([[1, 1, 1],
##        [1, 1, 1]])
np.empty_like(x)
## array([[1, 1, 1],
##        [1, 1, 1]])
np.full((2, 2), np.inf)
## array([[inf, inf],
##        [inf, inf]])
np.full((2, 2), 10)
## array([[10, 10],
##        [10, 10]])
np.full((2, 3), [10, 20, 30])
## array([[10, 20, 30],
##        [10, 20, 30]])
  • When operating with arrays of different types, the type of the resulting array corresponds to the more general or precise one (a behavior known as upcasting).
from numpy import pi
a = np.ones(3, dtype=np.int32)
b = np.linspace(0, pi, 3)
b.dtype.name
## 'float64'
c = a + b
c
## array([1.        , 2.57079633, 4.14159265])
c.dtype.name
## 'float64'
d = np.exp(c * 1j)
d
## array([ 0.54030231+0.84147098j, -0.84147098+0.54030231j,
##        -0.54030231-0.84147098j])
d.dtype.name
## 'complex128'
  • Many unary operations, such as computing the sum of all the elements in the array, are implemented as methods of the ndarray class.
a = rg.random((2, 3))
a
## array([[0.82770259, 0.40919914, 0.54959369],
##        [0.02755911, 0.75351311, 0.53814331]])
a.sum()
## 3.1057109529998157
a.min()
## 0.027559113243068367
a.max()
## 0.8277025938204418
b = np.arange(12).reshape(3, 4)
b
## array([[ 0,  1,  2,  3],
##        [ 4,  5,  6,  7],
##        [ 8,  9, 10, 11]])
b.sum(axis=0)  # sum of each column
## array([12, 15, 18, 21])
b.min(axis=1)     # min of each row
## array([0, 4, 8])
b.cumsum(axis=1)  # cumulative sum along each row
## array([[ 0,  1,  3,  6],
##        [ 4,  9, 15, 22],
##        [ 8, 17, 27, 38]])

Universal Functions

  • NumPy provides familiar mathematical functions such as sin, cos, and exp. In NumPy, these are called “universal functions” (ufunc).

  • Within NumPy, these functions operate elementwise on an array, producing an array as output.

B = np.arange(3)
B
## array([0, 1, 2])
np.exp(B)
## array([1.        , 2.71828183, 7.3890561 ])
np.sqrt(B)
## array([0.        , 1.        , 1.41421356])
C = np.array([2., -1., 4.])
np.add(B, C)
## array([2., 0., 6.])

Indexing, Slicing and Iterating

  • One-dimensional arrays can be indexed, sliced and iterated over, much like lists and other Python sequences.
a = np.arange(10)**3
a[2]
## 8
a[2:5]
## array([ 8, 27, 64])
a[:6:2] = 1000
a[::-1]  # reversed a
## array([ 729,  512,  343,  216,  125, 1000,   27, 1000,    1, 1000])
for i in a:
  print(i**(1 / 3.))
## 9.999999999999998
## 1.0
## 9.999999999999998
## 3.0
## 9.999999999999998
## 4.999999999999999
## 5.999999999999999
## 6.999999999999999
## 7.999999999999999
## 8.999999999999998
  • Multidimensional arrays can have one index per axis.

  • These indices are given in a tuple separated by commas:

def f(x, y):
  """Combine two indices into one number: ten times ``x`` plus ``y``.

  Works elementwise when ``x`` and ``y`` are NumPy index grids, which is how
  ``np.fromfunction`` calls it.
  """
  tens = x * 10
  return tens + y

b = np.fromfunction(f, (5, 4), dtype=int)
b
## array([[ 0,  1,  2,  3],
##        [10, 11, 12, 13],
##        [20, 21, 22, 23],
##        [30, 31, 32, 33],
##        [40, 41, 42, 43]])
b[2, 3]
## 23
b[0:5, 1]  # each row in the second column of b
## array([ 1, 11, 21, 31, 41])
b[:, 1]    # equivalent to the previous example
## array([ 1, 11, 21, 31, 41])
b[1:3, :]  # each column in the second and third row of b
## array([[10, 11, 12, 13],
##        [20, 21, 22, 23]])
b[-1]   # the last row. Equivalent to b[-1, :]
## array([40, 41, 42, 43])
  • The dots (...) represent as many colons as needed to produce a complete indexing tuple. For example, if x is an array with 5 axes, then

    • x[1, 2, ...] is equivalent to x[1, 2, :, :, :],

    • x[..., 3] to x[:, :, :, :, 3] and

    • x[4, ..., 5, :] to x[4, :, :, 5, :].

c = np.array([[[  0,  1,  2],  # a 3D array (two stacked 2D arrays)
               [ 10, 12, 13]],
             [[100, 101, 102],
              [110, 112, 113]]])
c.shape
## (2, 2, 3)
c[1, ...]  # same as c[1, :, :] or c[1]
## array([[100, 101, 102],
##        [110, 112, 113]])
c[..., 2]  # same as c[:, :, 2]
## array([[  2,  13],
##        [102, 113]])
  • Iterating over multidimensional arrays is done with respect to the first axis:
for row in b:
  print(row)
  
## [0 1 2 3]
## [10 11 12 13]
## [20 21 22 23]
## [30 31 32 33]
## [40 41 42 43]
for element in b.flat:
  print(element)
## 0
## 1
## 2
## 3
## 10
## 11
## 12
## 13
## 20
## 21
## 22
## 23
## 30
## 31
## 32
## 33
## 40
## 41
## 42
## 43
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
a < 5
## array([[ True,  True,  True,  True],
##        [False, False, False, False],
##        [False, False, False, False]])
b = np.nonzero(a < 5)
b
## (array([0, 0, 0, 0]), array([0, 1, 2, 3]))
list_of_coordinates= list(zip(b[0], b[1]))
for coord in list_of_coordinates:
  print(coord)
## (0, 0)
## (0, 1)
## (0, 2)
## (0, 3)

Shape Manipulation

  • Changing the shape of an array
a = np.floor(10 * rg.random((3, 4)))
a.ravel()  # returns the array, flattened
## array([3., 7., 3., 4., 1., 4., 2., 2., 7., 2., 4., 9.])
a.reshape(6, 2)  # returns the array with a modified shape
## array([[3., 7.],
##        [3., 4.],
##        [1., 4.],
##        [2., 2.],
##        [7., 2.],
##        [4., 9.]])
a.T  # returns the array, transposed
## array([[3., 1., 7.],
##        [7., 4., 2.],
##        [3., 2., 4.],
##        [4., 2., 9.]])
a.T.shape
## (4, 3)
a.shape
## (3, 4)
  • The order of the elements in the array resulting from ravel is normally “C-style”, that is, the rightmost index “changes the fastest”, so the element after a[0, 0] is a[0, 1].

  • If the array is reshaped to some other shape, again the array is treated as “C-style”.

  • NumPy normally creates arrays stored in this order, so ravel will usually not need to copy its argument, but if the array was made by taking slices of another array or created with unusual options, it may need to be copied.

  • The functions ravel and reshape can also be instructed, using an optional argument, to use FORTRAN-style arrays, in which the leftmost index changes the fastest.

a
## array([[3., 7., 3., 4.],
##        [1., 4., 2., 2.],
##        [7., 2., 4., 9.]])
a.resize((2, 6))
a.reshape(3, -1)
## array([[3., 7., 3., 4.],
##        [1., 4., 2., 2.],
##        [7., 2., 4., 9.]])

How to convert a 1D array into a 2D array (how to add a new axis to an array)

  • Using np.newaxis will increase the dimensions of your array by one dimension when used once. This means that a 1D array will become a 2D array, a 2D array will become a 3D array, and so on.
a = np.array([1, 2, 3, 4, 5, 6])
a.shape
## (6,)
a2 = a[np.newaxis, :]
a2.shape
## (1, 6)
col_vector = a[:, np.newaxis]
col_vector.shape
## (6, 1)
  • You can use np.expand_dims to add an axis at index position 1 with:
b = np.expand_dims(a, axis=1)
b.shape
## (6, 1)
c = np.expand_dims(a, axis=0)
c.shape
## (1, 6)

Stacking together different arrays

a = np.floor(10 * rg.random((2, 2)))
a
## array([[9., 7.],
##        [5., 2.]])
b = np.floor(10 * rg.random((2, 2)))
np.vstack((a, b))
## array([[9., 7.],
##        [5., 2.],
##        [1., 9.],
##        [5., 1.]])
np.hstack((a, b))
## array([[9., 7., 1., 9.],
##        [5., 2., 5., 1.]])
  • The function column_stack stacks 1D arrays as columns into a 2D array. It is equivalent to hstack only for 2D arrays:
from numpy import newaxis
np.column_stack((a, b))  # with 2D arrays
## array([[9., 7., 1., 9.],
##        [5., 2., 5., 1.]])
a = np.array([4., 2.])
b = np.array([3., 8.])
np.column_stack((a, b))  # returns a 2D array
## array([[4., 3.],
##        [2., 8.]])
np.hstack((a, b))        # the result is different
## array([4., 2., 3., 8.])
a[:, newaxis]  # view `a` as a 2D column vector
## array([[4.],
##        [2.]])
np.column_stack((a[:, newaxis], b[:, newaxis]))
## array([[4., 3.],
##        [2., 8.]])
np.hstack((a[:, newaxis], b[:, newaxis]))  # the result is the same
## array([[4., 3.],
##        [2., 8.]])
  • In complex cases, r_ and c_ are useful for creating arrays by stacking numbers along one axis. They allow the use of range literals:

  • When used with arrays as arguments, r_ and c_ are similar to vstack and hstack in their default behavior, but allow for an optional argument giving the number of the axis along which to concatenate.

np.r_[1:4, 0, 4]
## array([1, 2, 3, 0, 4])

How to get unique items and counts

a = np.array([11, 11, 12, 13, 14, 15, 16, 17, 12, 13, 11, 14, 18, 19, 20])
unique_values = np.unique(a)
unique_values
## array([11, 12, 13, 14, 15, 16, 17, 18, 19, 20])
unique_values, indices_list = np.unique(a, return_index=True)
unique_values
## array([11, 12, 13, 14, 15, 16, 17, 18, 19, 20])
unique_values, occurrence_count = np.unique(a, return_counts=True)
occurrence_count
## array([3, 2, 2, 2, 1, 1, 1, 1, 1, 1])
a_2d = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4]])
unique_values = np.unique(a_2d)
unique_values
## array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])
unique_rows = np.unique(a_2d, axis=0)
unique_rows
## array([[ 1,  2,  3,  4],
##        [ 5,  6,  7,  8],
##        [ 9, 10, 11, 12]])
unique_rows, indices, occurrence_count = np.unique(
  a_2d, axis=0, return_counts=True, return_index=True)

unique_rows
## array([[ 1,  2,  3,  4],
##        [ 5,  6,  7,  8],
##        [ 9, 10, 11, 12]])
indices
## array([0, 1, 2])
occurrence_count
## array([2, 1, 1])

Reshaping and flattening multidimensional arrays

  • There are two popular ways to flatten an array: .flatten() and .ravel().

  • The primary difference between the two is that the new array created using ravel() is actually a reference to the parent array (i.e., a “view”).

  • This means that any changes to the new array will affect the parent array as well.

  • Since ravel does not create a copy, it’s memory efficient.

x = np.array([[1 , 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
x.flatten()
## array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])
a1 = x.flatten()
a1[0] = 99
print(x)      # Original array
## [[ 1  2  3  4]
##  [ 5  6  7  8]
##  [ 9 10 11 12]]
print(a1)     # New array
## [99  2  3  4  5  6  7  8  9 10 11 12]
a2 = x.ravel()
a2[0] = 98
print(x)  # Original array
## [[98  2  3  4]
##  [ 5  6  7  8]
##  [ 9 10 11 12]]
print(a2)  # New array
## [98  2  3  4  5  6  7  8  9 10 11 12]

Splitting one array into several smaller ones

  • Using hsplit, you can split an array along its horizontal axis, either by specifying the number of equally shaped arrays to return, or by specifying the columns after which the division should occur:
a = np.floor(10 * rg.random((2, 12)))
a
# Split `a` into 3
## array([[6., 7., 6., 9., 0., 5., 4., 0., 6., 8., 5., 2.],
##        [8., 5., 5., 7., 1., 8., 6., 7., 1., 8., 1., 0.]])
np.hsplit(a, 3)
# Split `a` after the third and the fourth column
## [array([[6., 7., 6., 9.],
##        [8., 5., 5., 7.]]), array([[0., 5., 4., 0.],
##        [1., 8., 6., 7.]]), array([[6., 8., 5., 2.],
##        [1., 8., 1., 0.]])]
np.hsplit(a, (3, 4))
## [array([[6., 7., 6.],
##        [8., 5., 5.]]), array([[9.],
##        [7.]]), array([[0., 5., 4., 0., 6., 8., 5., 2.],
##        [1., 8., 6., 7., 1., 8., 1., 0.]])]
  • vsplit splits along the vertical axis, and array_split allows one to specify along which axis to split.

Deep Copy

  • The copy method makes a complete copy of the array and its data.
d = a.copy()
d is a
## False
d.base is a  # d doesn't share anything with a
## False
d[0, 0] = 9999
a
## array([[6., 7., 6., 9., 0., 5., 4., 0., 6., 8., 5., 2.],
##        [8., 5., 5., 7., 1., 8., 6., 7., 1., 8., 1., 0.]])
  • Sometimes copy should be called after slicing if the original array is not required anymore.

  • For example, suppose a is a huge intermediate result and the final result b only contains a small fraction of a, a deep copy should be made when constructing b with slicing:

a = np.arange(int(1e8))
b = a[:100].copy()
del a  # the memory of ``a`` can be released.

Broadcasting

  • The term broadcasting describes how NumPy treats arrays with different shapes during arithmetic operations.

  • Subject to certain constraints, the smaller array is “broadcast” across the larger array so that they have compatible shapes.

  • Broadcasting provides a means of vectorizing array operations so that looping occurs in C instead of Python.

  • It does this without making needless copies of data and usually leads to efficient algorithm implementations.

  • There are, however, cases where broadcasting is a bad idea because it leads to inefficient use of memory that slows computation.

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 2.0, 2.0])
a * b
## array([2., 4., 6.])
a = np.array([1.0, 2.0, 3.0])
b = 2.0
a * b
## array([2., 4., 6.])

General Broadcasting Rules

  • When operating on two arrays, NumPy compares their shapes element-wise.

  • It starts with the trailing (i.e. rightmost) dimensions and works its way left. Two dimensions are compatible when

    1. they are equal, or

    2. one of them is 1

  • If these conditions are not met, a ValueError: operands could not be broadcast together exception is thrown, indicating that the arrays have incompatible shapes.

  • The size of the resulting array is the size that is not 1 along each axis of the inputs.

from numpy import array
a = array([[ 0.0,  0.0,  0.0],
           [10.0, 10.0, 10.0],
           [20.0, 20.0, 20.0],
           [30.0, 30.0, 30.0]])
b = array([1.0, 2.0, 3.0])
a + b
## array([[ 1.,  2.,  3.],
##        [11., 12., 13.],
##        [21., 22., 23.],
##        [31., 32., 33.]])
b = array([1.0, 2.0, 3.0, 4.0])
# a + b
# ValueError: operands could not be broadcast together with shapes (4,3) (4,) 


Pandas

  • pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive.

  • It aims to be the fundamental high-level building block for doing practical, real-world data analysis in Python. pandas is well suited for many different kinds of data:

  • Tabular data with heterogeneously-typed columns, as in an SQL table or Excel spreadsheet

  • Ordered and unordered (not necessarily fixed-frequency) time series data.

  • Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels

  • Any other form of observational / statistical data sets. The data need not be labeled at all to be placed into a pandas data structure

import pandas as pd

# Small passenger table: each dict key becomes a column and each list holds
# that column's values, one entry per row.
df = pd.DataFrame(
    {
        "Name": [
            "Braund, Mr. Owen Harris",
            "Allen, Mr. William Henry",
            "Bonnell, Miss. Elizabeth",
        ],
        "Age": [22, 35, 58],
        "Sex": ["male", "male", "female"],
    }
)
df
##                        Name  Age     Sex
## 0   Braund, Mr. Owen Harris   22    male
## 1  Allen, Mr. William Henry   35    male
## 2  Bonnell, Miss. Elizabeth   58  female
# Mixed-type frame: the scalar entries ("A", "B", "F") are repeated for every
# row, while "C", "D" and "E" each supply four values of their own.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2
##      A          B    C  D      E    F
## 0  1.0 2013-01-02  1.0  3   test  foo
## 1  1.0 2013-01-02  1.0  3  train  foo
## 2  1.0 2013-01-02  1.0  3   test  foo
## 3  1.0 2013-01-02  1.0  3  train  foo

Each column in a DataFrame is a Series

df["Age"]
## 0    22
## 1    35
## 2    58
## Name: Age, dtype: int64
type(df["Age"])
## <class 'pandas.core.series.Series'>
ages = pd.Series([22, 35, 58], name="Age")
ages
## 0    22
## 1    35
## 2    58
## Name: Age, dtype: int64

Basic Operation

df["Age"].max()
## 58
ages.max()
## 58
df.describe()
##              Age
## count   3.000000
## mean   38.333333
## std    18.230012
## min    22.000000
## 25%    28.500000
## 50%    35.000000
## 75%    46.500000
## max    58.000000

Data Import and Export

import pandas as pd
titanic = pd.read_csv("titanic.csv")
titanic
##      PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
## 0              1         0       3  ...   7.2500   NaN         S
## 1              2         1       1  ...  71.2833   C85         C
## 2              3         1       3  ...   7.9250   NaN         S
## 3              4         1       1  ...  53.1000  C123         S
## 4              5         0       3  ...   8.0500   NaN         S
## ..           ...       ...     ...  ...      ...   ...       ...
## 886          887         0       2  ...  13.0000   NaN         S
## 887          888         1       1  ...  30.0000   B42         S
## 888          889         0       3  ...  23.4500   NaN         S
## 889          890         1       1  ...  30.0000  C148         C
## 890          891         0       3  ...   7.7500   NaN         Q
## 
## [891 rows x 12 columns]
titanic.head(10)
##    PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
## 0            1         0       3  ...   7.2500   NaN         S
## 1            2         1       1  ...  71.2833   C85         C
## 2            3         1       3  ...   7.9250   NaN         S
## 3            4         1       1  ...  53.1000  C123         S
## 4            5         0       3  ...   8.0500   NaN         S
## 5            6         0       3  ...   8.4583   NaN         Q
## 6            7         0       1  ...  51.8625   E46         S
## 7            8         0       3  ...  21.0750   NaN         S
## 8            9         1       3  ...  11.1333   NaN         S
## 9           10         1       2  ...  30.0708   NaN         C
## 
## [10 rows x 12 columns]
titanic.dtypes
## PassengerId      int64
## Survived         int64
## Pclass           int64
## Name            object
## Sex             object
## Age            float64
## SibSp            int64
## Parch            int64
## Ticket          object
## Fare           float64
## Cabin           object
## Embarked        object
## dtype: object
titanic.to_excel("titanic_test.xlsx", sheet_name="passengers", index=False)
titanic_test = pd.read_excel("titanic_test.xlsx", sheet_name="passengers")
titanic_test
##      PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
## 0              1         0       3  ...   7.2500   NaN         S
## 1              2         1       1  ...  71.2833   C85         C
## 2              3         1       3  ...   7.9250   NaN         S
## 3              4         1       1  ...  53.1000  C123         S
## 4              5         0       3  ...   8.0500   NaN         S
## ..           ...       ...     ...  ...      ...   ...       ...
## 886          887         0       2  ...  13.0000   NaN         S
## 887          888         1       1  ...  30.0000   B42         S
## 888          889         0       3  ...  23.4500   NaN         S
## 889          890         1       1  ...  30.0000  C148         C
## 890          891         0       3  ...   7.7500   NaN         Q
## 
## [891 rows x 12 columns]
titanic.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 891 entries, 0 to 890
## Data columns (total 12 columns):
##  #   Column       Non-Null Count  Dtype  
## ---  ------       --------------  -----  
##  0   PassengerId  891 non-null    int64  
##  1   Survived     891 non-null    int64  
##  2   Pclass       891 non-null    int64  
##  3   Name         891 non-null    object 
##  4   Sex          891 non-null    object 
##  5   Age          714 non-null    float64
##  6   SibSp        891 non-null    int64  
##  7   Parch        891 non-null    int64  
##  8   Ticket       891 non-null    object 
##  9   Fare         891 non-null    float64
##  10  Cabin        204 non-null    object 
##  11  Embarked     889 non-null    object 
## dtypes: float64(2), int64(5), object(5)
## memory usage: 83.7+ KB
  • It is indeed a DataFrame.

  • There are 891 entries, i.e. 891 rows.

  • Each row has a row label (aka the index) with values ranging from 0 to 890.

  • The table has 12 columns. Most columns have a value for each of the rows (all 891 values are non-null). Some columns do have missing values and fewer than 891 non-null values.

  • The columns Name, Sex, Ticket, Cabin and Embarked consist of textual data (strings, aka object). The other columns are numerical data with some of them whole numbers (aka integer) and others are real numbers (aka float).

  • The kind of data (characters, integers,…) in the different columns are summarized by listing the dtypes.

  • The approximate amount of RAM used to hold the DataFrame is provided as well.

Subsetting

ages = titanic["Age"]
titanic["Age"].shape
## (891,)
age_sex = titanic[["Age", "Sex"]]
above_35 = titanic[titanic["Age"] > 35]
above_35.head()
##     PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
## 1             2         1       1  ...  71.2833   C85         C
## 6             7         0       1  ...  51.8625   E46         S
## 11           12         1       1  ...  26.5500  C103         S
## 13           14         0       3  ...  31.2750   NaN         S
## 15           16         1       2  ...  16.0000   NaN         S
## 
## [5 rows x 12 columns]
titanic["Age"] > 35
## 0      False
## 1       True
## 2      False
## 3      False
## 4      False
##        ...  
## 886    False
## 887    False
## 888    False
## 889    False
## 890    False
## Name: Age, Length: 891, dtype: bool
class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]
age_no_na = titanic[titanic["Age"].notna()]
adult_names = titanic.loc[titanic["Age"] > 35, "Name"]
titanic.iloc[9:25, 2:5]
##     Pclass                                               Name     Sex
## 9        2                Nasser, Mrs. Nicholas (Adele Achem)  female
## 10       3                    Sandstrom, Miss. Marguerite Rut  female
## 11       1                           Bonnell, Miss. Elizabeth  female
## 12       3                     Saundercock, Mr. William Henry    male
## 13       3                        Andersson, Mr. Anders Johan    male
## 14       3               Vestrom, Miss. Hulda Amanda Adolfina  female
## 15       2                   Hewlett, Mrs. (Mary D Kingcome)   female
## 16       3                               Rice, Master. Eugene    male
## 17       2                       Williams, Mr. Charles Eugene    male
## 18       3  Vander Planke, Mrs. Julius (Emelia Maria Vande...  female
## 19       3                            Masselmani, Mrs. Fatima  female
## 20       2                               Fynney, Mr. Joseph J    male
## 21       2                              Beesley, Mr. Lawrence    male
## 22       3                        McGowan, Miss. Anna "Annie"  female
## 23       1                       Sloper, Mr. William Thompson    male
## 24       3                      Palsson, Miss. Torborg Danira  female
titanic.iloc[0:3, 3] = "anonymous"
  • .loc is primarily label based, but may also be used with a boolean array. .loc will raise KeyError when the items are not found.

  • .iloc is primarily integer position based (from 0 to length-1 of the axis), but may also be used with a boolean array. .iloc will raise IndexError if a requested indexer is out-of-bounds, except slice indexers which allow out-of-bounds indexing.

dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4),
    index=dates, columns=['A', 'B', 'C', 'D'])
df
##                    A         B         C         D
## 2000-01-01 -1.717171  3.497959 -1.933236  0.689425
## 2000-01-02 -1.554594  1.474722  0.835386  0.125756
## 2000-01-03 -0.033432  0.800902 -0.338156 -0.708472
## 2000-01-04 -0.159002 -0.066103 -0.311853 -1.419839
## 2000-01-05 -1.870295  1.065438  0.261995 -0.789540
## 2000-01-06  1.140602  2.622254  0.998559  0.618193
## 2000-01-07  1.317243 -0.176217 -0.517474 -1.351632
## 2000-01-08 -0.205541 -0.588750 -1.758490 -0.198265
df[['B', 'A']] = df[['A', 'B']]
df
##                    A         B         C         D
## 2000-01-01  3.497959 -1.717171 -1.933236  0.689425
## 2000-01-02  1.474722 -1.554594  0.835386  0.125756
## 2000-01-03  0.800902 -0.033432 -0.338156 -0.708472
## 2000-01-04 -0.066103 -0.159002 -0.311853 -1.419839
## 2000-01-05  1.065438 -1.870295  0.261995 -0.789540
## 2000-01-06  2.622254  1.140602  0.998559  0.618193
## 2000-01-07 -0.176217  1.317243 -0.517474 -1.351632
## 2000-01-08 -0.588750 -0.205541 -1.758490 -0.198265
dfl = pd.DataFrame(np.random.randn(5, 4),
                            columns=list('ABCD'),
                            index=pd.date_range('20130101', periods=5))
# Integer slices are not valid labels for a DatetimeIndex, so .loc rejects them:
# dfl.loc[2:3]
# TypeError: cannot do slice indexing on DatetimeIndex with these indexers [2] of type int
# Date strings are accepted as labels; both endpoints of the slice are included.
dfl.loc['20130102':'20130104']
##                    A         B         C         D
## 2013-01-02 -1.065678 -0.540383  1.612000  1.238411
## 2013-01-03  0.157837  1.340046 -1.768334  1.188485
## 2013-01-04 -1.445582  0.322281  0.877198 -0.743674
df1 = pd.DataFrame(np.random.randn(6, 4),
                            index=list('abcdef'),
                            columns=list('ABCD'))
df1
##           A         B         C         D
## a  0.283135  0.697850  1.065823  0.383778
## b -0.037880  0.544931 -0.333350 -0.475377
## c  0.215922  0.480092 -2.435526  1.040459
## d  1.107364 -0.308551 -0.077885 -0.878722
## e -0.733931  0.350377  0.200621 -0.860042
## f  0.789388 -0.057856 -0.570553 -1.195161
df1.loc[['a', 'b', 'd'], :]
##           A         B         C         D
## a  0.283135  0.697850  1.065823  0.383778
## b -0.037880  0.544931 -0.333350 -0.475377
## d  1.107364 -0.308551 -0.077885 -0.878722
df1.loc['d':, 'A':'C']
##           A         B         C
## d  1.107364 -0.308551 -0.077885
## e -0.733931  0.350377  0.200621
## f  0.789388 -0.057856 -0.570553
df1.loc['a']
## A    0.283135
## B    0.697850
## C    1.065823
## D    0.383778
## Name: a, dtype: float64
df1.loc['a'] > 0
## A    True
## B    True
## C    True
## D    True
## Name: a, dtype: bool
df1.loc[:, df1.loc['a'] > 0]
##           A         B         C         D
## a  0.283135  0.697850  1.065823  0.383778
## b -0.037880  0.544931 -0.333350 -0.475377
## c  0.215922  0.480092 -2.435526  1.040459
## d  1.107364 -0.308551 -0.077885 -0.878722
## e -0.733931  0.350377  0.200621 -0.860042
## f  0.789388 -0.057856 -0.570553 -1.195161
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
s1
## 0    0.462635
## 2    0.375835
## 4   -0.685573
## 6    0.478371
## 8   -1.206119
## dtype: float64
s1.iloc[:3]
## 0    0.462635
## 2    0.375835
## 4   -0.685573
## dtype: float64
s1.iloc[3]
## 0.4783714131319601
s1.iloc[:3] = 0
s1
## 0    0.000000
## 2    0.000000
## 4    0.000000
## 6    0.478371
## 8   -1.206119
## dtype: float64
df1 = pd.DataFrame(np.random.randn(6, 4),
                            index=list(range(0, 12, 2)),
                            columns=list(range(0, 8, 2)))
df1  
##            0         2         4         6
## 0   0.973026 -0.325970  1.061606 -0.638610
## 2  -0.612402  0.606244  1.672033 -0.317543
## 4   1.020128 -0.964174  0.813931 -0.004320
## 6  -0.185217  0.266766 -0.569220 -1.125756
## 8   0.224944 -1.644064  0.285576  2.009107
## 10  2.087139  2.232343  0.634498 -2.907553
df1.iloc[:3]
##           0         2         4         6
## 0  0.973026 -0.325970  1.061606 -0.638610
## 2 -0.612402  0.606244  1.672033 -0.317543
## 4  1.020128 -0.964174  0.813931 -0.004320
df1.iloc[1:5, 2:4]
##           4         6
## 2  1.672033 -0.317543
## 4  0.813931 -0.004320
## 6 -0.569220 -1.125756
## 8  0.285576  2.009107
df1.iloc[[1, 3, 5], [1, 3]]
##            2         6
## 2   0.606244 -0.317543
## 6   0.266766 -1.125756
## 10  2.232343 -2.907553
df1.iloc[1:3, :]
##           0         2         4         6
## 2 -0.612402  0.606244  1.672033 -0.317543
## 4  1.020128 -0.964174  0.813931 -0.004320
df1.iloc[:, 1:3]
##            2         4
## 0  -0.325970  1.061606
## 2   0.606244  1.672033
## 4  -0.964174  0.813931
## 6   0.266766 -0.569220
## 8  -1.644064  0.285576
## 10  2.232343  0.634498
df1.iloc[1, 1]
## 0.6062438224993173
df1.iloc[1]
## 0   -0.612402
## 2    0.606244
## 4    1.672033
## 6   -0.317543
## Name: 2, dtype: float64

Selection by callable

  • .loc, .iloc, and also [] indexing can accept a callable as indexer.

  • The callable must be a function with one argument (the calling Series or DataFrame) that returns valid output for indexing.

df1 = pd.DataFrame(np.random.randn(6, 4),
                            index=list('abcdef'),
                            columns=list('ABCD'))
df1
##           A         B         C         D
## a  0.924869  0.597715 -0.351335  0.566668
## b -0.558774  0.637554 -0.914388  0.246953
## c -0.606615  0.253886 -0.053065  1.536733
## d  0.414170  1.087474 -1.440676  0.189209
## e  1.110885 -1.166853  1.227121  0.443960
## f  0.084542 -1.001249 -0.079707  0.939407
df1.loc[lambda df: df['A'] > 0, :]
##           A         B         C         D
## a  0.924869  0.597715 -0.351335  0.566668
## d  0.414170  1.087474 -1.440676  0.189209
## e  1.110885 -1.166853  1.227121  0.443960
## f  0.084542 -1.001249 -0.079707  0.939407
df1.loc[:, lambda df: ['A', 'B']]
##           A         B
## a  0.924869  0.597715
## b -0.558774  0.637554
## c -0.606615  0.253886
## d  0.414170  1.087474
## e  1.110885 -1.166853
## f  0.084542 -1.001249
df1.iloc[:, lambda df: [0, 1]]
##           A         B
## a  0.924869  0.597715
## b -0.558774  0.637554
## c -0.606615  0.253886
## d  0.414170  1.087474
## e  1.110885 -1.166853
## f  0.084542 -1.001249
df1[lambda df: df.columns[0]]
## a    0.924869
## b   -0.558774
## c   -0.606615
## d    0.414170
## e    1.110885
## f    0.084542
## Name: A, dtype: float64
df1['A'].loc[lambda s: s > 0]
## a    0.924869
## d    0.414170
## e    1.110885
## f    0.084542
## Name: A, dtype: float64

Combining positional and label-based indexing

dfd = pd.DataFrame({'A': [1, 2, 3],
                    'B': [4, 5, 6]},
                   index=list('abc'))
dfd
##    A  B
## a  1  4
## b  2  5
## c  3  6
dfd.loc[dfd.index[[0, 2]], 'A']
## a    1
## c    3
## Name: A, dtype: int64
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]
## a    1
## c    3
## Name: A, dtype: int64
dfd.iloc[[0, 2], dfd.columns.get_indexer(['A', 'B'])]
##    A  B
## a  1  4
## c  3  6

Reindexing

s = pd.Series([1, 2, 3])
list(s.index)
## [0, 1, 2]
s.reindex([1, 2, 3])
## 1    2.0
## 2    3.0
## 3    NaN
## dtype: float64
labels = [1, 2, 3]
s.loc[s.index.intersection(labels)]
## 1    2
## 2    3
## dtype: int64
df = pd.DataFrame({"A":[1, 5, 3, 4, 2], 
                   "B":[3, 2, 4, 3, 4], 
                   "C":[2, 2, 7, 3, 4], 
                   "D":[4, 3, 6, 12, 7]}, 
                   index =["first", "second", "third", "fourth", "fifth"]) 
df
##         A  B  C   D
## first   1  3  2   4
## second  5  2  2   3
## third   3  4  7   6
## fourth  4  3  3  12
## fifth   2  4  4   7
df.reindex(["first", "dues", "trois", "fourth", "fifth"])
##           A    B    C     D
## first   1.0  3.0  2.0   4.0
## dues    NaN  NaN  NaN   NaN
## trois   NaN  NaN  NaN   NaN
## fourth  4.0  3.0  3.0  12.0
## fifth   2.0  4.0  4.0   7.0
df
##         A  B  C   D
## first   1  3  2   4
## second  5  2  2   3
## third   3  4  7   6
## fourth  4  3  3  12
## fifth   2  4  4   7
df.reindex(["first", "dues", "trois", "fourth", "fifth"], fill_value = 100)
##           A    B    C    D
## first     1    3    2    4
## dues    100  100  100  100
## trois   100  100  100  100
## fourth    4    3    3   12
## fifth     2    4    4    7
import pandas as pd 
  
# Creating another dataframe with the default integer index.
# NOTE(review): df1 is not used by the reindex calls below; they operate on
# df, the frame indexed "first".."fifth" that was defined earlier.
df1 = pd.DataFrame({"A":[1, 5, 3, 4, 2], 
                    "B":[3, 2, 4, 3, 4], 
                    "C":[2, 2, 7, 3, 4], 
                    "D":[4, 3, 6, 12, 7]}) 
  
# Reindex the column axis with old ("A", "B", "D") and new ("E") labels:
# the new column "E" is filled with NaN, and column "C" is left out because
# it is not requested.
df.reindex(columns =["A", "B", "D", "E"])
##         A  B   D   E
## first   1  3   4 NaN
## second  5  2   3 NaN
## third   3  4   6 NaN
## fourth  4  3  12 NaN
## fifth   2  4   7 NaN
# Reindex the columns again, this time filling the missing values with 25.
df.reindex(columns =["A", "B", "D", "E"], fill_value = 25)
##         A  B   D   E
## first   1  3   4  25
## second  5  2   3  25
## third   3  4   6  25
## fourth  4  3  12  25
## fifth   2  4   7  25
  • Generally, you can intersect the desired labels with the current axis, and then reindex.
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
labels = ['c', 'd']
# s.reindex(labels)
# ValueError: cannot reindex on an axis with duplicate labels
s.loc[s.index.intersection(labels)].reindex(labels)
## c    3.0
## d    NaN
## dtype: float64
  • However, this would still raise if your resulting index is duplicated.
labels = ['a', 'd']

# s.loc[s.index.intersection(labels)].reindex(labels)
# ValueError: cannot reindex on an axis with duplicate labels

Fast scalar value getting and setting

  • Since indexing with [] must handle a lot of cases (single-label access, slicing, boolean indexing, etc.), it has a bit of overhead in order to figure out what you’re asking for.

  • If you only want to access a scalar value, the fastest way is to use the at and iat methods, which are implemented on all of the data structures.

  • Similarly to loc, at provides label-based scalar lookups, while iat provides integer-based lookups analogously to iloc.

s = pd.Series([0, 1, 2, 3, 4, 5])
s
## 0    0
## 1    1
## 2    2
## 3    3
## 4    4
## 5    5
## dtype: int64
# Positional scalar access: the element at integer position 5.
s.iat[5]
## 5
# Row position 3, column position 0 of df (row "fourth", column "A").
df.iat[3, 0]
## 4
# NOTE: df is still the frame indexed "first".."fifth" with columns A-D, so
# neither the row label dates[5] (2000-01-06) nor the column "E" exists yet.
# Assigning through .at therefore enlarges df in place: a new row and a new
# column "E" appear, and every cell that was not assigned becomes NaN. This is
# why df shows a 2000-01-06 row and float values in the where() section.
df.at[dates[5], 'E'] = 7

Boolean indexing

  • Another common operation is the use of boolean vectors to filter the data. The operators are: | for or, & for and, and ~ for not.

  • These must be grouped by using parentheses, since by default Python will evaluate an expression such as df['A'] > 2 & df['B'] < 3 as df['A'] > (2 & df['B']) < 3, while the desired evaluation order is (df['A'] > 2) & (df['B'] < 3)

s = pd.Series(range(-3, 4))
s[s > 0]
## 4    1
## 5    2
## 6    3
## dtype: int64
s[(s < -1) | (s > 0.5)]
## 0   -3
## 1   -2
## 4    1
## 5    2
## 6    3
## dtype: int64
s[~(s < 0)]
## 3    0
## 4    1
## 5    2
## 6    3
## dtype: int64
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                    'c': np.random.randn(7)})
# Only want 'two' or 'three': build a boolean mask from column 'a'.
criterion = df2['a'].map(lambda x: x.startswith('t'))
df2[criterion]
##        a  b         c
## 2    two  y -0.463740
## 3  three  x  0.354440
## 4    two  y  0.868629
# Equivalent but slower: build the mask with a list comprehension.
df2[[x.startswith('t') for x in df2['a']]]
##        a  b         c
## 2    two  y -0.463740
## 3  three  x  0.354440
## 4    two  y  0.868629
# Multiple criteria: combine masks with & and parenthesise each comparison.
df2[criterion & (df2['b'] == 'x')]
##        a  b        c
## 3  three  x  0.35444
# The same row mask combined with a label slice over columns 'b' through 'c'.
df2.loc[criterion & (df2['b'] == 'x'), 'b':'c']
##    b        c
## 3  x  0.35444

The where() Method and Masking

  • Selecting values from a Series with a boolean vector generally returns a subset of the data.

  • To guarantee that selection output has the same shape as the original data, you can use the where method in Series and DataFrame.

s[s > 0]
## 4    1
## 5    2
## 6    3
## dtype: int64
s.where(s > 0)
## 0    NaN
## 1    NaN
## 2    NaN
## 3    NaN
## 4    1.0
## 5    2.0
## 6    3.0
## dtype: float64
  • In addition, where takes an optional other argument for replacement of values where the condition is False, in the returned copy.
df[df < 0]
##                       A   B   C   D   E
## first               NaN NaN NaN NaN NaN
## second              NaN NaN NaN NaN NaN
## third               NaN NaN NaN NaN NaN
## fourth              NaN NaN NaN NaN NaN
## fifth               NaN NaN NaN NaN NaN
## 2000-01-06 00:00:00 NaN NaN NaN NaN NaN
df.where(df < 0, -df)
##                        A    B    C     D    E
## first               -1.0 -3.0 -2.0  -4.0  NaN
## second              -5.0 -2.0 -2.0  -3.0  NaN
## third               -3.0 -4.0 -7.0  -6.0  NaN
## fourth              -4.0 -3.0 -3.0 -12.0  NaN
## fifth               -2.0 -4.0 -4.0  -7.0  NaN
## 2000-01-06 00:00:00  NaN  NaN  NaN   NaN -7.0
  • You may wish to set values based on some boolean criteria.
s2 = s.copy()
s2[s2 < 0] = 0
s2
## 0    0
## 1    0
## 2    0
## 3    0
## 4    1
## 5    2
## 6    3
## dtype: int64
df2 = df.copy()
df2[df2 < 0] = 0
df2
##                        A    B    C     D    E
## first                1.0  3.0  2.0   4.0  NaN
## second               5.0  2.0  2.0   3.0  NaN
## third                3.0  4.0  7.0   6.0  NaN
## fourth               4.0  3.0  3.0  12.0  NaN
## fifth                2.0  4.0  4.0   7.0  NaN
## 2000-01-06 00:00:00  NaN  NaN  NaN   NaN  7.0
  • By default, where returns a modified copy of the data.

  • There is an optional parameter inplace so that the original data can be modified without creating a copy:

df_orig = df.copy()
df_orig.where(df > 0, -df, inplace=True)
df_orig
##                        A    B    C     D    E
## first                1.0  3.0  2.0   4.0  NaN
## second               5.0  2.0  2.0   3.0  NaN
## third                3.0  4.0  7.0   6.0  NaN
## fourth               4.0  3.0  3.0  12.0  NaN
## fifth                2.0  4.0  4.0   7.0  NaN
## 2000-01-06 00:00:00  NaN  NaN  NaN   NaN  7.0
  • mask() is the inverse boolean operation of where.
s.mask(s >= 0)
## 0   -3.0
## 1   -2.0
## 2   -1.0
## 3    NaN
## 4    NaN
## 5    NaN
## 6    NaN
## dtype: float64
df.mask(df >= 0)
##                       A   B   C   D   E
## first               NaN NaN NaN NaN NaN
## second              NaN NaN NaN NaN NaN
## third               NaN NaN NaN NaN NaN
## fourth              NaN NaN NaN NaN NaN
## fifth               NaN NaN NaN NaN NaN
## 2000-01-06 00:00:00 NaN NaN NaN NaN NaN

補充教材:MultiIndex / advanced indexing

  • Hierarchical / Multi-level indexing is very exciting as it opens the door to some quite sophisticated data analysis and manipulation, especially for working with higher dimensional data.

  • In essence, it enables you to store and manipulate data with an arbitrary number of dimensions in lower dimensional data structures like Series (1d) and DataFrame (2d).

Creating a MultiIndex (hierarchical index) object

arrays = [
            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
         ]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
index
## MultiIndex([('bar', 'one'),
##             ('bar', 'two'),
##             ('baz', 'one'),
##             ('baz', 'two'),
##             ('foo', 'one'),
##             ('foo', 'two'),
##             ('qux', 'one'),
##             ('qux', 'two')],
##            names=['first', 'second'])
s = pd.Series(np.random.randn(8), index=index)
s
## first  second
## bar    one       0.331363
##        two      -0.723341
## baz    one      -0.837772
##        two      -0.247049
## foo    one       0.689141
##        two       1.121384
## qux    one       1.079568
##        two      -0.329814
## dtype: float64
iterables = [["bar", "baz", "foo", "qux"], ["one", "two"]]
pd.MultiIndex.from_product(iterables, names=["first", "second"])
## MultiIndex([('bar', 'one'),
##             ('bar', 'two'),
##             ('baz', 'one'),
##             ('baz', 'two'),
##             ('foo', 'one'),
##             ('foo', 'two'),
##             ('qux', 'one'),
##             ('qux', 'two')],
##            names=['first', 'second'])
df = pd.DataFrame(
             [["bar", "one"], ["bar", "two"], ["foo", "one"], ["foo", "two"]],
             columns=["first", "second"],
       )
pd.MultiIndex.from_frame(df)
## MultiIndex([('bar', 'one'),
##             ('bar', 'two'),
##             ('foo', 'one'),
##             ('foo', 'two')],
##            names=['first', 'second'])
arrays = [
             np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]),
             np.array(["one", "two", "one", "two", "one", "two", "one", "two"]),
         ]
s = pd.Series(np.random.randn(8), index=arrays)
s
## bar  one   -0.486092
##      two   -0.584221
## baz  one    1.026291
##      two    0.793277
## foo  one    0.909938
##      two   -0.586864
## qux  one    0.011569
##      two    0.855648
## dtype: float64
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df
##                 0         1         2         3
## bar one  1.427445 -0.008988  0.155867  1.773813
##     two  1.800974 -0.528620 -1.384463 -0.441665
## baz one  1.597536  0.674032  0.639739 -1.610106
##     two  1.375617  0.170506 -0.275090  0.751915
## foo one -0.844638  2.489081  0.429008  0.315384
##     two  0.265210 -0.511827  1.118101  1.154967
## qux one -0.466893  0.156133  0.329883  0.753334
##     two  0.414061  2.253521  0.054647  0.283191
df = pd.DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index)
df
## first        bar                 baz  ...       foo       qux          
## second       one       two       one  ...       two       one       two
## A       0.192898 -0.512821  0.145443  ...  0.222328  0.933562  0.333218
## B       0.070086 -0.597243 -0.323979  ... -0.525442  0.760575  1.122120
## C      -1.727654  0.164831  0.097347  ...  0.308360  0.186375 -0.842334
## 
## [3 rows x 8 columns]
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])
## first              bar                 baz                 foo          
## second             one       two       one       two       one       two
## first second                                                            
## bar   one     0.250750 -0.321133 -0.056850 -0.397326  2.197315 -0.277122
##       two    -0.219032 -1.175848  0.177063  1.551406  1.423623 -0.176671
## baz   one    -0.869122 -0.525475  0.096946 -0.379357 -0.586905 -1.327078
##       two    -0.041594 -0.646962  0.487522 -0.195762  1.414732 -1.155375
## foo   one    -2.637186  0.145552  0.195915  0.574690  1.069698 -1.467216
##       two    -0.500530  0.568413  0.703336 -1.572689  0.844033  2.222156

Basic indexing on axis with MultiIndex

df["bar"]
## second       one       two
## A       0.192898 -0.512821
## B       0.070086 -0.597243
## C      -1.727654  0.164831
df["bar", "one"]
## A    0.192898
## B    0.070086
## C   -1.727654
## Name: (bar, one), dtype: float64
df["bar"]["one"]
## A    0.192898
## B    0.070086
## C   -1.727654
## Name: one, dtype: float64
s["qux"]
## one    0.011569
## two    0.855648
## dtype: float64
df = df.T
df
##                      A         B         C
## first second                              
## bar   one     0.192898  0.070086 -1.727654
##       two    -0.512821 -0.597243  0.164831
## baz   one     0.145443 -0.323979  0.097347
##       two     0.606685 -0.956177  1.156921
## foo   one    -1.316539 -1.249647 -0.586794
##       two     0.222328 -0.525442  0.308360
## qux   one     0.933562  0.760575  0.186375
##       two     0.333218  1.122120 -0.842334
df.loc[("bar", "two")]
## A   -0.512821
## B   -0.597243
## C    0.164831
## Name: (bar, two), dtype: float64
df.loc[("bar", "two"), "A"]
## -0.5128207322863069
df.loc["bar"]
##                A         B         C
## second                              
## one     0.192898  0.070086 -1.727654
## two    -0.512821 -0.597243  0.164831
df.loc["baz":"foo"]
##                      A         B         C
## first second                              
## baz   one     0.145443 -0.323979  0.097347
##       two     0.606685 -0.956177  1.156921
## foo   one    -1.316539 -1.249647 -0.586794
##       two     0.222328 -0.525442  0.308360
df.loc[("baz", "two"):("qux", "one")]
##                      A         B         C
## first second                              
## baz   two     0.606685 -0.956177  1.156921
## foo   one    -1.316539 -1.249647 -0.586794
##       two     0.222328 -0.525442  0.308360
## qux   one     0.933562  0.760575  0.186375
df.loc[("baz", "two"):"foo"]
##                      A         B         C
## first second                              
## baz   two     0.606685 -0.956177  1.156921
## foo   one    -1.316539 -1.249647 -0.586794
##       two     0.222328 -0.525442  0.308360
df.loc[[("bar", "two"), ("qux", "one")]]
##                      A         B         C
## first second                              
## bar   two    -0.512821 -0.597243  0.164831
## qux   one     0.933562  0.760575  0.186375
s = pd.Series(
             [1, 2, 3, 4, 5, 6],
             index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]),
         )
s.loc[[("A", "c"), ("B", "d")]]  # list of tuples
## A  c    1
## B  d    5
## dtype: int64
s.loc[(["A", "B"], ["c", "d"])]  # tuple of lists
## A  c    1
##    d    2
## B  c    4
##    d    5
## dtype: int64
df
##                      A         B         C
## first second                              
## bar   one     0.192898  0.070086 -1.727654
##       two    -0.512821 -0.597243  0.164831
## baz   one     0.145443 -0.323979  0.097347
##       two     0.606685 -0.956177  1.156921
## foo   one    -1.316539 -1.249647 -0.586794
##       two     0.222328 -0.525442  0.308360
## qux   one     0.933562  0.760575  0.186375
##       two     0.333218  1.122120 -0.842334
# Cross-section: rows whose "second" level equals "one"; the selected level is
# dropped from the result's index.
df.xs("one", level="second")
##               A         B         C
## first                              
## bar    0.192898  0.070086 -1.727654
## baz    0.145443 -0.323979  0.097347
## foo   -1.316539 -1.249647 -0.586794
## qux    0.933562  0.760575  0.186375
# The same rows selected using the slicers; both index levels are kept.
df.loc[(slice(None), "one"), :]
##                      A         B         C
## first second                              
## bar   one     0.192898  0.070086 -1.727654
## baz   one     0.145443 -0.323979  0.097347
## foo   one    -1.316539 -1.249647 -0.586794
## qux   one     0.933562  0.760575  0.186375
# Transpose so that the MultiIndex is on the columns, then take the
# cross-section along axis=1.
df = df.T
df.xs("one", level="second", axis=1)
## first       bar       baz       foo       qux
## A      0.192898  0.145443 -1.316539  0.933562
## B      0.070086 -0.323979 -1.249647  0.760575
## C     -1.727654  0.097347 -0.586794  0.186375
# The same columns selected using the slicers; both column levels are kept.
df.loc[:, (slice(None), "one")]
## first        bar       baz       foo       qux
## second       one       one       one       one
## A       0.192898  0.145443 -1.316539  0.933562
## B       0.070086 -0.323979 -1.249647  0.760575
## C      -1.727654  0.097347 -0.586794  0.186375
  • You can pass drop_level=False to xs to retain the level that was selected.
df.xs("one", level="second", axis=1, drop_level=False)
## first        bar       baz       foo       qux
## second       one       one       one       one
## A       0.192898  0.145443 -1.316539  0.933562
## B       0.070086 -0.323979 -1.249647  0.760575
## C      -1.727654  0.097347 -0.586794  0.186375
df.xs("one", level="second", axis=1, drop_level=True)
## first       bar       baz       foo       qux
## A      0.192898  0.145443 -1.316539  0.933562
## B      0.070086 -0.323979 -1.249647  0.760575
## C     -1.727654  0.097347 -0.586794  0.186375

補充:Reserved words (keywords)

『保留字(Reserved words)』 不可做為變數名稱
R:
if, else, repeat, while, function,

for, in, next, break, TRUE, FALSE,

NULL, Inf, NaN, NA, NA_integer_,

NA_real_, NA_complex_, NA_character_

Python:
and, def, False, import, not,

True, as, del, finally, in, or,

try, assert, elif, for, is, pass,

while, break, else, from, lambda,

with, class, except, global, None, raise,

yield, continue, if, nonlocal, return,

async, await

(註:print 與 exec 僅在 Python 2 為保留字,Python 3 已改為內建函數;async 與 await 自 Python 3.7 起成為保留字。)

補充:R套件安裝

# install.packages("xts")
library(xts)
## 載入需要的套件:zoo
## 
## 載入套件:'zoo'
## 下列物件被遮斷自 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 載入套件:'xts'
## 下列物件被遮斷自 'package:dplyr':
## 
##     first, last
search()
##  [1] ".GlobalEnv"        "package:xts"       "package:zoo"      
##  [4] "package:forcats"   "package:stringr"   "package:dplyr"    
##  [7] "package:purrr"     "package:readr"     "package:tidyr"    
## [10] "package:tibble"    "package:ggplot2"   "package:tidyverse"
## [13] "package:stats"     "package:graphics"  "package:grDevices"
## [16] "package:utils"     "package:datasets"  "package:methods"  
## [19] "Autoloads"         "package:base"
stats::rnorm(n = 5)
## [1] -0.07624891  0.15594520 -0.09452932  0.95981448 -0.63450988

補充:循環補齊(recycling)

善用循環補齊,即元素對元素運算(向量化運算),可避免撰寫迴圈,執行速度也較快。

1 == 1
## [1] TRUE
1 == 2
## [1] FALSE
v1 <- 1:5
v2 <- 1:5
v3 <- 1:10
v4 <- 1:7
v1 + v4 # 會出現警告,因為長度並非互為倍數
## Warning in v1 + v4: 較長的物件長度並非較短物件長度的倍數
## [1]  2  4  6  8 10  7  9
v1 + 100 # 後面會進行循環補齊,形成具五個100的向量,並做元素對元素相加
## [1] 101 102 103 104 105
v1 * 4   # 後面會進行循環補齊,形成具五個4的向量,並做元素對元素相乘
## [1]  4  8 12 16 20
v1 >= 3 # 循環補齊做循環比較
## [1] FALSE FALSE  TRUE  TRUE  TRUE
x <- c(1, 2, 3, NA, 5, NA, 8)
x == NA   # 會得出NA,要小心
## [1] NA NA NA NA NA NA NA
2 == NA   # 會得出NA,要小心
## [1] NA
# is.___ 為「疑問句」
# as.___ 為「當作」
is.na(x)
## [1] FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE
set.seed(seed = 100) # 設定亂數種子
d <- rnorm(n = 100)
# 求出d > 1.96的數字個數
d > 1.96  # 可看出哪些值滿足,得出邏輯向量
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [97] FALSE FALSE FALSE FALSE
sum(d > 1.96) # 求個數,藉由「強制型別轉換」,TRUE為數值1,FALSE為數值0
## [1] 3
mean(d > 1.96) # 求比例(經驗機率):邏輯向量的平均即為 TRUE 所佔的比例
## [1] 0.03