這部分來探討R最基礎的資料結構- Atomic Vector ,再討論以原子向量為延伸,加入 dim 屬性的 Array 。接著討論屬於異質資料的 List 與 Data Frame 。
原子向量(atomic vector)又稱為 「同質性向量(homogeneous)」
v <- c(TRUE, FALSE)
class(v)
## [1] "logical"
v
## [1] TRUE FALSE
v <- c(TRUE, FALSE, 100L)
class(v)
## [1] "integer"
v
## [1] 1 0 100
v <- c(TRUE, FALSE, 100L, 100)
class(v)
## [1] "numeric"
v
## [1] 1 0 100 100
v <- c(TRUE, FALSE, 100L, 100, 100 + 0i)
class(v)
## [1] "complex"
v
## [1] 1+0i 0+0i 100+0i 100+0i 100+0i
v <- c(TRUE, FALSE, 100L, 100, "100")
class(v)
## [1] "character"
v
## [1] "TRUE" "FALSE" "100" "100" "100"
?vector # 查詢vector函數
vector() # 空向量(empty vector)
## logical(0)
vector(mode = "logical", length = 5) # 各個不同類別型態的向量初始化
## [1] FALSE FALSE FALSE FALSE FALSE
vector(mode = "numeric", length = 5) # 注意向量元素的內容
## [1] 0 0 0 0 0
vector(mode = "integer", length = 5)
## [1] 0 0 0 0 0
vector(mode = "character", length = 5)
## [1] "" "" "" "" ""
vector(mode = "complex", length = 5)
## [1] 0+0i 0+0i 0+0i 0+0i 0+0i
# NOT RUN
# 1x <- 100
# 錯誤: unexpected symbol in "1x"
# _x <- 100
# 錯誤: unexpected input in "_"
# x y <- 100
# 錯誤: unexpected symbol in "x y"
另外,如用dot開頭(ex:.xx)命名時,變數會被創建,但物件名不會出現在Global Environment中。不過只要將 ls() 的參數 all.names 設定為 TRUE,即可看見這些物件。
.x <- 100
ls() # 看不到.x
## [1] "v"
ls(all.names = TRUE) # 可看見.x
## [1] ".x" "v"
如要打破上述命名規則,可使用成對『``』將名字放在其中:
`1x` <- 100
`1x`
## [1] 100
`_x` <- 100
`_x`
## [1] 100
`:)` <- 100
`:)`
## [1] 100
`x y` <- 100
`x y`
## [1] 100
常見的R指令(如二元運算子),背後也都有一個函數作支援:
10 > 2
## [1] TRUE
`>`(10, 2) # 指令與上行相同
## [1] TRUE
x <- 100
x
## [1] 100
`<-`(x, 100) # 指令與上行相同
x
## [1] 100
dim(x = 1:10) ## NULL: 代表『不存在』意義的一個R物件
## NULL
dim(x = vector()) # 即使空向量也是
## NULL
class(x = rnorm) # function物件
## [1] "function"
numeric
100 # 數字100
## [1] 100
class(100) # 查詢物件的類別型態
## [1] "numeric"
rnorm(n = 10, mean = 5, sd = 2) # 產生常態分配的亂數
## [1] 5.4524949 5.4489767 0.4888329 4.1245115 5.7619111 4.5372758 1.6190812
## [8] 4.8013371 7.4468476 5.8080104
1e-3
## [1] 0.001
1E-3 # 大寫E與小寫e皆可
## [1] 0.001
class(x = 2.78e-3)
## [1] "numeric"
integer
x <- 1:10
length(x = x)
## [1] 10
dim(x = x)
## NULL
class(x = x)
## [1] "integer"
1L
## [1] 1
class(x = 1L)
## [1] "integer"
1:10; 10:1 # 加;號,簡短程式可寫在同一行
## [1] 1 2 3 4 5 6 7 8 9 10
## [1] 10 9 8 7 6 5 4 3 2 1
logical
TRUE
## [1] TRUE
FALSE
## [1] FALSE
c(T, T, F, F) # 縮寫
## [1] TRUE TRUE FALSE FALSE
character (character string)
"A"
## [1] "A"
'A'
## [1] "A"
c("A", "B", "C")
## [1] "A" "B" "C"
"" # 空字串
## [1] ""
# 常用的字元字串向量
letters
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
## [20] "t" "u" "v" "w" "x" "y" "z"
LETTERS
## [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S"
## [20] "T" "U" "V" "W" "X" "Y" "Z"
month.abb
## [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
month.name
## [1] "January" "February" "March" "April" "May" "June"
## [7] "July" "August" "September" "October" "November" "December"
complex
clx1 <- 1 + 1i
class(clx1)
## [1] "complex"
clx2 <- 1 + 0i
class(clx2)
## [1] "complex"
# 1 + 1*i
# not run
以複數形式計算 \(\sqrt{-1}\):
sqrt(-1 + 0i) # 可得虛數i
## [1] 0+1i
NA:not available 的縮寫,代表『遺失值』的意義。
class(NA)
## [1] "logical"
# Every atomic type has its own typed NA constant; NA_real_ is the double
# (numeric) one. The untyped NA shown above is logical.
class(NA_character_); class(NA_complex_); class(NA_integer_); class(NA_real_)
## [1] "character"
## [1] "complex"
## [1] "integer"
## [1] "numeric"
length(NA)
## [1] 1
x <- c(1, 2, NA, 4, NA)
x == NA
## [1] NA NA NA NA NA
# 判斷是否為NA,請用is.na()
is.na(x)
## [1] FALSE FALSE TRUE FALSE TRUE
NULL代表『不存在』意義的特殊物件。
class(NULL)
## [1] "NULL"
length(NULL)
## [1] 0
NULL == NULL
## logical(0)
NULL == NA
## logical(0)
NULL == 0
## logical(0)
NULL == NULL
## logical(0)
is.null(NULL)
## [1] TRUE
Inf
class(Inf)
## [1] "numeric"
print(1/0)
## [1] Inf
print(log(0))
## [1] -Inf
1 + 1/0
## [1] Inf
Inf == Inf
## [1] TRUE
Inf == -Inf
## [1] FALSE
is.infinite(-Inf)
## [1] TRUE
is.infinite(Inf)
## [1] TRUE
is.finite(Inf)
## [1] FALSE
NaN
NaN
## [1] NaN
1/0 - 1/0
## [1] NaN
sqrt(-1)
## Warning in sqrt(-1): 產生了 NaNs
## [1] NaN
log(-1)
## Warning in log(-1): 產生了 NaNs
## [1] NaN
class(NaN)
## [1] "numeric"
x <- c(1, 3, 1, NaN, 5, 4)
is.nan(x)
## [1] FALSE FALSE FALSE TRUE FALSE FALSE
names
x <- 1:5
x
## [1] 1 2 3 4 5
names(x)
## NULL
names(x) <- c("A", "B", "C", "D", "E")
x
## A B C D E
## 1 2 3 4 5
dim:由之前的範例可知,R的向量不具有維度(dim)屬性。
dim(x)
## NULL
值得注意的是,長度(length())並不算屬性。
[
v <- c(1, 4, 5, 2, 8)
length(v)
## [1] 5
v[1]
## [1] 1
v[3]
## [1] 5
v[1:3]
## [1] 1 4 5
# v[1,3, 3]
v[c(1, 3, 5)]
## [1] 1 5 8
v[7]
## [1] NA
v[4:8]
## [1] 2 8 NA NA NA
v <- c(1, 4, 5, 2, 8)
names(v) <- letters[1:5]
v
## a b c d e
## 1 4 5 2 8
v["a"]
## a
## 1
v[c("a", "c", "e")]
## a c e
## 1 5 8
v <- 1:5
length(v)
## [1] 5
v[-1]
## [1] 2 3 4 5
v[-c(1, 3, 5)]
## [1] 2 4
v[-(1:3)]
## [1] 4 5
v[-8] # 欲刪除之index不存在
## [1] 1 2 3 4 5
v <- 1:5
v[c(T, T, F, F, T)]
## [1] 1 2 5
NULL
v <- 1:5
v[NULL]
## integer(0)
v <- 1:5
v[]
## [1] 1 2 3 4 5
v <- 1:4
v[0]
## integer(0)
w <- letters[1:5]
w[0]
## character(0)
NA
v <- 1:5
v[NA]
## [1] NA NA NA NA NA
# v <- 1:4
# v[c(1, -3, 4)]
# Error in v[c(1, -3, 4)] : 只有負數下標中才能有 0
NA
v <- 1:5
v[c(1, 2, NA, 4)]
## [1] 1 2 NA 4
NA
v <- 1:5
v[c(-1, -2, 0, -4)]
## [1] 3 5
v <- 1:5
v[c(0, 1, 2)]
## [1] 1 2
v[c(1, 0, 2)]
## [1] 1 2
v[c(1, 2, 0)]
## [1] 1 2
v <- 1:5
v[c(0, -1, -2)]
## [1] 3 4 5
NULL
v <- 1:5
v[NULL]
## integer(0)
v[c(NULL, 1, 3)]
## [1] 1 3
NULL
# v[c(-1, -2, NA)]
# Error in v[c(-1, -2, NA)] : 只有負數下標中才能有 0
set.seed(seed = 1000) # 設定亂數種子
d <- rnorm(n = 100, mean = 10, sd = 5)
head(d) # 觀察前6筆資料
## [1] 7.771109 3.970717 10.205632 13.196942 6.067228 8.072554
tail(d) # 觀察後6筆資料
## [1] 9.478942 12.339197 12.219604 14.142764 8.064749 20.094691
tail(d, n = 10) # 可透過參數n設定,觀察更多筆資料
## [1] 19.857662 0.395024 12.310630 9.196380 9.478942 12.339197 12.219604
## [8] 14.142764 8.064749 20.094691
names(d)
## NULL
names(d) <- sample(x = c(letters, 1:9), size = 100, replace = TRUE) # 設定names屬性
透過R集合(set)相關的函數操作,可將向量視為集合,如集合的交集、聯集與差集等。
?setdiff # 查詢相關集合函數
setequal(x = c(1, 1, 2, 3, 3, 3), y = c(1, 3, 2)) # 集合的比較
## [1] TRUE
x <- names(d) # 抓出每個樣本點的名稱
x # 注意:名稱會有重複的現象。
## [1] "6" "t" "w" "1" "w" "b" "p" "r" "e" "7" "p" "f" "2" "h" "s" "x" "5" "g"
## [19] "y" "3" "9" "t" "s" "r" "e" "f" "u" "2" "t" "d" "9" "e" "9" "w" "a" "a"
## [37] "m" "v" "2" "u" "i" "8" "h" "a" "a" "b" "a" "7" "9" "u" "g" "z" "8" "z"
## [55] "s" "1" "o" "h" "s" "o" "v" "h" "v" "f" "z" "x" "7" "n" "c" "d" "7" "7"
## [73] "e" "5" "p" "y" "y" "v" "d" "o" "r" "r" "p" "z" "c" "e" "r" "g" "m" "6"
## [91] "i" "c" "f" "6" "q" "t" "2" "o" "l" "n"
setdiff(x = x, y = as.character(1:9)) # 以集合的方式扣除掉數字名後得出所有英文字母的名稱,不會有重複
## [1] "t" "w" "b" "p" "r" "e" "f" "h" "s" "x" "g" "y" "u" "d" "a" "m" "v" "i" "z"
## [20] "o" "n" "c" "q" "l"
d[setdiff(x = x, y = as.character(1:9))]
## t w b p r e f
## 3.9707172 10.2056316 8.0725535 7.6206606 13.5987535 9.9074719 7.2275565
## h s x g y u d
## 9.3956384 3.3197948 10.8502874 10.1246593 -0.2329271 1.0807793 16.1046783
## a m v i z o n
## 1.1690069 8.1690966 15.2880059 7.4134678 4.5265315 1.0483853 18.0960436
## c q l
## 5.2153616 9.4789423 8.0647494
注意:以上做法會有問題。因為名稱有重複,以 character vector subsetting 的方式按名稱取值時,每個名稱只會取回第一個相符的樣本點,因此上述結果中每個名稱都只出現一個值。建議:雖然R允許使用重複的名稱,但我們不建議這樣做。
nms <- setdiff(x = x, y = as.character(1:9)) # 這是我們要抓的名單
# 用is.element()一個個去檢查每一個樣本點是否在我們的名單(set)中(採logical vector subsetting)
d[is.element(el = names(d), set = nms)]
## t w w b p r e
## 3.9707172 10.2056316 6.0672282 8.0725535 7.6206606 13.5987535 9.9074719
## p f h s x g y
## 5.0878609 7.2275565 9.3956384 3.3197948 10.8502874 10.1246593 -0.2329271
## t s r e f u t
## 3.8649200 14.1712367 12.6628587 6.7658752 13.0158063 1.0807793 12.8048786
## d e w a a m v
## 16.1046783 13.4971476 7.6742453 1.1690069 10.9464430 8.1690966 15.2880059
## u i h a a b a
## 3.2582047 7.4134678 10.9273251 9.7815428 8.9204331 17.3188767 11.1483332
## u g z z s o h
## 5.1590856 11.2585569 4.5265315 5.0184900 10.5028901 1.0483853 11.5585061
## s o v h v f z
## 22.7699400 5.6958112 12.7196422 8.0383098 16.1772095 15.9804322 7.5212655
## x n c d e p y
## 8.5282939 18.0960436 5.2153616 10.2061856 11.4272881 9.2032820 7.6954055
## y v d o r r p
## 10.8421904 16.9774651 13.6421313 11.6754497 15.8463825 11.2398341 8.2092526
## z c e r g m i
## 16.9174666 12.0603458 9.3849607 9.6688534 -1.6124544 4.7717175 19.8576619
## c f q t o l n
## 0.3950240 12.3106304 9.4789423 12.3391970 14.1427640 8.0647494 20.0946908
d[is.element(el = names(d), set = c("A", "B", "100"))] # 會取出空向量
## named numeric(0)
d[is.element(el = names(d), set = c(nms, "1", "2", "3"))] # 名單添加1號, 2號, 3號
## t w 1 w b p r
## 3.9707172 10.2056316 13.1969420 6.0672282 8.0725535 7.6206606 13.5987535
## e p f 2 h s x
## 9.9074719 5.0878609 7.2275565 10.6069059 9.3956384 3.3197948 10.8502874
## g y 3 t s r e
## 10.1246593 -0.2329271 11.0657705 3.8649200 14.1712367 12.6628587 6.7658752
## f u 2 t d e w
## 13.0158063 1.0807793 11.6747108 12.8048786 16.1046783 13.4971476 7.6742453
## a a m v 2 u i
## 1.1690069 10.9464430 8.1690966 15.2880059 6.2918927 3.2582047 7.4134678
## h a a b a u g
## 10.9273251 9.7815428 8.9204331 17.3188767 11.1483332 5.1590856 11.2585569
## z z s 1 o h s
## 4.5265315 5.0184900 10.5028901 14.7684014 1.0483853 11.5585061 22.7699400
## o v h v f z x
## 5.6958112 12.7196422 8.0383098 16.1772095 15.9804322 7.5212655 8.5282939
## n c d e p y y
## 18.0960436 5.2153616 10.2061856 11.4272881 9.2032820 7.6954055 10.8421904
## v d o r r p z
## 16.9774651 13.6421313 11.6754497 15.8463825 11.2398341 8.2092526 16.9174666
## c e r g m i c
## 12.0603458 9.3849607 9.6688534 -1.6124544 4.7717175 19.8576619 0.3950240
## f q t 2 o l n
## 12.3106304 9.4789423 12.3391970 12.2196041 14.1427640 8.0647494 20.0946908
當Vector擁有維度(dimension)之後,其類別型態變為 『陣列(Array)』。當其維度為『1維』時,其為『一維陣列』。維度為『2維』時,其 特稱 為 『矩陣(Matrix)』 ,其他情況,如『3維以上』亦皆為『陣列』。
因為陣列(含矩陣)其來源為同質性的Atomic Vector,故陣列(含矩陣)亦為 同質性 的資料型態。
v <- 1:30
dim(v)
## NULL
class(v)
## [1] "integer"
dim(v) <- 30
class(v) # 為『1維陣列』
## [1] "array"
v
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30
dim(v) <- c(5, 6)
class(v) # 為矩陣
## [1] "matrix" "array"
v
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 6 11 16 21 26
## [2,] 2 7 12 17 22 27
## [3,] 3 8 13 18 23 28
## [4,] 4 9 14 19 24 29
## [5,] 5 10 15 20 25 30
dim(v) <- c(5, 3, 2)
class(v) # 3維陣列
## [1] "array"
dim(v) <- NULL # 失去維度後,會變為原來同質性的integer vector
v
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30
v1 <- 1:20
dim(v1) <- c(5, 4)
v1
## [,1] [,2] [,3] [,4]
## [1,] 1 6 11 16
## [2,] 2 7 12 17
## [3,] 3 8 13 18
## [4,] 4 9 14 19
## [5,] 5 10 15 20
v2 <- matrix(data = 1:20, nrow = 5, ncol = 4) # by column-order
v2 # v2與v1內容相同,但做法不同。
## [,1] [,2] [,3] [,4]
## [1,] 1 6 11 16
## [2,] 2 7 12 17
## [3,] 3 8 13 18
## [4,] 4 9 14 19
## [5,] 5 10 15 20
v3 <- matrix(data = 1:20, nrow = 5, ncol = 4, byrow = TRUE) # NOT by column-order
v3
## [,1] [,2] [,3] [,4]
## [1,] 1 2 3 4
## [2,] 5 6 7 8
## [3,] 9 10 11 12
## [4,] 13 14 15 16
## [5,] 17 18 19 20
值得注意的是,針對一個矩陣求取長度(length):
length(v3) # 回傳向量的長度
## [1] 20
由此可知,向量與矩陣的差別在於是否具有維度屬性。
List 為『異質性(heterogeneous)向量』,亦不具『維度』。
# ?list
l1 <- list(TRUE, 1L, 1, "1", list(100, 200), mean, median, sd)
class(l1)
## [1] "list"
length(l1)
## [1] 8
dim(l1)
## NULL
print(l1)
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] 1
##
## [[3]]
## [1] 1
##
## [[4]]
## [1] "1"
##
## [[5]]
## [[5]][[1]]
## [1] 100
##
## [[5]][[2]]
## [1] 200
##
##
## [[6]]
## function (x, ...)
## UseMethod("mean")
## <bytecode: 0x7fe86618d588>
## <environment: namespace:base>
##
## [[7]]
## function (x, na.rm = FALSE, ...)
## UseMethod("median")
## <bytecode: 0x7fe8624b1fb8>
## <environment: namespace:stats>
##
## [[8]]
## function (x, na.rm = FALSE)
## sqrt(var(if (is.vector(x) || is.factor(x)) x else as.double(x),
## na.rm = na.rm))
## <bytecode: 0x7fe8606c96e8>
## <environment: namespace:stats>
[:回傳子清單(sub-list)。
l <- list(TRUE, 1L, 1, "1", list(100, 200), matrix(1:20, nrow = 5, ncol = 4), mean)
l[5]
## [[1]]
## [[1]][[1]]
## [1] 100
##
## [[1]][[2]]
## [1] 200
class(l[6])
## [1] "list"
[[:回傳list中元素的內容物,內容物是什麼,提領出來就是什麼。
l <- list(TRUE, 1L, 1, "1", list(100, 200), matrix(1:20, nrow = 5, ncol = 4), mean)
l[6]
## [[1]]
## [,1] [,2] [,3] [,4]
## [1,] 1 6 11 16
## [2,] 2 7 12 17
## [3,] 3 8 13 18
## [4,] 4 9 14 19
## [5,] 5 10 15 20
class(l[[6]])
## [1] "matrix" "array"
l[[7]]
## function (x, ...)
## UseMethod("mean")
## <bytecode: 0x7fe86618d588>
## <environment: namespace:base>
class(l[[7]])
## [1] "function"
l[[7]](1:20)
## [1] 10.5
$:當list具備『names』屬性時即可使用$。功能類似[[,但不完全相同,且在互動模式下使用便利。
l <- list(A = 100, B = 200, E = 300)
l[["A"]]
## [1] 100
l$A
## [1] 100
l <- list(A = 100, B = 200, E = 300)
x <- "A"
l[[x]]
## [1] 100
l$x
## NULL
因為R會將程式 l$x 視為 l[["x"]](而非 l[[x]]),而 l 內並無元素命名為 x,故回傳NULL。
# ?data.frame
df <- data.frame(A = 10:1, B = rnorm(n = 10), C = runif(n = 10))
df
## A B C
## 1 10 -0.3615950 0.82076792
## 2 9 -1.1619680 0.20936290
## 3 8 -0.7114164 0.76743166
## 4 7 0.3489709 0.67099722
## 5 6 0.4273664 0.04625896
## 6 5 1.6608471 0.75360079
## 7 4 0.5816135 0.40778331
## 8 3 0.1434655 0.37252915
## 9 2 0.3433031 0.25478193
## 10 1 -0.9314912 0.39000806
class(df)
## [1] "data.frame"
typeof(df) # 可確認data frame的內部型態本質為list
## [1] "list"
data frame的特性有些來自matrix,有些來自list。
dim(df)
## [1] 10 3
dimnames(df)
## [[1]]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
##
## [[2]]
## [1] "A" "B" "C"
colnames(df)
## [1] "A" "B" "C"
rownames(df)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
df[, "A"]
## [1] 10 9 8 7 6 5 4 3 2 1
df[2, ]
## A B C
## 2 9 -1.161968 0.2093629
df["2", ]
## A B C
## 2 9 -1.161968 0.2093629
df[8, 1]
## [1] 3
df["8", 1]
## [1] 3
df["8", "A"]
## [1] 3
df[8, "A"]
## [1] 3
length(df)
## [1] 3
names(df)
## [1] "A" "B" "C"
df$A
## [1] 10 9 8 7 6 5 4 3 2 1
df$A[8]
## [1] 3
df[["A"]][8]
## [1] 3
df[[c(1, 8)]] # recursive subsetting。分兩層提取資料,第一層先抓第1個元素,進去第二層資料後提領第8個。
## [1] 3
library(tidyverse) # library(tibble)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
tibble(x = letters)
## # A tibble: 26 × 1
## x
## <chr>
## 1 a
## 2 b
## 3 c
## 4 d
## 5 e
## 6 f
## 7 g
## 8 h
## 9 i
## 10 j
## # … with 16 more rows
tibble(x = 1:3, y = list(1:5, 1:10, 1:20))
## # A tibble: 3 × 2
## x y
## <int> <list>
## 1 1 <int [5]>
## 2 2 <int [10]>
## 3 3 <int [20]>
tribble(
~x, ~y, ~z,
#--|--|----
"a", 2, 3.6,
"b", 1, 8.5
)
## # A tibble: 2 × 3
## x y z
## <chr> <dbl> <dbl>
## 1 a 2 3.6
## 2 b 1 8.5
as_tibble(iris)
## # A tibble: 150 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 140 more rows
tibble(x = 1:5, y = x ^ 2)
## # A tibble: 5 × 2
## x y
## <int> <dbl>
## 1 1 1
## 2 2 4
## 3 3 9
## 4 4 16
## 5 5 25
tibble(
x = 1:5,
y = 1,
z = x ^ 2 + y
)
## # A tibble: 5 × 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
## 3 3 1 10
## 4 4 1 17
## 5 5 1 26
titanic <- read_csv("titanic.csv")
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
titanic
## # A tibble: 891 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braun… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 1 1 Cumin… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 1 3 Heikk… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 1 1 Futre… fema… 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen… male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran… male NA 0 0 330877 8.46 <NA>
## 7 7 0 1 McCar… male 54 0 0 17463 51.9 E46
## 8 8 0 3 Palss… male 2 3 1 349909 21.1 <NA>
## 9 9 1 3 Johns… fema… 27 0 2 347742 11.1 <NA>
## 10 10 1 2 Nasse… fema… 14 1 0 237736 30.1 <NA>
## # … with 881 more rows, and 1 more variable: Embarked <chr>
write_csv(titanic, file = "titanic_R.csv")
The readr functions (such as read_csv() and write_csv()) are typically much faster (~10x) than their
base equivalents. Long running jobs have a progress bar, so you can see
what’s happening. If you’re looking for raw speed, try
data.table::fread(). It doesn’t fit quite so well into the
tidyverse, but it can be quite a bit faster.
They produce tibbles, they don’t convert character vectors to factors, use row names, or munge the column names. These are common sources of frustration with the base R functions.
They are more reproducible. Base R functions inherit some behaviour from your operating system and environment variables, so import code that works on your computer might not work on someone else’s.
Printing
When you print a tibble, it only shows the first ten rows and all the columns that fit on one screen.
It also prints an abbreviated description of the column type, and uses font styles and color for highlighting:
tibble(x = -5:100, y = 123.456 * (3 ^ x))
## # A tibble: 106 × 2
## x y
## <int> <dbl>
## 1 -5 0.508
## 2 -4 1.52
## 3 -3 4.57
## 4 -2 13.7
## 5 -1 41.2
## 6 0 123.
## 7 1 370.
## 8 2 1111.
## 9 3 3333.
## 10 4 10000.
## # … with 96 more rows
Subsetting
[ always returns another tibble. Contrast this with a
data frame: sometimes [ returns a data frame and sometimes
it just returns a vector:
df1 <- data.frame(x = 1:3, y = 3:1)
class(df1[, 1:2])
## [1] "data.frame"
class(df1[, 1])
## [1] "integer"
df2 <- tibble(x = 1:3, y = 3:1)
class(df2[, 1:2])
## [1] "tbl_df" "tbl" "data.frame"
class(df2[, 1])
## [1] "tbl_df" "tbl" "data.frame"
To extract a single column use [[ or $:
class(df2[[1]])
## [1] "integer"
class(df2$x)
## [1] "integer"
Tibbles are also stricter with $. Tibbles never
do partial matching, and will throw a warning and
return NULL if the column does not exist:
df <- data.frame(abc = 1)
df$a
## [1] 1
df2 <- tibble(abc = 1)
df2$a
## Warning: Unknown or uninitialised column: `a`.
## NULL
However, tibbles respect the drop argument if it is
provided:
data.frame(a = 1:3)[, "a", drop = TRUE]
## [1] 1 2 3
tibble(a = 1:3)[, "a", drop = TRUE]
## [1] 1 2 3
Tibbles do not support row names. They are removed when converting to a tibble or when subsetting:
df <- data.frame(a = 1:3, row.names = letters[1:3])
rownames(df)
## [1] "a" "b" "c"
rownames(as_tibble(df))
## [1] "1" "2" "3"
tbl <- tibble(a = 1:3)
rownames(tbl) <- letters[1:3]
## Warning: Setting row names on a tibble is deprecated.
rownames(tbl)
## [1] "a" "b" "c"
rownames(tbl[1, ])
## [1] "1"
See vignette("invariants") for a detailed comparison
between tibbles and data frames.
Recycling
When constructing a tibble, only values of length 1 are recycled.
The first column with length different to one determines the number of rows in the tibble, conflicts lead to an error:
tibble(a = 1, b = 1:3)
## # A tibble: 3 × 2
## a b
## <dbl> <int>
## 1 1 1
## 2 1 2
## 3 1 3
tibble(a = 1:3, b = 1)
## # A tibble: 3 × 2
## a b
## <int> <dbl>
## 1 1 1
## 2 2 1
## 3 3 1
# tibble(a = 1:3, c = 1:2)
# Error:
# ! Tibble columns must have compatible sizes.
# • Size 3: Existing data.
# • Size 2: Column `c`.
# ℹ Only values of size one are recycled.
This also extends to tibbles with zero rows, which is sometimes
important for programming:
tibble(a = 1, b = integer())
## # A tibble: 0 × 2
## # … with 2 variables: a <dbl>, b <int>
tibble(a = integer(), b = 1)
## # A tibble: 0 × 2
## # … with 2 variables: a <int>, b <dbl>
Arithmetic operations
Unlike data frames, tibbles don’t support arithmetic operations on all columns.
The result is silently coerced to a data frame. Do not rely on this behavior, it may become an error in a forthcoming version.
tbl <- tibble(a = 1:3, b = 4:6)
tbl * 2
## a b
## 1 2 8
## 2 4 10
## 3 6 12
float
1e-3
## 0.001
1E-3 # 大寫E也可以
## 0.001
type(1e3)
## <class 'float'>
import math
type(math.e ** 2 )
## <class 'float'>
int
x = 100
x
## 100
type(x)
## <class 'int'>
bool為True或False
本質上為int的子類別,True為1,False為0
擁有不同於int型態的列印(print)方式
True
## True
type(False)
## <class 'bool'>
float(True)
## 1.0
float(False)
## 0.0
int(True)
## 1
int(False)
## 0
True is 1
## False
True == 1
## True
str
# help(str)
'This is a string'
## 'This is a string'
"This is a string"
## 'This is a string'
'''This is a string'''
## 'This is a string'
"""This is a string"""
## 'This is a string'
反斜線(\)可用於『跳脫字元(escape character)』,可賦予特殊意義:
\n:代表換行
\t:代表Tab
\\:代表正常的反斜線\
\":代表”雙引號字元
x = "\tThis string starts with a \"tab\"."
x
## '\tThis string starts with a "tab".'
print(x)
## This string starts with a "tab".
x = "This string contains a single backslash(\\)."
x
## 'This string contains a single backslash(\\).'
print(x)
## This string contains a single backslash(\).
x = 'Can\'t get by without a backslash'
x
## "Can't get by without a backslash"
print(x)
## Can't get by without a backslash
x = "Can't get by without a backslash"
x
## "Can't get by without a backslash"
print(x)
## Can't get by without a backslash
x = "Backslash your \"character\" !"
x
## 'Backslash your "character" !'
print(x)
## Backslash your "character" !
x = 'You can leave the " alone'
x
## 'You can leave the " alone'
print(x)
## You can leave the " alone
Python提供『三引號』,可建立『跨行字串』,且字串中可包含『單引號'』與『雙引號"』
x = """Starting and ending a string with triple " characters
permits embedded newlines, and the use of " and ' without
backslashes"""
x
## 'Starting and ending a string with triple " characters\npermits embedded newlines, and the use of " and \' without\nbackslashes'
print(x)
## Starting and ending a string with triple " characters
## permits embedded newlines, and the use of " and ' without
## backslashes
# name = input("Name? ") # 從使用者鍵入值取得資料
# type(name)
# print(name)
str 補充:字串為不可變的「字元」序列,由字元依照順序組成,
故可進行取值與切片
x = 'Goodbye\n!' # \n:換行跳脫字元(escape character)
x
## 'Goodbye\n!'
len(x)
## 9
print(x) # print函數自動在字串尾端添加換行跳脫字元
## Goodbye
## !
z = 'a\n\tb'
z
## 'a\n\tb'
print(z)
## a
## b
print("abc\n") # 2次換行
## abc
print("abc\n", end='') # 1次換行
## abc
x = "Hello" + "World"
x
## 'HelloWorld'
x = "Hello" "World" # Python會將空白相隔的字串連接在一起
x
## 'HelloWorld'
split()與join():切割與連結字串
join()函數用於連結字串
雖+也可以用於連結字串,但+用於連結字串時,會建立新字串。故建立大量字串時,會產生很多無用的字串物件,程式碼效率會變差
" ".join(["join", "puts", "spaces", "between", "elements"])
## 'join puts spaces between elements'
"::".join(["Separated", "with", "colons"])
## 'Separated::with::colons'
"".join(["Separated", "by", "nothing"])
## 'Separatedbynothing'
split()會將字串分割為字串list,其預設以空白字元(whitespace)切割字串
空白字元包含空格、換行、定位等字元
x = "You\t\t can have tabs\t\n \t and newlines \n\n mixed in"
x.split()
## ['You', 'can', 'have', 'tabs', 'and', 'newlines', 'mixed', 'in']
x = "Mississippi"
x.split("ss")
## ['Mi', 'i', 'ippi']
x = 'a b c d'
x.split(' ', 1)
## ['a', 'b c d']
x.split(' ', 2)
## ['a', 'b', 'c d']
x.split(' ', 100)
## ['a', 'b', 'c', 'd']
float('123.456')
# float('xxyy')
# ValueError: could not convert string to float: 'xxyy'
## 123.456
int('3333')
# int('123.456')
# ValueError: invalid literal for int() with base 10: '123.456'
## 3333
int('10000', 8)
## 4096
int('101', 2)
## 5
int('ff', 16)
# int('123456', 6)
# ValueError: invalid literal for int() with base 6: '123456'
## 255
strip()、lstrip()、rstrip():移除開頭或結尾處的多餘空白
x = " Hello, World\t\t "
x.strip()
## 'Hello, World'
x.lstrip() # 移除左邊的空白
## 'Hello, World\t\t '
'Hello, World\t\t '
## 'Hello, World\t\t '
x.rstrip() # 移除右邊的空白
## ' Hello, World'
import string
string.whitespace # 查詢被Python視為空白的字元
## ' \t\n\r\x0b\x0c'
x = "www.python.org"
x.strip("w")
## '.python.org'
x.strip("gor") # 僅移除開頭與結尾處的g, o, r字元(字串中間的不受影響)
## 'www.python.'
x.strip(".gorw") # 僅移除開頭與結尾處的 ., g, o, r, w字元
## 'python'
demo = " Demo Example " # 移除所有的空白
demo.replace(" ", "")
## 'DemoExample'
x = "123"
x.isdigit()
## True
x.isalpha()
## False
x = "MM"
x.islower()
## False
x.isupper()
## True
import string
string.ascii_letters
## 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
string.ascii_uppercase
## 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
string.ascii_lowercase
## 'abcdefghijklmnopqrstuvwxyz'
string.digits
## '0123456789'
in operator
x = "The string"
"str" in x
## True
"sTr" in x
## False
"e s" in x
## True
find(string, start, end)
x = "Mississippi"
x.find("ss")
## 2
x.find("zz") # 找不到傳回-1
## -1
x = "Mississippi"
x.find("s")
## 2
x.find("s",2) # 從索引2開始尋找
## 2
x.find("s",4) # 從索引4開始尋找
## 5
x.find("s",4,5) # 從索引4開始尋找,並在索引5之前結束
## -1
x.find("ss", 3)
## 5
x.find("ss", 0, 3)
## -1
rfind():從字串的結尾向開頭進行搜尋,並傳回搜尋文字最後出現的索引位置
x = "Mississippi"
x.rfind("ss")
## 5
index()與rindex():index()與rindex()分別與find()和rfind()相同,但index()與rindex()找不到文字時,不會回傳-1而是引發ValueError例外錯誤
x = "Mississippi"
# x.index("zz")
# ValueError: substring not found
count()
x = "Mississippi"
x.count("ss")
## 2
x.count("s")
## 4
startswith()與endswith()
x = "Mississippi"
x.startswith("Miss")
## True
x.startswith("Mist")
## False
x.endswith("pi")
## True
x.endswith("p")
## False
雖然字串為不可變資料型態,但字串物件仍提供幾個method可對該字串進行操作,並回傳一個修改後的新字串
replace()
x = "Mississippi"
x.replace("ss", "+++")
## 'Mi+++i+++ippi'
maketrans()、translate()
x = "~x ^ (y % z)"
table = x.maketrans("~^()", "!&[]") # 組成一個字元參照表
type(table) # 參照表為dict型態
## <class 'dict'>
table # 字元參照表
## {126: 33, 94: 38, 40: 91, 41: 93}
x.translate(table) # 按照參照表table來替換字元
## '!x & [y % z]'
用list來修改字串
text = "Hello, World"
wordList = list(text)
wordList[6:] = []
wordList.reverse()
text = "".join(wordList)
print(text)
## ,olleH
repr()與str():將物件轉換為字串表示
repr()傳回的字串是給Python程式讀取(formal string representation),可透過此字串重建原始物件
str()傳回的字串是給人看的(informal string representation),可讀性比較高
許多情況之下,repr()與str()內容並無不同
repr([1,2,3,4])
## '[1, 2, 3, 4]'
x = [1]
x.append(2)
x
## [1, 2]
x.append([3, 4])
x
## [1, 2, [3, 4]]
"The list x is " + repr(x)
## 'The list x is [1, 2, [3, 4]]'
repr(len)
## '<built-in function len>'
repr(list)
## "<class 'list'>"
from datetime import datetime
now = datetime.now()
str(now)
## '2022-04-25 16:37:08.496266'
print(now)
## 2022-04-25 16:37:08.496266
repr(now)
## 'datetime.datetime(2022, 4, 25, 16, 37, 8, 496266)'
%格式化字串
舊式做法
常用的格式化規範
| 字串 | 意義 |
|---|---|
| %s | 字串 |
| %c | 字元 |
| %b | 二進位(註:%運算子不支援%b,僅format()與f-string的格式規範可用b) |
| %d | 十進位 |
| %x | 十六進位 |
| %f | 浮點數 |
| %e | 指數 |
| 不指名 | 與d相同 |
errno = 12345
name = "Bob"
"Hello, %s" % name # %s 告訴Python此處要替換成字串,他會到第二個 % 運算子後面找到變數
## 'Hello, Bob'
"%x" % errno # %x 把整數轉換成字串並以16進位數字表示
## '3039'
"Hi, %s, 錯誤:0x%x 發生了" % (name, errno)
## 'Hi, Bob, 錯誤:0x3039 發生了'
"Hi, %(Name)s, 錯誤:0x%(errNo)x 發生了" % {"errNo":errno, "Name":name}
## 'Hi, Bob, 錯誤:0x3039 發生了'
format()格式化字串
"{} is the {} of {}".format("Ambrosia", "food", "the gods")
## 'Ambrosia is the food of the gods'
"{{Ambrosia}} is the {} of {}".format("food", "the gods")
# 若格式化字串內需顯示{與}字元,則需重複寫兩次{{與}}
## '{Ambrosia} is the food of the gods'
"{} + {} = {}".format(1, 2, 1+2)
## '1 + 2 = 3'
x = [1, 2, "three"]
"The {} contains: {}".format("list", x)
## "The list contains: [1, 2, 'three']"
"{2} is the {0} of {1}".format("food", "the gods", "Ambrosia")
## 'Ambrosia is the food of the gods'
'{0}{1}{0}'.format('abc', 'def')
## 'abcdefabc'
"{food} is the food of {user}".format(food="Ambrosia", user="the gods")
## 'Ambrosia is the food of the gods'
"{0} is the food of {user[1]}".format("Ambrosia", user=["men", "the gods", "others"])
# "{0} is the food of {user}".format(user="the gods", "Ambrosia")
# SyntaxError: non-keyword arg after keyword arg
## 'Ambrosia is the food of the gods'
import math
"{}為 {}".format("圓周率", math.pi)
## '圓周率為 3.141592653589793'
"{:10s}為 {:10.5f}".format("圓周率", math.pi)
# 寬度為10, 類型為字串,字串預設靠左
# 寬度為10, 取5位小數點, 類型為浮點數(f), 數字預設靠右
## '圓周率 為 3.14159'
"{:>10s}為 {:+10.5f}".format("圓周率", math.pi)
# 寬度為10, 類型為字串,字串靠右
# 寬度為10, 取5位小數點, 類型為浮點數(f), 數字預設靠右, 並顯示+-號
## ' 圓周率為 +3.14159'
"{0:*<10s}為 {1:#> 10.5f}".format("圓周率", math.pi)
# 寬度為10, 靠左(<), 多餘空格用*字元填滿, 並以編號0來取得format()內第0個參數
# 寬度為10, 取5位小數點, 類型為浮點數(f), 空格代表若為正數就留空格,若為負數則加負號, 剩下空格用#字元填滿, 並用編號1來取得format()內第1個參數
## '圓周率*******為 ## 3.14159'
"{0:*<10s}為 {1:#>+10.5f}".format("圓周率", math.pi)
## '圓周率*******為 ##+3.14159'
"{name:P^10s}為 {value:=+10.2f}".format(name = "圓周率", value = math.pi)
# 寬度為10, 置中(^), 多餘空格用P字元填滿, 並以名稱name來取得format()內參數
# 寬度為10, 取2位小數點, 類型為浮點數(f), +號代表強制加上+-號, =號代表把正負號放到最左邊
## 'PPP圓周率PPPP為 + 3.14'
f-string:可直接將Python運算式嵌入字串中
速度較快
name = "Bob"
f'你好, {name}!'
## '你好, Bob!'
a = 5
b = 10
name = "Peter"
No = 100
f'5 加 10 等於 {a + b}, 而非 {2 * (a + b)}.'
## '5 加 10 等於 15, 而非 30.'
f'Hi~ {name:s}, 有錯誤 0x{errno:x} 發生了!'
# :s代表字串
# :x代表十六進位數
## 'Hi~ Peter, 有錯誤 0x3039 發生了!'
f'Hello, {name:s}'
## 'Hello, Peter'
f'Hello, {name=}' # 變數名稱後面加『=』, 就會把變數名稱與內容一起印出來
## "Hello, name='Peter'"
f'Hello, {No=}' # 變數名稱後面加『=』, 就會把變數名稱與內容一起印出來
## 'Hello, No=100'
complex
import math
import cmath
1 + 1j
## (1+1j)
1 - 2J # 大寫J也可以
# NOT RUN
# math.sqrt(-1)
## (1-2j)
cmath.sqrt(-1 + 0j)
## 1j
cmath.sqrt(-1)
## 1j
None:特殊的基本型態,代表『不存在』或是『空值』的意義(類似R中的 NULL )。
None在Python亦代表佔位符號(placeholder),用來表示資料中某一個欄位目前尚未得知具體之值,先保留該位置,之後再填值(類似R中 NA 的功能)。
type(None)
## <class 'NoneType'>
None == False
## False
None == 0
## False
None == None # None只會等於自己
## True
False == 0
# 與R比較 (Not Run):
# len(None)
# TypeError: object of type 'NoneType' has no len()
## True
Inf
float('Inf')
## inf
float('inf')
## inf
float('INF')
## inf
import math
math.inf
## inf
math.isinf(float("-inf")) # OUTPUT:True. Return True if x is a positive or negative infinity, and False otherwise.
## True
math.isinf(float("inf")) # OUTPUT:True
# by comparing to infinity
## True
float("inf") == float("inf") # OUTPUT:True
## True
float("-inf") == float("-inf") # OUTPUT:True
## True
float("inf") == float("-inf")
## False
NaN
import math
math.nan
## nan
math.isnan(math.nan)
## True
import numpy as np
np.nan
## nan
type(np.nan)
## <class 'float'>
x = np.array([1, 2, 3, np.nan, math.nan])
x
## array([ 1., 2., 3., nan, nan])
np.isnan(x)
## array([False, False, False, True, True])
list
list是由『有序(ordered)』的元素(element)構成。
[]建立list
list基本操作
l0 = [] # 空list
l1 = [True, 1, 1.0, '1', ['a', 'b', 'c']]
l1
## [True, 1, 1.0, '1', ['a', 'b', 'c']]
type(l1)
## <class 'list'>
len(l1)
## 5
list('NCCU')
## ['N', 'C', 'C', 'U']
list(range(-4, 4))
## [-4, -3, -2, -1, 0, 1, 2, 3]
l2 = [1, 2, [3, 4]]
l3 = [5, 6, [7, 8]]
l2 + l3 # list的拼接
## [1, 2, [3, 4], 5, 6, [7, 8]]
l2 * 3 # list的重複
## [1, 2, [3, 4], 1, 2, [3, 4], 1, 2, [3, 4]]
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
matrix
## [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
matrix[1]
## [4, 5, 6]
matrix[1][1]
## 5
[n]取值
x = ["first", "second", "third", "fourth"]
x[0]
## 'first'
x[2]
## 'third'
x[-1] # 取最後一個位置
## 'fourth'
x[-2] # 取得倒數第二個位置
## 'third'
提取多個元素時採切片(slicing)
[index1:index2]:代表指定從index1至index2之間(不包括index2)的元素
L = ['NCCU', 'MoneyBanking', 'QF']
L[1] = 'MONEY_BANKING'
L
## ['NCCU', 'MONEY_BANKING', 'QF']
L[0:2] = ['School', 'Department']
L
## ['School', 'Department', 'QF']
L = [1, 2, 3]
L
## [1, 2, 3]
L[1:2] = [8, 9] # 長度可變:insertion
L
## [1, 8, 9, 3]
L[1:1] = [6, 7] # 長度可變:insertion, replace nothing
L
## [1, 6, 7, 8, 9, 3]
L[1:3] = []
L
## [1, 8, 9, 3]
# R code:
# L <- list(1, 2, 3)
# L[1] <- list(8, 9)
# 被替換的項目不是替換值長度的倍數
可省略index1或index2
如兩個索引都省略,則會複製整個list
L = ['NCCU', 'MoneyBanking', 'QF', 'students', 'class']
L[:2] # 只取前2個元素
## ['NCCU', 'MoneyBanking']
L[2:] # 不取前2個元素
## ['QF', 'students', 'class']
x = L[:]
x
## ['NCCU', 'MoneyBanking', 'QF', 'students', 'class']
L[-1:2]
## []
x = [1, 2, 3]
x[1:2] = [8, 9] # 長度可變:insertion
x
## [1, 8, 9, 3]
x = [1, 2, 3]
x[1:1] = [8, 9] # 長度可變:insertion, replace nothing
x
## [1, 8, 9, 2, 3]
x = [1, 2, 3, 4]
x[len(x):] = [5, 6, 7]
x
## [1, 2, 3, 4, 5, 6, 7]
x[:0] = [-1, 0]
x
## [-1, 0, 1, 2, 3, 4, 5, 6, 7]
x[1:-1] = [] # 移除list內多個元素
x
## [-1, 7]
list更多的操作
L = ['eat', 'more', 'SPAM']
L.append('please')
L
## ['eat', 'more', 'SPAM', 'please']
L.sort()
L
## ['SPAM', 'eat', 'more', 'please']
L.append(['Chen']) # 請與L.extend() method 比較
L
# L.sort()
# TypeError: '<' not supported between instances of 'list' and 'str'
#
# Detailed traceback:
# File "<string>", line 1, in <module>
## ['SPAM', 'eat', 'more', 'please', ['Chen']]
L = ['abc', 'ABD', 'aBe']
L.sort()
L
## ['ABD', 'aBe', 'abc']
L = ['abc', 'ABD', 'aBe']
L.sort(key = str.lower)
L
## ['abc', 'ABD', 'aBe']
L = ['abc', 'ABD', 'aBe']
L.sort(key = str.lower, reverse = True)
L
## ['aBe', 'ABD', 'abc']
def compare_num_of_chars(string1):
    """Return the number of characters in ``string1``.

    Intended to be passed as the ``key=`` argument of ``list.sort`` so that
    strings are ordered by length. Despite the name, it does not compare two
    values; it maps a single string to its length.
    """
    return len(string1)
word_list = ['Python', 'is', 'better', 'than', 'C']
word_list.sort()
print(word_list)
## ['C', 'Python', 'better', 'is', 'than']
word_list = ['Python', 'is', 'better', 'than', 'C']
word_list.sort(key=compare_num_of_chars)
print(word_list)
## ['C', 'is', 'than', 'Python', 'better']
L = ['spam', 'eggs', 'ham']
L.index('eggs')
# L.index('egg')
# ValueError: 'egg' is not in list
#
# Detailed traceback:
# File "<string>", line 1, in <module>
## 1
L.insert(1, 'toast')
L
## ['spam', 'toast', 'eggs', 'ham']
L.remove('eggs')
L
## ['spam', 'toast', 'ham']
L.pop(1) # delete by position
## 'toast'
L
## ['spam', 'ham']
['1', '2', '1', '1', '3'].count('1') # number of occurrences
## 3
L = ['spam', 'eggs', 'ham', 'toast']
del L[0]
L
## ['eggs', 'ham', 'toast']
del L[1:3]
L
## ['eggs']
tuple與list類似,但tuple只能被建立而不能被修改(不可變,immutable),故可當dict的鍵值(key)
與list類似,在Python中被視為有順序(ordered)的『序列(sequence)』
其中,str、list與tuple在Python中皆視為有順序的『序列(sequence)』
也可採[]進行取值與切片
+與*亦可使用
import random
t1 = (1, 2, 2, random.gauss(10, 2))
t1
## (1, 2, 2, 10.798491592666888)
type(t1)
## <class 'tuple'>
t2 = 2, 4, 8, 1
t2
## (2, 4, 8, 1)
type(t2)
## <class 'tuple'>
max(t2)
## 8
t3 = 3,
type(t3)
## <class 'tuple'>
3 in [3, 4, 7, 9, 1]
## True
one, two, three, four = 1, 2, 3, 4 # 自動打包(packing)後自動解包(unpacking),同時指定4個變數值
one
## 1
two
## 2
v1, v2, v3 = [1, 2, 3]
v1
## 1
w1, w2, w3 = 'abc'
w2
# q1, q2 = 'ABC' # 多重指定變數值時,兩邊數量要一樣多
# ValueError: too many values to unpack (expected 2)
#
# Detailed traceback:
# File "<string>", line 1, in <module>
## 'b'
*號版本自動解包
有*的標記的元素會將所有多餘的項目當作list來接收
a, b, *c = (1, 2, 3, 4)
a
## 1
b
## 2
c
## [3, 4]
a, *b, c = (1, 2, 3, 4)
a
## 1
b
## [2, 3]
c
## 4
*a, b, c = (1, 2, 3, 4)
a
## [1, 2]
b
## 3
c
## 4
a,b,c
## ([1, 2], 3, 4)
a, b, c, d, *e = (1, 2, 3, 4)
a
## 1
b
## 2
c
## 3
e
## []
x = [1, 2, 3, 4, 5]
a, b, *_ = x
a
## 1
b
## 2
_
## [3, 4, 5]
list()函數將任何序列型資料轉為list。而tuple()函數則轉為tuple。
list((1, 2, 3, 4))
## [1, 2, 3, 4]
tuple([1, 2, 3, 4])
## (1, 2, 3, 4)
list('NCCU')
## ['N', 'C', 'C', 'U']
tuple('Money and Banking')
## ('M', 'o', 'n', 'e', 'y', ' ', 'a', 'n', 'd', ' ', 'B', 'a', 'n', 'k', 'i', 'n', 'g')
a = 100
b = 200
temp = a
a = b
b = temp
a, b
## (200, 100)
a = 100
b = 200
a, b = b, a
a, b
## (200, 100)
list()函數將任何序列型資料轉為list。而tuple()函數則轉為tuple。
list((1, 2, 3, 4))
## [1, 2, 3, 4]
tuple([1, 2, 3, 4])
## (1, 2, 3, 4)
list('NCCU')
## ['N', 'C', 'C', 'U']
tuple('Money and Banking')
## ('M', 'o', 'n', 'e', 'y', ' ', 'a', 'n', 'd', ' ', 'B', 'a', 'n', 'k', 'i', 'n', 'g')
set
set代表由無順序的資料所構成的集合(list與tuple之元素則有順序意義)
set中重複的資料會被自動刪除不計,可保持元素唯一性
值得注意的是,set中的元素須為『不可變的資料(immutable)』,故int、float、str、與tuple可作為set的元素。而list、dict與set本身則不行。
當任務的重點為判斷一群資料是否包含某個物件,而不重視順序時,則可使用set型別
因set中的元素沒有順序,故無法使用索引[n]或切片來存取,且+與*也無法使用
x = {1, 2, 1, 3, 3, 1, 2, 4}
x
## {1, 2, 3, 4}
type(x)
## <class 'set'>
x = set([1, 2, 1, 3, 3, 1, 2, 4])
x
## {1, 2, 3, 4}
type(x)
## <class 'set'>
x.add(6)
x
## {1, 2, 3, 4, 6}
x.remove(2)
x
## {1, 3, 4, 6}
3 in x
## True
5 in x
## False
x = {1, 2, 1, 2, 1, 2}
y = set([1, 7, 7, 8, 9])
x
## {1, 2}
y
## {8, 1, 9, 7}
x | y # 聯集
## {1, 2, 7, 8, 9}
x & y # 交集
## {1}
x ^ y # Symmetric Difference (XOR): 只屬於其中一個集合,且不屬於另一個集合之元素所形成的集合
## {2, 7, 8, 9}
x - y # 差集
## {2}
x1 = {'foo', 'bar', 'baz'}
x1.issubset({'foo', 'bar', 'baz', 'qux', 'quux'})
## True
x1 <= {'foo', 'bar', 'baz', 'qux', 'quux'}
## True
x2 = {'baz', 'qux', 'quux'}
x1 <= x2
## False
v = {"a", "e", "i", "o", "u"}
v.add("x")
v
## {'e', 'x', 'o', 'a', 'u', 'i'}
v.discard("z") # 與v.remove()不同,當欲移除之元素不存在時,則不會出現錯誤訊息
v
## {'e', 'x', 'o', 'a', 'u', 'i'}
letters = set("alice")
letters
## {'e', 'l', 'i', 'c', 'a'}
letters.intersection(v)
## {'e', 'a', 'i'}
letters.union(v)
## {'e', 'l', 'i', 'o', 'x', 'c', 'a', 'u'}
letters.difference(v)
## {'c', 'l'}
letters.symmetric_difference(v)
## {'x', 'c', 'o', 'u', 'l'}
s = {"a", "e"}
s.issubset(letters)
## True
letters.issuperset(s)
## True
letters.isdisjoint(s)
## False
因set無法成為另一個set的元素(因set為可變的型別),故Python提供frozenset型別解決上述問題
frozenset為不可變型別
x = set([1, 2, 3, 1, 3, 5])
z = frozenset(x)
type(x)
## <class 'set'>
type(z)
## <class 'frozenset'>
# z.add(6)
# AttributeError: 'frozenset' object has no attribute 'add'
#
# Detailed traceback:
# File "<string>", line 1, in <module>
x.add(z)
x
## {1, 2, 3, 5, frozenset({1, 2, 3, 5})}
len(x)
## 5
dict
dict由成對的「鍵(key):值(value)」所組成,其中鍵可為『int』、
『str』、或其他Python不可變物件,且必須為唯一的
在Python中,「鍵對應值」的資料結構即以dict來實作
ages = {'Mary':13, 'John': 14, 'Tony':13}
type(ages)
## <class 'dict'>
ages
## {'Mary': 13, 'John': 14, 'Tony': 13}
'Mary' in ages
## True
x = {}
x
## {}
type(x)
## <class 'dict'>
x[0] = 'NCCU' # 此0是當作key,並非當作索引用
x[1] = 'Money and Banking'
x
## {0: 'NCCU', 1: 'Money and Banking'}
x[1]
## 'Money and Banking'
len(x)
## 2
# empty list
y = []
type(y)
# y[0] = 'NCCU' # 指定一個不存在的索引值0,值得注意的是:R允許這個操作
# IndexError: list assignment index out of range
#
# Detailed traceback:
# File "<string>", line 1, in <module>
## <class 'list'>
l <- list()
class(l)
## [1] "list"
l[1] <- "NCCU"
l
## [[1]]
## [1] "NCCU"
dict的其他操作
english_to_french = {'red': 'rouge', 'blue': 'bleu', 'green': 'vert'}
len(english_to_french)
## 3
list(english_to_french.keys())
## ['red', 'blue', 'green']
list(english_to_french.values())
## ['rouge', 'bleu', 'vert']
list(english_to_french.items())
## [('red', 'rouge'), ('blue', 'bleu'), ('green', 'vert')]
del english_to_french['green']
list(english_to_french.items())
## [('red', 'rouge'), ('blue', 'bleu')]
'red' in english_to_french
## True
'orange' in english_to_french
## False
english_to_french.get('blue', 'No translation')
## 'bleu'
english_to_french.get('chartreuse', 'No translation')
## 'No translation'
english_to_french.setdefault('chartreuse', 'No translation') # 找不到該鍵(key)時,會新增 鍵:值
## 'No translation'
x = {0: 'zero', 1: 'one'}
y = x.copy()
y
## {0: 'zero', 1: 'one'}
z = {1: 'One', 2: 'Two'}
x = {0: 'zero', 1: 'one'}
x.update(z)
x
## {0: 'zero', 1: 'One', 2: 'Two'}
NumPy’s main object is the homogeneous multidimensional array.
It is a table of elements (usually numbers), all of the same
type, indexed by a
tuple of non-negative integers.
In NumPy dimensions are called axes.
Def: dimension為每一軸所含的元素個數。
Ex: 2個軸,每個軸有3個元素(3維)。
NumPy’s array class is called ndarray. It is also
known by the alias
array.
import time
import numpy as np
def benchmark(n):
    """Time summing the integers 1..n with a Python list and with a NumPy array.

    Parameters
    ----------
    n : int
        Upper bound (inclusive) of the integer sequence that is summed.

    Prints one elapsed-time line per approach and returns None.
    """
    # using list: only the built-in sum() call is timed, not the list construction
    a_list = list(range(1, n + 1))
    t1 = time.perf_counter()  # perf_counter is the high-resolution clock for short intervals
    sum(a_list)
    t2 = time.perf_counter()
    print('Time taken by Python is', t2 - t1)
    # using numpy: only the np.sum() call is timed, not the array construction
    a = np.arange(1, n + 1)
    t1 = time.perf_counter()
    np.sum(a)
    t2 = time.perf_counter()
    print('Time taken by NumPy is', t2 - t1)
benchmark(1000000)
## Time taken by Python is 0.0062713623046875
## Time taken by NumPy is 0.0008161067962646484
import numpy as np
a = np.arange(15).reshape(3, 5)
a
## array([[ 0, 1, 2, 3, 4],
## [ 5, 6, 7, 8, 9],
## [10, 11, 12, 13, 14]])
a.shape
## (3, 5)
a.ndim
## 2
a.dtype.name
## 'int64'
a.itemsize
## 8
a.size
## 15
type(a)
## <class 'numpy.ndarray'>
b = np.array([6, 7, 8])
b
## array([6, 7, 8])
type(b)
## <class 'numpy.ndarray'>
# a = np.array(1, 2, 3, 4) # WRONG
# TypeError: array() takes from 1 to 2 positional arguments but 4 were given
c = np.array([[1, 2], [3, 4]], dtype=complex)
a = np.array([[1, 2, 3], [10, 20, 30, 40]])
## <string>:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
a
## array([list([1, 2, 3]), list([10, 20, 30, 40])], dtype=object)
a.shape
## (2,)
np.zeros((3, 4))
## array([[0., 0., 0., 0.],
## [0., 0., 0., 0.],
## [0., 0., 0., 0.]])
np.ones((2, 3, 4), dtype=np.int16)
## array([[[1, 1, 1, 1],
## [1, 1, 1, 1],
## [1, 1, 1, 1]],
##
## [[1, 1, 1, 1],
## [1, 1, 1, 1],
## [1, 1, 1, 1]]], dtype=int16)
np.empty((2, 3))
## array([[1.39069238e-309, 1.39069238e-309, 1.39069238e-309],
## [1.39069238e-309, 1.39069238e-309, 1.39069238e-309]])
from numpy import pi
np.linspace(0, 2, 9)
## array([0. , 0.25, 0.5 , 0.75, 1. , 1.25, 1.5 , 1.75, 2. ])
x = np.linspace(0, 2 * pi, 100)
f = np.sin(x)
f
## array([ 0.00000000e+00, 6.34239197e-02, 1.26592454e-01, 1.89251244e-01,
## 2.51147987e-01, 3.12033446e-01, 3.71662456e-01, 4.29794912e-01,
## 4.86196736e-01, 5.40640817e-01, 5.92907929e-01, 6.42787610e-01,
## 6.90079011e-01, 7.34591709e-01, 7.76146464e-01, 8.14575952e-01,
## 8.49725430e-01, 8.81453363e-01, 9.09631995e-01, 9.34147860e-01,
## 9.54902241e-01, 9.71811568e-01, 9.84807753e-01, 9.93838464e-01,
## 9.98867339e-01, 9.99874128e-01, 9.96854776e-01, 9.89821442e-01,
## 9.78802446e-01, 9.63842159e-01, 9.45000819e-01, 9.22354294e-01,
## 8.95993774e-01, 8.66025404e-01, 8.32569855e-01, 7.95761841e-01,
## 7.55749574e-01, 7.12694171e-01, 6.66769001e-01, 6.18158986e-01,
## 5.67059864e-01, 5.13677392e-01, 4.58226522e-01, 4.00930535e-01,
## 3.42020143e-01, 2.81732557e-01, 2.20310533e-01, 1.58001396e-01,
## 9.50560433e-02, 3.17279335e-02, -3.17279335e-02, -9.50560433e-02,
## -1.58001396e-01, -2.20310533e-01, -2.81732557e-01, -3.42020143e-01,
## -4.00930535e-01, -4.58226522e-01, -5.13677392e-01, -5.67059864e-01,
## -6.18158986e-01, -6.66769001e-01, -7.12694171e-01, -7.55749574e-01,
## -7.95761841e-01, -8.32569855e-01, -8.66025404e-01, -8.95993774e-01,
## -9.22354294e-01, -9.45000819e-01, -9.63842159e-01, -9.78802446e-01,
## -9.89821442e-01, -9.96854776e-01, -9.99874128e-01, -9.98867339e-01,
## -9.93838464e-01, -9.84807753e-01, -9.71811568e-01, -9.54902241e-01,
## -9.34147860e-01, -9.09631995e-01, -8.81453363e-01, -8.49725430e-01,
## -8.14575952e-01, -7.76146464e-01, -7.34591709e-01, -6.90079011e-01,
## -6.42787610e-01, -5.92907929e-01, -5.40640817e-01, -4.86196736e-01,
## -4.29794912e-01, -3.71662456e-01, -3.12033446e-01, -2.51147987e-01,
## -1.89251244e-01, -1.26592454e-01, -6.34239197e-02, -2.44929360e-16])
a = np.arange(6)
np.reshape(a, newshape=(2, 3), order='C')
## array([[0, 1, 2],
## [3, 4, 5]])
order:
C means to read/write the elements using C-like
index order.
F means to read/write the elements using
Fortran-like index order.
a = np.array([20, 30, 40, 50])
b = np.arange(4)
b
## array([0, 1, 2, 3])
c = a - b
c
## array([20, 29, 38, 47])
b ** 2
## array([0, 1, 4, 9])
10 * np.sin(a)
## array([ 9.12945251, -9.88031624, 7.4511316 , -2.62374854])
a < 35
## array([ True, True, False, False])
Unlike in many matrix languages, the product operator
* operates elementwise in NumPy
arrays.
The matrix product can be performed using the @
operator (in python >=3.5) or the dot function or
method:
A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])
A * B # elementwise product
## array([[2, 0],
## [0, 4]])
A @ B # matrix product
## array([[5, 4],
## [3, 4]])
A.dot(B) # another matrix product
## array([[5, 4],
## [3, 4]])
Some operations, such as += and *=, act in
place to modify an existing array rather than create a new one.
rg = np.random.default_rng(1) # create instance of default random number generator
a = np.ones((2, 3), dtype=int)
b = rg.random((2, 3))
a *= 3
a
## array([[3, 3, 3],
## [3, 3, 3]])
b += a
b
# a += b # b is not automatically converted to integer type
# UFuncTypeError: Cannot cast ufunc 'add' output from dtype('float64') to dtype('int64') with casting rule 'same_kind'
## array([[3.51182162, 3.9504637 , 3.14415961],
## [3.94864945, 3.31183145, 3.42332645]])
x = np.arange(6)
x = x.reshape((2, 3))
x
## array([[0, 1, 2],
## [3, 4, 5]])
np.zeros_like(x)
## array([[0, 0, 0],
## [0, 0, 0]])
np.ones_like(x)
## array([[1, 1, 1],
## [1, 1, 1]])
np.empty_like(x)
## array([[1, 1, 1],
## [1, 1, 1]])
np.full((2, 2), np.inf)
## array([[inf, inf],
## [inf, inf]])
np.full((2, 2), 10)
## array([[10, 10],
## [10, 10]])
np.full((2, 3), [10, 20, 30])
## array([[10, 20, 30],
## [10, 20, 30]])
from numpy import pi
a = np.ones(3, dtype=np.int32)
b = np.linspace(0, pi, 3)
b.dtype.name
## 'float64'
c = a + b
c
## array([1. , 2.57079633, 4.14159265])
c.dtype.name
## 'float64'
d = np.exp(c * 1j)
d
## array([ 0.54030231+0.84147098j, -0.84147098+0.54030231j,
## -0.54030231-0.84147098j])
d.dtype.name
## 'complex128'
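Many unary operations, such as computing the sum of all the elements in the array, are implemented as methods of the ndarray class.
a = rg.random((2, 3))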
a
## array([[0.82770259, 0.40919914, 0.54959369],
## [0.02755911, 0.75351311, 0.53814331]])
a.sum()
## 3.1057109529998157
a.min()
## 0.027559113243068367
a.max()
## 0.8277025938204418
b = np.arange(12).reshape(3, 4)
b
## array([[ 0, 1, 2, 3],
## [ 4, 5, 6, 7],
## [ 8, 9, 10, 11]])
b.sum(axis=0) # sum of each column
## array([12, 15, 18, 21])
b.min(axis=1) # min of each row
## array([0, 4, 8])
b.cumsum(axis=1) # cumulative sum along each row
## array([[ 0, 1, 3, 6],
## [ 4, 9, 15, 22],
## [ 8, 17, 27, 38]])
NumPy provides familiar mathematical functions such as sin, cos,
and exp. In NumPy, these are called “universal
functions” (ufunc).
Within NumPy, these functions operate elementwise on an array, producing an array as output.
B = np.arange(3)
B
## array([0, 1, 2])
np.exp(B)
## array([1. , 2.71828183, 7.3890561 ])
np.sqrt(B)
## array([0. , 1. , 1.41421356])
C = np.array([2., -1., 4.])
np.add(B, C)
## array([2., 0., 6.])
See also:
all, any, apply_along_axis,
argmax, argmin, argsort, average, bincount, ceil, clip, conj, corrcoef, cov, cross, cumprod, cumsum, diff, dot, floor, inner, invert, lexsort, max, maximum, mean, median, min, minimum, nonzero, outer, prod, re, round, sort, std, sum, trace, transpose, var, vdot, vectorize, where
a = np.arange(10)**3
a[2]
## 8
a[2:5]
## array([ 8, 27, 64])
a[:6:2] = 1000
a[::-1] # reversed a
## array([ 729, 512, 343, 216, 125, 1000, 27, 1000, 1, 1000])
# Iterate over the array one element at a time and print each element's cube root.
for i in a:
    print(i**(1 / 3.))
## 9.999999999999998
## 1.0
## 9.999999999999998
## 3.0
## 9.999999999999998
## 4.999999999999999
## 5.999999999999999
## 6.999999999999999
## 7.999999999999999
## 8.999999999999998
Multidimensional arrays can have one index per axis.
These indices are given in a tuple separated by commas:
def f(x, y):
    """Return ``10 * x + y``.

    Works on plain numbers and, elementwise, on NumPy index arrays such as
    those supplied by ``np.fromfunction``.
    """
    return 10 * x + y
b = np.fromfunction(f, (5, 4), dtype=int)
b
## array([[ 0, 1, 2, 3],
## [10, 11, 12, 13],
## [20, 21, 22, 23],
## [30, 31, 32, 33],
## [40, 41, 42, 43]])
b[2, 3]
## 23
b[0:5, 1] # each row in the second column of b
## array([ 1, 11, 21, 31, 41])
b[:, 1] # equivalent to the previous example
## array([ 1, 11, 21, 31, 41])
b[1:3, :] # each column in the second and third row of b
## array([[10, 11, 12, 13],
## [20, 21, 22, 23]])
b[-1] # the last row. Equivalent to b[-1, :]
## array([40, 41, 42, 43])
The dots (...) represent as many
colons as needed to produce a complete indexing tuple. For example, if
x is an array with 5 axes, then
x[1, 2, ...] is equivalent to
x[1, 2, :, :, :],
x[..., 3] to x[:, :, :, :, 3]
and
x[4, ..., 5, :] to
x[4, :, :, 5, :].
c = np.array([[[ 0, 1, 2], # a 3D array (two stacked 2D arrays)
[ 10, 12, 13]],
[[100, 101, 102],
[110, 112, 113]]])
c.shape
## (2, 2, 3)
c[1, ...] # same as c[1, :, :] or c[1]
## array([[100, 101, 102],
## [110, 112, 113]])
c[..., 2] # same as c[:, :, 2]
## array([[ 2, 13],
## [102, 113]])
# Iterating over a multidimensional array walks its first axis, yielding one row per step.
for row in b:
    print(row)
## [0 1 2 3]
## [10 11 12 13]
## [20 21 22 23]
## [30 31 32 33]
## [40 41 42 43]
# b.flat iterates over every individual element of the array.
for element in b.flat:
    print(element)
## 0
## 1
## 2
## 3
## 10
## 11
## 12
## 13
## 20
## 21
## 22
## 23
## 30
## 31
## 32
## 33
## 40
## 41
## 42
## 43
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
a < 5
## array([[ True, True, True, True],
## [False, False, False, False],
## [False, False, False, False]])
b = np.nonzero(a < 5)
b
## (array([0, 0, 0, 0]), array([0, 1, 2, 3]))
list_of_coordinates= list(zip(b[0], b[1]))
# Print each entry of list_of_coordinates on its own line.
for coord in list_of_coordinates:
    print(coord)
## (0, 0)
## (0, 1)
## (0, 2)
## (0, 3)
a = np.floor(10 * rg.random((3, 4)))
a.ravel() # returns the array, flattened
## array([3., 7., 3., 4., 1., 4., 2., 2., 7., 2., 4., 9.])
a.reshape(6, 2) # returns the array with a modified shape
## array([[3., 7.],
## [3., 4.],
## [1., 4.],
## [2., 2.],
## [7., 2.],
## [4., 9.]])
a.T # returns the array, transposed
## array([[3., 1., 7.],
## [7., 4., 2.],
## [3., 2., 4.],
## [4., 2., 9.]])
a.T.shape
## (4, 3)
a.shape
## (3, 4)
The order of the elements in the array resulting from
ravel is normally “C-style”, that is, the
rightmost index “changes the fastest”, so the element after
a[0, 0] is a[0, 1].
If the array is reshaped to some other shape, again the array is treated as “C-style”.
NumPy normally creates arrays stored in this order, so
ravel will usually not need to copy its argument, but if
the array was made by taking slices of another array or created with
unusual options, it may need to be copied.
The functions ravel and reshape can
also be instructed, using an optional argument, to use FORTRAN-style
arrays, in which the leftmost index changes the fastest.
a
## array([[3., 7., 3., 4.],
## [1., 4., 2., 2.],
## [7., 2., 4., 9.]])
a.resize((2, 6))
a.reshape(3, -1)
## array([[3., 7., 3., 4.],
## [1., 4., 2., 2.],
## [7., 2., 4., 9.]])
np.newaxis will increase the dimensions of your
array by one dimension when used once. This means that a
1D array will become a 2D array, a
2D array will become a 3D array, and
so on.
a = np.array([1, 2, 3, 4, 5, 6])
a.shape
## (6,)
a2 = a[np.newaxis, :]
a2.shape
## (1, 6)
col_vector = a[:, np.newaxis]
col_vector.shape
## (6, 1)
You can use np.expand_dims to add an axis at index
position 1 with:
b = np.expand_dims(a, axis=1)
b.shape
## (6, 1)
c = np.expand_dims(a, axis=0)
c.shape
## (1, 6)
a = np.floor(10 * rg.random((2, 2)))
a
## array([[9., 7.],
## [5., 2.]])
b = np.floor(10 * rg.random((2, 2)))
np.vstack((a, b))
## array([[9., 7.],
## [5., 2.],
## [1., 9.],
## [5., 1.]])
np.hstack((a, b))
## array([[9., 7., 1., 9.],
## [5., 2., 5., 1.]])
The function column_stack
stacks 1D arrays as columns into a 2D array. It is equivalent to hstack only for
2D arrays:
from numpy import newaxis
np.column_stack((a, b)) # with 2D arrays
## array([[9., 7., 1., 9.],
## [5., 2., 5., 1.]])
a = np.array([4., 2.])
b = np.array([3., 8.])
np.column_stack((a, b)) # returns a 2D array
## array([[4., 3.],
## [2., 8.]])
np.hstack((a, b)) # the result is different
## array([4., 2., 3., 8.])
a[:, newaxis] # view `a` as a 2D column vector
## array([[4.],
## [2.]])
np.column_stack((a[:, newaxis], b[:, newaxis]))
## array([[4., 3.],
## [2., 8.]])
np.hstack((a[:, newaxis], b[:, newaxis])) # the result is the same
## array([[4., 3.],
## [2., 8.]])
In complex cases, r_ and c_ are useful for
creating arrays by stacking numbers along one axis. They allow the use
of range literals (:).
When used with arrays as arguments, r_ and c_ are similar to vstack and hstack in their
default behavior, but allow for an optional argument giving the number
of the axis along which to concatenate.
np.r_[1:4, 0, 4]
## array([1, 2, 3, 0, 4])
a = np.array([11, 11, 12, 13, 14, 15, 16, 17, 12, 13, 11, 14, 18, 19, 20])
unique_values = np.unique(a)
unique_values
## array([11, 12, 13, 14, 15, 16, 17, 18, 19, 20])
unique_values, indices_list = np.unique(a, return_index=True)
unique_values
## array([11, 12, 13, 14, 15, 16, 17, 18, 19, 20])
unique_values, occurrence_count = np.unique(a, return_counts=True)
occurrence_count
## array([3, 2, 2, 2, 1, 1, 1, 1, 1, 1])
a_2d = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4]])
unique_values = np.unique(a_2d)
unique_values
## array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
unique_rows = np.unique(a_2d, axis=0)
unique_rows
## array([[ 1, 2, 3, 4],
## [ 5, 6, 7, 8],
## [ 9, 10, 11, 12]])
unique_rows, indices, occurrence_count = np.unique(
a_2d, axis=0, return_counts=True, return_index=True)
unique_rows
## array([[ 1, 2, 3, 4],
## [ 5, 6, 7, 8],
## [ 9, 10, 11, 12]])
indices
## array([0, 1, 2])
occurrence_count
## array([2, 1, 1])
There are two popular ways to flatten an array:
.flatten() and .ravel().
The primary difference between the two is that the new array
created using ravel() is actually a reference to the parent
array (i.e., a “view”).
This means that any changes to the new array will affect the parent array as well.
Since ravel does not create a copy, it’s memory
efficient.
x = np.array([[1 , 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
x.flatten()
## array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
a1 = x.flatten()
a1[0] = 99
print(x) # Original array
## [[ 1 2 3 4]
## [ 5 6 7 8]
## [ 9 10 11 12]]
print(a1) # New array
## [99 2 3 4 5 6 7 8 9 10 11 12]
a2 = x.ravel()
a2[0] = 98
print(x) # Original array
## [[98 2 3 4]
## [ 5 6 7 8]
## [ 9 10 11 12]]
print(a2) # New array
## [98 2 3 4 5 6 7 8 9 10 11 12]
Using hsplit, you can
split an array along its horizontal axis, either by specifying the
number of equally shaped arrays to return, or by specifying the columns
after which the division should occur:
a = np.floor(10 * rg.random((2, 12)))
a
# Split `a` into 3
## array([[6., 7., 6., 9., 0., 5., 4., 0., 6., 8., 5., 2.],
## [8., 5., 5., 7., 1., 8., 6., 7., 1., 8., 1., 0.]])
np.hsplit(a, 3)
# Split `a` after the third and the fourth column
## [array([[6., 7., 6., 9.],
## [8., 5., 5., 7.]]), array([[0., 5., 4., 0.],
## [1., 8., 6., 7.]]), array([[6., 8., 5., 2.],
## [1., 8., 1., 0.]])]
np.hsplit(a, (3, 4))
## [array([[6., 7., 6.],
## [8., 5., 5.]]), array([[9.],
## [7.]]), array([[0., 5., 4., 0., 6., 8., 5., 2.],
## [1., 8., 6., 7., 1., 8., 1., 0.]])]
vsplit splits
along the vertical axis, and array_split
allows one to specify along which axis to split.
The copy method makes a complete copy
of the array and its data.
d = a.copy()
d is a
## False
d.base is a # d doesn't share anything with a
## False
d[0, 0] = 9999
a
## array([[6., 7., 6., 9., 0., 5., 4., 0., 6., 8., 5., 2.],
## [8., 5., 5., 7., 1., 8., 6., 7., 1., 8., 1., 0.]])
Sometimes copy should be called after slicing if the
original array is not required anymore.
For example, suppose a is a huge intermediate result
and the final result b only contains a small fraction of
a, a deep copy should be made when constructing
b with slicing:
a = np.arange(int(1e8))
b = a[:100].copy()
del a # the memory of ``a`` can be released.
The term broadcasting describes how NumPy treats arrays with different shapes during arithmetic operations.
Subject to certain constraints, the smaller array is “broadcast” across the larger array so that they have compatible shapes.
Broadcasting provides a means of vectorizing array operations so that looping occurs in C instead of Python.
It does this without making needless copies of data and usually leads to efficient algorithm implementations.
There are, however, cases where broadcasting is a bad idea because it leads to inefficient use of memory that slows computation.
a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 2.0, 2.0])
a * b
## array([2., 4., 6.])
a = np.array([1.0, 2.0, 3.0])
b = 2.0
a * b
## array([2., 4., 6.])
When operating on two arrays, NumPy compares their shapes element-wise.
It starts with the trailing (i.e. rightmost) dimensions and works its way left. Two dimensions are compatible when
they are equal, or
one of them is 1
If these conditions are not met, a
ValueError: operands could not be broadcast together
exception is thrown, indicating that the arrays have incompatible
shapes.
The size of the resulting array is the size that is not 1 along each axis of the inputs.
from numpy import array
a = array([[ 0.0, 0.0, 0.0],
[10.0, 10.0, 10.0],
[20.0, 20.0, 20.0],
[30.0, 30.0, 30.0]])
b = array([1.0, 2.0, 3.0])
a + b
## array([[ 1., 2., 3.],
## [11., 12., 13.],
## [21., 22., 23.],
## [31., 32., 33.]])
b = array([1.0, 2.0, 3.0, 4.0])
# a + b
# ValueError: operands could not be broadcast together with shapes (4,3) (4,)
pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive.
It aims to be the fundamental high-level building block for doing practical, real-world data analysis in Python.
Tabular data with heterogeneously-typed columns, as in an SQL table or Excel spreadsheet
Ordered and unordered (not necessarily fixed-frequency) time series data.
Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels
Any other form of observational / statistical data sets. The data need not be labeled at all to be placed into a pandas data structure
import pandas as pd
df = pd.DataFrame(
{
"Name": [
"Braund, Mr. Owen Harris",
"Allen, Mr. William Henry",
"Bonnell, Miss. Elizabeth",
],
"Age": [22, 35, 58],
"Sex": ["male", "male", "female"],
}
)
df
## Name Age Sex
## 0 Braund, Mr. Owen Harris 22 male
## 1 Allen, Mr. William Henry 35 male
## 2 Bonnell, Miss. Elizabeth 58 female
df2 = pd.DataFrame(
{
"A": 1.0,
"B": pd.Timestamp("20130102"),
"C": pd.Series(1, index=list(range(4)), dtype="float32"),
"D": np.array([3] * 4, dtype="int32"),
"E": pd.Categorical(["test", "train", "test", "train"]),
"F": "foo",
}
)
df2
## A B C D E F
## 0 1.0 2013-01-02 1.0 3 test foo
## 1 1.0 2013-01-02 1.0 3 train foo
## 2 1.0 2013-01-02 1.0 3 test foo
## 3 1.0 2013-01-02 1.0 3 train foo
Each column in a
DataFrame is a Series
df["Age"]
## 0 22
## 1 35
## 2 58
## Name: Age, dtype: int64
type(df["Age"])
## <class 'pandas.core.series.Series'>
ages = pd.Series([22, 35, 58], name="Age")
ages
## 0 22
## 1 35
## 2 58
## Name: Age, dtype: int64
df["Age"].max()
## 58
ages.max()
## 58
df.describe()
## Age
## count 3.000000
## mean 38.333333
## std 18.230012
## min 22.000000
## 25% 28.500000
## 50% 35.000000
## 75% 46.500000
## max 58.000000
import pandas as pd
titanic = pd.read_csv("titanic.csv")
titanic
## PassengerId Survived Pclass ... Fare Cabin Embarked
## 0 1 0 3 ... 7.2500 NaN S
## 1 2 1 1 ... 71.2833 C85 C
## 2 3 1 3 ... 7.9250 NaN S
## 3 4 1 1 ... 53.1000 C123 S
## 4 5 0 3 ... 8.0500 NaN S
## .. ... ... ... ... ... ... ...
## 886 887 0 2 ... 13.0000 NaN S
## 887 888 1 1 ... 30.0000 B42 S
## 888 889 0 3 ... 23.4500 NaN S
## 889 890 1 1 ... 30.0000 C148 C
## 890 891 0 3 ... 7.7500 NaN Q
##
## [891 rows x 12 columns]
titanic.head(10)
## PassengerId Survived Pclass ... Fare Cabin Embarked
## 0 1 0 3 ... 7.2500 NaN S
## 1 2 1 1 ... 71.2833 C85 C
## 2 3 1 3 ... 7.9250 NaN S
## 3 4 1 1 ... 53.1000 C123 S
## 4 5 0 3 ... 8.0500 NaN S
## 5 6 0 3 ... 8.4583 NaN Q
## 6 7 0 1 ... 51.8625 E46 S
## 7 8 0 3 ... 21.0750 NaN S
## 8 9 1 3 ... 11.1333 NaN S
## 9 10 1 2 ... 30.0708 NaN C
##
## [10 rows x 12 columns]
titanic.dtypes
## PassengerId int64
## Survived int64
## Pclass int64
## Name object
## Sex object
## Age float64
## SibSp int64
## Parch int64
## Ticket object
## Fare float64
## Cabin object
## Embarked object
## dtype: object
titanic.to_excel("titanic_test.xlsx", sheet_name="passengers", index=False)
titanic_test = pd.read_excel("titanic_test.xlsx", sheet_name="passengers")
titanic_test
## PassengerId Survived Pclass ... Fare Cabin Embarked
## 0 1 0 3 ... 7.2500 NaN S
## 1 2 1 1 ... 71.2833 C85 C
## 2 3 1 3 ... 7.9250 NaN S
## 3 4 1 1 ... 53.1000 C123 S
## 4 5 0 3 ... 8.0500 NaN S
## .. ... ... ... ... ... ... ...
## 886 887 0 2 ... 13.0000 NaN S
## 887 888 1 1 ... 30.0000 B42 S
## 888 889 0 3 ... 23.4500 NaN S
## 889 890 1 1 ... 30.0000 C148 C
## 890 891 0 3 ... 7.7500 NaN Q
##
## [891 rows x 12 columns]
titanic.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 891 entries, 0 to 890
## Data columns (total 12 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 PassengerId 891 non-null int64
## 1 Survived 891 non-null int64
## 2 Pclass 891 non-null int64
## 3 Name 891 non-null object
## 4 Sex 891 non-null object
## 5 Age 714 non-null float64
## 6 SibSp 891 non-null int64
## 7 Parch 891 non-null int64
## 8 Ticket 891 non-null object
## 9 Fare 891 non-null float64
## 10 Cabin 204 non-null object
## 11 Embarked 889 non-null object
## dtypes: float64(2), int64(5), object(5)
## memory usage: 83.7+ KB
It is indeed a DataFrame.
There are 891 entries, i.e. 891 rows.
Each row has a row label (aka the index) with values
ranging from 0 to 890.
The table has 12 columns. Most columns have a value for each of
the rows (all 891 values are non-null). Some columns do
have missing values and therefore fewer than 891 non-null
values.
The columns Name, Sex, Ticket,
Cabin and Embarked consist of textual data
(strings, aka object). The other columns are numerical data,
some of them whole numbers (aka integer) and others
real numbers (aka float).
The kinds of data (characters, integers,…) in the different
columns are summarized by listing the dtypes.
The approximate amount of RAM used to hold the DataFrame is provided as well.
ages = titanic["Age"]
titanic["Age"].shape
## (891,)
age_sex = titanic[["Age", "Sex"]]
above_35 = titanic[titanic["Age"] > 35]
above_35.head()
## PassengerId Survived Pclass ... Fare Cabin Embarked
## 1 2 1 1 ... 71.2833 C85 C
## 6 7 0 1 ... 51.8625 E46 S
## 11 12 1 1 ... 26.5500 C103 S
## 13 14 0 3 ... 31.2750 NaN S
## 15 16 1 2 ... 16.0000 NaN S
##
## [5 rows x 12 columns]
titanic["Age"] > 35
## 0 False
## 1 True
## 2 False
## 3 False
## 4 False
## ...
## 886 False
## 887 False
## 888 False
## 889 False
## 890 False
## Name: Age, Length: 891, dtype: bool
class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]
age_no_na = titanic[titanic["Age"].notna()]
adult_names = titanic.loc[titanic["Age"] > 35, "Name"]
titanic.iloc[9:25, 2:5]
## Pclass Name Sex
## 9 2 Nasser, Mrs. Nicholas (Adele Achem) female
## 10 3 Sandstrom, Miss. Marguerite Rut female
## 11 1 Bonnell, Miss. Elizabeth female
## 12 3 Saundercock, Mr. William Henry male
## 13 3 Andersson, Mr. Anders Johan male
## 14 3 Vestrom, Miss. Hulda Amanda Adolfina female
## 15 2 Hewlett, Mrs. (Mary D Kingcome) female
## 16 3 Rice, Master. Eugene male
## 17 2 Williams, Mr. Charles Eugene male
## 18 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female
## 19 3 Masselmani, Mrs. Fatima female
## 20 2 Fynney, Mr. Joseph J male
## 21 2 Beesley, Mr. Lawrence male
## 22 3 McGowan, Miss. Anna "Annie" female
## 23 1 Sloper, Mr. William Thompson male
## 24 3 Palsson, Miss. Torborg Danira female
titanic.iloc[0:3, 3] = "anonymous"
.loc is primarily label
based, but may also be used with a boolean array.
.loc will raise KeyError when the items are
not found.
.iloc is primarily integer position
based (from 0 to length-1 of the axis), but
may also be used with a boolean array. .iloc will raise
IndexError if a requested indexer is out-of-bounds, except
slice indexers which allow out-of-bounds indexing.
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4),
index=dates, columns=['A', 'B', 'C', 'D'])
df
## A B C D
## 2000-01-01 -1.717171 3.497959 -1.933236 0.689425
## 2000-01-02 -1.554594 1.474722 0.835386 0.125756
## 2000-01-03 -0.033432 0.800902 -0.338156 -0.708472
## 2000-01-04 -0.159002 -0.066103 -0.311853 -1.419839
## 2000-01-05 -1.870295 1.065438 0.261995 -0.789540
## 2000-01-06 1.140602 2.622254 0.998559 0.618193
## 2000-01-07 1.317243 -0.176217 -0.517474 -1.351632
## 2000-01-08 -0.205541 -0.588750 -1.758490 -0.198265
df[['B', 'A']] = df[['A', 'B']]
df
## A B C D
## 2000-01-01 3.497959 -1.717171 -1.933236 0.689425
## 2000-01-02 1.474722 -1.554594 0.835386 0.125756
## 2000-01-03 0.800902 -0.033432 -0.338156 -0.708472
## 2000-01-04 -0.066103 -0.159002 -0.311853 -1.419839
## 2000-01-05 1.065438 -1.870295 0.261995 -0.789540
## 2000-01-06 2.622254 1.140602 0.998559 0.618193
## 2000-01-07 -0.176217 1.317243 -0.517474 -1.351632
## 2000-01-08 -0.588750 -0.205541 -1.758490 -0.198265
dfl = pd.DataFrame(np.random.randn(5, 4),
columns=list('ABCD'),
index=pd.date_range('20130101', periods=5))
# dfl.loc[2:3]
# TypeError: cannot do slice indexing on DatetimeIndex with these indexers [2] of type int
# Slicing with date-like labels works, because they match the DatetimeIndex:
dfl.loc['20130102':'20130104']
## A B C D
## 2013-01-02 -1.065678 -0.540383 1.612000 1.238411
## 2013-01-03 0.157837 1.340046 -1.768334 1.188485
## 2013-01-04 -1.445582 0.322281 0.877198 -0.743674
df1 = pd.DataFrame(np.random.randn(6, 4),
index=list('abcdef'),
columns=list('ABCD'))
df1
## A B C D
## a 0.283135 0.697850 1.065823 0.383778
## b -0.037880 0.544931 -0.333350 -0.475377
## c 0.215922 0.480092 -2.435526 1.040459
## d 1.107364 -0.308551 -0.077885 -0.878722
## e -0.733931 0.350377 0.200621 -0.860042
## f 0.789388 -0.057856 -0.570553 -1.195161
df1.loc[['a', 'b', 'd'], :]
## A B C D
## a 0.283135 0.697850 1.065823 0.383778
## b -0.037880 0.544931 -0.333350 -0.475377
## d 1.107364 -0.308551 -0.077885 -0.878722
df1.loc['d':, 'A':'C']
## A B C
## d 1.107364 -0.308551 -0.077885
## e -0.733931 0.350377 0.200621
## f 0.789388 -0.057856 -0.570553
df1.loc['a']
## A 0.283135
## B 0.697850
## C 1.065823
## D 0.383778
## Name: a, dtype: float64
df1.loc['a'] > 0
## A True
## B True
## C True
## D True
## Name: a, dtype: bool
df1.loc[:, df1.loc['a'] > 0]
## A B C D
## a 0.283135 0.697850 1.065823 0.383778
## b -0.037880 0.544931 -0.333350 -0.475377
## c 0.215922 0.480092 -2.435526 1.040459
## d 1.107364 -0.308551 -0.077885 -0.878722
## e -0.733931 0.350377 0.200621 -0.860042
## f 0.789388 -0.057856 -0.570553 -1.195161
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
s1
## 0 0.462635
## 2 0.375835
## 4 -0.685573
## 6 0.478371
## 8 -1.206119
## dtype: float64
s1.iloc[:3]
## 0 0.462635
## 2 0.375835
## 4 -0.685573
## dtype: float64
s1.iloc[3]
## 0.4783714131319601
s1.iloc[:3] = 0
s1
## 0 0.000000
## 2 0.000000
## 4 0.000000
## 6 0.478371
## 8 -1.206119
## dtype: float64
df1 = pd.DataFrame(np.random.randn(6, 4),
index=list(range(0, 12, 2)),
columns=list(range(0, 8, 2)))
df1
## 0 2 4 6
## 0 0.973026 -0.325970 1.061606 -0.638610
## 2 -0.612402 0.606244 1.672033 -0.317543
## 4 1.020128 -0.964174 0.813931 -0.004320
## 6 -0.185217 0.266766 -0.569220 -1.125756
## 8 0.224944 -1.644064 0.285576 2.009107
## 10 2.087139 2.232343 0.634498 -2.907553
df1.iloc[:3]
## 0 2 4 6
## 0 0.973026 -0.325970 1.061606 -0.638610
## 2 -0.612402 0.606244 1.672033 -0.317543
## 4 1.020128 -0.964174 0.813931 -0.004320
df1.iloc[1:5, 2:4]
## 4 6
## 2 1.672033 -0.317543
## 4 0.813931 -0.004320
## 6 -0.569220 -1.125756
## 8 0.285576 2.009107
df1.iloc[[1, 3, 5], [1, 3]]
## 2 6
## 2 0.606244 -0.317543
## 6 0.266766 -1.125756
## 10 2.232343 -2.907553
df1.iloc[1:3, :]
## 0 2 4 6
## 2 -0.612402 0.606244 1.672033 -0.317543
## 4 1.020128 -0.964174 0.813931 -0.004320
df1.iloc[:, 1:3]
## 2 4
## 0 -0.325970 1.061606
## 2 0.606244 1.672033
## 4 -0.964174 0.813931
## 6 0.266766 -0.569220
## 8 -1.644064 0.285576
## 10 2.232343 0.634498
df1.iloc[1, 1]
## 0.6062438224993173
df1.iloc[1]
## 0 -0.612402
## 2 0.606244
## 4 1.672033
## 6 -0.317543
## Name: 2, dtype: float64
.loc, .iloc, and also []
indexing can accept a callable as indexer.
The callable must be a function with one argument
(the calling Series or DataFrame) that returns valid output for
indexing.
df1 = pd.DataFrame(np.random.randn(6, 4),
index=list('abcdef'),
columns=list('ABCD'))
df1
## A B C D
## a 0.924869 0.597715 -0.351335 0.566668
## b -0.558774 0.637554 -0.914388 0.246953
## c -0.606615 0.253886 -0.053065 1.536733
## d 0.414170 1.087474 -1.440676 0.189209
## e 1.110885 -1.166853 1.227121 0.443960
## f 0.084542 -1.001249 -0.079707 0.939407
df1.loc[lambda df: df['A'] > 0, :]
## A B C D
## a 0.924869 0.597715 -0.351335 0.566668
## d 0.414170 1.087474 -1.440676 0.189209
## e 1.110885 -1.166853 1.227121 0.443960
## f 0.084542 -1.001249 -0.079707 0.939407
df1.loc[:, lambda df: ['A', 'B']]
## A B
## a 0.924869 0.597715
## b -0.558774 0.637554
## c -0.606615 0.253886
## d 0.414170 1.087474
## e 1.110885 -1.166853
## f 0.084542 -1.001249
df1.iloc[:, lambda df: [0, 1]]
## A B
## a 0.924869 0.597715
## b -0.558774 0.637554
## c -0.606615 0.253886
## d 0.414170 1.087474
## e 1.110885 -1.166853
## f 0.084542 -1.001249
df1[lambda df: df.columns[0]]
## a 0.924869
## b -0.558774
## c -0.606615
## d 0.414170
## e 1.110885
## f 0.084542
## Name: A, dtype: float64
df1['A'].loc[lambda s: s > 0]
## a 0.924869
## d 0.414170
## e 1.110885
## f 0.084542
## Name: A, dtype: float64
dfd = pd.DataFrame({'A': [1, 2, 3],
'B': [4, 5, 6]},
index=list('abc'))
dfd
## A B
## a 1 4
## b 2 5
## c 3 6
dfd.loc[dfd.index[[0, 2]], 'A']
## a 1
## c 3
## Name: A, dtype: int64
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]
## a 1
## c 3
## Name: A, dtype: int64
dfd.iloc[[0, 2], dfd.columns.get_indexer(['A', 'B'])]
## A B
## a 1 4
## c 3 6
s = pd.Series([1, 2, 3])
list(s.index)
## [0, 1, 2]
s.reindex([1, 2, 3])
## 1 2.0
## 2 3.0
## 3 NaN
## dtype: float64
labels = [1, 2, 3]
s.loc[s.index.intersection(labels)]
## 1 2
## 2 3
## dtype: int64
df = pd.DataFrame({"A":[1, 5, 3, 4, 2],
"B":[3, 2, 4, 3, 4],
"C":[2, 2, 7, 3, 4],
"D":[4, 3, 6, 12, 7]},
index =["first", "second", "third", "fourth", "fifth"])
df
## A B C D
## first 1 3 2 4
## second 5 2 2 3
## third 3 4 7 6
## fourth 4 3 3 12
## fifth 2 4 4 7
df.reindex(["first", "dues", "trois", "fourth", "fifth"])
## A B C D
## first 1.0 3.0 2.0 4.0
## dues NaN NaN NaN NaN
## trois NaN NaN NaN NaN
## fourth 4.0 3.0 3.0 12.0
## fifth 2.0 4.0 4.0 7.0
df
## A B C D
## first 1 3 2 4
## second 5 2 2 3
## third 3 4 7 6
## fourth 4 3 3 12
## fifth 2 4 4 7
df.reindex(["first", "dues", "trois", "fourth", "fifth"], fill_value = 100)
## A B C D
## first 1 3 2 4
## dues 100 100 100 100
## trois 100 100 100 100
## fourth 4 3 3 12
## fifth 2 4 4 7
import pandas as pd
# Creating the first dataframe
df1 = pd.DataFrame({"A":[1, 5, 3, 4, 2],
"B":[3, 2, 4, 3, 4],
"C":[2, 2, 7, 3, 4],
"D":[4, 3, 6, 12, 7]})
# reindexing the column axis with
# old and new index values
df.reindex(columns =["A", "B", "D", "E"])
# reindex the columns
# fill the missing values by 25
## A B D E
## first 1 3 4 NaN
## second 5 2 3 NaN
## third 3 4 6 NaN
## fourth 4 3 12 NaN
## fifth 2 4 7 NaN
df.reindex(columns =["A", "B", "D", "E"], fill_value = 25)
## A B D E
## first 1 3 4 25
## second 5 2 3 25
## third 3 4 6 25
## fourth 4 3 12 25
## fifth 2 4 7 25
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
labels = ['c', 'd']
# s.reindex(labels)
# ValueError: cannot reindex on an axis with duplicate labels
s.loc[s.index.intersection(labels)].reindex(labels)
## c 3.0
## d NaN
## dtype: float64
labels = ['a', 'd']
# s.loc[s.index.intersection(labels)].reindex(labels)
# ValueError: cannot reindex on an axis with duplicate labels
Since indexing with [] must handle a lot of cases
(single-label access, slicing, boolean indexing, etc.), it has a bit of
overhead in order to figure out what you’re asking for.
If you only want to access a scalar value, the
fastest way is to use the at and
iat methods, which are implemented on all of the data
structures.
Similarly to loc, at provides
label based scalar lookups, while iat
provides integer based lookups analogously to
iloc.
s = pd.Series([0, 1, 2, 3, 4, 5])
s
## 0 0
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## dtype: int64
s.iat[5]
## 5
df.iat[3, 0]
## 4
df.at[dates[5], 'E'] = 7
Another common operation is the use of boolean vectors to filter
the data. The operators are: | for or,
& for and, and ~ for
not.
These must be grouped by using parentheses,
since by default Python will evaluate an expression such as
df['A'] > 2 & df['B'] < 3 as
df['A'] > (2 & df['B']) < 3, while the desired
evaluation order is
(df['A'] > 2) & (df['B'] < 3)
s = pd.Series(range(-3, 4))
s[s > 0]
## 4 1
## 5 2
## 6 3
## dtype: int64
s[(s < -1) | (s > 0.5)]
## 0 -3
## 1 -2
## 4 1
## 5 2
## 6 3
## dtype: int64
s[~(s < 0)]
## 3 0
## 4 1
## 5 2
## 6 3
## dtype: int64
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
'c': np.random.randn(7)})
# only want 'two' or 'three'
criterion = df2['a'].map(lambda x: x.startswith('t'))
df2[criterion]
# equivalent but slower
## a b c
## 2 two y -0.463740
## 3 three x 0.354440
## 4 two y 0.868629
df2[[x.startswith('t') for x in df2['a']]]
# Multiple criteria
## a b c
## 2 two y -0.463740
## 3 three x 0.354440
## 4 two y 0.868629
df2[criterion & (df2['b'] == 'x')]
## a b c
## 3 three x 0.35444
df2.loc[criterion & (df2['b'] == 'x'), 'b':'c']
## b c
## 3 x 0.35444
The where() Method and Masking
Selecting values from a Series with a boolean vector generally returns a subset of the data.
To guarantee that selection output has the same
shape as the original data, you can use the where
method in Series and DataFrame.
s[s > 0]
## 4 1
## 5 2
## 6 3
## dtype: int64
s.where(s > 0)
## 0 NaN
## 1 NaN
## 2 NaN
## 3 NaN
## 4 1.0
## 5 2.0
## 6 3.0
## dtype: float64
where takes an optional other
argument for replacement of values where the condition is False, in the
returned copy.
df[df < 0]
## A B C D E
## first NaN NaN NaN NaN NaN
## second NaN NaN NaN NaN NaN
## third NaN NaN NaN NaN NaN
## fourth NaN NaN NaN NaN NaN
## fifth NaN NaN NaN NaN NaN
## 2000-01-06 00:00:00 NaN NaN NaN NaN NaN
df.where(df < 0, -df)
## A B C D E
## first -1.0 -3.0 -2.0 -4.0 NaN
## second -5.0 -2.0 -2.0 -3.0 NaN
## third -3.0 -4.0 -7.0 -6.0 NaN
## fourth -4.0 -3.0 -3.0 -12.0 NaN
## fifth -2.0 -4.0 -4.0 -7.0 NaN
## 2000-01-06 00:00:00 NaN NaN NaN NaN -7.0
s2 = s.copy()
s2[s2 < 0] = 0
s2
## 0 0
## 1 0
## 2 0
## 3 0
## 4 1
## 5 2
## 6 3
## dtype: int64
df2 = df.copy()
df2[df2 < 0] = 0
df2
## A B C D E
## first 1.0 3.0 2.0 4.0 NaN
## second 5.0 2.0 2.0 3.0 NaN
## third 3.0 4.0 7.0 6.0 NaN
## fourth 4.0 3.0 3.0 12.0 NaN
## fifth 2.0 4.0 4.0 7.0 NaN
## 2000-01-06 00:00:00 NaN NaN NaN NaN 7.0
By default, where returns a modified copy of the
data.
There is an optional parameter inplace so that the
original data can be modified without creating a copy:
df_orig = df.copy()
df_orig.where(df > 0, -df, inplace=True)
df_orig
## A B C D E
## first 1.0 3.0 2.0 4.0 NaN
## second 5.0 2.0 2.0 3.0 NaN
## third 3.0 4.0 7.0 6.0 NaN
## fourth 4.0 3.0 3.0 12.0 NaN
## fifth 2.0 4.0 4.0 7.0 NaN
## 2000-01-06 00:00:00 NaN NaN NaN NaN 7.0
mask() is the inverse boolean operation of where.
s.mask(s >= 0)
## 0 -3.0
## 1 -2.0
## 2 -1.0
## 3 NaN
## 4 NaN
## 5 NaN
## 6 NaN
## dtype: float64
df.mask(df >= 0)
## A B C D E
## first NaN NaN NaN NaN NaN
## second NaN NaN NaN NaN NaN
## third NaN NaN NaN NaN NaN
## fourth NaN NaN NaN NaN NaN
## fifth NaN NaN NaN NaN NaN
## 2000-01-06 00:00:00 NaN NaN NaN NaN NaN
Hierarchical / Multi-level indexing is very exciting as it opens the door to some quite sophisticated data analysis and manipulation, especially for working with higher dimensional data.
In essence, it enables you to store and manipulate data with an
arbitrary number of dimensions in lower dimensional data structures like
Series (1d) and DataFrame (2d).
The MultiIndex
object is the hierarchical analogue of the standard Index object
which typically stores the axis labels in pandas objects.
A MultiIndex can be created from a list of arrays
(using MultiIndex.from_arrays()),
an array of tuples (using MultiIndex.from_tuples()),
a crossed set of iterables (using MultiIndex.from_product()),
or a DataFrame
(using MultiIndex.from_frame()).
arrays = [
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
["one", "two", "one", "two", "one", "two", "one", "two"],
]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
index
## MultiIndex([('bar', 'one'),
## ('bar', 'two'),
## ('baz', 'one'),
## ('baz', 'two'),
## ('foo', 'one'),
## ('foo', 'two'),
## ('qux', 'one'),
## ('qux', 'two')],
## names=['first', 'second'])
s = pd.Series(np.random.randn(8), index=index)
s
## first second
## bar one 0.331363
## two -0.723341
## baz one -0.837772
## two -0.247049
## foo one 0.689141
## two 1.121384
## qux one 1.079568
## two -0.329814
## dtype: float64
iterables = [["bar", "baz", "foo", "qux"], ["one", "two"]]
pd.MultiIndex.from_product(iterables, names=["first", "second"])
## MultiIndex([('bar', 'one'),
## ('bar', 'two'),
## ('baz', 'one'),
## ('baz', 'two'),
## ('foo', 'one'),
## ('foo', 'two'),
## ('qux', 'one'),
## ('qux', 'two')],
## names=['first', 'second'])
df = pd.DataFrame(
[["bar", "one"], ["bar", "two"], ["foo", "one"], ["foo", "two"]],
columns=["first", "second"],
)
pd.MultiIndex.from_frame(df)
## MultiIndex([('bar', 'one'),
## ('bar', 'two'),
## ('foo', 'one'),
## ('foo', 'two')],
## names=['first', 'second'])
arrays = [
np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]),
np.array(["one", "two", "one", "two", "one", "two", "one", "two"]),
]
s = pd.Series(np.random.randn(8), index=arrays)
s
## bar one -0.486092
## two -0.584221
## baz one 1.026291
## two 0.793277
## foo one 0.909938
## two -0.586864
## qux one 0.011569
## two 0.855648
## dtype: float64
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df
## 0 1 2 3
## bar one 1.427445 -0.008988 0.155867 1.773813
## two 1.800974 -0.528620 -1.384463 -0.441665
## baz one 1.597536 0.674032 0.639739 -1.610106
## two 1.375617 0.170506 -0.275090 0.751915
## foo one -0.844638 2.489081 0.429008 0.315384
## two 0.265210 -0.511827 1.118101 1.154967
## qux one -0.466893 0.156133 0.329883 0.753334
## two 0.414061 2.253521 0.054647 0.283191
df = pd.DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index)
df
## first bar baz ... foo qux
## second one two one ... two one two
## A 0.192898 -0.512821 0.145443 ... 0.222328 0.933562 0.333218
## B 0.070086 -0.597243 -0.323979 ... -0.525442 0.760575 1.122120
## C -1.727654 0.164831 0.097347 ... 0.308360 0.186375 -0.842334
##
## [3 rows x 8 columns]
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])
## first bar baz foo
## second one two one two one two
## first second
## bar one 0.250750 -0.321133 -0.056850 -0.397326 2.197315 -0.277122
## two -0.219032 -1.175848 0.177063 1.551406 1.423623 -0.176671
## baz one -0.869122 -0.525475 0.096946 -0.379357 -0.586905 -1.327078
## two -0.041594 -0.646962 0.487522 -0.195762 1.414732 -1.155375
## foo one -2.637186 0.145552 0.195915 0.574690 1.069698 -1.467216
## two -0.500530 0.568413 0.703336 -1.572689 0.844033 2.222156
df["bar"]
## second one two
## A 0.192898 -0.512821
## B 0.070086 -0.597243
## C -1.727654 0.164831
df["bar", "one"]
## A 0.192898
## B 0.070086
## C -1.727654
## Name: (bar, one), dtype: float64
df["bar"]["one"]
## A 0.192898
## B 0.070086
## C -1.727654
## Name: one, dtype: float64
s["qux"]
## one 0.011569
## two 0.855648
## dtype: float64
df = df.T
df
## A B C
## first second
## bar one 0.192898 0.070086 -1.727654
## two -0.512821 -0.597243 0.164831
## baz one 0.145443 -0.323979 0.097347
## two 0.606685 -0.956177 1.156921
## foo one -1.316539 -1.249647 -0.586794
## two 0.222328 -0.525442 0.308360
## qux one 0.933562 0.760575 0.186375
## two 0.333218 1.122120 -0.842334
df.loc[("bar", "two")]
## A -0.512821
## B -0.597243
## C 0.164831
## Name: (bar, two), dtype: float64
df.loc[("bar", "two"), "A"]
## -0.5128207322863069
df.loc["bar"]
## A B C
## second
## one 0.192898 0.070086 -1.727654
## two -0.512821 -0.597243 0.164831
df.loc["baz":"foo"]
## A B C
## first second
## baz one 0.145443 -0.323979 0.097347
## two 0.606685 -0.956177 1.156921
## foo one -1.316539 -1.249647 -0.586794
## two 0.222328 -0.525442 0.308360
df.loc[("baz", "two"):("qux", "one")]
## A B C
## first second
## baz two 0.606685 -0.956177 1.156921
## foo one -1.316539 -1.249647 -0.586794
## two 0.222328 -0.525442 0.308360
## qux one 0.933562 0.760575 0.186375
df.loc[("baz", "two"):"foo"]
## A B C
## first second
## baz two 0.606685 -0.956177 1.156921
## foo one -1.316539 -1.249647 -0.586794
## two 0.222328 -0.525442 0.308360
df.loc[[("bar", "two"), ("qux", "one")]]
## A B C
## first second
## bar two -0.512821 -0.597243 0.164831
## qux one 0.933562 0.760575 0.186375
s = pd.Series(
[1, 2, 3, 4, 5, 6],
index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]),
)
s.loc[[("A", "c"), ("B", "d")]] # list of tuples
## A c 1
## B d 5
## dtype: int64
s.loc[(["A", "B"], ["c", "d"])] # tuple of lists
## A c 1
## d 2
## B c 4
## d 5
## dtype: int64
df
## A B C
## first second
## bar one 0.192898 0.070086 -1.727654
## two -0.512821 -0.597243 0.164831
## baz one 0.145443 -0.323979 0.097347
## two 0.606685 -0.956177 1.156921
## foo one -1.316539 -1.249647 -0.586794
## two 0.222328 -0.525442 0.308360
## qux one 0.933562 0.760575 0.186375
## two 0.333218 1.122120 -0.842334
df.xs("one", level="second")
# using the slicers
## A B C
## first
## bar 0.192898 0.070086 -1.727654
## baz 0.145443 -0.323979 0.097347
## foo -1.316539 -1.249647 -0.586794
## qux 0.933562 0.760575 0.186375
df.loc[(slice(None), "one"), :]
## A B C
## first second
## bar one 0.192898 0.070086 -1.727654
## baz one 0.145443 -0.323979 0.097347
## foo one -1.316539 -1.249647 -0.586794
## qux one 0.933562 0.760575 0.186375
df = df.T
df.xs("one", level="second", axis=1)
# using the slicers
## first bar baz foo qux
## A 0.192898 0.145443 -1.316539 0.933562
## B 0.070086 -0.323979 -1.249647 0.760575
## C -1.727654 0.097347 -0.586794 0.186375
df.loc[:, (slice(None), "one")]
## first bar baz foo qux
## second one one one one
## A 0.192898 0.145443 -1.316539 0.933562
## B 0.070086 -0.323979 -1.249647 0.760575
## C -1.727654 0.097347 -0.586794 0.186375
You can pass drop_level=False to xs to
retain the level that was selected.
df.xs("one", level="second", axis=1, drop_level=False)
## first bar baz foo qux
## second one one one one
## A 0.192898 0.145443 -1.316539 0.933562
## B 0.070086 -0.323979 -1.249647 0.760575
## C -1.727654 0.097347 -0.586794 0.186375
df.xs("one", level="second", axis=1, drop_level=True)
## first bar baz foo qux
## A 0.192898 0.145443 -1.316539 0.933562
## B 0.070086 -0.323979 -1.249647 0.760575
## C -1.727654 0.097347 -0.586794 0.186375
『保留字(Reserved words)』 不可做為變數名稱
R:
if, else, repeat, while, function,
for, in, next, break, TRUE, FALSE,
NULL, Inf, NaN, NA, NA_integer_,
NA_real_, NA_complex_, NA_character_
Python:
and, def, False, import, not,
True, as, del, finally, in, or,
try, assert, elif, for, is, pass,
while, break, else, from, lambda, print,
with, class, except, global, None, raise,
yield, continue, exec, if, nonlocal, return
(註:print 與 exec 僅在 Python 2 為保留字;nonlocal 為 Python 3 新增;Python 3.7 之後另有 async 與 await。)
# install.packages("xts")
library(xts)
## 載入需要的套件:zoo
##
## 載入套件:'zoo'
## 下列物件被遮斷自 'package:base':
##
## as.Date, as.Date.numeric
##
## 載入套件:'xts'
## 下列物件被遮斷自 'package:dplyr':
##
## first, last
search()
## [1] ".GlobalEnv" "package:xts" "package:zoo"
## [4] "package:forcats" "package:stringr" "package:dplyr"
## [7] "package:purrr" "package:readr" "package:tidyr"
## [10] "package:tibble" "package:ggplot2" "package:tidyverse"
## [13] "package:stats" "package:graphics" "package:grDevices"
## [16] "package:utils" "package:datasets" "package:methods"
## [19] "Autoloads" "package:base"
stats::rnorm(n = 5)
## [1] -0.07624891 0.15594520 -0.09452932 0.95981448 -0.63450988
善用循環補齊、即 元素對元素運算(向量化運算) ,執行速度較快。
1 == 1
## [1] TRUE
1 == 2
## [1] FALSE
v1 <- 1:5
v2 <- 1:5
v3 <- 1:10
v4 <- 1:7
v1 + v4 # 會出現警告,因為長度並非互為倍數
## Warning in v1 + v4: 較長的物件長度並非較短物件長度的倍數
## [1] 2 4 6 8 10 7 9
v1 + 100 # 後面會進行循環補齊,形成具五個100的向量,並做元素對元素相加
## [1] 101 102 103 104 105
v1 * 4 # 後面會進行循環補齊,形成具五個4的向量,並做元素對元素相乘
## [1] 4 8 12 16 20
v1 >= 3 # 循環補齊做循環比較
## [1] FALSE FALSE TRUE TRUE TRUE
x <- c(1, 2, 3, NA, 5, NA, 8)
x == NA # 會得出NA,要小心
## [1] NA NA NA NA NA NA NA
2 == NA # 會得出NA,要小心
## [1] NA
# is.___ 為「疑問句」
# as.___ 為「當作」
is.na(x)
## [1] FALSE FALSE FALSE TRUE FALSE TRUE FALSE
set.seed(seed = 100) # 設定亂數種子
d <- rnorm(n = 100)
# 求出d > 1.96的數字個數
d > 1.96 # 可看出哪些值滿足,得出邏輯向量
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [97] FALSE FALSE FALSE FALSE
sum(d > 1.96) # 求個數,藉由「強制型別轉換」,TRUE為數值1,FALSE為數值0
## [1] 3
mean(d > 1.96) #求機率
## [1] 0.03