import pandas as pd
df1 = pd.DataFrame({"a":[1,2,3],"b":[4,5,6],"c":[7,8,9]})
df1 # Specify values for each columns
## a b c
## 0 1 4 7
## 1 2 5 8
## 2 3 6 9
df2 = pd.DataFrame([[1,2,3,4],[4,5,6,7],[7,8,9,10],[10,11,12,13]],columns=["a","b","c","d"],index=["A","B","C","D"])
df2 # Specify values for each rows
## a b c d
## A 1 2 3 4
## B 4 5 6 7
## C 7 8 9 10
## D 10 11 12 13
x = pd.melt(df2,id_vars=["a","d"],var_name="var",value_name="result")
# wide_to_long function also do this job
x
## a d var result
## 0 1 4 b 2
## 1 4 7 b 5
## 2 7 10 b 8
## 3 10 13 b 11
## 4 1 4 c 3
## 5 4 7 c 6
## 6 7 10 c 9
## 7 10 13 c 12
x.pivot(index="d",columns="var",values="result").reset_index()
## var d b c
## 0 4 2 3
## 1 7 5 6
## 2 10 8 9
## 3 13 11 12
pd.concat([df2,df2])
## a b c d
## A 1 2 3 4
## B 4 5 6 7
## C 7 8 9 10
## D 10 11 12 13
## A 1 2 3 4
## B 4 5 6 7
## C 7 8 9 10
## D 10 11 12 13
pd.concat([df1,df1],axis=1)
## a b c a b c
## 0 1 4 7 1 4 7
## 1 2 5 8 2 5 8
## 2 3 6 9 3 6 9
df1.sort_values("a",ascending=False)
## a b c
## 2 3 6 9
## 1 2 5 8
## 0 1 4 7
df1.sort_index(ascending=False)
## a b c
## 2 3 6 9
## 1 2 5 8
## 0 1 4 7
df1.rename(columns = {"a":"apple"})
## apple b c
## 0 1 4 7
## 1 2 5 8
## 2 3 6 9
df1.reset_index() # reset index and moving index to columns
## index a b c
## 0 0 1 4 7
## 1 1 2 5 8
## 2 2 3 6 9
df1.drop(columns = ["b","c"])
## a
## 0 1
## 1 2
## 2 3
df1.drop_duplicates() # remove duplicate rows
## a b c
## 0 1 4 7
## 1 2 5 8
## 2 3 6 9
df1.head(2)
## a b c
## 0 1 4 7
## 1 2 5 8
df1.tail(1)
## a b c
## 2 3 6 9
df1.sample(frac=0.2)
## a b c
## 1 2 5 8
df1.sample(n=2)
## a b c
## 0 1 4 7
## 1 2 5 8
df1.iloc[0:2] # select rows by position
## a b c
## 0 1 4 7
## 1 2 5 8
df1.nlargest(2,"a") # select and order top n
## a b c
## 2 3 6 9
## 1 2 5 8
df1.nsmallest(2,"a")
## a b c
## 0 1 4 7
## 1 2 5 8
df1[df1["a"]<2]
## a b c
## 0 1 4 7
df1[["a","b"]]
## a b
## 0 1 4
## 1 2 5
## 2 3 6
df1.a
# df1.filter(regex="regex") select columns whose name matches regulaer expression regex
## 0 1
## 1 2
## 2 3
## Name: a, dtype: int64
df1.loc[:,["a","b"]]
## a b
## 0 1 4
## 1 2 5
## 2 3 6
df1.loc[[0,1],["a","b"]]
## a b
## 0 1 4
## 1 2 5
df1.iloc[:,[0,1]]
## a b
## 0 1 4
## 1 2 5
## 2 3 6
df1.iloc[[0,1],[0,1]]
## a b
## 0 1 4
## 1 2 5
df1.loc[df1.a<2,["a","c"]]
## a c
## 0 1 7
df1["a"].value_counts()
## 3 1
## 2 1
## 1 1
## Name: a, dtype: int64
df1["a"].unique()
## array([1, 2, 3])
len(df1)
## 3
df1.describe()
## a b c
## count 3.0 3.0 3.0
## mean 2.0 5.0 8.0
## std 1.0 1.0 1.0
## min 1.0 4.0 7.0
## 25% 1.5 4.5 7.5
## 50% 2.0 5.0 8.0
## 75% 2.5 5.5 8.5
## max 3.0 6.0 9.0
df1["a"].sum()
## 6
other useful function
df1.dropna()
df1.fillna()
df1["d"] = df1.a+df1.b+df1.c
df1
## a b c d
## 0 1 4 7 12
## 1 2 5 8 15
## 2 3 6 9 18
iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',header=None)
iris
## 0 1 2 3 4
## 0 5.1 3.5 1.4 0.2 Iris-setosa
## 1 4.9 3.0 1.4 0.2 Iris-setosa
## 2 4.7 3.2 1.3 0.2 Iris-setosa
## 3 4.6 3.1 1.5 0.2 Iris-setosa
## 4 5.0 3.6 1.4 0.2 Iris-setosa
## .. ... ... ... ... ...
## 145 6.7 3.0 5.2 2.3 Iris-virginica
## 146 6.3 2.5 5.0 1.9 Iris-virginica
## 147 6.5 3.0 5.2 2.0 Iris-virginica
## 148 6.2 3.4 5.4 2.3 Iris-virginica
## 149 5.9 3.0 5.1 1.8 Iris-virginica
##
## [150 rows x 5 columns]
iris.groupby(by=4).sum()
## 0 1 2 3
## 4
## Iris-setosa 250.3 170.9 73.2 12.2
## Iris-versicolor 296.8 138.5 213.0 66.3
## Iris-virginica 329.4 148.7 277.6 101.3
join data set
pd.merge(iris,iris,how="left",on=4) # right , inner , outer
## 0_x 1_x 2_x 3_x 4 0_y 1_y 2_y 3_y
## 0 5.1 3.5 1.4 0.2 Iris-setosa 5.1 3.5 1.4 0.2
## 1 5.1 3.5 1.4 0.2 Iris-setosa 4.9 3.0 1.4 0.2
## 2 5.1 3.5 1.4 0.2 Iris-setosa 4.7 3.2 1.3 0.2
## 3 5.1 3.5 1.4 0.2 Iris-setosa 4.6 3.1 1.5 0.2
## 4 5.1 3.5 1.4 0.2 Iris-setosa 5.0 3.6 1.4 0.2
## ... ... ... ... ... ... ... ... ... ...
## 7495 5.9 3.0 5.1 1.8 Iris-virginica 6.7 3.0 5.2 2.3
## 7496 5.9 3.0 5.1 1.8 Iris-virginica 6.3 2.5 5.0 1.9
## 7497 5.9 3.0 5.1 1.8 Iris-virginica 6.5 3.0 5.2 2.0
## 7498 5.9 3.0 5.1 1.8 Iris-virginica 6.2 3.4 5.4 2.3
## 7499 5.9 3.0 5.1 1.8 Iris-virginica 5.9 3.0 5.1 1.8
##
## [7500 rows x 9 columns]
iris.plot.hist(1)