1 Creating Data Frame

import pandas as pd

df1 = pd.DataFrame({"a":[1,2,3],"b":[4,5,6],"c":[7,8,9]})
df1 # Specify values for each columns 
##    a  b  c
## 0  1  4  7
## 1  2  5  8
## 2  3  6  9
df2 = pd.DataFrame([[1,2,3,4],[4,5,6,7],[7,8,9,10],[10,11,12,13]],columns=["a","b","c","d"],index=["A","B","C","D"])

df2 # Specify values for each rows
##     a   b   c   d
## A   1   2   3   4
## B   4   5   6   7
## C   7   8   9  10
## D  10  11  12  13

2 Reshaping Data

Gather columns into rows

x = pd.melt(df2,id_vars=["a","d"],var_name="var",value_name="result")

# wide_to_long function also do this job
x
##     a   d var  result
## 0   1   4   b       2
## 1   4   7   b       5
## 2   7  10   b       8
## 3  10  13   b      11
## 4   1   4   c       3
## 5   4   7   c       6
## 6   7  10   c       9
## 7  10  13   c      12

Spread rows into colnums

x.pivot(index="d",columns="var",values="result").reset_index()
## var   d   b   c
## 0     4   2   3
## 1     7   5   6
## 2    10   8   9
## 3    13  11  12

Append rows

pd.concat([df2,df2])
##     a   b   c   d
## A   1   2   3   4
## B   4   5   6   7
## C   7   8   9  10
## D  10  11  12  13
## A   1   2   3   4
## B   4   5   6   7
## C   7   8   9  10
## D  10  11  12  13

Append columns

pd.concat([df1,df1],axis=1)
##    a  b  c  a  b  c
## 0  1  4  7  1  4  7
## 1  2  5  8  2  5  8
## 2  3  6  9  3  6  9

other useful function


df1.sort_values("a",ascending=False)
##    a  b  c
## 2  3  6  9
## 1  2  5  8
## 0  1  4  7
df1.sort_index(ascending=False)
##    a  b  c
## 2  3  6  9
## 1  2  5  8
## 0  1  4  7
df1.rename(columns = {"a":"apple"})
##    apple  b  c
## 0      1  4  7
## 1      2  5  8
## 2      3  6  9
df1.reset_index()  # reset index and moving index to columns
##    index  a  b  c
## 0      0  1  4  7
## 1      1  2  5  8
## 2      2  3  6  9
df1.drop(columns = ["b","c"])
##    a
## 0  1
## 1  2
## 2  3

3. Subset Observations (select rows)

df1.drop_duplicates() # remove duplicate rows
##    a  b  c
## 0  1  4  7
## 1  2  5  8
## 2  3  6  9
df1.head(2)
##    a  b  c
## 0  1  4  7
## 1  2  5  8
df1.tail(1)
##    a  b  c
## 2  3  6  9
df1.sample(frac=0.2)
##    a  b  c
## 1  2  5  8
df1.sample(n=2)
##    a  b  c
## 0  1  4  7
## 1  2  5  8
df1.iloc[0:2] # select rows by position
##    a  b  c
## 0  1  4  7
## 1  2  5  8
df1.nlargest(2,"a") # select and order top n
##    a  b  c
## 2  3  6  9
## 1  2  5  8
df1.nsmallest(2,"a")
##    a  b  c
## 0  1  4  7
## 1  2  5  8
df1[df1["a"]<2]
##    a  b  c
## 0  1  4  7

4 Subset Variables (Columns)

df1[["a","b"]]
##    a  b
## 0  1  4
## 1  2  5
## 2  3  6
df1.a
# df1.filter(regex="regex") select columns whose name matches regulaer expression regex
## 0    1
## 1    2
## 2    3
## Name: a, dtype: int64
df1.loc[:,["a","b"]]
##    a  b
## 0  1  4
## 1  2  5
## 2  3  6
df1.loc[[0,1],["a","b"]]
##    a  b
## 0  1  4
## 1  2  5
df1.iloc[:,[0,1]]
##    a  b
## 0  1  4
## 1  2  5
## 2  3  6
df1.iloc[[0,1],[0,1]]
##    a  b
## 0  1  4
## 1  2  5
df1.loc[df1.a<2,["a","c"]]
##    a  c
## 0  1  7

5 Summarize Data

df1["a"].value_counts()
## 3    1
## 2    1
## 1    1
## Name: a, dtype: int64
df1["a"].unique()
## array([1, 2, 3])
len(df1)
## 3
df1.describe()
##          a    b    c
## count  3.0  3.0  3.0
## mean   2.0  5.0  8.0
## std    1.0  1.0  1.0
## min    1.0  4.0  7.0
## 25%    1.5  4.5  7.5
## 50%    2.0  5.0  8.0
## 75%    2.5  5.5  8.5
## max    3.0  6.0  9.0
df1["a"].sum()
## 6

other useful function

  1. sum
  2. count
  3. median
  4. quantile
  5. apple apply function to each object
  6. min
  7. max
  8. mean
  9. var
  10. std

6. Handling Missing Data

df1.dropna()
df1.fillna()

7. Make new columns

df1["d"] = df1.a+df1.b+df1.c

df1
##    a  b  c   d
## 0  1  4  7  12
## 1  2  5  8  15
## 2  3  6  9  18

8. Group Data

iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',header=None)

iris
##        0    1    2    3               4
## 0    5.1  3.5  1.4  0.2     Iris-setosa
## 1    4.9  3.0  1.4  0.2     Iris-setosa
## 2    4.7  3.2  1.3  0.2     Iris-setosa
## 3    4.6  3.1  1.5  0.2     Iris-setosa
## 4    5.0  3.6  1.4  0.2     Iris-setosa
## ..   ...  ...  ...  ...             ...
## 145  6.7  3.0  5.2  2.3  Iris-virginica
## 146  6.3  2.5  5.0  1.9  Iris-virginica
## 147  6.5  3.0  5.2  2.0  Iris-virginica
## 148  6.2  3.4  5.4  2.3  Iris-virginica
## 149  5.9  3.0  5.1  1.8  Iris-virginica
## 
## [150 rows x 5 columns]
iris.groupby(by=4).sum()
##                      0      1      2      3
## 4                                          
## Iris-setosa      250.3  170.9   73.2   12.2
## Iris-versicolor  296.8  138.5  213.0   66.3
## Iris-virginica   329.4  148.7  277.6  101.3

9 Combine Data Sets

join data set


pd.merge(iris,iris,how="left",on=4) # right , inner , outer
##       0_x  1_x  2_x  3_x               4  0_y  1_y  2_y  3_y
## 0     5.1  3.5  1.4  0.2     Iris-setosa  5.1  3.5  1.4  0.2
## 1     5.1  3.5  1.4  0.2     Iris-setosa  4.9  3.0  1.4  0.2
## 2     5.1  3.5  1.4  0.2     Iris-setosa  4.7  3.2  1.3  0.2
## 3     5.1  3.5  1.4  0.2     Iris-setosa  4.6  3.1  1.5  0.2
## 4     5.1  3.5  1.4  0.2     Iris-setosa  5.0  3.6  1.4  0.2
## ...   ...  ...  ...  ...             ...  ...  ...  ...  ...
## 7495  5.9  3.0  5.1  1.8  Iris-virginica  6.7  3.0  5.2  2.3
## 7496  5.9  3.0  5.1  1.8  Iris-virginica  6.3  2.5  5.0  1.9
## 7497  5.9  3.0  5.1  1.8  Iris-virginica  6.5  3.0  5.2  2.0
## 7498  5.9  3.0  5.1  1.8  Iris-virginica  6.2  3.4  5.4  2.3
## 7499  5.9  3.0  5.1  1.8  Iris-virginica  5.9  3.0  5.1  1.8
## 
## [7500 rows x 9 columns]

10 ploting

iris.plot.hist(1)