Resources and libraries

library(reticulate)

import sys

print(sys.version)
## 3.7.5 (default, Oct 31 2019, 15:18:51) [MSC v.1916 64 bit (AMD64)]

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import matplotlib.pyplot as plt

import matplotlib as mpl

mpl.use('ps') # generate postscript output by default

import sklearn

#from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

pd.set_option('precision', 3)

pd.set_option('expand_frame_repr', True)

#pd.set_option('max_colwidth', -1)

Glimpse the data set

Structure of the dataset

library(MASS)

data("birthwt")

str(birthwt)
## 'data.frame':    189 obs. of  10 variables:
##  $ low  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ race : int  2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: int  0 0 1 1 1 0 0 0 1 1 ...
##  $ ptl  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ht   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ui   : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ ftv  : int  0 3 1 2 0 0 1 1 1 0 ...
##  $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...

Describe the variables

# variable name variable label coded levels
1 low indicator of birth weight less than 2.5 kg 0, 1
2 age mother’s age in years continous variable
3 lwt mother’s weight in pounds at last menstrual period continous variable
4 race mother’s race (1 = white, 2 = black, 3 = other) 1, 2, 3
5 smoke smoking status during pregnancy 0, 1
6 ptl number of previous premature labours 0, 1, 2, 3
7 ht history of hypertension 0, 1
8 ui presence of uterine irritability 0, 1
9 ftv number of physician visits during the first trimester 0, 1, 2, 3, 4, 6
10 bwt birth weight in grams continous variable

Import data

# save the 'birthwt' data as a 'csv' file and import using pandas 'read_csv' function 

pbwt=pd.read_csv('birthwt.csv')

Information about the data set

# information about data set
pbwt.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 189 entries, 0 to 188
## Data columns (total 10 columns):
##  #   Column  Non-Null Count  Dtype
## ---  ------  --------------  -----
##  0   low     189 non-null    int64
##  1   age     189 non-null    int64
##  2   lwt     189 non-null    int64
##  3   race    189 non-null    int64
##  4   smoke   189 non-null    int64
##  5   ptl     189 non-null    int64
##  6   ht      189 non-null    int64
##  7   ui      189 non-null    int64
##  8   ftv     189 non-null    int64
##  9   bwt     189 non-null    int64
## dtypes: int64(10)
## memory usage: 14.9 KB

Data types

# data types: integer/object/category/floating-point
pbwt.dtypes
## low      int64
## age      int64
## lwt      int64
## race     int64
## smoke    int64
## ptl      int64
## ht       int64
## ui       int64
## ftv      int64
## bwt      int64
## dtype: object

First five rows of data


pbwt.head()
##    low  age  lwt  race  smoke  ptl  ht  ui  ftv   bwt
## 0    0   19  182     2      0    0   0   1    0  2523
## 1    0   33  155     3      0    0   0   0    3  2551
## 2    0   20  105     1      1    0   0   0    1  2557
## 3    0   21  108     1      1    0   0   1    2  2594
## 4    0   18  107     1      1    0   0   1    0  2600

Last five rows of data


pbwt.tail()
##      low  age  lwt  race  smoke  ptl  ht  ui  ftv   bwt
## 184    1   28   95     1      1    0   0   0    2  2466
## 185    1   14  100     3      0    0   0   0    2  2495
## 186    1   23   94     3      1    0   0   0    0  2495
## 187    1   17  142     2      0    0   1   0    0  2495
## 188    1   21  130     1      1    0   1   0    3  2495

Data shape or dimension



pbwt.shape
## (189, 10)

Columns name


pbwt.columns
## Index(['low', 'age', 'lwt', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv', 'bwt'], dtype='object')

Number of columns


len(pbwt.columns)
## 10

Number of observations


len(pbwt)
## 189

Length of a variable

len(pbwt['age'])
## 189

Number of rows

# rows
pbwt.index
## RangeIndex(start=0, stop=189, step=1)

Convert certain variables to categorical type


pbwt[['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']]=pbwt[['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']].astype('category')

pbwt.dtypes
## low      category
## age         int64
## lwt         int64
## race     category
## smoke    category
## ptl      category
## ht       category
## ui       category
## ftv      category
## bwt         int64
## dtype: object

Unique values


pbwt['race'].unique()

#sorted(pbwt['race'].unique())
## [2, 3, 1]
## Categories (3, int64): [2, 3, 1]

Value counts


pbwt['race'].value_counts()
## 1    96
## 3    67
## 2    26
## Name: race, dtype: int64

# for a continuous variable

pbwt['age'].value_counts()
## 20    18
## 19    16
## 25    15
## 22    13
## 23    13
## 24    13
## 21    12
## 17    12
## 18    10
## 28     9
## 26     8
## 30     7
## 16     7
## 29     7
## 32     6
## 31     5
## 15     3
## 14     3
## 27     3
## 33     3
## 36     2
## 35     2
## 34     1
## 45     1
## Name: age, dtype: int64

Sort values

Descending

#pbwt.sort_values(by='age', ascending=False).head()
pbwt.sort_values('age', ascending=False).head()
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 129   0   45  123    1     0   0  0  0   1  4990
## 88    0   36  175    1     0   0  0  0   0  3600
## 22    0   36  202    1     0   0  0  0   1  2836
## 126   0   35  170    1     0   1  0  0   1  4174
## 32    0   35  121    2     1   1  0  0   1  2948

Ascending


# select and sort the values of a column

pbwt['age'].sort_values().head()
## 116    14
## 185    14
## 183    14
## 173    15
## 169    15
## Name: age, dtype: int64

# the dataframe sorted according to the value of age

pbwt.sort_values('age').head()
#pbwt.sort_values(by='age').head()
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 116   0   14  135    1     0   0  0  0   0  3941
## 185   1   14  100    3     0   0  0  0   2  2495
## 183   1   14  101    3     1   1  0  0   0  2466
## 173   1   15  115    3     0   0  0  1   0  2381
## 169   1   15  110    1     0   0  0  0   0  2353

Drop duplicates


pbwt['age'].drop_duplicates()

#pbwt['lwt'].drop_duplicates()
## 0      19
## 1      33
## 2      20
## 3      21
## 4      18
## 6      22
## 7      17
## 8      29
## 9      26
## 13     30
## 16     15
## 17     25
## 19     28
## 20     32
## 21     31
## 22     36
## 31     24
## 32     35
## 37     27
## 42     23
## 55     16
## 116    14
## 129    45
## 132    34
## Name: age, dtype: int64

# apparently there are 3 duplicated rows/could be independent observations

pbwt.drop_duplicates()
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 0     0   19  182    2     0   0  0  1   0  2523
## 1     0   33  155    3     0   0  0  0   3  2551
## 2     0   20  105    1     1   0  0  0   1  2557
## 3     0   21  108    1     1   0  0  1   2  2594
## 4     0   18  107    1     1   0  0  1   0  2600
## ..   ..  ...  ...  ...   ...  .. .. ..  ..   ...
## 184   1   28   95    1     1   0  0  0   2  2466
## 185   1   14  100    3     0   0  0  0   2  2495
## 186   1   23   94    3     1   0  0  0   0  2495
## 187   1   17  142    2     0   0  1  0   0  2495
## 188   1   21  130    1     1   0  1  0   3  2495
## 
## [184 rows x 10 columns]

Select variables/columns

Select certain data types


pbwt.select_dtypes(include=['category'])
##     low race smoke ptl ht ui ftv
## 0     0    2     0   0  0  1   0
## 1     0    3     0   0  0  0   3
## 2     0    1     1   0  0  0   1
## 3     0    1     1   0  0  1   2
## 4     0    1     1   0  0  1   0
## ..   ..  ...   ...  .. .. ..  ..
## 184   1    1     1   0  0  0   2
## 185   1    3     0   0  0  0   2
## 186   1    3     1   0  0  0   0
## 187   1    2     0   0  1  0   0
## 188   1    1     1   0  1  0   3
## 
## [189 rows x 7 columns]

pbwt.select_dtypes(include=['object'])
## Empty DataFrame
## Columns: []
## Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
## 
## [189 rows x 0 columns]

pbwt.select_dtypes(include=['int64'])
##      age  lwt   bwt
## 0     19  182  2523
## 1     33  155  2551
## 2     20  105  2557
## 3     21  108  2594
## 4     18  107  2600
## ..   ...  ...   ...
## 184   28   95  2466
## 185   14  100  2495
## 186   23   94  2495
## 187   17  142  2495
## 188   21  130  2495
## 
## [189 rows x 3 columns]

Drop variables/columns


# note the usage of axis which is `columns`
# to drop variables withouit creating new data frame use `Inplace=True`

pbwt.drop('low', axis=1)
##      age  lwt race smoke ptl ht ui ftv   bwt
## 0     19  182    2     0   0  0  1   0  2523
## 1     33  155    3     0   0  0  0   3  2551
## 2     20  105    1     1   0  0  0   1  2557
## 3     21  108    1     1   0  0  1   2  2594
## 4     18  107    1     1   0  0  1   0  2600
## ..   ...  ...  ...   ...  .. .. ..  ..   ...
## 184   28   95    1     1   0  0  0   2  2466
## 185   14  100    3     0   0  0  0   2  2495
## 186   23   94    3     1   0  0  0   0  2495
## 187   17  142    2     0   0  1  0   0  2495
## 188   21  130    1     1   0  1  0   3  2495
## 
## [189 rows x 9 columns]
     
pbwt.drop(['low', 'age'], axis='columns')
##      lwt race smoke ptl ht ui ftv   bwt
## 0    182    2     0   0  0  1   0  2523
## 1    155    3     0   0  0  0   3  2551
## 2    105    1     1   0  0  0   1  2557
## 3    108    1     1   0  0  1   2  2594
## 4    107    1     1   0  0  1   0  2600
## ..   ...  ...   ...  .. .. ..  ..   ...
## 184   95    1     1   0  0  0   2  2466
## 185  100    3     0   0  0  0   2  2495
## 186   94    3     1   0  0  0   0  2495
## 187  142    2     0   0  1  0   0  2495
## 188  130    1     1   0  1  0   3  2495
## 
## [189 rows x 8 columns]

Select a column: outputs a series


pbwt['age']
## 0      19
## 1      33
## 2      20
## 3      21
## 4      18
##        ..
## 184    28
## 185    14
## 186    23
## 187    17
## 188    21
## Name: age, Length: 189, dtype: int64

Select multiple columns: outputs a data frame

pbwt[['age', 'lwt', 'race']].head(n=3)
##    age  lwt race
## 0   19  182    2
## 1   33  155    3
## 2   20  105    1

Select multiple columns using loc: outputs a data frame

-Note the usage of loc and a colon followed by comma to select a range of columns

-Do not pass a list of columns

pbwt.loc[:, 'age':'race']
##      age  lwt race
## 0     19  182    2
## 1     33  155    3
## 2     20  105    1
## 3     21  108    1
## 4     18  107    1
## ..   ...  ...  ...
## 184   28   95    1
## 185   14  100    3
## 186   23   94    3
## 187   17  142    2
## 188   21  130    1
## 
## [189 rows x 3 columns]

Select multiple columns using iloc: outputs a data frame

pbwt.iloc[:, 1:4]
##      age  lwt race
## 0     19  182    2
## 1     33  155    3
## 2     20  105    1
## 3     21  108    1
## 4     18  107    1
## ..   ...  ...  ...
## 184   28   95    1
## 185   14  100    3
## 186   23   94    3
## 187   17  142    2
## 188   21  130    1
## 
## [189 rows x 3 columns]

Select rows/observations

Output first 3 rows


pbwt.head(n=3)
##   low  age  lwt race smoke ptl ht ui ftv   bwt
## 0   0   19  182    2     0   0  0  1   0  2523
## 1   0   33  155    3     0   0  0  0   3  2551
## 2   0   20  105    1     1   0  0  0   1  2557

# note no comma following the range
pbwt[0:3]
##   low  age  lwt race smoke ptl ht ui ftv   bwt
## 0   0   19  182    2     0   0  0  1   0  2523
## 1   0   33  155    3     0   0  0  0   3  2551
## 2   0   20  105    1     1   0  0  0   1  2557

pbwt[:3]
##   low  age  lwt race smoke ptl ht ui ftv   bwt
## 0   0   19  182    2     0   0  0  1   0  2523
## 1   0   33  155    3     0   0  0  0   3  2551
## 2   0   20  105    1     1   0  0  0   1  2557

# use of loc and a comma

pbwt.loc[0:3,]
##   low  age  lwt race smoke ptl ht ui ftv   bwt
## 0   0   19  182    2     0   0  0  1   0  2523
## 1   0   33  155    3     0   0  0  0   3  2551
## 2   0   20  105    1     1   0  0  0   1  2557
## 3   0   21  108    1     1   0  0  1   2  2594

# use iloc and a comma

pbwt.iloc[0:3,]
##   low  age  lwt race smoke ptl ht ui ftv   bwt
## 0   0   19  182    2     0   0  0  1   0  2523
## 1   0   33  155    3     0   0  0  0   3  2551
## 2   0   20  105    1     1   0  0  0   1  2557

# use iloc and a comma

pbwt.iloc[:3,]
##   low  age  lwt race smoke ptl ht ui ftv   bwt
## 0   0   19  182    2     0   0  0  1   0  2523
## 1   0   33  155    3     0   0  0  0   3  2551
## 2   0   20  105    1     1   0  0  0   1  2557

Output last 3 rows


pbwt.tail(n=3)
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 186   1   23   94    3     1   0  0  0   0  2495
## 187   1   17  142    2     0   0  1  0   0  2495
## 188   1   21  130    1     1   0  1  0   3  2495

Output randomly 3 rows


pbwt.sample(n=3)
##    low  age  lwt race smoke ptl ht ui ftv   bwt
## 0    0   19  182    2     0   0  0  1   0  2523
## 23   0   28  120    3     0   0  0  0   0  2863
## 28   0   26  168    2     1   0  0  0   0  2920

Output randomly 3 percent of the rows


pbwt.sample(frac=0.03)
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 22    0   36  202    1     0   0  0  0   1  2836
## 87    0   23  130    1     0   0  0  0   0  3586
## 48    0   24  115    1     0   0  0  0   2  3090
## 158   1   27  130    2     0   0  0  1   0  2187
## 108   0   18  120    1     1   0  0  0   2  3856
## 58    0   20  103    3     0   0  0  0   0  3203

Select both rows and columns


pbwt.iloc[[0, 1, 3], [1, 2, 9]]
##    age  lwt   bwt
## 0   19  182  2523
## 1   33  155  2551
## 3   21  108  2594

pbwt.loc[[0, 1, 3], ['age', 'lwt', 'bwt']]
##    age  lwt   bwt
## 0   19  182  2523
## 1   33  155  2551
## 3   21  108  2594

Subset or filter rows/observations based on certain conditions

Using square brackets

is in


# subset race==1

pbwt[pbwt['race'].isin([1])]

# pbwt[pbwt['race']==1]
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 2     0   20  105    1     1   0  0  0   1  2557
## 3     0   21  108    1     1   0  0  1   2  2594
## 4     0   18  107    1     1   0  0  1   0  2600
## 6     0   22  118    1     0   0  0  0   1  2637
## 8     0   29  123    1     1   0  0  0   1  2663
## ..   ..  ...  ...  ...   ...  .. .. ..  ..   ...
## 177   1   17  120    1     1   0  0  0   3  2414
## 178   1   23  110    1     1   1  0  0   0  2424
## 182   1   26  190    1     1   0  0  0   0  2466
## 184   1   28   95    1     1   0  0  0   2  2466
## 188   1   21  130    1     1   0  1  0   3  2495
## 
## [96 rows x 10 columns]

Equality

pbwt[pbwt['race'] == 2].head()
# pbwt[pbwt['race'] == 2].shape
##    low  age  lwt race smoke ptl ht ui ftv   bwt
## 0    0   19  182    2     0   0  0  1   0  2523
## 16   0   15   98    2     0   0  0  0   0  2778
## 28   0   26  168    2     1   0  0  0   0  2920
## 29   0   17  113    2     0   0  0  0   1  2920
## 30   0   17  113    2     0   0  0  0   1  2920

Not-equality

pbwt[pbwt['race'] != 2].head()
# pbwt[pbwt['race'] != 2].shape
##   low  age  lwt race smoke ptl ht ui ftv   bwt
## 1   0   33  155    3     0   0  0  0   3  2551
## 2   0   20  105    1     1   0  0  0   1  2557
## 3   0   21  108    1     1   0  0  1   2  2594
## 4   0   18  107    1     1   0  0  1   0  2600
## 5   0   21  124    3     0   0  0  0   0  2622

Multiple equality


pbwt[(pbwt['race'] == 2) & (pbwt['smoke']==1)].head()

# pbwt[(pbwt.race == 2) & (pbwt.smoke == 1)].head()
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 28    0   26  168    2     1   0  0  0   0  2920
## 32    0   35  121    2     1   1  0  0   1  2948
## 40    0   21  185    2     1   0  0  0   2  3042
## 78    0   20  121    2     1   0  0  0   0  3444
## 132   1   34  187    2     1   0  1  0   0  1135

Multiple conditions


pbwt[(pbwt['race']==2) & (pbwt['age'] > 31)]
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 32    0   35  121    2     1   1  0  0   1  2948
## 132   1   34  187    2     1   0  1  0   0  1135

One or another condition

pbwt[(pbwt['age'] >= 35) | (pbwt['lwt']==150) ].head()
##    low  age  lwt race smoke ptl ht ui ftv   bwt
## 11   0   19  150    3     0   0  0  0   1  2733
## 22   0   36  202    1     0   0  0  0   1  2836
## 27   0   29  150    1     0   0  0  0   2  2920
## 32   0   35  121    2     1   1  0  0   1  2948
## 71   0   31  150    3     1   0  0  0   2  3321

Delete observarions


# Delete observations for age greater than 24

pbwt.loc[pbwt['age']<=24]
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 0     0   19  182    2     0   0  0  1   0  2523
## 2     0   20  105    1     1   0  0  0   1  2557
## 3     0   21  108    1     1   0  0  1   2  2594
## 4     0   18  107    1     1   0  0  1   0  2600
## 5     0   21  124    3     0   0  0  0   0  2622
## ..   ..  ...  ...  ...   ...  .. .. ..  ..   ...
## 183   1   14  101    3     1   1  0  0   0  2466
## 185   1   14  100    3     0   0  0  0   2  2495
## 186   1   23   94    3     1   0  0  0   0  2495
## 187   1   17  142    2     0   0  1  0   0  2495
## 188   1   21  130    1     1   0  1  0   3  2495
## 
## [120 rows x 10 columns]

Locate the observation/row number for the largest value of bwt

pbwt['bwt'].max()
## 4990
# observation/row number for max bwt

pbwt[pbwt['bwt']==4990]
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 129   0   45  123    1     0   0  0  0   1  4990

Using query function

pbwt.query('race == 2').head()
# pbwt.query('race == 2').shape
##    low  age  lwt race smoke ptl ht ui ftv   bwt
## 0    0   19  182    2     0   0  0  1   0  2523
## 16   0   15   98    2     0   0  0  0   0  2778
## 28   0   26  168    2     1   0  0  0   0  2920
## 29   0   17  113    2     0   0  0  0   1  2920
## 30   0   17  113    2     0   0  0  0   1  2920
pbwt.query('race == 2 & smoke == 1').head()
#pbwt.query('race == 2 & smoke == 1').shape
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 28    0   26  168    2     1   0  0  0   0  2920
## 32    0   35  121    2     1   1  0  0   1  2948
## 40    0   21  185    2     1   0  0  0   2  3042
## 78    0   20  121    2     1   0  0  0   0  3444
## 132   1   34  187    2     1   0  1  0   0  1135

Locate the observation/row number for the largest value of bwt

pbwt['bwt'].max()
## 4990
# observation/row number for max bwt

pbwt.query('bwt==4990')
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 129   0   45  123    1     0   0  0  0   1  4990

Summary statistics

Calculates summary stats (i.e., count, mean, std, min, q25, q50, q75 and max) for continuous variables using describe() function

# describe a variable

pbwt['age'].describe()
## count    189.000
## mean      23.238
## std        5.299
## min       14.000
## 25%       19.000
## 50%       23.000
## 75%       26.000
## max       45.000
## Name: age, dtype: float64
# describe all continuous variables
pbwt.describe()
##            age      lwt       bwt
## count  189.000  189.000   189.000
## mean    23.238  129.815  2944.587
## std      5.299   30.579   729.214
## min     14.000   80.000   709.000
## 25%     19.000  110.000  2414.000
## 50%     23.000  121.000  2977.000
## 75%     26.000  140.000  3487.000
## max     45.000  250.000  4990.000

Calculate individual stats (i.e., count, min, max, mean, std, median, mode, kurtosis, cummax; etc.) for continuous variables

pbwt['age'].count()
## 189
pbwt['age'].mean()
## 23.238095238095237
pbwt['age'].std()
## 5.298677933404265

pbwt['age'].min()
## 14

pbwt['age'].max()
## 45
pbwt['age'].median()
## 23.0
pbwt['age'].mode()
## 0    20
## dtype: int64
pbwt['age'].kurtosis()
## 0.6162022516194354
pbwt['age'].cummax()
## 0      19
## 1      33
## 2      33
## 3      33
## 4      33
##        ..
## 184    45
## 185    45
## 186    45
## 187    45
## 188    45
## Name: age, Length: 189, dtype: int64

# q25 and q75 (quantiles) for continuous variables

pbwt[['age', 'lwt', 'bwt']].quantile([.25, .75])
##        age    lwt     bwt
## 0.25  19.0  110.0  2414.0
## 0.75  26.0  140.0  3487.0

Calculates summary stats (i.e., count, unique, top and freq) for categorical variables using describe() function

pbwt.describe(include='category')
##         low  race  smoke  ptl   ht   ui  ftv
## count   189   189    189  189  189  189  189
## unique    2     3      2    4    2    2    6
## top       0     1      0    0    0    0    0
## freq    130    96    115  159  177  161  100

Calculate additional stats for categorical variables

# frequency/counts

pbwt['race'].value_counts(sort=False)
## 1    96
## 2    26
## 3    67
## Name: race, dtype: int64
# frequency/count percentage

pbwt['race'].value_counts(sort=False, normalize=True)
## 1    0.508
## 2    0.138
## 3    0.354
## Name: race, dtype: float64

# two-way frequency or cross-tab

pd.crosstab(pbwt['race'], pbwt['smoke'],margins=True)
## smoke    0   1  All
## race               
## 1       44  52   96
## 2       16  10   26
## 3       55  12   67
## All    115  74  189

n largest


# 3 largest values of variable age

#pbwt.nlargest(3, 'age')

pbwt['age'].nlargest(3)
## 129    45
## 22     36
## 88     36
## Name: age, dtype: int64

# rows for 3 largest values for lwt

pbwt.nlargest(3, ['lwt', 'bwt'] )
##     low  age  lwt race smoke ptl ht ui ftv   bwt
## 67    0   28  250    3     1   0  0  0   6  3303
## 105   0   25  241    2     0   0  1  0   0  3790
## 92    0   19  235    1     1   0  1  0   0  3629

max values of each variable

pbwt.max(axis=0).sort_values(ascending=False)
## bwt      4990.0
## lwt       250.0
## age        45.0
## ftv         6.0
## ptl         3.0
## race        3.0
## ui          1.0
## ht          1.0
## smoke       1.0
## low         1.0
## dtype: float64

# barplot for max values of the variables

pbwt.max(axis=0).sort_values(ascending=True).plot(kind='barh', color='red')
plt.show()

min values of each variable

pbwt.min(axis=0).sort_values(ascending=False)
## bwt      709.0
## lwt       80.0
## age       14.0
## race       1.0
## ftv        0.0
## ui         0.0
## ht         0.0
## ptl        0.0
## smoke      0.0
## low        0.0
## dtype: float64
# barplot for min values of the variables

pbwt.min(axis=0).sort_values(ascending=True).plot(kind='barh', color='red')
plt.show()

Group and summarize

Frequency/counts


pbwt.groupby('race').size()
## race
## 1    96
## 2    26
## 3    67
## dtype: int64

Frequency/counts percentage


pbwt.groupby('race').size()*100/len(pbwt)
## race
## 1    50.794
## 2    13.757
## 3    35.450
## dtype: float64

Group and then generate individual stats or summary stats for continuous variables


pbwt.groupby('race').agg({'age':'mean'})

# pbwt.groupby('race').agg({'age':['mean','std']})


# pbwt.groupby('race')['age'].agg(['mean', 'std'])
##          age
## race        
## 1     24.292
## 2     21.538
## 3     22.388

# calculate multiple stats for age following grouping


pbwt.groupby('race').agg({'age':['size', 'mean', 'std', 'min', 'max']})


# pbwt.groupby('race').agg({'age':[np.size, np.mean, np.std, np.min, np.max]})


# pbwt.groupby('race')['age'].agg(['size', 'mean', 'std', 'min', 'max'])


# pbwt.groupby('race')['age'].agg([np.size, np.mean, np.std, np.min, np.max])
##       age                       
##      size    mean    std min max
## race                            
## 1      96  24.292  5.655  14  45
## 2      26  21.538  5.109  15  35
## 3      67  22.388  4.536  14  33

## Group and then summarize a continuous variable

pbwt.groupby('race')['age'].describe()
##       count    mean    std   min    25%   50%   75%   max
## race                                                     
## 1      96.0  24.292  5.655  14.0  20.00  23.5  29.0  45.0
## 2      26.0  21.538  5.109  15.0  17.25  20.5  24.0  35.0
## 3      67.0  22.388  4.536  14.0  19.00  22.0  25.0  33.0

Group and then generate individual stats or summary stats more than one continuous variable


pbwt.groupby('race')[['age', 'lwt']].agg(['mean', 'std'])
##          age             lwt        
##         mean    std     mean     std
## race                                
## 1     24.292  5.655  132.052  29.094
## 2     21.538  5.109  146.808  39.639
## 3     22.388  4.536  120.015  25.130

pbwt.groupby('race')[['age', 'lwt']].describe()
##        age                              ...   lwt                             
##      count    mean    std   min    25%  ...   min    25%    50%     75%    max
## race                                    ...                                   
## 1     96.0  24.292  5.655  14.0  20.00  ...  90.0  112.0  129.5  143.25  235.0
## 2     26.0  21.538  5.109  15.0  17.25  ...  98.0  120.0  129.0  179.00  241.0
## 3     67.0  22.388  4.536  14.0  19.00  ...  80.0  105.0  119.0  130.00  250.0
## 
## [3 rows x 16 columns]
# group-by and pass a dictionary or dictionaries to aggregate


pbwt.groupby('race').agg({'age':['mean', 'std', 'min', 'max'], 
                          'bwt':['mean', 'std', 'min', 'max']})


# pbwt.groupby('race').agg({'age':[np.mean, np.std, np.min, np.max], 'bwt':[np.mean, np.std, np.min, np.max]})
##          age                      bwt                     
##         mean    std min max      mean      std   min   max
## race                                                      
## 1     24.292  5.655  14  45  3102.719  727.886  1021  4990
## 2     21.538  5.109  15  35  2719.692  638.684  1135  3860
## 3     22.388  4.536  14  33  2805.284  722.194   709  4054

# calculate mean for continuous variables following grouping


pbwt.groupby('race').agg({'age':'mean', 'bwt':'mean', 'lwt':'mean'})
##          age       bwt      lwt
## race                           
## 1     24.292  3102.719  132.052
## 2     21.538  2719.692  146.808
## 3     22.388  2805.284  120.015

# summary stats for all continuous varibales following grouping

pbwt.groupby('race').describe()
##        age                              ...     bwt                                 
##      count    mean    std   min    25%  ...     min      25%     50%     75%     max
## race                                    ...                                         
## 1     96.0  24.292  5.655  14.0  20.00  ...  1021.0  2584.75  3062.0  3651.0  4990.0
## 2     26.0  21.538  5.109  15.0  17.25  ...  1135.0  2370.50  2849.0  3057.0  3860.0
## 3     67.0  22.388  4.536  14.0  19.00  ...   709.0  2313.00  2835.0  3274.0  4054.0
## 
## [3 rows x 24 columns]

## Group by two categorical variables and then summarize continuous variables

pbwt.groupby(['race', 'smoke']).agg({'age':'mean', 'bwt':'mean', 'lwt':'mean'})
##                age       bwt      lwt
## race smoke                           
## 1    0      26.023  3428.750  138.841
##      1      22.827  2826.846  126.308
## 2    0      19.938  2854.500  149.438
##      1      24.100  2504.000  142.600
## 3    0      22.364  2815.782  119.145
##      1      22.500  2757.167  124.000
## Group by two categorical variables and then summarize continuous variables

pbwt.groupby(['race', 'smoke'])[['age', 'bwt', 'lwt']].aggregate([np.size, np.mean, np.std, np.min, np.max])
##             age                           ...  lwt                           
##            size    mean    std amin amax  ... size     mean     std amin amax
## race smoke                                ...                                
## 1    0       44  26.023  6.017   14   45  ...   44  138.841  25.700  100  202
##      1       52  22.827  4.926   16   33  ...   52  126.308  30.764   90  235
## 2    0       16  19.938  3.890   15   27  ...   16  149.438  43.408   98  241
##      1       10  24.100  5.953   18   35  ...   10  142.600  34.529  105  187
## 3    0       55  22.364  4.453   14   33  ...   55  119.145  19.044   85  169
##      1       12  22.500  5.108   14   31  ...   12  124.000  44.583   80  250
## 
## [6 rows x 15 columns]

## Group by two categorical variables and then summarize continuous variables

pbwt.groupby(['race', 'smoke']).describe()
##              age                       ...      bwt                         
##            count    mean    std   min  ...      25%     50%      75%     max
## race smoke                             ...                                  
## 1    0      44.0  26.023  6.017  14.0  ...  3062.00  3593.0  3873.00  4990.0
##      1      52.0  22.827  4.926  16.0  ...  2410.00  2775.5  3189.50  4238.0
## 2    0      16.0  19.938  3.890  15.0  ...  2480.75  2920.0  3331.25  3860.0
##      1      10.0  24.100  5.953  18.0  ...  2313.75  2381.0  2941.00  3444.0
## 3    0      55.0  22.364  4.453  14.0  ...  2313.00  2807.0  3253.00  4054.0
##      1      12.0  22.500  5.108  14.0  ...  2402.25  3146.5  3307.50  3572.0
## 
## [6 rows x 24 columns]

# calculate mean age, bwt and lwt following grouping

pbwt.groupby(['race', 'smoke', 'ht']).agg({'age':'mean', 'bwt':'mean', 'lwt':'mean'})
##                   age       bwt      lwt
## race smoke ht                           
## 1    0     0   26.116  3436.395  139.279
##            1   22.000  3100.000  120.000
##      1     0   23.062  2819.292  121.958
##            1   20.000  2917.500  178.500
## 2    0     0   19.786  2813.357  143.429
##            1   21.000  3142.500  191.500
##      1     0   23.000  2656.111  137.667
##            1   34.000  1135.000  187.000
## 3    0     0   22.216  2874.824  118.961
##            1   24.250  2063.000  121.500
##      1     0   22.500  2757.167  124.000
##            1      NaN       NaN      NaN

# group-by and pass a dictionary to aggregate

pbwt.groupby(['race', 'smoke', 'ht']).agg({'age':[np.size, np.mean, np.std, np.min, np.max]})
##                 age                           
##                size    mean    std  amin  amax
## race smoke ht                                 
## 1    0     0   43.0  26.116  6.056  14.0  45.0
##            1    1.0  22.000    NaN  22.0  22.0
##      1     0   48.0  23.062  5.050  16.0  33.0
##            1    4.0  20.000  1.155  19.0  21.0
## 2    0     0   14.0  19.786  3.847  15.0  27.0
##            1    2.0  21.000  5.657  17.0  25.0
##      1     0    9.0  23.000  5.123  18.0  35.0
##            1    1.0  34.000    NaN  34.0  34.0
## 3    0     0   51.0  22.216  4.575  14.0  33.0
##            1    4.0  24.250  1.708  22.0  26.0
##      1     0   12.0  22.500  5.108  14.0  31.0
##            1    NaN     NaN    NaN   NaN   NaN

# group-by and pass  dictionaries to aggregate

pbwt.groupby(['race', 'smoke', 'ht']).agg({'age':[np.mean, np.std, np.min, np.max],
                                          'bwt':[np.mean, np.std, np.min, np.max]})
##                   age                          bwt                         
##                  mean    std  amin  amax      mean      std    amin    amax
## race smoke ht                                                              
## 1    0     0   26.116  6.056  14.0  45.0  3436.395  716.668  1021.0  4990.0
##            1   22.000    NaN  22.0  22.0  3100.000      NaN  3100.0  3100.0
##      1     0   23.062  5.050  16.0  33.0  2819.292  607.071  1818.0  4238.0
##            1   20.000  1.155  19.0  21.0  2917.500  941.466  1790.0  3756.0
## 2    0     0   19.786  3.847  15.0  27.0  2813.357  605.186  1701.0  3860.0
##            1   21.000  5.657  17.0  25.0  3142.500  915.703  2495.0  3790.0
##      1     0   23.000  5.123  18.0  35.0  2656.111  443.025  2126.0  3444.0
##            1   34.000    NaN  34.0  34.0  1135.000      NaN  1135.0  1135.0
## 3    0     0   22.216  4.575  14.0  33.0  2874.824  684.999  1474.0  4054.0
##            1   24.250  1.708  22.0  26.0  2063.000  649.572  1330.0  2751.0
##      1     0   22.500  5.108  14.0  31.0  2757.167  810.045   709.0  3572.0
##            1      NaN    NaN   NaN   NaN       NaN      NaN     NaN     NaN
# sums only the continous variables after grouping

pbwt.groupby('race').sum()
##        age    lwt     bwt
## race                     
## 1     2332  12677  297861
## 2      560   3817   70712
## 3     1500   8041  187954

Group and then summarize a categorical variable


pbwt.groupby('race')['smoke'].describe()
##       count  unique  top  freq
## race                          
## 1        96       2    1    52
## 2        26       2    0    16
## 3        67       2    0    55

Group and then summarize all categorical variables


pbwt.groupby('race').describe(include='category')
##        low                 smoke         ...  ui        ftv                
##      count unique top freq count unique  ... top freq count unique top freq
## race                                     ...                               
## 1       96      2   0   73    96      2  ...   0   83    96      5   0   43
## 2       26      2   0   15    26      2  ...   0   23    26      5   0   14
## 3       67      2   0   42    67      2  ...   0   55    67      6   0   43
## 
## [3 rows x 24 columns]

Pivot table (tapply)


# max values for age in each racial group

pbwt.pivot_table(values='age', columns='race', aggfunc='max')

#pbwt.pivot_table(values='age', columns='race', aggfunc=np.max)
## race   1   2   3
## age   45  35  33

# min age in racial groups


pbwt.pivot_table(values='age', columns='race', aggfunc=np.min)
## race   1   2   3
## age   14  15  14

# mean age values in each racial groups

pbwt.pivot_table(values='age', columns='race', aggfunc=np.mean)
## race       1       2       3
## age   24.292  21.538  22.388

Using is in() function


pbwt['race'].isin([1]).sum()

#pbwt['race'].isin([2]).sum()

#pbwt['race'].isin([3]).sum()
## 96
pbwt['race'].value_counts()
## 1    96
## 3    67
## 2    26
## Name: race, dtype: int64

Top n rows per group


pbwt.assign(rn=pbwt.sort_values(['age'], ascending=False).groupby(['race']).cumcount() + 1).query('rn < 3').sort_values(['race', 'rn'])
 
##     low  age  lwt race smoke ptl ht ui ftv   bwt  rn
## 129   0   45  123    1     0   0  0  0   1  4990   1
## 88    0   36  175    1     0   0  0  0   0  3600   2
## 32    0   35  121    2     1   1  0  0   1  2948   1
## 132   1   34  187    2     1   0  1  0   0  1135   2
## 1     0   33  155    3     0   0  0  0   3  2551   1
## 20    0   32  121    3     0   0  0  0   2  2835   2

Variable transformation or creation or evaluation

Rename variables using rename function


pbwt.rename(columns={'race': 'racegr', 'smoke':'smokegr'}).head()
##   low  age  lwt racegr smokegr ptl ht ui ftv   bwt
## 0   0   19  182      2       0   0  0  1   0  2523
## 1   0   33  155      3       0   0  0  0   3  2551
## 2   0   20  105      1       1   0  0  0   1  2557
## 3   0   21  108      1       1   0  0  1   2  2594
## 4   0   18  107      1       1   0  0  1   0  2600

Secondary variable creation

Using assign function


pbwt['age'].mean()
## 23.238095238095237

# subtract and create a new variable

pbwt.assign(age_centered=pbwt['age']-23.23).head()

# pbwt.assign(age_centered=pbwt['age']-pbwt['age'].mean()).head()
##   low  age  lwt race smoke ptl ht ui ftv   bwt  age_centered
## 0   0   19  182    2     0   0  0  1   0  2523         -4.23
## 1   0   33  155    3     0   0  0  0   3  2551          9.77
## 2   0   20  105    1     1   0  0  0   1  2557         -3.23
## 3   0   21  108    1     1   0  0  1   2  2594         -2.23
## 4   0   18  107    1     1   0  0  1   0  2600         -5.23

# create two variables

pbwt.assign(age_centered=pbwt['age']-pbwt['age'].mean(), lwt_centered=pbwt['lwt']-pbwt['lwt'].mean() )
##     low  age  lwt race smoke ptl ht ui ftv   bwt  age_centered  lwt_centered
## 0     0   19  182    2     0   0  0  1   0  2523        -4.238        52.185
## 1     0   33  155    3     0   0  0  0   3  2551         9.762        25.185
## 2     0   20  105    1     1   0  0  0   1  2557        -3.238       -24.815
## 3     0   21  108    1     1   0  0  1   2  2594        -2.238       -21.815
## 4     0   18  107    1     1   0  0  1   0  2600        -5.238       -22.815
## ..   ..  ...  ...  ...   ...  .. .. ..  ..   ...           ...           ...
## 184   1   28   95    1     1   0  0  0   2  2466         4.762       -34.815
## 185   1   14  100    3     0   0  0  0   2  2495        -9.238       -29.815
## 186   1   23   94    3     1   0  0  0   0  2495        -0.238       -35.815
## 187   1   17  142    2     0   0  1  0   0  2495        -6.238        12.185
## 188   1   21  130    1     1   0  1  0   3  2495        -2.238         0.185
## 
## [189 rows x 12 columns]

# multiply and create a new variable

pbwt.assign(age_lwt = pbwt['age'] * pbwt['lwt'])
##     low  age  lwt race smoke ptl ht ui ftv   bwt  age_lwt
## 0     0   19  182    2     0   0  0  1   0  2523     3458
## 1     0   33  155    3     0   0  0  0   3  2551     5115
## 2     0   20  105    1     1   0  0  0   1  2557     2100
## 3     0   21  108    1     1   0  0  1   2  2594     2268
## 4     0   18  107    1     1   0  0  1   0  2600     1926
## ..   ..  ...  ...  ...   ...  .. .. ..  ..   ...      ...
## 184   1   28   95    1     1   0  0  0   2  2466     2660
## 185   1   14  100    3     0   0  0  0   2  2495     1400
## 186   1   23   94    3     1   0  0  0   0  2495     2162
## 187   1   17  142    2     0   0  1  0   0  2495     2414
## 188   1   21  130    1     1   0  1  0   3  2495     2730
## 
## [189 rows x 11 columns]
pbwt.head()
##   low  age  lwt race smoke ptl ht ui ftv   bwt
## 0   0   19  182    2     0   0  0  1   0  2523
## 1   0   33  155    3     0   0  0  0   3  2551
## 2   0   20  105    1     1   0  0  0   1  2557
## 3   0   21  108    1     1   0  0  1   2  2594
## 4   0   18  107    1     1   0  0  1   0  2600

Using eval function


pbwt.eval('age- 23.23')
## 0     -4.23
## 1      9.77
## 2     -3.23
## 3     -2.23
## 4     -5.23
##        ... 
## 184    4.77
## 185   -9.23
## 186   -0.23
## 187   -6.23
## 188   -2.23
## Length: 189, dtype: float64


# pbwt['age']+pbwt['lwt']
# pbwt.age+pbwt.lwt

pbwt['age_plus_lwt'] = pbwt.eval('age+lwt')

pbwt.head()
##   low  age  lwt race smoke ptl ht ui ftv   bwt  age_plus_lwt
## 0   0   19  182    2     0   0  0  1   0  2523           201
## 1   0   33  155    3     0   0  0  0   3  2551           188
## 2   0   20  105    1     1   0  0  0   1  2557           125
## 3   0   21  108    1     1   0  0  1   2  2594           129
## 4   0   18  107    1     1   0  0  1   0  2600           125

Using transform function


# replace values with mean values of group

pbwt.groupby('race')['age'].transform('mean')
## 0      21.538
## 1      22.388
## 2      24.292
## 3      24.292
## 4      24.292
##         ...  
## 184    24.292
## 185    22.388
## 186    22.388
## 187    21.538
## 188    24.292
## Name: age, Length: 189, dtype: float64

pbwt['age_minus_race_group_avg_age'] = pbwt['age'] - pbwt.groupby('race')['age'].transform('mean')

pbwt.head()
##   low  age  lwt race  ... ftv   bwt age_plus_lwt age_minus_race_group_avg_age
## 0   0   19  182    2  ...   0  2523          201                       -2.538
## 1   0   33  155    3  ...   3  2551          188                       10.612
## 2   0   20  105    1  ...   1  2557          125                       -4.292
## 3   0   21  108    1  ...   2  2594          129                       -3.292
## 4   0   18  107    1  ...   0  2600          125                       -6.292
## 
## [5 rows x 12 columns]

Update a variable

pbwt.sort_values('age', ascending=False)
##     low  age  lwt race  ... ftv   bwt age_plus_lwt age_minus_race_group_avg_age
## 129   0   45  123    1  ...   1  4990          168                       20.708
## 88    0   36  175    1  ...   0  3600          211                       11.708
## 22    0   36  202    1  ...   1  2836          238                       11.708
## 126   0   35  170    1  ...   1  4174          205                       10.708
## 32    0   35  121    2  ...   1  2948          156                       13.462
## ..   ..  ...  ...  ...  ...  ..   ...          ...                          ...
## 173   1   15  115    3  ...   0  2381          130                       -7.388
## 169   1   15  110    1  ...   0  2353          125                       -9.292
## 116   0   14  135    1  ...   0  3941          149                      -10.292
## 183   1   14  101    3  ...   0  2466          115                       -8.388
## 185   1   14  100    3  ...   2  2495          114                       -8.388
## 
## [189 rows x 12 columns]

# for values of age less than 70, add 10

pbwt.loc[pbwt['age'] < 70, 'age'] += 10

pbwt
##     low  age  lwt race  ... ftv   bwt age_plus_lwt age_minus_race_group_avg_age
## 0     0   29  182    2  ...   0  2523          201                       -2.538
## 1     0   43  155    3  ...   3  2551          188                       10.612
## 2     0   30  105    1  ...   1  2557          125                       -4.292
## 3     0   31  108    1  ...   2  2594          129                       -3.292
## 4     0   28  107    1  ...   0  2600          125                       -6.292
## ..   ..  ...  ...  ...  ...  ..   ...          ...                          ...
## 184   1   38   95    1  ...   2  2466          123                        3.708
## 185   1   24  100    3  ...   2  2495          114                       -8.388
## 186   1   33   94    3  ...   0  2495          117                        0.612
## 187   1   27  142    2  ...   0  2495          159                       -4.538
## 188   1   31  130    1  ...   3  2495          151                       -3.292
## 
## [189 rows x 12 columns]

# For values of age less than 60, subtract 10

pbwt.loc[pbwt['age'] < 60, 'age'] -= 10
pbwt.sort_values('age', ascending=False)
##     low  age  lwt race  ... ftv   bwt age_plus_lwt age_minus_race_group_avg_age
## 129   0   45  123    1  ...   1  4990          168                       20.708
## 88    0   36  175    1  ...   0  3600          211                       11.708
## 22    0   36  202    1  ...   1  2836          238                       11.708
## 126   0   35  170    1  ...   1  4174          205                       10.708
## 32    0   35  121    2  ...   1  2948          156                       13.462
## ..   ..  ...  ...  ...  ...  ..   ...          ...                          ...
## 173   1   15  115    3  ...   0  2381          130                       -7.388
## 169   1   15  110    1  ...   0  2353          125                       -9.292
## 116   0   14  135    1  ...   0  3941          149                      -10.292
## 183   1   14  101    3  ...   0  2466          115                       -8.388
## 185   1   14  100    3  ...   2  2495          114                       -8.388
## 
## [189 rows x 12 columns]

Bin continuous variables


pbwt['bwt'].plot(kind='hist')
plt.show()

Using cut

Based on number of bins


# cut to 4 bins

#pd.cut(pbwt['bwt'], bins=4)

pd.cut(pbwt['bwt'], 4)
## 0      (1779.25, 2849.5]
## 1      (1779.25, 2849.5]
## 2      (1779.25, 2849.5]
## 3      (1779.25, 2849.5]
## 4      (1779.25, 2849.5]
##              ...        
## 184    (1779.25, 2849.5]
## 185    (1779.25, 2849.5]
## 186    (1779.25, 2849.5]
## 187    (1779.25, 2849.5]
## 188    (1779.25, 2849.5]
## Name: bwt, Length: 189, dtype: category
## Categories (4, interval[float64]): [(704.719, 1779.25] < (1779.25, 2849.5] < (2849.5, 3919.75] <
##                                     (3919.75, 4990.0]]

# count values 

pd.cut(pbwt['bwt'], bins=4).value_counts()
## (2849.5, 3919.75]     91
## (1779.25, 2849.5]     73
## (3919.75, 4990.0]     16
## (704.719, 1779.25]     9
## Name: bwt, dtype: int64

Based on custom needs


# assign a variable to the cut output

pbwt['bwt_cut'] =pd.cut(pbwt['bwt'], [500, 1000, 2000, 3000, 4000, 5000])

pbwt
##     low  age  lwt  ... age_plus_lwt age_minus_race_group_avg_age       bwt_cut
## 0     0   19  182  ...          201                       -2.538  (2000, 3000]
## 1     0   33  155  ...          188                       10.612  (2000, 3000]
## 2     0   20  105  ...          125                       -4.292  (2000, 3000]
## 3     0   21  108  ...          129                       -3.292  (2000, 3000]
## 4     0   18  107  ...          125                       -6.292  (2000, 3000]
## ..   ..  ...  ...  ...          ...                          ...           ...
## 184   1   28   95  ...          123                        3.708  (2000, 3000]
## 185   1   14  100  ...          114                       -8.388  (2000, 3000]
## 186   1   23   94  ...          117                        0.612  (2000, 3000]
## 187   1   17  142  ...          159                       -4.538  (2000, 3000]
## 188   1   21  130  ...          151                       -3.292  (2000, 3000]
## 
## [189 rows x 13 columns]

pbwt['bwt_cut'].value_counts()
## (3000, 4000]    83
## (2000, 3000]    78
## (1000, 2000]    18
## (4000, 5000]     9
## (500, 1000]      1
## Name: bwt_cut, dtype: int64

pd.crosstab(pbwt['bwt_cut'], pbwt['bwt'])
## bwt           709   1021  1135  1330  1474  ...  4167  4174  4238  4593  4990
## bwt_cut                                     ...                              
## (500, 1000]      1     0     0     0     0  ...     0     0     0     0     0
## (1000, 2000]     0     1     1     1     1  ...     0     0     0     0     0
## (2000, 3000]     0     0     0     0     0  ...     0     0     0     0     0
## (3000, 4000]     0     0     0     0     0  ...     0     0     0     0     0
## (4000, 5000]     0     0     0     0     0  ...     1     1     1     1     1
## 
## [5 rows x 131 columns]

Using qcut

To quantiles


pd.qcut(pbwt['bwt'], q=4, precision=0)
## 0      (2414.0, 2977.0]
## 1      (2414.0, 2977.0]
## 2      (2414.0, 2977.0]
## 3      (2414.0, 2977.0]
## 4      (2414.0, 2977.0]
##              ...       
## 184    (2414.0, 2977.0]
## 185    (2414.0, 2977.0]
## 186    (2414.0, 2977.0]
## 187    (2414.0, 2977.0]
## 188    (2414.0, 2977.0]
## Name: bwt, Length: 189, dtype: category
## Categories (4, interval[float64]): [(708.0, 2414.0] < (2414.0, 2977.0] < (2977.0, 3487.0] <
##                                     (3487.0, 4990.0]]
pd.qcut(pbwt['bwt'], q=4, precision=0).value_counts()
## (2414.0, 2977.0]    49
## (708.0, 2414.0]     48
## (3487.0, 4990.0]    47
## (2977.0, 3487.0]    45
## Name: bwt, dtype: int64

To labelled quantiles



pbwt['low4ql'] = pd.qcut(pbwt['bwt'], q=4, labels=['1q_low', '2q_low', '3q_low', '4q_low'])

pbwt.head()
##   low  age  lwt  ... age_minus_race_group_avg_age       bwt_cut  low4ql
## 0   0   19  182  ...                       -2.538  (2000, 3000]  2q_low
## 1   0   33  155  ...                       10.612  (2000, 3000]  2q_low
## 2   0   20  105  ...                       -4.292  (2000, 3000]  2q_low
## 3   0   21  108  ...                       -3.292  (2000, 3000]  2q_low
## 4   0   18  107  ...                       -6.292  (2000, 3000]  2q_low
## 
## [5 rows x 14 columns]
pbwt['low4ql'].value_counts()
## 2q_low    49
## 1q_low    48
## 4q_low    47
## 3q_low    45
## Name: low4ql, dtype: int64

Recode variables

Using map() function (Without initializing a variable)


pbwtr=pbwt.copy()
pbwt['race'].value_counts(sort=False)
## 1    96
## 2    26
## 3    67
## Name: race, dtype: int64

recode = {1: 0, 2: 1, 3: 2}

pbwt['race']= pbwt['race'].map(recode)

pbwt.head()
##   low  age  lwt  ... age_minus_race_group_avg_age       bwt_cut  low4ql
## 0   0   19  182  ...                       -2.538  (2000, 3000]  2q_low
## 1   0   33  155  ...                       10.612  (2000, 3000]  2q_low
## 2   0   20  105  ...                       -4.292  (2000, 3000]  2q_low
## 3   0   21  108  ...                       -3.292  (2000, 3000]  2q_low
## 4   0   18  107  ...                       -6.292  (2000, 3000]  2q_low
## 
## [5 rows x 14 columns]
pbwt['race'].value_counts()
## 0    96
## 2    67
## 1    26
## Name: race, dtype: int64

With initializing a variable

# note the diffrence is assignment and equality


# initialize a variable named `raceg`

pbwtr['raceg']=99

# when race is 1, racegr is 0
pbwtr['raceg'][pbwtr['race']==1]=0

# when race is 2, racg is 1
pbwtr['raceg'][pbwtr['race']==2]=1

# when race is 3, racg is 2
pbwtr['raceg'][pbwtr['race']==3]=2

pbwtr.head()
##   low  age  lwt race  ... age_minus_race_group_avg_age       bwt_cut  low4ql raceg
## 0   0   19  182    2  ...                       -2.538  (2000, 3000]  2q_low     1
## 1   0   33  155    3  ...                       10.612  (2000, 3000]  2q_low     2
## 2   0   20  105    1  ...                       -4.292  (2000, 3000]  2q_low     0
## 3   0   21  108    1  ...                       -3.292  (2000, 3000]  2q_low     0
## 4   0   18  107    1  ...                       -6.292  (2000, 3000]  2q_low     0
## 
## [5 rows x 15 columns]

pbwtr['raceg'].value_counts(sort=False)
## 0    96
## 1    26
## 2    67
## Name: raceg, dtype: int64

To labelled values using np.where (If……then logic)


# Unordered Categoricals can only compare equality or not

np.where(pbwt['low'] == 0, 'high', 'low')
## array(['high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
##        'high', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
##        'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
##        'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
##        'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
##        'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
##        'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
##        'low', 'low', 'low', 'low', 'low', 'low', 'low'], dtype='<U4')

#  it is 'high' when values are 0, else it is 'low'

pbwt['lowc'] = np.where(pbwt['low'] == 0, 'high', 'low')

pbwt
##     low  age  lwt  ...       bwt_cut  low4ql  lowc
## 0     0   19  182  ...  (2000, 3000]  2q_low  high
## 1     0   33  155  ...  (2000, 3000]  2q_low  high
## 2     0   20  105  ...  (2000, 3000]  2q_low  high
## 3     0   21  108  ...  (2000, 3000]  2q_low  high
## 4     0   18  107  ...  (2000, 3000]  2q_low  high
## ..   ..  ...  ...  ...           ...     ...   ...
## 184   1   28   95  ...  (2000, 3000]  2q_low   low
## 185   1   14  100  ...  (2000, 3000]  2q_low   low
## 186   1   23   94  ...  (2000, 3000]  2q_low   low
## 187   1   17  142  ...  (2000, 3000]  2q_low   low
## 188   1   21  130  ...  (2000, 3000]  2q_low   low
## 
## [189 rows x 15 columns]

pbwt['smokec'] = np.where(pbwt['smoke'] == 0, 'non smoker', 'smoker')

pbwt.head()
##   low  age  lwt race  ...       bwt_cut  low4ql  lowc      smokec
## 0   0   19  182    1  ...  (2000, 3000]  2q_low  high  non smoker
## 1   0   33  155    2  ...  (2000, 3000]  2q_low  high  non smoker
## 2   0   20  105    0  ...  (2000, 3000]  2q_low  high      smoker
## 3   0   21  108    0  ...  (2000, 3000]  2q_low  high      smoker
## 4   0   18  107    0  ...  (2000, 3000]  2q_low  high      smoker
## 
## [5 rows x 16 columns]

To labelled values using using dictionary and list comprehension

pbwtl=pbwtr.copy()

pbwtl['racename']=pbwtl['race']

race_dict={1:'white', 2:'black', 3:'other'}

pbwtl['racename'] = [race_dict[i] for i in pbwtl['racename']]

pbwtl.head()
##   low  age  lwt race  ...       bwt_cut  low4ql raceg racename
## 0   0   19  182    2  ...  (2000, 3000]  2q_low     1    black
## 1   0   33  155    3  ...  (2000, 3000]  2q_low     2    other
## 2   0   20  105    1  ...  (2000, 3000]  2q_low     0    white
## 3   0   21  108    1  ...  (2000, 3000]  2q_low     0    white
## 4   0   18  107    1  ...  (2000, 3000]  2q_low     0    white
## 
## [5 rows x 16 columns]
pbwtl['racename'].value_counts()
## white    96
## other    67
## black    26
## Name: racename, dtype: int64

To labelled values using replace() function


pbwtc = pbwt.copy()
pbwtc.head()
##   low  age  lwt race  ...       bwt_cut  low4ql  lowc      smokec
## 0   0   19  182    1  ...  (2000, 3000]  2q_low  high  non smoker
## 1   0   33  155    2  ...  (2000, 3000]  2q_low  high  non smoker
## 2   0   20  105    0  ...  (2000, 3000]  2q_low  high      smoker
## 3   0   21  108    0  ...  (2000, 3000]  2q_low  high      smoker
## 4   0   18  107    0  ...  (2000, 3000]  2q_low  high      smoker
## 
## [5 rows x 16 columns]

# copy and create new variables

pbwtc[['lowl', 'racel', 'smokel', 'ptll', 'htl', 'uil', 'ftvl']]=pbwtc[['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']].copy()

pbwtc.head()
##   low  age  lwt race smoke ptl ht  ... lowl racel  smokel  ptll  htl uil ftvl
## 0   0   19  182    1     0   0  0  ...    0     1       0     0    0   1    0
## 1   0   33  155    2     0   0  0  ...    0     2       0     0    0   0    3
## 2   0   20  105    0     1   0  0  ...    0     0       1     0    0   0    1
## 3   0   21  108    0     1   0  0  ...    0     0       1     0    0   1    2
## 4   0   18  107    0     1   0  0  ...    0     0       1     0    0   1    0
## 
## [5 rows x 23 columns]

pbwtc.replace({
    'lowl': {0:'high', 1:'low'},    
    'racel':  {1: 'white', 2: 'black', 3:'other'},
    'smokel': {0:'non_smoker', 1:'smoker'},
    'ptll': {0: 'no_ptl', 1:'one_ptl', 2:'two_ptl', 3:'three_ptl'},
    'htl': {0:'no_hypert', 1:'hypert'},
    'uil': {0:'no_uinfect', 1:'uinfect'},
    'ftvl': {0:'no_DrVisit', 1:'one_DrVisit', 2: 'two_DrVisit', 3: 'three_DrVisit', 4: 'four_DrVisit', 6: 'six_DrVisit'}      
    }, 
    inplace=True
    )


pbwtc.head()
##   low  age  lwt race  ...    ptll        htl         uil           ftvl
## 0   0   19  182    1  ...  no_ptl  no_hypert     uinfect     no_DrVisit
## 1   0   33  155    2  ...  no_ptl  no_hypert  no_uinfect  three_DrVisit
## 2   0   20  105    0  ...  no_ptl  no_hypert  no_uinfect    one_DrVisit
## 3   0   21  108    0  ...  no_ptl  no_hypert     uinfect    two_DrVisit
## 4   0   18  107    0  ...  no_ptl  no_hypert     uinfect     no_DrVisit
## 
## [5 rows x 23 columns]

pbwtc['ftvl'].value_counts()
## no_DrVisit       100
## one_DrVisit       47
## two_DrVisit       30
## three_DrVisit      7
## four_DrVisit       4
## six_DrVisit        1
## Name: ftvl, dtype: int64

pbwtc.dtypes
## low                             category
## age                                int64
## lwt                                int64
## race                            category
## smoke                           category
## ptl                             category
## ht                              category
## ui                              category
## ftv                             category
## bwt                                int64
## age_plus_lwt                       int64
## age_minus_race_group_avg_age     float64
## bwt_cut                         category
## low4ql                          category
## lowc                              object
## smokec                            object
## lowl                              object
## racel                             object
## smokel                            object
## ptll                              object
## htl                               object
## uil                               object
## ftvl                              object
## dtype: object

# convert object data types

pbwtcn=pbwtc[['lowl', 'racel', 'smokel', 'ptll', 'htl', 'uil', 'ftvl']].astype('category')

pbwtcn.dtypes
## lowl      category
## racel     category
## smokel    category
## ptll      category
## htl       category
## uil       category
## ftvl      category
## dtype: object

Convert object data types to categorical codes

pbwtcn
##      lowl  racel      smokel    ptll        htl         uil           ftvl
## 0    high  white  non_smoker  no_ptl  no_hypert     uinfect     no_DrVisit
## 1    high  black  non_smoker  no_ptl  no_hypert  no_uinfect  three_DrVisit
## 2    high      0      smoker  no_ptl  no_hypert  no_uinfect    one_DrVisit
## 3    high      0      smoker  no_ptl  no_hypert     uinfect    two_DrVisit
## 4    high      0      smoker  no_ptl  no_hypert     uinfect     no_DrVisit
## ..    ...    ...         ...     ...        ...         ...            ...
## 184   low      0      smoker  no_ptl  no_hypert  no_uinfect    two_DrVisit
## 185   low  black  non_smoker  no_ptl  no_hypert  no_uinfect    two_DrVisit
## 186   low  black      smoker  no_ptl  no_hypert  no_uinfect     no_DrVisit
## 187   low  white  non_smoker  no_ptl     hypert  no_uinfect     no_DrVisit
## 188   low      0      smoker  no_ptl     hypert  no_uinfect  three_DrVisit
## 
## [189 rows x 7 columns]

# use cat.codes

pbwtcn['lowcc'] = pbwtcn['lowl'].cat.codes
pbwtcn['racecc'] = pbwtcn['racel'].cat.codes
pbwtcn['smokecc'] = pbwtcn['smokel'].cat.codes
pbwtcn['ptlcc'] = pbwtcn['ptll'].cat.codes
pbwtcn['htcc'] = pbwtcn['htl'].cat.codes
pbwtcn['uicc'] = pbwtcn['uil'].cat.codes
pbwtcn['ftvcc'] = pbwtcn['ftvl'].cat.codes

pbwtcn.head()
##    lowl  racel      smokel    ptll  ... ptlcc htcc uicc  ftvcc
## 0  high  white  non_smoker  no_ptl  ...     0    1    1      1
## 1  high  black  non_smoker  no_ptl  ...     0    1    0      4
## 2  high      0      smoker  no_ptl  ...     0    1    0      2
## 3  high      0      smoker  no_ptl  ...     0    1    1      5
## 4  high      0      smoker  no_ptl  ...     0    1    1      1
## 
## [5 rows x 14 columns]

Collapse levels of categorical variables

pd.value_counts(pbwtc.ptl)
## 0    159
## 1     24
## 2      5
## 3      1
## Name: ptl, dtype: int64

# make a series for value counts

series = pd.value_counts(pbwtc.ptl)

series
## 0    159
## 1     24
## 2      5
## 3      1
## Name: ptl, dtype: int64

# assign a True value for levels less than 20 percent

mask = (series/series.sum() * 100).lt(20)

mask
## 0    False
## 1     True
## 2     True
## 3     True
## Name: ptl, dtype: bool

# create a categorical index for TRUE values

series[mask].index
## CategoricalIndex([1, 2, 3], categories=[0, 1, 2, 3], ordered=False, dtype='category')

# To replace df['column'] use np.where 
# for those meet the conditions assign a value of 1

pbwtc['ptln'] = np.where(pbwtc['ptl'].isin(series[mask].index),1, pbwtc['ptl'])

pbwtc['ptln'].value_counts()
## 0    159
## 1     30
## Name: ptln, dtype: int64

Another example


pbwtc['ftv'].value_counts()
## 0    100
## 1     47
## 2     30
## 3      7
## 4      4
## 6      1
## Name: ftv, dtype: int64


series = pd.value_counts(pbwtc['ftv'])

mask = (series/series.sum() * 100).lt(20)

## assign a value of 2 for levels less than 20 percent

pbwtc['ftvn'] = np.where(pbwtc['ftv'].isin(series[mask].index),2, pbwtc['ftv'])

pbwtc['ftvn'].value_counts()
## 0    100
## 1     47
## 2     42
## Name: ftvn, dtype: int64

Make dummies or indicator variables

Using pandas.get_dummies


# dtypes are integrers
pbwtd=pbwt.copy()

And concatenation

# get dummies for ftv

ftv_dummies=pd.get_dummies(pbwtd['ftv'], prefix='f')

ftv_dummies.head()
##    f_0  f_1  f_2  f_3  f_4  f_6
## 0    1    0    0    0    0    0
## 1    0    0    0    1    0    0
## 2    0    1    0    0    0    0
## 3    0    0    1    0    0    0
## 4    1    0    0    0    0    0

# get dummies for ptl

ptl_dummies=pd.get_dummies(pbwtd['ptl'], prefix='p')

ptl_dummies.head()
##    p_0  p_1  p_2  p_3
## 0    1    0    0    0
## 1    1    0    0    0
## 2    1    0    0    0
## 3    1    0    0    0
## 4    1    0    0    0

# Put dummy vars in dataframe

pbwte = pd.concat([pbwtd, ptl_dummies, ftv_dummies], axis=1)

pbwte.head()
##   low  age  lwt race smoke ptl ht ui  ... p_2  p_3  f_0  f_1 f_2 f_3 f_4 f_6
## 0   0   19  182    1     0   0  0  1  ...   0    0    1    0   0   0   0   0
## 1   0   33  155    2     0   0  0  0  ...   0    0    0    0   0   1   0   0
## 2   0   20  105    0     1   0  0  0  ...   0    0    0    1   0   0   0   0
## 3   0   21  108    0     1   0  0  1  ...   0    0    0    0   1   0   0   0
## 4   0   18  107    0     1   0  0  1  ...   0    0    1    0   0   0   0   0
## 
## [5 rows x 26 columns]

# Drop the original class columns and one dummy var to prevent multicollinearity


pbwte = pbwte.drop(['ptl', 'ftv', 'p_0', 'f_0'], axis=1)

pbwte.head()
##   low  age  lwt race smoke ht ui   bwt  ...  p_1  p_2 p_3 f_1 f_2 f_3  f_4  f_6
## 0   0   19  182    1     0  0  1  2523  ...    0    0   0   0   0   0    0    0
## 1   0   33  155    2     0  0  0  2551  ...    0    0   0   0   0   1    0    0
## 2   0   20  105    0     1  0  0  2557  ...    0    0   0   1   0   0    0    0
## 3   0   21  108    0     1  0  1  2594  ...    0    0   0   0   1   0    0    0
## 4   0   18  107    0     1  0  1  2600  ...    0    0   0   0   0   0    0    0
## 
## [5 rows x 22 columns]

With prefixes (removes the original variables)


pd.get_dummies(pbwtd, columns=['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv'], prefix=['l', 'r', 's', 'p', 'h', 'u', 'f']).head()
##    age  lwt   bwt  age_plus_lwt  ...  f_2 f_3 f_4 f_6
## 0   19  182  2523           201  ...    0   0   0   0
## 1   33  155  2551           188  ...    0   1   0   0
## 2   20  105  2557           125  ...    0   0   0   0
## 3   21  108  2594           129  ...    1   0   0   0
## 4   18  107  2600           125  ...    0   0   0   0
## 
## [5 rows x 30 columns]

Without prefixes (removes the original variables)


pd.get_dummies(pbwtd, columns=['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']).head()
##    age  lwt   bwt  age_plus_lwt  ...  ftv_2 ftv_3 ftv_4 ftv_6
## 0   19  182  2523           201  ...      0     0     0     0
## 1   33  155  2551           188  ...      0     1     0     0
## 2   20  105  2557           125  ...      0     0     0     0
## 3   21  108  2594           129  ...      1     0     0     0
## 4   18  107  2600           125  ...      0     0     0     0
## 
## [5 rows x 30 columns]

For labelled (categorical) data types


pd.get_dummies(pbwtc, columns=['lowl', 'racel', 'smokel', 'ptll', 'htl', 'uil', 'ftvl']).head()
##   low  age  lwt  ... ftvl_six_DrVisit ftvl_three_DrVisit ftvl_two_DrVisit
## 0   0   19  182  ...                0                  0                0
## 1   0   33  155  ...                0                  1                0
## 2   0   20  105  ...                0                  0                0
## 3   0   21  108  ...                0                  0                1
## 4   0   18  107  ...                0                  0                0
## 
## [5 rows x 39 columns]

Using sklearn.preprocessing.LabelEncoder


pbwtf=pbwtc[['age', 'lwt', 'race', 'smoke', 'ptll', 'ht', 'ui', 'ftvl' ]]

pbwtf
##      age  lwt race smoke    ptll ht ui           ftvl
## 0     19  182    1     0  no_ptl  0  1     no_DrVisit
## 1     33  155    2     0  no_ptl  0  0  three_DrVisit
## 2     20  105    0     1  no_ptl  0  0    one_DrVisit
## 3     21  108    0     1  no_ptl  0  1    two_DrVisit
## 4     18  107    0     1  no_ptl  0  1     no_DrVisit
## ..   ...  ...  ...   ...     ... .. ..            ...
## 184   28   95    0     1  no_ptl  0  0    two_DrVisit
## 185   14  100    2     0  no_ptl  0  0    two_DrVisit
## 186   23   94    2     1  no_ptl  0  0     no_DrVisit
## 187   17  142    1     0  no_ptl  1  0     no_DrVisit
## 188   21  130    0     1  no_ptl  1  0  three_DrVisit
## 
## [189 rows x 8 columns]

label_encoder = LabelEncoder()

pbwtf['ptll'].value_counts()
## no_ptl       159
## one_ptl       24
## two_ptl        5
## three_ptl      1
## Name: ptll, dtype: int64

ptl_encoded = label_encoder.fit_transform(pbwtf['ptll'])
ptl_df = pd.DataFrame(ptl_encoded, columns=['Preterm_DrVisit'])
ptl_df.head()
##    Preterm_DrVisit
## 0                0
## 1                0
## 2                0
## 3                0
## 4                0

Using sklearn.preprocessing.OneHotEncoder


binary_encoder = OneHotEncoder(categories='auto')

ptl_encoded = label_encoder.fit_transform(pbwtf['ptll'])

ptl_1hot = binary_encoder.fit_transform(ptl_encoded.reshape(-1,1))
ptl_1hot_mat = ptl_1hot.toarray()

ptl_DF = pd.DataFrame(ptl_1hot_mat, columns = ['no_ptl', 'one_ptl', 'two_ptl', 'three_ptl' ])

ptl_DF.head()
##    no_ptl  one_ptl  two_ptl  three_ptl
## 0     1.0      0.0      0.0        0.0
## 1     1.0      0.0      0.0        0.0
## 2     1.0      0.0      0.0        0.0
## 3     1.0      0.0      0.0        0.0
## 4     1.0      0.0      0.0        0.0