max values of each variablemin values of each variablelibrary(reticulate)
import sys
print(sys.version)
## 3.7.5 (default, Oct 31 2019, 15:18:51) [MSC v.1916 64 bit (AMD64)]
import sys
if not sys.warnoptions:
import warnings
warnings.simplefilter("ignore")
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.use('ps') # generate postscript output by default
import sklearn
#from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
pd.set_option('precision', 3)
pd.set_option('expand_frame_repr', True)
#pd.set_option('max_colwidth', -1)
library(MASS)
data("birthwt")
str(birthwt)
## 'data.frame': 189 obs. of 10 variables:
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
| # | variable name | variable label | coded levels |
|---|---|---|---|
| 1 | low | indicator of birth weight less than 2.5 kg | 0, 1 |
| 2 | age | mother’s age in years | continous variable |
| 3 | lwt | mother’s weight in pounds at last menstrual period | continous variable |
| 4 | race | mother’s race (1 = white, 2 = black, 3 = other) | 1, 2, 3 |
| 5 | smoke | smoking status during pregnancy | 0, 1 |
| 6 | ptl | number of previous premature labours | 0, 1, 2, 3 |
| 7 | ht | history of hypertension | 0, 1 |
| 8 | ui | presence of uterine irritability | 0, 1 |
| 9 | ftv | number of physician visits during the first trimester | 0, 1, 2, 3, 4, 6 |
| 10 | bwt | birth weight in grams | continous variable |
# save the 'birthwt' data as a 'csv' file and import using pandas 'read_csv' function
pbwt=pd.read_csv('birthwt.csv')
# information about data set
pbwt.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 189 entries, 0 to 188
## Data columns (total 10 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 low 189 non-null int64
## 1 age 189 non-null int64
## 2 lwt 189 non-null int64
## 3 race 189 non-null int64
## 4 smoke 189 non-null int64
## 5 ptl 189 non-null int64
## 6 ht 189 non-null int64
## 7 ui 189 non-null int64
## 8 ftv 189 non-null int64
## 9 bwt 189 non-null int64
## dtypes: int64(10)
## memory usage: 14.9 KB
# data types: integer/object/category/floating-point
pbwt.dtypes
## low int64
## age int64
## lwt int64
## race int64
## smoke int64
## ptl int64
## ht int64
## ui int64
## ftv int64
## bwt int64
## dtype: object
pbwt.head()
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
## 4 0 18 107 1 1 0 0 1 0 2600
pbwt.tail()
## low age lwt race smoke ptl ht ui ftv bwt
## 184 1 28 95 1 1 0 0 0 2 2466
## 185 1 14 100 3 0 0 0 0 2 2495
## 186 1 23 94 3 1 0 0 0 0 2495
## 187 1 17 142 2 0 0 1 0 0 2495
## 188 1 21 130 1 1 0 1 0 3 2495
pbwt.shape
## (189, 10)
pbwt.columns
## Index(['low', 'age', 'lwt', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv', 'bwt'], dtype='object')
len(pbwt.columns)
## 10
len(pbwt)
## 189
len(pbwt['age'])
## 189
# rows
pbwt.index
## RangeIndex(start=0, stop=189, step=1)
pbwt[['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']]=pbwt[['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']].astype('category')
pbwt.dtypes
## low category
## age int64
## lwt int64
## race category
## smoke category
## ptl category
## ht category
## ui category
## ftv category
## bwt int64
## dtype: object
pbwt['race'].unique()
#sorted(pbwt['race'].unique())
## [2, 3, 1]
## Categories (3, int64): [2, 3, 1]
pbwt['race'].value_counts()
## 1 96
## 3 67
## 2 26
## Name: race, dtype: int64
# for a continuous variable
pbwt['age'].value_counts()
## 20 18
## 19 16
## 25 15
## 22 13
## 23 13
## 24 13
## 21 12
## 17 12
## 18 10
## 28 9
## 26 8
## 30 7
## 16 7
## 29 7
## 32 6
## 31 5
## 15 3
## 14 3
## 27 3
## 33 3
## 36 2
## 35 2
## 34 1
## 45 1
## Name: age, dtype: int64
#pbwt.sort_values(by='age', ascending=False).head()
pbwt.sort_values('age', ascending=False).head()
## low age lwt race smoke ptl ht ui ftv bwt
## 129 0 45 123 1 0 0 0 0 1 4990
## 88 0 36 175 1 0 0 0 0 0 3600
## 22 0 36 202 1 0 0 0 0 1 2836
## 126 0 35 170 1 0 1 0 0 1 4174
## 32 0 35 121 2 1 1 0 0 1 2948
# select and sort the values of a column
pbwt['age'].sort_values().head()
## 116 14
## 185 14
## 183 14
## 173 15
## 169 15
## Name: age, dtype: int64
# the dataframe sorted according to the value of age
pbwt.sort_values('age').head()
#pbwt.sort_values(by='age').head()
## low age lwt race smoke ptl ht ui ftv bwt
## 116 0 14 135 1 0 0 0 0 0 3941
## 185 1 14 100 3 0 0 0 0 2 2495
## 183 1 14 101 3 1 1 0 0 0 2466
## 173 1 15 115 3 0 0 0 1 0 2381
## 169 1 15 110 1 0 0 0 0 0 2353
pbwt['age'].drop_duplicates()
#pbwt['lwt'].drop_duplicates()
## 0 19
## 1 33
## 2 20
## 3 21
## 4 18
## 6 22
## 7 17
## 8 29
## 9 26
## 13 30
## 16 15
## 17 25
## 19 28
## 20 32
## 21 31
## 22 36
## 31 24
## 32 35
## 37 27
## 42 23
## 55 16
## 116 14
## 129 45
## 132 34
## Name: age, dtype: int64
# apparently there are 3 duplicated rows/could be independent observations
pbwt.drop_duplicates()
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
## 4 0 18 107 1 1 0 0 1 0 2600
## .. .. ... ... ... ... .. .. .. .. ...
## 184 1 28 95 1 1 0 0 0 2 2466
## 185 1 14 100 3 0 0 0 0 2 2495
## 186 1 23 94 3 1 0 0 0 0 2495
## 187 1 17 142 2 0 0 1 0 0 2495
## 188 1 21 130 1 1 0 1 0 3 2495
##
## [184 rows x 10 columns]
pbwt.select_dtypes(include=['category'])
## low race smoke ptl ht ui ftv
## 0 0 2 0 0 0 1 0
## 1 0 3 0 0 0 0 3
## 2 0 1 1 0 0 0 1
## 3 0 1 1 0 0 1 2
## 4 0 1 1 0 0 1 0
## .. .. ... ... .. .. .. ..
## 184 1 1 1 0 0 0 2
## 185 1 3 0 0 0 0 2
## 186 1 3 1 0 0 0 0
## 187 1 2 0 0 1 0 0
## 188 1 1 1 0 1 0 3
##
## [189 rows x 7 columns]
pbwt.select_dtypes(include=['object'])
## Empty DataFrame
## Columns: []
## Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
##
## [189 rows x 0 columns]
pbwt.select_dtypes(include=['int64'])
## age lwt bwt
## 0 19 182 2523
## 1 33 155 2551
## 2 20 105 2557
## 3 21 108 2594
## 4 18 107 2600
## .. ... ... ...
## 184 28 95 2466
## 185 14 100 2495
## 186 23 94 2495
## 187 17 142 2495
## 188 21 130 2495
##
## [189 rows x 3 columns]
# note the usage of axis which is `columns`
# to drop variables withouit creating new data frame use `Inplace=True`
pbwt.drop('low', axis=1)
## age lwt race smoke ptl ht ui ftv bwt
## 0 19 182 2 0 0 0 1 0 2523
## 1 33 155 3 0 0 0 0 3 2551
## 2 20 105 1 1 0 0 0 1 2557
## 3 21 108 1 1 0 0 1 2 2594
## 4 18 107 1 1 0 0 1 0 2600
## .. ... ... ... ... .. .. .. .. ...
## 184 28 95 1 1 0 0 0 2 2466
## 185 14 100 3 0 0 0 0 2 2495
## 186 23 94 3 1 0 0 0 0 2495
## 187 17 142 2 0 0 1 0 0 2495
## 188 21 130 1 1 0 1 0 3 2495
##
## [189 rows x 9 columns]
pbwt.drop(['low', 'age'], axis='columns')
## lwt race smoke ptl ht ui ftv bwt
## 0 182 2 0 0 0 1 0 2523
## 1 155 3 0 0 0 0 3 2551
## 2 105 1 1 0 0 0 1 2557
## 3 108 1 1 0 0 1 2 2594
## 4 107 1 1 0 0 1 0 2600
## .. ... ... ... .. .. .. .. ...
## 184 95 1 1 0 0 0 2 2466
## 185 100 3 0 0 0 0 2 2495
## 186 94 3 1 0 0 0 0 2495
## 187 142 2 0 0 1 0 0 2495
## 188 130 1 1 0 1 0 3 2495
##
## [189 rows x 8 columns]
pbwt['age']
## 0 19
## 1 33
## 2 20
## 3 21
## 4 18
## ..
## 184 28
## 185 14
## 186 23
## 187 17
## 188 21
## Name: age, Length: 189, dtype: int64
pbwt[['age', 'lwt', 'race']].head(n=3)
## age lwt race
## 0 19 182 2
## 1 33 155 3
## 2 20 105 1
loc: outputs a data frame-Note the usage of loc and a colon followed by comma to select a range of columns
-Do not pass a list of columns
pbwt.loc[:, 'age':'race']
## age lwt race
## 0 19 182 2
## 1 33 155 3
## 2 20 105 1
## 3 21 108 1
## 4 18 107 1
## .. ... ... ...
## 184 28 95 1
## 185 14 100 3
## 186 23 94 3
## 187 17 142 2
## 188 21 130 1
##
## [189 rows x 3 columns]
iloc: outputs a data framepbwt.iloc[:, 1:4]
## age lwt race
## 0 19 182 2
## 1 33 155 3
## 2 20 105 1
## 3 21 108 1
## 4 18 107 1
## .. ... ... ...
## 184 28 95 1
## 185 14 100 3
## 186 23 94 3
## 187 17 142 2
## 188 21 130 1
##
## [189 rows x 3 columns]
pbwt.head(n=3)
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
# note no comma following the range
pbwt[0:3]
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
pbwt[:3]
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
# use of loc and a comma
pbwt.loc[0:3,]
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
# use iloc and a comma
pbwt.iloc[0:3,]
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
# use iloc and a comma
pbwt.iloc[:3,]
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
pbwt.tail(n=3)
## low age lwt race smoke ptl ht ui ftv bwt
## 186 1 23 94 3 1 0 0 0 0 2495
## 187 1 17 142 2 0 0 1 0 0 2495
## 188 1 21 130 1 1 0 1 0 3 2495
pbwt.sample(n=3)
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 23 0 28 120 3 0 0 0 0 0 2863
## 28 0 26 168 2 1 0 0 0 0 2920
pbwt.sample(frac=0.03)
## low age lwt race smoke ptl ht ui ftv bwt
## 22 0 36 202 1 0 0 0 0 1 2836
## 87 0 23 130 1 0 0 0 0 0 3586
## 48 0 24 115 1 0 0 0 0 2 3090
## 158 1 27 130 2 0 0 0 1 0 2187
## 108 0 18 120 1 1 0 0 0 2 3856
## 58 0 20 103 3 0 0 0 0 0 3203
pbwt.iloc[[0, 1, 3], [1, 2, 9]]
## age lwt bwt
## 0 19 182 2523
## 1 33 155 2551
## 3 21 108 2594
pbwt.loc[[0, 1, 3], ['age', 'lwt', 'bwt']]
## age lwt bwt
## 0 19 182 2523
## 1 33 155 2551
## 3 21 108 2594
# subset race==1
pbwt[pbwt['race'].isin([1])]
# pbwt[pbwt['race']==1]
## low age lwt race smoke ptl ht ui ftv bwt
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
## 4 0 18 107 1 1 0 0 1 0 2600
## 6 0 22 118 1 0 0 0 0 1 2637
## 8 0 29 123 1 1 0 0 0 1 2663
## .. .. ... ... ... ... .. .. .. .. ...
## 177 1 17 120 1 1 0 0 0 3 2414
## 178 1 23 110 1 1 1 0 0 0 2424
## 182 1 26 190 1 1 0 0 0 0 2466
## 184 1 28 95 1 1 0 0 0 2 2466
## 188 1 21 130 1 1 0 1 0 3 2495
##
## [96 rows x 10 columns]
pbwt[pbwt['race'] == 2].head()
# pbwt[pbwt['race'] == 2].shape
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 16 0 15 98 2 0 0 0 0 0 2778
## 28 0 26 168 2 1 0 0 0 0 2920
## 29 0 17 113 2 0 0 0 0 1 2920
## 30 0 17 113 2 0 0 0 0 1 2920
pbwt[pbwt['race'] != 2].head()
# pbwt[pbwt['race'] != 2].shape
## low age lwt race smoke ptl ht ui ftv bwt
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
## 4 0 18 107 1 1 0 0 1 0 2600
## 5 0 21 124 3 0 0 0 0 0 2622
pbwt[(pbwt['race'] == 2) & (pbwt['smoke']==1)].head()
# pbwt[(pbwt.race == 2) & (pbwt.smoke == 1)].head()
## low age lwt race smoke ptl ht ui ftv bwt
## 28 0 26 168 2 1 0 0 0 0 2920
## 32 0 35 121 2 1 1 0 0 1 2948
## 40 0 21 185 2 1 0 0 0 2 3042
## 78 0 20 121 2 1 0 0 0 0 3444
## 132 1 34 187 2 1 0 1 0 0 1135
pbwt[(pbwt['race']==2) & (pbwt['age'] > 31)]
## low age lwt race smoke ptl ht ui ftv bwt
## 32 0 35 121 2 1 1 0 0 1 2948
## 132 1 34 187 2 1 0 1 0 0 1135
pbwt[(pbwt['age'] >= 35) | (pbwt['lwt']==150) ].head()
## low age lwt race smoke ptl ht ui ftv bwt
## 11 0 19 150 3 0 0 0 0 1 2733
## 22 0 36 202 1 0 0 0 0 1 2836
## 27 0 29 150 1 0 0 0 0 2 2920
## 32 0 35 121 2 1 1 0 0 1 2948
## 71 0 31 150 3 1 0 0 0 2 3321
# Delete observations for age greater than 24
pbwt.loc[pbwt['age']<=24]
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
## 4 0 18 107 1 1 0 0 1 0 2600
## 5 0 21 124 3 0 0 0 0 0 2622
## .. .. ... ... ... ... .. .. .. .. ...
## 183 1 14 101 3 1 1 0 0 0 2466
## 185 1 14 100 3 0 0 0 0 2 2495
## 186 1 23 94 3 1 0 0 0 0 2495
## 187 1 17 142 2 0 0 1 0 0 2495
## 188 1 21 130 1 1 0 1 0 3 2495
##
## [120 rows x 10 columns]
bwtpbwt['bwt'].max()
## 4990
# observation/row number for max bwt
pbwt[pbwt['bwt']==4990]
## low age lwt race smoke ptl ht ui ftv bwt
## 129 0 45 123 1 0 0 0 0 1 4990
query functionpbwt.query('race == 2').head()
# pbwt.query('race == 2').shape
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 16 0 15 98 2 0 0 0 0 0 2778
## 28 0 26 168 2 1 0 0 0 0 2920
## 29 0 17 113 2 0 0 0 0 1 2920
## 30 0 17 113 2 0 0 0 0 1 2920
pbwt.query('race == 2 & smoke == 1').head()
#pbwt.query('race == 2 & smoke == 1').shape
## low age lwt race smoke ptl ht ui ftv bwt
## 28 0 26 168 2 1 0 0 0 0 2920
## 32 0 35 121 2 1 1 0 0 1 2948
## 40 0 21 185 2 1 0 0 0 2 3042
## 78 0 20 121 2 1 0 0 0 0 3444
## 132 1 34 187 2 1 0 1 0 0 1135
bwtpbwt['bwt'].max()
## 4990
# observation/row number for max bwt
pbwt.query('bwt==4990')
## low age lwt race smoke ptl ht ui ftv bwt
## 129 0 45 123 1 0 0 0 0 1 4990
# describe a variable
pbwt['age'].describe()
## count 189.000
## mean 23.238
## std 5.299
## min 14.000
## 25% 19.000
## 50% 23.000
## 75% 26.000
## max 45.000
## Name: age, dtype: float64
# describe all continuous variables
pbwt.describe()
## age lwt bwt
## count 189.000 189.000 189.000
## mean 23.238 129.815 2944.587
## std 5.299 30.579 729.214
## min 14.000 80.000 709.000
## 25% 19.000 110.000 2414.000
## 50% 23.000 121.000 2977.000
## 75% 26.000 140.000 3487.000
## max 45.000 250.000 4990.000
pbwt['age'].count()
## 189
pbwt['age'].mean()
## 23.238095238095237
pbwt['age'].std()
## 5.298677933404265
pbwt['age'].min()
## 14
pbwt['age'].max()
## 45
pbwt['age'].median()
## 23.0
pbwt['age'].mode()
## 0 20
## dtype: int64
pbwt['age'].kurtosis()
## 0.6162022516194354
pbwt['age'].cummax()
## 0 19
## 1 33
## 2 33
## 3 33
## 4 33
## ..
## 184 45
## 185 45
## 186 45
## 187 45
## 188 45
## Name: age, Length: 189, dtype: int64
# q25 and q75 (quantiles) for continuous variables
pbwt[['age', 'lwt', 'bwt']].quantile([.25, .75])
## age lwt bwt
## 0.25 19.0 110.0 2414.0
## 0.75 26.0 140.0 3487.0
pbwt.describe(include='category')
## low race smoke ptl ht ui ftv
## count 189 189 189 189 189 189 189
## unique 2 3 2 4 2 2 6
## top 0 1 0 0 0 0 0
## freq 130 96 115 159 177 161 100
# frequency/counts
pbwt['race'].value_counts(sort=False)
## 1 96
## 2 26
## 3 67
## Name: race, dtype: int64
# frequency/count percentage
pbwt['race'].value_counts(sort=False, normalize=True)
## 1 0.508
## 2 0.138
## 3 0.354
## Name: race, dtype: float64
# two-way frequency or cross-tab
pd.crosstab(pbwt['race'], pbwt['smoke'],margins=True)
## smoke 0 1 All
## race
## 1 44 52 96
## 2 16 10 26
## 3 55 12 67
## All 115 74 189
# 3 largest values of variable age
#pbwt.nlargest(3, 'age')
pbwt['age'].nlargest(3)
## 129 45
## 22 36
## 88 36
## Name: age, dtype: int64
# rows for 3 largest values for lwt
pbwt.nlargest(3, ['lwt', 'bwt'] )
## low age lwt race smoke ptl ht ui ftv bwt
## 67 0 28 250 3 1 0 0 0 6 3303
## 105 0 25 241 2 0 0 1 0 0 3790
## 92 0 19 235 1 1 0 1 0 0 3629
max values of each variablepbwt.max(axis=0).sort_values(ascending=False)
## bwt 4990.0
## lwt 250.0
## age 45.0
## ftv 6.0
## ptl 3.0
## race 3.0
## ui 1.0
## ht 1.0
## smoke 1.0
## low 1.0
## dtype: float64
# barplot for max values of the variables
pbwt.max(axis=0).sort_values(ascending=True).plot(kind='barh', color='red')
plt.show()
min values of each variablepbwt.min(axis=0).sort_values(ascending=False)
## bwt 709.0
## lwt 80.0
## age 14.0
## race 1.0
## ftv 0.0
## ui 0.0
## ht 0.0
## ptl 0.0
## smoke 0.0
## low 0.0
## dtype: float64
# barplot for min values of the variables
pbwt.min(axis=0).sort_values(ascending=True).plot(kind='barh', color='red')
plt.show()
pbwt.groupby('race').size()
## race
## 1 96
## 2 26
## 3 67
## dtype: int64
pbwt.groupby('race').size()*100/len(pbwt)
## race
## 1 50.794
## 2 13.757
## 3 35.450
## dtype: float64
pbwt.groupby('race').agg({'age':'mean'})
# pbwt.groupby('race').agg({'age':['mean','std']})
# pbwt.groupby('race')['age'].agg(['mean', 'std'])
## age
## race
## 1 24.292
## 2 21.538
## 3 22.388
# calculate multiple stats for age following grouping
pbwt.groupby('race').agg({'age':['size', 'mean', 'std', 'min', 'max']})
# pbwt.groupby('race').agg({'age':[np.size, np.mean, np.std, np.min, np.max]})
# pbwt.groupby('race')['age'].agg(['size', 'mean', 'std', 'min', 'max'])
# pbwt.groupby('race')['age'].agg([np.size, np.mean, np.std, np.min, np.max])
## age
## size mean std min max
## race
## 1 96 24.292 5.655 14 45
## 2 26 21.538 5.109 15 35
## 3 67 22.388 4.536 14 33
## Group and then summarize a continuous variable
pbwt.groupby('race')['age'].describe()
## count mean std min 25% 50% 75% max
## race
## 1 96.0 24.292 5.655 14.0 20.00 23.5 29.0 45.0
## 2 26.0 21.538 5.109 15.0 17.25 20.5 24.0 35.0
## 3 67.0 22.388 4.536 14.0 19.00 22.0 25.0 33.0
pbwt.groupby('race')[['age', 'lwt']].agg(['mean', 'std'])
## age lwt
## mean std mean std
## race
## 1 24.292 5.655 132.052 29.094
## 2 21.538 5.109 146.808 39.639
## 3 22.388 4.536 120.015 25.130
pbwt.groupby('race')[['age', 'lwt']].describe()
## age ... lwt
## count mean std min 25% ... min 25% 50% 75% max
## race ...
## 1 96.0 24.292 5.655 14.0 20.00 ... 90.0 112.0 129.5 143.25 235.0
## 2 26.0 21.538 5.109 15.0 17.25 ... 98.0 120.0 129.0 179.00 241.0
## 3 67.0 22.388 4.536 14.0 19.00 ... 80.0 105.0 119.0 130.00 250.0
##
## [3 rows x 16 columns]
# group-by and pass a dictionary or dictionaries to aggregate
pbwt.groupby('race').agg({'age':['mean', 'std', 'min', 'max'],
'bwt':['mean', 'std', 'min', 'max']})
# pbwt.groupby('race').agg({'age':[np.mean, np.std, np.min, np.max], 'bwt':[np.mean, np.std, np.min, np.max]})
## age bwt
## mean std min max mean std min max
## race
## 1 24.292 5.655 14 45 3102.719 727.886 1021 4990
## 2 21.538 5.109 15 35 2719.692 638.684 1135 3860
## 3 22.388 4.536 14 33 2805.284 722.194 709 4054
# calculate mean for continuous variables following grouping
pbwt.groupby('race').agg({'age':'mean', 'bwt':'mean', 'lwt':'mean'})
## age bwt lwt
## race
## 1 24.292 3102.719 132.052
## 2 21.538 2719.692 146.808
## 3 22.388 2805.284 120.015
# summary stats for all continuous varibales following grouping
pbwt.groupby('race').describe()
## age ... bwt
## count mean std min 25% ... min 25% 50% 75% max
## race ...
## 1 96.0 24.292 5.655 14.0 20.00 ... 1021.0 2584.75 3062.0 3651.0 4990.0
## 2 26.0 21.538 5.109 15.0 17.25 ... 1135.0 2370.50 2849.0 3057.0 3860.0
## 3 67.0 22.388 4.536 14.0 19.00 ... 709.0 2313.00 2835.0 3274.0 4054.0
##
## [3 rows x 24 columns]
## Group by two categorical variables and then summarize continuous variables
pbwt.groupby(['race', 'smoke']).agg({'age':'mean', 'bwt':'mean', 'lwt':'mean'})
## age bwt lwt
## race smoke
## 1 0 26.023 3428.750 138.841
## 1 22.827 2826.846 126.308
## 2 0 19.938 2854.500 149.438
## 1 24.100 2504.000 142.600
## 3 0 22.364 2815.782 119.145
## 1 22.500 2757.167 124.000
## Group by two categorical variables and then summarize continuous variables
pbwt.groupby(['race', 'smoke'])[['age', 'bwt', 'lwt']].aggregate([np.size, np.mean, np.std, np.min, np.max])
## age ... lwt
## size mean std amin amax ... size mean std amin amax
## race smoke ...
## 1 0 44 26.023 6.017 14 45 ... 44 138.841 25.700 100 202
## 1 52 22.827 4.926 16 33 ... 52 126.308 30.764 90 235
## 2 0 16 19.938 3.890 15 27 ... 16 149.438 43.408 98 241
## 1 10 24.100 5.953 18 35 ... 10 142.600 34.529 105 187
## 3 0 55 22.364 4.453 14 33 ... 55 119.145 19.044 85 169
## 1 12 22.500 5.108 14 31 ... 12 124.000 44.583 80 250
##
## [6 rows x 15 columns]
## Group by two categorical variables and then summarize continuous variables
pbwt.groupby(['race', 'smoke']).describe()
## age ... bwt
## count mean std min ... 25% 50% 75% max
## race smoke ...
## 1 0 44.0 26.023 6.017 14.0 ... 3062.00 3593.0 3873.00 4990.0
## 1 52.0 22.827 4.926 16.0 ... 2410.00 2775.5 3189.50 4238.0
## 2 0 16.0 19.938 3.890 15.0 ... 2480.75 2920.0 3331.25 3860.0
## 1 10.0 24.100 5.953 18.0 ... 2313.75 2381.0 2941.00 3444.0
## 3 0 55.0 22.364 4.453 14.0 ... 2313.00 2807.0 3253.00 4054.0
## 1 12.0 22.500 5.108 14.0 ... 2402.25 3146.5 3307.50 3572.0
##
## [6 rows x 24 columns]
# calculate mean age, bwt and lwt following grouping
pbwt.groupby(['race', 'smoke', 'ht']).agg({'age':'mean', 'bwt':'mean', 'lwt':'mean'})
## age bwt lwt
## race smoke ht
## 1 0 0 26.116 3436.395 139.279
## 1 22.000 3100.000 120.000
## 1 0 23.062 2819.292 121.958
## 1 20.000 2917.500 178.500
## 2 0 0 19.786 2813.357 143.429
## 1 21.000 3142.500 191.500
## 1 0 23.000 2656.111 137.667
## 1 34.000 1135.000 187.000
## 3 0 0 22.216 2874.824 118.961
## 1 24.250 2063.000 121.500
## 1 0 22.500 2757.167 124.000
## 1 NaN NaN NaN
# group-by and pass a dictionary to aggregate
pbwt.groupby(['race', 'smoke', 'ht']).agg({'age':[np.size, np.mean, np.std, np.min, np.max]})
## age
## size mean std amin amax
## race smoke ht
## 1 0 0 43.0 26.116 6.056 14.0 45.0
## 1 1.0 22.000 NaN 22.0 22.0
## 1 0 48.0 23.062 5.050 16.0 33.0
## 1 4.0 20.000 1.155 19.0 21.0
## 2 0 0 14.0 19.786 3.847 15.0 27.0
## 1 2.0 21.000 5.657 17.0 25.0
## 1 0 9.0 23.000 5.123 18.0 35.0
## 1 1.0 34.000 NaN 34.0 34.0
## 3 0 0 51.0 22.216 4.575 14.0 33.0
## 1 4.0 24.250 1.708 22.0 26.0
## 1 0 12.0 22.500 5.108 14.0 31.0
## 1 NaN NaN NaN NaN NaN
# group-by and pass dictionaries to aggregate
pbwt.groupby(['race', 'smoke', 'ht']).agg({'age':[np.mean, np.std, np.min, np.max],
'bwt':[np.mean, np.std, np.min, np.max]})
## age bwt
## mean std amin amax mean std amin amax
## race smoke ht
## 1 0 0 26.116 6.056 14.0 45.0 3436.395 716.668 1021.0 4990.0
## 1 22.000 NaN 22.0 22.0 3100.000 NaN 3100.0 3100.0
## 1 0 23.062 5.050 16.0 33.0 2819.292 607.071 1818.0 4238.0
## 1 20.000 1.155 19.0 21.0 2917.500 941.466 1790.0 3756.0
## 2 0 0 19.786 3.847 15.0 27.0 2813.357 605.186 1701.0 3860.0
## 1 21.000 5.657 17.0 25.0 3142.500 915.703 2495.0 3790.0
## 1 0 23.000 5.123 18.0 35.0 2656.111 443.025 2126.0 3444.0
## 1 34.000 NaN 34.0 34.0 1135.000 NaN 1135.0 1135.0
## 3 0 0 22.216 4.575 14.0 33.0 2874.824 684.999 1474.0 4054.0
## 1 24.250 1.708 22.0 26.0 2063.000 649.572 1330.0 2751.0
## 1 0 22.500 5.108 14.0 31.0 2757.167 810.045 709.0 3572.0
## 1 NaN NaN NaN NaN NaN NaN NaN NaN
# sums only the continous variables after grouping
pbwt.groupby('race').sum()
## age lwt bwt
## race
## 1 2332 12677 297861
## 2 560 3817 70712
## 3 1500 8041 187954
pbwt.groupby('race')['smoke'].describe()
## count unique top freq
## race
## 1 96 2 1 52
## 2 26 2 0 16
## 3 67 2 0 55
pbwt.groupby('race').describe(include='category')
## low smoke ... ui ftv
## count unique top freq count unique ... top freq count unique top freq
## race ...
## 1 96 2 0 73 96 2 ... 0 83 96 5 0 43
## 2 26 2 0 15 26 2 ... 0 23 26 5 0 14
## 3 67 2 0 42 67 2 ... 0 55 67 6 0 43
##
## [3 rows x 24 columns]
# max values for age in each racial group
pbwt.pivot_table(values='age', columns='race', aggfunc='max')
#pbwt.pivot_table(values='age', columns='race', aggfunc=np.max)
## race 1 2 3
## age 45 35 33
# min age in racial groups
pbwt.pivot_table(values='age', columns='race', aggfunc=np.min)
## race 1 2 3
## age 14 15 14
# mean age values in each racial groups
pbwt.pivot_table(values='age', columns='race', aggfunc=np.mean)
## race 1 2 3
## age 24.292 21.538 22.388
is in() function
pbwt['race'].isin([1]).sum()
#pbwt['race'].isin([2]).sum()
#pbwt['race'].isin([3]).sum()
## 96
pbwt['race'].value_counts()
## 1 96
## 3 67
## 2 26
## Name: race, dtype: int64
pbwt.assign(rn=pbwt.sort_values(['age'], ascending=False).groupby(['race']).cumcount() + 1).query('rn < 3').sort_values(['race', 'rn'])
## low age lwt race smoke ptl ht ui ftv bwt rn
## 129 0 45 123 1 0 0 0 0 1 4990 1
## 88 0 36 175 1 0 0 0 0 0 3600 2
## 32 0 35 121 2 1 1 0 0 1 2948 1
## 132 1 34 187 2 1 0 1 0 0 1135 2
## 1 0 33 155 3 0 0 0 0 3 2551 1
## 20 0 32 121 3 0 0 0 0 2 2835 2
rename function
pbwt.rename(columns={'race': 'racegr', 'smoke':'smokegr'}).head()
## low age lwt racegr smokegr ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
## 4 0 18 107 1 1 0 0 1 0 2600
assign function
pbwt['age'].mean()
## 23.238095238095237
# subtract and create a new variable
pbwt.assign(age_centered=pbwt['age']-23.23).head()
# pbwt.assign(age_centered=pbwt['age']-pbwt['age'].mean()).head()
## low age lwt race smoke ptl ht ui ftv bwt age_centered
## 0 0 19 182 2 0 0 0 1 0 2523 -4.23
## 1 0 33 155 3 0 0 0 0 3 2551 9.77
## 2 0 20 105 1 1 0 0 0 1 2557 -3.23
## 3 0 21 108 1 1 0 0 1 2 2594 -2.23
## 4 0 18 107 1 1 0 0 1 0 2600 -5.23
# create two variables
pbwt.assign(age_centered=pbwt['age']-pbwt['age'].mean(), lwt_centered=pbwt['lwt']-pbwt['lwt'].mean() )
## low age lwt race smoke ptl ht ui ftv bwt age_centered lwt_centered
## 0 0 19 182 2 0 0 0 1 0 2523 -4.238 52.185
## 1 0 33 155 3 0 0 0 0 3 2551 9.762 25.185
## 2 0 20 105 1 1 0 0 0 1 2557 -3.238 -24.815
## 3 0 21 108 1 1 0 0 1 2 2594 -2.238 -21.815
## 4 0 18 107 1 1 0 0 1 0 2600 -5.238 -22.815
## .. .. ... ... ... ... .. .. .. .. ... ... ...
## 184 1 28 95 1 1 0 0 0 2 2466 4.762 -34.815
## 185 1 14 100 3 0 0 0 0 2 2495 -9.238 -29.815
## 186 1 23 94 3 1 0 0 0 0 2495 -0.238 -35.815
## 187 1 17 142 2 0 0 1 0 0 2495 -6.238 12.185
## 188 1 21 130 1 1 0 1 0 3 2495 -2.238 0.185
##
## [189 rows x 12 columns]
# multiply and create a new variable
pbwt.assign(age_lwt = pbwt['age'] * pbwt['lwt'])
## low age lwt race smoke ptl ht ui ftv bwt age_lwt
## 0 0 19 182 2 0 0 0 1 0 2523 3458
## 1 0 33 155 3 0 0 0 0 3 2551 5115
## 2 0 20 105 1 1 0 0 0 1 2557 2100
## 3 0 21 108 1 1 0 0 1 2 2594 2268
## 4 0 18 107 1 1 0 0 1 0 2600 1926
## .. .. ... ... ... ... .. .. .. .. ... ...
## 184 1 28 95 1 1 0 0 0 2 2466 2660
## 185 1 14 100 3 0 0 0 0 2 2495 1400
## 186 1 23 94 3 1 0 0 0 0 2495 2162
## 187 1 17 142 2 0 0 1 0 0 2495 2414
## 188 1 21 130 1 1 0 1 0 3 2495 2730
##
## [189 rows x 11 columns]
pbwt.head()
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
## 4 0 18 107 1 1 0 0 1 0 2600
eval function
pbwt.eval('age- 23.23')
## 0 -4.23
## 1 9.77
## 2 -3.23
## 3 -2.23
## 4 -5.23
## ...
## 184 4.77
## 185 -9.23
## 186 -0.23
## 187 -6.23
## 188 -2.23
## Length: 189, dtype: float64
# pbwt['age']+pbwt['lwt']
# pbwt.age+pbwt.lwt
pbwt['age_plus_lwt'] = pbwt.eval('age+lwt')
pbwt.head()
## low age lwt race smoke ptl ht ui ftv bwt age_plus_lwt
## 0 0 19 182 2 0 0 0 1 0 2523 201
## 1 0 33 155 3 0 0 0 0 3 2551 188
## 2 0 20 105 1 1 0 0 0 1 2557 125
## 3 0 21 108 1 1 0 0 1 2 2594 129
## 4 0 18 107 1 1 0 0 1 0 2600 125
transform function
# replace values with mean values of group
pbwt.groupby('race')['age'].transform('mean')
## 0 21.538
## 1 22.388
## 2 24.292
## 3 24.292
## 4 24.292
## ...
## 184 24.292
## 185 22.388
## 186 22.388
## 187 21.538
## 188 24.292
## Name: age, Length: 189, dtype: float64
pbwt['age_minus_race_group_avg_age'] = pbwt['age'] - pbwt.groupby('race')['age'].transform('mean')
pbwt.head()
## low age lwt race ... ftv bwt age_plus_lwt age_minus_race_group_avg_age
## 0 0 19 182 2 ... 0 2523 201 -2.538
## 1 0 33 155 3 ... 3 2551 188 10.612
## 2 0 20 105 1 ... 1 2557 125 -4.292
## 3 0 21 108 1 ... 2 2594 129 -3.292
## 4 0 18 107 1 ... 0 2600 125 -6.292
##
## [5 rows x 12 columns]
pbwt.sort_values('age', ascending=False)
## low age lwt race ... ftv bwt age_plus_lwt age_minus_race_group_avg_age
## 129 0 45 123 1 ... 1 4990 168 20.708
## 88 0 36 175 1 ... 0 3600 211 11.708
## 22 0 36 202 1 ... 1 2836 238 11.708
## 126 0 35 170 1 ... 1 4174 205 10.708
## 32 0 35 121 2 ... 1 2948 156 13.462
## .. .. ... ... ... ... .. ... ... ...
## 173 1 15 115 3 ... 0 2381 130 -7.388
## 169 1 15 110 1 ... 0 2353 125 -9.292
## 116 0 14 135 1 ... 0 3941 149 -10.292
## 183 1 14 101 3 ... 0 2466 115 -8.388
## 185 1 14 100 3 ... 2 2495 114 -8.388
##
## [189 rows x 12 columns]
# for values of age less than 70, add 10
pbwt.loc[pbwt['age'] < 70, 'age'] += 10
pbwt
## low age lwt race ... ftv bwt age_plus_lwt age_minus_race_group_avg_age
## 0 0 29 182 2 ... 0 2523 201 -2.538
## 1 0 43 155 3 ... 3 2551 188 10.612
## 2 0 30 105 1 ... 1 2557 125 -4.292
## 3 0 31 108 1 ... 2 2594 129 -3.292
## 4 0 28 107 1 ... 0 2600 125 -6.292
## .. .. ... ... ... ... .. ... ... ...
## 184 1 38 95 1 ... 2 2466 123 3.708
## 185 1 24 100 3 ... 2 2495 114 -8.388
## 186 1 33 94 3 ... 0 2495 117 0.612
## 187 1 27 142 2 ... 0 2495 159 -4.538
## 188 1 31 130 1 ... 3 2495 151 -3.292
##
## [189 rows x 12 columns]
# For values of age less than 60, subtract 10
pbwt.loc[pbwt['age'] < 60, 'age'] -= 10
pbwt.sort_values('age', ascending=False)
## low age lwt race ... ftv bwt age_plus_lwt age_minus_race_group_avg_age
## 129 0 45 123 1 ... 1 4990 168 20.708
## 88 0 36 175 1 ... 0 3600 211 11.708
## 22 0 36 202 1 ... 1 2836 238 11.708
## 126 0 35 170 1 ... 1 4174 205 10.708
## 32 0 35 121 2 ... 1 2948 156 13.462
## .. .. ... ... ... ... .. ... ... ...
## 173 1 15 115 3 ... 0 2381 130 -7.388
## 169 1 15 110 1 ... 0 2353 125 -9.292
## 116 0 14 135 1 ... 0 3941 149 -10.292
## 183 1 14 101 3 ... 0 2466 115 -8.388
## 185 1 14 100 3 ... 2 2495 114 -8.388
##
## [189 rows x 12 columns]
pbwt['bwt'].plot(kind='hist')
plt.show()
cutbins
# cut to 4 bins
#pd.cut(pbwt['bwt'], bins=4)
pd.cut(pbwt['bwt'], 4)
## 0 (1779.25, 2849.5]
## 1 (1779.25, 2849.5]
## 2 (1779.25, 2849.5]
## 3 (1779.25, 2849.5]
## 4 (1779.25, 2849.5]
## ...
## 184 (1779.25, 2849.5]
## 185 (1779.25, 2849.5]
## 186 (1779.25, 2849.5]
## 187 (1779.25, 2849.5]
## 188 (1779.25, 2849.5]
## Name: bwt, Length: 189, dtype: category
## Categories (4, interval[float64]): [(704.719, 1779.25] < (1779.25, 2849.5] < (2849.5, 3919.75] <
## (3919.75, 4990.0]]
# count values
pd.cut(pbwt['bwt'], bins=4).value_counts()
## (2849.5, 3919.75] 91
## (1779.25, 2849.5] 73
## (3919.75, 4990.0] 16
## (704.719, 1779.25] 9
## Name: bwt, dtype: int64
# assign a variable to the cut output
pbwt['bwt_cut'] =pd.cut(pbwt['bwt'], [500, 1000, 2000, 3000, 4000, 5000])
pbwt
## low age lwt ... age_plus_lwt age_minus_race_group_avg_age bwt_cut
## 0 0 19 182 ... 201 -2.538 (2000, 3000]
## 1 0 33 155 ... 188 10.612 (2000, 3000]
## 2 0 20 105 ... 125 -4.292 (2000, 3000]
## 3 0 21 108 ... 129 -3.292 (2000, 3000]
## 4 0 18 107 ... 125 -6.292 (2000, 3000]
## .. .. ... ... ... ... ... ...
## 184 1 28 95 ... 123 3.708 (2000, 3000]
## 185 1 14 100 ... 114 -8.388 (2000, 3000]
## 186 1 23 94 ... 117 0.612 (2000, 3000]
## 187 1 17 142 ... 159 -4.538 (2000, 3000]
## 188 1 21 130 ... 151 -3.292 (2000, 3000]
##
## [189 rows x 13 columns]
pbwt['bwt_cut'].value_counts()
## (3000, 4000] 83
## (2000, 3000] 78
## (1000, 2000] 18
## (4000, 5000] 9
## (500, 1000] 1
## Name: bwt_cut, dtype: int64
pd.crosstab(pbwt['bwt_cut'], pbwt['bwt'])
## bwt 709 1021 1135 1330 1474 ... 4167 4174 4238 4593 4990
## bwt_cut ...
## (500, 1000] 1 0 0 0 0 ... 0 0 0 0 0
## (1000, 2000] 0 1 1 1 1 ... 0 0 0 0 0
## (2000, 3000] 0 0 0 0 0 ... 0 0 0 0 0
## (3000, 4000] 0 0 0 0 0 ... 0 0 0 0 0
## (4000, 5000] 0 0 0 0 0 ... 1 1 1 1 1
##
## [5 rows x 131 columns]
qcut
pd.qcut(pbwt['bwt'], q=4, precision=0)
## 0 (2414.0, 2977.0]
## 1 (2414.0, 2977.0]
## 2 (2414.0, 2977.0]
## 3 (2414.0, 2977.0]
## 4 (2414.0, 2977.0]
## ...
## 184 (2414.0, 2977.0]
## 185 (2414.0, 2977.0]
## 186 (2414.0, 2977.0]
## 187 (2414.0, 2977.0]
## 188 (2414.0, 2977.0]
## Name: bwt, Length: 189, dtype: category
## Categories (4, interval[float64]): [(708.0, 2414.0] < (2414.0, 2977.0] < (2977.0, 3487.0] <
## (3487.0, 4990.0]]
pd.qcut(pbwt['bwt'], q=4, precision=0).value_counts()
## (2414.0, 2977.0] 49
## (708.0, 2414.0] 48
## (3487.0, 4990.0] 47
## (2977.0, 3487.0] 45
## Name: bwt, dtype: int64
pbwt['low4ql'] = pd.qcut(pbwt['bwt'], q=4, labels=['1q_low', '2q_low', '3q_low', '4q_low'])
pbwt.head()
## low age lwt ... age_minus_race_group_avg_age bwt_cut low4ql
## 0 0 19 182 ... -2.538 (2000, 3000] 2q_low
## 1 0 33 155 ... 10.612 (2000, 3000] 2q_low
## 2 0 20 105 ... -4.292 (2000, 3000] 2q_low
## 3 0 21 108 ... -3.292 (2000, 3000] 2q_low
## 4 0 18 107 ... -6.292 (2000, 3000] 2q_low
##
## [5 rows x 14 columns]
pbwt['low4ql'].value_counts()
## 2q_low 49
## 1q_low 48
## 4q_low 47
## 3q_low 45
## Name: low4ql, dtype: int64
map() function (Without initializing a variable)
pbwtr=pbwt.copy()
pbwt['race'].value_counts(sort=False)
## 1 96
## 2 26
## 3 67
## Name: race, dtype: int64
recode = {1: 0, 2: 1, 3: 2}
pbwt['race']= pbwt['race'].map(recode)
pbwt.head()
## low age lwt ... age_minus_race_group_avg_age bwt_cut low4ql
## 0 0 19 182 ... -2.538 (2000, 3000] 2q_low
## 1 0 33 155 ... 10.612 (2000, 3000] 2q_low
## 2 0 20 105 ... -4.292 (2000, 3000] 2q_low
## 3 0 21 108 ... -3.292 (2000, 3000] 2q_low
## 4 0 18 107 ... -6.292 (2000, 3000] 2q_low
##
## [5 rows x 14 columns]
pbwt['race'].value_counts()
## 0 96
## 2 67
## 1 26
## Name: race, dtype: int64
# note the diffrence is assignment and equality
# initialize a variable named `raceg`
pbwtr['raceg']=99
# when race is 1, racegr is 0
pbwtr['raceg'][pbwtr['race']==1]=0
# when race is 2, racg is 1
pbwtr['raceg'][pbwtr['race']==2]=1
# when race is 3, racg is 2
pbwtr['raceg'][pbwtr['race']==3]=2
pbwtr.head()
## low age lwt race ... age_minus_race_group_avg_age bwt_cut low4ql raceg
## 0 0 19 182 2 ... -2.538 (2000, 3000] 2q_low 1
## 1 0 33 155 3 ... 10.612 (2000, 3000] 2q_low 2
## 2 0 20 105 1 ... -4.292 (2000, 3000] 2q_low 0
## 3 0 21 108 1 ... -3.292 (2000, 3000] 2q_low 0
## 4 0 18 107 1 ... -6.292 (2000, 3000] 2q_low 0
##
## [5 rows x 15 columns]
pbwtr['raceg'].value_counts(sort=False)
## 0 96
## 1 26
## 2 67
## Name: raceg, dtype: int64
np.where (If……then logic)
# Unordered Categoricals can only compare equality or not
np.where(pbwt['low'] == 0, 'high', 'low')
## array(['high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
## 'high', 'high', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
## 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
## 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
## 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
## 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
## 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low', 'low',
## 'low', 'low', 'low', 'low', 'low', 'low', 'low'], dtype='<U4')
# it is 'high' when values are 0, else it is 'low'
pbwt['lowc'] = np.where(pbwt['low'] == 0, 'high', 'low')
pbwt
## low age lwt ... bwt_cut low4ql lowc
## 0 0 19 182 ... (2000, 3000] 2q_low high
## 1 0 33 155 ... (2000, 3000] 2q_low high
## 2 0 20 105 ... (2000, 3000] 2q_low high
## 3 0 21 108 ... (2000, 3000] 2q_low high
## 4 0 18 107 ... (2000, 3000] 2q_low high
## .. .. ... ... ... ... ... ...
## 184 1 28 95 ... (2000, 3000] 2q_low low
## 185 1 14 100 ... (2000, 3000] 2q_low low
## 186 1 23 94 ... (2000, 3000] 2q_low low
## 187 1 17 142 ... (2000, 3000] 2q_low low
## 188 1 21 130 ... (2000, 3000] 2q_low low
##
## [189 rows x 15 columns]
pbwt['smokec'] = np.where(pbwt['smoke'] == 0, 'non smoker', 'smoker')
pbwt.head()
## low age lwt race ... bwt_cut low4ql lowc smokec
## 0 0 19 182 1 ... (2000, 3000] 2q_low high non smoker
## 1 0 33 155 2 ... (2000, 3000] 2q_low high non smoker
## 2 0 20 105 0 ... (2000, 3000] 2q_low high smoker
## 3 0 21 108 0 ... (2000, 3000] 2q_low high smoker
## 4 0 18 107 0 ... (2000, 3000] 2q_low high smoker
##
## [5 rows x 16 columns]
pbwtl=pbwtr.copy()
pbwtl['racename']=pbwtl['race']
race_dict={1:'white', 2:'black', 3:'other'}
pbwtl['racename'] = [race_dict[i] for i in pbwtl['racename']]
pbwtl.head()
## low age lwt race ... bwt_cut low4ql raceg racename
## 0 0 19 182 2 ... (2000, 3000] 2q_low 1 black
## 1 0 33 155 3 ... (2000, 3000] 2q_low 2 other
## 2 0 20 105 1 ... (2000, 3000] 2q_low 0 white
## 3 0 21 108 1 ... (2000, 3000] 2q_low 0 white
## 4 0 18 107 1 ... (2000, 3000] 2q_low 0 white
##
## [5 rows x 16 columns]
pbwtl['racename'].value_counts()
## white 96
## other 67
## black 26
## Name: racename, dtype: int64
replace() function
pbwtc = pbwt.copy()
pbwtc.head()
## low age lwt race ... bwt_cut low4ql lowc smokec
## 0 0 19 182 1 ... (2000, 3000] 2q_low high non smoker
## 1 0 33 155 2 ... (2000, 3000] 2q_low high non smoker
## 2 0 20 105 0 ... (2000, 3000] 2q_low high smoker
## 3 0 21 108 0 ... (2000, 3000] 2q_low high smoker
## 4 0 18 107 0 ... (2000, 3000] 2q_low high smoker
##
## [5 rows x 16 columns]
# copy and create new variables
pbwtc[['lowl', 'racel', 'smokel', 'ptll', 'htl', 'uil', 'ftvl']]=pbwtc[['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']].copy()
pbwtc.head()
## low age lwt race smoke ptl ht ... lowl racel smokel ptll htl uil ftvl
## 0 0 19 182 1 0 0 0 ... 0 1 0 0 0 1 0
## 1 0 33 155 2 0 0 0 ... 0 2 0 0 0 0 3
## 2 0 20 105 0 1 0 0 ... 0 0 1 0 0 0 1
## 3 0 21 108 0 1 0 0 ... 0 0 1 0 0 1 2
## 4 0 18 107 0 1 0 0 ... 0 0 1 0 0 1 0
##
## [5 rows x 23 columns]
pbwtc.replace({
'lowl': {0:'high', 1:'low'},
'racel': {1: 'white', 2: 'black', 3:'other'},
'smokel': {0:'non_smoker', 1:'smoker'},
'ptll': {0: 'no_ptl', 1:'one_ptl', 2:'two_ptl', 3:'three_ptl'},
'htl': {0:'no_hypert', 1:'hypert'},
'uil': {0:'no_uinfect', 1:'uinfect'},
'ftvl': {0:'no_DrVisit', 1:'one_DrVisit', 2: 'two_DrVisit', 3: 'three_DrVisit', 4: 'four_DrVisit', 6: 'six_DrVisit'}
},
inplace=True
)
pbwtc.head()
## low age lwt race ... ptll htl uil ftvl
## 0 0 19 182 1 ... no_ptl no_hypert uinfect no_DrVisit
## 1 0 33 155 2 ... no_ptl no_hypert no_uinfect three_DrVisit
## 2 0 20 105 0 ... no_ptl no_hypert no_uinfect one_DrVisit
## 3 0 21 108 0 ... no_ptl no_hypert uinfect two_DrVisit
## 4 0 18 107 0 ... no_ptl no_hypert uinfect no_DrVisit
##
## [5 rows x 23 columns]
pbwtc['ftvl'].value_counts()
## no_DrVisit 100
## one_DrVisit 47
## two_DrVisit 30
## three_DrVisit 7
## four_DrVisit 4
## six_DrVisit 1
## Name: ftvl, dtype: int64
pbwtc.dtypes
## low category
## age int64
## lwt int64
## race category
## smoke category
## ptl category
## ht category
## ui category
## ftv category
## bwt int64
## age_plus_lwt int64
## age_minus_race_group_avg_age float64
## bwt_cut category
## low4ql category
## lowc object
## smokec object
## lowl object
## racel object
## smokel object
## ptll object
## htl object
## uil object
## ftvl object
## dtype: object
# convert object data types
pbwtcn=pbwtc[['lowl', 'racel', 'smokel', 'ptll', 'htl', 'uil', 'ftvl']].astype('category')
pbwtcn.dtypes
## lowl category
## racel category
## smokel category
## ptll category
## htl category
## uil category
## ftvl category
## dtype: object
pbwtcn
## lowl racel smokel ptll htl uil ftvl
## 0 high white non_smoker no_ptl no_hypert uinfect no_DrVisit
## 1 high black non_smoker no_ptl no_hypert no_uinfect three_DrVisit
## 2 high 0 smoker no_ptl no_hypert no_uinfect one_DrVisit
## 3 high 0 smoker no_ptl no_hypert uinfect two_DrVisit
## 4 high 0 smoker no_ptl no_hypert uinfect no_DrVisit
## .. ... ... ... ... ... ... ...
## 184 low 0 smoker no_ptl no_hypert no_uinfect two_DrVisit
## 185 low black non_smoker no_ptl no_hypert no_uinfect two_DrVisit
## 186 low black smoker no_ptl no_hypert no_uinfect no_DrVisit
## 187 low white non_smoker no_ptl hypert no_uinfect no_DrVisit
## 188 low 0 smoker no_ptl hypert no_uinfect three_DrVisit
##
## [189 rows x 7 columns]
# use cat.codes
pbwtcn['lowcc'] = pbwtcn['lowl'].cat.codes
pbwtcn['racecc'] = pbwtcn['racel'].cat.codes
pbwtcn['smokecc'] = pbwtcn['smokel'].cat.codes
pbwtcn['ptlcc'] = pbwtcn['ptll'].cat.codes
pbwtcn['htcc'] = pbwtcn['htl'].cat.codes
pbwtcn['uicc'] = pbwtcn['uil'].cat.codes
pbwtcn['ftvcc'] = pbwtcn['ftvl'].cat.codes
pbwtcn.head()
## lowl racel smokel ptll ... ptlcc htcc uicc ftvcc
## 0 high white non_smoker no_ptl ... 0 1 1 1
## 1 high black non_smoker no_ptl ... 0 1 0 4
## 2 high 0 smoker no_ptl ... 0 1 0 2
## 3 high 0 smoker no_ptl ... 0 1 1 5
## 4 high 0 smoker no_ptl ... 0 1 1 1
##
## [5 rows x 14 columns]
pd.value_counts(pbwtc.ptl)
## 0 159
## 1 24
## 2 5
## 3 1
## Name: ptl, dtype: int64
# make a series for value counts
series = pd.value_counts(pbwtc.ptl)
series
## 0 159
## 1 24
## 2 5
## 3 1
## Name: ptl, dtype: int64
# assign a True value for levels less than 20 percent
mask = (series/series.sum() * 100).lt(20)
mask
## 0 False
## 1 True
## 2 True
## 3 True
## Name: ptl, dtype: bool
# create a categorical index for TRUE values
series[mask].index
## CategoricalIndex([1, 2, 3], categories=[0, 1, 2, 3], ordered=False, dtype='category')
# To replace df['column'] use np.where
# for those meet the conditions assign a value of 1
pbwtc['ptln'] = np.where(pbwtc['ptl'].isin(series[mask].index),1, pbwtc['ptl'])
pbwtc['ptln'].value_counts()
## 0 159
## 1 30
## Name: ptln, dtype: int64
pbwtc['ftv'].value_counts()
## 0 100
## 1 47
## 2 30
## 3 7
## 4 4
## 6 1
## Name: ftv, dtype: int64
series = pd.value_counts(pbwtc['ftv'])
mask = (series/series.sum() * 100).lt(20)
## assign a value of 2 for levels less than 20 percent
pbwtc['ftvn'] = np.where(pbwtc['ftv'].isin(series[mask].index),2, pbwtc['ftv'])
pbwtc['ftvn'].value_counts()
## 0 100
## 1 47
## 2 42
## Name: ftvn, dtype: int64
pandas.get_dummies
# dtypes are integrers
pbwtd=pbwt.copy()
# get dummies for ftv
ftv_dummies=pd.get_dummies(pbwtd['ftv'], prefix='f')
ftv_dummies.head()
## f_0 f_1 f_2 f_3 f_4 f_6
## 0 1 0 0 0 0 0
## 1 0 0 0 1 0 0
## 2 0 1 0 0 0 0
## 3 0 0 1 0 0 0
## 4 1 0 0 0 0 0
# get dummies for ptl
ptl_dummies=pd.get_dummies(pbwtd['ptl'], prefix='p')
ptl_dummies.head()
## p_0 p_1 p_2 p_3
## 0 1 0 0 0
## 1 1 0 0 0
## 2 1 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
# Put dummy vars in dataframe
pbwte = pd.concat([pbwtd, ptl_dummies, ftv_dummies], axis=1)
pbwte.head()
## low age lwt race smoke ptl ht ui ... p_2 p_3 f_0 f_1 f_2 f_3 f_4 f_6
## 0 0 19 182 1 0 0 0 1 ... 0 0 1 0 0 0 0 0
## 1 0 33 155 2 0 0 0 0 ... 0 0 0 0 0 1 0 0
## 2 0 20 105 0 1 0 0 0 ... 0 0 0 1 0 0 0 0
## 3 0 21 108 0 1 0 0 1 ... 0 0 0 0 1 0 0 0
## 4 0 18 107 0 1 0 0 1 ... 0 0 1 0 0 0 0 0
##
## [5 rows x 26 columns]
# Drop the original class columns and one dummy var to prevent multicollinearity
pbwte = pbwte.drop(['ptl', 'ftv', 'p_0', 'f_0'], axis=1)
pbwte.head()
## low age lwt race smoke ht ui bwt ... p_1 p_2 p_3 f_1 f_2 f_3 f_4 f_6
## 0 0 19 182 1 0 0 1 2523 ... 0 0 0 0 0 0 0 0
## 1 0 33 155 2 0 0 0 2551 ... 0 0 0 0 0 1 0 0
## 2 0 20 105 0 1 0 0 2557 ... 0 0 0 1 0 0 0 0
## 3 0 21 108 0 1 0 1 2594 ... 0 0 0 0 1 0 0 0
## 4 0 18 107 0 1 0 1 2600 ... 0 0 0 0 0 0 0 0
##
## [5 rows x 22 columns]
pd.get_dummies(pbwtd, columns=['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv'], prefix=['l', 'r', 's', 'p', 'h', 'u', 'f']).head()
## age lwt bwt age_plus_lwt ... f_2 f_3 f_4 f_6
## 0 19 182 2523 201 ... 0 0 0 0
## 1 33 155 2551 188 ... 0 1 0 0
## 2 20 105 2557 125 ... 0 0 0 0
## 3 21 108 2594 129 ... 1 0 0 0
## 4 18 107 2600 125 ... 0 0 0 0
##
## [5 rows x 30 columns]
pd.get_dummies(pbwtd, columns=['low', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']).head()
## age lwt bwt age_plus_lwt ... ftv_2 ftv_3 ftv_4 ftv_6
## 0 19 182 2523 201 ... 0 0 0 0
## 1 33 155 2551 188 ... 0 1 0 0
## 2 20 105 2557 125 ... 0 0 0 0
## 3 21 108 2594 129 ... 1 0 0 0
## 4 18 107 2600 125 ... 0 0 0 0
##
## [5 rows x 30 columns]
pd.get_dummies(pbwtc, columns=['lowl', 'racel', 'smokel', 'ptll', 'htl', 'uil', 'ftvl']).head()
## low age lwt ... ftvl_six_DrVisit ftvl_three_DrVisit ftvl_two_DrVisit
## 0 0 19 182 ... 0 0 0
## 1 0 33 155 ... 0 1 0
## 2 0 20 105 ... 0 0 0
## 3 0 21 108 ... 0 0 1
## 4 0 18 107 ... 0 0 0
##
## [5 rows x 39 columns]
sklearn.preprocessing.LabelEncoder
pbwtf=pbwtc[['age', 'lwt', 'race', 'smoke', 'ptll', 'ht', 'ui', 'ftvl' ]]
pbwtf
## age lwt race smoke ptll ht ui ftvl
## 0 19 182 1 0 no_ptl 0 1 no_DrVisit
## 1 33 155 2 0 no_ptl 0 0 three_DrVisit
## 2 20 105 0 1 no_ptl 0 0 one_DrVisit
## 3 21 108 0 1 no_ptl 0 1 two_DrVisit
## 4 18 107 0 1 no_ptl 0 1 no_DrVisit
## .. ... ... ... ... ... .. .. ...
## 184 28 95 0 1 no_ptl 0 0 two_DrVisit
## 185 14 100 2 0 no_ptl 0 0 two_DrVisit
## 186 23 94 2 1 no_ptl 0 0 no_DrVisit
## 187 17 142 1 0 no_ptl 1 0 no_DrVisit
## 188 21 130 0 1 no_ptl 1 0 three_DrVisit
##
## [189 rows x 8 columns]
label_encoder = LabelEncoder()
pbwtf['ptll'].value_counts()
## no_ptl 159
## one_ptl 24
## two_ptl 5
## three_ptl 1
## Name: ptll, dtype: int64
ptl_encoded = label_encoder.fit_transform(pbwtf['ptll'])
ptl_df = pd.DataFrame(ptl_encoded, columns=['Preterm_DrVisit'])
ptl_df.head()
## Preterm_DrVisit
## 0 0
## 1 0
## 2 0
## 3 0
## 4 0
sklearn.preprocessing.OneHotEncoder
binary_encoder = OneHotEncoder(categories='auto')
ptl_encoded = label_encoder.fit_transform(pbwtf['ptll'])
ptl_1hot = binary_encoder.fit_transform(ptl_encoded.reshape(-1,1))
ptl_1hot_mat = ptl_1hot.toarray()
ptl_DF = pd.DataFrame(ptl_1hot_mat, columns = ['no_ptl', 'one_ptl', 'two_ptl', 'three_ptl' ])
ptl_DF.head()
## no_ptl one_ptl two_ptl three_ptl
## 0 1.0 0.0 0.0 0.0
## 1 1.0 0.0 0.0 0.0
## 2 1.0 0.0 0.0 0.0
## 3 1.0 0.0 0.0 0.0
## 4 1.0 0.0 0.0 0.0