References, resources and libraries
matplotlib: https://matplotlib.org/
seaborn: https://seaborn.pydata.org/
seaborn axis_grids: https://seaborn.pydata.org/tutorial/axis_grids.html
library(reticulate)
import sys
print(sys.version)
## 3.7.5 (default, Oct 31 2019, 15:18:51) [MSC v.1916 64 bit (AMD64)]
import sys
import warnings

# Silence library warnings for the rest of the session, but only when the
# user has not requested explicit warning handling on the command line
# (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.use('ps') # generate postscript output by default
import seaborn as sb
sb.set_style('whitegrid')
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import scale
# Use fully-qualified option names. The bare 'precision' pattern matches more
# than one option on pandas >= 1.4 (e.g. 'styler.format.precision') and raises
# OptionError there; the 'display.' names work on old and new pandas alike.
pd.set_option('display.precision', 3)  # decimal places shown for floats
pd.set_option('display.expand_frame_repr', True)  # wrap wide frames over several lines
#pd.set_option('max_colwidth', -1)
Describe the variables
1 |
low |
indicator of birth weight less than 2.5 kg |
0, 1 |
2 |
age |
mother’s age in years |
continuous variable |
3 |
lwt |
mother’s weight in pounds at last menstrual period |
continuous variable |
4 |
race |
mother’s race (1 = white, 2 = black, 3 = other) |
1, 2, 3 |
5 |
smoke |
smoking status during pregnancy |
0, 1 |
6 |
ptl |
number of previous premature labours |
0, 1, 2, 3 |
7 |
ht |
history of hypertension |
0, 1 |
8 |
ui |
presence of uterine irritability |
0, 1 |
9 |
ftv |
number of physician visits during the first trimester |
0, 1, 2, 3, 4, 6 |
10 |
bwt |
birth weight in grams |
continuous variable |
The continuous outcome variable is bwt. Its related variables are: age, lwt, race, smoke, ptl, ht, ui and ftv.
Import and glimpse the data
# The 'birthwt' data must first be saved as 'birthwt.csv' in the working
# directory; it is then loaded into the DataFrame `pbwt` with pandas' read_csv.
pbwt=pd.read_csv('birthwt.csv')
Data types
# data types: integer/object/category/floating-point
pbwt.dtypes
## low int64
## age int64
## lwt int64
## race int64
## smoke int64
## ptl int64
## ht int64
## ui int64
## ftv int64
## bwt int64
## dtype: object
First five rows of data
pbwt.head()
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
## 4 0 18 107 1 1 0 0 1 0 2600
Last five rows of data
pbwt.tail()
## low age lwt race smoke ptl ht ui ftv bwt
## 184 1 28 95 1 1 0 0 0 2 2466
## 185 1 14 100 3 0 0 0 0 2 2495
## 186 1 23 94 3 1 0 0 0 0 2495
## 187 1 17 142 2 0 0 1 0 0 2495
## 188 1 21 130 1 1 0 1 0 3 2495
Data shape or dimension
pbwt.shape
## (189, 10)
Column names
pbwt.columns
## Index(['low', 'age', 'lwt', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv', 'bwt'], dtype='object')
Number of columns
len(pbwt.columns)
## 10
Number of observations
len(pbwt)
## 189
Length of a variable
len(pbwt['age'])
## 189
Number of rows
# rows
pbwt.index
## RangeIndex(start=0, stop=189, step=1)
Plotting continuous outcome and its related variables using seaborn
Distplot: Distribution (histogram, kernel density and rug) plot
# Histogram only: KDE switched off, 13 bins, missing values dropped first
bwt_ax = sb.distplot(pbwt['bwt'].dropna(), bins=13, kde=False, color='red')
bwt_ax.set_title('Distribution of infant body weight')
plt.show()

# Kernel density estimate only: no histogram bars, no rug
bwt_ax = sb.distplot(pbwt['bwt'], kde=True, hist=False, rug=False)
bwt_ax.set_title('KDE plot of infant body weight')
plt.show()

# Default distplot: histogram overlaid with the kernel density estimate
bwt_ax = sb.distplot(pbwt['bwt'])
bwt_ax.set_title('Distribution of infant body weight')
plt.show()

# Histogram and kernel density estimate, plus a rug of the observations
bwt_ax = sb.distplot(pbwt['bwt'], rug=True)
bwt_ax.set_title('Distribution of infant body weight')
plt.show()

Relplot: Scatter plots
# NOTE: plt.show() is called without arguments throughout. Its only parameter
# is the keyword `block`; passing the grid object positionally raises a
# TypeError on current matplotlib releases.

# Scatter plot of lwt and bwt
g = sb.relplot(x="lwt", y="bwt", data=pbwt, height=8, aspect=1)
plt.title('Scatter plot of lwt and bwt')
plt.show()

# Scatter plot of lwt and bwt, points coloured by smoking status
g = sb.relplot(x="lwt", y="bwt", hue='smoke', data=pbwt, height=8)
plt.title("Scatter plot of lwt and bwt in relation to smoking status")
plt.show()

# Same plot, smoking status additionally encoded by marker style
g = sb.relplot(x="lwt", y="bwt", hue='smoke', style='smoke', data=pbwt, height=8)
plt.title("Scatter plot of lwt and bwt in relation to smoking status")
plt.show()

# Same plot, smoking status additionally encoded by marker size
g = sb.relplot(x="lwt", y="bwt", hue='smoke', size='smoke', sizes=(15, 150), data=pbwt, height=8)
plt.title("Scatter plot of lwt and bwt in relation to smoking status")
plt.show()

# One panel per racial group, smoking status encoded by colour and size
g = sb.relplot(x="lwt", y="bwt", hue='smoke', size='smoke', col='race', data=pbwt, height=8)
plt.show()

Catplot: Categorical scatter plot
# NOTE: plt.show() is called without arguments throughout. Its only parameter
# is the keyword `block`; passing the grid object positionally raises a
# TypeError on current matplotlib releases.

# Distribution of bwt in relation to race: without jitter
g = sb.catplot(x="race", y="bwt", jitter=False, data=pbwt, height=8)
plt.title("Distribution of bwt in relation to race")
plt.show()

# Distribution of bwt in relation to race: with jitter (the default)
g = sb.catplot(x="race", y="bwt", data=pbwt, height=8)
plt.title("Distribution of bwt in relation to race")
plt.show()

# Swarm plot, with the categorical groups drawn in a user-chosen order
g = sb.catplot(x="race", y="bwt", kind='swarm', data=pbwt, order=[2, 3, 1], height=8)
plt.title("Distribution of bwt in relation to race")
plt.show()

# Selected categories only: omit one group (race 3) from the plot
g = sb.catplot(x="race", y="bwt", kind="swarm", data=pbwt.query("race != 3"), height=8)
plt.title("Distribution of bwt in relation to race")
plt.show()

# Selected level of a third variable: non-smokers only (smoke == 0)
g = sb.catplot(x="race", y="bwt", kind="swarm", data=pbwt.query("smoke == 0"), height=8)
plt.title("Distribution of bwt in non-smokers in relation to race ")
plt.show()

# Distribution of bwt in relation to race, one panel per smoking category
g = sb.catplot(x="race", y="bwt", col="smoke", aspect=1, kind="swarm", data=pbwt, height=8)
plt.show()

Pairplot: Pair plots
For a dataframe
# NOTE: plt.show() is called without arguments throughout. Its only parameter
# is the keyword `block`; passing the grid object positionally raises a
# TypeError on current matplotlib releases.

# Pair plot for all variables
g = sb.pairplot(pbwt)
plt.show()

# Alternatively, build the grid with PairGrid and draw scatter plots in every cell
g = sb.PairGrid(pbwt)
g = g.map(plt.scatter)
plt.show()

# PairGrid with separate plot types: histograms on the diagonal,
# scatter plots in the off-diagonal cells
g = sb.PairGrid(pbwt)
g = g.map_diag(plt.hist)
g = g.map_offdiag(plt.scatter)
plt.show()

For a subset of variables in the dataframe
# Pair plot restricted to the age, lwt and bwt columns
sb.pairplot(pbwt[['age', 'lwt', 'bwt']], height=3)
## <seaborn.axisgrid.PairGrid object at 0x0000000037F0F488>
plt.show()

For a subset of variables categorized by the levels of another variable
# Pair plot of age, lwt and bwt, coloured by the numeric race code.
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
g = sb.pairplot(pbwt[['age', 'lwt', 'bwt', 'race']], hue='race', palette='hls', height=3)
plt.show()

# Same plot, with the race codes replaced by their labels so the legend is readable
g = sb.pairplot(pbwt[['age', 'lwt', 'bwt', 'race']].replace({'race': {1:'white', 2:'black', 3:'other'}}), hue='race', palette='hls', height=3)
plt.show()

With regression line
# Pair plot with regression line: bwt against age and against lwt.
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
g = sb.pairplot(data=pbwt, x_vars=["age", "lwt"], y_vars=["bwt"], aspect=1, kind="reg", height=8)
plt.show()

# Same plot with a separate fit and colour per level of a categorical variable (race)
g = sb.pairplot(data=pbwt, x_vars=["age", "lwt"], y_vars=["bwt"], hue='race', aspect=1, kind="reg", height=8)
plt.show()

Regplot: Regression plot
# Baseline regression line only (scatter points suppressed)
sb.regplot(x='lwt', y='bwt', data=pbwt, scatter=False, color='red')
plt.title("Baseline regression plot for lwt and bwt")
plt.show()

# Regression line together with the scatter points
sb.regplot(x='lwt', y='bwt', data=pbwt, scatter=True, color='red')
plt.title("Regression plot for lwt and bwt")
plt.show()

# Controlling the figure size: create the figure first and place the axes
# manually with add_axes, then draw into them via `ax=`
fig = plt.figure(figsize=(8, 8))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
sb.regplot(x='lwt', y='bwt', data=pbwt, scatter=True, color='red', ax=ax)
ax.set_title("Regression plot for lwt and bwt")
plt.show()

# Controlling the figure size: same result using plt.subplots
fig, ax = plt.subplots(figsize=(8, 8))
sb.regplot(x='lwt', y='bwt', data=pbwt, scatter=True, color='red', ax=ax)
ax.set_title("Regression plot for lwt and bwt")
plt.show()

Residplot: Residual plot
# Residual plot for bwt against lwt, drawn on an explicitly sized figure
f, ax = plt.subplots(figsize=(8, 8))
sb.residplot(data=pbwt, x="lwt", y="bwt", color='red', scatter_kws={"s": 80}, ax=ax)
ax.set_title("Residual plot for lwt and bwt")
plt.show()

Regression and residual plots together
# One wide figure holding two panels: regression fit (left), residuals (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
reg_ax, resid_ax = axes

# left panel: regression plot
sb.regplot(data=pbwt, x='lwt', y='bwt', scatter=True, color='red', scatter_kws={"s": 50}, ax=reg_ax)
reg_ax.set_title("Regression plot for lwt and bwt")

# right panel: residual plot
sb.residplot(data=pbwt, x="lwt", y="bwt", color='black', scatter_kws={"s": 50}, ax=resid_ax)
resid_ax.set_title("Residual plot for lwt and bwt")

plt.show()

Lmplot: Linear model plot
With linear fit
# NOTE: plt.show() is called without arguments throughout. Its only parameter
# is the keyword `block`; passing the grid object positionally raises a
# TypeError on current matplotlib releases.

# lm plot with a different marker and colour per racial group
g = sb.lmplot(x='lwt', y='bwt', hue='race', data=pbwt, markers=["o", "x", "^"], palette="Set1", aspect=1, height=8)
plt.title("Linear relationship between lwt and bwt in 3 racial groups")
plt.show()

# lm plot for lwt and bwt by racial group, one panel per smoking status
g = sb.lmplot(x='lwt', y='bwt', hue='race', col='smoke', data=pbwt, aspect=1, col_wrap=2, height=8)
plt.show()

# lm plot for lwt and bwt by smoking status within the different (race x ui) groups
g = sb.lmplot(x='lwt', y='bwt', hue='smoke', col='race', row='ui', data=pbwt)
plt.show()

# Same (race x ui) grid restricted to one level of another variable (ftv == 0),
# drawn without confidence bands (ci=None) and with larger markers
g = sb.lmplot(x='lwt', y='bwt', hue='smoke', col='race', row='ui', data=pbwt.query("ftv==0"), ci=None, scatter_kws={"s": 80})
plt.show()

With quadratic fit
# lm plot with a quadratic fit (order=2) on the ftv == 0 subset.
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
g = sb.lmplot(x='lwt', y='bwt', hue='smoke', col='race', row='ui', data=pbwt.query("ftv==0"), order=2, ci=None, scatter_kws={"s": 80})
plt.show()

With lowess fit
# lm plot with a lowess fit instead of a straight line.
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
g = sb.lmplot(x="lwt", y="bwt", data=pbwt, lowess=True, height=8)
plt.title("Lowess plot for lwt and bwt")
plt.show()

With robust regression fit
# Robust regression between age and bwt, non-smokers only (smoke == 0).
# The plotted x variable is age, so the title names age rather than lwt.
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
g = sb.lmplot(x="age", y="bwt", data=pbwt.query('smoke==0'), robust=True, height=8, ci=None, scatter_kws={"s": 80})
plt.title("Robust regression between age and bwt")
plt.show()

Boxplots: Using boxplot()
pbwt.dtypes
## low int64
## age int64
## lwt int64
## race int64
## smoke int64
## ptl int64
## ht int64
## ui int64
## ftv int64
## bwt int64
## dtype: object
For a continuous variable
# Single horizontal box plot of bwt on a 6x4 figure
fig, ax = plt.subplots(figsize=(6, 4))
sb.boxplot(data=pbwt, x='bwt', ax=ax)
ax.set_title("Distribution of bwt")
plt.show()

For a continuous variable categorized by the levels of another variable
# Box plot of bwt for each race level. Race is cast to a categorical so it is
# treated as the grouping axis; which variable sits on x or y sets the orientation.
race_cat = pbwt['race'].astype('category')
box_ax = sb.boxplot(data=pbwt, x='bwt', y=race_cat, palette='hls')
box_ax.set_title("Distribution of bwt by race")
plt.show()

Boxplots: Using catplot()
For a continuous variable
# Box plot for bwt alone (no grouping variable is passed, so the title
# does not mention race).
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
g = sb.catplot(x="bwt", kind="box", data=pbwt)
plt.title("Distribution of bwt")
plt.show()

For a subset of continuous variables
# Horizontal box plots for a subset of columns (lwt and bwt) of the data frame.
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
g = sb.catplot(data=pbwt[['lwt', 'bwt']], orient='h', kind="box", height=4, aspect=2)
plt.title("Distribution of lwt and bwt")
plt.show()

For a continuous variable grouped by another categorical variable
# Box plot for bwt by race; race is first stored as a categorical column
# (race_c) and used as the grouping axis.
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
pbwt['race_c'] = pbwt['race'].astype('category')
g = sb.catplot(y='race_c', x='bwt', kind="box", data=pbwt, height=8)
plt.title("Distribution of bwt by race")
plt.show()

For a continuous variable grouped by two categorical variables
# Box plot for bwt by (race x smoke) groups: race on the y axis, smoke as hue.
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
pbwt['race_c'] = pbwt['race'].astype('category')
g = sb.catplot(x="bwt", y="race_c", hue='smoke', kind="box", data=pbwt, height=8)
plt.title("Distribution of bwt by smoking status in different racial groups")
plt.show()

For selected subgroups
# Box plot for a selected subgroup: race == 1 only, one row per smoking status.
# plt.show() is called without arguments: its only parameter is the keyword
# `block`, and passing the grid positionally fails on current matplotlib.
g = sb.catplot(y="race", x="bwt", row="smoke", kind="box", orient="h", height=3, aspect=4, data=pbwt.query('race==1'))
plt.show()

# Box plot for the complementary subgroups: every race except 1, one row per smoking status
g = sb.catplot(x="bwt", y="race", row="smoke", kind="box", orient="h", height=3, aspect=4, data=pbwt.query('race!=1'))
plt.show()
