library(reticulate)
import sys

# Report which Python interpreter this session is running on.
print(sys.version)
## 3.7.5 (default, Oct 31 2019, 15:18:51) [MSC v.1916 64 bit (AMD64)]

# Silence all warnings unless the user explicitly configured warning
# filters (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    import warnings

    warnings.simplefilter("ignore")
import pandas as pd

# Select the backend before pyplot is imported: older matplotlib releases
# ignore matplotlib.use() once pyplot has already chosen a backend.
import matplotlib as mpl
mpl.use('ps') # generate postscript output by default
import matplotlib.pyplot as plt
import seaborn as sb

# White background with grid lines for all seaborn/matplotlib figures.
sb.set_style('whitegrid')
# Display settings for printed frames. Full option names are used because the
# short pattern 'precision' is ambiguous on pandas versions that also define
# 'styler.format.precision', where set_option raises OptionError.
pd.set_option('display.precision', 3)  # decimal places shown for floats
pd.set_option('display.expand_frame_repr', True)  # wrap wide frames over several lines
#pd.set_option('max_colwidth', -1)
# Load the MASS package so that its 'birthwt' data set can be loaded below.
library(MASS)
# Load the 'birthwt' data frame into the workspace.
data("birthwt")
# Compact overview: number of observations, variables, types and first values.
str(birthwt)
## 'data.frame': 189 obs. of 10 variables:
## $ low : int 0 0 0 0 0 0 0 0 0 0 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : int 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: int 0 0 1 1 1 0 0 0 1 1 ...
## $ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ht : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ui : int 1 0 0 1 1 0 0 0 0 0 ...
## $ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
# | variable name | variable label | coded levels |
---|---|---|---|
1 | low | indicator of birth weight less than 2.5 kg | 0, 1 |
2 | age | mother’s age in years | continuous variable |
3 | lwt | mother’s weight in pounds at last menstrual period | continuous variable |
4 | race | mother’s race (1 = white, 2 = black, 3 = other) | 1, 2, 3 |
5 | smoke | smoking status during pregnancy | 0, 1 |
6 | ptl | number of previous premature labours | 0, 1, 2, 3 |
7 | ht | history of hypertension | 0, 1 |
8 | ui | presence of uterine irritability | 0, 1 |
9 | ftv | number of physician visits during the first trimester | 0, 1, 2, 3, 4, 6 |
10 | bwt | birth weight in grams | continuous variable |
The outcome variable is `low`. Its related variables are: age, lwt, race, smoke, ptl, ht, ui and ftv.
# The 'birthwt' data must first be saved as 'birthwt.csv';
# it is then loaded into a pandas DataFrame with read_csv.
pbwt = pd.read_csv('birthwt.csv')

# Print a summary of the frame: index range, columns, non-null counts,
# dtypes and memory usage.
pbwt.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 189 entries, 0 to 188
## Data columns (total 10 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 low 189 non-null int64
## 1 age 189 non-null int64
## 2 lwt 189 non-null int64
## 3 race 189 non-null int64
## 4 smoke 189 non-null int64
## 5 ptl 189 non-null int64
## 6 ht 189 non-null int64
## 7 ui 189 non-null int64
## 8 ftv 189 non-null int64
## 9 bwt 189 non-null int64
## dtypes: int64(10)
## memory usage: 14.9 KB
# Quick structural inspection of the 'pbwt' DataFrame. Lines starting with
# '##' are the recorded output of the statement directly above them.

# data types: integer/object/category/floating-point
# (every column here was read as int64)
pbwt.dtypes
## low int64
## age int64
## lwt int64
## race int64
## smoke int64
## ptl int64
## ht int64
## ui int64
## ftv int64
## bwt int64
## dtype: object
# first five rows
pbwt.head()
## low age lwt race smoke ptl ht ui ftv bwt
## 0 0 19 182 2 0 0 0 1 0 2523
## 1 0 33 155 3 0 0 0 0 3 2551
## 2 0 20 105 1 1 0 0 0 1 2557
## 3 0 21 108 1 1 0 0 1 2 2594
## 4 0 18 107 1 1 0 0 1 0 2600
# last five rows
pbwt.tail()
## low age lwt race smoke ptl ht ui ftv bwt
## 184 1 28 95 1 1 0 0 0 2 2466
## 185 1 14 100 3 0 0 0 0 2 2495
## 186 1 23 94 3 1 0 0 0 0 2495
## 187 1 17 142 2 0 0 1 0 0 2495
## 188 1 21 130 1 1 0 1 0 3 2495
# dimensions as (number of rows, number of columns)
pbwt.shape
## (189, 10)
# column labels
pbwt.columns
## Index(['low', 'age', 'lwt', 'race', 'smoke', 'ptl', 'ht', 'ui', 'ftv', 'bwt'], dtype='object')
# number of columns
len(pbwt.columns)
## 10
# number of rows
len(pbwt)
## 189
# length of a single column (equals the number of rows)
len(pbwt['age'])
## 189
# row labels (the index)
pbwt.index
## RangeIndex(start=0, stop=189, step=1)
## RangeIndex(start=0, stop=189, step=1)