Resources and libraries

library(reticulate)

import sys

print(sys.version)
## 3.7.5 (default, Oct 31 2019, 15:18:51) [MSC v.1916 64 bit (AMD64)]

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
import pandas as pd

pd.set_option('display.max_rows', 5)

pd.set_option('display.max_columns', 20)

#pd.set_option('display.width', 200)

pd.set_option('display.max_colwidth', -1)

Download the following csv files from the gapminder web site (https://www.gapminder.org).

  1. “cholesterol_fat_in_blood_women_mmolperl.csv”
  2. “body_mass_index_infant.mortality_women_kgperm2.csv”
  3. “blood_pressure_sbp_women_mmhg.csv”
  4. “children_per_woman_total_fertility.csv”
  5. “contraceptive_use_percent_of_women_ages_15_49.csv”
  6. “breast_cancer_number_of_female_deaths.csv”
  7. “infant_mortality_rate_per_1000_births.csv”
  8. “maternal_deaths_total_number.csv”
  9. “maternal_mortality_ratio_per_100000_live_births.csv”

Import the csv files to create following datasets

cholesterol_fat_in_blood_women_mmolperl.csv -> fm_tc -> fm_tc_long


fm_tc = pd.read_csv("cholesterol_fat_in_blood_women_mmolperl.csv")

fm_tc.head()
##        country  1980  1981  1982  1983  1984  1985  1986  1987  1988  ...  \
## 0  Afghanistan  4.64  4.64  4.63  4.63  4.62  4.61  4.61  4.60  4.58  ...   
## 1  Albania      5.04  5.04  5.03  5.03  5.02  5.02  5.01  5.01  5.00  ...   
## 2  Algeria      4.98  4.97  4.97  4.98  4.98  4.98  4.97  4.97  4.96  ...   
## 3  Andorra      6.13  6.10  6.07  6.04  6.01  5.98  5.95  5.92  5.89  ...   
## 4  Angola       4.79  4.77  4.75  4.73  4.72  4.71  4.69  4.68  4.67  ...   
## 
##    1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  
## 0  4.36  4.33  4.31  4.29  4.28  4.27  4.26  4.25  4.25  4.24  
## 1  4.92  4.92  4.92  4.92  4.92  4.91  4.90  4.89  4.89  4.88  
## 2  4.87  4.87  4.86  4.85  4.84  4.84  4.83  4.83  4.82  4.82  
## 3  5.59  5.57  5.55  5.53  5.51  5.50  5.48  5.47  5.46  5.46  
## 4  4.51  4.50  4.50  4.50  4.50  4.50  4.50  4.51  4.52  4.53  
## 
## [5 rows x 30 columns]

fm_tc_long=pd.melt(fm_tc, id_vars='country', var_name='year', value_name='tc')

fm_tc_long.shape
## (5481, 3)

fm_tc_long.head()
##        country  year    tc
## 0  Afghanistan  1980  4.64
## 1  Albania      1980  5.04
## 2  Algeria      1980  4.98
## 3  Andorra      1980  6.13
## 4  Angola       1980  4.79

body_mass_index_bmi_women_kgperm2.csv -> fm_bmi -> fm_bmi_long


fm_bmi = pd.read_csv("body_mass_index_bmi_women_kgperm2.csv")

fm_bmi.head()
##        country  1980  1981  1982  1983  1984  1985  1986  1987  1988  ...  \
## 0  Afghanistan  20.4  20.5  20.5  20.6  20.6  20.6  20.7  20.7  20.7  ...   
## 1  Albania      25.2  25.2  25.2  25.2  25.2  25.2  25.2  25.2  25.2  ...   
## 2  Algeria      23.7  23.8  23.9  24.0  24.1  24.2  24.3  24.4  24.5  ...   
## 3  Andorra      25.7  25.7  25.7  25.7  25.7  25.7  25.7  25.7  25.8  ...   
## 4  Angola       20.1  20.1  20.2  20.3  20.3  20.4  20.5  20.6  20.7  ...   
## 
##    1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  
## 0  20.6  20.6  20.6  20.6  20.7  20.8  20.8  20.9  21.0  21.1  
## 1  25.1  25.1  25.2  25.3  25.3  25.4  25.5  25.5  25.6  25.7  
## 2  25.4  25.5  25.6  25.7  25.8  25.9  26.0  26.1  26.3  26.4  
## 3  26.1  26.1  26.1  26.2  26.2  26.3  26.3  26.4  26.4  26.4  
## 4  21.8  21.9  22.1  22.3  22.4  22.6  22.8  23.0  23.3  23.5  
## 
## [5 rows x 30 columns]

fm_bmi_long=pd.melt(fm_bmi, 
        id_vars='country',        
        var_name='year', 
        value_name='bmi')

fm_bmi_long.shape
## (5481, 3)

fm_bmi_long.head()
##        country  year   bmi
## 0  Afghanistan  1980  20.4
## 1  Albania      1980  25.2
## 2  Algeria      1980  23.7
## 3  Andorra      1980  25.7
## 4  Angola       1980  20.1

blood_pressure_sbp_women_mmhg.csv -> fm_sbp -> fm_sbp_long


fm_sbp = pd.read_csv("blood_pressure_sbp_women_mmhg.csv")

fm_sbp.head()
##        country  1980  1981  1982  1983  1984  1985  1986  1987  1988  ...  \
## 0  Afghanistan  122   122   123   123   123   123   123   123   124   ...   
## 1  Albania      132   132   132   132   132   132   132   131   131   ...   
## 2  Algeria      131   131   131   131   131   131   131   131   131   ...   
## 3  Andorra      137   136   135   135   134   133   133   132   132   ...   
## 4  Angola       130   130   130   130   130   130   130   130   130   ...   
## 
##    1999  2000  2001  2002  2003  2004  2005  2006  2007  2008  
## 0  124   124   124   124   124   125   125   125   125   125   
## 1  129   129   129   129   129   129   129   129   129   129   
## 2  131   131   130   130   130   130   130   130   130   130   
## 3  126   126   125   125   124   124   123   123   122   122   
## 4  130   130   130   130   130   130   130   130   130   130   
## 
## [5 rows x 30 columns]
fm_sbp_long=pd.melt(fm_sbp, 
        id_vars='country',        
        var_name='year', 
        value_name='sbp')

fm_sbp_long.shape
## (5481, 3)
fm_sbp_long.head()
##        country  year  sbp
## 0  Afghanistan  1980  122
## 1  Albania      1980  132
## 2  Algeria      1980  131
## 3  Andorra      1980  137
## 4  Angola       1980  130

children_per_woman_total_fertility -> fm_fertility -> fm_fertility_long


fm_fertility = pd.read_csv("children_per_woman_total_fertility.csv")

fm_fertility.head()
##                country  1800  1801  1802  1803  1804  1805  1806  1807  1808  \
## 0  Afghanistan          7.00  7.00  7.00  7.00  7.00  7.00  7.00  7.00  7.00   
## 1  Albania              4.60  4.60  4.60  4.60  4.60  4.60  4.60  4.60  4.60   
## 2  Algeria              6.99  6.99  6.99  6.99  6.99  6.99  6.99  6.99  6.99   
## 3  Angola               6.93  6.93  6.93  6.93  6.93  6.93  6.93  6.94  6.94   
## 4  Antigua and Barbuda  5.00  5.00  4.99  4.99  4.99  4.98  4.98  4.97  4.97   
## 
##    ...  2009  2010  2011  2012  2013  2014  2015  2016  2017  2018  
## 0  ...  6.04  5.82  5.60  5.38  5.17  4.98  4.80  4.64  4.48  4.33  
## 1  ...  1.65  1.65  1.67  1.69  1.70  1.71  1.71  1.71  1.71  1.71  
## 2  ...  2.83  2.89  2.93  2.94  2.92  2.89  2.84  2.78  2.71  2.64  
## 3  ...  6.24  6.16  6.08  6.00  5.92  5.84  5.77  5.69  5.62  5.55  
## 4  ...  2.15  2.13  2.12  2.10  2.09  2.08  2.06  2.05  2.04  2.03  
## 
## [5 rows x 220 columns]

fm_fertility_long=pd.melt(fm_fertility, 
        id_vars='country',        
        var_name='year', 
        value_name='fertility')

fm_fertility_long.shape
## (40296, 3)
fm_fertility_long.head()
##                country  year  fertility
## 0  Afghanistan          1800  7.00     
## 1  Albania              1800  4.60     
## 2  Algeria              1800  6.99     
## 3  Angola               1800  6.93     
## 4  Antigua and Barbuda  1800  5.00

contraceptive_use_percent_of_women_ages_15_49.csv -> fm_contracept_use -> fm_contracept_use_long


fm_contracept_use = pd.read_csv("contraceptive_use_percent_of_women_ages_15_49.csv")

fm_contracept_use.head()
##                country  1961  1962  1963  1964  1965  1966  1967  1968  1969  \
## 0  Afghanistan         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 1  Albania             NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 2  Algeria             NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 3  Angola              NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 4  Antigua and Barbuda NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 
##    ...  2008  2009  2010  2011  2012  2013  2014  2015  2016  2017  
## 0  ...  22.8 NaN    21.8  21.2 NaN   NaN   NaN    22.5 NaN   NaN    
## 1  ... NaN    69.3 NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    
## 2  ... NaN   NaN   NaN   NaN    55.9  57.1 NaN   NaN   NaN   NaN    
## 3  ... NaN    17.7 NaN   NaN   NaN   NaN   NaN   NaN    13.7 NaN    
## 4  ... NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    
## 
## [5 rows x 58 columns]

fm_contracept_use_long=pd.melt(fm_contracept_use, 
        id_vars='country',        
        var_name='year', 
        value_name='contracept_use')

fm_contracept_use_long.shape
## (10545, 3)

fm_contracept_use_long.head()
##                country  year  contracept_use
## 0  Afghanistan          1961 NaN            
## 1  Albania              1961 NaN            
## 2  Algeria              1961 NaN            
## 3  Angola               1961 NaN            
## 4  Antigua and Barbuda  1961 NaN

breast_cancer_number_of_female_deaths.csv -> fm_brstc_death -> fm_brstc_death_long


fm_brstc_death = pd.read_csv("breast_cancer_number_of_female_deaths.csv")

fm_brstc_death.head()
##        country   1990   1991   1992   1993    1994    1995    1996    1997  \
## 0  Afghanistan  761.0  817.0  894.0  989.0  1090.0  1180.0  1240.0  1290.0   
## 1  Albania      100.0  103.0  105.0  108.0  111.0   116.0   120.0   126.0    
## 2  Algeria      701.0  729.0  789.0  852.0  920.0   999.0   1010.0  1040.0   
## 3  Andorra      9.8    10.5   11.1   11.9   12.5    11.9    11.7    11.5     
## 4  Angola       420.0  432.0  445.0  477.0  513.0   538.0   548.0   554.0    
## 
##      1998  ...    2007    2008    2009    2010    2011    2012    2013  \
## 0  1340.0  ...  2020.0  2090.0  2150.0  2220.0  2280.0  2340.0  2420.0   
## 1  134.0   ...  206.0   212.0   214.0   215.0   215.0   215.0   217.0    
## 2  1060.0  ...  1700.0  1760.0  1840.0  1910.0  1980.0  2060.0  2150.0   
## 3  11.4    ...  13.2    13.7    14.3    14.6    14.9    15.2    15.4     
## 4  603.0   ...  814.0   835.0   852.0   867.0   885.0   909.0   939.0    
## 
##      2014    2015    2016  
## 0  2510.0  2600.0  2690.0  
## 1  219.0   221.0   222.0   
## 2  2240.0  2320.0  2390.0  
## 3  15.7    15.9    16.0    
## 4  968.0   1000.0  1030.0  
## 
## [5 rows x 28 columns]
fm_brstc_death_long=pd.melt(fm_brstc_death, 
        id_vars='country',        
        var_name='year', 
        value_name='brstc_death')

fm_brstc_death_long.shape
## (5049, 3)
fm_brstc_death_long.head()
##        country  year  brstc_death
## 0  Afghanistan  1990  761.0      
## 1  Albania      1990  100.0      
## 2  Algeria      1990  701.0      
## 3  Andorra      1990  9.8        
## 4  Angola       1990  420.0

infant_mortality_rate_per_1000_births.csv -> infant_mortality -> infant_mortality_long


infant_mortality = pd.read_csv("infant_mortality_rate_per_1000_births.csv")

infant_mortality.head()
##        country  1800  1801  1802  1803  1804  1805  1806  1807  1808  ...  \
## 0  Afghanistan NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 1  Albania     NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 2  Algeria     NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 3  Andorra     NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 4  Angola      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 
##     2006   2007   2008   2009   2010   2011   2012   2013  2014  2015  
## 0  82.3   80.4   78.6   76.8   75.1   73.4   71.7   69.9   68.1  66.3  
## 1  17.4   16.7   16.0   15.4   14.8   14.3   13.8   13.3   12.9  12.5  
## 2  27.6   26.4   25.3   24.3   23.5   22.8   22.4   22.1   22.0  21.9  
## 3  2.9    2.8    2.7    2.6    2.5    2.4    2.3    2.2    2.1   2.1   
## 4  119.0  117.0  115.0  112.0  110.0  107.0  104.0  101.0  98.8  96.0  
## 
## [5 rows x 217 columns]

infant_mortality_long=pd.melt(infant_mortality, 
        id_vars='country',        
        var_name='year', 
        value_name='infant_mortality')

infant_mortality_long.shape
## (41904, 3)
infant_mortality_long.head()
##        country  year  infant_mortality
## 0  Afghanistan  1800 NaN              
## 1  Albania      1800 NaN              
## 2  Algeria      1800 NaN              
## 3  Andorra      1800 NaN              
## 4  Angola       1800 NaN

maternal_deaths_total_number.csv -> maternal_death -> maternal_death_long


maternal_death = pd.read_csv("maternal_deaths_total_number.csv")

maternal_death.head()
##                country  1800  1801  1802  1803  1804  1805  1806  1807  1808  \
## 0  Afghanistan         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 1  Albania             NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 2  Algeria             NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 3  Angola              NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 4  Antigua and Barbuda NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN     
## 
##    ...  2004    2005  2006  2007  2008  2009    2010  2011  2012    2013  
## 0  ... NaN    7900.0 NaN   NaN   NaN   NaN    5490.0 NaN   NaN    4200.0  
## 1  ... NaN    9.0    NaN   NaN   NaN   NaN    7.0    NaN   NaN    8.0     
## 2  ... NaN    696.0  NaN   NaN   NaN   NaN    821.0  NaN   NaN    840.0   
## 3  ... NaN    6680.0 NaN   NaN   NaN   NaN    5360.0 NaN   NaN    4400.0  
## 4  ... NaN   NaN     NaN   NaN   NaN   NaN   NaN     NaN   NaN   NaN      
## 
## [5 rows x 215 columns]
maternal_death_long=pd.melt(maternal_death, 
        id_vars='country',        
        var_name='year', 
        value_name='maternal_death')

maternal_death_long.shape
## (39162, 3)
maternal_death_long.head()
##                country  year  maternal_death
## 0  Afghanistan          1800 NaN            
## 1  Albania              1800 NaN            
## 2  Algeria              1800 NaN            
## 3  Angola               1800 NaN            
## 4  Antigua and Barbuda  1800 NaN

maternal_mortality_ratio_per_100000_live_births.csv -> maternal.mortality.ratio -> maternal.mortality.ratio.long


maternal_mortality_ratio = pd.read_csv("maternal_mortality_ratio_per_100000_live_births.csv")


maternal_mortality_ratio.sample(n=5)
##         country  1800  1801  1802  1803  1804  1805  1806  1807  1808  ...  \
## 150  Somalia    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 66   Grenada    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 180  Uzbekistan NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 46   Denmark    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 117  Nepal      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN    ...   
## 
##      2004    2005  2006  2007  2008  2009   2010  2011  2012   2013  
## 150 NaN    1100.0 NaN   NaN   NaN   NaN    930.0 NaN   NaN    850.0  
## 66  NaN    25.0   NaN   NaN   NaN   NaN    23.0  NaN   NaN    23.0   
## 180 NaN    44.0   NaN   NaN   NaN   NaN    40.0  NaN   NaN    36.0   
## 46  NaN    8.0    NaN   NaN   NaN   NaN    9.0   NaN   NaN    5.0    
## 117 NaN    310.0  NaN   NaN   NaN   NaN    220.0 NaN   NaN    190.0  
## 
## [5 rows x 215 columns]

mm_ratio_long=pd.melt(maternal_mortality_ratio, 
        id_vars='country',        
        var_name='year', 
        value_name='mm_ratio')

mm_ratio_long.shape
## (40018, 3)

mm_ratio_long.head()
##        country  year  mm_ratio
## 0  Afghanistan  1800 NaN      
## 1  Albania      1800 NaN      
## 2  Algeria      1800 NaN      
## 3  Andorra      1800 NaN      
## 4  Angola       1800 NaN

Inner join

Two tables


pd.merge(fm_bmi_long, fm_tc_long, on=['country', 'year'])
##           country  year   bmi    tc
## 0     Afghanistan  1980  20.4  4.64
## 1     Albania      1980  25.2  5.04
## ...       ...       ...   ...   ...
## 5479  Zambia       2008  23.1  4.46
## 5480  Zimbabwe     2008  24.6  4.33
## 
## [5481 rows x 4 columns]

More than two tables


gpm_ij=fm_bmi_long.merge(
    fm_tc_long, on=['country', 'year'], how='inner').merge(
    fm_sbp_long, on=['country', 'year'], how='inner').merge(
    fm_fertility_long, on=['country', 'year'], how='inner').merge(
    fm_brstc_death_long, on=['country', 'year'], how='inner').merge(
    fm_contracept_use_long, on=['country', 'year'], how='inner').merge(
    mm_ratio_long, on=['country', 'year'], how='inner').merge(
    maternal_death_long, on=['country', 'year'], how='inner').merge(
    infant_mortality_long, on=['country', 'year'], how='inner')


gpm_ij
##           country  year   bmi    tc  sbp  fertility  brstc_death  \
## 0     Afghanistan  1990  20.7  4.55  124  7.47       761.0         
## 1     Albania      1990  25.1  4.99  131  2.98       100.0         
## ...       ...       ...   ...   ...  ...   ...         ...         
## 3380  Zambia       2008  23.1  4.46  132  5.56       430.0         
## 3381  Zimbabwe     2008  24.6  4.33  132  4.01       761.0         
## 
##       contracept_use  mm_ratio  maternal_death  infant_mortality  
## 0    NaN              1200.0    7520.0          123.0             
## 1    NaN              31.0      24.0            35.1              
## ...   ..               ...       ...             ...              
## 3380 NaN             NaN       NaN              58.7              
## 3381 NaN             NaN       NaN              58.9              
## 
## [3382 rows x 11 columns]

Drop the rows where at least one element is missing

This would leave only 177 rows out of 3382 rows.


gpm_ij.dropna()
##        country  year   bmi    tc  sbp  fertility  brstc_death  contracept_use  \
## 32    Chile     1990  25.2  5.11  128  2.55       834.0        56.0             
## 34    Colombia  1990  24.7  5.00  125  2.99       1480.0       66.1             
## ...        ...   ...   ...   ...  ...   ...          ...        ...             
## 2840  Uruguay   2005  26.2  4.89  124  2.16       730.0        78.0             
## 2844  Vietnam   2005  20.8  4.62  122  1.89       4130.0       76.8             
## 
##       mm_ratio  maternal_death  infant_mortality  
## 32    55.0      157.0           16.0              
## 34    100.0     905.0           28.9              
## ...     ...       ...            ...              
## 2840  32.0      17.0            12.4              
## 2844  60.0      856.0           22.6              
## 
## [177 rows x 11 columns]

Drop the rows where all elements are missing

None of the rows would be dropped.


gpm_ij.dropna(how='all')
##           country  year   bmi    tc  sbp  fertility  brstc_death  \
## 0     Afghanistan  1990  20.7  4.55  124  7.47       761.0         
## 1     Albania      1990  25.1  4.99  131  2.98       100.0         
## ...       ...       ...   ...   ...  ...   ...         ...         
## 3380  Zambia       2008  23.1  4.46  132  5.56       430.0         
## 3381  Zimbabwe     2008  24.6  4.33  132  4.01       761.0         
## 
##       contracept_use  mm_ratio  maternal_death  infant_mortality  
## 0    NaN              1200.0    7520.0          123.0             
## 1    NaN              31.0      24.0            35.1              
## ...   ..               ...       ...             ...              
## 3380 NaN             NaN       NaN              58.7              
## 3381 NaN             NaN       NaN              58.9              
## 
## [3382 rows x 11 columns]

Keep only the rows with at least 2 non-NA values

None of the rows would be dropped.


gpm_ij.dropna(thresh=2)
##           country  year   bmi    tc  sbp  fertility  brstc_death  \
## 0     Afghanistan  1990  20.7  4.55  124  7.47       761.0         
## 1     Albania      1990  25.1  4.99  131  2.98       100.0         
## ...       ...       ...   ...   ...  ...   ...         ...         
## 3380  Zambia       2008  23.1  4.46  132  5.56       430.0         
## 3381  Zimbabwe     2008  24.6  4.33  132  4.01       761.0         
## 
##       contracept_use  mm_ratio  maternal_death  infant_mortality  
## 0    NaN              1200.0    7520.0          123.0             
## 1    NaN              31.0      24.0            35.1              
## ...   ..               ...       ...             ...              
## 3380 NaN             NaN       NaN              58.7              
## 3381 NaN             NaN       NaN              58.9              
## 
## [3382 rows x 11 columns]

Drop the columns where at least one element is missing

This would keep all the rows and drop two columns.


gpm_ij.dropna(axis='columns')

# same as 'axis=1'
##           country  year   bmi    tc  sbp  fertility  brstc_death  \
## 0     Afghanistan  1990  20.7  4.55  124  7.47       761.0         
## 1     Albania      1990  25.1  4.99  131  2.98       100.0         
## ...       ...       ...   ...   ...  ...   ...         ...         
## 3380  Zambia       2008  23.1  4.46  132  5.56       430.0         
## 3381  Zimbabwe     2008  24.6  4.33  132  4.01       761.0         
## 
##       infant_mortality  
## 0     123.0             
## 1     35.1              
## ...    ...              
## 3380  58.7              
## 3381  58.9              
## 
## [3382 rows x 8 columns]

Define in which columns to look for missing values

Only 177 rows of data would be left.


# look for missing values in columns: contracept_use,  mm_ratio,  maternal_death

gpm_ij.dropna(subset=['contracept_use',  'mm_ratio',  'maternal_death'])
##        country  year   bmi    tc  sbp  fertility  brstc_death  contracept_use  \
## 32    Chile     1990  25.2  5.11  128  2.55       834.0        56.0             
## 34    Colombia  1990  24.7  5.00  125  2.99       1480.0       66.1             
## ...        ...   ...   ...   ...  ...   ...          ...        ...             
## 2840  Uruguay   2005  26.2  4.89  124  2.16       730.0        78.0             
## 2844  Vietnam   2005  20.8  4.62  122  1.89       4130.0       76.8             
## 
##       mm_ratio  maternal_death  infant_mortality  
## 32    55.0      157.0           16.0              
## 34    100.0     905.0           28.9              
## ...     ...       ...            ...              
## 2840  32.0      17.0            12.4              
## 2844  60.0      856.0           22.6              
## 
## [177 rows x 11 columns]

Left (outer) join

Two tables


pd.merge(fm_bmi_long, fm_fertility_long, on=['country', 'year'], how='left')
##           country  year   bmi  fertility
## 0     Afghanistan  1980  20.4  7.45     
## 1     Albania      1980  25.2  3.62     
## ...       ...       ...   ...   ...     
## 5479  Zambia       2008  23.1  5.56     
## 5480  Zimbabwe     2008  24.6  4.01     
## 
## [5481 rows x 4 columns]

# order of the tables dtetermine the order of the columns

pd.merge(fm_fertility_long, fm_bmi_long, on=['country', 'year'], how='left')
##            country  year  fertility  bmi
## 0      Afghanistan  1800  7.00      NaN 
## 1      Albania      1800  4.60      NaN 
## ...        ...       ...   ...       .. 
## 40294  Zambia       2018  4.87      NaN 
## 40295  Zimbabwe     2018  3.61      NaN 
## 
## [40296 rows x 4 columns]

More than two tables


fm_bmi_long.merge(
    fm_tc_long, on=['country', 'year'], how='left').merge(
    fm_sbp_long, on=['country', 'year'], how='left').merge(
    mm_ratio_long, on=['country', 'year'], how='left')
##           country  year   bmi    tc  sbp  mm_ratio
## 0     Afghanistan  1980  20.4  4.64  122  1640.0  
## 1     Albania      1980  25.2  5.04  132  57.7    
## ...       ...       ...   ...   ...  ...   ...    
## 5479  Zambia       2008  23.1  4.46  132 NaN      
## 5480  Zimbabwe     2008  24.6  4.33  132 NaN      
## 
## [5481 rows x 6 columns]

Right (outer) join

Two tables


pd.merge(fm_bmi_long, fm_fertility_long, on=['country', 'year'], how='right')
##            country  year   bmi  fertility
## 0      Afghanistan  1980  20.4  7.45     
## 1      Albania      1980  25.2  3.62     
## ...        ...       ...   ...   ...     
## 40294  Zambia       2018 NaN    4.87     
## 40295  Zimbabwe     2018 NaN    3.61     
## 
## [40296 rows x 4 columns]

# order of data set would determine column order

pd.merge(fm_fertility_long, fm_bmi_long, on=['country', 'year'], how='right')
##                   country  year  fertility   bmi
## 0     Afghanistan          1980  7.45       20.4
## 1     Albania              1980  3.62       25.2
## ...       ...               ...   ...        ...
## 5479  Palau                2008 NaN         31.9
## 5480  St. Kitts and Nevis  2008 NaN         30.5
## 
## [5481 rows x 4 columns]

More than two tables


fm_bmi_long.merge(
    fm_tc_long, on=['country', 'year'], how='right').merge(
    fm_sbp_long, on=['country', 'year'], how='right').merge(
    mm_ratio_long, on=['country', 'year'], how='right')
##            country  year   bmi    tc    sbp  mm_ratio
## 0      Afghanistan  1980  20.4  4.64  122.0  1640.0  
## 1      Albania      1980  25.2  5.04  132.0  57.7    
## ...        ...       ...   ...   ...    ...   ...    
## 40016  Zambia       2013 NaN   NaN   NaN     280.0   
## 40017  Zimbabwe     2013 NaN   NaN   NaN     470.0   
## 
## [40018 rows x 6 columns]

Full join

Two tables


pd.merge(fm_fertility_long, infant_mortality_long, on=['country', 'year'], how='outer')
##                    country  year  fertility  infant_mortality
## 0      Afghanistan          1800  7.0       NaN              
## 1      Albania              1800  4.6       NaN              
## ...        ...               ...  ...        ..              
## 42454  St. Kitts and Nevis  2015 NaN         8.4             
## 42455  Tuvalu               2015 NaN         22.8            
## 
## [42456 rows x 4 columns]

More than two tables


pd.merge(fm_bmi_long, fm_tc_long, on=['country', 'year'], how='outer').merge(fm_sbp_long, on=['country', 'year'], how='outer')
##           country  year   bmi    tc  sbp
## 0     Afghanistan  1980  20.4  4.64  122
## 1     Albania      1980  25.2  5.04  132
## ...       ...       ...   ...   ...  ...
## 5479  Zambia       2008  23.1  4.46  132
## 5480  Zimbabwe     2008  24.6  4.33  132
## 
## [5481 rows x 5 columns]

# merge three tables

fm_bmi_long.merge(fm_tc_long, on=['country', 'year'], how='outer').merge(fm_sbp_long, on=['country', 'year'], how='outer')
##           country  year   bmi    tc  sbp
## 0     Afghanistan  1980  20.4  4.64  122
## 1     Albania      1980  25.2  5.04  132
## ...       ...       ...   ...   ...  ...
## 5479  Zambia       2008  23.1  4.46  132
## 5480  Zimbabwe     2008  24.6  4.33  132
## 
## [5481 rows x 5 columns]

# merge nine tables


gpm_fj=fm_bmi_long.merge(
    fm_tc_long, on=['country', 'year'], how='outer').merge(
    fm_sbp_long, on=['country', 'year'], how='outer').merge(
    fm_fertility_long, on=['country', 'year'], how='outer').merge(
    fm_brstc_death_long, on=['country', 'year'], how='outer').merge(
    fm_contracept_use_long, on=['country', 'year'], how='outer').merge(
    mm_ratio_long, on=['country', 'year'], how='outer').merge(
    maternal_death_long, on=['country', 'year'], how='outer').merge(
    infant_mortality_long, on=['country', 'year'], how='outer')
    
gpm_fj
##            country  year   bmi    tc    sbp  fertility  brstc_death  \
## 0      Afghanistan  1980  20.4  4.64  122.0  7.45      NaN            
## 1      Albania      1980  25.2  5.04  132.0  3.62      NaN            
## ...        ...       ...   ...   ...    ...   ...       ..            
## 42467  Monaco       2015 NaN   NaN   NaN    NaN        NaN            
## 42468  San Marino   2015 NaN   NaN   NaN    NaN        NaN            
## 
##        contracept_use  mm_ratio  maternal_death  infant_mortality  
## 0     NaN              1640.0    10600.0         166.0             
## 1     NaN              57.7      42.0            64.0              
## ...    ..               ...       ...             ...              
## 42467 NaN             NaN       NaN              2.8               
## 42468 NaN             NaN       NaN              2.6               
## 
## [42469 rows x 11 columns]

Drop the rows where at least one element is missing

This would leave only 177 rows out of 42469 rows.


gpm_fj.dropna()
##        country  year   bmi    tc    sbp  fertility  brstc_death  \
## 1924  Chile     1990  25.2  5.11  128.0  2.55       834.0         
## 1926  Colombia  1990  24.7  5.00  125.0  2.99       1480.0        
## ...        ...   ...   ...   ...    ...   ...          ...        
## 4906  Uruguay   2005  26.2  4.89  124.0  2.16       730.0         
## 4910  Vietnam   2005  20.8  4.62  122.0  1.89       4130.0        
## 
##       contracept_use  mm_ratio  maternal_death  infant_mortality  
## 1924  56.0            55.0      157.0           16.0              
## 1926  66.1            100.0     905.0           28.9              
## ...    ...              ...       ...            ...              
## 4906  78.0            32.0      17.0            12.4              
## 4910  76.8            60.0      856.0           22.6              
## 
## [177 rows x 11 columns]

Drop the rows where all elements are missing

None of the rows would be dropped.


gpm_fj.dropna(how='all')
##            country  year   bmi    tc    sbp  fertility  brstc_death  \
## 0      Afghanistan  1980  20.4  4.64  122.0  7.45      NaN            
## 1      Albania      1980  25.2  5.04  132.0  3.62      NaN            
## ...        ...       ...   ...   ...    ...   ...       ..            
## 42467  Monaco       2015 NaN   NaN   NaN    NaN        NaN            
## 42468  San Marino   2015 NaN   NaN   NaN    NaN        NaN            
## 
##        contracept_use  mm_ratio  maternal_death  infant_mortality  
## 0     NaN              1640.0    10600.0         166.0             
## 1     NaN              57.7      42.0            64.0              
## ...    ..               ...       ...             ...              
## 42467 NaN             NaN       NaN              2.8               
## 42468 NaN             NaN       NaN              2.6               
## 
## [42469 rows x 11 columns]

Keep only the rows with at least 8 non-NA values

This would leave only 3888 rows out of 42469 rows.


gpm_fj.dropna(thresh=8)
##            country  year   bmi    tc    sbp  fertility  brstc_death  \
## 0      Afghanistan  1980  20.4  4.64  122.0  7.45      NaN            
## 1      Albania      1980  25.2  5.04  132.0  3.62      NaN            
## ...        ...       ...   ...   ...    ...   ...       ..            
## 39546  Vietnam      2013 NaN   NaN   NaN     1.96       4840.0        
## 39547  Yemen        2013 NaN   NaN   NaN     4.33       1080.0        
## 
##        contracept_use  mm_ratio  maternal_death  infant_mortality  
## 0     NaN              1640.0    10600.0         166.0             
## 1     NaN              57.7      42.0            64.0              
## ...    ..               ...       ...             ...              
## 39546  77.3            49.0      690.0           18.3              
## 39547  33.5            270.0     2100.0          36.7              
## 
## [3888 rows x 11 columns]

Drop the columns where at least one element is missing

This would keep all the rows and drop all columns except 2.


gpm_fj.dropna(axis=1)
##            country  year
## 0      Afghanistan  1980
## 1      Albania      1980
## ...        ...       ...
## 42467  Monaco       2015
## 42468  San Marino   2015
## 
## [42469 rows x 2 columns]

Define in which columns to look for missing values

Only 290 rows of data would be left.


# look for missing values in columns: contracept_use,  mm_ratio,  maternal_death

gpm_fj.dropna(subset=['contracept_use',  'mm_ratio',  'maternal_death'])
##           country  year   bmi    tc    sbp  fertility  brstc_death  \
## 13     Bangladesh  1980  18.5  4.55  120.0  6.36      NaN            
## 36     Colombia    1980  23.4  4.90  128.0  3.97      NaN            
## ...         ...     ...   ...   ...    ...   ...       ..            
## 39546  Vietnam     2013 NaN   NaN   NaN     1.96       4840.0        
## 39547  Yemen       2013 NaN   NaN   NaN     4.33       1080.0        
## 
##        contracept_use  mm_ratio  maternal_death  infant_mortality  
## 13     12.7            1330.0    47200.0         134.0             
## 36     48.5            115.0     1010.0          44.6              
## ...     ...              ...        ...           ...              
## 39546  77.3            49.0      690.0           18.3              
## 39547  33.5            270.0     2100.0          36.7              
## 
## [290 rows x 11 columns]