Resources

-SAS University Edition/ SAS Studio

-Download the following csv files from the gapminder web site (https://www.gapminder.org).

    cholesterol_fat_in_blood_women_mmolperl.csv
    body_mass_index_bmi_women_kgperm2.csv
    blood_pressure_sbp_women_mmhg.csv
    children_per_woman_total_fertility.csv
    contraceptive_use_percent_of_women_ages_15_49.csv
    breast_cancer_number_of_female_deaths.csv
    infant_mortality_rate_per_1000_births.csv
    maternal_deaths_total_number.csv
    maternal_mortality_ratio_per_100000_live_births.csv
    

Assign a library


%let path=/folders/myshortcuts/datasets;

libname usr "&path";

Import csv files to create following datasets

usr.fm_tc

Output: Total rows: 189 and Total columns: 30


proc import datafile="&path/cholesterol_fat_in_blood_women_mmolperl.csv"  dbms=csv   out=usr.fm_tc replace;
getnames=yes;
run;

usr.fm_bmi

Output: Total rows: 189 and Total columns: 30


proc import datafile="&path/body_mass_index_bmi_women_kgperm2.csv"  dbms=csv out=usr.fm_bmi replace;
getnames=yes;
run;

usr.fm_sbp

Output: Total rows: 189 and Total columns: 30

proc import datafile="&path/blood_pressure_sbp_women_mmhg.csv"  dbms=csv  out=usr.fm_sbp replace;
getnames=yes;
run;

usr.fm_fertility

Output: Total rows: 184 and Total columns: 220


proc import datafile="&path/children_per_woman_total_fertility.csv"  dbms=csv out=usr.fm_fertility replace;
getnames=yes;
run;

usr.fm_contracept_use

    Output: Total rows: 185 and Total columns: 58.
    Many missing values in this dataset.
    Some numeric variables are imported as character variables.
proc import datafile="&path/contraceptive_use_percent_of_women_ages_15_49.csv"  dbms=csv out=usr.fm_contracept_use replace;
getnames=yes;
run;

usr.fm_brstc_death

output: Total rows: 187 and Total columns: 28


proc import datafile="&path/breast_cancer_number_of_female_deaths.csv"  dbms=csv out=usr.fm_brstc_death replace;
getnames=yes;
run;

usr.infant_mortality

    Output: Total rows: 194 and Total columns: 217.
    Many missing values in this dataset.

proc import datafile="&path/infant_mortality_rate_per_1000_births.csv"  dbms=csv out=usr.infant_mortality replace;
getnames=yes;
run;

usr.maternal_death

    Output: Total rows: 183 and Total columns: 215.
    Many missing valus in this dataset.

proc import datafile="&path/maternal_deaths_total_number.csv"  dbms=csv  out=usr.maternal_death replace;
getnames=yes;
run;

usr.maternal_mortality_ratio

Output: Total rows: 187 and Total columns: 215


proc import datafile="&path/maternal_mortality_ratio_per_100000_live_births.csv"  dbms=csv  out=usr.maternal_mortality_ratio replace;
getnames=yes;
run;

Transpose the datasets created above to long format as shown below

    -Done in two steps

    -1st: transpose using `proc transpose`

    -2nd: convert variables to their expected types (`numeric` or `character`) if not imported as the right type in a data step

usr.fm_tc_t_long

Output: Total rows: 5481 and Total columns: 3


proc transpose data=usr.fm_tc out=usr.fm_tc_t (rename=(col1=tc));
    by country;
    var _1980-_2008;
run;
data usr.fm_tc_t_long;
set usr.fm_tc_t;
year=input(substr(_name_, 2, 4), 5.);
drop _name_;
run;

usr.fm_bmi_t_long

Output: Total rows: 5481 and Total columns: 3


proc transpose data=usr.fm_bmi out=usr.fm_bmi_t (rename=(col1=bmi));
    by country;
    var _1980-_2008;
run;
data usr.fm_bmi_t_long;
set usr.fm_bmi_t;
year=input(substr(_name_, 2, 4), 5.);
drop _name_;
run;

usr.fm_sbp_t_long

Output: Total rows: 5481 and Total columns: 3


proc transpose data=usr.fm_sbp out=usr.fm_sbp_t (rename=(col1=sbp));
    by country;
    var _1980-_2008;
run;
data usr.fm_sbp_t_long;
set usr.fm_sbp_t;
year=input(substr(_name_, 2, 4), 5.);
drop _name_;
run;

usr.fm_fertility_t_long

Output: Total rows: 40296 and Total columns: 3


proc transpose data=usr.fm_fertility out=usr.fm_fertility_t (rename=(col1=total_fertility));
    by country;
    var _1800-_2018;
run;
data usr.fm_fertility_t_long;
set usr.fm_fertility_t;
year=input(substr(_name_, 2, 4), 5.);

drop _name_;
run;

usr.fm_brstc_death_t_long

Output: Total rows: 5049 and Total columns: 3


proc transpose data=usr.fm_brstc_death out=usr.fm_brstc_death_t (rename=(col1=breast_cancer_deaths));
    by country;
    var _1990-_2016;
run;
data usr.fm_brstc_death_t_long;
set usr.fm_brstc_death_t;
year=input(substr(_name_, 2, 4), 5.);

drop _name_;
run;

usr.fm_contracept_use_t_long

    Some numeric variables were imported as character variables.
    After transposition all variables are character in nature.
    Hence they needed approprite conversion.

    Output: Total rows: 10545 and Total columns: 3.

proc transpose data=usr.fm_contracept_use out=usr.fm_contracept_use_t;
    by country;
    var _1961-_2017;
run;
data usr.fm_contracept_use_t_long;
set usr.fm_contracept_use_t;
year=input(substr(_name_, 2, 4), 5.);
contraceptive_use_percent=input(col1, best32.);
drop _name_ col1;
run;

usr.infant_mortality_t_long

Output: Total rows: 41904 and Total columns: 3


proc transpose data=usr.infant_mortality out=usr.infant_mortality_t;
    by country;
    var _1800-_2015;
run;
data usr.infant_mortality_t_long;
set usr.infant_mortality_t;

year=input(substr(_name_, 2, 4), 5.);
infant_mortality_rate=input(col1, best32.);

drop _name_ col1;
run;

usr.maternal_death_t_long

Output: Total rows: 39162 and Total columns: 3

proc transpose data=usr.maternal_death out=usr.maternal_death_t;
    by country;
    var _1800-_2013;
run;
data usr.maternal_death_t_long;
set usr.maternal_death_t;

year=input(substr(_name_, 2, 4), 5.);
maternal_deaths=input(col1, best32.);

drop _name_ col1;
run;

usr.maternal_mortality_ratio_t_long

Output: Total rows: 40018 and Total columns: 3


proc transpose data=usr.maternal_mortality_ratio out=usr.maternal_mortality_ratio_t;
    by country;
    var _1800-_2013;
run;
data usr.maternal_mortality_ratio_t_long;
set usr.maternal_mortality_ratio_t;

year=input(substr(_name_, 2, 4), 5.);
maternal_mortality_ratio=input(col1, best32.);

drop _name_ col1;
run;

Join the tables as needed

Inner join using the where clause

Output: Total rows: 3382 and Total columns: 11


proc sql;

create table usr.gpm as

select
    t.country, t.year, tc,
    bmi, 
    sbp,
    total_fertility,
    breast_cancer_deaths,
    contraceptive_use_percent,
    infant_mortality_rate,
    maternal_deaths,
    maternal_mortality_ratio
    
from
    usr.fm_tc_t_long as t,
    usr.fm_bmi_t_long as b,
    usr.fm_sbp_t_long as s,
    usr.fm_fertility_t_long as f,
    usr.fm_brstc_death_t_long as br,
    usr.fm_contracept_use_t_long as c,
    usr.infant_mortality_t_long as i,
    usr.maternal_death_t_long as m,
    usr.maternal_mortality_ratio_t_long as mm
    
where 
    t.country=b.country=s.country=f.country=br.country=c.country=i.country=m.country=mm.country
    
    and 
    
    t.year=b.year=s.year=f.year=br.year=c.year=i.year=m.year=mm.year;   

quit;
    

exclude 3 variables with many missing values

Output: Total rows: 3382 and Total columns: 8


proc sql;

create table usr.gpm_nomiss as

select country, year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate from usr.gpm 

where country and year and tc and bmi and sbp and total_fertility and breast_cancer_deaths and infant_mortality_rate  is not missing;

quit;

Inner join

Output: Total rows: 3382 and Total columns: 11


proc sql;

create table usr.gpm_i as

select
    t.country, t.year, tc,
    bmi, 
    sbp,
    total_fertility,
    breast_cancer_deaths,
    contraceptive_use_percent,
    infant_mortality_rate,
    maternal_deaths,
    maternal_mortality_ratio
    
from
    usr.fm_tc_t_long as t 
    inner join usr.fm_bmi_t_long as b
    on (t.country=b.country and t.year=b.year)
    inner join usr.fm_sbp_t_long as s
    on (t.country=s.country and t.year=s.year)
    inner join usr.fm_fertility_t_long as f
    on (t.country=f.country and t.year=f.year)
    inner join usr.fm_brstc_death_t_long as br
    on (t.country=br.country and t.year=br.year)
    inner join usr.fm_contracept_use_t_long as c
    on (t.country=c.country and t.year=c.year)
    inner join usr.infant_mortality_t_long as i
    on (t.country=i.country and t.year=i.year)
    inner join usr.maternal_death_t_long as m
    on (t.country=m.country and t.year=m.year)
    inner join usr.maternal_mortality_ratio_t_long as mm
    on (t.country=mm.country and t.year=mm.year);
    
quit;

exclude 3 variables with many missing values

Output: Total rows: 3477 and Total columns: 8


proc sql;

create table usr.gpm_i_1 as

select
    t.country, t.year, tc,
    bmi, 
    sbp,
    total_fertility,
    breast_cancer_deaths,
    infant_mortality_rate
    
from
    usr.fm_tc_t_long as t 
    inner join usr.fm_bmi_t_long as b
    on (t.country=b.country and t.year=b.year)
    inner join usr.fm_sbp_t_long as s
    on (t.country=s.country and t.year=s.year)
    inner join usr.fm_fertility_t_long as f
    on (t.country=f.country and t.year=f.year)
    inner join usr.fm_brstc_death_t_long as br
    on (t.country=br.country and t.year=br.year)
    inner join usr.infant_mortality_t_long as i
    on (t.country=i.country and t.year=i.year);
    
quit;
    

Left join

Output: Total rows: 5481 and Total columns: 8


proc sql;

create table usr.gpm_l as

select
    t.country, t.year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate   
from
    usr.fm_tc_t_long as t 
    left join usr.fm_bmi_t_long as b
    on (t.country=b.country and t.year=b.year)
    left join usr.fm_sbp_t_long as s
    on (t.country=s.country and t.year=s.year)
    left join usr.fm_fertility_t_long as f
    on (t.country=f.country and t.year=f.year)
    left join usr.fm_brstc_death_t_long as br
    on (t.country=br.country and t.year=br.year)
    left join usr.infant_mortality_t_long as i
    on (t.country=i.country and t.year=i.year);
    
quit;

Right join

Output: Total rows: 41904 and Total columns: 8


proc sql;

create table usr.gpm_r as

select
    t.country, t.year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate   
from
    usr.fm_tc_t_long as t 
    right join usr.fm_bmi_t_long as b
    on (t.country=b.country and t.year=b.year)
    right join usr.fm_sbp_t_long as s
    on (t.country=s.country and t.year=s.year)
    right join usr.fm_fertility_t_long as f
    on (t.country=f.country and t.year=f.year)
    right join usr.fm_brstc_death_t_long as br
    on (t.country=br.country and t.year=br.year)
    right join usr.infant_mortality_t_long as i
    on (t.country=i.country and t.year=i.year);
    
quit;
    

exclude observations with missing values

Output: Total rows: 3477 and Total columns: 8


proc sql;

create table usr.gpm_r_nomiss as

select * from usr.gpm_r

where country and year and tc and bmi and sbp and total_fertility and breast_cancer_deaths and infant_mortality_rate is not missing;

quit;

Full join

Output: Total rows: 78408 and Total columns: 8


proc sql;

create table usr.gpm_f as

select
    t.country, t.year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate   
from
    usr.fm_tc_t_long as t 
    full join usr.fm_bmi_t_long as b
    on (t.country=b.country and t.year=b.year)
    full join usr.fm_sbp_t_long as s
    on (t.country=s.country and t.year=s.year)
    full join usr.fm_fertility_t_long as f
    on (t.country=f.country and t.year=f.year)
    full join usr.fm_brstc_death_t_long as br
    on (t.country=br.country and t.year=br.year)
    full join usr.infant_mortality_t_long as i
    on (t.country=i.country and t.year=i.year);
    
quit;
    

exclude observations with missing values

Output: Total rows: 3477 and Total columns: 8


proc sql;

create table usr.gpm_f_nomiss as

select * 
        
from usr.gpm_f

where country and year and tc and bmi and sbp and total_fertility and breast_cancer_deaths and infant_mortality_rate is not missing;

quit;