Resources

-SAS University Edition/ SAS Studio

-https://support.sas.com/en/knowledge-base.html

-http://support.sas.com/kb/40/700.html

-Download the following csv files from the gapminder web site (https://www.gapminder.org).

    cholesterol_fat_in_blood_women_mmolperl.csv
    body_mass_index_bmi_women_kgperm2.csv
    blood_pressure_sbp_women_mmhg.csv
    children_per_woman_total_fertility.csv
    contraceptive_use_percent_of_women_ages_15_49.csv
    breast_cancer_number_of_female_deaths.csv
    infant_mortality_rate_per_1000_births.csv
    maternal_deaths_total_number.csv
    maternal_mortality_ratio_per_100000_live_births.csv
    

Preliminary

-Some numeric variables will be presented as character variables following importation.

-They needed to be conveted to the appropriate variable types.

-It could be achived in many ways: e.g., group the numeric and character variables into two datasets, convert the char variables using input function and concatenate/merge both datasets. In another way, determine which variables are char and use an array to convert them into num variables.

-The example provided at http://support.sas.com/kb/40/700.html can be used to agnostically to process variable conversion. It would be helpful when there are many variables and it’s tedious to identify individual char variables.

-The codes in that example would convert the character variables in the test dataset to numeric variables in the output dataset test2.

-Use these codes as it is from proc contents onward but substitute country for id and your dataset name for test.

Assign a library


%let path=/folders/myshortcuts/datasets;

libname usr "&path";

Import csv files to create following datasets

-Some variables would be imported as character variables instead of numeric variables as would be normally expected.

-These could be converted back to numeric as described above. However, it would be done following the array processing as shown below.

usr.fm_tc

Output: Total rows: 189 and Total columns: 30


proc import datafile="&path/cholesterol_fat_in_blood_women_mmolperl.csv"  dbms=csv   out=usr.fm_tc replace;
getnames=yes;
run;

usr.fm_bmi

Output: Total rows: 189 and Total columns: 30


proc import datafile="&path/body_mass_index_bmi_women_kgperm2.csv"  dbms=csv out=usr.fm_bmi replace;
getnames=yes;
run;

usr.fm_sbp

Output: Total rows: 189 and Total columns: 30

proc import datafile="&path/blood_pressure_sbp_women_mmhg.csv"  dbms=csv  out=usr.fm_sbp replace;
getnames=yes;
run;

usr.fm_fertility

Output: Total rows: 184 and Total columns: 220


proc import datafile="&path/children_per_woman_total_fertility.csv"  dbms=csv out=usr.fm_fertility replace;
getnames=yes;
run;

usr.fm_brstc_death

output: Total rows: 187 and Total columns: 28


proc import datafile="&path/breast_cancer_number_of_female_deaths.csv"  dbms=csv out=usr.fm_brstc_death replace;
getnames=yes;
run;

usr.fm_contracept_use and usr.fm_contracept_use_num

    Output: Total rows: 185 and Total columns: 58.
    
    Many missing values in this dataset.
    
    *Some numeric variables are imported as character variables and needed variable conversion.
proc import datafile="&path/contraceptive_use_percent_of_women_ages_15_49.csv"  dbms=csv out=usr.fm_contracept_use replace;
getnames=yes;
run;

variable conversion


proc contents data=USR.FM_CONTRACEPT_USE out=vars(keep=name type);
run; 
 
data vars;                                                
   set vars;                                                 
   if type=2 and name ne 'country';                               
   newname=trim(left(name))||"_n"; 
run;                                                                              


options symbolgen;                                        

proc sql;                                         
   select trim(left(name)), trim(left(newname)),             
          trim(left(newname))||'='||trim(left(name))         
          into :c_list separated by ' ', :n_list separated by ' ',  
          :renam_list separated by ' '                         
          from vars;                                                
quit;                                                                                                               
 
data usr.FM_CONTRACEPT_USE_num;                                               
   set USR.FM_CONTRACEPT_USE;                                                 
   array ch(*) $ &c_list;                                    
   array nu(*) &n_list;                                      
   do i = 1 to dim(ch);                                      
      nu(i)=input(ch(i),8.);                                  
   end;                                                      
   drop i &c_list;     
   rename &renam_list;                                                                                      
run;  
          

usr.infant_mortality and usr.infant_mortality_num

    Output: Total rows: 194 and Total columns: 217.
    
    Many missing values in this dataset.
    
    *Some numeric variables are imported as character variables and need conversion to numeric ones.

proc import datafile="&path/infant_mortality_rate_per_1000_births.csv"  dbms=csv out=usr.infant_mortality replace;
getnames=yes;
run;


proc contents data=USR.infant_mortality out=vars(keep=name type);
run; 
 
data vars;                                                
   set vars;                                                 
   if type=2 and name ne 'country';                               
   newname=trim(left(name))||"_n"; 
run;                                                                              


options symbolgen;                                        

proc sql;                                         
   select trim(left(name)), trim(left(newname)),             
          trim(left(newname))||'='||trim(left(name))         
          into :c_list separated by ' ', :n_list separated by ' ',  
          :renam_list separated by ' '                         
          from vars;                                                
quit;                                                                                                               
 
data usr.infant_mortality_num;                                               
   set USR.infant_mortality;                                                 
   array ch(*) $ &c_list;                                    
   array nu(*) &n_list;                                      
   do i = 1 to dim(ch);                                      
      nu(i)=input(ch(i),8.);                                  
   end;                                                      
   drop i &c_list;     
   rename &renam_list;                                                                                      
run;  
include_graphics("infant_mortality_num.PNG")

usr.maternal_death and usr.maternal_death_num

    Output: Total rows: 183 and Total columns: 215.
    Many missing valus in this dataset.
    *Some numeric variables are imported as character variables.

proc import datafile="&path/maternal_deaths_total_number.csv"  dbms=csv  out=usr.maternal_death replace;
getnames=yes;
run;

variable conversion

proc contents data=usr.maternal_death out=vars(keep=name type);
run; 
 
data vars;                                                
   set vars;                                                 
   if type=2 and name ne 'country';                               
   newname=trim(left(name))||"_n"; 
run;                                                                              


options symbolgen;                                        

proc sql;                                         
   select trim(left(name)), trim(left(newname)),             
          trim(left(newname))||'='||trim(left(name))         
          into :c_list separated by ' ', :n_list separated by ' ',  
          :renam_list separated by ' '                         
          from vars;                                                
quit;                                                                                                               
 
data usr.maternal_death_num;                                               
   set usr.maternal_death;                                                 
   array ch(*) $ &c_list;                                    
   array nu(*) &n_list;                                      
   do i = 1 to dim(ch);                                      
      nu(i)=input(ch(i),8.);                                  
   end;                                                      
   drop i &c_list;     
   rename &renam_list;                                                                                      
run;  
include_graphics("maternal_death_num.PNG")

usr.maternal_mortality_ratio

    Output: Total rows: 187 and Total columns: 215
    Many missing valus in this dataset.
    *Some numeric variables are imported as character variables.

proc import datafile="&path/maternal_mortality_ratio_per_100000_live_births.csv"  dbms=csv  out=usr.maternal_mortality_ratio replace;
getnames=yes;
run;

variable conversion

proc contents data=usr.maternal_mortality_ratio out=vars(keep=name type);
run; 
 
data vars;                                                
   set vars;                                                 
   if type=2 and name ne 'country';                               
   newname=trim(left(name))||"_n"; 
run;                                                                              


options symbolgen;                                        

proc sql;                                         
   select trim(left(name)), trim(left(newname)),             
          trim(left(newname))||'='||trim(left(name))         
          into :c_list separated by ' ', :n_list separated by ' ',  
          :renam_list separated by ' '                         
          from vars;                                                
quit;                                                                                                               
 
data usr.maternal_mortality_ratio_num;                                               
   set usr.maternal_mortality_ratio;                                                 
   array ch(*) $ &c_list;                                    
   array nu(*) &n_list;                                      
   do i = 1 to dim(ch);                                      
      nu(i)=input(ch(i),8.);                                  
   end;                                                      
   drop i &c_list;     
   rename &renam_list;                                                                                      
run;  
include_graphics("maternal_mortality_ratio_num.PNG")

Transpose the datasets created above to long format using array statements

usr.fm_tc_long

Output: Total rows: 5481 and Total columns: 3


data usr.fm_tc_long;
set usr.fm_tc;
array widetolong{1980:2008} _1980-_2008;
do year = 1980 to 2008;
    tc = widetolong(year);
    output;
end;
drop _1980-_2008;
run;

usr.fm_bmi_long

Output: Total rows: 5481 and Total columns: 3


data usr.fm_bmi_long;
set usr.fm_bmi;
array widetolong{1980:2008} _1980-_2008;
do year = 1980 to 2008;
    bmi = widetolong(year);
    output;
end;
drop _1980-_2008;
run;

usr.fm_sbp_long

Output: Total rows: 5481 and Total columns: 3


data usr.fm_brstc_death_long;
set usr.fm_brstc_death;
array widetolong{1990:2016} _1990-_2016;
do year = 1990 to 2016;
    breast_cancer_deaths = widetolong(year);
    output;
end;
drop _1990-_2016;
run;

usr.fm_fertility_long

Output: Total rows: 40296 and Total columns: 3


data usr.fm_fertility_long;
set usr.fm_fertility;
array widetolong{1800:2018} _1800-_2018;
do year = 1800 to 2008;
    total_fertility = widetolong(year);
    output;
end;
drop _1800-_2018;
run;

usr.fm_brstc_death_long

Output: Total rows: 5049 and Total columns: 3


data usr.fm_brstc_death_long;
set usr.fm_brstc_death;
array widetolong{1990:2016} _1990-_2016;
do year = 1990 to 2016;
    breast_cancer_deaths = widetolong(year);
    output;
end;
drop _1990-_2016;
run;

usr.fm_contracept_use_long

    Some numeric variables were imported as character variables.
    After transposition all variables are character in nature.
    Hence they needed approprite conversion.

    Output: Total rows: 10545 and Total columns: 3.

proc transpose data=usr.fm_contracept_use out=usr.fm_contracept_use_t;
    by country;
    var _1961-_2017;
run;

usr.infant_mortality_long

Output: Total rows: 41904 and Total columns: 3


data usr.infant_mortality_long;
set usr.infant_mortality_num;
array widetolong{1800:2015} _1800-_2015;
do year = 1800 to 2015;
    infant_mortality_rate = widetolong(year);
    output;
end;
drop _1800-_2015;
run;

usr.maternal_death_long

Output: Total rows: 39162 and Total columns: 3


data usr.maternal_death_long;
set usr.maternal_death_num;
array widetolong{1800:2013} _1800-_2013;
do year = 1800 to 2013;
    maternal_deaths = widetolong(year);
    output;
end;
drop _1800-_2013;
run;

usr.maternal_mortality_ratio_long

Output: Total rows: 40018 and Total columns: 3


data usr.maternal_mortality_ratio_long;
set usr.maternal_mortality_ratio_num;
array widetolong{1800:2013} _1800-_2013;
do year = 1800 to 2013;
    maternal_mortality_ratio = widetolong(year);
    output;
end;
drop _1800-_2013;
run;

Join the tables as needed

Inner join using the where clause

Output: Total rows: 3382 and Total columns: 11


proc sql;

create table usr.gpm_a as

select
    t.country, t.year, tc,
    bmi, 
    sbp,
    total_fertility,
    breast_cancer_deaths,
    contraceptive_use_percent,
    infant_mortality_rate,
    maternal_deaths,
    maternal_mortality_ratio
    
from
    usr.fm_tc_long as t,
    usr.fm_bmi_long as b,
    usr.fm_sbp_long as s,
    usr.fm_fertility_long as f,
    usr.fm_brstc_death_long as br,
    usr.fm_contracept_use_long as c,
    usr.infant_mortality_long as i,
    usr.maternal_death_long as m,
    usr.maternal_mortality_ratio_long as mm
    
where 
    t.country=b.country=s.country=f.country=br.country=c.country=i.country=m.country=mm.country
    
    and 
    
    t.year=b.year=s.year=f.year=br.year=c.year=i.year=m.year=mm.year;   

quit;
    

exclude 3 variables with many missing values

Output: Total rows: 3382 and Total columns: 8


proc sql;

create table usr.gpm_a_nomiss as

select country, year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate from usr.gpm 

where country and year and tc and bmi and sbp and total_fertility and breast_cancer_deaths and infant_mortality_rate  is not missing;

quit;

Inner join

Output: Total rows: 3382 and Total columns: 11


proc sql;

create table usr.gpm_a_i as

select
    t.country, t.year, tc,
    bmi, 
    sbp,
    total_fertility,
    breast_cancer_deaths,
    contraceptive_use_percent,
    infant_mortality_rate,
    maternal_deaths,
    maternal_mortality_ratio
    
from
    usr.fm_tc_long as t 
    inner join usr.fm_bmi_long as b
    on (t.country=b.country and t.year=b.year)
    inner join usr.fm_sbp_long as s
    on (t.country=s.country and t.year=s.year)
    inner join usr.fm_fertility_long as f
    on (t.country=f.country and t.year=f.year)
    inner join usr.fm_brstc_death_long as br
    on (t.country=br.country and t.year=br.year)
    inner join usr.fm_contracept_use_long as c
    on (t.country=c.country and t.year=c.year)
    inner join usr.infant_mortality_long as i
    on (t.country=i.country and t.year=i.year)
    inner join usr.maternal_death_long as m
    on (t.country=m.country and t.year=m.year)
    inner join usr.maternal_mortality_ratio_long as mm
    on (t.country=mm.country and t.year=mm.year);
    
quit;

exclude 3 variables with many missing values

Output: Total rows: 3477 and Total columns: 8


proc sql;

create table usr.gpm_a_i_1 as

select
    t.country, t.year, tc,
    bmi, 
    sbp,
    total_fertility,
    breast_cancer_deaths,
    infant_mortality_rate
    
from
    usr.fm_tc_long as t 
    inner join usr.fm_bmi_long as b
    on (t.country=b.country and t.year=b.year)
    inner join usr.fm_sbp_long as s
    on (t.country=s.country and t.year=s.year)
    inner join usr.fm_fertility_long as f
    on (t.country=f.country and t.year=f.year)
    inner join usr.fm_brstc_death_long as br
    on (t.country=br.country and t.year=br.year)
    inner join usr.infant_mortality_long as i
    on (t.country=i.country and t.year=i.year);
    
quit;
    

Left join

Output: Total rows: 5481 and Total columns: 8


proc sql;

create table usr.gpm_a_l as

select
    t.country, t.year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate   
from
    usr.fm_tc_long as t 
    left join usr.fm_bmi_long as b
    on (t.country=b.country and t.year=b.year)
    left join usr.fm_sbp_long as s
    on (t.country=s.country and t.year=s.year)
    left join usr.fm_fertility_long as f
    on (t.country=f.country and t.year=f.year)
    left join usr.fm_brstc_death_long as br
    on (t.country=br.country and t.year=br.year)
    left join usr.infant_mortality_long as i
    on (t.country=i.country and t.year=i.year);
    
quit;

Right join

Output: Total rows: 41904 and Total columns: 8


proc sql;

create table usr.gpm_a_r as

select
    t.country, t.year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate   
from
    usr.fm_tc_long as t 
    right join usr.fm_bmi_long as b
    on (t.country=b.country and t.year=b.year)
    right join usr.fm_sbp_long as s
    on (t.country=s.country and t.year=s.year)
    right join usr.fm_fertility_long as f
    on (t.country=f.country and t.year=f.year)
    right join usr.fm_brstc_death_long as br
    on (t.country=br.country and t.year=br.year)
    right join usr.infant_mortality_long as i
    on (t.country=i.country and t.year=i.year);
    
quit;
    

exclude observations with missing values

Output: Total rows: 3477 and Total columns: 8


proc sql;

create table usr.gpm_a_r_nomiss as

select * from usr.gpm_a_r

where country and year and tc and bmi and sbp and total_fertility and breast_cancer_deaths and infant_mortality_rate is not missing;

quit;

Full join

Output: Total rows: 76568 and Total columns: 8


proc sql;

create table usr.gpm_a_f as

select
    t.country, t.year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate   
from
    usr.fm_tc_long as t 
    full join usr.fm_bmi_long as b
    on (t.country=b.country and t.year=b.year)
    full join usr.fm_sbp_long as s
    on (t.country=s.country and t.year=s.year)
    full join usr.fm_fertility_long as f
    on (t.country=f.country and t.year=f.year)
    full join usr.fm_brstc_death_long as br
    on (t.country=br.country and t.year=br.year)
    full join usr.infant_mortality_long as i
    on (t.country=i.country and t.year=i.year);
    
quit;
    

exclude observations with missing values

Output: Total rows: 3477 and Total columns: 8


proc sql;

create table usr.gpm_a_f_nomiss as

select * 
        
from usr.gpm_a_f

where country and year and tc and bmi and sbp and total_fertility and breast_cancer_deaths and infant_mortality_rate is not missing;

quit;