-SAS University Edition/ SAS Studio
-Download the following csv files from the gapminder web site (https://www.gapminder.org).
cholesterol_fat_in_blood_women_mmolperl.csv
body_mass_index_bmi_women_kgperm2.csv
blood_pressure_sbp_women_mmhg.csv
children_per_woman_total_fertility.csv
contraceptive_use_percent_of_women_ages_15_49.csv
breast_cancer_number_of_female_deaths.csv
infant_mortality_rate_per_1000_births.csv
maternal_deaths_total_number.csv
maternal_mortality_ratio_per_100000_live_births.csv
%let path=/folders/myshortcuts/datasets;
libname usr "&path";
Output: Total rows: 189 and Total columns: 30
proc import datafile="&path/cholesterol_fat_in_blood_women_mmolperl.csv" dbms=csv out=usr.fm_tc replace;
getnames=yes;
run;
Output: Total rows: 189 and Total columns: 30
proc import datafile="&path/body_mass_index_bmi_women_kgperm2.csv" dbms=csv out=usr.fm_bmi replace;
getnames=yes;
run;
Output: Total rows: 189 and Total columns: 30
proc import datafile="&path/blood_pressure_sbp_women_mmhg.csv" dbms=csv out=usr.fm_sbp replace;
getnames=yes;
run;
Output: Total rows: 184 and Total columns: 220
proc import datafile="&path/children_per_woman_total_fertility.csv" dbms=csv out=usr.fm_fertility replace;
getnames=yes;
run;
Output: Total rows: 185 and Total columns: 58.
Many missing values in this dataset.
Some numeric variables are imported as character variables.
proc import datafile="&path/contraceptive_use_percent_of_women_ages_15_49.csv" dbms=csv out=usr.fm_contracept_use replace;
getnames=yes;
run;
output: Total rows: 187 and Total columns: 28
proc import datafile="&path/breast_cancer_number_of_female_deaths.csv" dbms=csv out=usr.fm_brstc_death replace;
getnames=yes;
run;
Output: Total rows: 194 and Total columns: 217.
Many missing values in this dataset.
proc import datafile="&path/infant_mortality_rate_per_1000_births.csv" dbms=csv out=usr.infant_mortality replace;
getnames=yes;
run;
Output: Total rows: 183 and Total columns: 215.
Many missing valus in this dataset.
proc import datafile="&path/maternal_deaths_total_number.csv" dbms=csv out=usr.maternal_death replace;
getnames=yes;
run;
Output: Total rows: 187 and Total columns: 215
proc import datafile="&path/maternal_mortality_ratio_per_100000_live_births.csv" dbms=csv out=usr.maternal_mortality_ratio replace;
getnames=yes;
run;
-Done in two steps
-1st: transpose using `proc transpose`
-2nd: convert variables to their expected types (`numeric` or `character`) if not imported as the right type in a data step
Output: Total rows: 5481 and Total columns: 3
proc transpose data=usr.fm_tc out=usr.fm_tc_t (rename=(col1=tc));
by country;
var _1980-_2008;
run;
data usr.fm_tc_t_long;
set usr.fm_tc_t;
year=input(substr(_name_, 2, 4), 5.);
drop _name_;
run;
Output: Total rows: 5481 and Total columns: 3
proc transpose data=usr.fm_bmi out=usr.fm_bmi_t (rename=(col1=bmi));
by country;
var _1980-_2008;
run;
data usr.fm_bmi_t_long;
set usr.fm_bmi_t;
year=input(substr(_name_, 2, 4), 5.);
drop _name_;
run;
Output: Total rows: 5481 and Total columns: 3
proc transpose data=usr.fm_sbp out=usr.fm_sbp_t (rename=(col1=sbp));
by country;
var _1980-_2008;
run;
data usr.fm_sbp_t_long;
set usr.fm_sbp_t;
year=input(substr(_name_, 2, 4), 5.);
drop _name_;
run;
Output: Total rows: 40296 and Total columns: 3
proc transpose data=usr.fm_fertility out=usr.fm_fertility_t (rename=(col1=total_fertility));
by country;
var _1800-_2018;
run;
data usr.fm_fertility_t_long;
set usr.fm_fertility_t;
year=input(substr(_name_, 2, 4), 5.);
drop _name_;
run;
Output: Total rows: 5049 and Total columns: 3
proc transpose data=usr.fm_brstc_death out=usr.fm_brstc_death_t (rename=(col1=breast_cancer_deaths));
by country;
var _1990-_2016;
run;
data usr.fm_brstc_death_t_long;
set usr.fm_brstc_death_t;
year=input(substr(_name_, 2, 4), 5.);
drop _name_;
run;
Some numeric variables were imported as character variables.
After transposition all variables are character in nature.
Hence they needed approprite conversion.
Output: Total rows: 10545 and Total columns: 3.
proc transpose data=usr.fm_contracept_use out=usr.fm_contracept_use_t;
by country;
var _1961-_2017;
run;
data usr.fm_contracept_use_t_long;
set usr.fm_contracept_use_t;
year=input(substr(_name_, 2, 4), 5.);
contraceptive_use_percent=input(col1, best32.);
drop _name_ col1;
run;
Output: Total rows: 41904 and Total columns: 3
proc transpose data=usr.infant_mortality out=usr.infant_mortality_t;
by country;
var _1800-_2015;
run;
data usr.infant_mortality_t_long;
set usr.infant_mortality_t;
year=input(substr(_name_, 2, 4), 5.);
infant_mortality_rate=input(col1, best32.);
drop _name_ col1;
run;
Output: Total rows: 39162 and Total columns: 3
proc transpose data=usr.maternal_death out=usr.maternal_death_t;
by country;
var _1800-_2013;
run;
data usr.maternal_death_t_long;
set usr.maternal_death_t;
year=input(substr(_name_, 2, 4), 5.);
maternal_deaths=input(col1, best32.);
drop _name_ col1;
run;
Output: Total rows: 40018 and Total columns: 3
proc transpose data=usr.maternal_mortality_ratio out=usr.maternal_mortality_ratio_t;
by country;
var _1800-_2013;
run;
data usr.maternal_mortality_ratio_t_long;
set usr.maternal_mortality_ratio_t;
year=input(substr(_name_, 2, 4), 5.);
maternal_mortality_ratio=input(col1, best32.);
drop _name_ col1;
run;
Output: Total rows: 3382 and Total columns: 11
proc sql;
create table usr.gpm as
select
t.country, t.year, tc,
bmi,
sbp,
total_fertility,
breast_cancer_deaths,
contraceptive_use_percent,
infant_mortality_rate,
maternal_deaths,
maternal_mortality_ratio
from
usr.fm_tc_t_long as t,
usr.fm_bmi_t_long as b,
usr.fm_sbp_t_long as s,
usr.fm_fertility_t_long as f,
usr.fm_brstc_death_t_long as br,
usr.fm_contracept_use_t_long as c,
usr.infant_mortality_t_long as i,
usr.maternal_death_t_long as m,
usr.maternal_mortality_ratio_t_long as mm
where
t.country=b.country=s.country=f.country=br.country=c.country=i.country=m.country=mm.country
and
t.year=b.year=s.year=f.year=br.year=c.year=i.year=m.year=mm.year;
quit;
Output: Total rows: 3382 and Total columns: 8
proc sql;
create table usr.gpm_nomiss as
select country, year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate from usr.gpm
where country and year and tc and bmi and sbp and total_fertility and breast_cancer_deaths and infant_mortality_rate is not missing;
quit;
Output: Total rows: 3382 and Total columns: 11
proc sql;
create table usr.gpm_i as
select
t.country, t.year, tc,
bmi,
sbp,
total_fertility,
breast_cancer_deaths,
contraceptive_use_percent,
infant_mortality_rate,
maternal_deaths,
maternal_mortality_ratio
from
usr.fm_tc_t_long as t
inner join usr.fm_bmi_t_long as b
on (t.country=b.country and t.year=b.year)
inner join usr.fm_sbp_t_long as s
on (t.country=s.country and t.year=s.year)
inner join usr.fm_fertility_t_long as f
on (t.country=f.country and t.year=f.year)
inner join usr.fm_brstc_death_t_long as br
on (t.country=br.country and t.year=br.year)
inner join usr.fm_contracept_use_t_long as c
on (t.country=c.country and t.year=c.year)
inner join usr.infant_mortality_t_long as i
on (t.country=i.country and t.year=i.year)
inner join usr.maternal_death_t_long as m
on (t.country=m.country and t.year=m.year)
inner join usr.maternal_mortality_ratio_t_long as mm
on (t.country=mm.country and t.year=mm.year);
quit;
Output: Total rows: 3477 and Total columns: 8
proc sql;
create table usr.gpm_i_1 as
select
t.country, t.year, tc,
bmi,
sbp,
total_fertility,
breast_cancer_deaths,
infant_mortality_rate
from
usr.fm_tc_t_long as t
inner join usr.fm_bmi_t_long as b
on (t.country=b.country and t.year=b.year)
inner join usr.fm_sbp_t_long as s
on (t.country=s.country and t.year=s.year)
inner join usr.fm_fertility_t_long as f
on (t.country=f.country and t.year=f.year)
inner join usr.fm_brstc_death_t_long as br
on (t.country=br.country and t.year=br.year)
inner join usr.infant_mortality_t_long as i
on (t.country=i.country and t.year=i.year);
quit;
Output: Total rows: 5481 and Total columns: 8
proc sql;
create table usr.gpm_l as
select
t.country, t.year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate
from
usr.fm_tc_t_long as t
left join usr.fm_bmi_t_long as b
on (t.country=b.country and t.year=b.year)
left join usr.fm_sbp_t_long as s
on (t.country=s.country and t.year=s.year)
left join usr.fm_fertility_t_long as f
on (t.country=f.country and t.year=f.year)
left join usr.fm_brstc_death_t_long as br
on (t.country=br.country and t.year=br.year)
left join usr.infant_mortality_t_long as i
on (t.country=i.country and t.year=i.year);
quit;
Output: Total rows: 41904 and Total columns: 8
proc sql;
create table usr.gpm_r as
select
t.country, t.year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate
from
usr.fm_tc_t_long as t
right join usr.fm_bmi_t_long as b
on (t.country=b.country and t.year=b.year)
right join usr.fm_sbp_t_long as s
on (t.country=s.country and t.year=s.year)
right join usr.fm_fertility_t_long as f
on (t.country=f.country and t.year=f.year)
right join usr.fm_brstc_death_t_long as br
on (t.country=br.country and t.year=br.year)
right join usr.infant_mortality_t_long as i
on (t.country=i.country and t.year=i.year);
quit;
Output: Total rows: 3477 and Total columns: 8
proc sql;
create table usr.gpm_r_nomiss as
select * from usr.gpm_r
where country and year and tc and bmi and sbp and total_fertility and breast_cancer_deaths and infant_mortality_rate is not missing;
quit;
Output: Total rows: 78408 and Total columns: 8
proc sql;
create table usr.gpm_f as
select
t.country, t.year, tc, bmi, sbp, total_fertility, breast_cancer_deaths, infant_mortality_rate
from
usr.fm_tc_t_long as t
full join usr.fm_bmi_t_long as b
on (t.country=b.country and t.year=b.year)
full join usr.fm_sbp_t_long as s
on (t.country=s.country and t.year=s.year)
full join usr.fm_fertility_t_long as f
on (t.country=f.country and t.year=f.year)
full join usr.fm_brstc_death_t_long as br
on (t.country=br.country and t.year=br.year)
full join usr.infant_mortality_t_long as i
on (t.country=i.country and t.year=i.year);
quit;
Output: Total rows: 3477 and Total columns: 8
proc sql;
create table usr.gpm_f_nomiss as
select *
from usr.gpm_f
where country and year and tc and bmi and sbp and total_fertility and breast_cancer_deaths and infant_mortality_rate is not missing;
quit;