Notes made while working through Biostatistics by Example Using SAS Studio by Ron Cody, using the SAS programmer interface. All SAS content and materials belong to their respective owners, not me.

R will then be used to complete representative activities.

Chapters 1 - 4: Data Import Basics

Read in a built-in dataset, import from text and CSV files, and work with various delimiters and header profiles.

SAS Code, click to expand

SAS Code:

/* (&path1 and biostats libname previously defined) */
/* copy built-in SAS dataset to an external file for download and use with R later */
/* run once only */
/* validvarname v6 for mainframe and v9 for R or Unix */
/* options validvarname=v6;   */
/* options validvarname=v9;   */
/*  proc copy in=sashelp out=biostats;   */
/*    select heart fish bweight;   */
/*    run;   */
/* options validvarname=v7; */

/* temp one time run
proc copy in=sashelp out=biostats;  
  select bweight; 
  run; 
*/

/*** Chapters 1 - 4: Data Import Basics ***/

/* Explore the built-in SASHELP.HEART data set: list its variables, print the
   first rows, print rows ranked by descending systolic pressure (all, then
   only Height > 70), tabulate Height, and drop the temporary sorted copy. */

/* Variable list in creation order */
title1 "sample built-in dataset heart";
proc contents
    data=sashelp.heart
    order=varnum;
run;

/* First 25 observations as stored */
title1 "sample built-in dataset heart sample";
proc print u
    data=sashelp.heart (obs=25);
run;

/* Sorted copy in WORK so SASHELP itself is never modified */
proc sort
    data=sashelp.heart
    out=work.heart_temp;
  by descending Systolic;
run;

/* Top 25 rows by systolic pressure, selected columns only */
title1 "sample built-in dataset heart sample decreasing Systolic";
proc print u
    data=work.heart_temp (keep=Sex Height Systolic Diastolic Weight obs=25);
run;

/* Same ranking restricted to Height > 70; the first 25 rows that pass the
   WHERE= filter are printed, so Obs numbers refer to the sorted data set */
title1 "sample built-in dataset heart sample decreasing Systolic Height > 70";
proc print u
    data=work.heart_temp (keep=Sex Height Systolic Diastolic Weight
                          where=(Height > 70)
                          obs=25);
run;

/* One-way frequency of Height, with missing values counted as a level */
title1 "sample built-in dataset heart";
proc freq data=sashelp.heart;
  tables height / missing;
run;

/* Clear the title and remove the temporary sorted copy */
title1;
proc delete data=work.heart_temp;
run;

/* Import the Grades workbook (XLSX), show its variables and rows, then drop
   the temporary data set.
   &path1 is defined outside this section; the reference assumes its value
   already ends with a path separator -- TODO confirm.
   The quoted path carries no leading blank, matching the INFILE paths used
   elsewhere in this program. */
proc import datafile="&path1.Grades.xlsx"
    dbms=xlsx
    out=work.grades_xlsx replace;
    getnames=yes;  /* first row supplies the variable names */
run;
title1 "sample imported dataset grades";
proc contents data=work.grades_xlsx order=varnum; run;
proc print u data=work.grades_xlsx (obs=25); run;
title1;
proc delete data=work.grades_xlsx; run;

/* Import Grades2.xlsx, whose column headers are not valid SAS names, then
   print it once as imported and once with friendlier names.
   &path1 is defined outside this section; the reference assumes its value
   already ends with a path separator -- TODO confirm.
   The quoted path carries no leading blank, matching the INFILE paths used
   elsewhere in this program. */
options validvarname=v7;  /* need to force imports to sas naming standards */
proc import datafile="&path1.Grades2.xlsx"
    dbms=xlsx
    out=work.grades2_xlsx replace;
    getnames=yes;  /* first row supplies the variable names */
run;
title1 "sample imported dataset grades2";
proc contents data=work.grades2_xlsx order=varnum; run;
proc print u data=work.grades2_xlsx (obs=25); run;
/* RENAME= applies to this print only; the stored data set keeps the imported
   names. Stuent_Name is the (misspelled) name as imported from the workbook,
   so it must be referenced exactly as-is on the left-hand side. */
title1 "sample imported dataset grades2 rename";
proc print u data=work.grades2_xlsx (obs=25 rename=(Stuent_Name=Student_Name _2015Final = Final_2015)); run;
title1;
proc delete data=work.grades2_xlsx; run;

/* Import the Grades CSV file, show its variables and rows, then drop the
   temporary data set.
   &path1 is defined outside this section; the reference assumes its value
   already ends with a path separator -- TODO confirm.
   The quoted path carries no leading blank, matching the INFILE paths used
   elsewhere in this program. */
proc import datafile="&path1.Grades.csv"
    dbms=csv
    out=work.grades_csv replace;
    getnames=yes;  /* first row supplies the variable names */
run;
title1 "sample imported dataset grades csv";
proc contents data=work.grades_csv order=varnum; run;
proc print u data=work.grades_csv (obs=25); run;
title1;
proc delete data=work.grades_csv; run;

/* Import the space-delimited Health_List text file (no header row) with
   PROC IMPORT. This is kept as a deliberate "what not to do" example: the
   DATA step list-input versions that follow read the same file cleanly.
   REPLACE is specified once (the original statement repeated it). */
proc import datafile="&path1.Health_List.txt"
    dbms=dlm
    out=work.healthlist_txt replace;
    getnames=no;       /* no header row: variables are named VAR1, VAR2, ... */
    delimiter='20'x;   /* hex 20 = a single blank */
run;
title1 "sample imported dataset healthlist txt proc import terrible";
proc contents data=work.healthlist_txt order=varnum; run;
proc print u data=work.healthlist_txt (obs=25); run;

/* Preferred approach: DATA step list input, which copes with runs of
   consecutive delimiters. Gender is read as character with the default
   length; every other field is numeric. */
data work.healthlist_txt;
  infile "&path1.Health_List.txt";
  input Subj
        Gender $
        Age
        Heart_Rate
        SBP
        DBP
        Chol;
  label
    Subj       = "Subject Number"
    Gender     = "Gender (M or F)"
    Age        = "Age in years"
    Heart_Rate = "Heart rate"
    SBP        = "Systolic blood pressure"
    DBP        = "Diastolic blood pressure"
    Chol       = "Total cholesterol";
run;

/* Show structure and rows, then clean up */
title1 "sample imported dataset healthlist txt list input is better";
proc contents
    data=work.healthlist_txt
    order=varnum;
run;
proc print u
    data=work.healthlist_txt (obs=25);
run;
title1;
proc delete data=work.healthlist_txt;
run;

/* Same file again, this time naming the delimiter explicitly and fixing the
   length of Gender up front. Because LENGTH is the first statement to mention
   Gender, Gender becomes the first variable in the output data set. */
data work.healthlist_txt;
  /* character length must be declared before INPUT sees the variable */
  length Gender $ 1;
  /* hex 20 is a blank; a tab would be '09'x */
  infile "&path1.Health_List.txt" delimiter='20'x;
  input Subj
        Gender $
        Age
        Heart_Rate
        SBP
        DBP
        Chol;
  label
    Subj       = "Subject Number"
    Gender     = "Gender (M or F)"
    Age        = "Age in years"
    Heart_Rate = "Heart rate"
    SBP        = "Systolic blood pressure"
    DBP        = "Diastolic blood pressure"
    Chol       = "Total cholesterol";
run;

/* Show structure and rows, then clean up */
title1 "sample import using list input treat mult delim as one";
proc contents
    data=work.healthlist_txt
    order=varnum;
run;
proc print u
    data=work.healthlist_txt (obs=25);
run;
title1;
proc delete data=work.healthlist_txt;
run;

/* Read the comma-separated Health file with DATA step list input.
   The DSD option on INFILE is what makes this a CSV read. */
data work.health_csv;
  infile "&path1.Health.csv" dsd;
  input Subj
        Gender $
        Age
        Heart_Rate
        SBP
        DBP
        Chol;
  label
    Subj       = "Subject Number"
    Gender     = "Gender (M or F)"
    Age        = "Age in years"
    Heart_Rate = "Heart rate"
    SBP        = "Systolic blood pressure"
    DBP        = "Diastolic blood pressure"
    Chol       = "Total cholesterol";
run;

/* Show structure and rows, then clean up */
title1 "sample imported dataset health csv using list input";
proc contents
    data=work.health_csv
    order=varnum;
run;
proc print u
    data=work.health_csv (obs=25);
run;
title1;
proc delete data=work.health_csv;
run;

/* Read the fixed-column Health text file with formatted input.
   Column layout used below:
     Subj 1-3, Gender 4, Age 5-6, Heart_Rate 7-8, SBP 9-11, DBP 12-14, Chol 15-17.
   Chol starts in column 15. The earlier pointer @16 skipped its leading digit,
   so 128 was read as 28 and 220 as 20 (compare with the list-input reads of
   the same subjects). */
data work.health_txt;
  /* PAD extends short records with blanks so trailing fields read as missing */
  infile "&path1.Health.txt" pad;
  input 
    @1 Subj 3.
    @4 Gender $1.
    @5 Age 2.
    @7 Heart_Rate 2.
    @9 SBP 3.
    @12 DBP 3.
    @15 Chol 3.;
  label Subj = "Subject Number"
    Gender = "Gender (M or F)"
    Age = "Age in years"
    Heart_Rate = "Heart rate"
    SBP = "Systolic blood pressure"
    DBP = "Diastolic blood pressure"
    Chol = "Total cholesterol";
  run;
title1 "sample imported dataset health txt fixed columns";
proc contents data=work.health_txt order=varnum; run;
proc print u data=work.health_txt (obs=25); run;
title1;
proc delete data=work.health_txt; run;

/* Read the tab-delimited Blood_Pressure text file with list input.
   Character lengths are declared first so Drug and Gender are not given the
   default character length. */
data work.bpressure_txt;
  length Drug   $ 7
         Gender $ 1;
  /* '09'x is the tab character */
  infile "&path1.Blood_Pressure.txt" delimiter='09'x dsd;
  input Drug $
        Subj
        Gender $
        SBP
        DBP;
  label
    Drug   = "Drug"
    Subj   = "Subject Number"
    Gender = "Gender (M or F)"
    SBP    = "Systolic blood pressure"
    DBP    = "Diastolic blood pressure";
run;

/* Show structure and rows, then clean up */
title1 "sample import blood pressure tab delim text file";
proc contents
    data=work.bpressure_txt
    order=varnum;
run;
proc print u
    data=work.bpressure_txt (obs=25);
run;
title1;
proc delete data=work.bpressure_txt;
run;

/* Import external files for exercises found in other chapters - start */

/* Distribution Practice 5-3 */
/* Import the Blood_Pressure workbook into WORK for use in later exercises;
   the data set is intentionally left in place.
   &path1 is defined outside this section; the reference assumes its value
   already ends with a path separator -- TODO confirm.
   The quoted path carries no leading blank, matching the INFILE paths used
   elsewhere in this program. */
options validvarname=v7;  /* need to force imports to sas naming standards */
proc import datafile="&path1.Blood_Pressure.xlsx"
    dbms=xlsx
    out=work.blood_pressure replace;
    getnames=yes;  /* first row supplies the variable names */
run;
title1 "Blood Pressure Distribution Practice 5-3";
proc contents data=work.blood_pressure order=varnum; run;
proc print u data=work.blood_pressure (obs=25); run;

/*** Chapter 6: One-Sample Tests ***/
/* Performing a One-Sample t Test */
/* Import the Perch workbook into WORK; the data set is intentionally left in
   place for the later exercise.
   &path1 is defined outside this section; the reference assumes its value
   already ends with a path separator -- TODO confirm.
   The quoted path carries no leading blank, matching the INFILE paths used
   elsewhere in this program. */
options validvarname=v7;  /* need to force imports to sas naming standards */
proc import datafile="&path1.Perch.xlsx"
    dbms=xlsx
    out=work.perch replace;
    getnames=yes;  /* first row supplies the variable names */
run;
title1 "Importing Perch data";
proc contents data=work.perch order=varnum; run;
proc print u data=work.perch (obs=25); run;

/* Paired t Test */
/* Import the Yoga workbook into WORK; the data set is intentionally left in
   place for the later exercise.
   &path1 is defined outside this section; the reference assumes its value
   already ends with a path separator -- TODO confirm.
   The quoted path carries no leading blank, matching the INFILE paths used
   elsewhere in this program. */
options validvarname=v7;  /* need to force imports to sas naming standards */
proc import datafile="&path1.Yoga.xlsx"
    dbms=xlsx
    out=work.yoga replace;
    getnames=yes;  /* first row supplies the variable names */
run;
title1 "Importing Yoga data";
proc contents data=work.yoga order=varnum; run;
proc print u data=work.yoga (obs=25); run;

/* clear the title once the import listings are done */
title1;
/* proc delete data=work.blood_pressure; run; */
/* proc delete data=work.perch; run; */
/* proc delete data=work.yoga; run; */

/* Practice: One-way ANOVA with test for Tukey multiple comparisons, CHF - generated code - start 8.6 */
/* Build WORK.CONGESTIVE_HEART_FAILURE from in-stream data: three treatment
   groups with ten subjects each, one LVEF value per subject.
   The data lines are in group order, matching the outer DO loop: */
/* Placebo: 55 58 62 48 57 57 80 40 55 52 */ 
/* Calcium: 57 65 55 78 57 84 72 80 78 81 */ 
/* Lasix:   60 60 65 67 48 62 64 70 57 40 */ 
data work.congestive_heart_failure;
  do Group = 'Placebo','Calcium','Lasix';
    do Subj = 1 to 10;
      /* trailing @@ holds the line so successive values are read from it */
      input LVEF @@;
      output; 
    end;
  end;
datalines;
  55 58 62 48 57 57 80 40 55 52
  57 65 55 78 57 84 72 80 78 81
  60 60 65 67 48 62 64 70 57 40
    ; 

/* Title the listing, then clear the title. Previously the title was cleared
   before PROC PRINT ran, so it never appeared on any output. */
title1 "Practice: One-way ANOVA with test for Tukey multiple comparisons, CHF";
proc print u data=work.congestive_heart_failure ; run; 
title1;
/* proc delete data=work.congestive_heart_failure; run; */

/* Practice: One-way ANOVA with test for Tukey multiple comparisons, CHF - generated code - end 8.6 */

/* Chapter 9: create sample dataset for Performing a Two-Way Analysis of Variance - generated code - start */

/* Draw a 25 percent simple random sample (without replacement) of
   BIOSTATS.BWEIGHT into WORK.Birth_Wt_Sample; the fixed seed 13579 makes the
   draw repeatable. */
proc surveyselect
    data=biostats.bweight
    out=work.Birth_Wt_Sample
    method=srs
    samprate=0.25
    seed=13579;
run;

/* The SASHELP library contains the Bweight dataset containing birth weights for 50,000 babies, along with several variables believed to be related to birth weight, such as race (coded as black or not black), mother's smoking status (smoking or non-smoking), and marital status.
Weight (in grams) is the Dependent variable
Black (0 = not black, 1=black)
MomSmoke (0=no, 1=yes)
Married (0=no, 1=yes) */

/* Chapter 9: create sample dataset for Performing a Two-Way Analysis of Variance - generated code - end */


/*** Chapter 10: Correlation ***/
/* Statistics Correlation */
/* Import the Exercise workbook (legacy XLS format) into WORK; the data set is
   intentionally left in place for the later exercise.
   &path1 is defined outside this section; the reference assumes its value
   already ends with a path separator -- TODO confirm.
   The quoted path carries no leading blank, matching the INFILE paths used
   elsewhere in this program. */
proc import datafile="&path1.Exercise.xls"
    dbms=xls
    out=work.exercise replace;
    getnames=yes;  /* first row supplies the variable names */
run;
title1 "Importing Exercise data";
proc contents data=work.exercise order=varnum; run;
proc print u data=work.exercise (obs=25); run;

/* Chapter 12: Binary Logistic Regression data prep - start */
/* Derive a binary weight group from the previously created birth-weight
   sample, using the 3402 gram cutoff (noted by the author as the median).
   Rows with a missing Weight are excluded.
     Wt_Group = 1 : Weight below 3402 (lower weight group)
     Wt_Group = 0 : Weight of 3402 or more */
title1 "Logistic Regression data prep Birth_Wt_Sample to High_Low";
data WORK.High_Low;
   set WORK.Birth_Wt_Sample (where=(Weight is not missing));
   /* a SAS comparison evaluates to 1 when true and 0 when false */
   Wt_Group = (Weight < 3402);
run;

/* Summary statistics and a 10-row peek at the result */
proc means data=WORK.High_Low;
run;
proc print U
    data=WORK.High_Low (obs=10);
run;
/* Chapter 12: Binary Logistic Regression data prep - end */

/* Chapter 13: Prepare generated dataset Heart_Attack - start */
/* Import the Heart_Attack workbook into WORK, print ten rows, and tabulate
   the four categorical variables before any formats are attached.
   &path1 is defined outside this section; the reference assumes its value
   already ends with a path separator -- TODO confirm.
   The quoted path carries no leading blank, matching the INFILE paths used
   elsewhere in this program. */
title1 "Import generated dataset Heart_Attack";
proc import datafile="&path1.Heart_Attack.xlsx"
    dbms=xlsx
    out=work.Heart_Attack replace;
    getnames=yes;  /* first row supplies the variable names */
run;
proc print u data=work.Heart_Attack (obs=10); run;
proc freq data=work.Heart_Attack ;
  tables Gender Age_Group High_Chol Heart_Attack
  /list missing;
  run;
/* Define display formats for the Heart_Attack variables, attach them to the
   data set, then print ten rows and re-run the frequency tables. */
title1 "Sample Observations from Formatted Heart_Attack";
proc format;
   /* character format for the Gender codes */
   value $gender
      'F' = 'Female'
      'M' = 'Male';
   /* shared 0/1 format, used for Heart_Attack and High_Chol */
   value Yesno
      0 = 'No'
      1 = 'Yes';
   /* age band codes */
   value Age_Group
      1 = '< 60'
      2 = '60-70'
      3 = '71+';
run;

/* Rewrite the data set in place with the formats attached */
data work.Heart_Attack;
   set work.Heart_Attack;
   format
      Gender                 $Gender.
      Heart_Attack High_Chol Yesno.
      Age_Group              Age_Group.;
run;

proc print u
    data=work.Heart_Attack (obs=10);
run;
proc freq data=work.Heart_Attack;
  tables Gender Age_Group High_Chol Heart_Attack / list missing;
run;

/* Redefine the same formats with numeric prefixes in the labels, so that
   ORDER=FORMATTED in PROC FREQ lists '1:...' before '2:...' (Yes before No,
   Male before Female). The Age_Group labels are unchanged. */
title1 "Sample Observations from Reformatted Heart_Attack";
proc format;
   value $gender
      'F' = '2:Female'
      'M' = '1:Male';
   value Yesno
      0 = '2:No'
      1 = '1:Yes';
   value Age_Group
      1 = '< 60'
      2 = '60-70'
      3 = '71+';
run;

/* Re-attach the formats by name, as in the first pass */
data work.Heart_Attack;
   set work.Heart_Attack;
   format
      Gender                 $Gender.
      Heart_Attack High_Chol Yesno.
      Age_Group              Age_Group.;
run;

proc print u
    data=work.Heart_Attack (obs=10);
run;

/* Levels are listed in order of their formatted labels */
proc freq
    data=work.Heart_Attack
    order=formatted;
  tables Gender Age_Group High_Chol Heart_Attack / list missing;
run;

/* clear the title */
title1;
/* Chapter 13: Prepare generated dataset Heart_Attack - end */

title1;
/* Import external files for exercises found in other chapters - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

sample built-in dataset heart

The CONTENTS Procedure

The Contents Procedure

SASHELP.HEART

Attributes


Data Set Name	SASHELP.HEART	Observations	5209
Member Type	DATA	Variables	17
Engine	V9	Indexes	0
Created	10/24/2018 21:21:26	Observation Length	168
Last Modified	10/24/2018 21:21:26	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label	Framingham Heart Study
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	us-ascii ASCII (ANSI)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	65536
Number of Data Set Pages	14
First Data Page	1
Max Obs per Page	389
Obs in First Data Page	365
Number of Data Set Repairs	0
Filename	/pbr/sfw/sas/940/SASFoundation/9.4/sashelp/heart.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	137812
Access Permission	rw-r--r--
Owner Name	odaowner
File Size	960KB
File Size (bytes)	983040

Varnum


Variables in Creation Order
#	Variable	Type	Len	Label
1	Status	Char	5
2	DeathCause	Char	26	Cause of Death
3	AgeCHDdiag	Num	8	Age CHD Diagnosed
4	Sex	Char	6
5	AgeAtStart	Num	8	Age at Start
6	Height	Num	8
7	Weight	Num	8
8	Diastolic	Num	8
9	Systolic	Num	8
10	MRW	Num	8	Metropolitan Relative Weight
11	Smoking	Num	8
12	AgeAtDeath	Num	8	Age at Death
13	Cholesterol	Num	8
14	Chol_Status	Char	10	Cholesterol Status
15	BP_Status	Char	7	Blood Pressure Status
16	Weight_Status	Char	11	Weight Status
17	Smoking_Status	Char	17	Smoking Status

sample built-in dataset heart sample

The Print Procedure

Data Set SASHELP.HEART


Obs	Status	DeathCause	AgeCHDdiag	Sex	AgeAtStart	Height	Weight	Diastolic	Systolic	MRW	Smoking	AgeAtDeath	Cholesterol	Chol_Status	BP_Status	Weight_Status	Smoking_Status
1	Dead	Other	.	Female	29	62.50	140	78	124	121	0	55	.		Normal	Overweight	Non-smoker
2	Dead	Cancer	.	Female	41	59.75	194	92	144	183	0	57	181	Desirable	High	Overweight	Non-smoker
3	Alive		.	Female	57	62.25	132	90	170	114	10	.	250	High	High	Overweight	Moderate (6-15)
4	Alive		.	Female	39	65.75	158	80	128	123	0	.	242	High	Normal	Overweight	Non-smoker
5	Alive		.	Male	42	66.00	156	76	110	116	20	.	281	High	Optimal	Overweight	Heavy (16-25)
6	Alive		.	Female	58	61.75	131	92	176	117	0	.	196	Desirable	High	Overweight	Non-smoker
7	Alive		.	Female	36	64.75	136	80	112	110	15	.	196	Desirable	Normal	Overweight	Moderate (6-15)
8	Dead	Other	.	Male	53	65.50	130	80	114	99	0	77	276	High	Normal	Normal	Non-smoker
9	Alive		.	Male	35	71.00	194	68	132	124	0	.	211	Borderline	Normal	Overweight	Non-smoker
10	Dead	Cerebral Vascular Disease	.	Male	52	62.50	129	78	124	106	5	82	284	High	Normal	Normal	Light (1-5)
11	Alive		.	Male	39	66.25	179	76	128	133	30	.	225	Borderline	Normal	Overweight	Very Heavy (> 25)
12	Alive		57	Male	33	64.25	151	68	108	118	0	.	221	Borderline	Optimal	Overweight	Non-smoker
13	Alive		55	Male	33	70.00	174	90	142	114	0	.	188	Desirable	High	Overweight	Non-smoker
14	Alive		79	Male	57	67.25	165	76	128	118	15	.	.		Normal	Overweight	Moderate (6-15)
15	Alive		66	Male	44	69.00	155	90	130	105	30	.	292	High	High	Normal	Very Heavy (> 25)
16	Alive		.	Female	37	64.50	134	76	120	108	10	.	196	Desirable	Normal	Normal	Moderate (6-15)
17	Alive		.	Male	40	66.25	151	72	132	112	30	.	192	Desirable	Normal	Overweight	Very Heavy (> 25)
18	Dead	Cancer	56	Male	56	67.25	122	72	120	87	15	72	194	Desirable	Normal	Underweight	Moderate (6-15)
19	Alive		.	Female	42	67.75	162	96	138	119	1	.	200	Borderline	High	Overweight	Light (1-5)
20	Dead	Coronary Heart Disease	74	Male	46	66.50	157	84	142	116	30	76	233	Borderline	High	Overweight	Very Heavy (> 25)
21	Alive		.	Female	37	66.25	148	78	110	112	15	.	192	Desirable	Optimal	Overweight	Moderate (6-15)
22	Alive		.	Female	45	64.00	147	74	120	119	5	.	209	Borderline	Normal	Overweight	Light (1-5)
23	Alive		.	Female	59	65.75	156	74	156	122	0	.	200	Borderline	High	Overweight	Non-smoker
24	Alive		.	Female	36	63.75	122	84	132	102	0	.	184	Desirable	Normal	Normal	Non-smoker
25	Alive		.	Female	50	67.50	185	88	150	136	15	.	228	Borderline	High	Overweight	Moderate (6-15)

sample built-in dataset heart sample decreasing Systolic

The Print Procedure

Data Set WORK.HEART_TEMP


Obs	Sex	Height	Weight	Diastolic	Systolic
1	Female	62.75	228	150	300
2	Female	62.00	153	144	294
3	Female	61.75	144	124	290
4	Female	60.00	204	104	286
5	Female	59.75	189	145	280
6	Male	66.50	186	112	276
7	Female	63.75	115	124	272
8	Female	60.75	202	135	270
9	Male	64.00	168	130	260
10	Female	61.00	200	145	260
11	Female	58.75	152	140	250
12	Male	65.00	155	140	250
13	Female	63.75	154	130	250
14	Female	59.50	113	115	250
15	Female	60.25	271	130	246
16	Female	60.25	235	134	246
17	Male	66.75	174	110	246
18	Female	62.75	241	150	242
19	Female	60.25	171	130	240
20	Female	61.25	150	138	240
21	Female	62.25	198	120	240
22	Female	59.00	172	112	240
23	Female	64.00	217	130	240
24	Female	64.00	129	140	236
25	Female	62.50	145	114	236

sample built-in dataset heart sample decreasing Systolic Height > 70

The Print Procedure

Data Set WORK.HEART_TEMP


Obs	Sex	Height	Weight	Diastolic	Systolic
60	Male	70.50	229	126	216
106	Male	73.50	178	140	200
115	Male	72.00	227	110	200
149	Male	70.50	226	130	190
150	Male	72.00	191	110	190
181	Male	70.50	172	96	188
260	Male	72.25	273	120	180
291	Male	71.50	215	105	180
308	Male	73.50	182	100	178
320	Male	70.25	183	100	176
345	Male	70.50	181	120	175
392	Male	71.00	191	100	172
399	Male	70.50	180	120	172
402	Male	70.75	198	104	172
428	Male	70.50	203	120	170
461	Male	71.25	189	115	170
472	Male	70.25	133	55	170
506	Male	71.25	166	106	168
545	Male	70.75	198	104	166
563	Male	70.50	156	95	165
567	Male	72.50	213	110	165
592	Male	70.50	223	90	164
599	Male	70.50	169	85	164
608	Male	70.25	172	94	164
626	Male	70.25	243	90	162

sample built-in dataset heart

The FREQ Procedure

The Freq Procedure

Table Height

One-Way Frequencies


Height	Frequency	Percent	Cumulative Frequency	Cumulative Percent
.	6	0.12	6	0.12
51.5	1	0.02	7	0.13
53.75	1	0.02	8	0.15
54.75	2	0.04	10	0.19
55	2	0.04	12	0.23
55.5	2	0.04	14	0.27
55.75	2	0.04	16	0.31
56	3	0.06	19	0.36
56.25	2	0.04	21	0.40
56.5	14	0.27	35	0.67
56.75	10	0.19	45	0.86
57	4	0.08	49	0.94
57.25	9	0.17	58	1.11
57.5	12	0.23	70	1.34
57.75	7	0.13	77	1.48
58	26	0.50	103	1.98
58.25	23	0.44	126	2.42
58.5	33	0.63	159	3.05
58.75	22	0.42	181	3.47
59	63	1.21	244	4.68
59.25	34	0.65	278	5.34
59.5	70	1.34	348	6.68
59.75	39	0.75	387	7.43
60	86	1.65	473	9.08
60.25	68	1.31	541	10.39
60.5	95	1.82	636	12.21
60.75	62	1.19	698	13.40
61	122	2.34	820	15.74
61.25	88	1.69	908	17.43
61.5	125	2.40	1033	19.83
61.75	88	1.69	1121	21.52
62	165	3.17	1286	24.69
62.25	129	2.48	1415	27.16
62.5	175	3.36	1590	30.52
62.75	102	1.96	1692	32.48
63	161	3.09	1853	35.57
63.25	109	2.09	1962	37.67
63.5	146	2.80	2108	40.47
63.75	102	1.96	2210	42.43
64	147	2.82	2357	45.25
64.25	111	2.13	2468	47.38
64.5	169	3.24	2637	50.62
64.75	86	1.65	2723	52.27
65	170	3.26	2893	55.54
65.25	119	2.28	3012	57.82
65.5	139	2.67	3151	60.49
65.75	113	2.17	3264	62.66
66	124	2.38	3388	65.04
66.25	109	2.09	3497	67.13
66.5	107	2.05	3604	69.19
66.75	66	1.27	3670	70.45
67	119	2.28	3789	72.74
67.25	103	1.98	3892	74.72
67.5	114	2.19	4006	76.91
67.75	85	1.63	4091	78.54
68	84	1.61	4175	80.15
68.25	102	1.96	4277	82.11
68.5	100	1.92	4377	84.03
68.75	73	1.40	4450	85.43
69	84	1.61	4534	87.04
69.25	77	1.48	4611	88.52
69.5	81	1.56	4692	90.07
69.75	47	0.90	4739	90.98
70	68	1.31	4807	92.28
70.25	38	0.73	4845	93.01
70.5	62	1.19	4907	94.20
70.75	24	0.46	4931	94.66
71	55	1.06	4986	95.72
71.25	30	0.58	5016	96.29
71.5	29	0.56	5045	96.85
71.75	14	0.27	5059	97.12
72	34	0.65	5093	97.77
72.25	22	0.42	5115	98.20
72.5	22	0.42	5137	98.62
72.75	12	0.23	5149	98.85
73	17	0.33	5166	99.17
73.25	12	0.23	5178	99.40
73.5	7	0.13	5185	99.54
73.75	2	0.04	5187	99.58
74	4	0.08	5191	99.65
74.25	2	0.04	5193	99.69
74.5	6	0.12	5199	99.81
74.75	2	0.04	5201	99.85
75	1	0.02	5202	99.87
75.25	2	0.04	5204	99.90
75.5	2	0.04	5206	99.94
76	2	0.04	5208	99.98
76.5	1	0.02	5209	100.00

sample imported dataset grades

The CONTENTS Procedure

The Contents Procedure

WORK.GRADES_XLSX

Attributes


Data Set Name	WORK.GRADES_XLSX	Observations	3
Member Type	DATA	Variables	8
Engine	V9	Indexes	0
Created	07/13/2022 10:51:34	Observation Length	72
Last Modified	07/13/2022 10:51:34	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	1816
Obs in First Data Page	3
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/grades_xlsx.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156762
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Format	Informat	Label
1	Name	Char	10	$10.	$10.	Name
2	ID	Num	8	BEST.		ID
3	Quiz1	Num	8	BEST.		Quiz1
4	Quiz2	Num	8	BEST.		Quiz2
5	Midterm	Num	8	BEST.		Midterm
6	Quiz3	Num	8	BEST.		Quiz3
7	Quiz4	Num	8	BEST.		Quiz4
8	Final	Num	8	BEST.		Final

sample imported dataset grades

The Print Procedure

Data Set WORK.GRADES_XLSX


Obs	Name	ID	Quiz1	Quiz2	Midterm	Quiz3	Quiz4	Final
1	Jones	12345	88	80	76	88	90	82
2	Hildebrand	22222	95	92	91	94	90	96
3	O'Brien	33333	76	78	79	81	83	80

sample imported dataset grades2

The CONTENTS Procedure

The Contents Procedure

WORK.GRADES2_XLSX

Attributes


Data Set Name	WORK.GRADES2_XLSX	Observations	3
Member Type	DATA	Variables	8
Engine	V9	Indexes	0
Created	07/13/2022 10:51:34	Observation Length	72
Last Modified	07/13/2022 10:51:34	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	1816
Obs in First Data Page	3
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/grades2_xlsx.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156762
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Format	Informat	Label
1	Stuent_Name	Char	10	$10.	$10.	Stuent Name
2	ID	Num	8	BEST.		ID
3	Quiz_1	Num	8	BEST.		Quiz 1
4	Quiz_2	Num	8	BEST.		Quiz 2
5	Mid_Term	Num	8	BEST.		Mid Term
6	Quiz_3	Num	8	BEST.		Quiz 3
7	Quiz_4	Num	8	BEST.		Quiz 4
8	_2015Final	Num	8	BEST.		2015Final

sample imported dataset grades2

The Print Procedure

Data Set WORK.GRADES2_XLSX


Obs	Stuent_Name	ID	Quiz_1	Quiz_2	Mid_Term	Quiz_3	Quiz_4	_2015Final
1	Jones	12345	88	80	76	88	90	82
2	Hildebrand	22222	95	92	91	94	90	96
3	O'Brien	33333	76	78	79	81	83	80

sample imported dataset grades2 rename

The Print Procedure

Data Set WORK.GRADES2_XLSX


Obs	Student_Name	ID	Quiz_1	Quiz_2	Mid_Term	Quiz_3	Quiz_4	Final_2015
1	Jones	12345	88	80	76	88	90	82
2	Hildebrand	22222	95	92	91	94	90	96
3	O'Brien	33333	76	78	79	81	83	80

sample imported dataset grades csv

The CONTENTS Procedure

The Contents Procedure

WORK.GRADES_CSV

Attributes


Data Set Name	WORK.GRADES_CSV	Observations	3
Member Type	DATA	Variables	8
Engine	V9	Indexes	0
Created	07/13/2022 10:51:34	Observation Length	88
Last Modified	07/13/2022 10:51:34	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	1486
Obs in First Data Page	3
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/grades_csv.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156770
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Format	Informat
1	Name	Char	31	$31.	$31.
2	ID	Num	8	BEST12.	BEST32.
3	Quiz1	Num	8	BEST12.	BEST32.
4	Quiz2	Num	8	BEST12.	BEST32.
5	Midterm	Num	8	BEST12.	BEST32.
6	Quiz3	Num	8	BEST12.	BEST32.
7	Quiz4	Num	8	BEST12.	BEST32.
8	Final	Num	8	BEST12.	BEST32.

sample imported dataset grades csv

The Print Procedure

Data Set WORK.GRADES_CSV


Obs	Name	ID	Quiz1	Quiz2	Midterm	Quiz3	Quiz4	Final
1	Jones	12345	88	80	76	88	90	82
2	Hildebrand	22222	95	92	91	94	90	96
3	O'Brien	33333	76	78	79	81	83	80

sample imported dataset healthlist txt proc import terrible

The CONTENTS Procedure

The Contents Procedure

WORK.HEALTHLIST_TXT

Attributes


Data Set Name	WORK.HEALTHLIST_TXT	Observations	6
Member Type	DATA	Variables	11
Engine	V9	Indexes	0
Created	07/13/2022 10:51:34	Observation Length	72
Last Modified	07/13/2022 10:51:34	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	1816
Obs in First Data Page	6
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/healthlist_txt.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156770
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Format	Informat
1	VAR1	Num	8	BEST12.	BEST32.
2	VAR2	Char	1	$1.	$1.
3	VAR3	Char	2	$2.	$2.
4	VAR4	Char	2	$2.	$2.
5	VAR5	Num	8	BEST12.	BEST32.
6	VAR6	Num	8	BEST12.	BEST32.
7	VAR7	Num	8	BEST12.	BEST32.
8	VAR8	Num	8	BEST12.	BEST32.
9	VAR9	Num	8	BEST12.	BEST32.
10	VAR10	Num	8	BEST12.	BEST32.
11	VAR11	Num	8	BEST12.	BEST32.

sample imported dataset healthlist txt proc import terrible

The Print Procedure

Data Set WORK.HEALTHLIST_TXT


Obs	VAR1	VAR2	VAR3	VAR4	VAR5	VAR6	VAR7	VAR8	VAR9	VAR10	VAR11
1	1	M	23	68	120	90	128	.	.	.	.
2	2		F	55	.	72	.	.	180	90	170
3	3	F	18	58	118	72	122	.	.	.	.
4	4	M		80	82	.	.	220	.	.	.
5	5	F	34	62	128	80	.	.	.	.	.
6	6			F	.	38	78	108	68	220	.

sample imported dataset healthlist txt list input is better

The CONTENTS Procedure

The Contents Procedure

WORK.HEALTHLIST_TXT

Attributes


Data Set Name	WORK.HEALTHLIST_TXT	Observations	6
Member Type	DATA	Variables	7
Engine	V9	Indexes	0
Created	07/13/2022 10:51:34	Observation Length	56
Last Modified	07/13/2022 10:51:34	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	2334
Obs in First Data Page	6
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/healthlist_txt.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156762
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Label
1	Subj	Num	8	Subject Number
2	Gender	Char	8	Gender (M or F)
3	Age	Num	8	Age in years
4	Heart_Rate	Num	8	Heart rate
5	SBP	Num	8	Systolic blood pressure
6	DBP	Num	8	Diastolic blood pressure
7	Chol	Num	8	Total cholesterol

sample imported dataset healthlist txt list input is better

The Print Procedure

Data Set WORK.HEALTHLIST_TXT


Obs	Subj	Gender	Age	Heart_Rate	SBP	DBP	Chol
1	1	M	23	68	120	90	128
2	2	F	55	72	180	90	170
3	3	F	18	58	118	72	122
4	4	M	80	82	.	.	220
5	5	F	34	62	128	80	.
6	6	F	38	78	108	68	220

sample import using list input treat mult delim as one

The CONTENTS Procedure

The Contents Procedure

WORK.HEALTHLIST_TXT

Attributes


Data Set Name	WORK.HEALTHLIST_TXT	Observations	6
Member Type	DATA	Variables	7
Engine	V9	Indexes	0
Created	07/13/2022 10:51:34	Observation Length	56
Last Modified	07/13/2022 10:51:34	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	2334
Obs in First Data Page	6
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/healthlist_txt.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156762
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Label
1	Gender	Char	1	Gender (M or F)
2	Subj	Num	8	Subject Number
3	Age	Num	8	Age in years
4	Heart_Rate	Num	8	Heart rate
5	SBP	Num	8	Systolic blood pressure
6	DBP	Num	8	Diastolic blood pressure
7	Chol	Num	8	Total cholesterol

sample import using list input treat mult delim as one

The Print Procedure

Data Set WORK.HEALTHLIST_TXT


Obs	Gender	Subj	Age	Heart_Rate	SBP	DBP	Chol
1	M	1	23	68	120	90	128
2	F	2	55	72	180	90	170
3	F	3	18	58	118	72	122
4	M	4	80	82	.	.	220
5	F	5	34	62	128	80	.
6	F	6	38	78	108	68	220

sample imported dataset health csv using list input

The CONTENTS Procedure

The Contents Procedure

WORK.HEALTH_CSV

Attributes


Data Set Name	WORK.HEALTH_CSV	Observations	6
Member Type	DATA	Variables	7
Engine	V9	Indexes	0
Created	07/13/2022 10:51:34	Observation Length	56
Last Modified	07/13/2022 10:51:34	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	2334
Obs in First Data Page	6
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/health_csv.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156762
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Label
1	Subj	Num	8	Subject Number
2	Gender	Char	8	Gender (M or F)
3	Age	Num	8	Age in years
4	Heart_Rate	Num	8	Heart rate
5	SBP	Num	8	Systolic blood pressure
6	DBP	Num	8	Diastolic blood pressure
7	Chol	Num	8	Total cholesterol

sample imported dataset health csv using list input

The Print Procedure

Data Set WORK.HEALTH_CSV


Obs	Subj	Gender	Age	Heart_Rate	SBP	DBP	Chol
1	1	M	23	68	120	90	128
2	2	F	55	72	180	90	170
3	3	F	18	58	118	72	122
4	4	M	80	82	.	.	220
5	5	F	34	62	128	80	.
6	6	F	38	78	108	68	220

sample imported dataset health txt fixed columns

The CONTENTS Procedure

The Contents Procedure

WORK.HEALTH_TXT

Attributes


Data Set Name	WORK.HEALTH_TXT	Observations	6
Member Type	DATA	Variables	7
Engine	V9	Indexes	0
Created	07/13/2022 10:51:34	Observation Length	56
Last Modified	07/13/2022 10:51:34	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	2334
Obs in First Data Page	6
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/health_txt.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156762
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Label
1	Subj	Num	8	Subject Number
2	Gender	Char	1	Gender (M or F)
3	Age	Num	8	Age in years
4	Heart_Rate	Num	8	Heart rate
5	SBP	Num	8	Systolic blood pressure
6	DBP	Num	8	Diastolic blood pressure
7	Chol	Num	8	Total cholesterol

sample imported dataset health txt fixed columns

The Print Procedure

Data Set WORK.HEALTH_TXT


Obs	Subj	Gender	Age	Heart_Rate	SBP	DBP	Chol
1	1	M	23	68	120	90	28
2	2	F	55	72	180	90	70
3	3	F	18	58	118	72	22
4	4	M	80	82	.	.	20
5	5	F	34	62	128	80	.
6	6	F	38	78	108	68	20

sample import blood pressure tab delim text file

The CONTENTS Procedure

The Contents Procedure

WORK.BPRESSURE_TXT

Attributes


Data Set Name	WORK.BPRESSURE_TXT	Observations	60
Member Type	DATA	Variables	5
Engine	V9	Indexes	0
Created	07/13/2022 10:51:34	Observation Length	32
Last Modified	07/13/2022 10:51:34	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	4078
Obs in First Data Page	60
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/bpressure_txt.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156762
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Label
1	Drug	Char	7	Drug
2	Gender	Char	1	Gender (M or F)
3	Subj	Num	8	Subject Number
4	SBP	Num	8	Systolic blood pressure
5	DBP	Num	8	Diastolic blood pressure

sample import blood pressure tab delim text file

The Print Procedure

Data Set WORK.BPRESSURE_TXT


Obs	Drug	Gender	Subj	SBP	DBP
1	Placebo	F	1	138	86
2	Placebo	M	2	124	82
3	Placebo	F	3	150	72
4	Placebo		4	136	84
5	Placebo	F	5	.	.
6	Placebo	M	6	132	84
7	Placebo	F	7	130	84
8	Placebo	F	8	146	88
9	Placebo	F	9	134	82
10	Placebo	M	10	138	88
11	Placebo	F	11	144	84
12	Placebo	F	12	130	88
13	Placebo	M	13	134	80
14	Placebo	M	14	132	90
15	Placebo	F	15	.	.
16	Placebo	M	16	124	88
17	Placebo	F	17	140	78
18	Placebo		18	156	86
19	Placebo	F	19	120	80
20	Placebo	M	20	142	90
21	Drug A	F	21	126	76
22	Drug A	M	22	134	86
23	Drug A	F	23	118	78
24	Drug A	F	24	132	80
25	Drug A	F	25	.	.

Blood Pressure Distribution Practice 5-3

The CONTENTS Procedure

The Contents Procedure

WORK.BLOOD_PRESSURE

Attributes


Data Set Name	WORK.BLOOD_PRESSURE	Observations	60
Member Type	DATA	Variables	5
Engine	V9	Indexes	0
Created	07/13/2022 10:51:35	Observation Length	32
Last Modified	07/13/2022 10:51:35	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	4078
Obs in First Data Page	60
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/blood_pressure.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156762
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Format	Informat	Label
1	Drug	Char	7	$7.	$7.	Drug
2	Subj	Num	8	BEST.		Subj
3	Gender	Char	1	$1.	$1.	Gender
4	SBP	Num	8	BEST.		SBP
5	DBP	Num	8	BEST.		DBP

Blood Pressure Distribution Practice 5-3

The Print Procedure

Data Set WORK.BLOOD_PRESSURE


Obs	Drug	Subj	Gender	SBP	DBP
1	Placebo	1	F	138	86
2	Placebo	2	M	124	82
3	Placebo	3	F	150	72
4	Placebo	4		136	84
5	Placebo	5	F	.	.
6	Placebo	6	M	132	84
7	Placebo	7	F	130	84
8	Placebo	8	F	146	88
9	Placebo	9	F	134	82
10	Placebo	10	M	138	88
11	Placebo	11	F	144	84
12	Placebo	12	F	130	88
13	Placebo	13	M	134	80
14	Placebo	14	M	132	90
15	Placebo	15	F	.	.
16	Placebo	16	M	124	88
17	Placebo	17	F	140	78
18	Placebo	18		156	86
19	Placebo	19	F	120	80
20	Placebo	20	M	142	90
21	Drug A	21	F	126	76
22	Drug A	22	M	134	86
23	Drug A	23	F	118	78
24	Drug A	24	F	132	80
25	Drug A	25	F	.	.

Importing Perch data

The CONTENTS Procedure

The Contents Procedure

WORK.PERCH

Attributes


Data Set Name	WORK.PERCH	Observations	56
Member Type	DATA	Variables	3
Engine	V9	Indexes	0
Created	07/13/2022 10:51:35	Observation Length	24
Last Modified	07/13/2022 10:51:35	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	5431
Obs in First Data Page	56
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/perch.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156770
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Format	Label
1	Weight	Num	8	BEST.	Weight
2	Height	Num	8	BEST.	Height
3	Width	Num	8	BEST.	Width

Importing Perch data

The Print Procedure

Data Set WORK.PERCH


Obs	Weight	Height	Width
1	5.9	2.112	1.408
2	32	3.528	1.9992
3	40	3.824	2.432
4	51.5	4.5924	2.6316
5	70	4.588	2.9415
6	100	5.2224	3.3216
7	78	5.1992	3.1234
8	80	5.6358	3.0502
9	85	5.1376	3.0368
10	85	5.082	2.772
11	110	5.6925	3.555
12	115	5.9175	3.3075
13	125	5.6925	3.6675
14	130	6.384	3.534
15	120	6.11	3.4075
16	120	5.64	3.525
17	130	6.11	3.525
18	135	5.875	3.525
19	110	5.5225	3.995
20	130	5.856	3.624
21	150	6.792	3.624
22	145	5.9532	3.63
23	150	5.2185	3.626
24	170	6.275	3.725
25	225	7.293	3.723

Importing Yoga data

The CONTENTS Procedure

The Contents Procedure

WORK.YOGA

Attributes


Data Set Name	WORK.YOGA	Observations	9
Member Type	DATA	Variables	3
Engine	V9	Indexes	0
Created	07/13/2022 10:51:35	Observation Length	24
Last Modified	07/13/2022 10:51:35	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	5431
Obs in First Data Page	9
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/yoga.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156771
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Format	Label
1	Subj	Num	8	BEST.	Subj
2	Before	Num	8	BEST.	Before
3	After	Num	8	BEST.	After

Importing Yoga data

The Print Procedure

Data Set WORK.YOGA


Obs	Subj	Before	After
1	1	78	74
2	2	68	68
3	3	76	70
4	4	58	57
5	5	83	73
6	6	80	77
7	7	69	61
8	8	77	76
9	9	77	72

Practice: One-way ANOVA with test for Tukey multiple comparisons, CHF

The Print Procedure

Data Set WORK.CONGESTIVE_HEART_FAILURE


Obs	Group	Subj	LVEF
1	Placebo	1	55
2	Placebo	2	58
3	Placebo	3	62
4	Placebo	4	48
5	Placebo	5	57
6	Placebo	6	57
7	Placebo	7	80
8	Placebo	8	40
9	Placebo	9	55
10	Placebo	10	52
11	Calcium	1	57
12	Calcium	2	65
13	Calcium	3	55
14	Calcium	4	78
15	Calcium	5	57
16	Calcium	6	84
17	Calcium	7	72
18	Calcium	8	80
19	Calcium	9	78
20	Calcium	10	81
21	Lasix	1	60
22	Lasix	2	60
23	Lasix	3	65
24	Lasix	4	67
25	Lasix	5	48
26	Lasix	6	62
27	Lasix	7	64
28	Lasix	8	70
29	Lasix	9	57
30	Lasix	10	40

Chapter 9: create 25 pct sample w/o replacement for Two-Way Anova

The SURVEYSELECT Procedure

The Surveyselect Procedure

Sample Selection Method


Selection Method	Simple Random Sampling

Sample Selection Summary


Input Data Set	BWEIGHT
Random Number Seed	13579
Sampling Rate	0.25
Sample Size	12500
Selection Probability	0.25
Sampling Weight	4
Output Data Set	BIRTH_WT_SAMPLE

Importing Exercise data

The CONTENTS Procedure

The Contents Procedure

WORK.EXERCISE

Attributes


Data Set Name	WORK.EXERCISE	Observations	50
Member Type	DATA	Variables	6
Engine	V9	Indexes	0
Created	07/13/2022 10:51:35	Observation Length	48
Last Modified	07/13/2022 10:51:35	Deleted Observations	0
Protection		Compressed	NO
Data Set Type		Sorted	NO
Label
Data Representation	SOLARIS_X86_64, LINUX_X86_64, ALPHA_TRU64, LINUX_IA64
Encoding	utf-8 Unicode (UTF-8)

Engine/Host Information


Engine/Host Dependent Information
Data Set Page Size	131072
Number of Data Set Pages	1
First Data Page	1
Max Obs per Page	2722
Obs in First Data Page	50
Number of Data Set Repairs	0
Filename	/saswork/SAS_work03A00000A0C4_odaws03-usw2.oda.sas.com/SAS_work0E560000A0C4_odaws03-usw2.oda.sas.com/exercise.sas7bdat
Release Created	9.0401M6
Host Created	Linux
Inode Number	2156778
Access Permission	rw-r--r--
Owner Name	u47386750
File Size	256KB
File Size (bytes)	262144

Varnum


Variables in Creation Order
#	Variable	Type	Len	Format	Label
1	Subj	Num	8	BEST12.	Subj
2	Age	Num	8	BEST12.	Age
3	Pushups	Num	8	BEST12.	Pushups
4	Rest_Pulse	Num	8	BEST12.	Rest_Pulse
5	Max_Pulse	Num	8	BEST12.	Max_Pulse
6	Run_Pulse	Num	8	BEST12.	Run_Pulse

Importing Exercise data

The Print Procedure

Data Set WORK.EXERCISE


Obs	Subj	Age	Pushups	Rest_Pulse	Max_Pulse	Run_Pulse
1	1	68	19	75	124	121
2	2	64	36	61	107	110
3	3	76	11	74	115	105
4	4	44	35	59	111	108
5	5	55	24	76	115	110
6	6	57	14	74	121	118
7	7	64	21	69	106	103
8	8	30	48	60	114	110
9	9	35	25	55	107	107
10	10	49	9	88	137	134
11	11	63	51	55	102	103
12	12	63	30	73	126	125
13	13	19	34	65	118	110
14	14	51	23	66	114	110
15	15	54	7	59	113	111
16	16	48	43	77	119	117
17	17	49	11	68	118	113
18	18	43	29	78	122	120
19	19	31	50	55	100	101
20	20	81	16	86	127	119
21	21	26	52	49	100	95
22	22	84	11	75	114	108
23	23	73	12	77	119	116
24	24	36	25	66	117	117
25	25	73	8	67	112	111

Logistic Regression data prep Birth_Wt_Sample to High_Low

The MEANS Procedure

The Means Procedure

Summary statistics


Variable	Label	N	Mean	Std Dev	Minimum	Maximum
Weight	Infant Birth Weight	12500	3373.25	561.7217975	322.0000000	5970.00
Black	Black Mother	12500	0.1578400	0.3646055	0	1.0000000
Married	Married Mother	12500	0.7095200	0.4540020	0	1.0000000
Boy	Baby Boy	12500	0.5106400	0.4999068	0	1.0000000
MomAge	Mother's Age	12500	0.3813600	5.7031611	-9.0000000	18.0000000
MomSmoke	Smoking Mother	12500	0.1319200	0.3384173	0	1.0000000
CigsPerDay	Cigarettes Per Day	12500	1.4786400	4.6137064	0	40.0000000
MomWtGain	Mother's Pregnancy Weight Gain	12500	0.6145600	12.8152321	-30.0000000	68.0000000
Visit	Prenatal Visit	12500	2.6988000	0.7155135	0	3.0000000
MomEdLevel	Mother's Education Level	12500	1.2204000	1.0928492	0	3.0000000
Wt_Group		12500	0.4940000	0.4999840	0	1.0000000

Logistic Regression data prep Birth_Wt_Sample to High_Low

The Print Procedure

Data Set WORK.HIGH_LOW


Obs	Weight	Black	Married	Boy	MomAge	MomSmoke	CigsPerDay	MomWtGain	Visit	MomEdLevel	Wt_Group
1	3430	1	1	1	-4	0	0	-6	3	0	0
2	3657	0	1	0	6	0	0	15	3	0	0
3	4054	0	1	1	-5	0	0	21	3	2	0
4	4536	1	0	1	3	0	0	-1	3	3	0
5	3295	0	1	1	6	0	0	-29	3	2	1
6	3458	0	1	1	8	0	0	-18	3	1	0
7	3714	0	0	1	2	0	0	25	1	1	0
8	2807	0	1	0	-5	1	1	13	3	0	1
9	3625	0	1	0	2	0	0	0	3	0	0
10	3884	0	1	1	-2	0	0	-10	3	0	0

Import generated dataset Heart_Attack

The Print Procedure

Data Set WORK.HEART_ATTACK


Obs	Gender	Age	Age_Group	Chol	High_Chol	Heart_Attack
1	F	63	2	211	1	0
2	M	69	2	249	1	0
3	F	69	2	139	0	0
4	M	59	1	239	1	0
5	F	71	3	195	0	0
6	M	50	1	193	0	0
7	F	57	1	179	0	0
8	M	75	3	186	0	1
9	F	60	2	164	0	0
10	M	52	1	237	1	1

Import generated dataset Heart_Attack

The FREQ Procedure

The Freq Procedure

Table Gender

One-Way Frequencies


Gender
Gender	Frequency	Percent	Cumulative Frequency	Cumulative Percent
F	250	50.00	250	50.00
M	250	50.00	500	100.00

Table Age_Group

One-Way Frequencies


Age_Group
Age_Group	Frequency	Percent	Cumulative Frequency	Cumulative Percent
1	146	29.20	146	29.20
2	180	36.00	326	65.20
3	174	34.80	500	100.00

Table High_Chol

One-Way Frequencies


High_Chol
High_Chol	Frequency	Percent	Cumulative Frequency	Cumulative Percent
0	245	49.00	245	49.00
1	255	51.00	500	100.00

Table Heart_Attack

One-Way Frequencies


Heart_Attack
Heart_Attack	Frequency	Percent	Cumulative Frequency	Cumulative Percent
0	442	88.40	442	88.40
1	58	11.60	500	100.00

Sample Observations from Formatted Heart_Attack

The Print Procedure

Data Set WORK.HEART_ATTACK


Obs	Gender	Age	Age_Group	Chol	High_Chol	Heart_Attack
1	Female	63	60-70	211	Yes	No
2	Male	69	60-70	249	Yes	No
3	Female	69	60-70	139	No	No
4	Male	59	< 60	239	Yes	No
5	Female	71	71+	195	No	No
6	Male	50	< 60	193	No	No
7	Female	57	< 60	179	No	No
8	Male	75	71+	186	No	Yes
9	Female	60	60-70	164	No	No
10	Male	52	< 60	237	Yes	Yes

Sample Observations from Formatted Heart_Attack

The FREQ Procedure

The Freq Procedure

Table Gender

One-Way Frequencies


Gender
Gender	Frequency	Percent	Cumulative Frequency	Cumulative Percent
Female	250	50.00	250	50.00
Male	250	50.00	500	100.00

Table Age_Group

One-Way Frequencies


Age_Group
Age_Group	Frequency	Percent	Cumulative Frequency	Cumulative Percent
< 60	146	29.20	146	29.20
60-70	180	36.00	326	65.20
71+	174	34.80	500	100.00

Table High_Chol

One-Way Frequencies


High_Chol
High_Chol	Frequency	Percent	Cumulative Frequency	Cumulative Percent
No	245	49.00	245	49.00
Yes	255	51.00	500	100.00

Table Heart_Attack

One-Way Frequencies


Heart_Attack
Heart_Attack	Frequency	Percent	Cumulative Frequency	Cumulative Percent
No	442	88.40	442	88.40
Yes	58	11.60	500	100.00

Sample Observations from Reformatted Heart_Attack

The Print Procedure

Data Set WORK.HEART_ATTACK


Obs	Gender	Age	Age_Group	Chol	High_Chol	Heart_Attack
1	2:Female	63	60-70	211	1:Yes	2:No
2	1:Male	69	60-70	249	1:Yes	2:No
3	2:Female	69	60-70	139	2:No	2:No
4	1:Male	59	< 60	239	1:Yes	2:No
5	2:Female	71	71+	195	2:No	2:No
6	1:Male	50	< 60	193	2:No	2:No
7	2:Female	57	< 60	179	2:No	2:No
8	1:Male	75	71+	186	2:No	1:Yes
9	2:Female	60	60-70	164	2:No	2:No
10	1:Male	52	< 60	237	1:Yes	1:Yes

Sample Observations from Reformatted Heart_Attack

The FREQ Procedure

The Freq Procedure

Table Gender

One-Way Frequencies


Gender
Gender	Frequency	Percent	Cumulative Frequency	Cumulative Percent
1:Male	250	50.00	250	50.00
2:Female	250	50.00	500	100.00

Table Age_Group

One-Way Frequencies


Age_Group
Age_Group	Frequency	Percent	Cumulative Frequency	Cumulative Percent
60-70	180	36.00	180	36.00
71+	174	34.80	354	70.80
< 60	146	29.20	500	100.00

Table High_Chol

One-Way Frequencies


High_Chol
High_Chol	Frequency	Percent	Cumulative Frequency	Cumulative Percent
1:Yes	255	51.00	255	51.00
2:No	245	49.00	500	100.00

Table Heart_Attack

One-Way Frequencies


Heart_Attack
Heart_Attack	Frequency	Percent	Cumulative Frequency	Cumulative Percent
1:Yes	58	11.60	58	11.60
2:No	442	88.40	500	100.00

Chapter 5: Descriptive Statistics - Univariate Analysis

Descriptive Statistics for Continuous Variables.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/*** Chapter 5: Descriptive Statistics - Univariate Analysis  ***/

/* Descriptive Statistics for Continuous Variables - generated code - start */
/* Output setup: suppress the "The XXX Procedure" title line and turn on
   image maps for ODS graphics. */
ODS NOPROCTITLE;
ODS GRAPHICS / IMAGEMAP=ON;

/* Summary statistics for the four continuous measures in SASHELP.HEART.
   The statistic keywords are kept in their original order, which matches the
   column order of the resulting table. */
TITLE1 "Descriptive Statistics for Continuous Variables";
PROC MEANS DATA=sashelp.heart
           CHARTYPE
           MEAN STD MIN MAX MEDIAN N NMISS
           VARDEF=DF
           CLM ALPHA=0.05
           QMETHOD=OS;
    VAR Height Weight Diastolic Systolic;
RUN;

/* Graph template "histobox": a two-row lattice with a histogram in the top
   cell (row weight 0.75) and a horizontal box plot in the bottom cell
   (row weight 0.25).
   Dynamic variables supplied at render time:
     AVAR      - analysis variable to plot
     ByVarInfo - text appended to the "Distribution of ..." title */
PROC TEMPLATE;
    DEFINE STATGRAPH histobox;
        DYNAMIC AVAR ByVarInfo;
        BEGINGRAPH;
            ENTRYTITLE "Distribution of " AVAR ByVarInfo;

            /* Outer container: two stacked cells with no gap between them */
            LAYOUT LATTICE / ROWS=2
                             COLUMNDATARANGE=UNION
                             ROWGUTTER=0
                             ROWWEIGHTS=(0.75 0.25);

                /* Top cell: histogram, X axis hidden */
                LAYOUT OVERLAY / YAXISOPTS=(OFFSETMAX=0.1)
                                 XAXISOPTS=(DISPLAY=NONE);
                    HISTOGRAM AVAR /;
                ENDLAYOUT;

                /* Bottom cell: horizontal box plot of the same variable */
                LAYOUT OVERLAY /;
                    BOXPLOT Y=AVAR / ORIENT=HORIZONTAL;
                ENDLAYOUT;

            ENDLAYOUT;
        ENDGRAPH;
    END;
RUN;

/* Macro to subset data and create a histobox for every by group.
   Parameters (all keyword):
     data=        data set to subset and plot
     level=       data set holding one row per BY-group combination
     num_level=   number of rows of &level to process
     byVars=      list of BY variable names (split with %scan, default delimiters)
     num_byvars=  number of names in &byVars
     avar=        analysis variable handed to the histobox template
   Side effect: (re)creates WORK.tempData on every iteration; the macro itself
   never deletes it. */
%macro byGroupHistobox(data=, level=, num_level=, byVars=, num_byvars=, avar=);
    /* Declare every working variable local so that a call cannot overwrite
       same-named macro variables in the caller's or the global scope. */
    %local i j dsid varnum rc whereClause groupInfo;

    /* Split the BY variable list into varName1 ... varName<num_byvars> */
    %do j=1 %to &num_byvars;
        %local varName&j;
        %let varName&j=%scan(%str(&byVars), &j);
    %end;

    %do i=1 %to &num_level;

        /* Get group variable values: read row &i of &level and copy each BY
           value into the local macro variables x1 ... x<num_byvars> */
        data _null_;
            i=&i;
            set &level point=i;

            %do j=1 %to &num_byvars;
                call symputx("x&j", strip(&&varName&j), 'l');
            %end;
            stop;
        run;

        /* Build proc sql where clause */
        %let dsid=%sysfunc(open(&data));

        /* open() returns 0 on failure; stop here instead of running
           varnum/vartype against an invalid data set identifier */
        %if &dsid = 0 %then %do;
            %put ERROR: byGroupHistobox could not open data set &data..;
            %return;
        %end;

        %let whereClause=;

        /* All BY variables except the last are followed by " and ";
           character values are quoted, numeric values are not */
        %do j=1 %to %eval(&num_byvars-1);
            %let varnum=%sysfunc(varnum(&dsid, &&varName&j));

            %if(%sysfunc(vartype(&dsid, &varnum))=C) %then
                %let whereClause=&whereClause.&&varName&j.="&&x&j"%str( and );
            %else
                %let whereClause=&whereClause.&&varName&j.=&&x&j.%str( and );
        %end;
        %let varnum=%sysfunc(varnum(&dsid, &&varName&num_byvars));

        %if(%sysfunc(vartype(&dsid, &varnum))=C) %then
            %let whereClause=&whereClause.&&varName&num_byvars.="&&x&num_byvars";
        %else
            %let whereClause=&whereClause.&&varName&num_byvars.=&&x&num_byvars;
        %let rc=%sysfunc(close(&dsid));

        /* Subset the data set */
        proc sql noprint;
            create table WORK.tempData as select * from &data
            where &whereClause;
        quit;

        /* Build plot group info: "var1=value1 var2=value2 ..." for the title */
        %let groupInfo=;

        %do j=1 %to %eval(&num_byvars-1);
            %let groupInfo=&groupInfo.&&varName&j.=&&x&j%str( );
        %end;
        %let groupInfo=&groupInfo.&&varName&num_byvars.=&&x&num_byvars;

        /* Create histogram/boxplot combo plot */
        proc sgrender data=WORK.tempData template=histobox;
            dynamic AVAR="&avar" ByVarInfo=" (&groupInfo)";
        run;

    %end;
%mend;

/* Render the combined histogram/box plot once per analysis variable on the
   full data set.  No BY groups are used here, so ByVarInfo is blank and the
   byGroupHistobox macro is not invoked. */
proc sgrender data=sashelp.heart template=histobox;
    dynamic AVAR="Height" ByVarInfo="";
run;

proc sgrender data=sashelp.heart template=histobox;
    dynamic AVAR="Weight" ByVarInfo="";
run;

proc sgrender data=sashelp.heart template=histobox;
    dynamic AVAR="Diastolic" ByVarInfo="";
run;

proc sgrender data=sashelp.heart template=histobox;
    dynamic AVAR="Systolic" ByVarInfo="";
run;

/* Remove the scratch table that byGroupHistobox would create.  It does not
   exist when the macro was never called, so NOWARN suppresses the not-found
   message; QUIT ends the interactive procedure explicitly. */
proc datasets library=WORK noprint nowarn;
    delete tempData;
    run;
quit;

/* Clear the title so it does not carry over to later output */
title1;
/* Descriptive Statistics for Continuous Variables - generated code - end */

/* The histogram and box plot for Systolic is positively skewed, seen by the long tail on the right side of the histogram and by the outliers (circles) on the box plot.  The mean (diamond on the box plot) is to the right of the median (vertical line in the box), indicating that the data values are positively skewed.  Let's further investigate the distribution of systolic blood pressure. */

SAS Output

Results: biostats_by_ex_SAScode.sas

Descriptive Statistics for Continuous Variables

The Means Procedure

Summary statistics


Variable	Mean	Std Dev	Minimum	Maximum	Median	N	N Miss	Lower 95% CL for Mean	Upper 95% CL for Mean
Height	64.8131847	3.5827074	51.5000000	76.5000000	64.5000000	5203	6	64.7158128	64.9105566
Weight	153.0866808	28.9154261	67.0000000	300.0000000	150.0000000	5203	6	152.3008087	153.8725528
Diastolic	85.3586101	12.9730913	50.0000000	160.0000000	84.0000000	5209	0	85.0062268	85.7109934
Systolic	136.9095796	23.7395964	82.0000000	300.0000000	132.0000000	5209	0	136.2647496	137.5544095

Descriptive Statistics for Continuous Variables

The SGRender Procedure

Descriptive Statistics for Continuous Variables

The SGRender Procedure

Descriptive Statistics for Continuous Variables

The SGRender Procedure

Descriptive Statistics for Continuous Variables

The SGRender Procedure

The histogram and box plot for Systolic is positively skewed, seen by the long tail on the right side of the histogram and by the outliers (circles) on the box plot. The mean (diamond on the box plot) is to the right of the median (vertical line in the box), indicating that the data values are positively skewed. Let’s further investigate the distribution of systolic blood pressure.

Investigating the Distribution for Systolic Blood Pressure.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Investigating the Distribution for Systolic Blood Pressure - start */
ODS NOPROCTITLE;
ODS GRAPHICS / IMAGEMAP=ON;

TITLE1 "Investigating the Distribution for Systolic Blood Pressure";
PROC UNIVARIATE DATA=sashelp.heart;
    /* Keep only the Q-Q plot from this procedure's output */
    ODS SELECT QQPlot;
    VAR Systolic;

    /* Normality check: the reference line uses the estimated mean and
       standard deviation of Systolic */
    QQPLOT Systolic / NORMAL(MU=EST SIGMA=EST);

    /* Show skewness, kurtosis and N in the north-west corner of the plot */
    INSET SKEWNESS KURTOSIS N / POSITION=NW;
RUN;

/* Clear the title so it does not carry over to later output */
TITLE1;
/* Investigating the Distribution for Systolic Blood Pressure - end */

/* The straight line represents a normal distribution with the same mean and standard deviation as the variable Systolic.  Values for Skewness and Kurtosis close to zero result from distributions that are close to normal.  The theoretical normal distribution has a mean (Mu) of 136.91 and a standard deviation (Sigma) of 23.74.  The circles on the plot represent values of systolic blood pressure from the sample data.  The right side of this Q-Q plot shows circles above the straight line, indicating that the sample data includes values of systolic blood pressure that are higher than expected if the values were normally distributed.  This confirms the strong positive skewness seen in the prior histogram.   Positive values for skewness indicate a positively skewed distribution (right tail).  Positive values for kurtosis indicate both that the distribution is too peaked (leptokurtic) and that the tails are too heavy.  (In contrast, negative values for kurtosis indicate that the distribution is too flat (platykurtic) and that the tails are too light. Modern interpretation of kurtosis puts emphasis on the tails being too heavy or too light and deemphasizes the concepts of the distribution being too peaked or too flat.) */

SAS Output

Results: biostats_by_ex_SAScode.sas

Investigating the Distribution for Systolic Blood Pressure

The Univariate Procedure

Systolic

Q-Q Plot 1

Panel 1

The straight line represents a normal distribution with the same mean and standard deviation as the variable Systolic. Values for Skewness and Kurtosis close to zero result from distributions that are close to normal. The theoretical normal distribution has a mean (Mu) of 136.91 and a standard deviation (Sigma) of 23.74. The circles on the plot represent values of systolic blood pressure from the sample data. The right side of this Q-Q plot shows circles above the straight line, indicating that the sample data includes values of systolic blood pressure that are higher than expected if the values were normally distributed. This confirms the strong positive skewness seen in the prior histogram. Positive values for skewness indicate a positively skewed distribution (right tail). Positive values for kurtosis indicate both that the distribution is too peaked (leptokurtic) and that the tails are too heavy. (In contrast, negative values for kurtosis indicate that the distribution is too flat (platykurtic) and that the tails are too light. Modern interpretation of kurtosis puts emphasis on the tails being too heavy or too light and deemphasizes the concepts of the distribution being too peaked or too flat.)

Adding a Classification Variable in the Summary Statistics.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Adding a Classification Variable in the Summary Statistics - start */
ods noproctitle;
ods graphics / imagemap=on;

/* Summary statistics for Height, reported separately for each level of Sex */
title1 "Adding a Classification Variable in the Summary Statistics";
proc means data=sashelp.heart chartype mean std min max median n nmiss 
        vardef=df clm alpha=0.05 qmethod=os;
    var Height;
    class Sex;
run;

/* Histogram of Height per Sex with a fitted normal curve; both NOPRINT
   options suppress the tabular output so only the graph is produced */
proc univariate data=sashelp.heart vardef=df noprint;
    var Height;
    class Sex;
    histogram Height / normal(noprint);
run;

/* Sort a scratch copy by the group variable before plotting it with
   PROC BOXPLOT */
proc sort data=sashelp.heart out=WORK.TempSorted2236;
    by Sex;
run;

proc boxplot data=WORK.TempSorted2236;
    plot (Height)*Sex / boxstyle=schematic;
run;

/* Drop the scratch copy.  PROC DATASETS is interactive, so end it explicitly
   with QUIT rather than leaving it running until the next step begins. */
proc datasets library=WORK noprint;
    delete TempSorted2236;
    run;
quit;

/* Clear the title so it does not carry over to later output */
title1;
/* Adding a Classification Variable in the Summary Statistics - end */

/* The center of the distribution for Sex=Male is shifted to the right compared to the distribution for Sex=Female.  The box plots also show more outliers in the female distribution of Height compared to the male distribution.  This may be partly due to the smaller interquartile range (the distance from the top to the bottom of the box) for the females compared to the males. */

SAS Output

Results: biostats_by_ex_SAScode.sas

Adding a Classification Variable in the Summary Statistics

The Means Procedure

Summary statistics


Analysis Variable : Height
Sex	N Obs	Mean	Std Dev	Minimum	Maximum	Median	N	N Miss	Lower 95% CL for Mean	Upper 95% CL for Mean
Female	2873	62.5725863	2.4524112	51.5000000	70.7500000	62.5000000	2869	4	62.4828104	62.6623621
Male	2336	67.5673736	2.7321366	56.0000000	76.5000000	67.5000000	2334	2	67.4564752	67.6782721

Adding a Classification Variable in the Summary Statistics

The Univariate Procedure

Height

Histogram 1

Panel 1

Adding a Classification Variable in the Summary Statistics

The Boxplot Procedure

Height

Panel 1

The center of the distribution for Sex=Male is shifted to the right compared to the distribution for Sex=Female. The box plots also show more outliers in the female distribution of Height compared to the male distribution. This may be partly due to the smaller interquartile range (the distance from the top to the bottom of the box) for the females compared to the males.

Describing Categorical Variables.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Describing Categorical Variables  - start */

/* One-way frequency tables and frequency plots for three categorical variables.
   NOCUM drops the cumulative columns; MISSING reports missing values as their
   own level (the blank Chol_Status row in the output). */
TITLE1 "Frequencies on Data from Health Data Set";
PROC FREQ DATA=sashelp.heart;
    TABLES Status Sex Chol_Status
           / NOCUM
             MISSING
             PLOTS=(FREQPLOT);
RUN;

/* Clear the title so it does not carry over to later output */
TITLE1;

/* Describing Categorical Variables  - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Frequencies on Data from Health Data Set

The Freq Procedure

Table Status

One-Way Frequencies


Status	Frequency	Percent
Alive	3218	61.78
Dead	1991	38.22

Distribution Plots

Frequency Plot

Table Sex

One-Way Frequencies


Sex	Frequency	Percent
Female	2873	55.15
Male	2336	44.85

Distribution Plots

Frequency Plot

Table Chol_Status

One-Way Frequencies


Cholesterol Status
Chol_Status	Frequency	Percent
	152	2.92
Borderline	1861	35.73
Desirable	1405	26.97
High	1791	34.38

Distribution Plots

Frequency Plot

Bar Chart of Frequencies for Chol_Status

Practice: Distribution of SBP (Systolic Blood Pressure) and DBP (Diastolic Blood Pressure) Values

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Distribution Practice 5-3  - start */

ODS NOPROCTITLE;
ODS GRAPHICS / IMAGEMAP=ON;

PROC UNIVARIATE DATA=work.blood_pressure;
    /* Keep only the histograms, goodness-of-fit tables and Q-Q plots */
    ODS SELECT Histogram GoodnessOfFit QQPlot;
    VAR SBP DBP;

    /* Checking for Normality: histogram with a fitted normal curve, statistics
       inset in the north-east corner */
    HISTOGRAM SBP DBP / NORMAL(MU=EST SIGMA=EST);
    INSET SKEWNESS KURTOSIS N / POSITION=NE;

    /* Q-Q plot against the same fitted normal, statistics inset in the
       north-west corner */
    QQPLOT SBP DBP / NORMAL(MU=EST SIGMA=EST);
    INSET SKEWNESS KURTOSIS N / POSITION=NW;
RUN;

/* Make sure no title is left in effect for later output */
TITLE1;

/* Practice Distribution 5-3  - end */

/* Using data from Blood_Pressure.xlsx, we examine the distribution of the two variables: SBP (Systolic Blood Pressure) and DBP (Diastolic Blood Pressure) with histogram and Q-Q plots.  Tests of normality (https://support.sas.com/documentation/cdl/en/procstat/63104/HTML/default/viewer.htm#procstat_univariate_sect037.htm) were requested, including the [SAS-modified version of the] Kolmogorov-Smirnov D test.  SBP (Systolic Blood Pressure) has a Kolmogorov-Smirnov D p-value of 0.111, exceeding our arbitrary significance level of 0.05, which means we cannot reject the null hypothesis, and cannot conclude that the data did not come from a normal distribution.  DBP (Diastolic Blood Pressure) has a Kolmogorov-Smirnov D p-value of 0.041, which is less than the predetermined significance level of 0.05, so we reject the null hypothesis and conclude that the data did not come from a normal distribution. */

SAS Output

Results: biostats_by_ex_SAScode.sas

The Univariate Procedure

SBP

Histogram 1

Panel 1

Fitted Normal Distribution for SBP (SBP)

Normal Fit

Goodness of Fit


Goodness-of-Fit Tests for Normal Distribution
Test	Statistic		p Value
Kolmogorov-Smirnov	D	0.10671056	Pr > D	0.111
Cramer-von Mises	W-Sq	0.08450704	Pr > W-Sq	0.184
Anderson-Darling	A-Sq	0.45548604	Pr > A-Sq	>0.250

Q-Q Plot 1

Panel 1

DBP

Histogram 1

Panel 1

Fitted Normal Distribution for DBP (DBP)

Normal Fit

Goodness of Fit


Goodness-of-Fit Tests for Normal Distribution
Test	Statistic		p Value
Kolmogorov-Smirnov	D	0.12096354	Pr > D	0.041
Cramer-von Mises	W-Sq	0.10293193	Pr > W-Sq	0.101
Anderson-Darling	A-Sq	0.59220879	Pr > A-Sq	0.122

Q-Q Plot 1

Panel 1

Using data from Blood_Pressure.xlsx, we examine the distribution of the two variables: SBP (Systolic Blood Pressure) and DBP (Diastolic Blood Pressure) with histogram and Q-Q plots. Tests of normality were requested, including the [SAS-modified version of the] Kolmogorov-Smirnov D test. SBP (Systolic Blood Pressure) has a Kolmogorov-Smirnov D p-value of 0.111, exceeding our arbitrary significance level of 0.05, which means we cannot reject the null hypothesis, and cannot conclude that the data did not come from a normal distribution. DBP (Diastolic Blood Pressure) has a Kolmogorov-Smirnov D p-value of 0.041, which is less than the predetermined significance level of 0.05, so we reject the null hypothesis and conclude that the data did not come from a normal distribution.

Chapter 6: One-Sample Tests

Performing a One-Sample t Test.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/*** Chapter 6: One-Sample Tests ***/

/* Performing a One-Sample t Test - generated code - start */

ODS NOPROCTITLE;
ODS GRAPHICS / IMAGEMAP=ON;

/* You have 56 new measurements of perch and want to test if their mean weight is different from the historical mean value of 500. */

/* Test for normality: request the normality tests for Weight and keep only
   that table from the procedure's output */
TITLE1 "Performing Tests for normality on Perch data";
PROC UNIVARIATE DATA=work.perch
                NORMAL
                MU0=500;
    ODS SELECT TestsForNormality;
    VAR Weight;
RUN;

/* All of these tests reject the null hypothesis (at the α = .05 level).  One of the assumptions for one- or two-sample t tests is that the data values come from a population of values that are normally distributed.  At this point, you may be tempted to abandon the t test and choose a nonparametric alternative such as a sign test or a Wilcoxon rank sum test. The decision whether to use a parametric test should not be determined solely by these tests of normality. Before you decide to abandon the one-sample t test, you should take a look at the distribution of weights.  The one-sample t test task produces both a histogram and a Q-Q plot to help you understand how your data values are distributed. */

/* t test: two-sided one-sample test of mean Weight against the historical
   value of 500; SHOWH0 marks the null value on the plots */
title1 "Performing a One-Sample t Test on Perch data";
proc ttest data=work.perch sides=2 h0=500 plots(showh0);
    var Weight;
run;

/* Clear the title, as every other section does, so it does not carry over
   to later output */
title1;

/* The 56 perch had a mean weight of 382.2 and a standard deviation of 347.6.  The 95% confidence limits are (293.1, 427.3), which does not include the historical mean of 500.  We see a t value of -2.54 and a p-value of .0141, which lets us reject the null hypothesis and state that that perch weights are lower than the historical value of 500 at a significance level α = .05.  */

/* "Output from the one-sample t test, shows a histogram with a normal distribution and a kernel distribution superimposed.  Although this is a skewed distribution, you may decide that with a sample size of 56, you can rely on the t test to decide if you should accept the alternative hypothesis (reject the null hypothesis).  You probably want to check the nonparametric test results." */

/* Performing a One-Sample t Test - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Performing Tests for normality on Perch data

Variable: Weight (Weight)

The Univariate Procedure

Weight

Tests For Normality


Tests for Normality
Test	Statistic		p Value
Shapiro-Wilk	W	0.816849	Pr < W	<0.0001
Kolmogorov-Smirnov	D	0.239184	Pr > D	<0.0100
Cramer-von Mises	W-Sq	0.810687	Pr > W-Sq	<0.0050
Anderson-Darling	A-Sq	4.362138	Pr > A-Sq	<0.0050

The Ttest Procedure

Weight

Performing a One-Sample t Test on Perch data

Variable: Weight (Weight)

Statistics


N	Mean	Std Dev	Std Err	Minimum	Maximum
56	382.2	347.6	46.4524	5.9000	1100.0

Confidence Limits


Mean	95% CL Mean		Std Dev	95% CL Std Dev
382.2	289.1	475.3	347.6	293.1	427.3

T-Tests


DF	t Value	Pr > \|t\|
55	-2.54	0.0141

Summary Panel

Q-Q Plot

You have 56 new measurements of perch and want to test if their mean weight is different from the historical mean value of 500.
All 4 tests for normality reject the null hypothesis (at the α = .05 level). “One of the assumptions for one- or two-sample t tests is that the data values come from a population of values that are normally distributed. At this point, you may be tempted to abandon the t test and choose a nonparametric alternative such as a sign test or a Wilcoxon rank sum test. The decision whether to use a parametric test should not be determined solely by these tests of normality. Before you decide to abandon the one-sample t test, you should take a look at the distribution of weights. The one-sample t test task produces both a histogram and a Q-Q plot to help you understand how your data values are distributed.”
The 56 perch had a mean weight of 382.2 and a standard deviation of 347.6. The 95% confidence limits for the mean are (289.1, 475.3), which do not include the historical mean of 500. (The interval (293.1, 427.3) in the same table is the 95% confidence interval for the standard deviation.) We see a t value of -2.54 and a p-value of .0141, which lets us reject the null hypothesis and state that perch weights are lower than the historical value of 500 at a significance level α = .05.
“Output from the one-sample t test, shows a histogram with a normal distribution and a kernel distribution superimposed. Although this is a skewed distribution, you may decide that with a sample size of 56, you can rely on the t test to decide if you should accept the alternative hypothesis (reject the null hypothesis). You probably want to check the nonparametric test results.”

“Perform some nonparametric tests to confirm conclusion from the one-sample t test because distribution of weights is significantly different from a normal distribution.”

SAS Code, click to expand

SAS Code:

/* (&path1 and biostats libname previously defined) */
/* Nonparametric One-sample Tests - generated code - start */

/* Purpose: re-test the perch mean weight against mu0=500 using the location tests reported by PROC UNIVARIATE (Student's t, sign, and signed rank). */

/* "Perform some nonparametric tests to confirm conclusion from the one-sample t test because distribution of weights is significantly different from a normal distribution." */

/* Nonparametric test */
title1 "Performing Nonparametric One-sample Tests on Perch data";
proc univariate data=work.perch mu0=500;
    /* keep only the Tests for Location table */
    ods select TestsForLocation;
    var Weight;
run;
title1;

/* "Both the sign test (https://en.wikipedia.org/wiki/Sign_test) and the Wilcoxon Signed Rank test (https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test) show p-values less than .05, confirming conclusion from the prior one-sample t test." */

/* Nonparametric One-sample Tests - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Performing Nonparametric One-sample Tests on Perch data

Variable: Weight (Weight)

The Univariate Procedure

Weight

Tests For Location


Tests for Location: Mu0=500
Test	Statistic		p Value
Student's t	t	-2.53509	Pr > \|t\|	0.0141
Sign	M	-9	Pr >= \|M\|	0.0222
Signed Rank	S	-275	Pr >= \|S\|	0.0235

“Both the sign test (https://en.wikipedia.org/wiki/Sign_test) and the Wilcoxon Signed Rank test (https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test) show p-values less than .05, confirming conclusion from the prior one-sample t test.”

Chapter 7: Two-Sample Tests

Unpaired t Test (t Test for Independent Groups).

SAS Code, click to expand

SAS Code:

/* (&path1 and biostats libname previously defined) */
/*** Chapter 7: Two-Sample Tests ***/

/* Unpaired t Test (t Test for Independent Groups) - generated code - start */

/* Purpose: compare Weight between the two Sex groups in sashelp.heart with normality tests, an unpaired t test, and a Wilcoxon rank sum test. */

ods noproctitle;
ods graphics / imagemap=on;

/* Test for normality */
title1 "Performing Normality Tests on Heart / Weight Data by Sex";
proc univariate data=sashelp.heart normal mu0=0;
    ods select TestsForNormality;
    class Sex;
    var Weight;
run;

/* t test */
title1 "Performing Unpaired t Test on Heart / Weight Data by Sex";
proc ttest data=sashelp.heart sides=2 h0=0 plots(showh0);
    class Sex;
    var Weight;
run;

/* Nonparametric test */
title1 "Nonparametric Two-sample Tests on Heart / Weight Data by Sex";
proc npar1way data=sashelp.heart wilcoxon plots=wilcoxonplot;
    class Sex;
    var Weight;
run;
title1;

/* "The three tests for normality, for both values of Sex, show p-values less than .05, leading us to reject the null hypothesis that the distribution of weight for females and males is normally distributed.  Look at the histogram and Q-Q plots to determine if parametric tests are appropriate." */

/* "There are two different confidence limits listed and two t- and p-values in the table; one set of values is computed assuming that the two groups have equal variance; the other set of values is computed assuming unequal variance.  The most common strategy to choose which set of values is to examine the F test, which tests the equality of variances for the groups.  (The null hypothesis is that the variances are equal; the alternative hypothesis is that they are not equal.)  If the p-value for this test is less than .05, you choose the t- and p- values under the assumption of unequal variances (labeled Satterthwaite); if the p-value is greater than .05 (in this case it is .0503), you use the values based on equal variances (labeled Pooled) which has t-value of -36.20 and p-value of <.0001.  Based on the test of homogeneity of variance p-value of .0503, we use the pooled values in the table.  We reject the null hypothesis that the populations from which you took the female and male weight samples have equal means, and conclude that the female mean was less than the male mean." */

/* (Nonparametric Two-sample Tests performed on same data for optional reference.) */

/* Unpaired t Test (t Test for Independent Groups) - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Performing Normality Tests on Heart / Weight Data by Sex

Variable: Weight

Sex = Female

The Univariate Procedure

Weight

Sex = Female

Tests For Normality


Tests for Normality
Test	Statistic		p Value
Kolmogorov-Smirnov	D	0.077525	Pr > D	<0.0100
Cramer-von Mises	W-Sq	4.91407	Pr > W-Sq	<0.0050
Anderson-Darling	A-Sq	29.30832	Pr > A-Sq	<0.0050

Performing Normality Tests on Heart / Weight Data by Sex

Variable: Weight

Sex = Male

Tests For Normality


Tests for Normality
Test	Statistic		p Value
Kolmogorov-Smirnov	D	0.033472	Pr > D	<0.0100
Cramer-von Mises	W-Sq	0.559512	Pr > W-Sq	<0.0050
Anderson-Darling	A-Sq	4.016874	Pr > A-Sq	<0.0050

The Ttest Procedure

Weight

Performing Unpaired t Test on Heart / Weight Data by Sex

Variable: Weight

Statistics


Sex	Method	N	Mean	Std Dev	Std Err	Minimum	Maximum
Female		2869	141.4	26.2880	0.4908	67.0000	300.0
Male		2334	167.5	25.2907	0.5235	99.0000	276.0
Diff (1-2)	Pooled		-26.0775	25.8454	0.7204
Diff (1-2)	Satterthwaite		-26.0775		0.7176

Confidence Limits


Sex	Method	Mean	95% CL Mean		Std Dev	95% CL Std Dev
Female		141.4	140.4	142.4	26.2880	25.6251	26.9865
Male		167.5	166.4	168.5	25.2907	24.5855	26.0379
Diff (1-2)	Pooled	-26.0775	-27.4899	-24.6652	25.8454	25.3582	26.3519
Diff (1-2)	Satterthwaite	-26.0775	-27.4843	-24.6708

T-Tests


Method	Variances	DF	t Value	Pr > \|t\|
Pooled	Equal	5201	-36.20	<.0001
Satterthwaite	Unequal	5057.9	-36.34	<.0001

Equality of Variances


Equality of Variances
Method	Num DF	Den DF	F Value	Pr > F
Folded F	2868	2333	1.08	0.0503

Summary Panel

Q-Q Plots

Nonparametric Two-sample Tests on Heart / Weight Data by Sex

The Npar1way Procedure

Variable Weight

Wilcoxon Analysis

Scores


Wilcoxon Scores (Rank Sums) for Variable Weight Classified by Variable Sex
Sex	N	Sum of Scores	Expected Under H0	Std Dev Under H0	Mean Score
Average scores were used for ties.
Female	2869	5594004.0	7465138.0	53884.9799	1949.80969
Male	2334	7944202.0	6073068.0	53884.9799	3403.68552

Two-Sample Test


Wilcoxon Two-Sample Test
Statistic	Z	Pr > Z	Pr > \|Z\|	t Approximation
Statistic	Z	Pr > Z	Pr > \|Z\|	Pr > Z	Pr > \|Z\|
Z includes a continuity correction of 0.5.
7944202	34.7246	<.0001	<.0001	<.0001	<.0001

Kruskal-Wallis Test


Kruskal-Wallis Test
Chi-Square	DF	Pr > ChiSq
1205.7974	1	<.0001

Box Plot

“The three tests for normality, for both values of Sex, show p-values less than .05, leading us to reject the null hypothesis that the distribution of weight for females and males is normally distributed. Look at the histogram and Q-Q plots to determine if parametric tests are appropriate.”
“There are two different confidence limits listed and two t- and p-values in the table; one set of values is computed assuming that the two groups have equal variance; the other set of values is computed assuming unequal variance. The most common strategy to choose which set of values is to examine the F test, which tests the equality of variances for the groups. (The null hypothesis is that the variances are equal; the alternative hypothesis is that they are not equal.) If the p-value for this test is less than .05, you choose the t- and p- values under the assumption of unequal variances (labeled Satterthwaite); if the p-value is greater than .05 (in this case it is .0503), you use the values based on equal variances (labeled Pooled) which has t-value of -36.20 and p-value of <.0001. Based on the test of homogeneity of variance p-value of .0503, we use the pooled values in the table. We reject the null hypothesis that the populations from which you took the female and male weight samples have equal means, and conclude that the female mean was less than the male mean.”
(Nonparametric Two-sample Tests performed on same data for optional reference.)

Nonparametric Two-sample Tests.

SAS Code, click to expand

SAS Code:

/* (&path1 and biostats libname previously defined) */
/* Nonparametric Two-sample Tests - generated code - start */

/* Purpose: compare Weight between the Pike and Roach species in sashelp.fish with normality tests, an unpaired t test, and a Wilcoxon rank sum test.  Each step applies the same WHERE filter so all three analyze the same two species. */

/* Test for normality */
title1 "Performing Normality Tests on Pike vs. Roach Fish";
proc univariate data=sashelp.fish normal mu0=0;
    ods select TestsForNormality;
    where (Species EQ "Pike") OR (Species EQ "Roach");
    class Species;
    var Weight;
run;

/* t test */
title1 "Unpaired t Test on Pike vs. Roach Fish";
proc ttest data=sashelp.fish sides=2 h0=0 plots(only showh0)=(summaryPlot 
        qqplot);
    where (Species EQ "Pike") OR (Species EQ "Roach");
    class Species;
    var Weight;
run;

/* Nonparametric test */
title1 "Nonparametric Two-sample Tests on Pike vs. Roach Fish";
proc npar1way data=sashelp.fish wilcoxon plots=wilcoxonplot;
    where (Species EQ "Pike") OR (Species EQ "Roach");
    class Species;
    var Weight;
run;
title1;

/* "We see smaller sample sizes for the two fish species (n=20 for Roach and n=17 for Pike) compared to our previous example.  The tests for normality reject the null hypothesis that these weights for Pike come from a population that is normally distributed, and the histograms show a distribution that is not symmetric or normal.  Time to use nonparametric tests."  */ 

/* "The Wilcoxon rank sum test shows the sum of ranks for the two fish species.  If the null hypothesis is true, you would expect the sum of ranks to be about the same for both groups, which they are not.  This is also reflected on the histograms which show Roach fish being lighter than Pike fish.  There are two p-values shown in the Wilcoxon Two-Sample Test section: a z-test (with a correction for continuity) for larger samples; and a t approximation sometimes used for smaller samples (such as this example).  Both p-values are tiny (<.0001) which implies that Roach fish are lighter than Pike fish." */

/* Nonparametric Two-sample Tests - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Performing Normality Tests on Pike vs. Roach Fish

Variable: Weight

Species = Pike

The Univariate Procedure

Weight

Species = Pike

Tests For Normality


Tests for Normality
Test	Statistic		p Value
Shapiro-Wilk	W	0.821287	Pr < W	0.0040
Kolmogorov-Smirnov	D	0.267641	Pr > D	<0.0100
Cramer-von Mises	W-Sq	0.218639	Pr > W-Sq	<0.0050
Anderson-Darling	A-Sq	1.255384	Pr > A-Sq	<0.0050

Performing Normality Tests on Pike vs. Roach Fish

Variable: Weight

Species = Roach

Tests For Normality


Tests for Normality
Test	Statistic		p Value
Shapiro-Wilk	W	0.932277	Pr < W	0.1708
Kolmogorov-Smirnov	D	0.176514	Pr > D	0.0986
Cramer-von Mises	W-Sq	0.102833	Pr > W-Sq	0.0968
Anderson-Darling	A-Sq	0.572359	Pr > A-Sq	0.1243

The Ttest Procedure

Weight

Unpaired t Test on Pike vs. Roach Fish

Variable: Weight

Statistics


Species	Method	N	Mean	Std Dev	Std Err	Minimum	Maximum
Pike		17	718.7	494.1	119.8	200.0	1650.0
Roach		20	152.1	88.8289	19.8627	0	390.0
Diff (1-2)	Pooled		566.7	340.5	112.3
Diff (1-2)	Satterthwaite		566.7		121.5

Confidence Limits


Species	Method	Mean	95% CL Mean		Std Dev	95% CL Std Dev
Pike		718.7	464.6	972.8	494.1	368.0	752.0
Roach		152.1	110.5	193.6	88.8289	67.5535	129.7
Diff (1-2)	Pooled	566.7	338.7	794.7	340.5	276.1	444.1
Diff (1-2)	Satterthwaite	566.7	310.2	823.1

T-Tests


Method	Variances	DF	t Value	Pr > \|t\|
Pooled	Equal	35	5.05	<.0001
Satterthwaite	Unequal	16.88	4.66	0.0002

Equality of Variances


Equality of Variances
Method	Num DF	Den DF	F Value	Pr > F
Folded F	16	19	30.95	<.0001

Summary Panel

Q-Q Plots

Nonparametric Two-sample Tests on Pike vs. Roach Fish

The Npar1way Procedure

Variable Weight

Wilcoxon Analysis

Scores


Wilcoxon Scores (Rank Sums) for Variable Weight Classified by Variable Species
Species	N	Sum of Scores	Expected Under H0	Std Dev Under H0	Mean Score
Average scores were used for ties.
Roach	20	217.50	380.0	32.798983	10.875000
Pike	17	485.50	323.0	32.798983	28.558824

Two-Sample Test


Wilcoxon Two-Sample Test
Statistic	Z	Pr > Z	Pr > \|Z\|	t Approximation
Statistic	Z	Pr > Z	Pr > \|Z\|	Pr > Z	Pr > \|Z\|
Z includes a continuity correction of 0.5.
485.5000	4.9392	<.0001	<.0001	<.0001	<.0001

Kruskal-Wallis Test


Kruskal-Wallis Test
Chi-Square	DF	Pr > ChiSq
24.5463	1	<.0001

Box Plot

“We see smaller sample sizes for the two fish species (n=20 for Roach and n=17 for Pike) compared to our previous example. The tests for normality reject the null hypothesis that these weights for Pike come from a population that is normally distributed, and the histograms show a distribution that is not symmetric or normal. Time to use nonparametric tests.”
“The Wilcoxon rank sum test shows the sum of ranks for the two fish species. If the null hypothesis is true, you would expect the sum of ranks to be about the same for both groups, which they are not. This is also reflected on the histograms which show Roach fish being lighter than Pike fish. There are two p-values shown in the Wilcoxon Two-Sample Test section: a z-test (with a correction for continuity) for larger samples; and a t approximation sometimes used for smaller samples (such as this example). Both p-values are tiny (<.0001) which implies that Roach fish are lighter than Pike fish.”

Paired t Test.

SAS Code, click to expand

SAS Code:

/* (&path1 and biostats libname previously defined) */
/* Paired t Test - generated code - start */

/* Purpose: test whether heart rate changes between the Before and After measurements in work.yoga, using a paired t test plus sign and signed rank tests on the per-subject differences. */

ods noproctitle;
ods graphics / imagemap=on;

/* Build the per-subject difference (Before - After) used by the normality and nonparametric steps */
data Work._Paired_diffs_;
    set work.yoga;
    _Difference_=Before - After;
    label _Difference_="Difference: Before - After";
run;

/* Test for normality */
title1 "Normality Tests on Yoga data";
proc univariate data=Work._Paired_diffs_ normal mu0=0;
    ods select TestsForNormality;
    var _Difference_;
run;

/* t test */
title1 "Paired t Test on Yoga data";
proc ttest data=work.yoga sides=2 h0=0 plots(showh0);
    paired Before*After;
run;

/* Nonparametric test (sign and signed rank tests on the paired differences) */
title1 "Nonparametric Two-sample Tests on Yoga data";
proc univariate data=work._paired_diffs_ mu0=0;
    ods select TestsForLocation;
    var _Difference_;
run;

/* Clean up */
proc delete data=work._paired_diffs_;
run;

/* Data taken from a small study designed to show if a half hour of yoga can lower a subject's heart rate. Before and After heart rates for 10 participants were measured. */

/* NOTE(review): the PROC TTEST output reports N=9 (DF=8), so presumably one of the 10 pairs has a missing value - confirm against work.yoga. */

/* "All tests for normality are not significant, but due to the tiny sample size (n=10) you should not interpret this as difference scores being normally distributed.  You need to look at the histogram or Q-Q plot to decide if you should be using a nonparametric test.  The t table shows a mean difference of 4.222 with t-value of 3.74 and p-value of .0057.  Because the mean difference is positive (computed as before - after), we conclude that yoga helped reduce heart rate." */

/* "When in doubt, rerun analysis using nonparametric tests such as the Sign test and the Wilcoxon Signed Rank test.  Both show a significant difference at the .05 level." */

/* "Do not simply look at the p-values from the normality tests. Doing so often results in incorrect decisions. When sample sizes are large, the tests for normality are often significant. When sample sizes are small, they are rarely significant. And, it is with small samples that deviations from normality are most important." */

title1;

/* Paired t Test - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Normality Tests on Yoga data

Variable: _Difference_ (Difference: Before - After)

The Univariate Procedure

_Difference_

Tests For Normality


Tests for Normality
Test	Statistic		p Value
Shapiro-Wilk	W	0.952338	Pr < W	0.7157
Kolmogorov-Smirnov	D	0.162908	Pr > D	>0.1500
Cramer-von Mises	W-Sq	0.027145	Pr > W-Sq	>0.2500
Anderson-Darling	A-Sq	0.209771	Pr > A-Sq	>0.2500

The Ttest Procedure

Before - After

Paired t Test on Yoga data

Difference: Before - After

Statistics


N	Mean	Std Dev	Std Err	Minimum	Maximum
9	4.2222	3.3830	1.1277	0	10.0000

Confidence Limits


Mean	95% CL Mean		Std Dev	95% CL Std Dev
4.2222	1.6218	6.8226	3.3830	2.2850	6.4810

T-Tests


DF	t Value	Pr > \|t\|
8	3.74	0.0057

Summary Panel

Profiles Plot

Agreement Plot

Q-Q Plot

Nonparametric Two-sample Tests on Yoga data

Variable: _Difference_ (Difference: Before - After)

The Univariate Procedure

_Difference_

Tests For Location


Tests for Location: Mu0=0
Test	Statistic		p Value
Student's t	t	3.744251	Pr > \|t\|	0.0057
Sign	M	4	Pr >= \|M\|	0.0078
Signed Rank	S	18	Pr >= \|S\|	0.0078

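“Data taken from a small study designed to show if a half hour of yoga can lower a subject’s heart rate. Before and After heart rates for 10 participants were measured.” [Note: the paired t test output above reports N = 9 (DF = 8), so nine Before/After pairs entered the analysis.]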
“All tests for normality are not significant, but due to the tiny sample size (n=10) you should not interpret this as difference scores being normally distributed. You need to look at the histogram or Q-Q plot to decide if you should be using a nonparametric test. The t table shows a mean difference of 4.222 with t-value of 3.74 and p-value of .0057. Because the mean difference is positive (computed as before - after), we conclude that yoga helped reduce heart rate.”
“When in doubt, rerun analysis using nonparametric tests such as the Sign test and the Wilcoxon Signed Rank test. Both show a significant difference at the .05 level.”
“Do not simply look at the p-values from the normality tests. Doing so often results in incorrect decisions. When sample sizes are large, the tests for normality are often significant. When sample sizes are small, they are rarely significant. And, it is with small samples that deviations from normality are most important.”

Chapter 8: Comparing More Than Two Means (ANOVA)

Performing a One-Way Analysis of Variance.

SAS Code, click to expand

SAS Code:

/* (&path1 and biostats libname previously defined) */
/*** Chapter 8: Comparing More Than Two Means (ANOVA) ***/

/* Performing a One-Way Analysis of Variance - generated code - start */

/* Purpose: one-way ANOVA of Weight across the Chol_Status groups in sashelp.heart, run twice: once with Levene's homogeneity-of-variance test and Tukey-adjusted pairwise comparisons, and once with Student-Newman-Keuls (SNK) comparisons. */

ods noproctitle;
ods graphics / imagemap=on;

title1 "One-Way Analysis of Variance - tukey";
proc glm data=sashelp.heart plots(maxpoints=none only)=(boxplot 
        diagnostics(unpack));
    class Chol_Status;
    model Weight=Chol_Status;
    /* Levene's test for homogeneity of variance across groups */
    means Chol_Status / hovtest=levene plots=none;
    /* Tukey-adjusted p-values for all pairwise differences */
    lsmeans Chol_Status / adjust=tukey pdiff alpha=.05 plots=(meanplot diffplot);
    run;
quit;

title1 "One-Way Analysis of Variance - snk Student-Newman-Keuls";
proc glm data=sashelp.heart plots=none;
    class Chol_Status;
    model Weight=Chol_Status;
    means Chol_Status / snk alpha=.05 plots=none;
    run;
quit;

title1;

/* "The Class Level Information confirms that there are three levels for Chol_Status with a relatively small number of missing values.  The ANOVA table shows F test and p-values, but we need to check the diagnostics tests to confirm whether the assumptions were satisfied.  The model has 2 degrees of freedom (3 levels of the independent variable).  The mean squares for the model and error terms show an F-value (the ratio of the between-group variance and the within-group variance) of 25.90 with a corresponding p-value of less than .0001." */

/* "The Residual Plots show the residuals (the differences between the mean of each group and each individual score) in that group with the predicted values (means of each group) shown on the x-axis.  [Residuals by Predicted for Weight] displays the residuals as actual scores, while [Studentized Residuals by Predicted for Weight] displays the residuals as t scores (the number of standard deviations above or below the mean of the group.)" */

/* "[Q-Q Plot of Residuals for Weight] shows small deviations from a straight line towards the bookends; one of the assumptions for running a one-way ANOVA is that the errors (the residuals are estimates of these errors) are normally distributed which appear as a straight line on a Q-Q plot.  A Histogram for Residuals [Distribution of Residuals for Weight] appears to be relatively normally distributed." */

/* "The Box Plot for Weight by Cholesterol Level [Distribution of Weight] graphically displays the distribution of weights in the 3 groups.  The line in the center of the box represents the median, and the small diamond represents the mean.  The means and medians of the three groups are not very different, but the results are highly significant because of the large (> n=5,000) sample size (which gives you high power to see even small differences)." */

/* "Next, the output table [Levene's Test for Homogeneity of Weight Variance / ANOVA of Squared Deviations from Group Means] shows a p-value of .2194, so you do not reject the null hypothesis of equal variance." */

/* "The next table shows the Group Means and Standard Deviations for the three groups." */

/* "The table that follows [Least Squares Means / Adjustment for Multiple Comparisons: Tukey-Kramer] shows the least square means equal to the means in the previous table because this is a one-way model; in unbalanced models with more than one factor, this may not be the case.  We then see p-values for all of the pairwise differences: Borderline and Desirable shows a p-value = <.0001, High and Desirable shows a p-value = <.0001, and Borderline and High shows a p-value = .4869 (not significant)." */

/* "The plot Pairwise Comparison of Means [Weight Comparisons for Chol_Status] displays pairwise differences.  At the intersection of any two groups, you see a diagonal line representing a 95% confidence interval for the difference between the two group means.  If the interval crosses the main diagonal line (that represents no difference), the two group means are not significantly different at the .05 level.  Also, significant differences are shown in blue and non-significant differences are shown in red." */

/* "Further down this set of plots and tables, we see the output from the Student-Newman-Keuls Test for Weight.  SNK is an alternative method of determining pairwise differences [instead of Tukey], and has a slightly higher power to detect differences.  The SNK display [Weight SNK Grouping for Means of Chol_Status ] shows the three means in order from highest to lowest, and means covered by the same bar are not significantly different.  High and Borderline means show up as not being significantly different at the .05 level, consistent with the prior Tukey pairwise differences table and plot." */

/* Performing a One-Way Analysis of Variance - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

One-Way Analysis of Variance - tukey

The GLM Procedure

Data

Class Levels


Class Level Information
Class	Levels	Values
Chol_Status	3	Borderline Desirable High

Number of Observations


Number of Observations Read	5209
Number of Observations Used	5051

One-Way Analysis of Variance - tukey

Dependent Variable: Weight

Analysis of Variance

Weight

Overall ANOVA


Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	2	42864.375	21432.188	25.90	<.0001
Error	5048	4176597.649	827.377
Corrected Total	5050	4219462.024

Fit Statistics


R-Square	Coeff Var	Root MSE	Weight Mean
0.010159	18.79164	28.76416	153.0689

Type I Model ANOVA


Source	DF	Type I SS	Mean Square	F Value	Pr > F
Chol_Status	2	42864.37515	21432.18758	25.90	<.0001

Type III Model ANOVA


Source	DF	Type III SS	Mean Square	F Value	Pr > F
Chol_Status	2	42864.37515	21432.18758	25.90	<.0001

Diagnostic Plots

Residual by Predicted

RStudent by Predicted

RStudent by Leverage

Q-Q Plot

Weight by Predicted

Cook's D Plot

Needleplot of Cook's D statistic by Observation for Weight

Residual Histogram

Histogram of Residuals for Weight with normal and kernel densities overlaid

RF Plot

Box Plot

Fit Plot for Weight by Cholesterol Status

One-Way Analysis of Variance - tukey

Means

Chol_Status

Weight

Levene's HoV Test


Levene's Test for Homogeneity of Weight Variance ANOVA of Squared Deviations from Group Means
Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Chol_Status	2	5399505	2699752	1.52	0.2194
Error	5048	8.9813E9	1779186

One-Way Analysis of Variance - tukey

Chol_Status

Means


Level of Chol_Status	N	Weight
Level of Chol_Status	N	Mean	Std Dev
Borderline	1860	154.318280	28.5982126
Desirable	1403	148.431219	29.6364336
High	1788	155.408277	28.2367277

One-Way Analysis of Variance - tukey

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

Least Squares Means

Chol_Status

Weight

LSMeans


Chol_Status	Weight LSMEAN	LSMEAN Number
Borderline	154.318280	1
Desirable	148.431219	2
High	155.408277	3

Difference Matrix


Least Squares Means for effect Chol_Status Pr > \|t\| for H0: LSMean(i)=LSMean(j) Dependent Variable: Weight
i/j	1	2	3
1		<.0001	0.4869
2	<.0001		<.0001
3	0.4869	<.0001

Chol_Status Mean Plot

Plot of Weight least-squares means for Chol_Status.

Chol_Status Diffogram

Plot of all pairwise Weight least-squares means differences for Chol_Status with Tukey-Kramer adjustment at significance level 0.05.

One-Way Analysis of Variance - snk Student-Newman-Keuls

The GLM Procedure

Data

Class Levels


Class Level Information
Class	Levels	Values
Chol_Status	3	Borderline Desirable High

Number of Observations


Number of Observations Read	5209
Number of Observations Used	5051

One-Way Analysis of Variance - snk Student-Newman-Keuls

Dependent Variable: Weight

Analysis of Variance

Weight

Overall ANOVA


Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	2	42864.375	21432.188	25.90	<.0001
Error	5048	4176597.649	827.377
Corrected Total	5050	4219462.024

Fit Statistics


R-Square	Coeff Var	Root MSE	Weight Mean
0.010159	18.79164	28.76416	153.0689

Type I Model ANOVA


Source	DF	Type I SS	Mean Square	F Value	Pr > F
Chol_Status	2	42864.37515	21432.18758	25.90	<.0001

Type III Model ANOVA


Source	DF	Type III SS	Mean Square	F Value	Pr > F
Chol_Status	2	42864.37515	21432.18758	25.90	<.0001

Means

Chol_Status

Weight

Multiple Comparison Lines

Student-Newman-Keuls

One-Way Analysis of Variance - snk Student-Newman-Keuls

Student-Newman-Keuls Test for Weight

Note:This test controls the Type I experimentwise error rate under the complete null hypothesis but not under partial null hypotheses.

Information


Alpha	0.05
Error Degrees of Freedom	5048
Error Mean Square	827.3767
Harmonic Mean of Cell Sizes	1657.754

Note:Cell sizes are not equal.

Critical Ranges


Number of Means	2	3
Critical Range	1.958659	2.3422709

LinesPlot

“The Class Level Information confirms that there are three levels for Chol_Status with a relatively small number of missing values. The ANOVA table shows F test and p-values, but we need to check the diagnostics tests to confirm whether the assumptions were satisfied. The model has 2 degrees of freedom (3 levels of the independent variable). The mean squares for the model and error terms show an F-value (the ratio of the between-group variance and the within-group variance) of 25.90 with a corresponding p-value of less than .0001.”
“The Residual Plots show the residuals (the differences between the mean of each group and each individual score) in that group with the predicted values (means of each group) shown on the x-axis. [Residuals by Predicted for Weight] displays the residuals as actual scores, while [Studentized Residuals by Predicted for Weight] displays the residuals as t scores (the number of standard deviations above or below the mean of the group.)”
“[Q-Q Plot of Residuals for Weight] shows small deviations from a straight line towards the bookends; one of the assumptions for running a one-way ANOVA is that the errors (the residuals are estimates of these errors) are normally distributed which appear as a straight line on a Q-Q plot. A Histogram for Residuals [Distribution of Residuals for Weight] appears to be relatively normally distributed.”
“The Box Plot for Weight by Cholesterol Level [Distribution of Weight] graphically displays the distribution of weights in the 3 groups. The line in the center of the box represents the median, and the small diamond represents the mean. The means and medians of the three groups are not very different, but the results are highly significant because of the large (> n=5,000) sample size (which gives you high power to see even small differences).”
“Next, the output table [Levene’s Test for Homogeneity of Weight Variance / ANOVA of Squared Deviations from Group Means] shows a p-value of .2194, so you do not reject the null hypothesis of equal variance.”
“The next table shows the Group Means and Standard Deviations for the three groups.”
“The table that follows [Least Squares Means / Adjustment for Multiple Comparisons: Tukey-Kramer] shows the least square means equal to the means in the previous table because this is a one-way model; in unbalanced models with more than one factor, this may not be the case. We then see p-values for all of the pairwise differences: Borderline and Desirable shows a p-value = <.0001, High and Desirable shows a p-value = <.0001, and Borderline and High shows a p-value = .4869 (not significant).”
“The plot Pairwise Comparison of Means [Weight Comparisons for Chol_Status] displays pairwise differences. At the intersection of any two groups, you see a diagonal line representing a 95% confidence interval for the difference between the two group means. If the interval crosses the main diagonal line (that represents no difference), the two group means are not significantly different at the .05 level. Also, significant differences are shown in blue and non-significant differences are shown in red.”
“Further down this set of plots and tables, we see the output from the Student-Newman-Keuls Test for Weight. SNK is an alternative method of determining pairwise differences [instead of Tukey], and has a slightly higher power to detect differences. The SNK display [Weight SNK Grouping for Means of Chol_Status ] shows the three means in order from highest to lowest, and means covered by the same bar are not significantly different. High and Borderline means show up as not being significantly different at the .05 level, consistent with the prior Tukey pairwise differences table and plot.”

Nonparametric One-Way Tests.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Performing a Nonparametric One-Way Tests - generated code - start */
ods noproctitle;
title1 "Nonparametric One-Way Test on 3 Fish Species";

/* Kruskal-Wallis (WILCOXON) test of Weight across three species, with         */
/* Dwass-Steel-Critchlow-Fligner (DSCF) pairwise comparisons and a box plot    */
/* of the Wilcoxon scores. Only Bream, Pike and Roach are analysed.            */
proc npar1way
    data=sashelp.fish (where=(Species in ('Bream', 'Pike', 'Roach')))
    wilcoxon
    dscf
    plots(only)=(wilcoxonboxplot);
  class Species;
  var Weight;
run;

/* "Use Nonparametric One-Way ANOVA + Pairwise multiple comparison analysis (asymptotic only) if you feel that the distribution assumptions are not satisfied." */

/* "The Kruskal-Wallis test implies that the three fish weights are not all equal (p <.0001)." */

/* "The box plot shows that Roach are lighter than Bream or Pike. To determine which pairs of fish are unequal, look at the Pairwise Comparisons table to see the p-values; it shows that the comparisons of Bream vs. Roach and Roach vs. Pike are significantly different (p <.0001) while the comparison of Bream vs. Pike is not." */

title1;
/* Performing a Nonparametric One-Way Tests - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Nonparametric One-Way Test on 3 Fish Species

The Npar1way Procedure

Variable Weight

Wilcoxon Analysis

Scores


Wilcoxon Scores (Rank Sums) for Variable Weight Classified by Variable Species
Species	N	Sum of Scores	Expected Under H0	Std Dev Under H0	Mean Score
Average scores were used for ties.
Bream	34	1580.00	1224.0	86.852273	46.470588
Roach	20	224.50	720.0	78.206158	11.225000
Pike	17	751.50	612.0	74.192876	44.205882

Kruskal-Wallis Test


Kruskal-Wallis Test
Chi-Square	DF	Pr > ChiSq
40.2791	2	<.0001

Box Plot

Nonparametric One-Way Test on 3 Fish Species

Multiple Comparison Analysis

DSCF


Pairwise Two-Sided Multiple Comparison Analysis
Dwass, Steel, Critchlow-Fligner Method
Variable: Weight
Species	Wilcoxon Z	DSCF Value	Pr > DSCF
Bream vs. Roach	5.9671	8.4388	<.0001
Bream vs. Pike	0.4599	0.6504	0.8900
Roach vs. Pike	-4.9544	7.0066	<.0001

“Use Nonparametric One-Way ANOVA + Pairwise multiple comparison analysis (asymptotic only) if you feel that the distribution assumptions are not satisfied.”
“The Kruskal-Wallis test implies that the three fish weights are not all equal (p <.0001).”
“The box plot shows that Roach are lighter than Bream or Pike. To determine which pairs of fish are unequal, look at the Pairwise Comparisons table to see the p-values; it shows that the comparisons of Bream vs. Roach and Roach vs. Pike are significantly different (p <.0001) while the comparison of Bream vs. Pike is not.”

Practice: One-way ANOVA with test for Tukey multiple comparisons.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Practice: One-way ANOVA with test for Tukey multiple comparisons, CHF - generated code - start */

title1 "Practice: One-way ANOVA with test for Tukey multiple comparisons, CHF";
/* Left ventricular ejection fraction (LVEF) values, 10 subjects per group: */
/* Placebo: 55 58 62 48 57 57 80 40 55 52 */ 
/* Calcium: 57 65 55 78 57 84 72 80 78 81 */ 
/* Lasix:   60 60 65 67 48 62 64 70 57 40 */ 
data work.congestive_heart_failure;
  /* Declare the length explicitly: otherwise it is taken from the first value */
  /* in the DO list, and reordering the list (e.g. 'Lasix' first) would        */
  /* silently truncate the longer group names.                                 */
  length Group $ 7;
  do Group = 'Placebo','Calcium','Lasix';
    do Subj = 1 to 10;
      /* @@ holds the current data line so successive INPUT calls keep reading from it */
      input LVEF @@;
      output; 
    end;
  end;
datalines;
  55 58 62 48 57 57 80 40 55 52
  57 65 55 78 57 84 72 80 78 81
  60 60 65 67 48 62 64 70 57 40
    ; 
/* proc print u data=work.congestive_heart_failure ; run; */

/* One-way ANOVA of LVEF by Group with box plot and unpacked diagnostics,       */
/* Levene's homogeneity-of-variance test, Welch's variance-weighted ANOVA, and  */
/* Tukey-adjusted pairwise comparisons of the least squares means.              */
ods noproctitle;
ods graphics / imagemap=on;
proc glm data=work.congestive_heart_failure
  plots(maxpoints=none only)=(boxplot diagnostics(unpack));
    class Group;
    model LVEF=Group;
    means Group / hovtest=levene welch plots=none;
    lsmeans Group / adjust=tukey pdiff alpha=.05 plots=(meanplot diffplot);
    run;
quit;

/* The ANOVA table shows an F-value of 5.45 with a corresponding p-value of 0.0102. */ 

/* The Q-Q Plot of Residuals for left ventricular ejection fraction (LVEF) shows deviations from the straight line, which led me to request Welch's variance-weighted ANOVA. The Histogram for Residuals [Distribution of Residuals for LVEF] appears to be relatively normally distributed. */ 

/* To confirm the assumption of homogeneity of variance / homoscedasticity, we require a p-value above the chosen significance level. The output table [Levene's Test for Homogeneity of LVEF Variance / ANOVA of Squared Deviations from Group Means] shows a p-value of .8069, so do not reject the null hypothesis of equal variance at the .05 level. The output table [Welch's ANOVA for LVEF] shows a p-value of .0228; note that Welch's ANOVA tests the null hypothesis of equal group means (without assuming equal variances), not equal variances, so this result rejects equality of the means at the .05 level and agrees with the standard ANOVA (p = 0.0102). */ 

/* The table [Least Squares Means / Adjustment for Multiple Comparisons: Tukey] shows p-values for all of the pairwise differences: Lasix vs. Placebo shows a p-value = 0.8031 (not significant at the .05 level), Calcium vs. Lasix shows a p-value = 0.0488 (significant at the .05 level), and Calcium vs. Placebo shows a p-value = 0.0114 (significant at the .05 level).  This is confirmed by the plot Pairwise Comparison of Means [LVEF Comparisons for Group]. */ 

title1;
/* Practice: One-way ANOVA with test for Tukey multiple comparisons, CHF - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

The GLM Procedure

Data

Class Levels


Class Level Information
Class	Levels	Values
Group	3	Calcium Lasix Placebo

Number of Observations


Number of Observations Read	30
Number of Observations Used	30

Dependent Variable: LVEF

Analysis of Variance

LVEF

Overall ANOVA


Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	2	1142.866667	571.433333	5.45	0.0102
Error	27	2828.600000	104.762963
Corrected Total	29	3971.466667

Fit Statistics


R-Square	Coeff Var	Root MSE	LVEF Mean
0.287769	16.47325	10.23538	62.13333

Type I Model ANOVA


Source	DF	Type I SS	Mean Square	F Value	Pr > F
Group	2	1142.866667	571.433333	5.45	0.0102

Type III Model ANOVA


Source	DF	Type III SS	Mean Square	F Value	Pr > F
Group	2	1142.866667	571.433333	5.45	0.0102

Diagnostic Plots

Residual by Predicted

RStudent by Predicted

RStudent by Leverage

Q-Q Plot

LVEF by Predicted

Cook's D Plot

Needleplot of Cook's D statistic by Observation for LVEF

Residual Histogram

Histogram of Residuals for LVEF with normal and kernel densities overlaid

RF Plot

Box Plot

Means

Group

LVEF

Levene's HoV Test


Levene's Test for Homogeneity of LVEF Variance ANOVA of Squared Deviations from Group Means
Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Group	2	7625.0	3812.5	0.22	0.8069
Error	27	475849	17624.0

Group

LVEF

Welch's ANOVA


Welch's ANOVA for LVEF
Source	DF	F Value	Pr > F
Group	2.0000	4.71	0.0228
Error	17.8619

Group

Means


Level of Group	N	LVEF
Level of Group	N	Mean	Std Dev
Calcium	10	70.7000000	11.2155646
Lasix	10	59.3000000	9.0805041
Placebo	10	56.4000000	10.2977883

Least Squares Means

Adjustment for Multiple Comparisons: Tukey

Least Squares Means

Group

LVEF

LSMeans


Group	LVEF LSMEAN	LSMEAN Number
Calcium	70.7000000	1
Lasix	59.3000000	2
Placebo	56.4000000	3

Difference Matrix


Least Squares Means for effect Group Pr > \|t\| for H0: LSMean(i)=LSMean(j) Dependent Variable: LVEF
i/j	1	2	3
1		0.0488	0.0114
2	0.0488		0.8031
3	0.0114	0.8031

Group Mean Plot

Plot of LVEF least-squares means for Group.

Group Diffogram

Plot of all pairwise LVEF least-squares means differences for Group with Tukey adjustment at significance level 0.05.

The ANOVA table shows F-value of 5.45 and with a corresponding p-value of 0.0102.
The Q-Q Plot of Residuals for left ventricular ejection fraction (LVEF) shows deviations from the straight line, which led me to request Welch’s variance-weighted ANOVA. The Histogram for Residuals [Distribution of Residuals for LVEF] appears to be relatively normally distributed.
To confirm the assumption of homogeneity of variance / homoscedasticity, we require a p-value above the chosen significance level. The output table [Levene’s Test for Homogeneity of LVEF Variance / ANOVA of Squared Deviations from Group Means] shows a p-value of .8069, so do not reject the null hypothesis of equal variance at the .05 level. The output table [Welch’s ANOVA for LVEF] shows a p-value of .0228; note that Welch’s ANOVA tests the null hypothesis of equal group means (without assuming equal variances), not equal variances, so this result rejects equality of the means at the .05 level and agrees with the standard ANOVA (p = 0.0102).
The table [Least Squares Means / Adjustment for Multiple Comparisons: Tukey] shows p-values for all of the pairwise differences: Lasix vs. Placebo shows a p-value = 0.8031 (not significant at the .05 level), Calcium vs. Lasix shows a p-value = 0.0488 (significant at the .05 level), and Calcium vs. Placebo shows a p-value = 0.0114 (significant at the .05 level). This is confirmed by the plot Pairwise Comparison of Means [LVEF Comparisons for Group].

Chapter 9: N-Way ANOVA

Performing a Two-Way Analysis of Variance.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/*** Chapter 9: N-Way ANOVA ***/

/* Chapter 9: Performing a Two-Way Analysis of Variance - generated code - start */

/* Perform N-way ANOVA */

/* Run 1: full factorial model (Black, MomSmoke and their interaction), Type 3 sums of squares, Tukey multiple comparisons on the main effects at the 0.05 level, interaction plot and diagnostic plots. */
title1 "N-Way ANOVA: Birth Weight where Interaction Term is not Significant";
ods noproctitle;
ods graphics / imagemap=on;
proc glm
    data=work.Birth_Wt_Sample
    plots(only maxpoints=none)=(diagnostics(unpack) intplot);
  class Black MomSmoke;
  /* A|B is shorthand for the full factorial A B A*B */
  model Weight = Black|MomSmoke / ss3;
  lsmeans Black MomSmoke / adjust=tukey pdiff=all alpha=0.05 cl;
run;
quit;

/* The ANOVA table shows the F and p-values for the model as a whole, then the sum of squares, mean squares, F and p-values for each of the two factors and the interaction term.  Both of the main factors are highly significant (Black and MomSmoke) while the interaction term (Black*MomSmoke where p = 0.4842) is not significant at the .05 level. 
Checking the diagnostic plots to verify ANOVA assumptions, we see that the first plot (Residuals by Predicted for Weight) shows a few outliers on the low side and similar variance in each of the four categories.  The 7th plot/histogram of residuals (Distribution of Residuals for Weight) looks symmetric and close enough to a normal distribution.  
Babies born to black mothers are lighter than babies born to non-black mothers (difference of about 235.6 grams or about half a pound).  (Note that the 95% confidence limits do not include 0, consistent with the difference being significant at the .05 level.)  Babies born to mothers who smoke are approximately 263.4 grams or about half a pound lighter than babies born to mothers who do not smoke.  */

/* Run 2: full factorial model (MomSmoke, Married and their interaction), Type 3 sums of squares, Tukey multiple comparisons on ALL effects at the 0.05 level, interaction plot, LS-means plot and diagnostic plots. */
title1 "N-Way ANOVA: Birth Weight where Interaction Term is Significant";
ods noproctitle;
ods graphics / imagemap=on;
proc glm
    data=work.Birth_Wt_Sample
    plots(only maxpoints=none)=(diagnostics(unpack) intplot);
  class MomSmoke Married;
  /* A|B is shorthand for the full factorial A B A*B */
  model Weight = MomSmoke|Married / ss3;
  lsmeans MomSmoke Married MomSmoke*Married
    / adjust=tukey pdiff=all alpha=0.05 cl plots=(meanplot(cl));
run;
quit;

/* The ANOVA table shows the F and p-values for the model as a whole, then the sum of squares, mean squares, F and p-values for each of the two factors and the interaction term.  Both of the main factors are highly significant (MomSmoke and Married) and the interaction term (MomSmoke*Married where p = 0.0001) is also significant at the .05 level. 
In this model, babies born to mothers who smoke are approximately 221.8 grams lighter than babies born to mothers who do not smoke.  Babies born to married mothers are heavier by about 120.5 grams than babies born to unmarried mothers.  (Note that the 95% confidence limits [both of which are negative because the difference is computed as unmarried minus married] do not include 0, consistent with the difference being significant at the .05 level.)
Scrolling down to the interaction tables (Least Squares Means - Adjustment for Multiple Comparisons: Tukey-Kramer), we see that nonsmoking married mothers (MomSmoke=0, Married=1, LSMEAN number identifier 2) show the heaviest birth weight, while smoking unmarried mothers (MomSmoke=1, Married=0, LSMEAN number identifier 3) show the lightest birth weight.  "Looking at the last plot (LS-Means for MomSmoke*Married With 95% Confidence Limits), we see the means and confidence limits for all 4 combinations of the two factors; drawing straight lines between the means of (0 * 0) and (0 * 1), and (1 * 0) and (1 * 1) we see that there is some interaction between the two terms because there are different slopes (no interaction would show parallel lines drawn)."  */

title1;
/* Performing a Two-Way Analysis of Variance - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

N-Way ANOVA: Birth Weight where Interaction Term is not Significant

The GLM Procedure

Data

Class Levels


Class Level Information
Class	Levels	Values
Black	2	0 1
MomSmoke	2	0 1

Number of Observations


Number of Observations Read	12500
Number of Observations Used	12500

N-Way ANOVA: Birth Weight where Interaction Term is not Significant

Dependent Variable: Weight Infant Birth Weight

Analysis of Variance

Weight

Overall ANOVA


Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	3	201913196	67304399	224.76	<.0001
Error	12496	3741913496	299449
Corrected Total	12499	3943826691

Fit Statistics


R-Square	Coeff Var	Root MSE	Weight Mean
0.051197	16.22229	547.2192	3373.254

Type III Model ANOVA


Source	DF	Type III SS	Mean Square	F Value	Pr > F
Black	1	34754959.61	34754959.61	116.06	<.0001
MomSmoke	1	43453129.92	43453129.92	145.11	<.0001
Black*MomSmoke	1	146558.23	146558.23	0.49	0.4842

Diagnostic Plots

Residual by Predicted

RStudent by Predicted

RStudent by Leverage

Q-Q Plot

Weight by Predicted

Cook's D Plot

Residual Histogram

RF Plot

Interaction Plot

N-Way ANOVA: Birth Weight where Interaction Term is not Significant

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

Least Squares Means

Black

Weight

LSMeans


Black	Weight LSMEAN	H0:LSMean1=LSMean2
Black	Weight LSMEAN	Pr > \|t\|
0	3309.77078	<.0001
1	3074.16668

Confidence Intervals


Black	Weight LSMEAN	95% Confidence Limits
0	3309.770783	3294.603120	3324.938446
1	3074.166679	3034.072463	3114.260895

Pairwise Differences


Least Squares Means for Effect Black
i	j	Difference Between Means	Simultaneous 95% Confidence Limits for LSMean(i)-LSMean(j)
1	2	235.604104	192.736813	278.471395

N-Way ANOVA: Birth Weight where Interaction Term is not Significant

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

MomSmoke

Weight

LSMeans


MomSmoke	Weight LSMEAN	H0:LSMean1=LSMean2
MomSmoke	Weight LSMEAN	Pr > \|t\|
0	3323.68972	<.0001
1	3060.24774

Confidence Intervals


MomSmoke	Weight LSMEAN	95% Confidence Limits
0	3323.689724	3309.767505	3337.611942
1	3060.247739	3019.704231	3100.791246

Pairwise Differences


Least Squares Means for Effect MomSmoke
i	j	Difference Between Means	Simultaneous 95% Confidence Limits for LSMean(i)-LSMean(j)
1	2	263.441985	220.574694	306.309276

The ANOVA table shows the F and p-values for the model as a whole, then the sum of squares, mean squares, F and p-values for each of the two factors and the interaction term. Both of the main factors are highly significant (Black and MomSmoke) while the interaction term (Black*MomSmoke where p = 0.4842) is not significant at the .05 level.
Checking the diagnostic plots to verify ANOVA assumptions, we see that the first plot (Residuals by Predicted for Weight) shows few outliers on the low side and similar variance in each of the four categories. The 7th plot/histogram of residuals (Distribution of Residuals for Weight) looks symmetric and close enough to a normal distribution.
Babies born from black mothers are lighter than babies born from non-black mothers (difference of about 235.6 grams or about half a pound). (Note that the 95% confidence limits do not include 0 because the difference is significant at the .05 level.) Babies born to mothers who smoke are approximately 263.4 grams or about half a pound lighter than babies born to mothers who do not smoke.

SAS Output

Results: biostats_by_ex_SAScode.sas

N-Way ANOVA: Birth Weight where Interaction Term is Significant

The GLM Procedure

Data

Class Levels


Class Level Information
Class	Levels	Values
MomSmoke	2	0 1
Married	2	0 1

Number of Observations


Number of Observations Read	12500
Number of Observations Used	12500

N-Way ANOVA: Birth Weight where Interaction Term is Significant

Dependent Variable: Weight Infant Birth Weight

Analysis of Variance

Weight

Overall ANOVA


Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	3	167654386	55884795	184.93	<.0001
Error	12496	3776172306	302190
Corrected Total	12499	3943826691

Fit Statistics


R-Square	Coeff Var	Root MSE	Weight Mean
0.042511	16.29639	549.7186	3373.254

Type III Model ANOVA


Source	DF	Type III SS	Mean Square	F Value	Pr > F
MomSmoke	1	67738620.58	67738620.58	224.16	<.0001
Married	1	20004153.29	20004153.29	66.20	<.0001
MomSmoke*Married	1	4535264.42	4535264.42	15.01	0.0001

Diagnostic Plots

Residual by Predicted

RStudent by Predicted

RStudent by Leverage

Q-Q Plot

Weight by Predicted

Cook's D Plot

Residual Histogram

RF Plot

Interaction Plot

N-Way ANOVA: Birth Weight where Interaction Term is Significant

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

Least Squares Means

MomSmoke

Weight

LSMeans


MomSmoke	Weight LSMEAN	H0:LSMean1=LSMean2
MomSmoke	Weight LSMEAN	Pr > \|t\|
0	3365.31299	<.0001
1	3143.48832

Confidence Intervals


MomSmoke	Weight LSMEAN	95% Confidence Limits
0	3365.312990	3353.513146	3377.112834
1	3143.488320	3116.951841	3170.024800

Pairwise Differences


Least Squares Means for Effect MomSmoke
i	j	Difference Between Means	Simultaneous 95% Confidence Limits for LSMean(i)-LSMean(j)
1	2	221.824670	192.782953	250.866386

MomSmoke Mean Plot

N-Way ANOVA: Birth Weight where Interaction Term is Significant

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

Married

Weight

LSMeans


Married	Weight LSMEAN	H0:LSMean1=LSMean2
Married	Weight LSMEAN	Pr > \|t\|
0	3194.12780	<.0001
1	3314.67351

Confidence Intervals


Married	Weight LSMEAN	95% Confidence Limits
0	3194.127801	3172.707311	3215.548291
1	3314.673510	3295.062705	3334.284315

Pairwise Differences


Least Squares Means for Effect Married
i	j	Difference Between Means	Simultaneous 95% Confidence Limits for LSMean(i)-LSMean(j)
1	2	-120.545709	-149.587425	-91.503993

Married Mean Plot

N-Way ANOVA: Birth Weight where Interaction Term is Significant

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

MomSmoke*Married

Weight

LSMeans


MomSmoke	Married	Weight LSMEAN	LSMEAN Number
0	0	3276.34139	1
0	1	3454.28459	2
1	0	3111.91422	3
1	1	3175.06242	4

Difference Matrix


Least Squares Means for effect MomSmoke*Married Pr > \|t\| for H0: LSMean(i)=LSMean(j) Dependent Variable: Weight
i/j	1	2	3	4
1		<.0001	<.0001	<.0001
2	<.0001		<.0001	<.0001
3	<.0001	<.0001		0.0908
4	<.0001	<.0001	0.0908

Confidence Intervals


MomSmoke	Married	Weight LSMEAN	95% Confidence Limits
0	0	3276.341385	3256.032254	3296.650517
0	1	3454.284594	3442.264425	3466.304764
1	0	3111.914216	3074.193018	3149.635413
1	1	3175.062425	3137.728122	3212.396728

Pairwise Differences


Least Squares Means for Effect MomSmoke*Married
i	j	Difference Between Means	Simultaneous 95% Confidence Limits for LSMean(i)-LSMean(j)
1	2	-177.943209	-208.877642	-147.008776
1	3	164.427170	108.271283	220.583057
1	4	101.278960	45.569089	156.988832
2	3	342.370379	290.475786	394.264971
2	4	279.222169	227.810547	330.633792
3	4	-63.148209	-132.716157	6.419739

MomSmoke*Married Mean Plot

Plot of Weight least-squares means for MomSmoke*Married. With 95% confidence limits.

The ANOVA table shows the F and p-values for the model as a whole, then the sum of squares, mean squares, F and p-values for each of the two factors and the interaction term. Both of the main factors are highly significant (MomSmoke and Married) and the interaction term (MomSmoke*Married where p = 0.0001) is also significant at the .05 level.
In this model, babies born to mothers who smoke are approximately 221.8 grams lighter than babies born to mothers who do not smoke. Babies born to married mothers are heavier by about 120.5 grams than babies born to unmarried mothers. (Note that the 95% confidence limits [both of which are negative because the difference is computed as unmarried minus married] do not include 0, consistent with the difference being significant at the .05 level.)
Scrolling down to the interaction tables (Least Squares Means - Adjustment for Multiple Comparisons: Tukey-Kramer), we see that nonsmoking married mothers (MomSmoke=0, Married=1, LSMEAN number identifier 2) show the heaviest birth weight, while smoking unmarried mothers (MomSmoke=1, Married=0, LSMEAN number identifier 3) show the lightest birth weight. “Looking at the last plot (LS-Means for MomSmoke * Married With 95% Confidence Limits), we see the means and confidence limits for all 4 combinations of the two factors; drawing straight lines between the means of (0 * 0) and (0 * 1), and (1 * 0) and (1 * 1) we see that there is some interaction between the two terms because there are different slopes (no interaction would show parallel lines drawn).”

Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Practice: Two-Way Analysis of Variance - Heart / Weight - start 9.2 */
/* Part 1: full factorial model (Sex, Chol_Status and their interaction), Type 3 sums of squares, Tukey multiple comparisons on the main effects at the 0.05 level; all plots suppressed. */
title1 "Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 1";
ods noproctitle;
ods graphics / imagemap=on;
proc glm data=biostats.heart plots=none;
    class Sex Chol_Status;
    model Weight=Sex Chol_Status Sex*Chol_Status / ss3;
    lsmeans Sex Chol_Status / adjust=tukey pdiff=all alpha=0.05 cl;
quit;
title1 "Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 2";
/* Part 2: collapse the Borderline and High categories of Chol_Status into Border-High and rerun */
data work.heart2;
  set biostats.heart;
  /* Declare the length explicitly (11 = length of "Border-High") so it does   */
  /* not depend on which assignment the compiler happens to see first.         */
  length Chol_Status2 $ 11;
  if Chol_Status in ("Borderline", "High") then Chol_Status2 = "Border-High";
  else Chol_Status2 = Chol_Status;  /* "Desirable" and missing values pass through unchanged */
  run;
proc glm data=work.heart2 plots=none;
    class Sex Chol_Status2;
    model Weight=Sex Chol_Status2 Sex*Chol_Status2 / ss3;
    lsmeans Sex Chol_Status2 / adjust=tukey pdiff=all alpha=0.05 cl;
quit;

/* Running a two-way ANOVA using Weight as the dependent variable and Sex and Chol_Status (cholesterol status) as factors shows a significant p-value for the model as a whole.  Both of the main factors are highly significant, but the interaction term (Sex*Chol_Status where p = 0.5409) is not significant at the .05 level.   
In this model, females weigh about 26.3 pounds less than males.  Cholesterol Status is not as straightforward: the Least Squares Means for effect Chol_Status table shows that the differences between Borderline and Desirable, and between High and Desirable, are highly significant (p <.0001), while the difference between Borderline and High is much smaller (about 2.3 pounds) but still significant at the .05 level (p = 0.0204).
We revised the data by collapsing the Borderline and High categories of Chol_Status into Border-High, and reran the ANOVA.  In this rerun with collapsed categories, both of the main factors remain highly significant with the interaction term (Sex*Chol_Status2 where p = 0.9869) not showing as significant at the .05 level.  Females weigh about 26.2 pounds less than males, and those with Chol_Status of Border-High weigh about 5.6 pounds more than those with Desirable.  */

title1;
/* Practice: Two-Way Analysis of Variance - Heart / Weight - end 9.2 */

SAS Output

Results: biostats_by_ex_SAScode.sas

Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 1

The GLM Procedure

Data

Class Levels


Class Level Information
Class	Levels	Values
Sex	2	Female Male
Chol_Status	3	Borderline Desirable High

Number of Observations


Number of Observations Read	5209
Number of Observations Used	5051

Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 1

Dependent Variable: Weight

Analysis of Variance

Weight

Overall ANOVA


Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	5	905101.176	181020.235	275.54	<.0001
Error	5045	3314360.848	656.960
Corrected Total	5050	4219462.024

Fit Statistics


R-Square	Coeff Var	Root MSE	Weight Mean
0.214506	16.74489	25.63122	153.0689

Type III Model ANOVA


Source	DF	Type III SS	Mean Square	F Value	Pr > F
Sex	1	845072.8456	845072.8456	1286.34	<.0001
Chol_Status	2	35120.9760	17560.4880	26.73	<.0001
Sex*Chol_Status	2	807.6324	403.8162	0.61	0.5409

Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 1

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

Least Squares Means

Sex

Weight

LSMeans


Sex	Weight LSMEAN	H0:LSMean1=LSMean2
Sex	Weight LSMEAN	Pr > \|t\|
Female	140.905421	<.0001
Male	167.155864

Confidence Intervals


Sex	Weight LSMEAN	95% Confidence Limits
Female	140.905421	139.946138	141.864705
Male	167.155864	166.088806	168.222923

Pairwise Differences


Least Squares Means for Effect Sex
i	j	Difference Between Means	Simultaneous 95% Confidence Limits for LSMean(i)-LSMean(j)
1	2	-26.250443	-27.685309	-24.815577

Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 1

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

Chol_Status

Weight

LSMeans


Chol_Status	Weight LSMEAN	LSMEAN Number
Borderline	154.742853	1
Desirable	150.324762	2
High	157.024313	3

Difference Matrix


Least Squares Means for effect Chol_Status Pr > \|t\| for H0: LSMean(i)=LSMean(j) Dependent Variable: Weight
i/j	1	2	3
1		<.0001	0.0204
2	<.0001		<.0001
3	0.0204	<.0001

Confidence Intervals


Chol_Status	Weight LSMEAN	95% Confidence Limits
Borderline	154.742853	153.577181	155.908525
Desirable	150.324762	148.968989	151.680536
High	157.024313	155.826200	158.222426

Pairwise Differences


Least Squares Means for Effect Chol_Status
i	j	Difference Between Means	Simultaneous 95% Confidence Limits for LSMean(i)-LSMean(j)
1	2	4.418091	2.279913	6.556268
1	3	-2.281460	-4.280458	-0.282462
2	3	-6.699551	-8.863220	-4.535881

Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 2

The GLM Procedure

Data

Class Levels


Class Level Information
Class	Levels	Values
Sex	2	Female Male
Chol_Status2	2	Border-High Desirable

Number of Observations


Number of Observations Read	5209
Number of Observations Used	5051

Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 2

Dependent Variable: Weight

Analysis of Variance

Weight

Overall ANOVA


Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	3	899243.582	299747.861	455.64	<.0001
Error	5047	3320218.442	657.860
Corrected Total	5050	4219462.024

Fit Statistics


R-Square	Coeff Var	Root MSE	Weight Mean
0.213118	16.75636	25.64878	153.0689

Type III Model ANOVA


Source	DF	Type III SS	Mean Square	F Value	Pr > F
Sex	1	683157.0169	683157.0169	1038.45	<.0001
Chol_Status2	1	30738.5140	30738.5140	46.73	<.0001
Sex*Chol_Status2	1	0.1763	0.1763	0.00	0.9869

Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 2

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

Least Squares Means

Sex

Weight

LSMeans


Sex	Weight LSMEAN	H0:LSMean1=LSMean2
Sex	Weight LSMEAN	Pr > \|t\|
Female	140.008609	<.0001
Male	166.195720

Confidence Intervals


Sex	Weight LSMEAN	95% Confidence Limits
Female	140.008609	138.955753	141.061464
Male	166.195720	165.000103	167.391337

Pairwise Differences


Least Squares Means for Effect Sex
i	j	Difference Between Means	Simultaneous 95% Confidence Limits for LSMean(i)-LSMean(j)
1	2	-26.187112	-27.780224	-24.594000

Practice: Two-Way ANOVA: Weight by Gender or Cholesterol Status Part 2

Least Squares Means

Adjustment for Multiple Comparisons: Tukey-Kramer

Chol_Status2

Weight

LSMeans


Chol_Status2	Weight LSMEAN	H0:LSMean1=LSMean2
Chol_Status2	Weight LSMEAN	Pr > \|t\|
Border-High	155.879567	<.0001
Desirable	150.324762

Confidence Intervals


Chol_Status2	Weight LSMEAN	95% Confidence Limits
Border-High	155.879567	155.044482	156.714651
Desirable	150.324762	148.968061	151.681464

Pairwise Differences


Least Squares Means for Effect Chol_Status2
i	j	Difference Between Means	Simultaneous 95% Confidence Limits for LSMean(i)-LSMean(j)
1	2	5.554804	3.961692	7.147916

Running a two-way ANOVA using Weight as the dependent variable and Sex and Chol_Status (cholesterol status) as factors shows significant p-value for the model as a whole. Both of the main factors are highly significant, but the interaction term (Sex*Chol_Status where p = 0.5409) is not significant at the .05 level.
In this model, females weigh about 26.3 pounds less than males. Cholesterol Status is not as straightforward: the Least Squares Means for effect Chol_Status table shows that the differences between Borderline and Desirable, and between High and Desirable, are highly significant (p <.0001), while the difference between Borderline and High is much smaller (about 2.3 pounds) but still significant at the .05 level (p = 0.0204).
We revised the data by collapsing the Borderline and High categories of Chol_Status into Border-High, and reran the ANOVA. In this rerun with collapsed categories, both of the main factors remain highly significant with the interaction term (Sex*Chol_Status2 where p = 0.9869) not showing as significant at the .05 level. Females weigh about 26.2 pounds less than males, and those with Chol_Status of Border-High weigh about 5.6 pounds more than those with Desirable.

Chapter 10: Correlation

Correlation and Scatter Plot Matrices.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/*** Chapter 10: Statistics Correlation ***/
/* Correlation and Scatter Plot Matrices - generated code - start */

/* Pearson and Spearman correlation matrices for five exercise variables, with */
/* the simple descriptive statistics suppressed (NOSIMPLE) and a scatter plot  */
/* matrix that shows histograms on its diagonal.                               */
title1 "Pearson Correlation and Scatter Plot Matrices, w/ Nonparametric Spearman";
ods noproctitle;
ods graphics / imagemap=on;
proc corr
    data=work.exercise
    pearson
    spearman
    nosimple
    plots(maxpoints=none)=matrix(histogram);
  var Age Pushups Rest_Pulse Max_Pulse Run_Pulse;
run;

/* The Pearson correlation coefficient measures the strength of the linear relationship between two variables.  The p-value in the correlation tables is the probability that you would obtain a correlation with an absolute value as large as or larger than the one you obtained by chance alone, given that the true population correlation between your two variables is actually 0. 
Spearman correlation is one of the most popular nonparametric alternatives to a Pearson correlation. The Spearman method substitutes ranks for the two variables and then computes a correlation on the ranks.  When there are outliers on your scatter plot, you may want to consider computing Spearman correlations. 
The relationship between Max_Pulse and Rest_Pulse is pretty strong at 0.83112, with a significant p-value of <.0001.  The inverse relationship between Age and Pushups is moderate at -0.49191, with a significant p-value = 0.0003.  */ 
title1;
/* Correlation and Scatter Plot Matrices - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Pearson Correlation and Scatter Plot Matrices, w/ Nonparametric Spearman

The Corr Procedure

Variables Information


5 Variables:	Age Pushups Rest_Pulse Max_Pulse Run_Pulse

Pearson Correlations


Pearson Correlation Coefficients, N = 50 Prob > \|r\| under H0: Rho=0
	Age	Pushups	Rest_Pulse	Max_Pulse	Run_Pulse
Age Age	1.00000	-0.49191 0.0003	0.48774 0.0003	0.26582 0.0621	0.25097 0.0788
Pushups Pushups	-0.49191 0.0003	1.00000	-0.49639 0.0002	-0.45010 0.0010	-0.34555 0.0140
Rest_Pulse Rest_Pulse	0.48774 0.0003	-0.49639 0.0002	1.00000	0.83112 <.0001	0.76139 <.0001
Max_Pulse Max_Pulse	0.26582 0.0621	-0.45010 0.0010	0.83112 <.0001	1.00000	0.93634 <.0001
Run_Pulse Run_Pulse	0.25097 0.0788	-0.34555 0.0140	0.76139 <.0001	0.93634 <.0001	1.00000

Spearman Correlations


Spearman Correlation Coefficients, N = 50 Prob > \|r\| under H0: Rho=0
	Age	Pushups	Rest_Pulse	Max_Pulse	Run_Pulse
Age Age	1.00000	-0.46625 0.0006	0.48269 0.0004	0.25725 0.0713	0.25123 0.0784
Pushups Pushups	-0.46625 0.0006	1.00000	-0.48882 0.0003	-0.45181 0.0010	-0.33960 0.0158
Rest_Pulse Rest_Pulse	0.48269 0.0004	-0.48882 0.0003	1.00000	0.80310 <.0001	0.70668 <.0001
Max_Pulse Max_Pulse	0.25725 0.0713	-0.45181 0.0010	0.80310 <.0001	1.00000	0.91251 <.0001
Run_Pulse Run_Pulse	0.25123 0.0784	-0.33960 0.0158	0.70668 <.0001	0.91251 <.0001	1.00000

Scatter Plot Matrix

Pearson correlation coefficient measures the strength of the relationship between two variables. Generate a matrix of scatter plots and include histograms on the diagonal of the matrix. The p-value in the correlation tables is the probability that you would obtain a correlation with an absolute value as large as or larger than the one you obtained by chance alone, given that the true population correlation between your two variables is actually 0.
Spearman correlation is one of the most popular nonparametric alternatives to a Pearson correlation. The Spearman method substitutes ranks for the two variables and then computes a correlation on the ranks. When there are outliers on your scatter plot, you may want to consider computing Spearman correlations.
The relationship between Max_Pulse and Rest_Pulse is pretty strong at 0.83112, with a significant p-value of <.0001. The inverse relationship between Age and Pushups is moderate at -0.49191, with a significant p-value of 0.0003.

Practice: Correlation and Scatter Plot Matrices - Height / Weight / Cholesterol

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Practice: Correlation and Scatter Plot Matrices - Heart / Weight - start 10.2 */
title1 "Practice: Correlation and Scatter Plot Matrices - Heart / Weight";

/* Pearson and Spearman correlations, with a scatter plot matrix that has
   histograms on the diagonal, for three variables of biostats.heart. */
proc corr data=biostats.heart
          pearson spearman
          nosimple
          plots(maxpoints=none)=matrix(histogram);
  var Height Weight Cholesterol;
run;

/*
  Interpretation note:
  We see some relationship between Height and Weight, but not much for
  Height and Cholesterol, or Weight and Cholesterol.
*/
title1;
/* Practice: Correlation and Scatter Plot Matrices - Heart / Weight - end 10.2 */

SAS Output

Results: biostats_by_ex_SAScode.sas

Practice: Correlation and Scatter Plot Matrices - Heart / Weight

The Corr Procedure

Variables Information


3 Variables:	Height Weight Cholesterol

Pearson Correlations


Pearson Correlation Coefficients Prob > \|r\| under H0: Rho=0 Number of Observations
	Height	Weight	Cholesterol
Height	1.00000 5203	0.51739 <.0001 5199	-0.07959 <.0001 5051
Weight	0.51739 <.0001 5199	1.00000 5203	0.07243 <.0001 5051
Cholesterol	-0.07959 <.0001 5051	0.07243 <.0001 5051	1.00000 5057

Spearman Correlations


Spearman Correlation Coefficients Prob > \|r\| under H0: Rho=0 Number of Observations
	Height	Weight	Cholesterol
Height	1.00000 5203	0.52969 <.0001 5199	-0.07054 <.0001 5051
Weight	0.52969 <.0001 5199	1.00000 5203	0.09730 <.0001 5051
Cholesterol	-0.07054 <.0001 5051	0.09730 <.0001 5051	1.00000 5057

Scatter Plot Matrix

We see some relationship between Height and Weight, but not much for Height and Cholesterol, or Weight and Cholesterol.

Chapter 11: Simple and Multiple Regression

Simple Linear Regression.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/*** Chapter 11: Simple and Multiple Regression ***/

/* Simple Linear Regression - generated code - start */
title1 "Simple Linear Regression";
ods noproctitle;
ods graphics / imagemap=on;

/* Regress Run_Pulse on Rest_Pulse.  Only the listed plots are requested;
   the diagnostics and residual panels are unpacked into individual graphs. */
proc reg data=work.exercise
         alpha=0.05
         plots(only)=(
           diagnostics(unpack)
           residuals(unpack)
           fitplot
           observedbypredicted
         );
  model Run_Pulse = Rest_Pulse;
run;
quit;

/*
  Interpretation notes:

  "The mean square due to the model is much larger than the mean square due
  to error, yielding a very large F-value and a low p-value.  The mean of the
  dependent variable Run_Pulse is 112.92, and the adjusted R-square is .5710.
  (The addition of independent variables in a model causes the value of
  R-square to increase even if independent variables are only randomly
  correlated with the dependent variable; Adj R-Sq adjusts for the number of
  independent variables in the model and is a way to compare models with
  different numbers of independent variables.)"

  The model is: Run_Pulse = 63.15163 + (.73297 * Rest_Pulse).

  "The Residual by Predicted for Run_Pulse plot shows points that seem mostly
  random.  The spread of the points around 0 does not tend to increase or
  decrease with different values of Run_Pulse."

  "The Cook's D for Run_Pulse plot shows the Cook's D value for each
  observation, where larger values indicate the more influential points."

  "The Q-Q Plot of Residuals for Run_Pulse shows the distribution of the
  residuals, where the points fall very closely around the straight line."

  "The shaded portion of the Fit plot for Run_Pulse graph represents the 95%
  confidence limit for the prediction of Run_Pulse for any given value of
  Rest_Pulse.  The other, wider confidence limits marked with dashed lines are
  for individual data points.  Given a value of Rest_Pulse, you are 95%
  confident that a random data point will be within these limits."
*/

title1;
/* Simple Linear Regression - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Simple Linear Regression

Model: MODEL1

Dependent Variable: Run_Pulse Run_Pulse

The Reg Procedure

MODEL1

Fit

Run_Pulse

Number of Observations


Number of Observations Read	50
Number of Observations Used	50

Analysis of Variance


Analysis of Variance
Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	1	2484.46062	2484.46062	66.21	<.0001
Error	48	1801.21938	37.52540
Corrected Total	49	4285.68000

Fit Statistics


Root MSE	6.12580	R-Square	0.5797
Dependent Mean	112.92000	Adj R-Sq	0.5710
Coeff Var	5.42490

Parameter Estimates


Parameter Estimates
Variable	Label	DF	Parameter Estimate	Standard Error	t Value	Pr > \|t\|
Intercept	Intercept	1	63.15163	6.17751	10.22	<.0001
Rest_Pulse	Rest_Pulse	1	0.73297	0.09008	8.14	<.0001

Simple Linear Regression

Model: MODEL1

Dependent Variable: Run_Pulse Run_Pulse

Observation-wise Statistics

Run_Pulse

Diagnostic Plots

Residual Histogram

Histogram of residuals for Run_Pulse with normal and kernel densities overlaid.

Residual by Predicted

Scatter plot of residuals by predicted values for Run_Pulse.

RStudent by Predicted

Observed by Predicted

Cook's D

RStudent by Leverage

Q-Q Plot

RF Plot

Residual-Fit spread plot for Run_Pulse. This plot consists of two side-by-side plots that show the spread in the fitted values about their mean and the spread in the residuals respectively.

Residual Plots

Rest_Pulse

Fit Plot

Scatterplot of Run_Pulse by Rest_Pulse overlaid with the fit line, a 95% confidence band and lower and upper 95% prediction limits.

“The mean square due to the model is much larger than the mean square due to error, yielding a very large F-value and a low p-value. The mean of the dependent variable Run_Pulse is 112.92, and the adjusted R-square is .5710. (The addition of independent variables in a model causes the value of R-square to increase even if independent variables are only randomly correlated with the dependent variable; Adj R-Sq adjusts for the number of independent variables in the model and is a way to compare models with different numbers of independent variables.)”
The model is: Run_Pulse = 63.15163 + (.73297 * Rest_Pulse).
“The Residual by Predicted for Run_Pulse plot shows points that seem mostly random. The spread of the points around 0 does not tend to increase or decrease with different values of Run_Pulse.”
“The Cook’s D for Run_Pulse plot shows the Cook’s D value for each observation, where larger values indicate the more influential points.”
“The Q-Q Plot of Residuals for Run_Pulse shows the distribution of the residuals, where the points fall very closely around the straight line.”
“The shaded portion of the Fit plot for Run_Pulse graph represents the 95% confidence limit for the prediction of Run_Pulse for any given value of Rest_Pulse. The other, wider confidence limits marked with dashed lines are for individual data points. Given a value of Rest_Pulse, you are 95% confident that a random data point will be within these limits.”

Multiple Regression.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Multiple Regression - generated code - start */

ods noproctitle;
ods graphics / imagemap=on;

/* Step 1: correlation matrix of the candidate predictors and the response. */
title1 "Multiple Regression - Correlation Matrix";
proc corr data=work.exercise
          pearson spearman
          nosimple
          plots(maxpoints=none)=matrix(histogram nvar=10);
  var Age Pushups Rest_Pulse Max_Pulse Run_Pulse;
run;

/*
  Strongest correlation is between Max_Pulse and Run_Pulse at 0.93634 with a
  p-value of <.0001.  The relationships between Age and Max_Pulse or Age and
  Run_Pulse are not statistically significant at the 0.05 level.

  Nonparametric Spearman correlation matrix also provided.

  "There are some strong correlations among the variables, and the condition
  where the predictor variables are highly correlated is called
  multi-collinearity.  This causes serious problems when these variables are
  all used in multiple regression."
*/

/* Step 2: full model with all four predictors, requesting VIF. */
title1 "Multiple Regression - Linear Regression, part 1";
proc reg data=work.exercise
         alpha=0.05
         plots(only label)=(diagnostics residuals observedbypredicted);
  model Run_Pulse = Age Pushups Rest_Pulse Max_Pulse / vif;
run;
quit;

/*
  This initial Linear Regression model's R-square .8869 and adjusted R-square
  .8768 appear highly significant, but once we look at the parameter
  estimates we see that only Max_Pulse looks significant.  A negative
  parameter estimate for Rest_Pulse also looks off.

  "VIF is a popular diagnostic test for multi-collinearity, and large values
  of VIF indicate multi-collinearity problems."

  Rest_Pulse has the highest value for Variance Inflation (VIF) at 4.30537,
  so we drop this parameter and rerun the regression.
*/

/* Step 3: same model without Rest_Pulse. */
title1 "Multiple Regression - Linear Regression, part 2";
proc reg data=work.exercise
         alpha=0.05
         plots(only label)=(diagnostics residuals observedbypredicted);
  model Run_Pulse = Age Pushups Max_Pulse / vif;
run;
quit;

/*
  After Rest_Pulse is removed, the Variance Inflation (VIF) values for the
  revised Linear Regression model's parameter estimates are lower.
*/

title1;
/* Multiple Regression - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Multiple Regression - Correlation Matrix

The Corr Procedure

Variables Information


5 Variables:	Age Pushups Rest_Pulse Max_Pulse Run_Pulse

Pearson Correlations


Pearson Correlation Coefficients, N = 50 Prob > \|r\| under H0: Rho=0
	Age	Pushups	Rest_Pulse	Max_Pulse	Run_Pulse
Age Age	1.00000	-0.49191 0.0003	0.48774 0.0003	0.26582 0.0621	0.25097 0.0788
Pushups Pushups	-0.49191 0.0003	1.00000	-0.49639 0.0002	-0.45010 0.0010	-0.34555 0.0140
Rest_Pulse Rest_Pulse	0.48774 0.0003	-0.49639 0.0002	1.00000	0.83112 <.0001	0.76139 <.0001
Max_Pulse Max_Pulse	0.26582 0.0621	-0.45010 0.0010	0.83112 <.0001	1.00000	0.93634 <.0001
Run_Pulse Run_Pulse	0.25097 0.0788	-0.34555 0.0140	0.76139 <.0001	0.93634 <.0001	1.00000

Spearman Correlations


Spearman Correlation Coefficients, N = 50 Prob > \|r\| under H0: Rho=0
	Age	Pushups	Rest_Pulse	Max_Pulse	Run_Pulse
Age Age	1.00000	-0.46625 0.0006	0.48269 0.0004	0.25725 0.0713	0.25123 0.0784
Pushups Pushups	-0.46625 0.0006	1.00000	-0.48882 0.0003	-0.45181 0.0010	-0.33960 0.0158
Rest_Pulse Rest_Pulse	0.48269 0.0004	-0.48882 0.0003	1.00000	0.80310 <.0001	0.70668 <.0001
Max_Pulse Max_Pulse	0.25725 0.0713	-0.45181 0.0010	0.80310 <.0001	1.00000	0.91251 <.0001
Run_Pulse Run_Pulse	0.25123 0.0784	-0.33960 0.0158	0.70668 <.0001	0.91251 <.0001	1.00000

Scatter Plot Matrix

Multiple Regression - Linear Regression, part 1

Model: MODEL1

Dependent Variable: Run_Pulse Run_Pulse

The Reg Procedure

MODEL1

Fit

Run_Pulse

Number of Observations


Number of Observations Read	50
Number of Observations Used	50

Analysis of Variance


Analysis of Variance
Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	4	3800.80979	950.20245	88.19	<.0001
Error	45	484.87021	10.77489
Corrected Total	49	4285.68000

Fit Statistics


Root MSE	3.28251	R-Square	0.8869
Dependent Mean	112.92000	Adj R-Sq	0.8768
Coeff Var	2.90694

Parameter Estimates


Parameter Estimates
Variable	Label	DF	Parameter Estimate	Standard Error	t Value	Pr > \|t\|	Variance Inflation
Intercept	Intercept	1	-7.80929	8.04006	-0.97	0.3366	0
Age	Age	1	0.03955	0.03738	1.06	0.2956	1.65524
Pushups	Pushups	1	0.08573	0.04576	1.87	0.0675	1.54495
Rest_Pulse	Rest_Pulse	1	-0.06462	0.10016	-0.65	0.5221	4.30537
Max_Pulse	Max_Pulse	1	1.04744	0.09786	10.70	<.0001	3.65824

Multiple Regression - Linear Regression, part 1

Model: MODEL1

Dependent Variable: Run_Pulse Run_Pulse

Observation-wise Statistics

Run_Pulse

Diagnostic Plots

Observed by Predicted

Fit Diagnostics

Residual Plots

Panel 1

Panel of scatterplots of residuals by regressors for Run_Pulse.

Multiple Regression - Linear Regression, part 2

Model: MODEL1

Dependent Variable: Run_Pulse Run_Pulse

The Reg Procedure

MODEL1

Fit

Run_Pulse

Number of Observations


Number of Observations Read	50
Number of Observations Used	50

Analysis of Variance


Analysis of Variance
Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	3	3796.32397	1265.44132	118.95	<.0001
Error	46	489.35603	10.63817
Corrected Total	49	4285.68000

Fit Statistics


Root MSE	3.26162	R-Square	0.8858
Dependent Mean	112.92000	Adj R-Sq	0.8784
Coeff Var	2.88844

Parameter Estimates


Parameter Estimates
Variable	Label	DF	Parameter Estimate	Standard Error	t Value	Pr > \|t\|	Variance Inflation
Intercept	Intercept	1	-5.76862	7.34483	-0.79	0.4362	0
Age	Age	1	0.02876	0.03321	0.87	0.3910	1.32353
Pushups	Pushups	1	0.08691	0.04544	1.91	0.0620	1.54251
Max_Pulse	Max_Pulse	1	0.99630	0.05702	17.47	<.0001	1.25817

Multiple Regression - Linear Regression, part 2

Model: MODEL1

Dependent Variable: Run_Pulse Run_Pulse

Observation-wise Statistics

Run_Pulse

Diagnostic Plots

Observed by Predicted

Fit Diagnostics

Residual Plots

Panel 1

This initial Linear Regression model’s R-square (.8869) and adjusted R-square (.8768) are high and the overall model is highly significant, but once we look at the parameter estimates we see that only Max_Pulse looks significant. A negative parameter estimate for Rest_Pulse also looks off.
“VIF is a popular diagnostic test for multi-collinearity, and large values of VIF indicate multi-collinearity problems.”
Rest_Pulse has the highest value for Variance Inflation (VIF) at 4.30537, so we drop this parameter and rerun the regression.
After Rest_Pulse is removed, the Variance Inflation (VIF) values for the revised Linear Regression model’s parameter estimates are lower.

Stepwise Multiple Regression.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Stepwise Multiple Regression - generated code - start */

/* Create 25 pct sample without replacement using random seed 13579 */
title1 "Multiple Regression - Create 25% Sample of Birth Weights Data";
proc surveyselect data=biostats.bweight
                  out=work.Birth_Wt_Sample
                  method=srs
                  samprate=0.25
                  seed=13579;
run;

/* Stepwise selection on the sample, choosing effects by R-square.  The
   design matrix (with the input variables added) is written to
   work.reg_design for the follow-up PROC REG step. */
title1 "Multiple Regression - Stepwise Selection using R-Square";
proc glmselect data=work.Birth_Wt_Sample
               outdesign(addinputvars)=work.reg_design
               plots=(criterionpanel);
  class Black MomSmoke / param=glm;
  model Weight = MomAge MomWtGain Black MomSmoke
        / showpvalues
          selection=stepwise (select=rsquare);
run;

/* Fit the selected model on the design data set to obtain VIF values and
   diagnostic plots.  &_GLSMOD is not defined in this program; presumably it
   is the selected-effects macro variable left by PROC GLMSELECT - confirm. */
proc reg data=work.reg_design
         alpha=0.05
         plots(only maxpoints=none)=(diagnostics residuals observedbypredicted);
  where Black is not missing and MomSmoke is not missing;
  ods select ParameterEstimates
             DiagnosticsPanel
             ResidualPlot
             ObservedByPredicted;
  model Weight = &_GLSMOD / vif;
run;
quit;

/* Remove the temporary design data set. */
proc delete data=work.reg_design;
run;

/*
  Interpretation notes:

  "Using the 25% random sample of birth weights from the SASHELP data set
  Bweight, we demonstrate stepwise multiple regression using the stepwise
  selection method specifying add/remove effect with R-square."

  "The dependent variable is Weight; use the two variables Black (1=black,
  0=non-black) and MomSmoke (1=yes, 0=no) as classification variables and the
  two variables MomAge (mother's age) and MomWtGain (mom's weight gain) as
  continuous variables."

  "The stepwise selection summary shows which variables were entered into the
  models as well as the order of entry.  All four variables were entered into
  the model.  The last column, labeled Model R-Square, shows the model
  R-square as each variable is entered into the model."

  "AIC (Akaike's information criteria) and a modified version AICC are
  displayed in the top two graphs.  The stars on the plots indicate that the
  four-variable model is the "best" model as defined by each criterion.  SBC
  (Schwarz Bayesian Information Criterion) is displayed on the bottom left,
  and the adjusted R-square is displayed on the bottom right.  Although all
  four criteria selected the same model, this is not always the case."

  "The overall p-value for this model is low; but with 12,500 observations in
  this 25% random sample, even small effects can be significant.  Looking at
  the parameter estimates along with the t and p-values, we see only two
  levels of the classification variables.  The estimate for Black 0
  (non-black) is about 218, which means that after all the other variables
  are adjusted for, babies of a non-black mother are 218 grams heavier than
  babies where the mother is black.  Because smoking is a risk factor for low
  birth weight babies, you see that if a mother does not smoke (MomSmoke = 0),
  the babies are almost 253 grams heavier than if the mother smokes."
*/

title1;
/* Stepwise Multiple Regression - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Multiple Regression - Create 25% Sample of Birth Weights Data

The Surveyselect Procedure

Sample Selection Method


Selection Method	Simple Random Sampling

Sample Selection Summary


Input Data Set	BWEIGHT
Random Number Seed	13579
Sampling Rate	0.25
Sample Size	12500
Selection Probability	0.25
Sampling Weight	4
Output Data Set	BIRTH_WT_SAMPLE

Multiple Regression - Stepwise Selection using R-Square

The GLMSelect Procedure

Model Information


Data Set	WORK.BIRTH_WT_SAMPLE
Dependent Variable	Weight
Selection Method	Stepwise
Select Criterion	R-Square
Stop Criterion	R-Square
Effect Hierarchy Enforced	None

Number of Observations


Number of Observations Read	12500
Number of Observations Used	12500

Class Level Information


Class Level Information
Class	Levels	Values
Black	2	0 1
MomSmoke	2	0 1

Dimensions


Dimensions
Number of Effects	5
Number of Parameters	7

Multiple Regression - Stepwise Selection using R-Square

Model Building Summary

Stepwise Selection Summary


Stepwise Selection Summary
Step	Effect Entered	Effect Removed	Number Effects In	Number Parms In	Model R-Square
0	Intercept		1	1	0.0000
1	MomWtGain		2	2	0.0423
2	MomSmoke		3	3	0.0657
3	Black		4	4	0.0882
4	MomAge		5	5	0.0946

Stop Reason


Selection stopped because all effects are in the final model.

Criterion Panel

Panel showing how selection criteria change with the effect sequence.

Selected Model

Multiple Regression - Stepwise Selection using R-Square

Selected Model

The selected model is the model at the last step (Step 4).

Selected Effects


Effects:	Intercept MomAge MomWtGain Black MomSmoke

Note:The p-values for parameters and effects are not adjusted for the fact that the terms in the model have been selected and so are generally liberal.

ANOVA


Analysis of Variance
Source	DF	Sum of Squares	Mean Square	F Value	Pr > F
Model	4	373063503	93265876	326.36	<.0001
Error	12495	3570763188	285775
Corrected Total	12499	3943826691

Fit Statistics


Root MSE	534.57962
Dependent Mean	3373.25448
R-Square	0.0946
Adj R-Sq	0.0943
AIC	169544
AICC	169544
SBC	157079

Parameter Estimates


Parameter Estimates
Parameter	DF	Estimate	Standard Error	t Value	Pr > \|t\|
Intercept	1	2961.598722	17.629377	167.99	<.0001
MomAge	1	7.963514	0.847390	9.40	<.0001
MomWtGain	1	8.677708	0.374621	23.16	<.0001
Black 0	1	218.282338	13.219116	16.51	<.0001
Black 1	0	0	.	.	.
MomSmoke 0	1	252.807537	14.202584	17.80	<.0001
MomSmoke 1	0	0	.	.	.

Multiple Regression - Stepwise Selection using R-Square

Model: MODEL1

Dependent Variable: Weight Infant Birth Weight

The Reg Procedure

MODEL1

Fit

Weight

Parameter Estimates


Parameter Estimates
Variable	Label	DF	Parameter Estimate	Standard Error	t Value	Pr > \|t\|	Variance Inflation
Intercept	Intercept	B	2961.59872	17.62938	167.99	<.0001	0
MomAge	MomAge	1	7.96351	0.84739	9.40	<.0001	1.02152
MomWtGain	MomWtGain	1	8.67771	0.37462	23.16	<.0001	1.00806
Black 0	Black 0	B	218.28234	13.21912	16.51	<.0001	1.01602
Black 1	Black 1	0	0	.	.	.	.
MomSmoke 0	MomSmoke 0	B	252.80754	14.20258	17.80	<.0001	1.01039
MomSmoke 1	MomSmoke 1	0	0	.	.	.	.

Multiple Regression - Stepwise Selection using R-Square

Model: MODEL1

Dependent Variable: Weight Infant Birth Weight

Observation-wise Statistics

Weight

Diagnostic Plots

Observed by Predicted

Fit Diagnostics

Residual Plots

Panel 1

Panel of scatterplots of residuals by regressors for Weight.

“Using the 25% random sample of birth weights from the SASHELP data set Bweight, we demonstrate stepwise multiple regression using the stepwise selection method specifying add/remove effect with R-square.”
“The dependent variable is Weight; use the two variables Black (1=black, 0=non-black) and MomSmoke (1=yes, 0=no) as classification variables and the two variables MomAge (mother’s age) and MomWtGain (mom’s weight gain) as continuous variables.”
“The stepwise selection summary shows which variables were entered into the models as well as the order of entry. All four variables were entered into the model. The last column, labeled Model R-Square, shows the model R-square as each variable is entered into the model.”
“AIC (Akaike’s information criteria) and a modified version AICC are displayed in the top two graphs. The stars on the plots indicate that the four-variable model is the “best” model as defined by each criterion. SBC (Schwarz Bayesian Information Criterion) is displayed on the bottom left, and the adjusted R-square is displayed on the bottom right. Although all four criteria selected the same model, this is not always the case.”
“The overall p-value for this model is low; but with 12,500 observations in this 25% random sample, even small effects can be significant. Looking at the parameter estimates along with the t and p-values, we see only two levels of the classification variables. The estimate for Black 0 (non-black) is about 218, which means that after all the other variables are adjusted for, babies of a non-black mother are 218 grams heavier than babies where the mother is black. Because smoking is a risk factor for low birth weight babies, you see that if a mother does not smoke (MomSmoke = 0), the babies are almost 253 grams heavier than if the mother smokes.”

Chapter 12: Binary Logistic Regression

Logistic Regression.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Logistic Regression - generated code - start */
title1 "Binary Logistic Regression - Reference Coding";
ods noproctitle;
ods graphics / imagemap=on;

/* Model the probability that Wt_Group = 1 from two continuous and two
   classification predictors, using reference coding for the class variables. */
proc logistic data=WORK.HIGH_LOW;
  class Black MomSmoke / param=ref;
  model Wt_Group(event='1') = MomAge MomWtGain Black MomSmoke
        / link=logit
          technique=fisher;
run;

/* specify reference levels by using values in quotes:
     class Black(ref='0') MomSmoke(ref='0') / param=ref;
   which impacts the Odds Ratio Estimates:
     Black 1 vs 0 = 2.038, MomSmoke 1 vs 0 = 2.295
*/

/*
  Interpretation notes:

  "The model fit statistics section is useful when comparing models.  Smaller
  values of AIC (Akaike's information criteria) indicate better models.  The
  criterion labeled SC (Schwarz Criterion) is based on the value of AIC but
  adjusts for the number of variables entered into the model.  As with AIC,
  smaller values of SC  indicate better models.  The Schwarz Criterion is
  probably better to use than the AIC if you want a parsimonious model (fewer
  predictor variables)."

  "The global test of the null hypothesis shows all three tests of the null
  hypothesis reject it with very low p-values."

  "The table of p-values for each of the predictor variables (both
  classification variables and continuous variables) shows that all variables
  are significant."

  "The odds ratios for the two classification variables show that:
   - for Black (0 versus 1), you see a point estimate of .491.  (Because you
     didn't select a reference level for the two classification variables,
     the task selected the higher value (1) as the reference level.)  You
     conclude that a person who is non-black (0) is less likely to have a
     baby whose weight is below the median value. If you take the reciprocal
     of this value (1/.491 or about 2.037), you could say that based on this
     model, that the odds of a black mother having a baby whose weight is
     below the median is 2.037 times higher than for a non-black mother.  The
     95% confidence limits indicate that you are 95% confident that your
     estimate of the odds ratio is between those two limits.  Because both
     classification variables were significant, these limits do not include 1
     (meaning that the odds are equal for each outcome).
   - for MomSmoke (0 versus 1), you see a point estimate of .436, which means
     that non-smoking moms are less likely to have babies with birth weights
     below the median.  Based on this model, that the odds of a smoking
     mother having a baby whose weight is below the median is [1/.436 or]
     2.294 times higher than for a non-smoking mother."

  "The odds ratios for the continuous variables show the odds for each year
  (for MomAge) or each pound (for MomWtGain)."
*/
title1 ;
/* Logistic Regression - generated code - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Binary Logistic Regression - Reference Coding

The Logistic Procedure

Model Information


Model Information
Data Set	WORK.HIGH_LOW
Response Variable	Wt_Group
Number of Response Levels	2
Model	binary logit
Optimization Technique	Fisher's scoring

Observations Summary


Number of Observations Read	12500
Number of Observations Used	12500

Response Profile


Response Profile
Ordered Value	Wt_Group	Total Frequency
1	0	6325
2	1	6175

Probability modeled is Wt_Group=1.

Class Level Information


Class Level Information
Class	Value	Design Variables
Black	0	1
	1	0
MomSmoke	0	1
	1	0

Convergence Status


Model Convergence Status
Convergence criterion (GCONV=1E-8) satisfied.

Fit Statistics


Model Fit Statistics
Criterion	Intercept Only	Intercept and Covariates
AIC	17328.879	16490.398
SC	17336.313	16527.565
-2 Log L	17326.879	16480.398

Global Tests


Testing Global Null Hypothesis: BETA=0
Test	Chi-Square	DF	Pr > ChiSq
Likelihood Ratio	846.4819	4	<.0001
Score	816.2735	4	<.0001
Wald	761.0124	4	<.0001

Type 3 Tests


Type 3 Analysis of Effects
Effect	DF	Wald Chi-Square	Pr > ChiSq
MomAge	1	83.0882	<.0001
MomWtGain	1	291.4380	<.0001
Black	1	183.9687	<.0001
MomSmoke	1	210.9599	<.0001

Parameter Estimates


Analysis of Maximum Likelihood Estimates
Parameter		DF	Estimate	Standard Error	Wald Chi-Square	Pr > ChiSq
Intercept		1	1.3296	0.0718	342.7422	<.0001
MomAge		1	-0.0300	0.00329	83.0882	<.0001
MomWtGain		1	-0.0256	0.00150	291.4380	<.0001
Black	0	1	-0.7119	0.0525	183.9687	<.0001
MomSmoke	0	1	-0.8308	0.0572	210.9599	<.0001

Odds Ratios


Odds Ratio Estimates
Effect	Point Estimate	95% Wald Confidence Limits
MomAge	0.970	0.964	0.977
MomWtGain	0.975	0.972	0.978
Black 0 vs 1	0.491	0.443	0.544
MomSmoke 0 vs 1	0.436	0.389	0.487

Association Statistics


Association of Predicted Probabilities and Observed Responses
Percent Concordant	64.7	Somers' D	0.296
Percent Discordant	35.2	Gamma	0.296
Percent Tied	0.1	Tau-a	0.148
Pairs	39056875	c	0.648

“The model fit statistics section is useful when comparing models. Smaller values of AIC (Akaike’s information criteria) indicate better models. The criterion labeled SC (Schwarz Criterion) is based on the value of AIC but adjusts for the number of variables entered into the model. As with AIC, smaller values of SC indicate better models. The Schwarz Criterion is probably better to use than the AIC if you want a parsimonious model (fewer predictor variables).”
“The global test of the null hypothesis shows all three tests of the null hypothesis reject it with very low p-values.”
“The table of p-values for each of the predictor variables (both classification variables and continuous variables) shows that all variables are significant.”
“The odds ratios for the two classification variables show that:
– for Black (0 versus 1), you see a point estimate of .491. (Because you didn’t select a reference level for the two classification variables, the task selected the higher value (1) as the reference level.) You conclude that a person who is non-black (0) is less likely to have a baby whose weight is below the median value. If you take the reciprocal of this value (1/.491 or about 2.037), you could say that based on this model, that the odds of a black mother having a baby whose weight is below the median is 2.037 times higher than for a non-black mother. The 95% confidence limits indicate that you are 95% confident that your estimate of the odds ratio is between those two limits. Because both classification variables were significant, these limits do not include 1 (meaning that the odds are equal for each outcome).
– for MomSmoke (0 versus 1), you see a point estimate of .436, which means that non-smoking moms are less likely to have babies with birth weights below the median. Based on this model, that the odds of a smoking mother having a baby whose weight is below the median is [1/.436 or] 2.294 times higher than for a non-smoking mother.”
“The odds ratios for the continuous variables show the odds for each year (for MomAge) or each pound (for MomWtGain).”

Chapter 13: Analyzing Categorical Data

Two-Way Tables for Categorical Data.

SAS Code, click to expand

SAS Code:

#  (&path1 and biostats libname previously defined)
/* Two-Way Tables for Heart_Attack - start */
ods noproctitle;
title1 "Two-Way Tables for Categorical Data - Heart_Attack";

/* Cross-tabulate each of Gender, Age_Group and High_Chol against
   Heart_Attack, requesting chi-square tests, Fisher's exact test and
   relative risk estimates; plots are turned off. */
proc freq data=WORK.HEART_ATTACK order=formatted;
  tables (Gender Age_Group High_Chol) * Heart_Attack
         / chisq
           relrisk
           fisher
           nopercent
           nocum
           plots=none;
run;

/*
  Interpretation notes:

  - "Looking at the output, the first section is the Table of Gender by
    Heart_Attack where each box has 3 values: frequency count, row percentage,
    and column percentage.  There are 41 subjects who are Male and had a Heart
    Attack.  These represent 16.40% of Males (row), and 70.69% of those who
    had a Heart Attack (column). Row and Column total frequencies are also
    displayed."

  - "The next section is the Statistics for Table of Gender by Heart_Attack
    which shows various measures with very low p-values."

  - "The Odds Ratio and Relative Risks section shows the risk of having a
    Heart Attack if you are Male.  If you conducted a case-control study, you
    would have used the Odds Ratio.  Since this is a cohort study, we use the
    Relative Risk.  A person is 2.4118 times more likely to suffer a heart
    attack if he is Male.  The table also shows the confidence limits for the
    odds ratio and relative risks; note that these intervals do not include 1.
    An odds ratio or relative risk of 1 would mean that a Male person was not
    at a higher or lower risk of having a heart attack.  You expect this
    because of the significant chi-square value."

  - "When Analyzing Tables with Low Expected Values (e.g., 2-by-2 tables with
    expected values less than 5) a popular alternative to chi-square is
    Fisher's Exact test.   There are statisticians who prefer chi-square with
    a correction for continuity instead, and we have both statistics (see
    Statistics for Table of Gender by Heart_Attack):
    - Continuity Adj. Chi-Square value = 10.3175, p = .0013;
    - Fisher's Exact test - Two-sided Pr <= P = 0.0012."
*/

title1;
/* Two-Way Tables for Heart_Attack - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Two-Way Tables for Categorical Data - Heart_Attack

The Freq Procedure

Table Gender * Heart_Attack

Cross-Tabular Freq Table

Frequency Row Pct Col Pct


Table of Gender by Heart_Attack
Gender(Gender)	Heart_Attack(Heart_Attack)
Gender(Gender)	1:Yes	2:No	Total
1:Male	41 16.40 70.69	209 83.60 47.29	250
2:Female	17 6.80 29.31	233 93.20 52.71	250
Total	58	442	500

Statistics for Table of Gender by Heart_Attack

Chi-Square Tests


Statistic	DF	Value	Prob
Chi-Square	1	11.2342	0.0008
Likelihood Ratio Chi-Square	1	11.5397	0.0007
Continuity Adj. Chi-Square	1	10.3175	0.0013
Mantel-Haenszel Chi-Square	1	11.2117	0.0008
Phi Coefficient		0.1499
Contingency Coefficient		0.1482
Cramer's V		0.1499

Fisher's Exact Test


Fisher's Exact Test
Cell (1,1) Frequency (F)	41
Left-sided Pr <= F	0.9998
Right-sided Pr >= F	0.0006

Table Probability (P)	0.0004
Two-sided Pr <= P	0.0012

Relative Risk Estimates


Odds Ratio and Relative Risks
Statistic	Value	95% Confidence Limits
Odds Ratio	2.6887	1.4824	4.8768
Relative Risk (Column 1)	2.4118	1.4089	4.1284
Relative Risk (Column 2)	0.8970	0.8411	0.9566

Sample Size = 500

Table Age_Group * Heart_Attack

Cross-Tabular Freq Table

Frequency Row Pct Col Pct


Table of Age_Group by Heart_Attack
Age_Group(Age_Group)	Heart_Attack(Heart_Attack)
Age_Group(Age_Group)	1:Yes	2:No	Total
60-70	23 12.78 39.66	157 87.22 35.52	180
71+	30 17.24 51.72	144 82.76 32.58	174
< 60	5 3.42 8.62	141 96.58 31.90	146
Total	58	442	500

Statistics for Table of Age_Group by Heart_Attack

Chi-Square Tests


Statistic	DF	Value	Prob
Chi-Square	2	15.1597	0.0005
Likelihood Ratio Chi-Square	2	17.7666	0.0001
Mantel-Haenszel Chi-Square	1	14.4611	0.0001
Phi Coefficient		0.1741
Contingency Coefficient		0.1715
Cramer's V		0.1741

Fisher's Exact Test


Fisher's Exact Test
Table Probability (P)	<.0001
Pr <= P	0.0002

Sample Size = 500

Table High_Chol * Heart_Attack

Cross-Tabular Freq Table

Frequency Row Pct Col Pct


Table of High_Chol by Heart_Attack
High_Chol(High_Chol)	Heart_Attack(Heart_Attack)
High_Chol(High_Chol)	1:Yes	2:No	Total
1:Yes	44 17.25 75.86	211 82.75 47.74	255
2:No	14 5.71 24.14	231 94.29 52.26	245
Total	58	442	500

Statistics for Table of High_Chol by Heart_Attack

Chi-Square Tests


Statistic	DF	Value	Prob
Chi-Square	1	16.2287	<.0001
Likelihood Ratio Chi-Square	1	17.0012	<.0001
Continuity Adj. Chi-Square	1	15.1228	0.0001
Mantel-Haenszel Chi-Square	1	16.1963	<.0001
Phi Coefficient		0.1802
Contingency Coefficient		0.1773
Cramer's V		0.1802

Fisher's Exact Test


Fisher's Exact Test
Cell (1,1) Frequency (F)	44
Left-sided Pr <= F	1.0000
Right-sided Pr >= F	<.0001

Table Probability (P)	<.0001
Two-sided Pr <= P	<.0001

Relative Risk Estimates


Odds Ratio and Relative Risks
Statistic	Value	95% Confidence Limits
Odds Ratio	3.4408	1.8331	6.4585
Relative Risk (Column 1)	3.0196	1.6987	5.3678
Relative Risk (Column 2)	0.8776	0.8232	0.9356

Sample Size = 500

“Looking at the output, the first section is the Table of Gender by Heart_Attack where each box has 3 values: frequency count, row percentage, and column percentage. There are 41 subjects who are Male and had a Heart Attack. These represent 16.40% of Males (row), and 70.69% of those who had a Heart Attack (column). Row and Column total frequencies are also displayed.”
“The next section is the Statistics for Table of Gender by Heart_Attack which shows various measures with very low p-values.”
“The Odds Ratio and Relative Risks section shows the risk of having a Heart Attack if you are Male. If you conducted a case-control study, you would have used the Odds Ratio. Since this is a cohort study, we use the Relative Risk. A person is 2.4118 times more likely to suffer a heart attack if he is Male. The table also shows the confidence limits for the odds ratio and relative risks; note that these intervals do not include 1. An odds ratio or relative risk of 1 would mean that a Male person was not at a higher or lower risk of having a heart attack. You expect this because of the significant chi-square value.”
“When Analyzing Tables with Low Expected Values (e.g., 2-by-2 tables with expected values less than 5) a popular alternative to chi-square is Fisher’s Exact test. There are statisticians who prefer chi-square with a correction for continuity instead, and we have both statistics (see Statistics for Table of Gender by Heart_Attack):
Continuity Adj. Chi-Square value = 10.3175, p = .0013;
Fisher’s Exact test - Two-sided Pr <= P = 0.0012.”

Chapter 14: Computing Power and Sample Size

“Just about any study will require detailed power and sample size calculations. The power of a study is the probability that the study will result in a statistically significant finding if the drug or treatment that you are studying is different (hopefully better) than either a placebo or an alternate drug or treatment. For many studies conducted at research labs or universities, powers of 80% or 90% are typical. Very large-scale studies may strive for a power of 95%. Small exploratory studies may be satisfied with powers closer to 70%. The bottom line is that it is unethical and wasteful to begin a study with low power. You will have a low probability of demonstrating the superiority of your drug or intervention, and this negative result may dissuade others from investigating the same drug or intervention when it may actually be beneficial.”
“Depending on the type of study (comparing means or comparing proportions, for example), there is a set of questions that need to be answered before you can determine the number of subjects you will need for a particular study.”

Computing Sample Size for a t Test.

SAS Code, click to expand

SAS Code:

/* (&path1 and biostats libname previously defined) */
/* Computing Sample Size for a t Test - start */
/* Solves for the sample size per group (npergroup=.) of a two-sided two-sample
   t test, for two mean scenarios (130 vs 120, 130 vs 125), a standard deviation
   of 10 and nominal powers of 0.8 and 0.9; the PLOT statement puts power on the x axis. */
title1 "Computing Sample Size for a t Test";
ods noproctitle;
ods graphics / imagemap=on;
proc power;
    twosamplemeans test=diff sides=2 groupmeans=(130 120) (130 125) stddev=10 
        power=0.8 0.9 alpha=0.05 npergroup=.;
    plot x=power;
run;
/*
"Assume we will conduct a simple study to compare blood pressure in a group of borderline hypertensive subjects.  You want to see if a low dose of a beta blocker will reduce blood pressure.  Because these subjects are borderline hypertensive and the trial will be relatively short, you decide that it is ethical to use a placebo as your control.  What information do you need to decide how many subjects you need to recruit in order to have a power of 80%?  We assume equal variances in the two groups, and select a two-sided test with equal (pooled) variances. 
We enter the expected mean for each group to compute sample sizes; one for means of 130 versus 120 (a 10-point difference) and another for means of 130 versus 125 (a 5-point difference).  We provide an estimate of 10 for standard deviation, and entered the desired power(s): .8 and .9. 
  
  - Sample Size per Group in Table Form: Notice the large sample sizes necessary to detect a small difference of 5 points.  The lowest sample size per group (n per group = 17) is for the largest difference (130 versus 120) and the lowest power (80%).   
  - Plot of Power versus Sample Size: The graph of sample size by power will contain a line for each combination of means, standard deviations, and powers that you entered.  The final decision of sample size is sometimes a compromise between how many subjects you can recruit (and pay for) and how large a difference you would like to be able to detect."  
*/
/* reset the title so it does not carry over to later steps */
title1;
/* Computing Sample Size for a t Test - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Computing Sample Size for a t Test

Two-Sample t Test for Mean Difference

The Power Procedure

Two-Sample t Test for Mean Difference 1

Fixed Scenario Elements


Fixed Scenario Elements
Distribution	Normal
Method	Exact
Number of Sides	2
Alpha	0.05
Standard Deviation	10
Null Difference	0

Output


Computed N per Group
Index	Mean1	Mean2	Nominal Power	Actual Power	N per Group
1	130	120	0.8	0.807	17
2	130	120	0.9	0.912	23
3	130	125	0.8	0.801	64
4	130	125	0.9	0.903	86

Computing Sample Size for a t Test

Two-Sample t Test for Mean Difference

Plot Statement 1

N per Group vs. Power

“Assume we will conduct a simple study to compare blood pressure in a group of borderline hypertensive subjects. You want to see if a low dose of a beta blocker will reduce blood pressure. Because these subjects are borderline hypertensive and the trial will be relatively short, you decide that it is ethical to use a placebo as your control. What information do you need to decide how many subjects you need to recruit in order to have a power of 80%? We assume equal variances in the two groups, and select a two-sided test with equal (pooled) variances. We enter the expected mean for each group to compute sample sizes; one for means of 130 versus 120 (a 10-point difference) and another for means of 130 versus 125 (a 5-point difference). We provide an estimate of 10 for standard deviation, and entered the desired power(s): .8 and .9.
Sample Size per Group in Table Form: Notice the large sample sizes necessary to detect a small difference of 5 points. The lowest sample size per group (n per group = 17) is for the largest difference (130 versus 120) and the lowest power (80%).
Plot of Power versus Sample Size: The graph of sample size by power will contain a line for each combination of means, standard deviations, and powers that you entered. The final decision of sample size is sometimes a compromise between how many subjects you can recruit (and pay for) and how large a difference you would like to be able to detect.”

Calculating the Sample Size for a Test of Proportions.

SAS Code, click to expand

SAS Code:

/* (&path1 and biostats libname previously defined) */
/* Calculating the Sample Size for a Test of Proportions - start */
/* Solves for the sample size per group (npergroup=.) of a two-sided Pearson
   chi-square test, for two proportion scenarios (.7 vs .8, .7 vs .9) and
   nominal powers of 0.8, 0.85 and 0.9; the PLOT statement puts power on the x axis. */
title1 "Calculating the Sample Size for a Test of Proportions";
ods noproctitle;
ods graphics / imagemap=on;
proc power;
    twosamplefreq test=pchi sides=2 groupproportions=(0.7 0.8) (0.7 0.9) power=0.8 
        0.85 0.9 alpha=0.05 npergroup=.;
    plot x=power;
run;
/*
"Information needed to compute sample sizes for test of proportions:  
  - Conduct a one-sided or two-sided test? (Usually two-sided);  
  - Alpha level? (Usually ⍺ = .05);  
  - Proportion in the first group (usually a control group)?  If unsure, lean toward .5 (maximum variance);  
  - How large a difference in proportions do you want to be able to detect? (Or the proportion in group two);   
  - You decide to compute sample sizes for two different scenarios: one with proportions of .7 and .8, the other with proportions of .7 and .9;  
  - What power do you want? (You often enter several values such as .8, .85, and .9);  
  - The statistical test that you plan to use in the analysis: for most studies with fairly large n's, the Pearson chi-square test is a good choice; if you believe that you will have small expected values in the study, you may choose Fisher's exact test.  
  
  - Sample Size per Group in Table Form: If you look at the N per Group for comparing proportions of .7 and .8 with a power of 90%, you see that you need 392 subjects per group.  The smallest number of subjects per group (62) is for proportions of .7 versus .9 with 80% power.
  - Plot of Power versus Sample Size: When you compare proportions, you may have to lower your expectations of detecting small differences and design the study with larger differences in the two proportions and, perhaps, slightly lower power."  
*/
/* reset the title so it does not carry over to later steps */
title1;
/* Calculating the Sample Size for a Test of Proportions - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Calculating the Sample Size for a Test of Proportions

Pearson Chi-square Test for Proportion Difference

The Power Procedure

Pearson Chi-square Test for Proportion Difference 1

Fixed Scenario Elements


Fixed Scenario Elements
Distribution	Asymptotic normal
Method	Normal approximation
Number of Sides	2
Alpha	0.05
Null Proportion Difference	0

Output


Computed N per Group
Index	Proportion1	Proportion2	Nominal Power	Actual Power	N per Group
1	0.7	0.8	0.80	0.801	294
2	0.7	0.8	0.85	0.851	336
3	0.7	0.8	0.90	0.900	392
4	0.7	0.9	0.80	0.803	62
5	0.7	0.9	0.85	0.854	71
6	0.7	0.9	0.90	0.900	82

Calculating the Sample Size for a Test of Proportions

Pearson Chi-square Test for Proportion Difference

Plot Statement 1

N per Group vs. Power

“Information needed to compute sample sizes for test of proportions:
– Conduct a one-sided or two-sided test? (Usually two-sided);
– Alpha level? (Usually ⍺ = .05);
– Proportion in the first group (usually a control group)? If unsure, lean toward .5 (maximum variance);
– How large a difference in proportions do you want to be able to detect? (Or the proportion in group two);
– You decide to compute sample sizes for two different scenarios: one with proportions of .7 and .8, the other with proportions of .7 and .9;
– What power do you want? (You often enter several values such as .8, .85, and .9);
– The statistical test that you plan to use in the analysis: for most studies with fairly large n’s, the Pearson chi-square test is a good choice; if you believe that you will have small expected values in the study, you may choose Fisher’s exact test.
Sample Size per Group in Table Form: If you look at the N per Group for comparing proportions of .7 and .8 with a power of 90%, you see that you need 392 subjects per group. The smallest number of subjects per group (62) is for proportions of .7 versus .9 with 80% power.
Plot of Power versus Sample Size: When you compare proportions, you may have to lower your expectations of detecting small differences and design the study with larger differences in the two proportions and, perhaps, slightly lower power.”

Computing Sample Size for a One-Way ANOVA Design.

SAS Code, click to expand

SAS Code:

/* (&path1 and biostats libname previously defined) */
/* Computing Sample Size for a One-Way ANOVA Design - start */
/* Solves for the sample size per group (npergroup = .) of a one-way ANOVA with
   group means 20, 25 and 30, standard deviations of 8 and 10, and nominal
   powers of .80 and .90; the plot shows power from .70 to .90 on the x axis. */
title1 "Computing Sample Size for a One-Way ANOVA Design";
proc power;
   onewayanova
   groupmeans = 20 | 25 | 30 
   stddev = 8 10 
   power = .80 .90
   npergroup = .;
   plot x = power min = .70 max = .90;
run; 
/*
  - "3 means estimated to be 20, 25, and 30;  
  - Two estimates for standard deviation: 8 and 10;   
  - You want to compute sample size for powers of 80% and 90%;  
  - You want to compute the n-per-group (as compared to power for a given sample size);  
  - You would like a plot of Power (x-axis) versus sample size and the axes scaled to show powers from .7 to .9;  
  - You enter the group means following the key word GROUPMEANS, separating them using a vertical bar (also called a pipe symbol);  
  - Follow the keyword STDDEV with one or more estimates of the pooled standard deviation;  
  - You have a choice of computing power or sample size:  
   -- if you want to compute sample size, enter one or more values for your desired power, and enter a SAS numeric missing value (coded as a period) for the number of subjects per group;
   -- if you want to compute power for a given sample size, enter a period for the power and one or more values of sample size. 
  - Use a PLOT statement to indicate that you want power on the x axis, with values of power ranging from .7 to .9."  
*/
/* reset the title so it does not carry over to later steps */
title1;
/* Computing Sample Size for a One-Way ANOVA Design - end */

SAS Output

Results: biostats_by_ex_SAScode.sas

Computing Sample Size for a One-Way ANOVA Design

Overall F Test for One-Way ANOVA

The Power Procedure

Overall F Test for One-Way ANOVA 1

Fixed Scenario Elements


Fixed Scenario Elements
Method	Exact
Group Means	20 25 30
Alpha	0.05

Output


Computed N per Group
Index	Std Dev	Nominal Power	Actual Power	N per Group
1	8	0.8	0.820	14
2	8	0.9	0.913	18
3	10	0.8	0.815	21
4	10	0.9	0.908	27

Computing Sample Size for a One-Way ANOVA Design

Overall F Test for One-Way ANOVA

Plot Statement 1

N per Group vs. Power

“3 means estimated to be 20, 25, and 30;
Two estimates for standard deviation: 8 and 10;
You want to compute sample size for powers of 80% and 90%;
You want to compute the n-per-group (as compared to power for a given sample size);
You would like a plot of Power (x-axis) versus sample size and the axes scaled to show powers from .7 to .9;
You enter the group means following the key word GROUPMEANS, separating them using a vertical bar (also called a pipe symbol);
Follow the keyword STDDEV with one or more estimates of the pooled standard deviation;
You have a choice of computing power or sample size:
– if you want to compute sample size, enter one or more values for your desired power, and enter a SAS numeric missing value (coded as a period) for the number of subjects per group;
– if you want to compute power for a given sample size, enter a period for the power and one or more values of sample size.
Use a PLOT statement to indicate that you want power on the x axis, with values of power ranging from .7 to .9.”

Now, let’s use R!

Chapters 1 - 4: Data Import Basics

Read in built-in datasets, import from text and CSV files, work with various delimiters and header profiles. For example, the heart dataset was exported from the built-in SAS library into a physical SAS dataset, which is then imported by `R` using the `haven` package. Another dataset Bweight contains birth weights for 50,000 babies, along with several variables believed to be related to birth weight, such as race (coded as black=1 or not black=0), mother’s smoking status (smoking=1 or non-smoking=0), and marital status (not married=0 or married=1).

R Code:

# Alternative for a dataset written with validvarname=v6: the mainframe-style
# names (upper case, at most 8 characters) are mapped back to the full names.
# heart <- haven::read_sas("heart.sas7bdat") %>%
#   rename(
#     Status         = STATUS,
#     DeathCause     = DEATHCAU,  # Cause of Death
#     AgeCHDdiag     = AGECHDDI,  # Age CHD Diagnosed
#     Sex            = SEX,
#     AgeAtStart     = AGEATSTA,  # Age at Start
#     Height         = HEIGHT,
#     Weight         = WEIGHT,
#     Diastolic      = DIASTOLI,
#     Systolic       = SYSTOLIC,
#     MRW            = MRW,       # Metropolitan Relative Weight
#     Smoking        = SMOKING,
#     AgeAtDeath     = AGEATDEA,  # Age at Death
#     Cholesterol    = CHOLESTE,
#     Chol_Status    = CHOL_STA,  # Cholesterol Status
#     BP_Status      = BP_STATU,  # Blood Pressure Status
#     Weight_Status  = WEIGHT_S,  # Weight Status
#     Smoking_Status = SMOKING_
#   )
# glimpse(heart)

# Dataset written with validvarname=v9: mixed-case names longer than
# 8 characters come through unchanged, so no renaming is needed.
heart <- haven::read_sas("heart_v9.sas7bdat")
heart %>% glimpse()

## Rows: 5,209
## Columns: 17
## $ Status         <chr> "Dead", "Dead", "Alive", "Alive", "Alive", "Alive", "Al~
## $ DeathCause     <chr> "Other", "Cancer", "", "", "", "", "", "Other", "", "Ce~
## $ AgeCHDdiag     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 57, 55, 79,~
## $ Sex            <chr> "Female", "Female", "Female", "Female", "Male", "Female~
## $ AgeAtStart     <dbl> 29, 41, 57, 39, 42, 58, 36, 53, 35, 52, 39, 33, 33, 57,~
## $ Height         <dbl> 62.50, 59.75, 62.25, 65.75, 66.00, 61.75, 64.75, 65.50,~
## $ Weight         <dbl> 140, 194, 132, 158, 156, 131, 136, 130, 194, 129, 179, ~
## $ Diastolic      <dbl> 78, 92, 90, 80, 76, 92, 80, 80, 68, 78, 76, 68, 90, 76,~
## $ Systolic       <dbl> 124, 144, 170, 128, 110, 176, 112, 114, 132, 124, 128, ~
## $ MRW            <dbl> 121, 183, 114, 123, 116, 117, 110, 99, 124, 106, 133, 1~
## $ Smoking        <dbl> 0, 0, 10, 0, 20, 0, 15, 0, 0, 5, 30, 0, 0, 15, 30, 10, ~
## $ AgeAtDeath     <dbl> 55, 57, NA, NA, NA, NA, NA, 77, NA, 82, NA, NA, NA, NA,~
## $ Cholesterol    <dbl> NA, 181, 250, 242, 281, 196, 196, 276, 211, 284, 225, 2~
## $ Chol_Status    <chr> "", "Desirable", "High", "High", "High", "Desirable", "~
## $ BP_Status      <chr> "Normal", "High", "High", "Normal", "Optimal", "High", ~
## $ Weight_Status  <chr> "Overweight", "Overweight", "Overweight", "Overweight",~
## $ Smoking_Status <chr> "Non-smoker", "Non-smoker", "Moderate (6-15)", "Non-smo~

# Import the fish dataset that was copied out of the SASHELP library.
fish <- haven::read_sas("fish.sas7bdat")
# Inspect the object that was just read (this call previously re-displayed `heart`).
# NOTE(review): the rendered output that follows in these notes was captured
# from glimpse(heart); re-knit the document to refresh it.
glimpse(fish)

## Rows: 5,209
## Columns: 17
## $ Status         <chr> "Dead", "Dead", "Alive", "Alive", "Alive", "Alive", "Al~
## $ DeathCause     <chr> "Other", "Cancer", "", "", "", "", "", "Other", "", "Ce~
## $ AgeCHDdiag     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 57, 55, 79,~
## $ Sex            <chr> "Female", "Female", "Female", "Female", "Male", "Female~
## $ AgeAtStart     <dbl> 29, 41, 57, 39, 42, 58, 36, 53, 35, 52, 39, 33, 33, 57,~
## $ Height         <dbl> 62.50, 59.75, 62.25, 65.75, 66.00, 61.75, 64.75, 65.50,~
## $ Weight         <dbl> 140, 194, 132, 158, 156, 131, 136, 130, 194, 129, 179, ~
## $ Diastolic      <dbl> 78, 92, 90, 80, 76, 92, 80, 80, 68, 78, 76, 68, 90, 76,~
## $ Systolic       <dbl> 124, 144, 170, 128, 110, 176, 112, 114, 132, 124, 128, ~
## $ MRW            <dbl> 121, 183, 114, 123, 116, 117, 110, 99, 124, 106, 133, 1~
## $ Smoking        <dbl> 0, 0, 10, 0, 20, 0, 15, 0, 0, 5, 30, 0, 0, 15, 30, 10, ~
## $ AgeAtDeath     <dbl> 55, 57, NA, NA, NA, NA, NA, 77, NA, 82, NA, NA, NA, NA,~
## $ Cholesterol    <dbl> NA, 181, 250, 242, 281, 196, 196, 276, 211, 284, 225, 2~
## $ Chol_Status    <chr> "", "Desirable", "High", "High", "High", "Desirable", "~
## $ BP_Status      <chr> "Normal", "High", "High", "Normal", "Optimal", "High", ~
## $ Weight_Status  <chr> "Overweight", "Overweight", "Overweight", "Overweight",~
## $ Smoking_Status <chr> "Non-smoker", "Non-smoker", "Moderate (6-15)", "Non-smo~

# Import the birth-weight dataset and show its structure.
bweight <- haven::read_sas("bweight.sas7bdat")
bweight %>% glimpse()

## Rows: 50,000
## Columns: 10
## $ Weight     <dbl> 4111, 3997, 3572, 1956, 3515, 3757, 2977, 3884, 3629, 3062,~
## $ Black      <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,~
## $ Married    <dbl> 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,~
## $ Boy        <dbl> 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,~
## $ MomAge     <dbl> -3, 1, 0, -1, -6, 3, -5, -5, 6, -1, -2, -6, 0, 1, 1, 7, -4,~
## $ MomSmoke   <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,~
## $ CigsPerDay <dbl> 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 4, 0, 0, 10, 0, 0, 0, 0, 0, 0~
## $ MomWtGain  <dbl> -16, 2, -3, -5, -20, 0, 5, 0, -5, 6, 22, -1, 7, -6, 10, 15,~
## $ Visit      <dbl> 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,~
## $ MomEdLevel <dbl> 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,~

# Check that local files can be read.
# Excel workbook import.
glimpse(read_excel("Grades.xlsx"))

## Rows: 3
## Columns: 8
## $ Name    <chr> "Jones", "Hildebrand", "O'Brien"
## $ ID      <dbl> 12345, 22222, 33333
## $ Quiz1   <dbl> 88, 95, 76
## $ Quiz2   <dbl> 80, 92, 78
## $ Midterm <dbl> 76, 91, 79
## $ Quiz3   <dbl> 88, 94, 81
## $ Quiz4   <dbl> 90, 90, 83
## $ Final   <dbl> 82, 96, 80

# Excel workbook with odd field names: the non-syntactic ones are shown backticked.
glimpse(read_excel("Grades2.xlsx"))

## Rows: 3
## Columns: 8
## $ `Stuent Name` <chr> "Jones", "Hildebrand", "O'Brien"
## $ ID            <dbl> 12345, 22222, 33333
## $ `Quiz 1`      <dbl> 88, 95, 76
## $ `Quiz 2`      <dbl> 80, 92, 78
## $ `Mid Term`    <dbl> 76, 91, 79
## $ `Quiz 3`      <dbl> 88, 94, 81
## $ `Quiz 4`      <dbl> 90, 90, 83
## $ `2015Final`   <dbl> 82, 96, 80

# Same workbook with odd field names, renamed on the fly right after import.
glimpse(
  rename(
    read_excel("Grades2.xlsx"),
    Quiz_1     = `Quiz 1`,
    Quiz_2     = `Quiz 2`,
    Midterm    = `Mid Term`,
    Quiz_3     = `Quiz 3`,
    Quiz_4     = `Quiz 4`,
    Final_2015 = `2015Final`
  )
)

## Rows: 3
## Columns: 8
## $ `Stuent Name` <chr> "Jones", "Hildebrand", "O'Brien"
## $ ID            <dbl> 12345, 22222, 33333
## $ Quiz_1        <dbl> 88, 95, 76
## $ Quiz_2        <dbl> 80, 92, 78
## $ Midterm       <dbl> 76, 91, 79
## $ Quiz_3        <dbl> 88, 94, 81
## $ Quiz_4        <dbl> 90, 90, 83
## $ Final_2015    <dbl> 82, 96, 80

# Comma-separated file import.
glimpse(read_csv("Grades.csv"))

## Rows: 3 Columns: 8

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): Name
## dbl (7): ID, Quiz1, Quiz2, Midterm, Quiz3, Quiz4, Final

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 3
## Columns: 8
## $ Name    <chr> "Jones", "Hildebrand", "O'Brien"
## $ ID      <dbl> 12345, 22222, 33333
## $ Quiz1   <dbl> 88, 95, 76
## $ Quiz2   <dbl> 80, 92, 78
## $ Midterm <dbl> 76, 91, 79
## $ Quiz3   <dbl> 88, 94, 81
## $ Quiz4   <dbl> 90, 90, 83
## $ Final   <dbl> 82, 96, 80

# Space-delimited text import where a run of spaces counts as one separator.
# read_delim() with delim = " " would trip up on the repeated spaces, hence read_table().
# A period is added to the NA strings, as per the SAS input text file.
# The file has no header row, so the default names X1..X7 are renamed.
read_table("Health_List.txt",
  col_names = FALSE, na = c("", "NA", ".")) %>%  # FALSE, not F: F is an ordinary variable that can be reassigned
  rename(
    Subj = X1,
    Gender = X2,
    Age = X3,
    Heart_Rate = X4,
    SBP = X5,
    DBP = X6,
    Chol = X7
    ) %>%
  glimpse()

## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_double(),
##   X6 = col_double(),
##   X7 = col_double()
## )

## Rows: 6
## Columns: 7
## $ Subj       <chr> "001", "002", "003", "004", "005", "006"
## $ Gender     <chr> "M", "F", "F", "M", "F", "F"
## $ Age        <dbl> 23, 55, 18, 80, 34, 38
## $ Heart_Rate <dbl> 68, 72, 58, 82, 62, 78
## $ SBP        <dbl> 120, 180, 118, NA, 128, 108
## $ DBP        <dbl> 90, 90, 72, NA, 80, 68
## $ Chol       <dbl> 128, 170, 122, 220, NA, 220

# Fixed-width text import: field widths and field names are given as two
# parallel vectors, in file order.
read_fwf(
  "Health.txt",
  fwf_widths(
    widths = c(3, 1, 2, 2, 3, 3, 3),
    col_names = c("Subj", "Gender", "Age", "Heart_Rate", "SBP", "DBP", "Chol")
  )
) %>%
  glimpse()

## Rows: 5 Columns: 7

## -- Column specification --------------------------------------------------------
## 
## chr (2): Subj, Gender
## dbl (5): Age, Heart_Rate, SBP, DBP, Chol

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 5
## Columns: 7
## $ Subj       <chr> "001", "002", "003", "004", "005"
## $ Gender     <chr> "M", "F", "F", "M", "F"
## $ Age        <dbl> 23, 55, 18, 80, 34
## $ Heart_Rate <dbl> 68, 72, 58, 82, 62
## $ SBP        <dbl> 120, 180, 118, NA, 128
## $ DBP        <dbl> 90, 90, 72, NA, 80
## $ Chol       <dbl> 128, 170, 122, 220, NA

# Tab-delimited text file with no column names: read with default names
# X1..X5, then rename them.
read_tsv("Blood_Pressure.txt",
  col_names = FALSE) %>%  # FALSE, not F: F is an ordinary variable that can be reassigned
  rename(
    Drug = X1,
    Subj = X2,
    Gender = X3,
    SBP = X4,
    DBP = X5
    ) %>%
  glimpse()

## Rows: 60 Columns: 5

## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (2): X1, X3
## dbl (3): X2, X4, X5

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## Rows: 60
## Columns: 5
## $ Drug   <chr> "Placebo", "Placebo", "Placebo", "Placebo", "Placebo", "Placebo~
## $ Subj   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ~
## $ Gender <chr> "F", "M", "F", NA, "F", "M", "F", "F", "F", "M", "F", "F", "M",~
## $ SBP    <dbl> 138, 124, 150, 136, NA, 132, 130, 146, 134, 138, 144, 130, 134,~
## $ DBP    <dbl> 86, 82, 72, 84, NA, 84, 84, 88, 82, 88, 84, 88, 80, 90, NA, 88,~

# Import / transform external files for exercises in subsequent chapters

# Chapter 5: Distribution Practice 5-3
blood_pressure <- read_excel("Blood_Pressure.xlsx")
blood_pressure %>% glimpse()

## Rows: 60
## Columns: 5
## $ Drug   <chr> "Placebo", "Placebo", "Placebo", "Placebo", "Placebo", "Placebo~
## $ Subj   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ~
## $ Gender <chr> "F", "M", "F", NA, "F", "M", "F", "F", "F", "M", "F", "F", "M",~
## $ SBP    <dbl> 138, 124, 150, 136, NA, 132, 130, 146, 134, 138, 144, 130, 134,~
## $ DBP    <dbl> 86, 82, 72, 84, NA, 84, 84, 88, 82, 88, 84, 88, 80, 90, NA, 88,~

# Chapter 6: One-Sample Tests - Perch
perch <- read_excel("Perch.xlsx")
perch %>% glimpse()

## Rows: 56
## Columns: 3
## $ Weight <dbl> 5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 110~
## $ Height <dbl> 2.1120, 3.5280, 3.8240, 4.5924, 4.5880, 5.2224, 5.1992, 5.6358,~
## $ Width  <dbl> 1.4080, 1.9992, 2.4320, 2.6316, 2.9415, 3.3216, 3.1234, 3.0502,~

# Paired t Test - Yoga
yoga <- read_excel("Yoga.xlsx")
yoga %>% glimpse()

## Rows: 9
## Columns: 3
## $ Subj   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9
## $ Before <dbl> 78, 68, 76, 58, 83, 80, 69, 77, 77
## $ After  <dbl> 74, 68, 70, 57, 73, 77, 61, 76, 72

# Chapter 8: One-way ANOVA with test for Tukey multiple comparisons 8.6
# Inline data: each text row holds the LVEF readings of one treatment group.
# The literal is passed to read_table() directly (it is already a character
# string, so no as.character() wrapper is needed).
lvef <- "
  55 58 62 48 57 57 80 40 55 52
  57 65 55 78 57 84 72 80 78 81
  60 60 65 67 48 62 64 70 57 40
" %>% 
  read_table(col_names = FALSE, na = c("", "NA")) %>%  # FALSE, not the reassignable alias F
  # keep the row number so each row can be labelled with its group
  tibble::rownames_to_column() %>%
  # one record per (row, column) pair; column names X1..X10 end up in `name`
  pivot_longer(-rowname) %>%
  mutate(
    Group = case_when(
      rowname == "1" ~ "Placebo"
      , rowname == "2" ~ "Calcium"
      , rowname == "3" ~ "Lasix"
      , TRUE ~ rowname
      )
    # subject number within the group = column position with the "X" prefix removed
    , Subj = str_remove_all(name,"X")
    ) %>%
  rename(LVEF = value) %>%
  select(Group, Subj, LVEF)
glimpse(lvef)

## Rows: 30
## Columns: 3
## $ Group <chr> "Placebo", "Placebo", "Placebo", "Placebo", "Placebo", "Placebo"~
## $ Subj  <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "1", "2", "3"~
## $ LVEF  <dbl> 55, 58, 62, 48, 57, 57, 80, 40, 55, 52, 57, 65, 55, 78, 57, 84, ~

# Chapter 9: create 25 pct sample w/o replacement for Two-Way Anova
# The SASHELP library contains the Bweight dataset containing birth weights for 50,000 babies, along with several variables believed to be related to birth weight, such as race (coded as black or not black), mother's smoking status (smoking or non-smoking), and marital status.
# Weight (in grams) is the Dependent variable
# Black (0 = not black, 1=black)
# MomSmoke (0=no, 1=yes)
# Married (0=no, 1=yes)
set.seed(13579)
# prop = 0.25 asks for the 25 pct sample directly instead of computing the row
# count by hand; for the 50,000-row bweight this is the same 12,500-row draw.
Birth_Wt_Sample <- bweight %>%
  slice_sample(prop = 0.25, replace = FALSE)
summary(Birth_Wt_Sample)

##      Weight         Black           Married            Boy        
##  Min.   : 240   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:3062   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :3402   Median :0.0000   Median :1.0000   Median :1.0000  
##  Mean   :3368   Mean   :0.1655   Mean   :0.7129   Mean   :0.5151  
##  3rd Qu.:3714   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :5970   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      MomAge           MomSmoke        CigsPerDay       MomWtGain       
##  Min.   :-9.0000   Min.   :0.0000   Min.   : 0.000   Min.   :-30.0000  
##  1st Qu.:-4.0000   1st Qu.:0.0000   1st Qu.: 0.000   1st Qu.: -8.0000  
##  Median : 0.0000   Median :0.0000   Median : 0.000   Median :  0.0000  
##  Mean   : 0.4449   Mean   :0.1308   Mean   : 1.482   Mean   :  0.7021  
##  3rd Qu.: 5.0000   3rd Qu.:0.0000   3rd Qu.: 0.000   3rd Qu.:  9.0000  
##  Max.   :18.0000   Max.   :1.0000   Max.   :40.000   Max.   : 68.0000  
##      Visit         MomEdLevel   
##  Min.   :0.000   Min.   :0.000  
##  1st Qu.:3.000   1st Qu.:0.000  
##  Median :3.000   Median :1.000  
##  Mean   :2.701   Mean   :1.218  
##  3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :3.000

# Optional export of the R-drawn sample (left disabled):
# writexl::write_xlsx(Birth_Wt_Sample, "Birth_Wt_Sample_R.xlsx")

# NOTE(review): no random draw follows in this chunk; the seed is kept because
# later code may depend on the RNG state it sets.
set.seed(seed = 123456)

# The R sample drawn above does not match the equivalent SAS PROC SURVEYSELECT
# result, even with the same parameters and random seed, so the extract is
# redone from the SAS export (original note: "using LOJ to sas extract export").
Birth_Wt_Sample_SAS <- "Birth_Wt_Sample.xlsx" %>% read_excel()
Birth_Wt_Sample_SAS %>% summary()

##      Weight         Black           Married            Boy        
##  Min.   : 322   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:3062   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :3402   Median :0.0000   Median :1.0000   Median :1.0000  
##  Mean   :3373   Mean   :0.1578   Mean   :0.7095   Mean   :0.5106  
##  3rd Qu.:3714   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :5970   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      MomAge           MomSmoke        CigsPerDay       MomWtGain       
##  Min.   :-9.0000   Min.   :0.0000   Min.   : 0.000   Min.   :-30.0000  
##  1st Qu.:-4.0000   1st Qu.:0.0000   1st Qu.: 0.000   1st Qu.: -8.0000  
##  Median : 0.0000   Median :0.0000   Median : 0.000   Median :  0.0000  
##  Mean   : 0.3814   Mean   :0.1319   Mean   : 1.479   Mean   :  0.6146  
##  3rd Qu.: 5.0000   3rd Qu.:0.0000   3rd Qu.: 0.000   3rd Qu.:  9.0000  
##  Max.   :18.0000   Max.   :1.0000   Max.   :40.000   Max.   : 68.0000  
##      Visit         MomEdLevel  
##  Min.   :0.000   Min.   :0.00  
##  1st Qu.:3.000   1st Qu.:0.00  
##  Median :3.000   Median :1.00  
##  Mean   :2.699   Mean   :1.22  
##  3rd Qu.:3.000   3rd Qu.:2.00  
##  Max.   :3.000   Max.   :3.00

# Chapter 10: Correlation - Exercise
# Load the Exercise workbook and preview its columns.
exercise <- "Exercise.xls" %>% read_excel()
exercise %>% glimpse()

## Rows: 50
## Columns: 6
## $ Subj       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ~
## $ Age        <dbl> 68, 64, 76, 44, 55, 57, 64, 30, 35, 49, 63, 63, 19, 51, 54,~
## $ Pushups    <dbl> 19, 36, 11, 35, 24, 14, 21, 48, 25, 9, 51, 30, 34, 23, 7, 4~
## $ Rest_Pulse <dbl> 75, 61, 74, 59, 76, 74, 69, 60, 55, 88, 55, 73, 65, 66, 59,~
## $ Max_Pulse  <dbl> 124, 107, 115, 111, 115, 121, 106, 114, 107, 137, 102, 126,~
## $ Run_Pulse  <dbl> 121, 110, 105, 108, 110, 118, 103, 110, 107, 134, 103, 125,~

# Chapter 12: Binary Logistic Regression
# Build a binary weight indicator from the previously-created SAS sample.
# Rows with missing Weight are dropped first; Wt_Group is 1 when Weight is
# below the arbitrary 3402 gram (median) cutoff and 0 otherwise.
high_low <- Birth_Wt_Sample_SAS %>%
  filter(!is.na(Weight)) %>%
  mutate(Wt_Group = if_else(Weight < 3402, 1, 0))
high_low %>% glimpse()

## Rows: 12,500
## Columns: 11
## $ Weight     <dbl> 3430, 3657, 4054, 4536, 3295, 3458, 3714, 2807, 3625, 3884,~
## $ Black      <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Married    <dbl> 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,~
## $ Boy        <dbl> 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,~
## $ MomAge     <dbl> -4, 6, -5, 3, 6, 8, 2, -5, 2, -2, -6, 6, 7, -8, -1, -9, -9,~
## $ MomSmoke   <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,~
## $ CigsPerDay <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 4~
## $ MomWtGain  <dbl> -6, 15, 21, -1, -29, -18, 25, 13, 0, -10, 12, -9, 7, -7, 23~
## $ Visit      <dbl> 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,~
## $ MomEdLevel <dbl> 0, 0, 2, 3, 2, 1, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 2, 2, 3,~
## $ Wt_Group   <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,~

# Chapter 13: Prepare generated dataset Heart_Attack
# Read the workbook and add a text-label companion column (trailing "_") for
# each coded variable. A value with no label is passed through as text.
Heart_Attack <- read_excel("Heart_Attack.xlsx") %>%
  # Gender: F / M -> Female / Male
  mutate(
    Gender_ = case_when(
      Gender == 'F' ~ 'Female',
      Gender == 'M' ~ 'Male',
      TRUE ~ Gender
    )
  ) %>%
  # High_Chol: 0 / 1 -> No / Yes
  mutate(
    High_Chol_ = case_when(
      High_Chol == 0 ~ 'No',
      High_Chol == 1 ~ 'Yes',
      TRUE ~ as.character(High_Chol)
    )
  ) %>%
  # Heart_Attack: 0 / 1 -> No / Yes
  mutate(
    Heart_Attack_ = case_when(
      Heart_Attack == 0 ~ 'No',
      Heart_Attack == 1 ~ 'Yes',
      TRUE ~ as.character(Heart_Attack)
    )
  ) %>%
  # Age_Group: 1 / 2 / 3 -> age band labels
  mutate(
    Age_Group_ = case_when(
      Age_Group == 1 ~ '< 60',
      Age_Group == 2 ~ '60-70',
      Age_Group == 3 ~ '71+',
      TRUE ~ as.character(Age_Group)
    )
  )
Heart_Attack %>% glimpse()

## Rows: 500
## Columns: 10
## $ Gender        <chr> "F", "M", "F", "M", "F", "M", "F", "M", "F", "M", "F", "~
## $ Age           <dbl> 63, 69, 69, 59, 71, 50, 57, 75, 60, 52, 80, 54, 75, 57, ~
## $ Age_Group     <dbl> 2, 2, 2, 1, 3, 1, 1, 3, 2, 1, 3, 1, 3, 1, 2, 2, 2, 2, 1,~
## $ Chol          <dbl> 211, 249, 139, 239, 195, 193, 179, 186, 164, 237, 178, 2~
## $ High_Chol     <dbl> 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,~
## $ Heart_Attack  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,~
## $ Gender_       <chr> "Female", "Male", "Female", "Male", "Female", "Male", "F~
## $ High_Chol_    <chr> "Yes", "Yes", "No", "Yes", "No", "No", "No", "No", "No",~
## $ Heart_Attack_ <chr> "No", "No", "No", "No", "No", "No", "No", "Yes", "No", "~
## $ Age_Group_    <chr> "60-70", "60-70", "60-70", "< 60", "71+", "< 60", "< 60"~

Chapter 5: Descriptive Statistics - Univariate Analysis

Descriptive Statistics for Continuous Variables

R Code:

# Alternative summaries that were tried (left disabled):
# summary(heart) 
# summary(heart[c('Weight','Height','Systolic','Diastolic')]) 
# describe(heart)
# Hmisc::describe(heart[c('Weight','Height','Systolic','Diastolic')],na.rm=TRUE)

# Descriptive statistics for the four continuous measures, ignoring NAs.
heart %>%
  select(Weight, Height, Systolic, Diastolic) %>%
  psych::describe(na.rm = TRUE)

##           vars    n   mean    sd median trimmed   mad  min   max range skew
## Weight       1 5203 153.09 28.92  150.0  151.66 29.65 67.0 300.0   233 0.56
## Height       2 5203  64.81  3.58   64.5   64.74  3.71 51.5  76.5    25 0.18
## Systolic     3 5209 136.91 23.74  132.0  134.35 17.79 82.0 300.0   218 1.49
## Diastolic    4 5209  85.36 12.97   84.0   84.44 11.86 50.0 160.0   110 0.88
##           kurtosis   se
## Weight        0.52 0.40
## Height       -0.40 0.05
## Systolic      4.22 0.33
## Diastolic     1.85 0.18

# Print the 95% confidence interval for the mean of Weight (NAs excluded).
cat("95% Confidence Intervals for Weight 2.5%,", "97.5%: \n",
    lsr::ciMean(heart$Weight, conf = 0.95, na.rm = TRUE))

## 95% Confidence Intervals for Weight 2.5%, 97.5%: 
##  152.3008 153.8726

# Print the 95% confidence interval for the mean of Height (NAs excluded).
cat("95% Confidence Intervals for Height 2.5%,", "97.5%: \n",
    lsr::ciMean(heart$Height, conf = 0.95, na.rm = TRUE))

## 95% Confidence Intervals for Height 2.5%, 97.5%: 
##  64.71581 64.91056

# Print the 95% confidence interval for the mean of Systolic (NAs excluded).
cat("95% Confidence Intervals for Systolic 2.5%,", "97.5%: \n",
    lsr::ciMean(heart$Systolic, conf = 0.95, na.rm = TRUE))

## 95% Confidence Intervals for Systolic 2.5%, 97.5%: 
##  136.2647 137.5544

# Print the 95% confidence interval for the mean of Diastolic (NAs excluded).
cat("95% Confidence Intervals for Diastolic 2.5%,", "97.5%: \n",
    lsr::ciMean(heart$Diastolic, conf = 0.95, na.rm = TRUE))

## 95% Confidence Intervals for Diastolic 2.5%, 97.5%: 
##  85.00623 85.71099

# Visualize distribution of Weight
# Histogram of non-missing Weight; the y-axis is each bin's share of all
# observations, shown as a percentage.
heart %>%
  select(Weight) %>%
  filter(!is.na(Weight)) %>%
  ggplot(aes(x = Weight)) +
  # after_stat() replaces the deprecated ..count.. dot-dot notation.
  geom_histogram(aes(y = after_stat(count / sum(count))),
    bins = 24, color = "black", fill = "lightgray") +
  labs(title = "Descriptive Statistics for Continuous Variables",
       y = "Percentage", x = "Weight") +
  scale_x_continuous(breaks = seq(70, 290, 20)) +
  scale_y_continuous(breaks = seq(0, .15, .025),
    labels = scales::percent) +
  theme_minimal()

# Density-scaled histogram of non-missing Weight with a normal curve overlaid,
# parameterised by the sample mean and standard deviation of Weight.
heart %>%
  select(Weight) %>%
  filter(!is.na(Weight)) %>%
  ggplot(aes(x = Weight)) +
  # after_stat() replaces the deprecated ..density.. dot-dot notation.
  geom_histogram(aes(y = after_stat(density)),
    bins = 24, color = "black", fill = "lightgray") +
  labs(title = "Descriptive Statistics for Continuous Variables",
       y = "Density", x = "Weight") +
  scale_x_continuous(breaks = seq(70, 290, 20)) +
  # Reference normal density drawn as a dashed red line.
  stat_function(fun = dnorm, args = list(
    mean = mean(heart$Weight, na.rm = TRUE),
    sd = sd(heart$Weight, na.rm = TRUE)),
    col = 'red', lwd = 2, lty = 'dashed') +
  theme_minimal()

# Horizontal box plot of non-missing Weight with capped whiskers; the mean is
# overlaid as a red marker.
heart %>%
  filter(!is.na(Weight)) %>%
  select(Weight) %>%
  ggplot(mapping = aes(x = "", y = Weight)) +
  stat_boxplot(geom = "errorbar", width = 0.15) +
  geom_boxplot(fill = "lightgray", color = "black") +
  # Disabled alternative: geom_point() +
  # Only `fun` is supplied, so the summary has no ymin/ymax; this is the likely
  # source of the "Removed 1 rows ... (geom_segment)" warning when rendering.
  stat_summary(fun = "mean", shape = 13, size = 1, color = "red") +
  scale_y_continuous(breaks = seq(from = 70, to = 290, by = 20)) +
  labs(title = "Descriptive Statistics for Continuous Variables",
       x = "", y = "Weight") +
  coord_flip() +
  theme_minimal()

## Warning: Removed 1 rows containing missing values (geom_segment).

# Visualize distribution of Systolic
# Histogram of non-missing Systolic; the y-axis is each bin's share of all
# observations, shown as a whole-number percentage.
heart %>%
  select(Systolic) %>%
  filter(!is.na(Systolic)) %>%
  ggplot(aes(x = Systolic)) +
  # after_stat() replaces the deprecated ..count.. dot-dot notation.
  geom_histogram(aes(y = after_stat(count / sum(count))),
    bins = 26, color = "black", fill = "lightgray") +
  labs(title = "Descriptive Statistics for Continuous Variables",
       y = "Percentage", x = "Systolic") +
  scale_x_continuous(breaks = seq(84, 292, 16)) +
  scale_y_continuous(breaks = seq(0, .15, .05),
    labels = scales::percent_format(accuracy = 1L)) +
  theme_minimal()

# Density-scaled histogram of non-missing Systolic with a normal curve
# overlaid, parameterised by the sample mean and standard deviation of Systolic.
# NOTE(review): the Systolic percentage histogram uses bins=26 while this one
# uses bins=24 - confirm the difference is intended.
heart %>%
  select(Systolic) %>%
  filter(!is.na(Systolic)) %>%
  ggplot(aes(x = Systolic)) +
  # after_stat() replaces the deprecated ..density.. dot-dot notation.
  geom_histogram(aes(y = after_stat(density)),
    bins = 24, color = "black", fill = "lightgray") +
  labs(title = "Descriptive Statistics for Continuous Variables",
       y = "Density", x = "Systolic") +
  scale_x_continuous(breaks = seq(84, 292, 16)) +
  # Reference normal density drawn as a dashed red line.
  stat_function(fun = dnorm, args = list(
    mean = mean(heart$Systolic, na.rm = TRUE),
    sd = sd(heart$Systolic, na.rm = TRUE)),
    col = 'red', lwd = 2, lty = 'dashed') +
  theme_minimal()

# Horizontal box plot of non-missing Systolic with capped whiskers; the mean is
# overlaid as a red marker.
heart %>%
  filter(!is.na(Systolic)) %>%
  select(Systolic) %>%
  ggplot(mapping = aes(x = "", y = Systolic)) +
  stat_boxplot(geom = "errorbar", width = 0.15) +
  geom_boxplot(fill = "lightgray", color = "black") +
  # Disabled alternative: geom_point() +
  # Only `fun` is supplied, so the summary has no ymin/ymax; this is the likely
  # source of the "Removed 1 rows ... (geom_segment)" warning when rendering.
  stat_summary(fun = "mean", shape = 13, size = 1, color = "red") +
  scale_y_continuous(breaks = seq(from = 84, to = 292, by = 16)) +
  labs(title = "Descriptive Statistics for Continuous Variables",
       x = "", y = "Systolic") +
  coord_flip() +
  theme_minimal()

## Warning: Removed 1 rows containing missing values (geom_segment).

##

DRAFT Notes: Biostatistics by Example Using SAS Studio

All rights reserved to SAS Institute Inc. & Ron Cody

analysisinsightdata

Wednesday, July 13, 2022

Notes made while working through Biostatistics by Example Using SAS Studio by Ron Cody, using the SAS programmer interface. All SAS content and materials belong to them, not me.

R will then be used to complete representative activities.

Chapters 1 - 4: Data Import Basics

Read in a built-in dataset, import from text and CSV files, and work with various delimiters and header profiles.

Results: biostats_by_ex_SAScode.sas

The Contents Procedure

SASHELP.HEART

Attributes

Engine/Host Information

Varnum

The Print Procedure

Data Set SASHELP.HEART

The Print Procedure

Data Set WORK.HEART_TEMP

The Print Procedure

Data Set WORK.HEART_TEMP

The Freq Procedure

Table Height

One-Way Frequencies

The Contents Procedure

WORK.GRADES_XLSX

Attributes

Engine/Host Information

Varnum

The Print Procedure

Data Set WORK.GRADES_XLSX

The Contents Procedure

WORK.GRADES2_XLSX

Attributes

Engine/Host Information

Varnum

The Print Procedure

Data Set WORK.GRADES2_XLSX

The Print Procedure

Data Set WORK.GRADES2_XLSX

The Contents Procedure

WORK.GRADES_CSV

Attributes

Engine/Host Information

Varnum

The Print Procedure

Data Set WORK.GRADES_CSV

The Contents Procedure

WORK.HEALTHLIST_TXT

Attributes

Engine/Host Information

Varnum

The Print Procedure

Data Set WORK.HEALTHLIST_TXT

The Contents Procedure

WORK.HEALTHLIST_TXT

Attributes

Engine/Host Information

Varnum

The Print Procedure

Data Set WORK.HEALTHLIST_TXT

The Contents Procedure

WORK.HEALTHLIST_TXT

Attributes

Engine/Host Information

Varnum

The Print Procedure

Data Set WORK.HEALTHLIST_TXT

The Contents Procedure

WORK.HEALTH_CSV

Attributes

Engine/Host Information

Varnum

The Print Procedure

Data Set WORK.HEALTH_CSV

The Contents Procedure

WORK.HEALTH_TXT

Attributes

Engine/Host Information

Varnum

The Print Procedure