Examine Residuals
/* Store the list of eight predictor variable names in the macro variable   */
/* 'interval'. The MODEL statements in the PROC REG steps that follow       */
/* reference it as &interval, so the predictor list is maintained in one    */
/* place. The value continues onto the second line; the %LET statement ends */
/* at the semicolon.                                                        */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area
Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom;
/* st105d01.sas, Part A: regress SalePrice on the &interval predictors with */
/* ODS graphics switched on.                                                */

/* Switch ODS graphics on so the procedure can produce plots */
ods graphics on;

/* TITLE is a global statement, so it is set ahead of the step it labels */
title 'SalePrice Model - Plots of Diagnostic Statistics';

/* Fit the model labelled CONTINUOUS on STAT1.ameshousing3:        */
/*   response   = SalePrice                                        */
/*   predictors = the variables listed in macro variable interval  */
proc reg data=STAT1.ameshousing3;
   CONTINUOUS: model SalePrice = &interval;
run;
quit;
/* st105d01.sas, Part B: same SalePrice model as Part A, but restricted to */
/* three requested diagnostic plots.                                       */

/* TITLE is a global statement, so it is set ahead of the step it labels */
title 'SalePrice Model - Plots of Diagnostic Statistics';

/* PLOTS(ONLY)= limits the graphics to the listed requests:           */
/*   QQ, RESIDUALBYPREDICTED and RESIDUALS                            */
proc reg data=STAT1.ameshousing3
         plots(only)=(QQ RESIDUALBYPREDICTED RESIDUALS);
   /* Response SalePrice, predictors taken from macro variable interval */
   CONTINUOUS: model SalePrice = &interval;
run;
quit;
Result
/* st105s01.sas: diagnostic plots for the four-predictor body fat model */

/* Request ODS graphics with the IMAGEMAP option turned on */
ods graphics / imagemap=on;

/* TITLE is a global statement, so it is set ahead of the step it labels */
title 'FORWARD Model - Plots of Diagnostic Statistics';

/* Fit the model labelled FORWARD on STAT1.BodyFat2 and request only the */
/* QQ, RESIDUALBYPREDICTED and RESIDUALS plots.                          */
proc reg data=STAT1.BodyFat2
         plots(only)=(QQ RESIDUALBYPREDICTED RESIDUALS);
   /* Response PctBodyFat2; predictors Abdomen, Weight, Wrist, Forearm */
   FORWARD: model PctBodyFat2 = Abdomen Weight Wrist Forearm;
   /* Use the Case variable to identify observations */
   id Case;
run;
quit;
Result
Influential Observations
/* Store the list of eight predictor variable names in the macro variable   */
/* 'interval'. The PROC GLMSELECT MODEL statement that follows references   */
/* it as &interval as the candidate predictor list. The value continues     */
/* onto the second line; the %LET statement ends at the semicolon.          */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area
Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom;
/* st105d02.sas, Part A: stepwise selection (output suppressed), then a    */
/* PROC REG fit whose influence-diagnostic plot data is captured via ODS.  */

/* ---- Step 1: stepwise selection, run silently ---- */

title "Stepwise Model Selection for SalePrice - SL 0.05";

/* Suppress all ODS output while the selection step runs */
ods select none;

/* Stepwise selection on STAT1.ameshousing3 using significance level as   */
/* the criterion, with entry and stay thresholds both 0.05.               */
proc glmselect data=STAT1.ameshousing3 plots=all;
   STEPWISE: model SalePrice = &interval
             / selection=stepwise
               details=steps
               select=SL
               slentry=0.05
               slstay=0.05;
run;
quit;

/* Restore normal ODS output */
ods select all;

/* ---- Step 2: refit with PROC REG and capture diagnostics ---- */

ods graphics on;

/* Save the data behind each requested plot into a WORK data set */
ods output
   RSTUDENTBYPREDICTED = Rstud
   COOKSDPLOT          = Cook
   DFFITSPLOT          = Dffits
   DFBETASPANEL        = Dfbs;

title 'SigLimit Model - Plots of Diagnostic Statistics';

/* Only the four listed plots are produced; the LABEL option is requested. */
/* &_GLSIND is not assigned anywhere in this file; it is presumably the    */
/* list of selected effects left behind by the PROC GLMSELECT step above   */
/* (confirm against the GLMSELECT documentation).                          */
proc reg data=STAT1.ameshousing3
         plots(only label)=(RSTUDENTBYPREDICTED COOKSD DFFITS DFBETAS);
   SigLimit: model SalePrice = &_GLSIND;
run;
quit;
/* Part B: list the four data sets captured by ODS OUTPUT in Part A */

/* Remove any title left over from earlier steps */
title;

proc print data=Rstud;  run;   /* Rstud data set  */
proc print data=Cook;   run;   /* Cook data set   */
proc print data=Dffits; run;   /* Dffits data set */
proc print data=Dfbs;   run;   /* Dfbs data set   */
/* Collapse Dfbs to one row per Observation and store the result in Dfbs2.  */
/*                                                                          */
/* The previous version cut Dfbs at a hard-coded row 300 and combined the   */
/* two pieces with UPDATE ... BY Observation. That only works when the      */
/* input holds exactly 300 observations stacked in exactly two groups of    */
/* rows (presumably one group per DFBETAS panel -- confirm against the      */
/* printed Dfbs listing). The version below makes no assumption about the   */
/* number of observations or the number of stacked groups.                  */
/*                                                                          */
/* NOTE(review): the intermediate data sets Dfbs01 and Dfbs02 are no longer */
/* created, and a new WORK data set Dfbs_sorted is. Nothing else in this    */
/* program reads Dfbs01 or Dfbs02.                                          */

/* Bring all rows for the same Observation together. PROC SORT keeps the   */
/* original relative order of rows within a BY group by default (EQUALS).  */
proc sort data=Dfbs out=Dfbs_sorted;
   by Observation;
run;

/* UPDATE against an empty master (obs=0 keeps only the variable layout).  */
/* Each BY group's transaction rows are applied in turn, and missing       */
/* values in a transaction never overwrite values already collected, so    */
/* each Observation ends up as a single row carrying every non-missing     */
/* value.                                                                  */
data Dfbs2;
   update Dfbs_sorted(obs=0) Dfbs_sorted;
   by Observation;
run;
/* Build 'influential': the observations for which at least one influence */
/* statistic went past its cutpoint.                                      */
data influential;
   /* Line the four diagnostic data sets up by observation number */
   merge Rstud Cook Dffits Dfbs2;
   by observation;

   /* All variables whose names start with _dfbetasout */
   array _dfb{*} _dfbetasout: ;

   /* First three criteria: |RStudent| above 3, a non-blank Cook's D */
   /* label, or a non-zero, non-missing Dffitsout value.             */
   _hit = (abs(Rstudent) > 3) or (Cooksdlabel ne ' ') or Dffitsout;

   /* Fourth criterion: any _dfbetasout variable from the second one on */
   /* (the first array element is skipped, as in the original loop).    */
   do _k = 2 to dim(_dfb);
      if _dfb{_k} then _hit = 1;
   end;

   /* Keep only the flagged observations */
   if _hit;

   /* Blank out the statistics that did not cross their own cutpoint */
   if not (abs(Rstudent) > 3) then RStudent = .;
   if Cooksdlabel = ' ' then CooksD = .;

   drop _k _hit;
run;
/* List the flagged observations with no title, using the observation */
/* number as the row identifier.                                      */
title;

proc print data=influential;
   id  observation;
   var Rstudent
       CooksD
       Dffitsout
       _dfbetasout: ;
run;
Result
/* st105s02.sas, Part A: fit the FORWARD body fat model and capture the */
/* data behind its influence-diagnostic plots.                          */

ods graphics on;

/* Save the data behind each requested plot into a WORK data set */
ods output
   RSTUDENTBYPREDICTED = Rstud
   COOKSDPLOT          = Cook
   DFFITSPLOT          = Dffits
   DFBETASPANEL        = Dfbs;

/* TITLE is a global statement, so it is set ahead of the step it labels */
title 'FORWARD Model - Plots of Diagnostic Statistics';

/* Only the four listed plots are produced; the LABEL option is requested */
proc reg data=STAT1.BodyFat2
         plots(only label)=(RSTUDENTBYPREDICTED COOKSD DFFITS DFBETAS);
   /* Response PctBodyFat2; predictors Abdomen, Weight, Wrist, Forearm */
   FORWARD: model PctBodyFat2 = Abdomen Weight Wrist Forearm;
   /* Use the Case variable to identify observations */
   id Case;
run;
quit;
/* Part B: build 'influential', the observations for which at least one */
/* influence statistic went past its cutpoint.                          */
data influential;
   /* Line the four diagnostic data sets up by observation number */
   merge Rstud Cook Dffits Dfbs;
   by observation;

   /* All variables whose names start with _dfbetasout */
   array _dfb{*} _dfbetasout: ;

   /* First three criteria: |RStudent| above 3, a non-blank Cook's D */
   /* label, or a non-zero, non-missing Dffitsout value.             */
   _hit = (abs(Rstudent) > 3) or (Cooksdlabel ne ' ') or Dffitsout;

   /* Fourth criterion: any _dfbetasout variable from the second one on */
   /* (the first array element is skipped, as in the original loop).    */
   do _k = 2 to dim(_dfb);
      if _dfb{_k} then _hit = 1;
   end;

   /* Keep only the flagged observations */
   if _hit;

   /* Blank out the statistics that did not cross their own cutpoint */
   if not (abs(Rstudent) > 3) then RStudent = .;
   if Cooksdlabel = ' ' then CooksD = .;

   drop _k _hit;
run;
/* List the flagged observations, identified by both the observation */
/* number and the ID1 variable.                                      */
proc print data=influential;
   id  observation ID1;
   var Rstudent
       CooksD
       Dffitsout
       _dfbetasout: ;
run;
Result
Collinearity
/* Store the list of eight predictor variable names in the macro variable   */
/* 'interval'. The PROC CORR VAR statement and both PROC REG MODEL          */
/* statements that follow reference it as &interval. The value continues    */
/* onto the second line; the %LET statement ends at the semicolon.          */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area
Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom;
/* st105d03.sas, Part A: combine the two Ames data sets on PID and       */
/* correlate 'score' with each predictor in &interval.                   */

/* Sorted copy of ameshousing3, written to STAT1.ames_sorted */
proc sort data=STAT1.ameshousing3
          out=STAT1.ames_sorted;
   by PID;
run;

/* No OUT= here, so STAT1.amesaltuse itself is replaced by its sorted version */
proc sort data=STAT1.amesaltuse;
   by PID;
run;

/* Match-merge the two sorted data sets on PID into WORK.amescombined */
data amescombined;
   merge STAT1.ames_sorted
         STAT1.amesaltuse;
   by PID;
run;

/* Remove any title left over from earlier steps */
title;

/* Correlations of score (WITH) against each &interval variable (VAR); */
/* NOSIMPLE is specified on the PROC statement.                        */
proc corr data=amescombined nosimple;
   var  &interval;
   with score;
run;
/* Part B: variance inflation factors with and without 'score' */

/* First-level title, shared by both fits below */
title 'Collinearity Diagnostics';

/* Fit 1: the &interval predictors plus score, with the VIF option */
proc reg data=amescombined;
   model SalePrice = &interval score
         / vif;
run;
quit;

/* Second-level title for the reduced fit; the first-level title stays */
title2 'Removing Score';

/* Fit 2 (model label NOSCORE): the same predictors with score left out */
proc reg data=amescombined;
   NOSCORE: model SalePrice = &interval
            / vif;
run;
quit;
Result
/* st105s03.sas, Part A: full body fat model with variance inflation factors */

/* Graphics are switched off for this fit */
ods graphics off;

/* TITLE is a global statement, so it is set ahead of the step it labels */
title 'Collinearity -- Full Model';

/* Model FULLMODL on STAT1.BodyFat2:                                       */
/*   response   : PctBodyFat2                                              */
/*   predictors : Age, Weight, Height, Neck, Chest, Abdomen, Hip, Thigh,   */
/*                Knee, Ankle, Biceps, Forearm, Wrist (13 in total)        */
/*   option     : VIF                                                      */
proc reg data=STAT1.BodyFat2;
   FULLMODL: model PctBodyFat2 =
                Age Weight Height Neck Chest Abdomen Hip Thigh
                Knee Ankle Biceps Forearm Wrist
             / vif;
run;
quit;

/* Switch graphics back on for whatever runs next */
ods graphics on;
/* Part B: the same model with Weight removed, again with variance */
/* inflation factors.                                              */

/* Graphics are switched off for this fit */
ods graphics off;

/* TITLE is a global statement, so it is set ahead of the step it labels */
title 'Collinearity -- No Weight';

/* Model NOWT on STAT1.BodyFat2:                                           */
/*   response   : PctBodyFat2                                              */
/*   predictors : Age, Height, Neck, Chest, Abdomen, Hip, Thigh, Knee,     */
/*                Ankle, Biceps, Forearm, Wrist (Weight is left out)       */
/*   option     : VIF                                                      */
proc reg data=STAT1.BodyFat2;
   NOWT: model PctBodyFat2 =
            Age Height Neck Chest Abdomen Hip Thigh
            Knee Ankle Biceps Forearm Wrist
         / vif;
run;
quit;

/* Switch graphics back on for whatever runs next */
ods graphics on;
Result