Week 5

Examine Residuals

/* Predictor list for this section, held in the macro variable 'interval' */
/* so that each MODEL statement can reference it as &interval.            */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area
         Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom;

/* st105d01.sas, Part A                                                   */
/* Regress SalePrice on the &interval predictors with ODS Graphics turned */
/* on; no PLOTS= option is given on the PROC REG statement here.          */
ods graphics on;

proc reg data=STAT1.ameshousing3;
    /* Title for the output of this step */
    title 'SalePrice Model - Plots of Diagnostic Statistics';
    /* Model labelled CONTINUOUS: response SalePrice, regressors &interval */
    CONTINUOUS: model SalePrice = &interval;
run;
quit;

/* st105d01.sas, Part B                                                   */
/* Regress SalePrice on the &interval predictors again, this time with    */
/* PLOTS(ONLY)= restricting the requested graphics to three plots:        */
/* QQ, RESIDUALBYPREDICTED and RESIDUALS.                                 */
proc reg data=STAT1.ameshousing3 plots(only)=(QQ RESIDUALBYPREDICTED RESIDUALS);
    /* Title for the output of this step */
    title 'SalePrice Model - Plots of Diagnostic Statistics';
    /* Model labelled CONTINUOUS: response SalePrice, regressors &interval */
    CONTINUOUS: model SalePrice = &interval;
run;
quit;

Result

/* st105s01.sas                                                           */
/* Residual diagnostics for the BodyFat2 model labelled FORWARD.          */

/* Set the IMAGEMAP option of ODS Graphics to ON */
ods graphics / imagemap=on;

/* PLOTS(ONLY)= restricts the requested graphics to the QQ plot, the      */
/* residual-by-predicted plot and the residual plots.                     */
proc reg data=STAT1.BodyFat2 plots(only)=(QQ RESIDUALBYPREDICTED RESIDUALS);
    /* Title for the output of this step */
    title 'FORWARD Model - Plots of Diagnostic Statistics';
    /* Response PctBodyFat2 on Abdomen, Weight, Wrist and Forearm */
    FORWARD: model PctBodyFat2 = Abdomen Weight Wrist Forearm;
    /* Observations are identified by the variable Case */
    id Case;
run;
quit;

Result

Influential Observations

/* Predictor list for this section, held in the macro variable 'interval' */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area
         Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom;

/* st105d02.sas, Part A: stepwise selection, then influence diagnostics   */

/* Switch ODS output off around the selection step. Presumably the step   */
/* is wanted only for the selected-effects list used by PROC REG below.   */
ods select none;

/* Stepwise selection on SalePrice using significance levels:             */
/* 0.05 to enter (SLENTRY) and 0.05 to stay (SLSTAY).                     */
proc glmselect data=STAT1.ameshousing3 plots=all;
    title "Stepwise Model Selection for SalePrice - SL 0.05";
    STEPWISE: model SalePrice = &interval
        / selection=stepwise
          details=steps
          select=SL
          slentry=0.05
          slstay=0.05;
run;
quit;

/* Switch ODS output back on for the steps that follow */
ods select all;

ods graphics on;

/* Capture the data behind the four diagnostic plots as data sets:        */
/*   RSTUDENTBYPREDICTED -> Rstud     COOKSDPLOT   -> Cook                */
/*   DFFITSPLOT          -> Dffits    DFBETASPANEL -> Dfbs                */
ods output RSTUDENTBYPREDICTED=Rstud COOKSDPLOT=Cook
           DFFITSPLOT=Dffits DFBETASPANEL=Dfbs;

/* Refit SalePrice and request only the four influence plots, with the    */
/* LABEL global plot option set.                                          */
proc reg data=STAT1.ameshousing3
         plots(only label)=(RSTUDENTBYPREDICTED COOKSD DFFITS DFBETAS);
    title 'SigLimit Model - Plots of Diagnostic Statistics';
    /* &_GLSIND is not assigned anywhere in this file; it is expected to  */
    /* be the selected-effects macro variable left behind by the PROC     */
    /* GLMSELECT step above, so that step must have run first.            */
    SigLimit: model SalePrice = &_GLSIND;
run;
quit;

/* st105d02.sas, Part B: list the diagnostic data sets, then isolate the  */
/* influential observations.                                              */

/* A TITLE statement with no text clears the title currently in effect */
title;

/* One listing per data set captured by the ODS OUTPUT statement */
proc print data=Rstud;  run;
proc print data=Cook;   run;
proc print data=Dffits; run;
proc print data=Dfbs;   run;

/* Cut Dfbs in two at row 300: rows 1-300 go to Dfbs01 and rows 301       */
/* onward go to Dfbs02.                                                   */
/* NOTE(review): 300 is hard-coded. Presumably it is the number of        */
/* observations in ameshousing3, with Dfbs holding one stack of rows per  */
/* DFBETAS panel - confirm before reusing this with other data.           */
data Dfbs01; set Dfbs(obs=300);      run;
data Dfbs02; set Dfbs(firstobs=301); run;

/* Recombine the halves by Observation: Dfbs01 is the master data set and */
/* Dfbs02 supplies the updating values, written out as Dfbs2.             */
data Dfbs2;
    update Dfbs01 Dfbs02;
    by Observation;
run;

/* Build 'influential': the observations that exceed at least one         */
/* influence cutpoint, taken from the merged diagnostic data sets.        */
data influential;
    merge Rstud Cook Dffits Dfbs2;
    by observation;

    /* All variables whose names begin with _dfbetasout */
    array dfb_out{*} _dfbetasout: ;

    /* First three criteria: |RStudent| above 3, a non-blank Cook's D     */
    /* label, or a non-zero, non-missing Dffitsout.                       */
    _hit = (abs(Rstudent) > 3) or (Cooksdlabel ne ' ') or Dffitsout;

    /* Fourth criterion: any non-zero, non-missing _dfbetasout value.     */
    /* The scan starts at element 2, so the first element is not tested.  */
    do _idx = 2 to dim(dfb_out);
        if dfb_out{_idx} then _hit = 1;
    end;

    /* Blank out RStudent and CooksD where their own cutpoint was not     */
    /* exceeded, so the listing shows only the offending statistics.      */
    if not (abs(Rstudent) > 3) then RStudent = .;
    if Cooksdlabel = ' ' then CooksD = .;

    /* Keep only flagged observations; the work variables are not stored */
    if _hit;
    drop _idx _hit;
run;

/* A TITLE statement with no text clears the title currently in effect */
title;

/* List the flagged observations, identified by observation number */
proc print data=influential;
    var Rstudent CooksD Dffitsout _dfbetasout:;
    id observation;
run;

Result

/* st105s02.sas, Part A: influence diagnostics for the BodyFat2 model     */
ods graphics on;

/* Capture the data behind the four diagnostic plots as data sets:        */
/*   RSTUDENTBYPREDICTED -> Rstud     COOKSDPLOT   -> Cook                */
/*   DFFITSPLOT          -> Dffits    DFBETASPANEL -> Dfbs                */
ods output RSTUDENTBYPREDICTED=Rstud COOKSDPLOT=Cook
           DFFITSPLOT=Dffits DFBETASPANEL=Dfbs;

/* Fit the model and request only the four influence plots, with the      */
/* LABEL global plot option set.                                          */
proc reg data=STAT1.BodyFat2
         plots(only label)=(RSTUDENTBYPREDICTED COOKSD DFFITS DFBETAS);
    title 'FORWARD Model - Plots of Diagnostic Statistics';
    /* Response PctBodyFat2 on Abdomen, Weight, Wrist and Forearm */
    FORWARD: model PctBodyFat2 = Abdomen Weight Wrist Forearm;
    /* Observations are identified by the variable Case */
    id Case;
run;
quit;

/* st105s02.sas, Part B                                                   */
/* Build 'influential': the observations that exceed at least one         */
/* influence cutpoint, taken from the merged diagnostic data sets.        */
data influential;
    merge Rstud Cook Dffits Dfbs;
    by observation;

    /* All variables whose names begin with _dfbetasout */
    array dfb_out{*} _dfbetasout: ;

    /* First three criteria: |RStudent| above 3, a non-blank Cook's D     */
    /* label, or a non-zero, non-missing Dffitsout.                       */
    _hit = (abs(Rstudent) > 3) or (Cooksdlabel ne ' ') or Dffitsout;

    /* Fourth criterion: any non-zero, non-missing _dfbetasout value.     */
    /* The scan starts at element 2, so the first element is not tested.  */
    do _idx = 2 to dim(dfb_out);
        if dfb_out{_idx} then _hit = 1;
    end;

    /* Blank out RStudent and CooksD where their own cutpoint was not     */
    /* exceeded, so the listing shows only the offending statistics.      */
    if not (abs(Rstudent) > 3) then RStudent = .;
    if Cooksdlabel = ' ' then CooksD = .;

    /* Keep only flagged observations; the work variables are not stored */
    if _hit;
    drop _idx _hit;
run;

/* List the flagged observations, identified by observation and ID1 */
proc print data=influential;
    var Rstudent CooksD Dffitsout _dfbetasout:;
    id observation ID1;
run;

Result

Collinearity

/* Predictor list for this section, held in the macro variable 'interval' */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area
         Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom;

/* st105d03.sas, Part A: combine the two Ames tables and correlate        */
/* 'score' with the predictors.                                           */

/* Both inputs to the merge below must be in PID order.                   */
/* ameshousing3 is sorted into a new data set, STAT1.ames_sorted.         */
proc sort data=STAT1.ameshousing3 out=STAT1.ames_sorted;
    by PID;
run;

/* amesaltuse has no OUT= option, so the sorted result replaces           */
/* STAT1.amesaltuse itself.                                               */
proc sort data=STAT1.amesaltuse;
    by PID;
run;

/* Match-merge the two sorted tables on PID into amescombined */
data amescombined;
    merge STAT1.ames_sorted STAT1.amesaltuse;
    by PID;
run;

/* A TITLE statement with no text clears the title currently in effect */
title;

/* Correlate score (WITH) against each &interval predictor (VAR);         */
/* NOSIMPLE is set on the PROC statement.                                 */
proc corr data=amescombined nosimple;
    with score;
    var &interval;
run;

/* st105d03.sas, Part B: VIF collinearity diagnostics with and without    */
/* the 'score' variable.                                                  */

/* Unlabelled model: SalePrice on the &interval predictors plus score */
proc reg data=amescombined;
    title 'Collinearity Diagnostics';
    model SalePrice = &interval score / vif;
run;
quit;

/* Model NOSCORE: the same predictors with score left out. Only TITLE2 is */
/* set here, so the first title line from the step above stays in effect. */
proc reg data=amescombined;
    title2 'Removing Score';
    NOSCORE: model SalePrice = &interval / vif;
run;
quit;

Result

/* st105s03.sas, Part A: collinearity diagnostics for the full model      */

/* ODS Graphics is switched off for this step and back on afterwards */
ods graphics off;

/* Model FULLMODL: PctBodyFat2 on all thirteen predictors, with the VIF   */
/* (variance inflation factor) option on the MODEL statement.             */
proc reg data=STAT1.BodyFat2;
    title 'Collinearity -- Full Model';
    FULLMODL: model PctBodyFat2 =
        Age Weight Height Neck Chest Abdomen Hip
        Thigh Knee Ankle Biceps Forearm Wrist
        / vif;
run;
quit;

ods graphics on;

/* st105s03.sas, Part B: collinearity diagnostics with Weight left out    */

/* ODS Graphics is switched off for this step and back on afterwards */
ods graphics off;

/* Model NOWT: PctBodyFat2 on the twelve remaining predictors (Weight is  */
/* not in the list), with the VIF option on the MODEL statement.          */
proc reg data=STAT1.BodyFat2;
    title 'Collinearity -- No Weight';
    NOWT: model PctBodyFat2 =
        Age Height Neck Chest Abdomen Hip
        Thigh Knee Ankle Biceps Forearm Wrist
        / vif;
run;
quit;

ods graphics on;

Result