Week 2

Box plot and Scatter plot

/*st102d01.sas*/  /*Part A*/
/*PROC SGSCATTER with a PLOT statement draws a scatter plot of SalePrice paired with Gr_Liv_Area.*/
/*The REG option overlays a fitted regression line on the plot.*/
title "Associations of Above Grade Living Area with Sale Price";
proc sgscatter data=STAT1.ameshousing3;
    plot SalePrice * Gr_Liv_Area / reg;
run;

/*st102d01.sas*/  /*Part B*/
/*Macro variable INTERVAL holds the list of interval predictors that are plotted against SalePrice.*/
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area
              Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom;

/*OPTIONS NOLABEL makes procedures display variable names instead of variable labels.*/
options nolabel;

/*PROC SGSCATTER explores relationships among continuous variables:*/
/*one scatter plot with a regression line for SalePrice paired with each variable in &interval.*/
title "Associations of Interval Variables with Sale Price";
proc sgscatter data=STAT1.ameshousing3;
    plot SalePrice * (&interval) / reg;
run;

/*st102d01.sas*/  /*Part C*/
/*PROC SGPLOT draws vertical box plots of SalePrice, one box for each level of Central_Air.*/
/*CONNECT=MEAN joins the means of the groups with a line.*/
title "Sale Price Differences across Central Air";
proc sgplot data=STAT1.ameshousing3;
    vbox SalePrice / connect=mean category=Central_Air;
run;

/*st102d01.sas*/  /*Part D*/
/*Macro variable CATEGORICAL lists the categorical predictors of interest. */
/*It is passed as CharVar= to the BOX macro, which uses PROC SGPLOT to draw one vertical box plot of the response per listed variable. */
%let categorical=House_Style2 Overall_Qual2 Overall_Cond2 Fireplaces Season_Sold Garage_Type_2
                 Foundation_2 Heating_QC Masonry_Veneer Lot_Shape_2 Central_Air;

/*
      Macro Usage:
      %box(DSN = <data set name>,
           Response = <response variable name>,
           CharVar = <box plot category variable list>)

      For every blank-delimited name in CharVar, runs PROC SGPLOT on DSN
      to draw vertical box plots of Response by that variable, with the
      category means connected by a line. Each plot is titled
      <Response> across Levels of <variable>. The last title remains in
      effect after the macro finishes; the caller is expected to clear it.
      An empty CharVar produces no plots.
*/
%macro box(dsn      = ,
           response = ,
           Charvar  = );
    /* Keep the loop counter and the current name local, so that a call   */
    /* does not create or overwrite macro variables I and VAR elsewhere.  */
    %local i var;
    %let i = 1;
    %let var = %scan(&charvar, &i, %str( ));
    /* Test the length of the name instead of comparing the name itself,  */
    /* so the name is never evaluated as part of a macro expression.      */
    %do %while(%length(&var) > 0);
        proc sgplot data=&dsn;
            vbox &response / category=&var
                             grouporder=ascending
                             connect=mean;
            title "&response across Levels of &var";
        run;
        /* Advance to the next name in the list */
        %let i = %eval(&i + 1);
        %let var = %scan(&charvar, &i, %str( ));
    %end;
%mend box;

/*Draw SalePrice box plots for every variable listed in &categorical.*/
%box(dsn=STAT1.ameshousing3, response=SalePrice, charvar=&categorical);

/*Turn variable labels back on and clear the title left by the last plot.*/
options label;
title;

General linear model

/* st102d02.sas */

/* Turn on ODS Graphics so that the requested diagnostic plots are produced */
ods graphics;

/* One-way ANOVA of SalePrice on Heating_QC using PROC GLM */
title "One-Way ANOVA with Heating Quality as Predictor";
proc glm data=STAT1.ameshousing3 plots=diagnostics;
    /* Heating_QC is the classification variable, displayed with the $Heating_QC. format */
    class Heating_QC;
    format Heating_QC $Heating_QC.;
    /* Dependent variable = independent variable */
    model SalePrice = Heating_QC;
    /* Means of SalePrice for each Heating_QC level, plus the Levene homogeneity-of-variance test */
    means Heating_QC / hovtest=levene;
run;
quit;

/* Reset the title */
title;

Example 2:

/* st102s01.sas */

/* Part A: descriptive statistics (PROC MEANS) and box plots (PROC SGPLOT) of BulbWt by Fertilizer */

/* Descriptive statistics of BulbWt within each level of Fertilizer */
title 'Descriptive Statistics of BulbWt by Fertilizer';
proc means data=STAT1.Garlic;
    class Fertilizer;
    var BulbWt;
run;

/* One vertical box plot of BulbWt per Fertilizer level, with the group means connected by a line */
title "Bulb Weight Differences across Fertilizers";
proc sgplot data=STAT1.Garlic;
    vbox BulbWt / connect=mean category=Fertilizer;
run;

/* Reset the title */
title;

/* Part B: One-Way ANOVA using PROC GLM */

/* Turn on ODS Graphics so that the requested diagnostic plots are produced */
ods graphics;

/* One-way ANOVA of BulbWt on Fertilizer */
title "One-Way ANOVA with Fertilizer as Predictor";
proc glm data=STAT1.Garlic plots=diagnostics;
    /* Fertilizer is the classification variable */
    class Fertilizer;
    /* Dependent variable = independent variable */
    model BulbWt = Fertilizer;
    /* Means of BulbWt for each Fertilizer level, plus the Levene homogeneity-of-variance test */
    means Fertilizer / hovtest=levene;
run;
quit;

/* Reset the title */
title;

Multiple Comparison

/*st102d03.sas*/
/*Turn on ODS Graphics so that the requested plots can be drawn*/
ods graphics;

/*Restrict the output to the LSMEANS and DIFF tables and to the DIFFPLOT and CONTROLPLOT graphs*/
ods select lsmeans diff diffplot controlplot;

/*PROC GLM fits the one-way ANOVA of SalePrice on Heating_QC;*/
/*the LSMEANS statements then carry out the post-hoc comparisons among the Heating_QC levels*/
title "Post-Hoc Analysis of ANOVA - Heating Quality as Predictor";
proc glm data=STAT1.ameshousing3 plots(only)=(diffplot(center) controlplot);
    class Heating_QC;
    /*Display Heating_QC values with the $Heating_QC. format*/
    format Heating_QC $Heating_QC.;
    model SalePrice = Heating_QC;

    /*All pairwise differences among the Heating_QC least squares means, with Tukey adjustment*/
    lsmeans Heating_QC / adjust=tukey pdiff=all;

    /*Each level compared with the control level 'Average/Typical', with Dunnett adjustment*/
    lsmeans Heating_QC / adjust=dunnett pdiff=control('Average/Typical');
run;
quit;

title;

Pearson Correlation

/*
This SAS code has two parts, A and B.
Part A uses PROC CORR to correlate a set of predictor variables with SalePrice and to draw a scatter plot for each pair.
Part B uses PROC CORR on the same predictors to list, for each one, its strongest correlations with the others; ODS Graphics is turned off there, so only tables are produced.
*/

/* Part A */
/* Macro variable INTERVAL: the list of predictor variables */
%let interval=Gr_Liv_Area Basement_Area Garage_Area Deck_Porch_Area Lot_Area Age_Sold Bedroom_AbvGr Total_Bathroom;

/* Reset all ODS Graphics settings and turn on image maps */
ods graphics / reset=all imagemap;

/* Correlate every variable in &interval with SalePrice. */
/* RANK orders the correlations by absolute value; PLOTS(ONLY)=SCATTER requests scatter plots for all variables, without ellipses. */
title "Correlations and Scatter Plots with SalePrice";
proc corr data=STAT1.AmesHousing3 rank
          plots(only)=scatter(nvar=all ellipse=none);
    with SalePrice; /* variable to correlate with */
    var &interval;  /* variables to correlate */
    id PID;         /* identification variable */
run;
title;

/* Part B */
/* Turn ODS Graphics off: this step produces tables only */
ods graphics off;

/* Correlations among the predictor variables in &interval. */
/* NOSIMPLE suppresses the descriptive statistics; BEST=3 keeps, for each variable, the three correlations with the largest absolute values. */
/* NOTE: the title mentions a scatter plot matrix, but no PLOTS= option is given and ODS Graphics is off, so no plot is drawn. */
title "Correlations and Scatter Plot Matrix of Predictors";
proc corr data=STAT1.AmesHousing3 nosimple best=3;
    var &interval; /* variables to correlate */
run;
title;

Example 2:

/*
This SAS code has two parts, A and B.
Part A uses PROC CORR to correlate a set of predictor variables with PctBodyFat2 and to draw a scatter plot for each pair.
Part B uses PROC CORR on the same predictors to correlate them with one another, keeps the strongest correlations, and prints them with a format that marks the high ones.
*/

/* Part A */
/* Macro variable INTERVAL: the list of predictor variables */
%let interval=Age Weight Height Neck Chest Abdomen Hip Thigh Knee Ankle Biceps Forearm Wrist;

/* Reset all ODS Graphics settings and turn on image maps */
ods graphics / reset=all imagemap;

/* Correlate every variable in &interval with PctBodyFat2 and draw scatter plots for all of them, without ellipses */
title "Correlations and Scatter Plots";
proc corr data=STAT1.BodyFat2
          plots(only)=scatter(nvar=all ellipse=none);
    with PctBodyFat2; /* variable to correlate with */
    var &interval;    /* variables to correlate */
    id Case;          /* identification variable */
run;

/* Redefine INTERVAL as a shorter list of predictor variables */
%let interval=Biceps Forearm Wrist;

/* Reset all ODS Graphics settings, turn on image maps, and keep only the scatter plots in the output */
ods graphics / reset=all imagemap;
ods select scatterplot;

/* Correlate the shorter list with PctBodyFat2; only the scatter plots are displayed */
title "Correlations and Scatter Plots";
proc corr data=STAT1.BodyFat2
          plots(only)=scatter(nvar=all ellipse=none);
    with PctBodyFat2; /* variable to correlate with */
    var &interval;    /* variables to correlate */
    id Case;          /* identification variable */
run;

/* Part B */
/* Turn ODS Graphics off */
ods graphics off;

/* Restore INTERVAL to the full list of predictor variables used in Part A */
%let interval=Age Weight Height Neck Chest Abdomen Hip Thigh Knee Ankle Biceps Forearm Wrist;

/* Correlations among the predictors in &interval. */
/* NOSIMPLE suppresses the descriptive statistics, BEST=5 keeps the five strongest correlations per variable, */
/* and OUT=PEARSON writes the results to the data set PEARSON. */
title "Correlations of Predictors";
proc corr data=STAT1.BodyFat2 nosimple best=5 out=pearson;
    var &interval; /* variables to correlate */
run;

/* Macro variable BIG: the cutoff (in absolute value) for a high correlation */
%let big=0.7;

/* Picture format CORRELATIONS: prints a correlation with two decimals and   */
/* marks values whose absolute value is at least &big (and below 1) with "*". */
/* A picture format is applied to the absolute value of the number, so the   */
/* minus sign of a negative correlation must be supplied through PREFIX=.    */
/* The picture has two leading digit positions, which leaves room for the    */
/* two-character prefix "*-" in front of a value such as 0.85.               */
proc format;
    picture correlations
         &big  -<  1     = '009.99' (prefix="*")    /* high positive     */
        -1    <-  -&big  = '009.99' (prefix="*-")   /* high negative     */
        -&big <-<  0     = '009.99' (prefix="-")    /* moderate negative */
         0     -<  &big  = '009.99';                /* zero or moderate positive */
run;

/* List the correlation rows of the PEARSON data set, displaying every variable in &interval with the CORRELATIONS format */
proc print data=pearson;
    where _type_="CORR";            /* keep only the correlation rows */
    format &interval correlations.; /* apply the CORRELATIONS format */
    var _NAME_ &interval;           /* columns to print */
run;

/* Define a macro variable named "big" which represents the cutoff value for high correlations */
%let big=0.7;

/* Create a new dataset named "big