Notes made while working through this introductory course, using the SAS programmer interface. Contrary to the instructions in the course video and slides, files were read in as downloaded: they were neither manually modified nor converted to xlsx format (when originally in csv format), so that as much of the work as possible was done in code. Also included is sample code that pulls the data through the Census API with SAS proc http instead of downloading CSV files.

Self-directed learning: R is then used to complete representative activities, and a tidycensus snippet for pulling data through the API is also provided.

Data Import and Cleansing

American Community Survey B25077 MEDIAN VALUE (DOLLARS), 2018: ACS 1-Year Estimates Detailed Tables, Owner-occupied housing units

SAS Code:

/* &path1 and census libname previously defined */
/* Median Home Value, by State */
proc import out=census.medianhomevalueraw
    datafile="&path1.ACSDT1Y2018.B25077_data_with_overlays_2021-11-05T055419.csv"
    dbms=csv
    replace;
    datarow=3;
    getnames=YES;
run;
data census.medianhomevalue;
  set census.medianhomevalueraw;
    rename NAME = State;
    MedianHome = input(B25077_001E, 10.);
    MedianHomeMOE = input(B25077_001M, 10.);
    drop B25077_001E 
      B25077_001M;
    format MedianHome NLMNY15. 
      MedianHomeMOE NLMNY15.; 
  run;

American Community Survey B01003 TOTAL POPULATION, 2018: ACS 1-Year Estimates Detailed Tables, Total population

SAS Code:

/* &path1 and census libname previously defined */
/* Total Population, by State 2018 */
proc import out=census.population2018raw
    datafile="&path1.ACSDT1Y2018.B01003_data_with_overlays_2022-04-10T165631.csv"
    dbms=csv
    replace;
    datarow=3;
    getnames=YES;
run;
data census.population2018try;
  set census.population2018raw;
    rename NAME = State;
    TotalPopulation = input(B01003_001E, 12.);
    drop B01003_001E 
      B01003_001M;
    format TotalPopulation COMMA15.; 
  run;

/* Total Population, by State for 2010 */
proc import out=census.population2010raw
    datafile="&path1.ACSDT1Y2010.B01003_data_with_overlays_2022-04-11T133027.csv"
    dbms=csv
    replace;
    datarow=3;
    getnames=YES;
run;
data census.population2010try;
  set census.population2010raw;
    rename NAME = State;
    TotalPopulation = input(B01003_001E, 12.);
    drop B01003_001E 
      B01003_001M;
    format TotalPopulation COMMA15.; 
  run;
proc sort data=census.population2010try noduprecs; by State; run;

American Community Survey B06009 Place Of Birth By Educational Attainment In The United States, 2018: ACS 1-Year Estimates Detailed Tables, Population 25 years and over in the United States

SAS Code:

/* &path1 and census libname previously defined */
/* Educational Attainment, by Division */
proc import out=census.education2018tryraw
    datafile="&path1.ACSDT1Y2018.B06009_data_with_overlays_2022-04-10T182606.csv"
    dbms=csv
    replace;
    datarow=3;
    getnames=NO;
run;
data census.education2018try;
    attrib Division length=$20.;
  set census.education2018tryraw;
    Division = strip(tranwrd(VAR62, ' Division', ' '));
    B06009_002E = input(VAR3, 12.);
    B06009_003E = input(VAR5, 12.);
    B06009_004E = input(VAR7, 12.);
    B06009_005E = input(VAR9, 12.);
    B06009_006E = input(VAR11, 12.);
    drop VAR1 - VAR62;
  run;
proc sort data=census.education2018try noduprecs; by Division; run;
proc transpose data=census.education2018try 
  out=census.education2018try2;
  var B06009_002E B06009_003E B06009_004E B06009_005E B06009_006E;
  by Division;
  run;
data census.education2018try3;
    attrib Educational_Attainment length=$45.;
  set census.education2018try2;
    if _NAME_ EQ "B06009_002E" THEN Educational_Attainment = "Less than high school graduate"; else
    if _NAME_ EQ "B06009_003E" THEN Educational_Attainment = "High school graduate (includes equivalency)"; else
    if _NAME_ EQ "B06009_004E" THEN Educational_Attainment = "Some college or associate's degree"; else
    if _NAME_ EQ "B06009_005E" THEN Educational_Attainment = "Bachelor's degree"; else
    if _NAME_ EQ "B06009_006E" THEN Educational_Attainment = "Graduate or professional degree"; 
    drop _NAME_;
    format COL1 COMMA15.; 
    rename COL1 = Count;
  run;

2018 Population Estimates FIPS Codes Reference File

SAS Code:

/* &path1 and census libname previously defined */
/* 2018 Census Bureau Region and Division Codes and State FIPS Codes */
proc import out=census.state_georaw
    datafile="&path1.state-geocodes-v2018.xlsx"
    dbms=xlsx
    replace;
    datarow=7;
    getnames=NO;
run;
data state_geotry_Reg  (keep=RegionNum Region)
  state_geotry_Div (keep=DivisionNum Division) 
  state_geotry_Sta (keep=RegionNum DivisionNum StateNum_FIPS State);
    attrib RegionNum length=$2.;
    attrib Region length=$20.;
    attrib DivisionNum length=$2.;
    attrib Division length=$20.;
    attrib StateNum_FIPS length=$2.;
    attrib State length=$20.;
  set census.state_georaw;
    if (C NE "00")and(C NE " ") then do;
      StateNum_FIPS = C;
      State = strip(D);
      DivisionNum = B;
      RegionNum = A;
      output state_geotry_Sta ;
      end; else
    if (B NE "0")and(B NE " ") then do;
      DivisionNum = B;
      Division = strip(tranwrd(D, ' Division', ' '));
      output state_geotry_Div ;
      end; else
    if (A NE "0")and(A NE " ") then do;
      RegionNum = A;
      Region = strip(tranwrd(D, ' Region', ' '));
      output state_geotry_Reg ;
      end; 
   /* drop A - D; */
  run;
proc sql;
  create table census.state_geotry as
  select S.RegionNum, R.Region, S.DivisionNum, D.Division, StateNum_FIPS, State  
    from state_geotry_Sta as S
    left join state_geotry_Reg as R
    on (S.RegionNum = R.RegionNum)
    left join state_geotry_Div as D
    on (S.DivisionNum = D.DivisionNum)
    ;
  quit;

American Community Survey B19013 Median Household Income in The Past 12 Months (in 2018 Inflation-Adjusted Dollars), 2018: ACS 1-Year Estimates Detailed Tables, Households

SAS Code:

/* &path1 and census libname previously defined */
/* Median HH Income Table B19013 2018 1-year ACS State Level  */
proc import out=census.medianincomeraw
    datafile="&path1.ACSDT1Y2018.B19013_data_with_overlays_2022-04-11T120015.csv"
    dbms=csv
    replace;
    datarow=3;
    getnames=YES;
run;
data census.medianincometry;
  set census.medianincomeraw;
    rename NAME = State;
    MedianIncome = input(B19013_001E, 10.);
    MedianIncomeMOE = input(B19013_001M, 10.);
    drop B19013_001E 
      B19013_001M;
    format MedianIncome NLMNY15. 
      MedianIncomeMOE NLMNY15.; 
  run;

Joining State-level Median Income file to State Geo file and generate ranking by Region

SAS Code:

/* &path1 and census libname previously defined */
/* Joining state_geo to medianincome file with selected fields; use inner join to drop Puerto Rico */
proc sql;
  create table census.medianincome_geotry as
  select I.GEO_ID, I.State, G.Division, G.Region, MedianIncome, MedianIncomeMOE 
    from census.medianincometry as I
    inner join census.state_geotry as G
    on (I.State = G.State)
    ;
  quit;
proc sort data=census.medianincome_geotry noduprecs; by Region descending MedianIncome; run;
/* adapted from generated ranks task */
proc rank data=census.medianincome_geotry descending out=census.medianincome_rank;
    by Region;
    var MedianIncome;
    ranks rank_MedianIncome;
run;
proc sort data=census.medianincome_rank noduprecs; by Region rank_MedianIncome; run;

Generate Top 5 Median Household Incomes Report

SAS Code:

/* &path1 and census libname previously defined */
/* adapted from generated list data task */
title1 'Top 5 Median Household Incomes by Region';
proc print data=census.medianincome_rank (where=(rank_MedianIncome <=5)) label;
    var State MedianIncome;
    by Region;
    id Region;
run;
title1;

Combine 2010 and 2018 files and create percentage change dataset

SAS Code:

/* &path1 and census libname previously defined */
proc sql;
  create table census.population_change as
  select A.GEO_ID, A.State, A.TotalPopulation as TotalPop2010,
    B.TotalPopulation as TotalPop2018,
    (calculated TotalPop2018 - calculated TotalPop2010) / calculated TotalPop2010 as PercentChange format=percentn7.1
    from census.population2010try as A
    inner join census.population2018try as B
    on (A.GEO_ID = B.GEO_ID)
    ;
  quit;
proc sort data=census.population_change noduprecs; by PercentChange; run;
proc contents data=census.population_change; run;
proc print data=census.population_change; run;
/* adapted from generated recode ranges task */
data census.population_change_cat;
    set census.population_change;
    length PercentChangeCat  $22;
    select;
        when (-1 <=PercentChange <=-0.001) PercentChangeCat='Decrease';
        when (0 <=PercentChange <=0.049) PercentChangeCat='0% to 5% Increase';
        when (0.05 <=PercentChange <=0.099) PercentChangeCat='5% to 10% Increase';
        when (0.1 <=PercentChange <=1) PercentChangeCat='10% or Higher Increase';
        otherwise PercentChangeCat='Other';
    end;
run;
/* adapted from generated one way freq task in stats */
proc freq data=census.population_change_cat order=freq;
    tables PercentChangeCat / nocum plots=(freqplot);
run;

  • Almost half of all states had a population increase between 0% and 5%.
  • Only four states had a decrease in population.

Data Visualization

Bar Chart - Median Home Value in 2018

SAS Code:

/* &path1 and census libname previously defined */
/* Bar Chart - Median Home Value in 2018 */
ods graphics / reset width=10in height=4.8in imagemap;
proc sgplot data=census.medianhomevalue (where=(MedianHome > 229700));
    title height=14pt "Median Home Value in 2018";
    title2 "States Exceeding U.S. Median of $229,700";
    vbar State / response=MedianHome categoryorder=respdesc;
    yaxis grid label="Median Home Value in 2018";
    refline 229700 / axis=y lineattrs=(thickness=2 color=green) 
        label="U.S. Median" labelattrs=(color=green);
run;
ods graphics / reset;

Bar Chart - States with Population Exceeding 10,000,000 in 2018

SAS Code:

/* &path1 and census libname previously defined */
/* Bar Chart - States with Population Exceeding 10,000,000 in 2018 */
ods graphics / reset width=10in height=4.8in imagemap;
proc sgplot data=census.population2018try (where=(TotalPopulation > 10000000));
    title height=14pt "States with Population Exceeding 10,000,000 in 2018";
    vbar State / response=TotalPopulation categoryorder=respdesc;
    yaxis grid label="Population Estimate";
run;
ods graphics / reset;

Stacked Bar Chart - Educational Attainment in 2018

SAS Code:

/* &path1 and census libname previously defined */
/* Stacked Bar Chart - Educational Attainment in 2018 */
ods graphics / reset width=10in height=4.8in imagemap;
proc sgplot data=census.education2018try3;
    title height=14pt "Educational Attainment in 2018";
    vbar Division / response=Count group=Educational_Attainment stat=sum;
    yaxis grid label="Count";
    keylegend / title="Educational Attainment";
run;
ods graphics / reset;

100% Stacked Bar Chart - Educational Attainment in 2018

SAS Code:

/* &path1 and census libname previously defined */
/* 100% Stacked Bar Chart - Educational Attainment in 2018 */
ods graphics / reset width=10in height=4.8in imagemap;
proc sgplot data=census.education2018try3 pctlevel=group;
    title height=14pt "Distribution of Educational Attainment Levels in 2018";
    vbar Division / response=Count group=Educational_Attainment
    stat=percent seglabel ;
    yaxis grid label="Percentage";
    keylegend / title="Educational Attainment Levels";
run;
ods graphics / reset;

Analyze Census Bureau Data

2018 Household Median Income

SAS Code:

/* &path1 and census libname previously defined */
ods noproctitle;
ods graphics / imagemap=on;
/* Exploring Data */
title1 "2018 Distribution of Median Income, Combined";
proc univariate data=census.stateinfo_combined;
    ods select Histogram;
    var MedianIncome;
    histogram MedianIncome / normal kernel;
    inset n mean median / position=ne;
run;
title1;

SAS Code:

/* &path1 and census libname previously defined */
ods noproctitle;
ods graphics / imagemap=on;
/* Exploring Data by Region */
title1 "2018 Distribution of Median Income, by Region";
proc univariate data=census.stateinfo_combined;
    ods select Histogram;
    var MedianIncome;
    class Region;
    histogram MedianIncome / normal kernel;
    inset n mean median / position=ne;
run;
title1;

One-way ANOVA

SAS Code:

/* &path1 and census libname previously defined */
/* adapted from generated one way ANOVA task in linear models */
Title;
ods noproctitle;
ods graphics / imagemap=on;
title1 "2018 Distribution of Median Income, one way ANOVA";
proc glm data=census.stateinfo_combined plots(only)=(boxplot diagnostics);
    class Region;
    model MedianIncome=Region;
    means Region / hovtest=bartlett hovtest=levene (type=square) plots=none;
    means Region / hovtest=levene (type=abs) hovtest=bf plots=none;
    lsmeans Region / adjust=tukey pdiff alpha=.05 plots=none;
    run;
quit;
title1;

  • The overall analysis of variance table returns a p-value of 0.0228, which is less than 0.05. This indicates that the test is significant. This suggests that there are at least two regions where the mean of the median household income values is significantly different.

  • The observations must be independent. We do not see the appearance of repeated measures or clustering. We will assume that the data was collected in a way to ensure independence of the observations.
  • The errors must be normally distributed. The first scatter plot on the second row is a quantile-quantile (Q-Q) plot. There is very little deviation from the reference line. Thus, the residuals are normally distributed. This can be verified using the first histogram on the third row, which is a residual histogram. This histogram displays a relatively normal distribution of the residuals.

  • All groups have equal error variances. This can be verified with Levene’s test for homogeneity of variance. Levene’s test returns a p-value of 0.1542, which is greater than 0.05, so we fail to reject the null hypothesis of equal variances. Therefore, the assumption of equal error variances across all groups is met.*
    • *BUT please note that SAS PROC GLM’s HOVTEST option uses TYPE=SQUARE by default; using TYPE=ABS provides the equivalent of R’s car::leveneTest(MedianIncome ~ RegionName, medianincome_Region, center = mean), which returns a p-value of 0.0323. Because that value is less than 0.05, it leads to the contradictory conclusion that we reject the null hypothesis of equal variances (see the R sketch after this list).
  • The least squares means tables display pairwise comparisons among all regions. An adjusted p-value of 0.0286, which is less than 0.05, between the Northeast and South regions indicates that the means of the median household income values for the two regions are significantly different.
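
As a cross-check of the Levene’s test note above, the same test can be run in R with the car package; a minimal sketch, assuming the medianincome_Region data frame built in the R section later in these notes.

R Code:

library(car)
# Classic Levene's test on absolute deviations from the group means,
# matching SAS hovtest=levene (type=abs)
leveneTest(MedianIncome ~ RegionName, data = medianincome_Region, center = mean)
# Brown-Forsythe variant on absolute deviations from the group medians
# (leveneTest's default center), matching SAS hovtest=bf
leveneTest(MedianIncome ~ RegionName, data = medianincome_Region, center = median)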

Correlation Analysis

SAS Code:

/* &path1 and census libname previously defined */
/* adapted from generated correlation analysis task in stats */
ods noproctitle;
ods graphics / imagemap=on;
title1 "2018 Distribution of Median Income, Correlation Analysis";
proc corr data=census.stateinfo_combined pearson nosimple noprob plots=scatter(ellipse=none) rank;
  var MeanHoursWorked TotalPopulation MedianAge MedianCurrentMarriageDuration MedianMonthlyHousingCosts;
  with MedianIncome;
  id State Region;
  run;
title1;

  • The Pearson correlation coefficients table displays the strength of the linear association between median household income and each potential predictor variable. The stronger the association between two variables, the closer the Pearson correlation coefficient is to -1 or 1, depending on whether the relationship is negative or positive. The coefficient between median household income and median monthly housing costs is 0.93492, the strongest positive linear relationship among the pairs examined. In other words, the higher the median monthly housing costs, the higher the median income. Use the scatter plot to verify that the relationship is linear.

  • The Pearson correlation coefficient between median household income values and median monthly housing costs was 0.9349, which indicated a strong positive linear relationship between the two as compared to the other pairs. The scatter plot for median household income and median monthly housing costs obtained through the Correlation Analysis task confirmed the strong positive linear relationship between the two variables.
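
For the self-directed R portion, the same headline correlation can be checked with base R; a minimal sketch, assuming a stateinfo_combined data frame with the same columns as the SAS dataset.

R Code:

# Pearson correlation (with test) between median household income and
# median monthly housing costs; stateinfo_combined is assumed to exist in R
cor.test(stateinfo_combined$MedianIncome,
         stateinfo_combined$MedianMonthlyHousingCosts,
         method = "pearson")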

Linear Regression

SAS Code:

/* &path1 and census libname previously defined */
/* adapted from generated linear regression task in linear models */
ods noproctitle;
ods graphics / imagemap=on;
title1 "2018 Median Income by Housing Costs, Correlation Analysis";
proc reg data=census.stateinfo_combined alpha=0.05 plots(only)=(diagnostics 
        residuals fitplot observedbypredicted);
    model MedianIncome=MedianMonthlyHousingCosts /;
    run;
quit;
title1;

  • The ANOVA table provides an analysis of the variability observed in the data and the variability explained by the regression line. The fitted regression line in the baseline model is a horizontal line across all values of the predictor variable. Thus, the slope is 0 and the intercept is the sample mean of the response variable. The p-value is less than 0.0001, which is less than 0.05. Thus, the null hypothesis that the baseline model fits the data is rejected in favor of the alternative model, the simple linear regression. Therefore, median monthly housing costs explain a significant amount of variability in median household income.
  • The third part of the output provides summary measures of fit for the model. The R-square value of 0.8741 means that the effects contained in the model explain 87.41% of the total variation in the median household income values.
  • The Parameter Estimates table defines the model. The p-value is less than 0.0001 for median monthly housing costs, which is less than 0.05. Therefore, the slope for the predictor variable is statistically different from 0. Using the parameter estimates, the estimated regression equation is MedianIncome = $23,954 + ($35.9759 * MedianMonthlyHousingCosts). Each additional dollar of median monthly housing costs is associated with an approximately $35.98 higher median household income.
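
To make the estimated regression equation concrete, here is a quick worked example in R using the reported parameter estimates; the $1,500 housing-cost value is purely hypothetical.

R Code:

# Predicted median household income at a hypothetical median monthly housing
# cost of $1,500, using the Parameter Estimates reported above
intercept <- 23954
slope     <- 35.9759
intercept + slope * 1500
## roughly 77,918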

The fit diagnostics panel contains a set of graphs commonly used to validate the assumptions of simple linear regression. There are several assumptions that must be met.

  • The observations must be independent. We do not see the appearance of repeated measures or clustering. We will assume that the data was collected in a way to ensure independence of the observations.
  • The errors have equal variance. The first scatter plot on the first row plots the residuals against the predicted values from the linear regression model. The plot shows no discernable pattern such as a megaphone or bowtie that would show a change in variance of the residuals as the predicted values changed. Thus, the constant variance assumption is considered valid.
  • The errors must be normally distributed with a mean of zero. The points on the first scatter plot on the first row appear to be randomly scattered around zero. Thus, the assumption of a mean of zero is met. The Q-Q plot, which is the first scatter plot on the second row, shows very little deviation from the reference line. Thus, the residuals are normally distributed. This can be verified using the residual histogram, which is the first histogram on the third row. This histogram displays a relatively normal distribution of the residuals.
  • The relationship between the predictor variable and the response variable is linear through the equation parameters. The scatter plot for median household income and median monthly housing costs obtained through the Correlation Analysis task confirmed the linear relationship. In addition, the first scatter plot on the first row does not display a curvilinear pattern, verifying this assumption.

  • The fit plot shows the estimated regression line superimposed over a scatter plot of the data. The blue shaded area represents the 95% confidence interval for the mean. The area between the dashed lines represents the 95% prediction interval for an individual observation. Therefore, we are 95% confident that a future single observation would fall between the dashed lines.
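
For the self-directed R portion, here is a minimal sketch of the equivalent simple linear regression and fit plot, again assuming a stateinfo_combined data frame with the same columns as the SAS dataset.

R Code:

library(ggplot2)
library(scales)
# Fit the same simple linear regression
fit <- lm(MedianIncome ~ MedianMonthlyHousingCosts, data = stateinfo_combined)
summary(fit)
# Fit plot: observed points, fitted line, and 95% confidence band for the mean
ggplot(stateinfo_combined, aes(MedianMonthlyHousingCosts, MedianIncome)) +
  geom_point() +
  geom_smooth(method = "lm", level = 0.95) +
  scale_x_continuous(labels = label_dollar()) +
  scale_y_continuous(labels = label_dollar()) +
  labs(title = "2018 Median Household Income by Median Monthly Housing Costs",
       x = "Median Monthly Housing Costs",
       y = "Median Household Income")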

Extra: Pull Census Data via proc http

(A better process than downloading and importing csv files!)

SAS Code:

/* &path1 and census libname previously defined */
/*  Extra work - retrieve Census data systematically to skip the manual download steps for csv files. Note that this code does not use the API key and is therefore rate limited.  */

filename test "&path1.medianhomevalueraw.txt";
proc http
 URL= 'http://api.census.gov/data/2018/acs/acs1?get=NAME,B25077_001E&for=state:*'
 method = 'GET'
 out=test;
quit;

proc import datafile=test dbms=csv out=census.medianhomevalueurl replace;
run;

data census.medianhomevalueurl ;
  length StateName $22. 
    State_FIPS $2.;
  set census.medianhomevalueurl; 
    StateName = strip(tranwrd(tranwrd('[["NAME"'N, '"', ' '), '[', ' '));
    length State_FIPS $2.;
    State_FIPS = strip(tranwrd(tranwrd(state, '"', ' '), ']', ' '));
    MedianHome = input(B25077_001E, 10.);
    format MedianHome NLMNY15.;
  drop '[["NAME"'N B25077_001E VAR4 state ;
  run;
proc sort data=census.medianhomevalueurl nodups; by State_FIPS; run;  
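
For comparison, the same state-level pull can be done with the R tidycensus package used later in these notes; a minimal sketch, assuming a Census API key is already registered.

R Code:

library(tidycensus)
# State-level median home value from the 2018 1-year ACS (variable B25077_001),
# equivalent to the proc http pull above
state_medhomeval <- get_acs(
  geography = "state",
  variables = c(medianhomevalue = "B25077_001"),
  year = 2018,
  survey = "acs1"
)
head(state_medhomeval)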

Now, let’s use R!

library(tidyverse)
library(lubridate)
library(readxl)
library(kableExtra)
library(tidycensus)
library(car)
library(scales)
library(lsr)
library(psych)
library(broom)
# assumes Census Bureau API already loaded and can be verified with Sys.getenv("CENSUS_API_KEY")
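# A minimal sketch (not run here): register a personal key once so tidycensus
# can find it via CENSUS_API_KEY; the key string below is a placeholder.
# census_api_key("YOUR_CENSUS_API_KEY", install = TRUE)
nchar(Sys.getenv("CENSUS_API_KEY")) > 0  # TRUE if a key is available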

Data Import and Cleansing

American Community Survey B25077 MEDIAN VALUE (DOLLARS), 2018: ACS 1-Year Estimates Detailed Tables, Owner-occupied housing units

R Code:

medianhomevalue <- read_csv("ACSDT1Y2018.B25077_data_with_overlays_2021-11-05T055419.csv") %>%
  filter(GEO_ID != "id") %>%
  mutate(
    MedianHome = as.integer(B25077_001E),
    MedianHomeMOE = as.integer(B25077_001M)) %>%
  select(-B25077_001E,-B25077_001M) 
## Rows: 53 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): GEO_ID, NAME, B25077_001E, B25077_001M
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(medianhomevalue)
## Rows: 52
## Columns: 4
## $ GEO_ID        <chr> "0400000US01", "0400000US02", "0400000US04", "0400000US0~
## $ NAME          <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California"~
## $ MedianHome    <int> 147900, 276100, 241100, 133100, 546800, 373300, 277400, ~
## $ MedianHomeMOE <int> 2304, 4365, 1921, 2166, 2008, 2652, 2057, 5345, 17497, 1~

American Community Survey B01003 TOTAL POPULATION, 2018: ACS 1-Year Estimates Detailed Tables, Total population

population2018 <- read_csv("ACSDT1Y2018.B01003_data_with_overlays_2022-04-10T165631.csv") %>%
  filter(GEO_ID != "id") %>%
  mutate(
    TotalPopulation = as.integer(B01003_001E)) %>%
  select(-B01003_001E,-B01003_001M) %>% 
  rename(StateName = NAME) 
## Rows: 53 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): B01003_001E, B01003_001M, GEO_ID, NAME
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(population2018)
## Rows: 52
## Columns: 3
## $ GEO_ID          <chr> "0400000US01", "0400000US02", "0400000US04", "0400000U~
## $ StateName       <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi~
## $ TotalPopulation <int> 4887871, 737438, 7171646, 3013825, 39557045, 5695564, ~
population2010 <- read_csv("ACSDT1Y2010.B01003_data_with_overlays_2022-04-11T133027.csv") %>%
  filter(GEO_ID != "id") %>%
  mutate(
    TotalPopulation2010 = as.integer(B01003_001E)) %>%
  select(-B01003_001E,-B01003_001M) %>% 
  rename(StateName = NAME) 
## Rows: 53 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): GEO_ID, B01003_001E, B01003_001M, NAME
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(population2010)
## Rows: 52
## Columns: 3
## $ GEO_ID              <chr> "0400000US01", "0400000US02", "0400000US04", "0400~
## $ StateName           <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Calif~
## $ TotalPopulation2010 <int> 4785298, 713985, 6413737, 2921606, 37349363, 50490~

American Community Survey B06009 Place Of Birth By Educational Attainment In The United States, 2018: ACS 1-Year Estimates Detailed Tables, Population 25 years and over in the United States

education2018 <- read_csv("ACSDT1Y2018.B06009_data_with_overlays_2022-04-10T182606.csv") %>%
  filter(GEO_ID != "id") %>%
  mutate(
    Division = str_trim(str_replace(NAME, "Division", " ")),
    B06009_002E = as.integer(B06009_002E),
    B06009_003E = as.integer(B06009_003E),
    B06009_004E = as.integer(B06009_004E),
    B06009_005E = as.integer(B06009_005E),
    B06009_006E = as.integer(B06009_006E)) %>%
  select(GEO_ID, Division, B06009_002E, B06009_003E, B06009_004E, 
    B06009_005E, B06009_006E) %>%
  pivot_longer(!c(GEO_ID, Division), names_to = "Education", values_to = "Count") %>% 
  mutate(Education = 
    case_when(
      Education == "B06009_002E" ~ "Less than high school graduate",
      Education == "B06009_003E" ~ "High school graduate (includes equivalency)",
      Education == "B06009_004E" ~ "Some college or associate's degree",
      Education == "B06009_005E" ~ "Bachelor's degree",
      Education == "B06009_006E" ~ "Graduate or professional degree", 
      TRUE ~ Education
  ))
## Rows: 10 Columns: 62
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (62): B06009_001E, B06009_001M, B06009_002E, B06009_002M, B06009_003E, B...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(education2018)
## Rows: 45
## Columns: 4
## $ GEO_ID    <chr> "0300000US1", "0300000US1", "0300000US1", "0300000US1", "030~
## $ Division  <chr> "New England", "New England", "New England", "New England", ~
## $ Education <chr> "Less than high school graduate", "High school graduate (inc~
## $ Count     <int> 917018, 2712566, 2603070, 2388242, 1845877, 3176958, 8290185~
table(education2018$Division)
## 
## East North Central East South Central    Middle Atlantic           Mountain 
##                  5                  5                  5                  5 
##        New England            Pacific     South Atlantic West North Central 
##                  5                  5                  5                  5 
## West South Central 
##                  5
table(education2018$Education)
## 
##                           Bachelor's degree 
##                                           9 
##             Graduate or professional degree 
##                                           9 
## High school graduate (includes equivalency) 
##                                           9 
##              Less than high school graduate 
##                                           9 
##          Some college or associate's degree 
##                                           9

2018 Population Estimates FIPS Codes Reference File

R Code:

state_georaw <- read_excel("state-geocodes-v2018.xlsx",
  skip=5) %>%
  mutate(Name = str_trim(Name)) %>% 
  rename(State_FIPS = `State (FIPS)`)
glimpse(state_georaw)
## Rows: 64
## Columns: 4
## $ Region     <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",~
## $ Division   <chr> "0", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2",~
## $ State_FIPS <chr> "00", "00", "09", "23", "25", "33", "44", "50", "00", "34",~
## $ Name       <chr> "Northeast Region", "New England Division", "Connecticut", ~
state_geo_Sta <- state_georaw %>%
  filter((State_FIPS != "00")&(!is.na(State_FIPS))) %>% 
  rename(StateName = Name)
glimpse(state_geo_Sta)
## Rows: 51
## Columns: 4
## $ Region     <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2",~
## $ Division   <chr> "1", "1", "1", "1", "1", "1", "2", "2", "2", "3", "3", "3",~
## $ State_FIPS <chr> "09", "23", "25", "33", "44", "50", "34", "36", "42", "17",~
## $ StateName  <chr> "Connecticut", "Maine", "Massachusetts", "New Hampshire", "~
table(state_geo_Sta$StateName)
## 
##              Alabama               Alaska              Arizona 
##                    1                    1                    1 
##             Arkansas           California             Colorado 
##                    1                    1                    1 
##          Connecticut             Delaware District of Columbia 
##                    1                    1                    1 
##              Florida              Georgia               Hawaii 
##                    1                    1                    1 
##                Idaho             Illinois              Indiana 
##                    1                    1                    1 
##                 Iowa               Kansas             Kentucky 
##                    1                    1                    1 
##            Louisiana                Maine             Maryland 
##                    1                    1                    1 
##        Massachusetts             Michigan            Minnesota 
##                    1                    1                    1 
##          Mississippi             Missouri              Montana 
##                    1                    1                    1 
##             Nebraska               Nevada        New Hampshire 
##                    1                    1                    1 
##           New Jersey           New Mexico             New York 
##                    1                    1                    1 
##       North Carolina         North Dakota                 Ohio 
##                    1                    1                    1 
##             Oklahoma               Oregon         Pennsylvania 
##                    1                    1                    1 
##         Rhode Island       South Carolina         South Dakota 
##                    1                    1                    1 
##            Tennessee                Texas                 Utah 
##                    1                    1                    1 
##              Vermont             Virginia           Washington 
##                    1                    1                    1 
##        West Virginia            Wisconsin              Wyoming 
##                    1                    1                    1
state_geo_Div <- state_georaw %>%
  filter((Division != "0")&(State_FIPS == "00")) %>%
  mutate(DivisionName = str_trim(str_replace(Name, "Division", " "))) %>%
  select(Division, DivisionName)
glimpse(state_geo_Div)
## Rows: 9
## Columns: 2
## $ Division     <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9"
## $ DivisionName <chr> "New England", "Middle Atlantic", "East North Central", "~
table(state_geo_Div$DivisionName)
## 
## East North Central East South Central    Middle Atlantic           Mountain 
##                  1                  1                  1                  1 
##        New England            Pacific     South Atlantic West North Central 
##                  1                  1                  1                  1 
## West South Central 
##                  1
state_geo_Reg <- state_georaw %>%
  filter((Region != "0")&(Division == "0")&(State_FIPS == "00")) %>%
  mutate(RegionName = str_trim(str_replace(Name, "Region", " "))) %>%
  select(Region, RegionName)
glimpse(state_geo_Reg)
## Rows: 4
## Columns: 2
## $ Region     <chr> "1", "2", "3", "4"
## $ RegionName <chr> "Northeast", "Midwest", "South", "West"
table(state_geo_Reg$RegionName)
## 
##   Midwest Northeast     South      West 
##         1         1         1         1
state_geo_all <- state_geo_Sta %>% 
  inner_join(state_geo_Div, by = "Division")
glimpse(state_geo_all)
## Rows: 51
## Columns: 5
## $ Region       <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2~
## $ Division     <chr> "1", "1", "1", "1", "1", "1", "2", "2", "2", "3", "3", "3~
## $ State_FIPS   <chr> "09", "23", "25", "33", "44", "50", "34", "36", "42", "17~
## $ StateName    <chr> "Connecticut", "Maine", "Massachusetts", "New Hampshire",~
## $ DivisionName <chr> "New England", "New England", "New England", "New England~
table(state_geo_all$DivisionName)
## 
## East North Central East South Central    Middle Atlantic           Mountain 
##                  5                  4                  3                  8 
##        New England            Pacific     South Atlantic West North Central 
##                  6                  5                  9                  7 
## West South Central 
##                  4
state_geo_all <- state_geo_all %>% 
  inner_join(state_geo_Reg, by = "Region")
glimpse(state_geo_all)
## Rows: 51
## Columns: 6
## $ Region       <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2~
## $ Division     <chr> "1", "1", "1", "1", "1", "1", "2", "2", "2", "3", "3", "3~
## $ State_FIPS   <chr> "09", "23", "25", "33", "44", "50", "34", "36", "42", "17~
## $ StateName    <chr> "Connecticut", "Maine", "Massachusetts", "New Hampshire",~
## $ DivisionName <chr> "New England", "New England", "New England", "New England~
## $ RegionName   <chr> "Northeast", "Northeast", "Northeast", "Northeast", "Nort~
table(state_geo_all$RegionName)
## 
##   Midwest Northeast     South      West 
##        12         9        17        13
rm(state_georaw, state_geo_Sta, state_geo_Div, state_geo_Reg)

American Community Survey B19013 Median Household Income in The Past 12 Months (in 2018 Inflation-Adjusted Dollars), 2018: ACS 1-Year Estimates Detailed Tables, Households

R Code:

medianincome <- read_csv("ACSDT1Y2018.B19013_data_with_overlays_2022-04-11T120015.csv") %>%
  filter(GEO_ID != "id") %>%
  mutate(
    MedianIncome = as.integer(B19013_001E),
    MedianIncomeMOE = as.integer(B19013_001M)) %>%
  rename(StateName = NAME) %>%
  select(-B19013_001E,-B19013_001M) 
## Rows: 53 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): B19013_001E, B19013_001M, GEO_ID, NAME
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(medianincome)
## Rows: 52
## Columns: 4
## $ GEO_ID          <chr> "0400000US01", "0400000US02", "0400000US04", "0400000U~
## $ StateName       <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi~
## $ MedianIncome    <int> 49861, 74346, 59246, 47062, 75277, 71953, 76348, 64805~
## $ MedianIncomeMOE <int> 783, 2288, 732, 713, 317, 655, 921, 1570, 3414, 384, 7~

Joining State-level Median Income file to State Geo file and generate ranking by Region

R Code:

medianincome_geo <- medianincome %>% 
  inner_join(state_geo_all, by = "StateName")
glimpse(medianincome_geo)
## Rows: 51
## Columns: 9
## $ GEO_ID          <chr> "0400000US01", "0400000US02", "0400000US04", "0400000U~
## $ StateName       <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi~
## $ MedianIncome    <int> 49861, 74346, 59246, 47062, 75277, 71953, 76348, 64805~
## $ MedianIncomeMOE <int> 783, 2288, 732, 713, 317, 655, 921, 1570, 3414, 384, 7~
## $ Region          <chr> "3", "4", "4", "3", "4", "4", "1", "3", "3", "3", "3",~
## $ Division        <chr> "6", "9", "8", "7", "9", "8", "1", "5", "5", "5", "5",~
## $ State_FIPS      <chr> "01", "02", "04", "05", "06", "08", "09", "10", "11", ~
## $ DivisionName    <chr> "East South Central", "Pacific", "Mountain", "West Sou~
## $ RegionName      <chr> "South", "West", "West", "South", "West", "West", "Nor~
table(medianincome_geo$RegionName)
## 
##   Midwest Northeast     South      West 
##        12         9        17        13
# Generate ranks by Region, retain group by
medianincome_Region <- medianincome_geo %>% 
  group_by(RegionName) %>%
  mutate(RegionRank = min_rank(-MedianIncome))
glimpse(medianincome_Region)
## Rows: 51
## Columns: 10
## Groups: RegionName [4]
## $ GEO_ID          <chr> "0400000US01", "0400000US02", "0400000US04", "0400000U~
## $ StateName       <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi~
## $ MedianIncome    <int> 49861, 74346, 59246, 47062, 75277, 71953, 76348, 64805~
## $ MedianIncomeMOE <int> 783, 2288, 732, 713, 317, 655, 921, 1570, 3414, 384, 7~
## $ Region          <chr> "3", "4", "4", "3", "4", "4", "1", "3", "3", "3", "3",~
## $ Division        <chr> "6", "9", "8", "7", "9", "8", "1", "5", "5", "5", "5",~
## $ State_FIPS      <chr> "01", "02", "04", "05", "06", "08", "09", "10", "11", ~
## $ DivisionName    <chr> "East South Central", "Pacific", "Mountain", "West Sou~
## $ RegionName      <chr> "South", "West", "West", "South", "West", "West", "Nor~
## $ RegionRank      <int> 13, 3, 9, 15, 2, 5, 3, 4, 1, 7, 6, 1, 11, 2, 11, 5, 7,~
table(medianincome_Region$RegionRank)
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 
##  4  4  4  4  4  4  4  4  4  3  3  3  2  1  1  1  1

Generate Top 5 Median Household Incomes Report

R Code:

medianincome_Region5 <- medianincome_Region %>%
  ungroup() %>%
  mutate(MedianIncome = scales::dollar(MedianIncome, negative_parens = TRUE)) %>%
  filter(RegionRank <= 5) %>%
  mutate(RegionName = as.factor(RegionName),
    StateName = as.factor(StateName)) %>%
  arrange(RegionName, RegionRank) %>%
  select(RegionName, RegionRank, StateName, MedianIncome) 

medianincome_Region5 %>%
  select(-RegionName) %>%
  kable(digits = 0, col.names = c("Region Rank", "State", "Median Income"), caption = "  Regional 'Top 5' State Median Household Incomes, 2018") %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left") %>% 
  pack_rows(index = table(medianincome_Region5$RegionName))
Regional ‘Top 5’ State Median Household Incomes, 2018

Region Rank   State                  Median Income
Midwest
          1   Minnesota                    $70,315
          2   Illinois                     $65,030
          3   North Dakota                 $63,837
          4   Wisconsin                    $60,773
          5   Iowa                         $59,955
Northeast
          1   New Jersey                   $81,740
          2   Massachusetts                $79,835
          3   Connecticut                  $76,348
          4   New Hampshire                $74,991
          5   New York                     $67,844
South
          1   District of Columbia         $85,203
          2   Maryland                     $83,242
          3   Virginia                     $72,577
          4   Delaware                     $64,805
          5   Texas                        $60,629
West
          1   Hawaii                       $80,212
          2   California                   $75,277
          3   Alaska                       $74,346
          4   Washington                   $74,073
          5   Colorado                     $71,953

Combine 2010 and 2018 files and create percentage change dataset

R Code:

population_change <- population2018 %>%
  inner_join(select(population2010, -GEO_ID), by = "StateName") %>%
  mutate(PercentChange = round(
    ((TotalPopulation-TotalPopulation2010) / TotalPopulation2010)
    ,3)) %>%
  arrange(PercentChange) %>% 
  mutate(PercentChangeCat =
    case_when(
      (PercentChange >= -2)&(PercentChange <= -0.001) ~ "Decrease",
      (PercentChange >= 0)&(PercentChange <= 0.049) ~ "0% to 5% Increase",
      (PercentChange >= .05)&(PercentChange <= 0.099) ~ "5% to 10% Increase",
      (PercentChange >= 0.1)&(PercentChange <= 2) ~ "10% or Higher Increase",
      TRUE ~ "Other"
  )) %>% 
  group_by(PercentChangeCat) %>%
  summarize(Frequency = n()) %>%
  mutate(
    Pct = scales::percent(round(Frequency/sum(Frequency),4))
    )
glimpse(population_change)
## Rows: 4
## Columns: 3
## $ PercentChangeCat <chr> "0% to 5% Increase", "10% or Higher Increase", "5% to~
## $ Frequency        <int> 25, 10, 13, 4
## $ Pct              <chr> "48.1%", "19.2%", "25.0%", "7.7%"
population_change %>%
  arrange(-Frequency) %>% 
  kable(col.names = c("Change Category", "# States", "Pct of Total"), caption = "Total State-level Population Change Counts, 2010 vs. 2018") %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
Total State-level Population Change Counts, 2010 vs. 2018

Change Category          # States   Pct of Total
0% to 5% Increase              25          48.1%
5% to 10% Increase             13          25.0%
10% or Higher Increase         10          19.2%
Decrease                        4           7.7%
population_change %>%
  arrange(-Frequency) %>% 
  mutate(PercentChangeCat = fct_relevel(PercentChangeCat, c("Decrease",
    "0% to 5% Increase",
    "5% to 10% Increase",
    "10% or Higher Increase"))) %>%
  ggplot(aes(x=PercentChangeCat, y=Frequency)) + 
  geom_bar(stat = 'identity', fill="gray", alpha=.7) +
  labs(title="Total State-level Population Change Counts, 2010 vs. 2018",
       x="Change Segment",
       y="Number of States")

Data Visualization

Bar Chart - Median Home Value in 2018

R Code:

# systematically pull US median home value from 2018 ACS
us_acs_2018_pull <- get_acs(
 #  cache_table = TRUE,
   survey = "acs1",
   year = 2018,
   geography = "US",
  variables = c(
    medianhomevalue = "B25077_001"
    )
  )
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2018 1-year ACS
glimpse(us_acs_2018_pull)
## Rows: 1
## Columns: 5
## $ GEOID    <chr> "1"
## $ NAME     <chr> "United States"
## $ variable <chr> "medianhomevalue"
## $ estimate <dbl> 229700
## $ moe      <dbl> 366
(us_2018_medhomeval <- us_acs_2018_pull$estimate[us_acs_2018_pull$variable=="medianhomevalue"])
## [1] 229700
# select states to highlight and create chart
states_gt_usmed <- select(arrange(filter(medianhomevalue, MedianHome > us_2018_medhomeval), -MedianHome), NAME)

medianhomevalue %>% 
  filter(MedianHome > us_2018_medhomeval) %>% 
  arrange(-MedianHome, -MedianHomeMOE) %>% 
  mutate(NAME = fct_relevel(NAME, as_vector(states_gt_usmed))) %>%
  ggplot(aes(x=NAME, y=MedianHome)) +
  geom_bar(stat = 'identity', fill="gray", alpha=.7) +
  labs(title="Owner-occupied Median Home Value in 2018", 
    subtitle=paste("States Exceeding U.S. Median of", scales::dollar(us_2018_medhomeval)),
       x="State",
       y="Median Home Value") +
  theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1)) +
  scale_y_continuous(labels = scales::label_dollar()) + 
  geom_hline(yintercept = us_2018_medhomeval, color="darkgreen") + 
  geom_text(aes(20, us_2018_medhomeval, label = paste("US Median Value ", scales::dollar(us_2018_medhomeval)), vjust = 1.2), color="darkgreen")

Bar Chart - States with Population Exceeding 10,000,000 in 2018

R Code:

# select states to highlight and create chart
states_pop_10mm <- select(arrange(filter(population2018, TotalPopulation > 10000000), -TotalPopulation), StateName)

population2018 %>% 
  filter(TotalPopulation > 10000000) %>% 
  arrange(-TotalPopulation, StateName) %>% 
  mutate(StateName = fct_relevel(StateName, as_vector(states_pop_10mm))) %>%
  ggplot(aes(x=StateName, y=TotalPopulation)) +
  geom_bar(stat = 'identity', fill="gray", alpha=.7) +
  labs(title="States with Population Exceeding 10,000,000 in 2018", 
       x="State",
       y="# Population Estimate") +
  theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1)) +
  scale_y_continuous(labels = scales::label_comma()) + 
  geom_hline(yintercept = 10000000, color="darkgreen")

Stacked Bar Chart - Educational Attainment in 2018

R Code:

# reorder and factor educational attainment
education_order <- c(
  "Graduate or professional degree",
  "Bachelor's degree",
  "Some college or associate's degree",
  "High school graduate (includes equivalency)",
  "Less than high school graduate")

education2018 %>% 
  mutate(Education = fct_relevel(Education,  as_vector(education_order))) %>%
  ggplot(aes(x=Division, y=Count)) +
  geom_bar(stat = 'identity', aes(fill=Education)) +
  labs(title="Educational Attainment in 2018", 
       x="Division",
       y="# Estimate") +
  theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1), 
    legend.title = element_text(color = "black", size = 8),
  legend.text = element_text(color = "blue", size = 6)) +
  scale_y_continuous(labels = scales::label_comma())

100% Stacked Bar Chart - Educational Attainment in 2018, let ggplot generate the percentages

R Code:

# reorder and factor educational attainment
education_order <- c(
  "Graduate or professional degree",
  "Bachelor's degree",
  "Some college or associate's degree",
  "High school graduate (includes equivalency)",
  "Less than high school graduate")

education2018 %>% 
  mutate(Education = fct_relevel(Education,  as_vector(education_order))
    ) %>% 
  ggplot(aes(x=Division, y=Count, fill=Education)) +
  geom_bar(stat = 'identity', position= "fill") +
  labs(title="Distribution of Educational Attainment Levels in 2018 v.1", 
       x="Division",
       y="# Estimate") +
  theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1), 
    legend.title = element_text(color = "black", size = 10),
  legend.text = element_text(color = "blue", size = 7)) +
  scale_y_continuous(labels = scales::label_percent())

100% Stacked Bar Chart - Educational Attainment in 2018 version 2, preprocess data percentages

R Code:

# reorder and factor educational attainment
education_order <- c(
  "Graduate or professional degree",
  "Bachelor's degree",
  "Some college or associate's degree",
  "High school graduate (includes equivalency)",
  "Less than high school graduate")

education2018 %>% 
  group_by(Division) %>% 
  mutate(
    Education = fct_relevel(Education,  as_vector(education_order)),
    percentage = round(Count/sum(Count),3), 
    cumpercent = cumsum(percentage)
    ) %>%
  ungroup() %>%
  ggplot(aes(x=Division, y=percentage, fill=Education)) +
  geom_bar(stat = 'identity', position= "fill") +
  labs(title="Distribution of Educational Attainment Levels in 2018 v.2",
       x="Division",
       y="% Total Estimates") +
  theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1),
    legend.title = element_text(color = "black", size = 10),
  legend.text = element_text(color = "blue", size = 7)) +
  scale_y_continuous(labels = scales::label_percent()) +
  geom_text(size=3, aes(x=Division,y=cumpercent,label=scales::percent(percentage,accuracy = 0.1),vjust=1))

Analyze Census Bureau Data

2018 Household Median Income, by Region

R Code:

# Prepare data for faceted visualization

binwidth <- 8500

medianincome_all_labels <- medianincome_Region %>%
  ungroup() %>% 
  summarize(
    med_inc_N = length(MedianIncome),
    med_inc_mean = round(mean(MedianIncome),2),
    med_inc_median = median(MedianIncome),
    med_inc_max = round(max(MedianIncome),0),
    med_inc_sd = round(sd(MedianIncome),2),
    label_all = paste0(
      "N = ",med_inc_N,"\n",
      "Mean = ",scales::dollar(round(med_inc_mean,2)),"\n",
      "Median = ",scales::dollar(round(med_inc_median,2)),"\n",
      "Std Dev = ",scales::dollar(round(med_inc_sd,2)),"\n"))

medianincome_reg_labels <- medianincome_Region %>%
  group_by(RegionName) %>% 
  summarize(
    med_inc_reg_N = length(MedianIncome),
    med_inc_reg_mean = round(mean(MedianIncome),2),
    med_inc_reg_median = median(MedianIncome),
    med_inc_reg_max = round(max(MedianIncome),0),
    med_inc_reg_sd = round(sd(MedianIncome),2),
    label_reg = paste0(
      "N = ",med_inc_reg_N,"\n",
      "Mean = ",scales::dollar(round(med_inc_reg_mean,2)),"\n",
      "Median = ",scales::dollar(round(med_inc_reg_median,2)),"\n",
      "Std Dev = ",scales::dollar(round(med_inc_reg_sd,2)),"\n"))


# Faceted by Region - different approach using density, edited
plot_density_income <- medianincome_Region %>% 
  ggplot(aes(x=MedianIncome)) +
  geom_histogram(aes(y=..density..), binwidth = binwidth, fill="gray", alpha=.7, color="black") +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::label_dollar(),
    breaks=c(28000, 36000, 44000, 52000, 60000, 68000, 76000,
      84000, 92000, 100000),
    limits = c(30000, 92000)) +
  theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1)) +
  labs(title="Median Household Income in the Past 12 Months, 2018 ACS",
       x="Inflation-Adjusted Dollars",
       y="Number of States (density)") +
  geom_density(color = "red", adjust = .79)

# Using saved plot object above, All Regions combined with text
plot_density_income  +
  stat_function(color = "blue", fun=dnorm,
    args = with(medianincome_Region, c(
      mean = mean(MedianIncome), 
      sd = sd(MedianIncome)))
    ) +
  geom_text(medianincome_all_labels,
    mapping=aes(
    x = 80000 ,
    y = .000035 ,
    label = label_all))
## Warning: Removed 2 rows containing missing values (geom_bar).

# Using saved plot object above, by Region with text
plot_density_income +
  facet_wrap(~RegionName) + 
  geom_line(color = "blue", aes(y = dnorm(MedianIncome,
    mean = tapply(MedianIncome, RegionName, mean, na.rm = TRUE)[RegionName],
    sd = tapply(MedianIncome, RegionName, sd, na.rm = TRUE)[RegionName]))) + 
  geom_text(medianincome_reg_labels,
    mapping=aes(
    x = 80000 ,
    y = .000075 ,
    label = label_reg))
## Warning: Removed 8 rows containing missing values (geom_bar).

2018 Household Median Income, One-way ANOVA

R Code:

# Descriptive statistics

psych::describeBy(medianincome_Region$MedianIncome, group = medianincome_Region$RegionName, mat = TRUE, digits = 2) %>% 
  select(Region=group1, N=n, Mean=mean, Median=median, SD=sd, Min=min, Max=max, Skew=skew, Kurtosis=kurtosis, SEM=se) %>% 
  kable(align=c("lrrrrrrrr"), digits=2, row.names = FALSE,
        caption="Median Household Income in the Past 12 Months, 2018 ACS") %>% 
  kable_styling(bootstrap_options=c("bordered", "responsive","striped"), full_width = FALSE)
Median Household Income in the Past 12 Months, 2018 ACS

Region      N      Mean   Median        SD     Min     Max   Skew   Kurtosis      SEM
Midwest    12  59750.00    58892   4652.88   54478   70315   0.88      -0.34   1343.17
Northeast   9  69154.11    67844   9387.49   55602   81740   0.00      -1.77   3129.16
South      17  57354.29    52375  12446.36   44097   85203   1.08      -0.08   3018.69
West       13  65250.54    63426   9929.57   47169   80212  -0.16      -1.38   2753.97
medianincome_Region %>%
  ggplot(aes(RegionName,MedianIncome,fill=RegionName)) +
  geom_jitter(colour = "dark gray",width=.1) +
  stat_boxplot(geom ='errorbar',width = 0.4) +
  geom_boxplot()+
  labs(title="Median Household Income in the Past 12 Months, 2018 ACS", 
       x = "Region",
       y = "Inflation-Adjusted Dollars",
       subtitle ="Gray dots=sample data points, Black dot=outlier, Blue dot=mean, Red=99% confidence interval",
       caption = "Boxplot, dotplot and SEM plot of mileage by Region") +
  guides(fill="none") +
  stat_summary(fun.data = "mean_cl_normal", colour = "red", size = 1.5, fun.args = list(conf.int=.99)) +
  stat_summary(geom="point", fun=mean, color="blue") +
  theme_bw() + 
  scale_y_continuous(labels = scales::label_dollar())

# create model objects

Income_aov <- aov(MedianIncome ~ RegionName, medianincome_Region)
class(Income_aov)
## [1] "aov" "lm"
names(Income_aov)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "contrasts"     "xlevels"       "call"          "terms"        
## [13] "model"
summary(Income_aov)
##             Df    Sum Sq   Mean Sq F value Pr(>F)  
## RegionName   3 1.026e+09 341878315   3.489 0.0228 *
## Residuals   47 4.605e+09  97976327                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Income_lm <- lm(MedianIncome ~ RegionName, medianincome_Region)
summary(Income_lm)  
## 
## Call:
## lm(formula = MedianIncome ~ RegionName, data = medianincome_Region)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18082  -6304  -1892   6433  27849 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            59750       2857  20.911   <2e-16 ***
## RegionNameNortheast     9404       4365   2.155   0.0364 *  
## RegionNameSouth        -2396       3732  -0.642   0.5240    
## RegionNameWest          5500       3962   1.388   0.1716    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9898 on 47 degrees of freedom
## Multiple R-squared:  0.1822, Adjusted R-squared:   0.13 
## F-statistic: 3.489 on 3 and 47 DF,  p-value: 0.02282
  • The overall analysis of variance table returns a p-value of 0.0228, so we reject the null hypothesis at the α = 0.05 significance level (95% confidence). A one-way ANOVA showed a significant effect for Region on Median Income, F(3,47)=3.489, p<.05. This suggests that there are at least two regions where the mean of the median household income values is significantly different.

  • Alternatively, we can try the Kruskal-Wallis non-parametric test.

R Code:

kruskal.test(MedianIncome ~ RegionName, medianincome_Region)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  MedianIncome by RegionName
## Kruskal-Wallis chi-squared = 11.36, df = 3, p-value = 0.009928
  • The p-value of 0.009928 indicates that there are significant differences between the treatment groups.

  • But which of the 6 specific pairs of Regions are different? Let’s do the pairwise tests.

R Code:

# pairwise.t.test(medianincome_Region$MedianIncome, medianincome_Region$RegionName, p.adjust.method = "none")

lsr::posthocPairwiseT(Income_aov,p.adjust.method = "none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  MedianIncome and RegionName 
## 
##           Midwest Northeast South 
## Northeast 0.0364  -         -     
## South     0.5240  0.0058    -     
## West      0.1716  0.3677    0.0355
## 
## P value adjustment method: none
  • With no p-value adjustment, Northeast + South are different at α = 0.01, while Northeast + Midwest and West + South are different at α = 0.05; the remaining 3 pairs are not significantly different. But we need to account for making these multiple simultaneous comparisons (akin to rolling the dice multiple times; see the quick calculation below), so a test that adjusts for them is needed.
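
A quick sketch of why adjustment matters: with 6 pairwise comparisons each run at α = 0.05, the chance of at least one false positive is well above 5% (this rough calculation treats the comparisons as independent).

R Code:

# Approximate family-wise error rate for 6 independent tests at alpha = 0.05
1 - (1 - 0.05)^6
## [1] 0.2649081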

  • Let’s look at Tukey’s Honestly Significant Difference (HSD)

R Code:

TukeyHSD(Income_aov, conf.level = 0.95) 
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = MedianIncome ~ RegionName, data = medianincome_Region)
## 
## $RegionName
##                         diff        lwr        upr     p adj
## Northeast-Midwest   9404.111  -2220.890 21029.1120 0.1511736
## South-Midwest      -2395.706 -12335.540  7544.1283 0.9178251
## West-Midwest        5500.538  -5053.117 16054.1943 0.5129279
## South-Northeast   -11799.817 -22667.480  -932.1535 0.0285604
## West-Northeast     -3903.573 -15335.347  7528.2014 0.7998888
## West-South          7896.244  -1816.897 17609.3858 0.1480457
TukeyHSD(Income_aov, conf.level = 0.99)
##   Tukey multiple comparisons of means
##     99% family-wise confidence level
## 
## Fit: aov(formula = MedianIncome ~ RegionName, data = medianincome_Region)
## 
## $RegionName
##                         diff        lwr       upr     p adj
## Northeast-Midwest   9404.111  -4946.529 23754.751 0.1511736
## South-Midwest      -2395.706 -14666.069  9874.657 0.9178251
## West-Midwest        5500.538  -7527.565 18528.642 0.5129279
## South-Northeast   -11799.817 -25215.551  1615.917 0.0285604
## West-Northeast     -3903.573 -18015.681 10208.536 0.7998888
## West-South          7896.244  -4094.275 19886.763 0.1480457
  • The more conservative Tukey HSD test yields only one significant pairing, Northeast + South, at α = 0.05. Let’s look at errorbar plots:

R Code:

# from https://www.r-bloggers.com/2017/09/oneway-anova-explanation-and-example-in-r/
# def.par <- par(no.readonly = TRUE) # save default, for resetting...
#par(def.par)  # reset to default
# par()$oma # current margins
# par(oma=c(0,0,0,0)) # put the margins back
def.par1 <- par(no.readonly = TRUE) # save default
par(oma=c(0,4,0,0)) # add 4 lines of space to the left outer margin
TukeyHSD(Income_aov, conf.level = 0.95) %>% 
  plot(las=1, col = "red", cex.lab=1)

par(def.par1)  # reset to default
# only 1 pair does not cross zero -- Northeast + South

# Improve upon default plot with ggplot errorbar of 95% CI Tukey HSD
TukeyHSD_Income_aov <- as.data.frame(TukeyHSD(Income_aov, conf.level = 0.95)["RegionName"]) %>%
  rename(Difference = RegionName.diff,
    Lower = RegionName.lwr,
    Upper = RegionName.upr,
    PValAdj = RegionName.p.adj)
TukeyHSD_Income_aov$RegionPair <- row.names(TukeyHSD_Income_aov)

TukeyHSD_Income_aov %>% 
  mutate(Significance = case_when(
    PValAdj < .0001 ~ "pval < .0001",
    PValAdj < .001 ~ "pval < .001",
    PValAdj < .01 ~ "pval < .01",
    PValAdj < .05 ~ "pval < .05",
    TRUE ~ "pval not signif."),
    RegionPair = fct_rev(factor(RegionPair))
      ) %>% 
  ggplot(aes(RegionPair, Difference, color=Significance)) +
  geom_point() +
  geom_hline(yintercept=0, linetype="dashed", color="blue") + 
  geom_errorbar(aes(ymin = Lower, ymax = Upper)) +
  scale_y_continuous(labels = scales::label_dollar()) + 
  labs(title="Tukey's Honestly Significant Difference Test 95% CI",
    subtitle="Median Household Income in the Past 12 Months, 2018 ACS", 
    x="Region Pairs", 
    y="Difference in mean levels") +
  theme_bw() + 
  coord_flip()

  • We’re content enough using Tukey’s HSD, but let’s try other multiple pairwise-comparison methods between groups and see where they land (subjectively ordered from less preferred to more preferred). A compact p.adjust() comparison is sketched below, then each test is run in turn.
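
As a compact illustration (a sketch only), p.adjust() can rescale the six unadjusted t-test p-values from the "none" table above under each correction; note the Benjamini & Hochberg run below uses Wilcoxon tests instead, so its values will not match this sketch exactly:

R Code:

# Unadjusted pairwise p-values copied from the "none" table above
p_raw <- c("Northeast-Midwest" = 0.0364, "South-Midwest"   = 0.5240,
           "West-Midwest"      = 0.1716, "South-Northeast" = 0.0058,
           "West-Northeast"    = 0.3677, "West-South"      = 0.0355)
# How each correction inflates the same raw p-values
sapply(c("bonferroni", "holm", "BH"), function(m) p.adjust(p_raw, method = m))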

  • Benjamini & Hochberg test:

R Code:

pairwise.wilcox.test(medianincome_Region$MedianIncome,
  medianincome_Region$RegionName,
  p.adjust.method = "BH") # Benjamini & Hochberg
## 
##  Pairwise comparisons using Wilcoxon rank sum exact test 
## 
## data:  medianincome_Region$MedianIncome and medianincome_Region$RegionName 
## 
##           Midwest Northeast South
## Northeast 0.044   -         -    
## South     0.109   0.044     -    
## West      0.202   0.292     0.070
## 
## P value adjustment method: BH
  • Northeast + South and Northeast + Midwest are different at α = 0.05; the remaining 4 pairs are not significantly different.

  • Bonferroni correction test:

R Code:

pairwise.t.test(medianincome_Region$MedianIncome,
  medianincome_Region$RegionName, 
  p.adjust.method = "bonferroni") # Bonferroni correction
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  medianincome_Region$MedianIncome and medianincome_Region$RegionName 
## 
##           Midwest Northeast South
## Northeast 0.218   -         -    
## South     1.000   0.035     -    
## West      1.000   1.000     0.213
## 
## P value adjustment method: bonferroni
  • Northeast + South are different at α = 0.05

  • Holm method or Bonferroni–Holm method test:

R Code:

pairwise.t.test(medianincome_Region$MedianIncome,
  medianincome_Region$RegionName, 
  p.adjust.method = "holm") # Sture Holm or Bonferroni–Holm
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  medianincome_Region$MedianIncome and medianincome_Region$RegionName 
## 
##           Midwest Northeast South
## Northeast 0.177   -         -    
## South     0.735   0.035     -    
## West      0.515   0.735     0.177
## 
## P value adjustment method: holm
# lsr::posthocPairwiseT(Income_aov,p.adjust.method = "holm")
# lsr::posthocPairwiseT(Income_aov) # holm is default
  • Northeast + South are different at α = 0.05

  • Now, let’s look at some plots:

R Code:

psych::describe(residuals(object = Income_aov ))
##    vars  n mean      sd   median trimmed     mad       min      max    range
## X1    1 51    0 9596.76 -1892.29 -699.66 9424.62 -18081.54 27848.71 45930.24
##    skew kurtosis      se
## X1 0.76     0.46 1343.81
def.par1 <- par(no.readonly = TRUE) # save default
par(mfrow = c(2, 2)) 

# plot(Income_aov, ask=FALSE)
# Residuals vs. Fitted
plot(Income_aov, 1)
# Normal Q-Q - not quite normal?
plot(Income_aov, 2)
# Scale-Location
plot(Income_aov, 3)
# Cook's Distance
plot(Income_aov, 4)

# # Residuals vs. Leverage
# plot(Income_aov, 5)
# # Cook's Distance vs. Leverage
# plot(Income_aov, 6)

par(def.par1)  # reset to default

# histogram of the residuals
# hist(x = residuals(Income_aov)) 
Income_aov_resids <- residuals(Income_aov)
hist(x = Income_aov_resids, prob=TRUE,
  main = "",
  xlab = "Histogram: Median Income ~ Region residuals") 
curve(dnorm(x, mean=mean(Income_aov_resids),
  sd=sd(Income_aov_resids)),
  col="darkblue", lwd=2, add=TRUE, yaxt="n")

What about other tests? These two tests of the normality assumption require a p-value above the significance level for the assumption to hold:

R Code:

shapiro.test(x = residuals(object = Income_aov ))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(object = Income_aov)
## W = 0.95295, p-value = 0.04189
  • The p-value of 0.04189 is less than α = 0.05, so we reject the null hypothesis of normality and the residuals may NOT be normal (here we want to NOT reject the null). Let’s perform the other test.

  • Kolmogorov-Smirnov test against a normal distribution.

R Code:

ks.test(residuals(object = Income_aov), "pnorm",
  mean(residuals(object = Income_aov)), 
  sd(residuals(object = Income_aov)))
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  residuals(object = Income_aov)
## D = 0.12292, p-value = 0.3922
## alternative hypothesis: two-sided
  • Here we want to NOT reject the null, and the p-value of 0.3922 gives no reason to suspect our data are significantly non-normal. However, this contradicts the result we got from Shapiro-Wilk earlier.

  • Before we forget, let’s calculate the Effect Size:

R Code:

# Effect size is how big the differences are; for Oneway ANOVA the appropriate measure of effect size is eta squared (η²)
(eta_squared <- var(predict(Income_aov)) / var(medianincome_Region$MedianIncome))
## [1] 0.1821563
# 0.1821563
# or you can use
# lsr::etaSquared(Income_aov)

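As a cross-check (a sketch using the same Income_aov object), eta squared is the between-groups sum of squares divided by the total sum of squares, which can be pulled straight from the ANOVA table:

R Code:

# eta^2 = SS_between / SS_total, taken from the aov summary table
ss <- summary(Income_aov)[[1]][["Sum Sq"]]
ss[1] / sum(ss)   # should agree with the value above, ~0.182
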
For the assumption of homogeneity of variance / homoscedasticity, let’s try 3 tests that require a p-value above the significance level:

  • Bartlett test - the oldest, the most mathematically complex, and sensitive to violations of normality. (“Not recommended for routine use.”)

R Code:

bartlett.test(MedianIncome ~ RegionName, medianincome_Region) # yet another way of testing
## 
##  Bartlett test of homogeneity of variances
## 
## data:  MedianIncome by RegionName
## Bartlett's K-squared = 9.683, df = 3, p-value = 0.02146
# 0.02146
  • Levene test - using deviations from the mean. R’s car package Levene test yielded an “unfortunate” p-value of 0.03233, because its calculation is based on absolute deviations from group means. This differs from the SAS PROC GLM Levene test (hovtest=levene), whose default TYPE=SQUARE setting yields a more “desirable” p-value of 0.1542, because that calculation is based on squared deviations from group means. While SAS has options to change the calculation, we need a workaround in R to mimic the SAS default.

R Code:

# In car::leveneTest the default center is the median (technically a Brown-Forsythe test); to get a true Levene test you must specify center = mean
# car::leveneTest(Income_aov, center = mean) 
# 0.03233
# retest using regular dataset (not model) has same results
# car::leveneTest(MedianIncome ~ RegionName, medianincome_Region, center = mean)
# equivalent to absolute deviations from group means - the non-default SAS PROC GLM hovtest=abs
car::leveneTest(residuals(Income_lm) ~ factor(medianincome_Region$RegionName), center = mean)
## Levene's Test for Homogeneity of Variance (center = mean)
##       Df F value  Pr(>F)  
## group  3  3.1826 0.03233 *
##       47                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • If we want to replicate SAS’s default Levene test, which uses squared deviations, we regress the squared residuals on the grouping factor and run anova on that lm.

R Code:

# car::leveneTest only offers absolute deviations from the group center; to use squared deviations from group means, fit an lm of the squared residuals on the grouping factor and pass it to anova
anova(lm(residuals(Income_lm)^2 ~ medianincome_Region$RegionName))
## Analysis of Variance Table
## 
## Response: residuals(Income_lm)^2
##                                Df     Sum Sq    Mean Sq F value Pr(>F)
## medianincome_Region$RegionName  3 1.1323e+17 3.7742e+16  1.8323 0.1542
## Residuals                      47 9.6810e+17 2.0598e+16
# 0.1542
  • Brown-Forsythe test - using deviations from the median, making it more robust.

R Code:

car::leveneTest(Income_aov) # center = median by default, so technically a Brown-Forsythe test
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  3  1.5919 0.2039
##       47
# 0.2039
# car::leveneTest(Income_aov, , center = median)
  • With the Bartlett test p-value falling below the significance level, the two versions of the Levene test falling on either side depending on whether absolute or squared deviations are used, and the Brown-Forsythe test falling above it, the evidence is mixed: we cannot comfortably conclude that the group variances are equal, so we may have reason to suspect heteroscedasticity.

  • Let us try Welch’s ANOVA where we can drop the assumption that all populations have equal variances:

R Code:

oneway.test(MedianIncome ~ RegionName, medianincome_Region) # not assuming equal variances
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  MedianIncome and RegionName
## F = 3.5881, num df = 3.000, denom df = 22.694, p-value = 0.02937
# 0.02937
  • In contrast, recall the default / usual ANOVA where we assume that all populations have equal variances:

R Code:

oneway.test(MedianIncome ~ RegionName, medianincome_Region, var.equal = TRUE) # assume equal variances
## 
##  One-way analysis of means
## 
## data:  MedianIncome and RegionName
## F = 3.4894, num df = 3, denom df = 47, p-value = 0.02282
# 0.02282
# same as the calculation earlier
summary(Income_aov)
##             Df    Sum Sq   Mean Sq F value Pr(>F)  
## RegionName   3 1.026e+09 341878315   3.489 0.0228 *
## Residuals   47 4.605e+09  97976327                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 0.0228
  • Given the speed bump we encountered with mixed results from our 3 tests for homoscedasticity, the Welch ANOVA p-value of 0.02937 still allows us to reject the null hypothesis at the α = 0.05 significance level and infer that at least one pair of Regions has significantly different means. That pair is Northeast + South, for which the Tukey HSD test yields a p-value of 0.0285604, significant at the α = 0.05 level.

Correlation Analysis

Read in pre-processed dataset provided by course

R Code:

state_combined <- read_excel("stateinfo_combined.xlsx") %>% 
  mutate(TotalPopulation = as.integer(TotalPopulation))
glimpse(state_combined)
## Rows: 51
## Columns: 10
## $ GEO_ID                        <chr> "0400000US08", "0400000US18", "0400000US~
## $ State                         <chr> "Colorado", "Indiana", "Kentucky", "Loui~
## $ Division                      <chr> "Mountain", "East North Central", "East ~
## $ Region                        <chr> "West", "Midwest", "South", "South", "Mi~
## $ MedianIncome                  <dbl> 71953, 55746, 50247, 47905, 65030, 59955~
## $ MeanHoursWorked               <dbl> 39.2, 39.0, 38.9, 39.7, 38.6, 38.9, 38.6~
## $ TotalPopulation               <int> 5695564, 6691878, 4468402, 4659978, 1274~
## $ MedianAge                     <dbl> 36.9, 37.8, 39.1, 37.3, 38.3, 38.1, 43.1~
## $ MedianCurrentMarriageDuration <dbl> 17.7, 19.8, 19.8, 19.7, 20.5, 21.8, 22.2~
## $ MedianMonthlyHousingCosts     <dbl> 1335, 838, 776, 806, 1109, 839, 1314, 70~

Calculate Pearson correlation coefficients between Median Household Income and five potential predictors, and plot.

R Code:

pcc_show <- function(xvec, xlab) { 
  pccInc_Calc <- cor.test(xvec,
  state_combined$MedianIncome,method = "pearson")
  xoffset <- min(xvec) + sd(xvec)
  Ncount <- length(xvec)
  pvaltxt <- case_when(
    pccInc_Calc$p.value < .0001 ~ "pval < .0001",
    pccInc_Calc$p.value < .001 ~ "pval < .001",
    pccInc_Calc$p.value < .01 ~ "pval < .01",
    pccInc_Calc$p.value < .05 ~ "pval < .05",
    TRUE ~ "not signif. pval")
state_combined %>% 
  ggplot(aes(x= xvec, 
    y= MedianIncome)) +
  geom_point(alpha= 0.5, color="red") + 
  scale_x_continuous(labels = scales::label_comma()) + 
  scale_y_continuous(labels = scales::label_dollar(),
    limits = c(30000, 90000)) + 
  geom_smooth(method= lm, se= FALSE) + 
  labs(title=paste0("Scatter Plot: Median Household Income vs. ",xlab),
       x= xlab,
       y= "Median Household Income") + 
  geom_text(mapping=aes(x = xoffset, y = 35000,
    label = paste0(
      "Observations = ",Ncount,"\n",
      "Pearson Corr. = ",round(pccInc_Calc$estimate,4),"\n",
      " ",pvaltxt),
      )) + 
  theme_bw() 
}

# pcc_show(xvec, xlab)
pcc_show(state_combined$MeanHoursWorked, "Mean Hours Worked")
## `geom_smooth()` using formula 'y ~ x'

pcc_show(state_combined$TotalPopulation, "Total Population")
## `geom_smooth()` using formula 'y ~ x'

pcc_show(state_combined$MedianAge, "Median Age")
## `geom_smooth()` using formula 'y ~ x'

pcc_show(state_combined$MedianCurrentMarriageDuration, "Median Current Marriage Duration")
## `geom_smooth()` using formula 'y ~ x'

pcc_show(state_combined$MedianMonthlyHousingCosts, "Median Monthly Housing Costs")
## `geom_smooth()` using formula 'y ~ x'

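# (Sketch) A compact numeric summary of all five Pearson coefficients at once,
# using the same state_combined data frame as the plots above
predictors <- c("MeanHoursWorked", "TotalPopulation", "MedianAge",
                "MedianCurrentMarriageDuration", "MedianMonthlyHousingCosts")
sapply(predictors, function(v)
  cor(state_combined$MedianIncome, state_combined[[v]], method = "pearson"))
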
cor.test(state_combined$MedianIncome, state_combined$MedianMonthlyHousingCosts, method = "pearson", conf.level = 0.95)
## 
##  Pearson's product-moment correlation
## 
## data:  state_combined$MedianIncome and state_combined$MedianMonthlyHousingCosts
## t = 18.442, df = 49, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8881712 0.9625128
## sample estimates:
##      cor 
## 0.934919
  • The strongest positive relationship observed is between Median Household Income and Median Monthly Housing Costs, at 0.9349.
    Let’s model this!

Linear Regression

R Code:

# ANOVA table
Income_HousingCosts_aov <- aov(MedianIncome ~ MedianMonthlyHousingCosts, state_combined)
# summary(Income_HousingCosts_aov)
Income_HousingCosts_aov_df <- broom::tidy(Income_HousingCosts_aov) %>%
mutate(Significance = case_when(
    p.value < .0001 ~ "pval < .0001",
    p.value < .001 ~ "pval < .001",
    p.value < .01 ~ "pval < .01",
    p.value < .05 ~ "pval < .05",
    is.na(p.value) ~ " ",
    TRUE ~ "pval not signif."))

Income_HousingCosts_aov_df %>% 
  kable(align=c("lrrrrrrl"), digits=2, row.names = FALSE,
        caption="ANOVA: Median Household Income ~ MedianMonthlyHousingCosts") %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
ANOVA: Median Household Income ~ MedianMonthlyHousingCosts
term df sumsq meansq statistic p.value Significance
MedianMonthlyHousingCosts 1 4921490297 4921490297 340.12 0 pval < .0001
Residuals 49 709032041 14470042 NA NA
#  use standard lm to create model
Income_HousingCosts_lm <-lm(MedianIncome ~ MedianMonthlyHousingCosts, state_combined)
# summary(Income_HousingCosts_lm)
Income_HousingCosts_lm_df <- cbind(broom::tidy(Income_HousingCosts_lm),confint(Income_HousingCosts_lm)) %>%
mutate(Significance = case_when(
    p.value < .0001 ~ "pval < .0001",
    p.value < .001 ~ "pval < .001",
    p.value < .01 ~ "pval < .01",
    p.value < .05 ~ "pval < .05",
    is.na(p.value) ~ " ",
    TRUE ~ "pval not signif."))

glance(Income_HousingCosts_lm) %>% 
mutate(Significance = case_when(
    p.value < .0001 ~ "pval < .0001",
    p.value < .001 ~ "pval < .001",
    p.value < .01 ~ "pval < .01",
    p.value < .05 ~ "pval < .05",
    is.na(p.value) ~ " ",
    TRUE ~ "pval not signif.")) %>%
  kable(align=c("llrrrrrrrrrl"), digits=3, row.names = FALSE,
        caption="Linear Regression: Median Household Income ~ MedianMonthlyHousingCosts") %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
Linear Regression: Median Household Income ~ MedianMonthlyHousingCosts
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs Significance
0.874 0.872 3803.951 340.116 0 1 -491.779 989.559 995.354 709032041 49 51 pval < .0001
Income_HousingCosts_lm_df %>% 
  kable(align=c("lrrrrrrl"), digits=3, row.names = FALSE,
        caption="Parameter Estimates: Median Household Income ~ MedianMonthlyHousingCosts") %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
Parameter Estimates: Median Household Income ~ MedianMonthlyHousingCosts
term estimate std.error statistic p.value 2.5 % 97.5 % Significance
(Intercept) 23954.107 2131.320 11.239 0 19671.059 28237.156 pval < .0001
MedianMonthlyHousingCosts 35.976 1.951 18.442 0 32.056 39.896 pval < .0001
  • The 95% confidence interval for the MedianMonthlyHousingCosts slope does not include zero, so we know that the result is significant at the 5% level.
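
To make the parameter estimates concrete, here is a sketch of using the fitted model to predict for a hypothetical state whose median monthly housing costs are $1,000 (roughly 23,954 + 35.976 × 1,000 ≈ $59,930), along with a 95% prediction interval:

R Code:

# Predicted median income for a hypothetical $1,000 median monthly housing cost
predict(Income_HousingCosts_lm,
        newdata = data.frame(MedianMonthlyHousingCosts = 1000),
        interval = "prediction")
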
Fit Diagnostics

R Code:

augment(Income_HousingCosts_lm) %>% 
  rename(Predicted = ".fitted") %>%
ggplot(aes(x = Predicted, y = MedianIncome)) +
  geom_point(pch=1, color="blue") +
  geom_abline(intercept = 0, slope = 1, color = "red", size = 1) + 
  scale_x_continuous(labels = scales::label_dollar()) +
  scale_y_continuous(labels = scales::label_dollar()) + 
  labs(title="Scatter Plot: Observed by Predicted for Median Income",
    x="Predicted Income", 
    y="Observed Income") +
  theme_bw()

# 
def.par1 <- par(no.readonly = TRUE) # save default
par(mfrow = c(2, 2)) 

# Residuals vs. Fitted
plot(Income_HousingCosts_lm, 1)
# Normal Q-Q 
plot(Income_HousingCosts_lm, 2)
# Scale-Location
plot(Income_HousingCosts_lm, 3)
# Cook's Distance
plot(Income_HousingCosts_lm, 4)

# # Residuals vs. Leverage
# plot(Income_HousingCosts_lm, 5)
# # Cook's Distance vs. Leverage
# plot(Income_HousingCosts_lm, 6)

par(def.par1)  # reset to default
  • The first is a constant-variance plot, which checks for homogeneity of the variance and the linear relation. In this Residuals vs. Fitted plot, there does not appear to be a pattern that would show a change in variance; the errors appear randomly scattered around zero. But we will try a non-visual test for homoscedasticity later.
  • The second is a Q-Q plot, which checks that the residuals follow a normal distribution. The points should fall on a line if the normality assumption is met. Things look relatively normal.
  • The third plot, Scale-Location, helps detect heterogeneity of the variance.
  • The fourth plot, Cook's Distance, helps detect points that have a large impact on the regression coefficients.

R Code:

# histogram of the residuals
Income_HousingC_lm_resids <- residuals(Income_HousingCosts_lm)
hist(x = Income_HousingC_lm_resids, prob=TRUE, 
  main = "",
  xlab = "Histogram: Income ~ Housing Costs residuals") 
curve(dnorm(x, mean=mean(Income_HousingC_lm_resids),
  sd=sd(Income_HousingC_lm_resids)),
  col="darkblue", lwd=2, add=TRUE, yaxt="n")

  • The histogram shows an approximately normal distribution of the residuals.

R Code:

augment(Income_HousingCosts_lm) %>% 
  rename(Residual = ".resid") %>%
ggplot(aes(x = MedianMonthlyHousingCosts, y = Residual)) +
  geom_point(pch=1, color="blue") +
  geom_abline(intercept = 0, slope = 0, color = "red", size = 1) + 
  scale_x_continuous(labels = scales::label_dollar()) +
  scale_y_continuous(labels = scales::label_dollar()) + 
  labs(title="Scatter Plot: Residuals for Median Income",
    x="Median Monthly Housing Costs", 
    y="Residual") +
  theme_bw()

# Calculate MSE
MSE <- sum(Income_HousingCosts_lm$residuals^2) / Income_HousingCosts_lm$df.residual
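
# Cross-check (sketch): the residual standard error sigma reported in the glance
# table above (~3804) is just the square root of this MSE
sqrt(MSE)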

# Re-predict to add upr and lwr 95% prediction limits
cbind(augment(Income_HousingCosts_lm), predict(Income_HousingCosts_lm, interval = "predict")) %>% 
  ggplot(aes(x = MedianMonthlyHousingCosts, y = MedianIncome)) +
  geom_point(pch=1, color="blue") + 
  geom_smooth(method=lm) + 
  scale_x_continuous(labels = scales::label_dollar()) +
  scale_y_continuous(labels = scales::label_dollar()) + 
  labs(title="Fit Plot for Median Income",
    subtitle="95% Confidence Intervals and Prediction Limits",
    x="Median Monthly Housing Costs", 
    y="Median Housing Income") + 
  geom_line(aes(y = lwr), col = "red", linetype = "dashed") + 
  geom_line(aes(y = upr), col = "red", linetype = "dashed") + 
  geom_text(mapping=aes(x = 1550, y = 50000,
    label = paste0(
      "Observations = ", length(Income_HousingCosts_lm$residuals), "\n",
      "Parameters = ", length(Income_HousingCosts_lm$coefficients), "\n",
      "Error DF = ", Income_HousingCosts_lm$df, "\n",
      "MSE = ", round(MSE, 0), "\n",
      "R-Square = ", round(glance(Income_HousingCosts_lm)$r.squared,4), "\n",
      "Adj R-Square = ", round(glance(Income_HousingCosts_lm)$adj.r.squared,4), "\n"
      ))) +
  theme_bw()
## Warning in predict.lm(Income_HousingCosts_lm, interval = "predict"): predictions on current data refer to _future_ responses
## `geom_smooth()` using formula 'y ~ x'

  • The solid blue line represents the estimated linear regression line, the blue circles are the observed data points, the gray shaded area is the 95% confidence interval, and the red dashes mark the 95% prediction limits for individual observations.

Extra: Pull Census Data via TidyCensus

(A better process than downloading and importing csv files!)

R Code:

# To avoid manual CSV downloads and imports, we can pull census data using tidycensus.
library(tidycensus)  # assumes a Census API key has already been set via census_api_key()

# inventory of acs1 fields for 2018
# vars_acs1_2018 <- load_variables(2018, "acs1")
# write.csv(vars_acs1_2018,"vars_acs1_2018.csv")
# vars_acs1_2018p <- load_variables(2018, "acs1/profile")
# write.csv(vars_acs1_2018p,"vars_acs1_2018p.csv")

# Obtain values using API and tidycensus
st_acs_2018_pull <- get_acs(
 #  cache_table = TRUE,
   survey = "acs1",
   year = 2018,
   geography = "state",
  variables = c(
    medianhomevalue = "B25077_001",
    totalpopulation = "B01003_001",
    B06009_002 = "B06009_002",
    B06009_003 = "B06009_003",
    B06009_004 = "B06009_004",
    B06009_005 = "B06009_005",
    B06009_006 = "B06009_006",
    medianincome = "B19013_001"
    )
  )
glimpse(st_acs_2018_pull)

st_acs_2010_pull <- get_acs(
 #  cache_table = TRUE,
   survey = "acs1",
   year = 2010,
   geography = "state",
  variables = c(
    totalpop_2010 = "B01003_001",
    medianinc_2010 = "B19013_001"
    )
  )
glimpse(st_acs_2010_pull)

# Stack the 2010 rows under the 2018 rows (still long format: one row per state and variable)
st_acs_2018_pull <- rbind(st_acs_2018_pull, st_acs_2010_pull)
rm(st_acs_2010_pull)
glimpse(st_acs_2018_pull)
# write.csv(st_acs_2018_pull,"st_acs_2018_pull.csv")
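
The tidycensus pull comes back in long format (one row per state and variable). A minimal sketch, assuming tidyr and dplyr are available, of reshaping it into the wide one-row-per-state layout used in the analyses above:

R Code:

# Reshape the long API pull to one row per state; keep estimates, drop margins of error
st_acs_2018_wide <- st_acs_2018_pull %>%
  dplyr::select(GEOID, NAME, variable, estimate) %>%
  tidyr::pivot_wider(names_from = variable, values_from = estimate)
glimpse(st_acs_2018_wide)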