SAS Code:
/* &path1 and census libname previously defined */
/* Median Home Value, by State */
proc import out=census.medianhomevalueraw
datafile="&path1.ACSDT1Y2018.B25077_data_with_overlays_2021-11-05T055419.csv"
dbms=csv
replace;
datarow=3;
getnames=YES;
run;
data census.medianhomevalue;
set census.medianhomevalueraw;
rename NAME = State;
MedianHome = input(B25077_001E, 10.);
MedianHomeMOE = input(B25077_001M, 10.);
drop B25077_001E
B25077_001M;
format MedianHome NLMNY15.
MedianHomeMOE NLMNY15.;
run;
SAS Code:
/* &path1 and census libname previously defined */
/* Total Population, by State 2018 */
proc import out=census.population2018raw
datafile="&path1.ACSDT1Y2018.B01003_data_with_overlays_2022-04-10T165631.csv"
dbms=csv
replace;
datarow=3;
getnames=YES;
run;
data census.population2018try;
set census.population2018raw;
rename NAME = State;
TotalPopulation = input(B01003_001E, 12.);
drop B01003_001E
B01003_001M;
format TotalPopulation COMMA15.;
run;
/* Total Population, by State for 2010 */
proc import out=census.population2010raw
datafile="&path1.ACSDT1Y2010.B01003_data_with_overlays_2022-04-11T133027.csv"
dbms=csv
replace;
datarow=3;
getnames=YES;
run;
data census.population2010try;
set census.population2010raw;
rename NAME = State;
TotalPopulation = input(B01003_001E, 12.);
drop B01003_001E
B01003_001M;
format TotalPopulation COMMA15.;
run;
proc sort data=census.population2010try noduprecs; by State; run;
SAS Code:
/* &path1 and census libname previously defined */
/* Educational Attainment, by Division */
proc import out=census.education2018tryraw
datafile="&path1.ACSDT1Y2018.B06009_data_with_overlays_2022-04-10T182606.csv"
dbms=csv
replace;
datarow=3;
getnames=NO;
run;
data census.education2018try;
attrib Division length=$20.;
set census.education2018tryraw;
Division = strip(tranwrd(VAR62, ' Division', ' '));
B06009_002E = input(VAR3, 12.);
B06009_003E = input(VAR5, 12.);
B06009_004E = input(VAR7, 12.);
B06009_005E = input(VAR9, 12.);
B06009_006E = input(VAR11, 12.);
drop VAR1 - VAR62;
run;
proc sort data=census.education2018try noduprecs; by Division; run;
proc transpose data=census.education2018try
out=census.education2018try2;
var B06009_002E B06009_003E B06009_004E B06009_005E B06009_006E;
by Division;
run;
data census.education2018try3;
attrib Educational_Attainment length=$45.;
set census.education2018try2;
if _NAME_ EQ "B06009_002E" THEN Educational_Attainment = "Less than high school graduate"; else
if _NAME_ EQ "B06009_003E" THEN Educational_Attainment = "High school graduate (includes equivalency)"; else
if _NAME_ EQ "B06009_004E" THEN Educational_Attainment = "Some college or associate's degree"; else
if _NAME_ EQ "B06009_005E" THEN Educational_Attainment = "Bachelor's degree"; else
if _NAME_ EQ "B06009_006E" THEN Educational_Attainment = "Graduate or professional degree";
drop _NAME_;
format COL1 COMMA15.;
rename COL1 = Count;
run;
SAS Code:
/* &path1 and census libname previously defined */
/* 2018 Census Bureau Region and Division Codes and State FIPS Codes */
proc import out=census.state_georaw
datafile="&path1.state-geocodes-v2018.xlsx"
dbms=xlsx
replace;
datarow=7;
getnames=NO;
run;
data state_geotry_Reg (keep=RegionNum Region)
state_geotry_Div (keep=DivisionNum Division)
state_geotry_Sta (keep=RegionNum DivisionNum StateNum_FIPS State);
attrib RegionNum length=$2.;
attrib Region length=$20.;
attrib DivisionNum length=$2.;
attrib Division length=$20.;
attrib StateNum_FIPS length=$2.;
attrib State length=$20.;
set census.state_georaw;
if (C NE "00")and(C NE " ") then do;
StateNum_FIPS = C;
State = strip(D);
DivisionNum = B;
RegionNum = A;
output state_geotry_Sta ;
end; else
if (B NE "0")and(B NE " ") then do;
DivisionNum = B;
Division = strip(tranwrd(D, ' Division', ' '));
output state_geotry_Div ;
end; else
if (A NE "0")and(A NE " ") then do;
RegionNum = A;
Region = strip(tranwrd(D, ' Region', ' '));
output state_geotry_Reg ;
end;
/* drop A - D; */
run;
proc sql;
create table census.state_geotry as
select S.RegionNum, R.Region, S.DivisionNum, D.Division, StateNum_FIPS, State
from state_geotry_Sta as S
left join state_geotry_Reg as R
on (S.RegionNum = R.RegionNum)
left join state_geotry_Div as D
on (S.DivisionNum = D.DivisionNum)
;
quit;
SAS Code:
/* &path1 and census libname previously defined */
/* Median HH Income Table B19013 2018 1-year ACS State Level */
proc import out=census.medianincomeraw
datafile="&path1.ACSDT1Y2018.B19013_data_with_overlays_2022-04-11T120015.csv"
dbms=csv
replace;
datarow=3;
getnames=YES;
run;
data census.medianincometry;
set census.medianincomeraw;
rename NAME = State;
MedianIncome = input(B19013_001E, 10.);
MedianIncomeMOE = input(B19013_001M, 10.);
drop B19013_001E
B19013_001M;
format MedianIncome NLMNY15.
MedianIncomeMOE NLMNY15.;
run;
SAS Code:
/* &path1 and census libname previously defined */
/* Joining state_geo to medianincome file with selected fields; use inner join to drop Puerto Rico */
proc sql;
create table census.medianincome_geotry as
select I.GEO_ID, I.State, G.Division, G.Region, MedianIncome, MedianIncomeMOE
from census.medianincometry as I
inner join census.state_geotry as G
on (I.State = G.State)
;
quit;
proc sort data=census.medianincome_geotry noduprecs; by Region descending MedianIncome; run;
/* adapted from generated ranks task */
proc rank data=census.medianincome_geotry descending out=census.medianincome_rank;
by Region;
var MedianIncome;
ranks rank_MedianIncome;
run;
proc sort data=census.medianincome_rank noduprecs; by Region rank_MedianIncome; run;
SAS Code:
/* &path1 and census libname previously defined */
/* adapted from generated list data task */
title1 'Top 5 Median Household Incomes by Region';
proc print data=census.medianincome_rank (where=(rank_MedianIncome <=5)) label;
var State MedianIncome;
by Region;
id Region;
run;
title1;
SAS Code:
/* &path1 and census libname previously defined */
proc sql;
create table census.population_change as
select A.GEO_ID, A.State, A.TotalPopulation as TotalPop2010,
B.TotalPopulation as TotalPop2018,
(calculated TotalPop2018 - calculated TotalPop2010) / calculated TotalPop2010 as PercentChange format=percentn7.1
from census.population2010try as A
inner join census.population2018try as B
on (A.GEO_ID = B.GEO_ID)
;
quit;
proc sort data=census.population_change noduprecs; by PercentChange; run;
proc contents data=census.population_change; run;
proc print data=census.population_change; run;
/* adapted from generated recode ranges task */
data census.population_change_cat;
set census.population_change;
length PercentChangeCat $22;
select;
when (-1 <=PercentChange <=-0.001) PercentChangeCat='Decrease';
when (0 <=PercentChange <=0.049) PercentChangeCat='0% to 5% Increase';
when (0.05 <=PercentChange <=0.099) PercentChangeCat='5% to 10% Increase';
when (0.1 <=PercentChange <=1) PercentChangeCat='10% or Higher Increase';
otherwise PercentChangeCat='Other';
end;
run;
/* adapted from generated one way freq task in stats */
proc freq data=census.population_change_cat order=freq;
tables PercentChangeCat / nocum plots=(freqplot);
run;
SAS Code:
/* &path1 and census libname previously defined */
/* Bar Chart - Median Home Value in 2018 */
ods graphics / reset width=10in height=4.8in imagemap;
proc sgplot data=census.medianhomevalue (where=(MedianHome > 229700));
title height=14pt "Median Home Value in 2018";
title2 "States Exceeding U.S. Median of $229,700";
vbar State / response=MedianHome categoryorder=respdesc;
yaxis grid label="Median Home Value in 2018";
refline 229700 / axis=y lineattrs=(thickness=2 color=green)
label="U.S. Median" labelattrs=(color=green);
run;
ods graphics / reset;
SAS Code:
/* &path1 and census libname previously defined */
/* Bar Chart - States with Population Exceeding 10,000,000 in 2018 */
ods graphics / reset width=10in height=4.8in imagemap;
proc sgplot data=census.population2018try (where=(TotalPopulation > 10000000));
title height=14pt "States with Population Exceeding 10,000,000 in 2018";
vbar State / response=TotalPopulation categoryorder=respdesc;
yaxis grid label="Population Estimate";
run;
ods graphics / reset;
SAS Code:
/* &path1 and census libname previously defined */
/* Stacked Bar Chart - Educational Attainment in 2018 */
ods graphics / reset width=10in height=4.8in imagemap;
proc sgplot data=census.education2018try3;
title height=14pt "Educational Attainment in 2018";
vbar Division / response=Count group=Educational_Attainment stat=sum;
yaxis grid label="Count";
keylegend / title="Educational Attainment";
run;
ods graphics / reset;
SAS Code:
/* &path1 and census libname previously defined */
/* 100% Stacked Bar Chart - Educational Attainment in 2018 */
ods graphics / reset width=10in height=4.8in imagemap;
proc sgplot data=census.education2018try3 pctlevel=group;
title height=14pt "Distribution of Educational Attainment Levels in 2018";
vbar Division / response=Count group=Educational_Attainment
stat=percent seglabel ;
yaxis grid label="Percentage";
keylegend / title="Educational Attainment Levels";
run;
ods graphics / reset;
SAS Code:
/* &path1 and census libname previously defined */
ods noproctitle;
ods graphics / imagemap=on;
/* Exploring Data */
title1 "2018 Distribution of Median Income, Combined";
proc univariate data=census.stateinfo_combined;
ods select Histogram;
var MedianIncome;
histogram MedianIncome / normal kernel;
inset n mean median / position=ne;
run;
title1;
SAS Code:
/* &path1 and census libname previously defined */
ods noproctitle;
ods graphics / imagemap=on;
/* Exploring Data by Region */
title1 "2018 Distribution of Median Income, by Region";
proc univariate data=census.stateinfo_combined;
ods select Histogram;
var MedianIncome;
class Region;
histogram MedianIncome / normal kernel;
inset n mean median / position=ne;
run;
title1;
SAS Code:
/* &path1 and census libname previously defined */
/* adapted from generated one way ANOVA task in linear models */
title;
ods noproctitle;
ods graphics / imagemap=on;
title1 "2018 Distribution of Median Income, one way ANOVA";
proc glm data=census.stateinfo_combined plots(only)=(boxplot diagnostics);
class Region;
model MedianIncome=Region;
means Region / hovtest=bartlett hovtest=levene (type=square) plots=none;
means Region / hovtest=levene (type=abs) hovtest=bf plots=none;
lsmeans Region / adjust=tukey pdiff alpha=.05 plots=none;
run;
quit;
title1;
For comparison, R's car::leveneTest(MedianIncome ~ RegionName, medianincome_Region, center = mean) returns a p-value of 0.0323, which is less than 0.05, so we reject the null hypothesis of equal variances, contradicting the equal-variance assumption behind the ANOVA.
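To run this comparison in R directly (medianincome_Region is built in the R section below; note that leveneTest defaults to center = median, the Brown-Forsythe variant, so both forms are shown):
R Code:
# true Levene test: absolute deviations from group means
car::leveneTest(MedianIncome ~ RegionName, medianincome_Region, center = mean)
# Brown-Forsythe variant: deviations from group medians (R's default)
car::leveneTest(MedianIncome ~ RegionName, medianincome_Region, center = median)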
SAS Code:
/* &path1 and census libname previously defined */
/* adapted from generated correlation analysis task in stats */
ods noproctitle;
ods graphics / imagemap=on;
title1 "2018 Distribution of Median Income, Correlation Analysis";
proc corr data=census.stateinfo_combined pearson nosimple noprob plots=scatter(ellipse=none) rank;
var MeanHoursWorked TotalPopulation MedianAge MedianCurrentMarriageDuration MedianMonthlyHousingCosts;
with MedianIncome;
id State Region;
run;
title1;
SAS Code:
/* &path1 and census libname previously defined */
/* adapted from generated linear regression task in linear models */
ods noproctitle;
ods graphics / imagemap=on;
title1 "2018 Median Income by Housing Costs, Correlation Analysis";
proc reg data=census.stateinfo_combined alpha=0.05 plots(only)=(diagnostics
residuals fitplot observedbypredicted);
model MedianIncome=MedianMonthlyHousingCosts;
run;
quit;
title1;
The fit diagnostics panel contains a set of graphs commonly used to validate the assumptions of simple linear regression: linearity, independence of the errors, normality of the residuals, and constant variance (homoscedasticity).
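As a sketch of the same checks in R (the equivalent model, Income_HousingCosts_lm, is fit in the R section below; fit here stands in for any lm object):
R Code:
# assumes fit <- lm(MedianIncome ~ MedianMonthlyHousingCosts, state_combined)
par(mfrow = c(2, 2)) # 2x2 grid of diagnostic panels
plot(fit, 1) # residuals vs. fitted: linearity
plot(fit, 2) # normal Q-Q: normality of residuals
plot(fit, 3) # scale-location: constant variance
plot(fit, 4) # Cook's distance: influential observations
par(mfrow = c(1, 1))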
SAS Code:
/* &path1 and census libname previously defined */
/* Extra work - retrieve Census data systematically to skip manual download steps for csv files. Note that this code does not use the API key and is therefore rate limited. */
filename test "&path1.medianhomevalueraw.txt";
proc http
URL= 'http://api.census.gov/data/2018/acs/acs1?get=NAME,B25077_001E&for=state:*'
method = 'GET'
out=test;
run;
proc import datafile=test dbms=csv out=census.medianhomevalueurl replace;
run;
data census.medianhomevalueurl ;
length StateName $22.
State_FIPS $2.;
set census.medianhomevalueurl;
StateName = strip(tranwrd(tranwrd('[["NAME"'N, '"', ' '), '[', ' '));
State_FIPS = strip(tranwrd(tranwrd(state, '"', ' '), ']', ' '));
MedianHome = input(B25077_001E, 10.);
format MedianHome NLMNY15.;
drop '[["NAME"'N B25077_001E VAR4 state ;
run;
proc sort data=census.medianhomevalueurl nodups; by State_FIPS; run;
R Code:
library(tidyverse)
library(lubridate)
library(readxl)
library(kableExtra)
library(tidycensus)
library(car)
library(scales)
library(lsr)
library(psych)
library(broom)
# assumes a Census Bureau API key is already installed; verify with Sys.getenv("CENSUS_API_KEY")
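If the key is not yet installed, a one-time setup along these lines stores it for later sessions ("YOUR_KEY_HERE" is a placeholder, not a real key):
R Code:
# one-time setup; install = TRUE writes the key to ~/.Renviron for reuse
tidycensus::census_api_key("YOUR_KEY_HERE", install = TRUE)
readRenviron("~/.Renviron") # reload so Sys.getenv("CENSUS_API_KEY") sees it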
R Code:
medianhomevalue <- read_csv("ACSDT1Y2018.B25077_data_with_overlays_2021-11-05T055419.csv") %>%
filter(GEO_ID != "id") %>%
mutate(
MedianHome = as.integer(B25077_001E),
MedianHomeMOE = as.integer(B25077_001M)) %>%
select(-B25077_001E,-B25077_001M)
## Rows: 53 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): GEO_ID, NAME, B25077_001E, B25077_001M
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(medianhomevalue)
## Rows: 52
## Columns: 4
## $ GEO_ID <chr> "0400000US01", "0400000US02", "0400000US04", "0400000US0~
## $ NAME <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California"~
## $ MedianHome <int> 147900, 276100, 241100, 133100, 546800, 373300, 277400, ~
## $ MedianHomeMOE <int> 2304, 4365, 1921, 2166, 2008, 2652, 2057, 5345, 17497, 1~
population2018 <- read_csv("ACSDT1Y2018.B01003_data_with_overlays_2022-04-10T165631.csv") %>%
filter(GEO_ID != "id") %>%
mutate(
TotalPopulation = as.integer(B01003_001E)) %>%
select(-B01003_001E,-B01003_001M) %>%
rename(StateName = NAME)
## Rows: 53 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): B01003_001E, B01003_001M, GEO_ID, NAME
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(population2018)
## Rows: 52
## Columns: 3
## $ GEO_ID <chr> "0400000US01", "0400000US02", "0400000US04", "0400000U~
## $ StateName <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi~
## $ TotalPopulation <int> 4887871, 737438, 7171646, 3013825, 39557045, 5695564, ~
population2010 <- read_csv("ACSDT1Y2010.B01003_data_with_overlays_2022-04-11T133027.csv") %>%
filter(GEO_ID != "id") %>%
mutate(
TotalPopulation2010 = as.integer(B01003_001E)) %>%
select(-B01003_001E,-B01003_001M) %>%
rename(StateName = NAME)
## Rows: 53 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): GEO_ID, B01003_001E, B01003_001M, NAME
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(population2010)
## Rows: 52
## Columns: 3
## $ GEO_ID <chr> "0400000US01", "0400000US02", "0400000US04", "0400~
## $ StateName <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Calif~
## $ TotalPopulation2010 <int> 4785298, 713985, 6413737, 2921606, 37349363, 50490~
education2018 <- read_csv("ACSDT1Y2018.B06009_data_with_overlays_2022-04-10T182606.csv") %>%
filter(GEO_ID != "id") %>%
mutate(
Division = str_trim(str_replace(NAME, "Division", " ")),
B06009_002E = as.integer(B06009_002E),
B06009_003E = as.integer(B06009_003E),
B06009_004E = as.integer(B06009_004E),
B06009_005E = as.integer(B06009_005E),
B06009_006E = as.integer(B06009_006E)) %>%
select(GEO_ID, Division, B06009_002E, B06009_003E, B06009_004E,
B06009_005E, B06009_006E) %>%
pivot_longer(!c(GEO_ID, Division), names_to = "Education", values_to = "Count") %>%
mutate(Education =
case_when(
Education == "B06009_002E" ~ "Less than high school graduate",
Education == "B06009_003E" ~ "High school graduate (includes equivalency)",
Education == "B06009_004E" ~ "Some college or associate's degree",
Education == "B06009_005E" ~ "Bachelor's degree",
Education == "B06009_006E" ~ "Graduate or professional degree",
TRUE ~ Education
))
## Rows: 10 Columns: 62
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (62): B06009_001E, B06009_001M, B06009_002E, B06009_002M, B06009_003E, B...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(education2018)
## Rows: 45
## Columns: 4
## $ GEO_ID <chr> "0300000US1", "0300000US1", "0300000US1", "0300000US1", "030~
## $ Division <chr> "New England", "New England", "New England", "New England", ~
## $ Education <chr> "Less than high school graduate", "High school graduate (inc~
## $ Count <int> 917018, 2712566, 2603070, 2388242, 1845877, 3176958, 8290185~
table(education2018$Division)
##
## East North Central East South Central Middle Atlantic Mountain
## 5 5 5 5
## New England Pacific South Atlantic West North Central
## 5 5 5 5
## West South Central
## 5
table(education2018$Education)
##
## Bachelor's degree
## 9
## Graduate or professional degree
## 9
## High school graduate (includes equivalency)
## 9
## Less than high school graduate
## 9
## Some college or associate's degree
## 9
R Code:
state_georaw <- read_excel("state-geocodes-v2018.xlsx",
skip=5) %>%
mutate(Name = str_trim(Name)) %>%
rename(State_FIPS = `State (FIPS)`)
glimpse(state_georaw)
## Rows: 64
## Columns: 4
## $ Region <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",~
## $ Division <chr> "0", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2",~
## $ State_FIPS <chr> "00", "00", "09", "23", "25", "33", "44", "50", "00", "34",~
## $ Name <chr> "Northeast Region", "New England Division", "Connecticut", ~
state_geo_Sta <- state_georaw %>%
filter((State_FIPS != "00")&(!is.na(State_FIPS))) %>%
rename(StateName = Name)
glimpse(state_geo_Sta)
## Rows: 51
## Columns: 4
## $ Region <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2",~
## $ Division <chr> "1", "1", "1", "1", "1", "1", "2", "2", "2", "3", "3", "3",~
## $ State_FIPS <chr> "09", "23", "25", "33", "44", "50", "34", "36", "42", "17",~
## $ StateName <chr> "Connecticut", "Maine", "Massachusetts", "New Hampshire", "~
table(state_geo_Sta$StateName)
##
## Alabama Alaska Arizona
## 1 1 1
## Arkansas California Colorado
## 1 1 1
## Connecticut Delaware District of Columbia
## 1 1 1
## Florida Georgia Hawaii
## 1 1 1
## Idaho Illinois Indiana
## 1 1 1
## Iowa Kansas Kentucky
## 1 1 1
## Louisiana Maine Maryland
## 1 1 1
## Massachusetts Michigan Minnesota
## 1 1 1
## Mississippi Missouri Montana
## 1 1 1
## Nebraska Nevada New Hampshire
## 1 1 1
## New Jersey New Mexico New York
## 1 1 1
## North Carolina North Dakota Ohio
## 1 1 1
## Oklahoma Oregon Pennsylvania
## 1 1 1
## Rhode Island South Carolina South Dakota
## 1 1 1
## Tennessee Texas Utah
## 1 1 1
## Vermont Virginia Washington
## 1 1 1
## West Virginia Wisconsin Wyoming
## 1 1 1
state_geo_Div <- state_georaw %>%
filter((Division != "0")&(State_FIPS == "00")) %>%
mutate(DivisionName = str_trim(str_replace(Name, "Division", " "))) %>%
select(Division, DivisionName)
glimpse(state_geo_Div)
## Rows: 9
## Columns: 2
## $ Division <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9"
## $ DivisionName <chr> "New England", "Middle Atlantic", "East North Central", "~
table(state_geo_Div$DivisionName)
##
## East North Central East South Central Middle Atlantic Mountain
## 1 1 1 1
## New England Pacific South Atlantic West North Central
## 1 1 1 1
## West South Central
## 1
state_geo_Reg <- state_georaw %>%
filter((Region != "0")&(Division == "0")&(State_FIPS == "00")) %>%
mutate(RegionName = str_trim(str_replace(Name, "Region", " "))) %>%
select(Region, RegionName)
glimpse(state_geo_Reg)
## Rows: 4
## Columns: 2
## $ Region <chr> "1", "2", "3", "4"
## $ RegionName <chr> "Northeast", "Midwest", "South", "West"
table(state_geo_Reg$RegionName)
##
## Midwest Northeast South West
## 1 1 1 1
state_geo_all <- state_geo_Sta %>%
inner_join(state_geo_Div, by = "Division")
glimpse(state_geo_all)
## Rows: 51
## Columns: 5
## $ Region <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2~
## $ Division <chr> "1", "1", "1", "1", "1", "1", "2", "2", "2", "3", "3", "3~
## $ State_FIPS <chr> "09", "23", "25", "33", "44", "50", "34", "36", "42", "17~
## $ StateName <chr> "Connecticut", "Maine", "Massachusetts", "New Hampshire",~
## $ DivisionName <chr> "New England", "New England", "New England", "New England~
table(state_geo_all$DivisionName)
##
## East North Central East South Central Middle Atlantic Mountain
## 5 4 3 8
## New England Pacific South Atlantic West North Central
## 6 5 9 7
## West South Central
## 4
state_geo_all <- state_geo_all %>%
inner_join(state_geo_Reg, by = "Region")
glimpse(state_geo_all)
## Rows: 51
## Columns: 6
## $ Region <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2~
## $ Division <chr> "1", "1", "1", "1", "1", "1", "2", "2", "2", "3", "3", "3~
## $ State_FIPS <chr> "09", "23", "25", "33", "44", "50", "34", "36", "42", "17~
## $ StateName <chr> "Connecticut", "Maine", "Massachusetts", "New Hampshire",~
## $ DivisionName <chr> "New England", "New England", "New England", "New England~
## $ RegionName <chr> "Northeast", "Northeast", "Northeast", "Northeast", "Nort~
table(state_geo_all$RegionName)
##
## Midwest Northeast South West
## 12 9 17 13
rm(state_georaw, state_geo_Sta, state_geo_Div, state_geo_Reg)
R Code:
medianincome <- read_csv("ACSDT1Y2018.B19013_data_with_overlays_2022-04-11T120015.csv") %>%
filter(GEO_ID != "id") %>%
mutate(
MedianIncome = as.integer(B19013_001E),
MedianIncomeMOE = as.integer(B19013_001M)) %>%
rename(StateName = NAME) %>%
select(-B19013_001E,-B19013_001M)
## Rows: 53 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): B19013_001E, B19013_001M, GEO_ID, NAME
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(medianincome)
## Rows: 52
## Columns: 4
## $ GEO_ID <chr> "0400000US01", "0400000US02", "0400000US04", "0400000U~
## $ StateName <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi~
## $ MedianIncome <int> 49861, 74346, 59246, 47062, 75277, 71953, 76348, 64805~
## $ MedianIncomeMOE <int> 783, 2288, 732, 713, 317, 655, 921, 1570, 3414, 384, 7~
R Code:
medianincome_geo <- medianincome %>%
inner_join(state_geo_all, by = "StateName")
glimpse(medianincome_geo)
## Rows: 51
## Columns: 9
## $ GEO_ID <chr> "0400000US01", "0400000US02", "0400000US04", "0400000U~
## $ StateName <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi~
## $ MedianIncome <int> 49861, 74346, 59246, 47062, 75277, 71953, 76348, 64805~
## $ MedianIncomeMOE <int> 783, 2288, 732, 713, 317, 655, 921, 1570, 3414, 384, 7~
## $ Region <chr> "3", "4", "4", "3", "4", "4", "1", "3", "3", "3", "3",~
## $ Division <chr> "6", "9", "8", "7", "9", "8", "1", "5", "5", "5", "5",~
## $ State_FIPS <chr> "01", "02", "04", "05", "06", "08", "09", "10", "11", ~
## $ DivisionName <chr> "East South Central", "Pacific", "Mountain", "West Sou~
## $ RegionName <chr> "South", "West", "West", "South", "West", "West", "Nor~
table(medianincome_geo$RegionName)
##
## Midwest Northeast South West
## 12 9 17 13
# Generate ranks by Region, retain group by
medianincome_Region <- medianincome_geo %>%
group_by(RegionName) %>%
mutate(RegionRank = min_rank(-MedianIncome))
glimpse(medianincome_Region)
## Rows: 51
## Columns: 10
## Groups: RegionName [4]
## $ GEO_ID <chr> "0400000US01", "0400000US02", "0400000US04", "0400000U~
## $ StateName <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi~
## $ MedianIncome <int> 49861, 74346, 59246, 47062, 75277, 71953, 76348, 64805~
## $ MedianIncomeMOE <int> 783, 2288, 732, 713, 317, 655, 921, 1570, 3414, 384, 7~
## $ Region <chr> "3", "4", "4", "3", "4", "4", "1", "3", "3", "3", "3",~
## $ Division <chr> "6", "9", "8", "7", "9", "8", "1", "5", "5", "5", "5",~
## $ State_FIPS <chr> "01", "02", "04", "05", "06", "08", "09", "10", "11", ~
## $ DivisionName <chr> "East South Central", "Pacific", "Mountain", "West Sou~
## $ RegionName <chr> "South", "West", "West", "South", "West", "West", "Nor~
## $ RegionRank <int> 13, 3, 9, 15, 2, 5, 3, 4, 1, 7, 6, 1, 11, 2, 11, 5, 7,~
table(medianincome_Region$RegionRank)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 4 4 4 4 4 4 4 4 4 3 3 3 2 1 1 1 1
R Code:
medianincome_Region5 <- medianincome_Region %>%
ungroup() %>%
mutate(MedianIncome = scales::dollar(MedianIncome, negative_parens = TRUE)) %>%
filter(RegionRank <= 5) %>%
mutate(RegionName = as.factor(RegionName),
StateName = as.factor(StateName)) %>%
arrange(RegionName, RegionRank) %>%
select(RegionName, RegionRank, StateName, MedianIncome)
medianincome_Region5 %>%
select(-RegionName) %>%
kable(digits = 0, col.names = c("Region Rank", "State", "Median Income"), caption = " Regional 'Top 5' State Median Household Incomes, 2018") %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "left") %>%
pack_rows(index = table(medianincome_Region5$RegionName))
| Region Rank | State | Median Income |
|---|---|---|
| Midwest | ||
| 1 | Minnesota | $70,315 |
| 2 | Illinois | $65,030 |
| 3 | North Dakota | $63,837 |
| 4 | Wisconsin | $60,773 |
| 5 | Iowa | $59,955 |
| Northeast | ||
| 1 | New Jersey | $81,740 |
| 2 | Massachusetts | $79,835 |
| 3 | Connecticut | $76,348 |
| 4 | New Hampshire | $74,991 |
| 5 | New York | $67,844 |
| South | ||
| 1 | District of Columbia | $85,203 |
| 2 | Maryland | $83,242 |
| 3 | Virginia | $72,577 |
| 4 | Delaware | $64,805 |
| 5 | Texas | $60,629 |
| West | ||
| 1 | Hawaii | $80,212 |
| 2 | California | $75,277 |
| 3 | Alaska | $74,346 |
| 4 | Washington | $74,073 |
| 5 | Colorado | $71,953 |
R Code:
population_change <- population2018 %>%
inner_join(select(population2010, -GEO_ID), by = "StateName") %>%
mutate(PercentChange = round(
((TotalPopulation-TotalPopulation2010) / TotalPopulation2010)
,3)) %>%
arrange(PercentChange) %>%
mutate(PercentChangeCat =
case_when(
(PercentChange >= -2)&(PercentChange <= -0.001) ~ "Decrease",
(PercentChange >= 0)&(PercentChange <= 0.049) ~ "0% to 5% Increase",
(PercentChange >= .05)&(PercentChange <= 0.099) ~ "5% to 10% Increase",
(PercentChange >= 0.1)&(PercentChange <= 2) ~ "10% or Higher Increase",
TRUE ~ "Other"
)) %>%
group_by(PercentChangeCat) %>%
summarize(Frequency = n()) %>%
mutate(
Pct = scales::percent(round(Frequency/sum(Frequency),4))
)
glimpse(population_change)
## Rows: 4
## Columns: 3
## $ PercentChangeCat <chr> "0% to 5% Increase", "10% or Higher Increase", "5% to~
## $ Frequency <int> 25, 10, 13, 4
## $ Pct <chr> "48.1%", "19.2%", "25.0%", "7.7%"
population_change %>%
arrange(-Frequency) %>%
kable(col.names = c("Change Category", "# States", "Pct of Total"), caption = "Total State-level Population Change Counts, 2010 vs. 2018") %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
| Change Category | # States | Pct of Total |
|---|---|---|
| 0% to 5% Increase | 25 | 48.1% |
| 5% to 10% Increase | 13 | 25.0% |
| 10% or Higher Increase | 10 | 19.2% |
| Decrease | 4 | 7.7% |
population_change %>%
arrange(-Frequency) %>%
mutate(PercentChangeCat = fct_relevel(PercentChangeCat, c("Decrease",
"0% to 5% Increase",
"5% to 10% Increase",
"10% or Higher Increase"))) %>%
ggplot(aes(x=PercentChangeCat, y=Frequency)) +
geom_bar(stat = 'identity', fill="gray", alpha=.7) +
labs(title="Total State-level Population Change Counts, 2010 vs. 2018",
x="Change Segment",
y="Number of States")
R Code:
# systematically pull US median home value from 2018 ACS
us_acs_2018_pull <- get_acs(
# cache_table = TRUE,
survey = "acs1",
year = 2018,
geography = "US",
variables = c(
medianhomevalue = "B25077_001"
)
)
## The 1-year ACS provides data for geographies with populations of 65,000 and greater.
## Getting data from the 2018 1-year ACS
glimpse(us_acs_2018_pull)
## Rows: 1
## Columns: 5
## $ GEOID <chr> "1"
## $ NAME <chr> "United States"
## $ variable <chr> "medianhomevalue"
## $ estimate <dbl> 229700
## $ moe <dbl> 366
(us_2018_medhomeval <- us_acs_2018_pull$estimate[us_acs_2018_pull$variable=="medianhomevalue"])
## [1] 229700
# select states to highlight and create chart
states_gt_usmed <- select(arrange(filter(medianhomevalue, MedianHome > us_2018_medhomeval), -MedianHome), NAME)
medianhomevalue %>%
filter(MedianHome > us_2018_medhomeval) %>%
arrange(-MedianHome, -MedianHomeMOE) %>%
mutate(NAME = fct_relevel(NAME, as_vector(states_gt_usmed))) %>%
ggplot(aes(x=NAME, y=MedianHome)) +
geom_bar(stat = 'identity', fill="gray", alpha=.7) +
labs(title="Owner-occupied Median Home Value in 2018",
subtitle=paste("States Exceeding U.S. Median of", scales::dollar(us_2018_medhomeval)),
x="State",
y="Median Home Value") +
theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1)) +
scale_y_continuous(labels = scales::label_dollar()) +
geom_hline(yintercept = us_2018_medhomeval, color="darkgreen") +
geom_text(aes(20, us_2018_medhomeval, label = paste("US Median Value ", scales::dollar(us_2018_medhomeval)), vjust = 1.2), color="darkgreen")
R Code:
# select states to highlight and create chart
states_pop_10mm <- select(arrange(filter(population2018, TotalPopulation > 10000000), -TotalPopulation), StateName)
population2018 %>%
filter(TotalPopulation > 10000000) %>%
arrange(-TotalPopulation, StateName) %>%
mutate(StateName = fct_relevel(StateName, as_vector(states_pop_10mm))) %>%
ggplot(aes(x=StateName, y=TotalPopulation)) +
geom_bar(stat = 'identity', fill="gray", alpha=.7) +
labs(title="States with Population Exceeding 10,000,000 in 2018",
x="State",
y="# Population Estimate") +
theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1)) +
scale_y_continuous(labels = scales::label_comma()) +
geom_hline(yintercept = 10000000, color="darkgreen")
R Code:
# reorder and factor educational attainment
education_order <- c(
"Graduate or professional degree",
"Bachelor's degree",
"Some college or associate's degree",
"High school graduate (includes equivalency)",
"Less than high school graduate")
education2018 %>%
mutate(Education = fct_relevel(Education, as_vector(education_order))) %>%
ggplot(aes(x=Division, y=Count)) +
geom_bar(stat = 'identity', aes(fill=Education)) +
labs(title="Educational Attainment in 2018",
x="Division",
y="# Estimate") +
theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1),
legend.title = element_text(color = "black", size = 8),
legend.text = element_text(color = "blue", size = 6)) +
scale_y_continuous(labels = scales::label_comma())
R Code:
# reorder and factor educational attainment
education_order <- c(
"Graduate or professional degree",
"Bachelor's degree",
"Some college or associate's degree",
"High school graduate (includes equivalency)",
"Less than high school graduate")
education2018 %>%
mutate(Education = fct_relevel(Education, as_vector(education_order))
) %>%
ggplot(aes(x=Division, y=Count, fill=Education)) +
geom_bar(stat = 'identity', position= "fill") +
labs(title="Distribution of Educational Attainment Levels in 2018 v.1",
x="Division",
y="# Estimate") +
theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1),
legend.title = element_text(color = "black", size = 10),
legend.text = element_text(color = "blue", size = 7)) +
scale_y_continuous(labels = scales::label_percent())
R Code:
# reorder and factor educational attainment
education_order <- c(
"Graduate or professional degree",
"Bachelor's degree",
"Some college or associate's degree",
"High school graduate (includes equivalency)",
"Less than high school graduate")
education2018 %>%
group_by(Division) %>%
mutate(
Education = fct_relevel(Education, as_vector(education_order)),
percentage = round(Count/sum(Count),3),
cumpercent = cumsum(percentage)
) %>%
ungroup() %>%
ggplot(aes(x=Division, y=percentage, fill=Education)) +
geom_bar(stat = 'identity', position= "fill") +
labs(title="Distribution of Educational Attainment Levels in 2018 v.2",
x="Division",
y="% Total Estimates") +
theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1),
legend.title = element_text(color = "black", size = 10),
legend.text = element_text(color = "blue", size = 7)) +
scale_y_continuous(labels = scales::label_percent()) +
geom_text(size=3, aes(x=Division,y=cumpercent,label=scales::percent(percentage,accuracy = 0.1),vjust=1))
R Code:
# Prepare data for faceted visualization
binwidth <- 8500
medianincome_all_labels <- medianincome_Region %>%
ungroup() %>%
summarize(
med_inc_N = length(MedianIncome),
med_inc_mean = round(mean(MedianIncome),2),
med_inc_median = median(MedianIncome),
med_inc_max = round(max(MedianIncome),0),
med_inc_sd = round(sd(MedianIncome),2),
label_all = paste0(
"N = ",med_inc_N,"\n",
"Mean = ",scales::dollar(round(med_inc_mean,2)),"\n",
"Median = ",scales::dollar(round(med_inc_median,2)),"\n",
"Std Dev = ",scales::dollar(round(med_inc_sd,2)),"\n"))
medianincome_reg_labels <- medianincome_Region %>%
group_by(RegionName) %>%
summarize(
med_inc_reg_N = length(MedianIncome),
med_inc_reg_mean = round(mean(MedianIncome),2),
med_inc_reg_median = median(MedianIncome),
med_inc_reg_max = round(max(MedianIncome),0),
med_inc_reg_sd = round(sd(MedianIncome),2),
label_reg = paste0(
"N = ",med_inc_reg_N,"\n",
"Mean = ",scales::dollar(round(med_inc_reg_mean,2)),"\n",
"Median = ",scales::dollar(round(med_inc_reg_median,2)),"\n",
"Std Dev = ",scales::dollar(round(med_inc_reg_sd,2)),"\n"))
# Faceted by Region - different approach using density, edited
plot_density_income <- medianincome_Region %>%
ggplot(aes(x=MedianIncome)) +
geom_histogram(aes(y=..density..), binwidth = binwidth, fill="gray", alpha=.7, color="black") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(labels = scales::label_dollar(),
breaks=c(28000, 36000, 44000, 52000, 60000, 68000, 76000,
84000, 92000, 100000),
limits = c(30000, 92000)) +
theme (axis.text.x = element_text (angle=35, vjust=1, hjust=1)) +
labs(title="Median Household Income in the Past 12 Months, 2018 ACS",
x="Inflation-Adjusted Dollars",
y="Number of States (density)") +
geom_density(color = "red", adjust = .79)
# Using saved plot object above, All Regions combined with text
plot_density_income +
stat_function(color = "blue", fun=dnorm,
args = with(medianincome_Region, c(
mean = mean(MedianIncome),
sd = sd(MedianIncome)))
) +
geom_text(medianincome_all_labels,
mapping=aes(
x = 80000 ,
y = .000035 ,
label = label_all))
## Warning: Removed 2 rows containing missing values (geom_bar).
# Using saved plot object above, by Region with text
plot_density_income +
facet_wrap(~RegionName) +
geom_line(color = "blue", aes(y = dnorm(MedianIncome,
mean = tapply(MedianIncome, RegionName, mean, na.rm = TRUE)[RegionName],
sd = tapply(MedianIncome, RegionName, sd, na.rm = TRUE)[RegionName]))) +
geom_text(medianincome_reg_labels,
mapping=aes(
x = 80000 ,
y = .000075 ,
label = label_reg))
## Warning: Removed 8 rows containing missing values (geom_bar).
R Code:
# Descriptive statistics
psych::describeBy(medianincome_Region$MedianIncome, group = medianincome_Region$RegionName, mat = TRUE, digits = 2) %>%
select(Region=group1, N=n, Mean=mean, Median=median, SD=sd, Min=min, Max=max, Skew=skew, Kurtosis=kurtosis, SEM=se) %>%
kable(align=c("lrrrrrrrr"), digits=2, row.names = FALSE,
caption="Median Household Income in the Past 12 Months, 2018 ACS") %>%
kable_styling(bootstrap_options=c("bordered", "responsive","striped"), full_width = FALSE)
| Region | N | Mean | Median | SD | Min | Max | Skew | Kurtosis | SEM |
|---|---|---|---|---|---|---|---|---|---|
| Midwest | 12 | 59750.00 | 58892 | 4652.88 | 54478 | 70315 | 0.88 | -0.34 | 1343.17 |
| Northeast | 9 | 69154.11 | 67844 | 9387.49 | 55602 | 81740 | 0.00 | -1.77 | 3129.16 |
| South | 17 | 57354.29 | 52375 | 12446.36 | 44097 | 85203 | 1.08 | -0.08 | 3018.69 |
| West | 13 | 65250.54 | 63426 | 9929.57 | 47169 | 80212 | -0.16 | -1.38 | 2753.97 |
medianincome_Region %>%
ggplot(aes(RegionName,MedianIncome,fill=RegionName)) +
geom_jitter(colour = "dark gray",width=.1) +
stat_boxplot(geom ='errorbar',width = 0.4) +
geom_boxplot()+
labs(title="Median Household Income in the Past 12 Months, 2018 ACS",
x = "Region",
y = "Inflation-Adjusted Dollars",
subtitle ="Gray dots=sample data points, Black dot=outlier, Blue dot=mean, Red=99% confidence interval",
caption = "Boxplot, dotplot and SEM plot of mileage by Region") +
guides(fill="none") +
stat_summary(fun.data = "mean_cl_normal", colour = "red", size = 1.5, fun.args = list(conf.int=.99)) +
stat_summary(geom="point", fun=mean, color="blue") +
theme_bw() +
scale_y_continuous(labels = scales::label_dollar())
# create model objects
Income_aov <- aov(MedianIncome ~ RegionName, medianincome_Region)
class(Income_aov)
## [1] "aov" "lm"
names(Income_aov)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "contrasts" "xlevels" "call" "terms"
## [13] "model"
summary(Income_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## RegionName 3 1.026e+09 341878315 3.489 0.0228 *
## Residuals 47 4.605e+09 97976327
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Income_lm <- lm(MedianIncome ~ RegionName, medianincome_Region)
summary(Income_lm)
##
## Call:
## lm(formula = MedianIncome ~ RegionName, data = medianincome_Region)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18082 -6304 -1892 6433 27849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59750 2857 20.911 <2e-16 ***
## RegionNameNortheast 9404 4365 2.155 0.0364 *
## RegionNameSouth -2396 3732 -0.642 0.5240
## RegionNameWest 5500 3962 1.388 0.1716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9898 on 47 degrees of freedom
## Multiple R-squared: 0.1822, Adjusted R-squared: 0.13
## F-statistic: 3.489 on 3 and 47 DF, p-value: 0.02282
The overall analysis of variance table returns a p-value of 0.0228, so we reject the null hypothesis at the α = 0.05 significance level (95% confidence). A one-way ANOVA showed a significant effect of Region on Median Income, F(3,47)=3.489, p<.05. This suggests that there are at least two regions whose mean median household income values differ significantly.
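As a quick sanity check on that F statistic, compare it to the critical value of the F distribution at α = 0.05 with 3 and 47 degrees of freedom:
R Code:
qf(0.95, df1 = 3, df2 = 47) # approximately 2.80; our F = 3.489 exceeds it, so we reject the null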
Alternatively, we can try the Kruskal-Wallis non-parametric test.
R Code:
kruskal.test(MedianIncome ~ RegionName, medianincome_Region)
##
## Kruskal-Wallis rank sum test
##
## data: MedianIncome by RegionName
## Kruskal-Wallis chi-squared = 11.36, df = 3, p-value = 0.009928
The p-value of 0.009928 indicates that there are significant differences between the treatment groups.
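The same style of sanity check works against the chi-squared distribution with 3 degrees of freedom:
R Code:
qchisq(0.95, df = 3) # approximately 7.81; the observed 11.36 exceeds it, consistent with the small p-value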
But which of the 6 possible pairs of Regions (4 regions taken 2 at a time) are different? Let’s run the pairwise tests.
R Code:
# pairwise.t.test(medianincome_Region$MedianIncome, medianincome_Region$RegionName, p.adjust.method = "none")
lsr::posthocPairwiseT(Income_aov,p.adjust.method = "none")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: MedianIncome and RegionName
##
## Midwest Northeast South
## Northeast 0.0364 - -
## South 0.5240 0.0058 -
## West 0.1716 0.3677 0.0355
##
## P value adjustment method: none
With no p-value adjustments, Northeast + South are different at α = 0.01, while Northeast + Midwest and Midwest + South are different at α = 0.05; the remaining 3 pairs are not significantly different. But we need to account for making these multiple simultaneous comparisons (akin to rolling the dice multiple times; see the quick calculation below) and need to run a better test (or tests).
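To see why the adjustment matters, here is a back-of-the-envelope family-wise error rate for 6 comparisons each run at α = 0.05 (treating them as independent, which they are not, so this is only an approximation):
R Code:
1 - (1 - 0.05)^6 # about 0.265, so the chance of at least one false positive is far above the nominal 0.05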
Let’s look at Tukey’s Honestly Significant Difference (HSD):
R Code:
TukeyHSD(Income_aov, conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = MedianIncome ~ RegionName, data = medianincome_Region)
##
## $RegionName
## diff lwr upr p adj
## Northeast-Midwest 9404.111 -2220.890 21029.1120 0.1511736
## South-Midwest -2395.706 -12335.540 7544.1283 0.9178251
## West-Midwest 5500.538 -5053.117 16054.1943 0.5129279
## South-Northeast -11799.817 -22667.480 -932.1535 0.0285604
## West-Northeast -3903.573 -15335.347 7528.2014 0.7998888
## West-South 7896.244 -1816.897 17609.3858 0.1480457
TukeyHSD(Income_aov, conf.level = 0.99)
## Tukey multiple comparisons of means
## 99% family-wise confidence level
##
## Fit: aov(formula = MedianIncome ~ RegionName, data = medianincome_Region)
##
## $RegionName
## diff lwr upr p adj
## Northeast-Midwest 9404.111 -4946.529 23754.751 0.1511736
## South-Midwest -2395.706 -14666.069 9874.657 0.9178251
## West-Midwest 5500.538 -7527.565 18528.642 0.5129279
## South-Northeast -11799.817 -25215.551 1615.917 0.0285604
## West-Northeast -3903.573 -18015.681 10208.536 0.7998888
## West-South 7896.244 -4094.275 19886.763 0.1480457
R Code:
# from https://www.r-bloggers.com/2017/09/oneway-anova-explanation-and-example-in-r/
# def.par <- par(no.readonly = TRUE) # save default, for resetting...
#par(def.par) # reset to default
# par()$oma # current margins
# par(oma=c(0,0,0,0)) # put the margins back
def.par1 <- par(no.readonly = TRUE) # save default
par(oma=c(0,4,0,0)) # add 4 lines of outer margin on the left for the pair labels
TukeyHSD(Income_aov, conf.level = 0.95) %>%
plot(las=1, col = "red", cex.lab=1)
par(def.par1) # reset to default
# only 1 pair does not cross zero -- Northeast + South
# Improve upon default plot with ggplot errorbar of 95% CI Tukey HSD
TukeyHSD_Income_aov <- as.data.frame(TukeyHSD(Income_aov, conf.level = 0.95)["RegionName"]) %>%
rename(Difference = RegionName.diff,
Lower = RegionName.lwr,
Upper = RegionName.upr,
PValAdj = RegionName.p.adj)
TukeyHSD_Income_aov$RegionPair <- row.names(TukeyHSD_Income_aov)
TukeyHSD_Income_aov %>%
mutate(Significance = case_when(
PValAdj < .0001 ~ "pval < .0001",
PValAdj < .001 ~ "pval < .001",
PValAdj < .01 ~ "pval < .01",
PValAdj < .05 ~ "pval < .05",
TRUE ~ "pval not signif."),
RegionPair = fct_rev(factor(RegionPair))
) %>%
ggplot(aes(RegionPair, Difference, color=Significance)) +
geom_point() +
geom_hline(yintercept=0, linetype="dashed", color="blue") +
geom_errorbar(aes(ymin = Lower, ymax = Upper)) +
scale_y_continuous(labels = scales::label_dollar()) +
labs(title="Tukey's Honestly Significant Difference Test 95% CI",
subtitle="Median Household Income in the Past 12 Months, 2018 ACS",
x="Region Pairs",
y="Difference in mean levels") +
theme_bw() +
coord_flip()
We’re content enough using Tukey’s HSD, but let’s try other multiple pairwise-comparison methods between groups and see where they land (subjectively ordered from less preferred to more preferred):
Benjamini & Hochberg test:
R Code:
pairwise.wilcox.test(medianincome_Region$MedianIncome,
medianincome_Region$RegionName,
p.adjust.method = "BH") # Benjamini & Hochberg
##
## Pairwise comparisons using Wilcoxon rank sum exact test
##
## data: medianincome_Region$MedianIncome and medianincome_Region$RegionName
##
## Midwest Northeast South
## Northeast 0.044 - -
## South 0.109 0.044 -
## West 0.202 0.292 0.070
##
## P value adjustment method: BH
Northeast + South and Northeast + Midwest are different at α = 0.05; the remaining 4 pairs are not significantly different.
Bonferroni correction test:
R Code:
pairwise.t.test(medianincome_Region$MedianIncome,
medianincome_Region$RegionName,
p.adjust.method = "bonferroni") # Bonferroni correction
##
## Pairwise comparisons using t tests with pooled SD
##
## data: medianincome_Region$MedianIncome and medianincome_Region$RegionName
##
## Midwest Northeast South
## Northeast 0.218 - -
## South 1.000 0.035 -
## West 1.000 1.000 0.213
##
## P value adjustment method: bonferroni
Only Northeast + South are different at α = 0.05.
Holm method or Bonferroni–Holm method test:
R Code:
pairwise.t.test(medianincome_Region$MedianIncome,
medianincome_Region$RegionName,
p.adjust.method = "holm") # Sture Holm or Bonferroni–Holm
##
## Pairwise comparisons using t tests with pooled SD
##
## data: medianincome_Region$MedianIncome and medianincome_Region$RegionName
##
## Midwest Northeast South
## Northeast 0.177 - -
## South 0.735 0.035 -
## West 0.515 0.735 0.177
##
## P value adjustment method: holm
# lsr::posthocPairwiseT(Income_aov,p.adjust.method = "holm")
# lsr::posthocPairwiseT(Income_aov) # holm is default
Again, only Northeast + South are different at α = 0.05.
Now, let’s look at some plots:
R Code:
psych::describe(residuals(object = Income_aov ))
## vars n mean sd median trimmed mad min max range
## X1 1 51 0 9596.76 -1892.29 -699.66 9424.62 -18081.54 27848.71 45930.24
## skew kurtosis se
## X1 0.76 0.46 1343.81
def.par1 <- par(no.readonly = TRUE) # save default
par(mfrow = c(2, 2))
# plot(Income_aov, ask=FALSE)
# Residuals vs. Fitted
plot(Income_aov, 1)
# Normal Q-Q - not quite normal?
plot(Income_aov, 2)
# Scale-Location
plot(Income_aov, 3)
# Cook's Distance
plot(Income_aov, 4)
# # Residuals vs. Leverage
# plot(Income_aov, 5)
# # Cook's Distance vs. Leverage
# plot(Income_aov, 6)
par(def.par1) # reset to default
# histogram of the residuals
# hist(x = residuals(Income_aov))
Income_aov_resids <- residuals(Income_aov)
hist(x = Income_aov_resids, prob=TRUE,
main = "",
xlab = "Histogram: Median Income ~ Region residuals")
curve(dnorm(x, mean=mean(Income_aov_resids),
sd=sd(Income_aov_resids)),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
R Code:
shapiro.test(x = residuals(object = Income_aov ))
##
## Shapiro-Wilk normality test
##
## data: residuals(object = Income_aov)
## W = 0.95295, p-value = 0.04189
The p-value of 0.04189 is less than α = 0.05, which means the data may NOT be normal, since here we want to NOT reject the null. Let’s perform the other test:
Kolmogorov-Smirnov test against a normal distribution.
R Code:
ks.test(residuals(object = Income_aov), "pnorm",
mean(residuals(object = Income_aov)),
sd(residuals(object = Income_aov)))
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(object = Income_aov)
## D = 0.12292, p-value = 0.3922
## alternative hypothesis: two-sided
Here we want to NOT reject the null, and the p-value of 0.3922 gives no reason to suspect our data are significantly non-normal. However, this contradicts the result we got from Shapiro-Wilk earlier.
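One plausible reason for the disagreement: when the mean and standard deviation are estimated from the same data, ks.test is known to produce p-values that are biased upward. The Lilliefors variant corrects for this; a sketch, assuming the nortest package is installed:
R Code:
# Lilliefors-corrected Kolmogorov-Smirnov test of normality
nortest::lillie.test(residuals(Income_aov))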
Before we forget, let’s calculate the Effect Size:
R Code:
# Effect size is how big the differences are; for Oneway ANOVA the appropriate measure of effect size is eta squared (η2)
(eta_squared <- var(predict(Income_aov)) / var(medianincome_Region$MedianIncome))
## [1] 0.1821563
# 0.1821563
# or you can use
# lsr::etaSquared(Income_aov)
R Code:
bartlett.test(MedianIncome ~ RegionName, medianincome_Region) # yet another way of testing
##
## Bartlett test of homogeneity of variances
##
## data: MedianIncome by RegionName
## Bartlett's K-squared = 9.683, df = 3, p-value = 0.02146
# 0.02146
R Code:
# leveneTest in R defaults to a Brown-Forsythe test; to get a true Levene test you must specify center = mean
# car::leveneTest(Income_aov, center = mean)
# 0.03233
# retest using regular dataset (not model) has same results
# car::leveneTest(MedianIncome ~ RegionName, medianincome_Region, center = mean)
# equivalent to absolute deviations from group means - SAS PROC GLM hovtest=levene(type=abs), which is not the default
car::leveneTest(residuals(Income_lm) ~ factor(medianincome_Region$RegionName), center = mean)
## Levene's Test for Homogeneity of Variance (center = mean)
## Df F value Pr(>F)
## group 3 3.1826 0.03233 *
## 47
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
To replicate SAS’s default Levene test, which uses squared deviations, we run anova on the lm object with the residuals squared.
R Code:
# leveneTest in R must specify center = mean and uses absolute deviations from group means; to get squared deviations from group means modify the underlying data before passing it to the anova function
anova(lm(residuals(Income_lm)^2 ~ medianincome_Region$RegionName))
## Analysis of Variance Table
##
## Response: residuals(Income_lm)^2
## Df Sum Sq Mean Sq F value Pr(>F)
## medianincome_Region$RegionName 3 1.1323e+17 3.7742e+16 1.8323 0.1542
## Residuals 47 9.6810e+17 2.0598e+16
# 0.1542
R Code:
car::leveneTest(Income_aov) # technically a Brown-Forsythe test
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 3 1.5919 0.2039
## 47
# 0.2039
# car::leveneTest(Income_aov, , center = median)
With the p-value of the Bartlett test falling below the significance level, the two versions of the Levene test falling on either side depending on the choice of absolute or squared deviations, and the Brown-Forsythe test showing a value above the significance level, it is not clear whether we should reject the null hypothesis of equal variances. We may have reason to suspect that the group variances are not homogeneous.
Let us try Welch’s ANOVA where we can drop the assumption that all populations have equal variances:
R Code:
oneway.test(MedianIncome ~ RegionName, medianincome_Region) # not assuming equal variances
##
## One-way analysis of means (not assuming equal variances)
##
## data: MedianIncome and RegionName
## F = 3.5881, num df = 3.000, denom df = 22.694, p-value = 0.02937
# 0.02937
R Code:
oneway.test(MedianIncome ~ RegionName, medianincome_Region, var.equal = TRUE) # assume equal variances
##
## One-way analysis of means
##
## data: MedianIncome and RegionName
## F = 3.4894, num df = 3, denom df = 47, p-value = 0.02282
# 0.02282
# same as the calculation earlier
summary(Income_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## RegionName 3 1.026e+09 341878315 3.489 0.0228 *
## Residuals 47 4.605e+09 97976327
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 0.0228
R Code:
state_combined <- read_excel("stateinfo_combined.xlsx") %>%
mutate(TotalPopulation = as.integer(TotalPopulation))
glimpse(state_combined)
## Rows: 51
## Columns: 10
## $ GEO_ID <chr> "0400000US08", "0400000US18", "0400000US~
## $ State <chr> "Colorado", "Indiana", "Kentucky", "Loui~
## $ Division <chr> "Mountain", "East North Central", "East ~
## $ Region <chr> "West", "Midwest", "South", "South", "Mi~
## $ MedianIncome <dbl> 71953, 55746, 50247, 47905, 65030, 59955~
## $ MeanHoursWorked <dbl> 39.2, 39.0, 38.9, 39.7, 38.6, 38.9, 38.6~
## $ TotalPopulation <int> 5695564, 6691878, 4468402, 4659978, 1274~
## $ MedianAge <dbl> 36.9, 37.8, 39.1, 37.3, 38.3, 38.1, 43.1~
## $ MedianCurrentMarriageDuration <dbl> 17.7, 19.8, 19.8, 19.7, 20.5, 21.8, 22.2~
## $ MedianMonthlyHousingCosts <dbl> 1335, 838, 776, 806, 1109, 839, 1314, 70~
R Code:
pcc_show <- function(xvec, xlab) {
pccInc_Calc <- cor.test(xvec,
state_combined$MedianIncome,method = "pearson")
xoffset <- min(xvec) + sd(xvec)
Ncount <- length(xvec)
pvaltxt <- case_when(
pccInc_Calc$p.value < .0001 ~ "pval < .0001",
pccInc_Calc$p.value < .001 ~ "pval < .001",
pccInc_Calc$p.value < .01 ~ "pval < .01",
pccInc_Calc$p.value < .05 ~ "pval < .05",
TRUE ~ "not signif. pval")
state_combined %>%
ggplot(aes(x= xvec,
y= MedianIncome)) +
geom_point(alpha= 0.5, color="red") +
scale_x_continuous(labels = scales::label_comma()) +
scale_y_continuous(labels = scales::label_dollar(),
limits = c(30000, 90000)) +
geom_smooth(method= lm, se= FALSE) +
labs(title=paste0("Scatter Plot: Median Household Income vs. ",xlab),
x= xlab,
y= "Median Household Income") +
geom_text(mapping=aes(x = xoffset, y = 35000,
label = paste0(
"Observations = ",Ncount,"\n",
"Pearson Corr. = ",round(pccInc_Calc$estimate,4),"\n",
" ",pvaltxt),
)) +
theme_bw()
}
# pcc_show(xvec, xlab)
pcc_show(state_combined$MeanHoursWorked, "Mean Hours Worked")
## `geom_smooth()` using formula 'y ~ x'
pcc_show(state_combined$TotalPopulation, "Total Population")
## `geom_smooth()` using formula 'y ~ x'
pcc_show(state_combined$MedianAge, "Median Age")
## `geom_smooth()` using formula 'y ~ x'
pcc_show(state_combined$MedianCurrentMarriageDuration, "Median Current Marriage Duration")
## `geom_smooth()` using formula 'y ~ x'
pcc_show(state_combined$MedianMonthlyHousingCosts, "Median Monthly Housing Costs")
## `geom_smooth()` using formula 'y ~ x'
cor.test(state_combined$MedianIncome, state_combined$MedianMonthlyHousingCosts, method = "pearson", conf.level = 0.95)
##
## Pearson's product-moment correlation
##
## data: state_combined$MedianIncome and state_combined$MedianMonthlyHousingCosts
## t = 18.442, df = 49, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8881712 0.9625128
## sample estimates:
## cor
## 0.934919
R Code:
# ANOVA table
Income_HousingCosts_aov <- aov(MedianIncome ~ MedianMonthlyHousingCosts, state_combined)
# summary(Income_HousingCosts_aov)
Income_HousingCosts_aov_df <- broom::tidy(Income_HousingCosts_aov) %>%
mutate(Significance = case_when(
p.value < .0001 ~ "pval < .0001",
p.value < .001 ~ "pval < .001",
p.value < .01 ~ "pval < .01",
p.value < .05 ~ "pval < .05",
is.na(p.value) ~ " ",
TRUE ~ "pval not signif."))
Income_HousingCosts_aov_df %>%
kable(align=c("lrrrrrrl"), digits=2, row.names = FALSE,
caption="ANOVA: Median Household Income ~ MedianMonthlyHousingCosts") %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
| term | df | sumsq | meansq | statistic | p.value | Significance |
|---|---|---|---|---|---|---|
| MedianMonthlyHousingCosts | 1 | 4921490297 | 4921490297 | 340.12 | 0 | pval < .0001 |
| Residuals | 49 | 709032041 | 14470042 | NA | NA | |
# use standard lm to create model
Income_HousingCosts_lm <-lm(MedianIncome ~ MedianMonthlyHousingCosts, state_combined)
# summary(Income_HousingCosts_lm)
Income_HousingCosts_lm_df <- cbind(broom::tidy(Income_HousingCosts_lm),confint(Income_HousingCosts_lm)) %>%
mutate(Significance = case_when(
p.value < .0001 ~ "pval < .0001",
p.value < .001 ~ "pval < .001",
p.value < .01 ~ "pval < .01",
p.value < .05 ~ "pval < .05",
is.na(p.value) ~ " ",
TRUE ~ "pval not signif."))
glance(Income_HousingCosts_lm) %>%
mutate(Significance = case_when(
p.value < .0001 ~ "pval < .0001",
p.value < .001 ~ "pval < .001",
p.value < .01 ~ "pval < .01",
p.value < .05 ~ "pval < .05",
is.na(p.value) ~ " ",
TRUE ~ "pval not signif.")) %>%
kable(align=c("llrrrrrrrrrl"), digits=3, row.names = FALSE,
caption="Linear Regression: Median Household Income ~ MedianMonthlyHousingCosts") %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs | Significance |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.874 | 0.872 | 3803.951 | 340.116 | 0 | 1 | -491.779 | 989.559 | 995.354 | 709032041 | 49 | 51 | pval < .0001 |
Income_HousingCosts_lm_df %>%
kable(align=c("lrrrrrrl"), digits=3, row.names = FALSE,
caption="Parameter Estimates: Median Household Income ~ MedianMonthlyHousingCosts") %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
| term | estimate | std.error | statistic | p.value | 2.5 % | 97.5 % | Significance |
|---|---|---|---|---|---|---|---|
| (Intercept) | 23954.107 | 2131.320 | 11.239 | 0 | 19671.059 | 28237.156 | pval < .0001 |
| MedianMonthlyHousingCosts | 35.976 | 1.951 | 18.442 | 0 | 32.056 | 39.896 | pval < .0001 |
R Code:
augment(Income_HousingCosts_lm) %>%
rename(Predicted = ".fitted") %>%
ggplot(aes(x = Predicted, y = MedianIncome)) +
geom_point(pch=1, color="blue") +
geom_abline(intercept = 0, slope = 1, color = "red", size = 1) +
scale_x_continuous(labels = scales::label_dollar()) +
scale_y_continuous(labels = scales::label_dollar()) +
labs(title="Scatter Plot: Observed by Predicted for Median Income",
x="Predicted Income",
y="Observed Income") +
theme_bw()
#
def.par1 <- par(no.readonly = TRUE) # save default
par(mfrow = c(2, 2))
# Residuals vs. Fitted
plot(Income_HousingCosts_lm, 1)
# Normal Q-Q
plot(Income_HousingCosts_lm, 2)
# Scale-Location
plot(Income_HousingCosts_lm, 3)
# Cook's Distance
plot(Income_HousingCosts_lm, 4)
# # Residuals vs. Leverage
# plot(Income_HousingCosts_lm, 5)
# # Cook's Distance vs. Leverage
# plot(Income_HousingCosts_lm, 6)
par(def.par1) # reset to default
# histogram of the residuals
Income_HousingC_lm_resids <- residuals(Income_HousingCosts_lm)
hist(x = Income_HousingC_lm_resids, prob=TRUE,
main = "",
xlab = "Histogram: Income ~ Housing Costs residuals")
curve(dnorm(x, mean=mean(Income_HousingC_lm_resids),
sd=sd(Income_HousingC_lm_resids)),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
R Code:
augment(Income_HousingCosts_lm) %>%
rename(Residual = ".resid") %>%
ggplot(aes(x = MedianMonthlyHousingCosts, y = Residual)) +
geom_point(pch=1, color="blue") +
geom_abline(intercept = 0, slope = 0, color = "red", size = 1) +
scale_x_continuous(labels = scales::label_dollar()) +
scale_y_continuous(labels = scales::label_dollar()) +
labs(title="Scatter Plot: Residuals for Median Income",
x="Median Monthly Housing Costs",
y="Residual") +
theme_bw()
# Calculate MSE
MSE <- sum(Income_HousingCosts_lm$residuals^2) / Income_HousingCosts_lm$df
# Re-predict to add upr and lwr 95% prediction limits
cbind(augment(Income_HousingCosts_lm), predict(Income_HousingCosts_lm, interval = "predict")) %>%
ggplot(aes(x = MedianMonthlyHousingCosts, y = MedianIncome)) +
geom_point(pch=1, color="blue") +
geom_smooth(method=lm) +
scale_x_continuous(labels = scales::label_dollar()) +
scale_y_continuous(labels = scales::label_dollar()) +
labs(title="Fit Plot for Median Income",
subtitle="95% Confidence Intervals and Prediction Limits",
x="Median Monthly Housing Costs",
y="Median Housing Income") +
geom_line(aes(y = lwr), col = "red", linetype = "dashed") +
geom_line(aes(y = upr), col = "red", linetype = "dashed") +
geom_text(mapping=aes(x = 1550, y = 50000,
label = paste0(
"Observations = ", length(Income_HousingCosts_lm$residuals), "\n",
"Parameters = ", length(Income_HousingCosts_lm$coefficients), "\n",
"Error DF = ", Income_HousingCosts_lm$df, "\n",
"MSE = ", round(MSE, 0), "\n",
"R-Square = ", round(glance(Income_HousingCosts_lm)$r.squared,4), "\n",
"Adj R-Square = ", round(glance(Income_HousingCosts_lm)$adj.r.squared,4), "\n"
))) +
theme_bw()
## Warning in predict.lm(Income_HousingCosts_lm, interval = "predict"): predictions on current data refer to _future_ responses
## `geom_smooth()` using formula 'y ~ x'
R Code:
# To avoid manual CSV downloads and imports, we can pull census data using tidycensus.
# inventory of acs1 fields for 2018
# vars_acs1_2018 <- load_variables(2018, "acs1")
# write.csv(vars_acs1_2018,"vars_acs1_2018.csv")
# vars_acs1_2018p <- load_variables(2018, "acs1/profile")
# write.csv(vars_acs1_2018p,"vars_acs1_2018p.csv")
# Obtain values using API and tidycensus
st_acs_2018_pull <- get_acs(
# cache_table = TRUE,
survey = "acs1",
year = 2018,
geography = "state",
variables = c(
medianhomevalue = "B25077_001",
totalpopulation = "B01003_001",
B06009_002 = "B06009_002",
B06009_003 = "B06009_003",
B06009_004 = "B06009_004",
B06009_005 = "B06009_005",
B06009_006 = "B06009_006",
medianincome = "B19013_001"
)
)
glimpse(st_acs_2018_pull)
st_acs_2010_pull <- get_acs(
# cache_table = TRUE,
survey = "acs1",
year = 2010,
geography = "state",
variables = c(
totalpop_2010 = "B01003_001",
medianinc_2010 = "B19013_001"
)
)
glimpse(st_acs_2010_pull)
st_acs_2018_pull <- rbind(st_acs_2018_pull, st_acs_2010_pull)
rm(st_acs_2010_pull)
glimpse(st_acs_2018_pull)
# write.csv(st_acs_2018_pull,"st_acs_2018_pull.csv")