http://factfinder.census.gov/faces/nav/jsf/pages/guided_search.xhtml
The Gini index is an internationally recognized measure of income dispersion within a specified geographic area. Income inequality has been a widely discussed topic in recent years, and the Gini index is one of the key measures used to quantify it.
# Import the untidy regional GINI extract.
# - col.names overrides the names taken from the file's header line; because
#   read.csv() sanitises names by default, "US Overall" ends up as the column
#   `US.Overall`.
# - Empty fields are read as NA and blank lines are skipped.
# - Text columns stay character rather than being converted to factors.
# NOTE(review): the path is absolute and machine-specific - confirm it
# resolves before running this anywhere else.
l <- read.csv(
  "/Users/scottkarr/IS607Spring2016/project2/more/GINI-2014-Region-untidy.csv",
  col.names = c(
    "Quintile", "West", "South", "Midwest", "Northeast", "US Overall"
  ),
  sep = ",",
  na.strings = "",
  blank.lines.skip = TRUE,
  stringsAsFactors = FALSE
)
# Working copy of the import; `l` is left untouched.
df <- data.frame(l)
# Keep every row of the import except rows 1 and 7.
# NOTE(review): per the original author these two rows are extraneous /
# derived values that can be recomputed from the raw rows - confirm against
# the CSV layout, which is not visible here.
df <- df[setdiff(seq_len(nrow(df)), c(1, 7)), ]
# Reshape from wide (one column per region) to long (one row per
# Region/Quintile pair): gather() stacks every column except Quintile into
# the key column `Region` and the value column `Gini`.
# The columns are then put in Region, Quintile, Gini order and the rows are
# sorted on those same keys. One arrange() is sufficient: it orders on every
# column of the result, so an earlier sort could not change the outcome.
df_tidy <- df %>%
  gather(Region, Gini, -Quintile) %>%
  select(Region, Quintile, Gini) %>%
  arrange(Region, Quintile, Gini)
# Render the tidy data as a table, requesting left alignment ("l").
df_tidy %>%
  kable(align = "l")
Region | Quintile | Gini |
---|---|---|
Midwest | 1st quintile | 0.08 |
Midwest | 2nd qunitile | 0.13 |
Midwest | 3rd quintile | 0.19 |
Midwest | 4th quintile | 0.29 |
Midwest | 5th quintile | 0.31 |
Northeast | 1st quintile | 0.12 |
Northeast | 2nd qunitile | 0.18 |
Northeast | 3rd quintile | 0.24 |
Northeast | 4th quintile | 0.32 |
Northeast | 5th quintile | 0.13 |
South | 1st quintile | 0.32 |
South | 2nd qunitile | 0.25 |
South | 3rd quintile | 0.18 |
South | 4th quintile | 0.14 |
South | 5th quintile | 0.11 |
US.Overall | 1st quintile | 0.20 |
US.Overall | 2nd qunitile | 0.20 |
US.Overall | 3rd quintile | 0.20 |
US.Overall | 4th quintile | 0.20 |
US.Overall | 5th quintile | 0.20 |
West | 1st quintile | 0.12 |
West | 2nd qunitile | 0.21 |
West | 3rd quintile | 0.22 |
West | 4th quintile | 0.19 |
West | 5th quintile | 0.26 |
# Group the tidy rows by Region, then reduce each group to one row holding
# the mean and the standard deviation of its Gini values.
df_tidy_grouped <- df_tidy %>%
  group_by(Region)
df_stats <- df_tidy_grouped %>%
  summarise(
    mean_gini = mean(Gini),
    std_gini = sd(Gini)
  )
# Histogram of the Gini values across all rows of df_tidy (every region and
# quintile pooled together). No bin count or width is supplied, so ggplot2
# falls back to its default and reports that choice when the plot is drawn.
ggplot(df_tidy, aes(x = Gini)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatterplot of Gini (y) against Quintile (x), drawn as one panel per Region.
ggplot(df_tidy, aes(x = Quintile, y = Gini)) +
  facet_wrap(~ Region) +
  geom_point()