Summary: In this final project we have worked on MidWest population data set and converted that into a shiny App with limited feature at present as – Demographic Lookup based on the state selected by the user We have deployed the shiny app in shinyapps.io and can be accessed from url:

https://ksanju0.shinyapps.io/MidwestDem

We have used R, SQL and Data Science methodologies to provide analysis as shown below covering: – Comparison of statewise population per race in midwest – Total population per race in Midwest – statewise Percent of population by race – Probability analysis of Education vs Poverty per state with Outlier

library(ggplot2)

midwest <- read.csv(file="https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv", header=TRUE, sep=",") 

library(sqldf)
## Warning: package 'sqldf' was built under R version 3.2.1
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.2.1
## Loading required package: proto
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.2.1
## Loading required package: DBI
stateView<-sqldf(c("Drop table if exists state_View","CREATE table state_View
  (state_id varchar(4), state_shortname varchar(3),state_longname varchar(15), countyName varchar(10));",
  "Insert into state_View(state_shortname,countyName) Select state,county from midwest", 
  "Update state_View set state_longname='ILLINOIS' where state_View.state_shortname='IL'",
  "Update state_View set state_id='IL1' where state_View.state_shortname='IL'",

  "Update state_View set state_longname='INDIANA' where state_View.state_shortname='IN'",
  "Update state_View set state_id='IN2' where state_View.state_shortname='IN'",

  "Update state_View set state_longname='MICHIGAN' where state_View.state_shortname='MI'",
  "Update state_View set state_id='MI3' where state_View.state_shortname='MI'",

  "Update state_View set state_longname='OHIO' where state_View.state_shortname='OH'",
  "Update state_View set state_id='OH4' where state_View.state_shortname='OH'",

  "Update state_View set state_longname='WISCONSIN' where state_View.state_shortname='WI'",
  "Update state_View set state_id='WI5' where state_View.state_shortname='WI'",

  "Select * from state_View"))
## Loading required package: tcltk
#write.csv(stateView, file = "stateviewimportData.csv")
head(stateView)
##   state_id state_shortname state_longname countyName
## 1      IL1              IL       ILLINOIS      ADAMS
## 2      IL1              IL       ILLINOIS  ALEXANDER
## 3      IL1              IL       ILLINOIS       BOND
## 4      IL1              IL       ILLINOIS      BOONE
## 5      IL1              IL       ILLINOIS      BROWN
## 6      IL1              IL       ILLINOIS     BUREAU
midwest <- sqldf(c("Alter table midwest ADD state_idMW varchar(15)",

  "Update midwest set state_idMW='ILLINOIS' where midwest.state='IL'",
 "Update midwest set state_idMW='INDIANA' where midwest.state='IN'",
 "Update midwest set state_idMW='MICHIGAN' where midwest.state='MI'",
"Update midwest set state_idMW='OHIO' where midwest.state='OH'",
"Update midwest set state_idMW='WISCONSIN' where midwest.state='WI'",

  "Select * from midwest"))

#write.csv(midwest, file = "MidwestimportData1.csv")
race_pop<- sqldf(c("select sum(popwhite),sum(popblack),sum(popamerindian), sum(popasian), sum(popother),state_idMW
                   from midwest group by state_idMW"))

print(race_pop)
##   sum(popwhite) sum(popblack) sum(popamerindian) sum(popasian)
## 1       8952978       1694273              21836        285311
## 2       5020700        432092              12720         37617
## 3       7756086       1291706              55638        104983
## 4       9521756       1154826              20358         91179
## 5       4512523        244539              39387         53583
##   sum(popother) state_idMW
## 1        476204   ILLINOIS
## 2         41030    INDIANA
## 3         86884   MICHIGAN
## 4         58996       OHIO
## 5         41737  WISCONSIN
#,m.popblack,m.popamerindian,m.popasian,m.popother

library(ggplot2)
library(reshape2)
race_pop2 <- race_pop
names(race_pop2) <- c( "White_Population", "African_American_Population", "Native_American_Population", "Asian_American_Population", "Population_Other_Races", "State")
m1 <- melt(race_pop2,id.vars="State", variable.name = "Race", value.name = "Population_by_Race" )
head(m1)
##       State                        Race Population_by_Race
## 1  ILLINOIS            White_Population            8952978
## 2   INDIANA            White_Population            5020700
## 3  MICHIGAN            White_Population            7756086
## 4      OHIO            White_Population            9521756
## 5 WISCONSIN            White_Population            4512523
## 6  ILLINOIS African_American_Population            1694273
midwest2 <- aggregate(Population_by_Race ~ (Race + State), data = m1, FUN = 'sum')
midwest2
##                           Race     State Population_by_Race
## 1             White_Population  ILLINOIS            8952978
## 2  African_American_Population  ILLINOIS            1694273
## 3   Native_American_Population  ILLINOIS              21836
## 4    Asian_American_Population  ILLINOIS             285311
## 5       Population_Other_Races  ILLINOIS             476204
## 6             White_Population   INDIANA            5020700
## 7  African_American_Population   INDIANA             432092
## 8   Native_American_Population   INDIANA              12720
## 9    Asian_American_Population   INDIANA              37617
## 10      Population_Other_Races   INDIANA              41030
## 11            White_Population  MICHIGAN            7756086
## 12 African_American_Population  MICHIGAN            1291706
## 13  Native_American_Population  MICHIGAN              55638
## 14   Asian_American_Population  MICHIGAN             104983
## 15      Population_Other_Races  MICHIGAN              86884
## 16            White_Population      OHIO            9521756
## 17 African_American_Population      OHIO            1154826
## 18  Native_American_Population      OHIO              20358
## 19   Asian_American_Population      OHIO              91179
## 20      Population_Other_Races      OHIO              58996
## 21            White_Population WISCONSIN            4512523
## 22 African_American_Population WISCONSIN             244539
## 23  Native_American_Population WISCONSIN              39387
## 24   Asian_American_Population WISCONSIN              53583
## 25      Population_Other_Races WISCONSIN              41737
s <- ggplot(data = midwest2, aes(x = State, y = Population_by_Race, fill = Race))
s + geom_histogram(stat = "identity", position = "dodge") + ggtitle("Midwest States And Total Population By Each Race") +
xlab("Midwest States") + ylab("Total Population By Each Race")

library(plyr)
midwest2s <- aggregate(Population_by_Race ~ (Race), data = midwest2, FUN = 'sum')
head(midwest2s)
##                          Race Population_by_Race
## 1            White_Population           35764043
## 2 African_American_Population            4817436
## 3  Native_American_Population             149939
## 4   Asian_American_Population             572673
## 5      Population_Other_Races             704851
midwest2s <- ddply(midwest2s, .(Race), transform, pos=cumsum(Population_by_Race)-0.5*(Population_by_Race))
ggplot(data = midwest2s, aes(x = "", y = Population_by_Race, fill = Race)) + geom_bar(stat = "identity", color = 'black') + coord_polar(theta="y") +  guides(fill=guide_legend(override.aes=list(colour=NA))) + ggtitle("The Total Population The Midwest By Race") +
ylab("Total Population") 

mid2 <- read.csv(file="https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv", header=TRUE, sep=",") 
midwest.sub <- mid2[, c(4, 13:17)]
head(midwest.sub)
##   state percwhite  percblack percamerindan  percasian  percother
## 1    IL  96.71206  2.5752761     0.1482826 0.37675897 0.18762294
## 2    IL  66.38434 32.9004329     0.1788067 0.45172219 0.08469791
## 3    IL  96.57128  2.8617170     0.2334734 0.10673071 0.22680275
## 4    IL  95.25417  0.4122574     0.1493216 0.48691813 3.69733169
## 5    IL  90.19877  9.3728581     0.2398903 0.08567512 0.10281014
## 6    IL  98.51210  0.1401031     0.1821340 0.54640215 0.61925577
names(midwest.sub) <- c("State", "White_Population", "African_American_Population", "Native_American_Population", "Asian_American_Population", "Population_Other_Races")
m3 <- melt(midwest.sub,id.vars="State", variable.name = "Race", value.name = "Population_by_Race")
midwest3 <- aggregate(Population_by_Race ~ ((Race + State)), data = m3, FUN = 'sum')
midwest3 <- aggregate(Population_by_Race ~ ((Race + State)), data = m3, FUN = 'mean')
midwest3
##                           Race State Population_by_Race
## 1             White_Population    IL         94.9553048
## 2  African_American_Population    IL          3.6540936
## 3   Native_American_Population    IL          0.1738440
## 4    Asian_American_Population    IL          0.5638948
## 5       Population_Other_Races    IL          0.6528628
## 6             White_Population    IN         97.2071380
## 7  African_American_Population    IN          1.8903350
## 8   Native_American_Population    IN          0.2218422
## 9    Asian_American_Population    IN          0.3826576
## 10      Population_Other_Races    IN          0.2980273
## 11            White_Population    MI         94.4445679
## 12 African_American_Population    MI          3.0660734
## 13  Native_American_Population    MI          1.3652361
## 14   Asian_American_Population    MI          0.5068771
## 15      Population_Other_Races    MI          0.6172455
## 16            White_Population    OH         95.3978666
## 17 African_American_Population    OH          3.5145666
## 18  Native_American_Population    OH          0.1842380
## 19   Asian_American_Population    OH          0.4328035
## 20      Population_Other_Races    OH          0.4705253
## 21            White_Population    WI         95.7865216
## 22 African_American_Population    WI          0.8216085
## 23  Native_American_Population    WI          2.5203660
## 24   Asian_American_Population    WI          0.5562127
## 25      Population_Other_Races    WI          0.3152911
midwest3 <- ddply(midwest3, .(State), transform, pos=cumsum(Population_by_Race)-0.5*(Population_by_Race));


ggplot(data = midwest3, aes(x = "", y = Population_by_Race, fill = Race)) + geom_bar(stat = "identity", color = 'black') + coord_polar(theta="y")  + guides(fill=guide_legend(override.aes=list(colour=NA)))+
facet_wrap(~State) + ggtitle("The Population Of Midwest By Race") +
ylab("Total Population By State") 

–BoxPlot to Represent the Outliers

p <- ggplot(data = midwest, aes(y = percpovertyknown, x = percollege))
p + geom_boxplot(aes(color = state)) + facet_wrap(~state) + ggtitle("College Education Vs Total Proverty by Each Midwest State") +
xlab("Percent College Educated") + ylab("Percentage of Total proverty")

The following summary table shows the percentage of population who are college educated and also the percentage of population living below poverty level in each of the Midwest states.

State Percent of College Educated Residents Count of county In Metro Percent of resident below poverty IL 18.78 102.00 13.07 IN 16.62 92.00 10.31 MI 19.42 83.00 14.22 OH 16.89 88.00 13.03 WI 20.01 72.00 11.89

Within each state, the correlation of college education and poverty level are inversely related. The analysis was done at county level within each state. With the highest correlation being in Ohio state: -0.5. Although correlation does not suggest causation, the trend is uniform across all 5 states.

County Level College Education and Poverty Correlation States Correlation WI -0.41804331 OH -0.508687811 IL -0.257362817 IN -0.201868508 MI -0.399198465

The table below examines the probability that a person to be college educated and living below poverty within each state. The calculation was simply the multiplication of percentage of college educated residents and percentage of residents below poverty. As we can see the highest percentage of outlier are in Michigan as shown by our boxplot in R. Probability of College educated Person Living below poverty
States Calculation Probability IL 18.7813.07 0.024568214 IN 16.6210.31 0.017152081 MI 19.4214.22 0.027626611 OH 16.8913.03 0.022013441 WI 20.01*11.89 0.023808893

Conclusion: Poverty level and Education is inversely correlated and although there are few outliers the probability of those cases are 0.02%