Summary:

We have started this assignment with MidWest Region Dataset which is focused on the population, economic and education level of all the 5 states. Goal is to analyze the data of Midwest Region demography to answer few basic questions related with education, population and poverty,We have used R, SQL and Data Science methodologies to provide analysis as shown below covering:

Data Tranformation with the use of sqldf, ggplot2, reshape2 and plyr packages.

To use the data effectively, we did some data manipulation as creation of another table stateView which provides exclusive data of State and also extended teh orginal Midwest Data by adding an extra column of state_longname which can provide complete name of the states in midwest. To perform all these data manipulation, we have leveraged sqldf package which comes very handy to get the relevant subset of data and use it further for plotting.

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(ggplot2)
library(reshape2)
library(plyr)


midwest <- read.csv(file="https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv", header=TRUE, sep=",") 

stateView<-sqldf(c("Drop table if exists state_View","CREATE table state_View
  (state_id varchar(4), state_shortname varchar(3),state_longname varchar(15), countyName varchar(10));",
  "Insert into state_View(state_shortname,countyName) Select state,county from midwest", 
  "Update state_View set state_longname='ILLINOIS' where state_View.state_shortname='IL'",
  "Update state_View set state_id='IL1' where state_View.state_shortname='IL'",

  "Update state_View set state_longname='INDIANA' where state_View.state_shortname='IN'",
  "Update state_View set state_id='IN2' where state_View.state_shortname='IN'",

  "Update state_View set state_longname='MICHIGAN' where state_View.state_shortname='MI'",
  "Update state_View set state_id='MI3' where state_View.state_shortname='MI'",

  "Update state_View set state_longname='OHIO' where state_View.state_shortname='OH'",
  "Update state_View set state_id='OH4' where state_View.state_shortname='OH'",

  "Update state_View set state_longname='WISCONSIN' where state_View.state_shortname='WI'",
  "Update state_View set state_id='WI5' where state_View.state_shortname='WI'",

  "Select * from state_View"))
## Loading required package: tcltk
midwest <- sqldf(c("Alter table midwest ADD state_idMW varchar(15)",

  "Update midwest set state_idMW='ILLINOIS' where midwest.state='IL'",
 "Update midwest set state_idMW='INDIANA' where midwest.state='IN'",
 "Update midwest set state_idMW='MICHIGAN' where midwest.state='MI'",
"Update midwest set state_idMW='OHIO' where midwest.state='OH'",
"Update midwest set state_idMW='WISCONSIN' where midwest.state='WI'",

  "Select * from midwest"))

race_pop<- sqldf(c("select sum(popwhite),sum(popblack),sum(popamerindian), sum(popasian), sum(popother),state_idMW
                   from midwest group by state_idMW"))
pop1<-sqldf(c("select sum(poptotal) As poptotal, state_idMW As state
                  from midwest group by state_idMW"))

Population by State.

Up

ggplot(data = pop1, aes(x = state, y = poptotal, fill = state)) + geom_histogram(stat="identity") + ggtitle("Midwest States Vs. Total Population") +
xlab("Midwest States") + ylab("Total Population")

Conclusion:

The graph represents Illinois as highest populated state and Wisconsin as the least populated state. This can further be analyzed per race in each state which will provide more insight on economic status of individuals.

Demographic by Race.

Up

names(race_pop) <- c( "White_Population", "African_American_Population", "Native_American_Population", "Asian_American_Population", "Population_Other_Races", "State")
midwest2 <- melt(race_pop,id.vars="State", variable.name = "Race", value.name = "Population_by_Race" )

midwest2s <- aggregate(Population_by_Race ~ (Race), data = midwest2, FUN = 'sum')
midwest2s
##                          Race Population_by_Race
## 1            White_Population           35764043
## 2 African_American_Population            4817436
## 3  Native_American_Population             149939
## 4   Asian_American_Population             572673
## 5      Population_Other_Races             704851
midwest2s <- ddply(midwest2s, .(Race), transform, pos=cumsum(Population_by_Race)-0.5*(Population_by_Race))
ggplot(data = midwest2s, aes(x = "", y = Population_by_Race, fill = Race)) + geom_bar(stat = "identity", color = 'black') + coord_polar(theta="y") +  guides(fill=guide_legend(override.aes=list(colour=NA))) + ggtitle("The Total Population The Midwest By Race") +
ylab("Total Population") 

The graph represents the demograph of each race in midwest. If we look at the chart we see that the majoritry of population in midwest is White American, and the minoity are the Native Americans

   #create a histogram of midwest by demographic of races.
s <- ggplot(data = midwest2, aes(x = State, y = Population_by_Race, fill = Race))
s + geom_histogram(stat = "identity", position = "dodge") + ggtitle("Midwest States And Total Population By Each Race") +
xlab("Midwest States") + ylab("Total Population By Each Race")

image

Conclusion:

The graph shows the demographic of different races in each state.

Percent of overall poplation per state.

Up

       # Create a sub set using R
mid2 <- read.csv(file="https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv", header=TRUE, sep=",") 
midwest.sub <- mid2[, c(4, 13:17)]
head(midwest.sub)
##   state percwhite  percblack percamerindan  percasian  percother
## 1    IL  96.71206  2.5752761     0.1482826 0.37675897 0.18762294
## 2    IL  66.38434 32.9004329     0.1788067 0.45172219 0.08469791
## 3    IL  96.57128  2.8617170     0.2334734 0.10673071 0.22680275
## 4    IL  95.25417  0.4122574     0.1493216 0.48691813 3.69733169
## 5    IL  90.19877  9.3728581     0.2398903 0.08567512 0.10281014
## 6    IL  98.51210  0.1401031     0.1821340 0.54640215 0.61925577
names(midwest.sub) <- c("State", "White_Population", "African_American_Population", "Native_American_Population", "Asian_American_Population", "Population_Other_Races")
m3 <- melt(midwest.sub,id.vars="State", variable.name = "Race", value.name = "Population_by_Race")
midwest3 <- aggregate(Population_by_Race ~ ((Race + State)), data = m3, FUN = 'sum')
midwest3 <- aggregate(Population_by_Race ~ ((Race + State)), data = m3, FUN = 'mean')
midwest3
##                           Race State Population_by_Race
## 1             White_Population    IL         94.9553048
## 2  African_American_Population    IL          3.6540936
## 3   Native_American_Population    IL          0.1738440
## 4    Asian_American_Population    IL          0.5638948
## 5       Population_Other_Races    IL          0.6528628
## 6             White_Population    IN         97.2071380
## 7  African_American_Population    IN          1.8903350
## 8   Native_American_Population    IN          0.2218422
## 9    Asian_American_Population    IN          0.3826576
## 10      Population_Other_Races    IN          0.2980273
## 11            White_Population    MI         94.4445679
## 12 African_American_Population    MI          3.0660734
## 13  Native_American_Population    MI          1.3652361
## 14   Asian_American_Population    MI          0.5068771
## 15      Population_Other_Races    MI          0.6172455
## 16            White_Population    OH         95.3978666
## 17 African_American_Population    OH          3.5145666
## 18  Native_American_Population    OH          0.1842380
## 19   Asian_American_Population    OH          0.4328035
## 20      Population_Other_Races    OH          0.4705253
## 21            White_Population    WI         95.7865216
## 22 African_American_Population    WI          0.8216085
## 23  Native_American_Population    WI          2.5203660
## 24   Asian_American_Population    WI          0.5562127
## 25      Population_Other_Races    WI          0.3152911
        # Create pie charts by each state.
midwest3 <- ddply(midwest3, .(State), transform, pos=cumsum(Population_by_Race)-0.5*(Population_by_Race));

ggplot(data = midwest3, aes(x = "", y = Population_by_Race, fill = Race)) + geom_bar(stat = "identity", color = 'black') + coord_polar(theta="y")  + guides(fill=guide_legend(override.aes=list(colour=NA)))+
facet_wrap(~State) + ggtitle("The Population Of Midwest By Race") +
ylab("Total Population By State") 

Here is a table to represent the data: image

Conclusion:

The pie charts shows us the percent of different races in each state.

Does education has impact on Poverty?

Up

p <- ggplot(data = midwest, aes(y = percbelowpoverty, x = percollege)) 
p + geom_point((aes(color = state))) + ggtitle("College Education Vs Total Poverty") + xlab("Percent College Educated") + ylab("Percentage of Total poverty")

The following summary table shows the percentage of population who are college educated and also the percentage of population living below poverty level in each of the Midwest states.

image

  # create a dot plot with correlation line.
p + geom_point(aes(color = state)) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) + facet_wrap(~state) + ggtitle("College Education Vs Total Proverty by Each Midwest State") + xlab("Percent College Educated") + ylab("Percentage of Total proverty") 

image

Conclusion:

Within each state, the correlation of college education and poverty level are inversely related. The analysis was done at county level within each state. With the highest correlation being in Ohio state: -0.5. Although correlation does not suggest causation, the trend is uniform across all 5 states.

BoxPlot that shows the outliers for the above analysis on Eductaion vs Poverty in each state

Up

The BoxPlot provides all outliers for education vs poverty in each state and this can further be analyzed to identify the gaps

  #Create box plot with outlliers.
p <- ggplot(data = midwest, aes(y = percbelowpoverty, x = percollege))
p + geom_boxplot(aes(color = state)) + facet_wrap(~state) + ggtitle("College Education Vs Total Poverty by Each Midwest State") +
xlab("Percent College Educated") + ylab("Percentage of Total poverty")

The table below examines the probability that a person to be college educated and living below poverty within each state. The calculation was simply the multiplication of percentage of college educated residents and percentage of residents below poverty. As we can see the highest percentage of outlier are in Michigan as shown by our boxplot in R.

image

Conclusion:

The college level education and poverty level are negatively correlated across 5 states in Midwest US. Although there are a few outlier counties which have higher college educated population but also higher poverty, the probability of these cases are lower than 0.02%.

Shiny Application for MidWest Demography Details

Up

To summarize all the analysis in simple query based application, we have tried a POC by developing a Shiny Application which can be integrated with any portal to provide insight on MidWest Demography. This can easily be extended to all the other possible FAQs for MidWest Region. Currently we have implemented the state wise demograpghic details around population and education level of the Midwest region.

Code for Shiny

library(shiny)
library(sqldf)
library(reshape2)
library(ggplot2)
library(gridExtra)
library(grid)
wd.datapath = paste0(getwd(),"/data")
wd.init = getwd()
setwd(wd.datapath)

midwest2 = read.csv("midwest.csv", header = TRUE)

midwestNew <- sqldf(c("Alter table midwest2 ADD state_idMW varchar(15)",
  
  "Update midwest2 set state_idMW='ILLINOIS' where midwest2.state='IL'",
  "Update midwest2 set state_idMW='INDIANA' where midwest2.state='IN'",
  "Update midwest2 set state_idMW='MICHIGAN' where midwest2.state='MI'",
  "Update midwest2 set state_idMW='OHIO' where midwest2.state='OH'",
  "Update midwest2 set state_idMW='WISCONSIN' where midwest2.state='WI'",
  
  "Select * from midwest2"))

setwd(wd.init)

# ui.R files for front end of shinyApp

library(shiny)
wd.datapath = paste0(getwd(),"/data")
wd.init = getwd()
setwd(wd.datapath)

midwest2 = read.csv("midwest.csv", header = TRUE)

setwd(wd.init)


#df.shiny = read.csv("C:/Users/sanjivek/Desktop/shine/data/midwest.csv")
#print(midwest2)

# Define UI for application that draws a histogram
shinyUI(fluidPage(
  
  # Application title
  titlePanel("Midwest State Demography Distribution"),
  
  # Sidebar with a slider input for the number of bins
  sidebarLayout(
    sidebarPanel(
      
      selectInput(inputId = "Stat", 
                  label = h4("Select Midwest State for Demographic details"),
                  choices = list("ILLINOIS", "MICHIGAN",
                                 "INDIANA", "WISCONSIN","OHIO"),
                  selected = "ILLINOIS")
    ),
    
    # Show a plot of the generated distribution
    mainPanel(
      #tabsetPanel(tabPanel("Main",plotOutput("distPlot", height = 1000, width = 1000))
      
      plotOutput("distPlot")
    ))
  )
)







# Define server logic required to draw a histogram
shinyServer(function(input, output) {
  
     output$distPlot <- renderPlot({
     
      
       popQuery <- sqldf(paste0( "select sum(popwhite),sum(popblack),sum(popamerindian), sum(popasian), sum(popother),state_idMW   from midwestNew 
         where state_idMW ='", input$Stat, "';" ) )
       print(popQuery)
      
       popQuery2<-sqldf(paste0( "select percpovertyknown,percollege,state_idMW   from midwestNew 
         where state_idMW ='", input$Stat, "';" ) )
       
       popQuery3 <- sqldf(paste0( "select avg(percwhite),avg(percblack),avg(percamerindan), avg(percasian),
                    avg(percother),state_idMW   from midwestNew 
                     where state_idMW ='", input$Stat, "';" ) )
       
       
      m <- melt(popQuery,id.vars="state_idMW", variable.name = "Race", value.name = "Population_by_Race")
       print(m)
       
       n <- melt(popQuery3,id.vars="state_idMW", variable.name = "Race", value.name = "Percentage_by_Population")
       print(n)
       
       
      w<-ggplot(data = m, aes(x=state_idMW ,y=Population_by_Race,fill=Race)) +  geom_histogram(stat="identity",position="dodge")
      
      p <- ggplot(data = popQuery2, aes(y = percbelowpoverty, x = percollege)) + geom_point((aes(color = state_idMW))) + ggtitle("College Education Vs Total Poverty") +
      xlab("Percent College Educated") + ylab("Percentage of Total poverty") 
      
 
      
      z <- ggplot(data = n, aes(x = "", y = Percentage_by_Population, fill = Race)) + 
        geom_bar(stat = "identity", color = 'black') + coord_polar(theta="y") + 
        guides(fill=guide_legend(override.aes=list(colour=NA)))
     
      
      pushViewport(viewport(layout = grid.layout(3, 1),width=0.75,height=1))
      
      print(w, vp = viewport(layout.pos.row = 1, layout.pos.col = 1 ))
      
      print(p, vp = viewport(layout.pos.row = 2, layout.pos.col = 1))
      
      print(z, vp = viewport(layout.pos.row = 3, layout.pos.col = 1))
      
     })
  })