We have started this assignment with MidWest Region Dataset which is focused on the population, economic and education level of all the 5 states. Goal is to analyze the data of Midwest Region demography to answer few basic questions related with education, population and poverty,We have used R, SQL and Data Science methodologies to provide analysis as shown below covering:
Data Tranformation with the use of sqldf
, ggplot2
, reshape2
and plyr
packages.
To use the data effectively, we did some data manipulation as creation of another table stateView which provides exclusive data of State and also extended teh orginal Midwest Data by adding an extra column of state_longname which can provide complete name of the states in midwest. To perform all these data manipulation, we have leveraged sqldf package which comes very handy to get the relevant subset of data and use it further for plotting.
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
library(ggplot2)
library(reshape2)
library(plyr)
midwest <- read.csv(file="https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv", header=TRUE, sep=",")
stateView<-sqldf(c("Drop table if exists state_View","CREATE table state_View
(state_id varchar(4), state_shortname varchar(3),state_longname varchar(15), countyName varchar(10));",
"Insert into state_View(state_shortname,countyName) Select state,county from midwest",
"Update state_View set state_longname='ILLINOIS' where state_View.state_shortname='IL'",
"Update state_View set state_id='IL1' where state_View.state_shortname='IL'",
"Update state_View set state_longname='INDIANA' where state_View.state_shortname='IN'",
"Update state_View set state_id='IN2' where state_View.state_shortname='IN'",
"Update state_View set state_longname='MICHIGAN' where state_View.state_shortname='MI'",
"Update state_View set state_id='MI3' where state_View.state_shortname='MI'",
"Update state_View set state_longname='OHIO' where state_View.state_shortname='OH'",
"Update state_View set state_id='OH4' where state_View.state_shortname='OH'",
"Update state_View set state_longname='WISCONSIN' where state_View.state_shortname='WI'",
"Update state_View set state_id='WI5' where state_View.state_shortname='WI'",
"Select * from state_View"))
## Loading required package: tcltk
midwest <- sqldf(c("Alter table midwest ADD state_idMW varchar(15)",
"Update midwest set state_idMW='ILLINOIS' where midwest.state='IL'",
"Update midwest set state_idMW='INDIANA' where midwest.state='IN'",
"Update midwest set state_idMW='MICHIGAN' where midwest.state='MI'",
"Update midwest set state_idMW='OHIO' where midwest.state='OH'",
"Update midwest set state_idMW='WISCONSIN' where midwest.state='WI'",
"Select * from midwest"))
race_pop<- sqldf(c("select sum(popwhite),sum(popblack),sum(popamerindian), sum(popasian), sum(popother),state_idMW
from midwest group by state_idMW"))
pop1<-sqldf(c("select sum(poptotal) As poptotal, state_idMW As state
from midwest group by state_idMW"))
ggplot(data = pop1, aes(x = state, y = poptotal, fill = state)) + geom_histogram(stat="identity") + ggtitle("Midwest States Vs. Total Population") +
xlab("Midwest States") + ylab("Total Population")
The graph represents Illinois as highest populated state and Wisconsin as the least populated state. This can further be analyzed per race in each state which will provide more insight on economic status of individuals.
names(race_pop) <- c( "White_Population", "African_American_Population", "Native_American_Population", "Asian_American_Population", "Population_Other_Races", "State")
midwest2 <- melt(race_pop,id.vars="State", variable.name = "Race", value.name = "Population_by_Race" )
midwest2s <- aggregate(Population_by_Race ~ (Race), data = midwest2, FUN = 'sum')
midwest2s
## Race Population_by_Race
## 1 White_Population 35764043
## 2 African_American_Population 4817436
## 3 Native_American_Population 149939
## 4 Asian_American_Population 572673
## 5 Population_Other_Races 704851
midwest2s <- ddply(midwest2s, .(Race), transform, pos=cumsum(Population_by_Race)-0.5*(Population_by_Race))
ggplot(data = midwest2s, aes(x = "", y = Population_by_Race, fill = Race)) + geom_bar(stat = "identity", color = 'black') + coord_polar(theta="y") + guides(fill=guide_legend(override.aes=list(colour=NA))) + ggtitle("The Total Population The Midwest By Race") +
ylab("Total Population")
The graph represents the demograph of each race in midwest. If we look at the chart we see that the majoritry of population in midwest is White American, and the minoity are the Native Americans
#create a histogram of midwest by demographic of races.
s <- ggplot(data = midwest2, aes(x = State, y = Population_by_Race, fill = Race))
s + geom_histogram(stat = "identity", position = "dodge") + ggtitle("Midwest States And Total Population By Each Race") +
xlab("Midwest States") + ylab("Total Population By Each Race")
The graph shows the demographic of different races in each state.
# Create a sub set using R
mid2 <- read.csv(file="https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/ggplot2/midwest.csv", header=TRUE, sep=",")
midwest.sub <- mid2[, c(4, 13:17)]
head(midwest.sub)
## state percwhite percblack percamerindan percasian percother
## 1 IL 96.71206 2.5752761 0.1482826 0.37675897 0.18762294
## 2 IL 66.38434 32.9004329 0.1788067 0.45172219 0.08469791
## 3 IL 96.57128 2.8617170 0.2334734 0.10673071 0.22680275
## 4 IL 95.25417 0.4122574 0.1493216 0.48691813 3.69733169
## 5 IL 90.19877 9.3728581 0.2398903 0.08567512 0.10281014
## 6 IL 98.51210 0.1401031 0.1821340 0.54640215 0.61925577
names(midwest.sub) <- c("State", "White_Population", "African_American_Population", "Native_American_Population", "Asian_American_Population", "Population_Other_Races")
m3 <- melt(midwest.sub,id.vars="State", variable.name = "Race", value.name = "Population_by_Race")
midwest3 <- aggregate(Population_by_Race ~ ((Race + State)), data = m3, FUN = 'sum')
midwest3 <- aggregate(Population_by_Race ~ ((Race + State)), data = m3, FUN = 'mean')
midwest3
## Race State Population_by_Race
## 1 White_Population IL 94.9553048
## 2 African_American_Population IL 3.6540936
## 3 Native_American_Population IL 0.1738440
## 4 Asian_American_Population IL 0.5638948
## 5 Population_Other_Races IL 0.6528628
## 6 White_Population IN 97.2071380
## 7 African_American_Population IN 1.8903350
## 8 Native_American_Population IN 0.2218422
## 9 Asian_American_Population IN 0.3826576
## 10 Population_Other_Races IN 0.2980273
## 11 White_Population MI 94.4445679
## 12 African_American_Population MI 3.0660734
## 13 Native_American_Population MI 1.3652361
## 14 Asian_American_Population MI 0.5068771
## 15 Population_Other_Races MI 0.6172455
## 16 White_Population OH 95.3978666
## 17 African_American_Population OH 3.5145666
## 18 Native_American_Population OH 0.1842380
## 19 Asian_American_Population OH 0.4328035
## 20 Population_Other_Races OH 0.4705253
## 21 White_Population WI 95.7865216
## 22 African_American_Population WI 0.8216085
## 23 Native_American_Population WI 2.5203660
## 24 Asian_American_Population WI 0.5562127
## 25 Population_Other_Races WI 0.3152911
# Create pie charts by each state.
midwest3 <- ddply(midwest3, .(State), transform, pos=cumsum(Population_by_Race)-0.5*(Population_by_Race));
ggplot(data = midwest3, aes(x = "", y = Population_by_Race, fill = Race)) + geom_bar(stat = "identity", color = 'black') + coord_polar(theta="y") + guides(fill=guide_legend(override.aes=list(colour=NA)))+
facet_wrap(~State) + ggtitle("The Population Of Midwest By Race") +
ylab("Total Population By State")
Here is a table to represent the data:
The pie charts shows us the percent of different races in each state.
p <- ggplot(data = midwest, aes(y = percbelowpoverty, x = percollege))
p + geom_point((aes(color = state))) + ggtitle("College Education Vs Total Poverty") + xlab("Percent College Educated") + ylab("Percentage of Total poverty")
The following summary table shows the percentage of population who are college educated and also the percentage of population living below poverty level in each of the Midwest states.
# create a dot plot with correlation line.
p + geom_point(aes(color = state)) + geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) + facet_wrap(~state) + ggtitle("College Education Vs Total Proverty by Each Midwest State") + xlab("Percent College Educated") + ylab("Percentage of Total proverty")
Within each state, the correlation of college education and poverty level are inversely related. The analysis was done at county level within each state. With the highest correlation being in Ohio state: -0.5. Although correlation does not suggest causation, the trend is uniform across all 5 states.
The BoxPlot provides all outliers for education vs poverty in each state and this can further be analyzed to identify the gaps
#Create box plot with outlliers.
p <- ggplot(data = midwest, aes(y = percbelowpoverty, x = percollege))
p + geom_boxplot(aes(color = state)) + facet_wrap(~state) + ggtitle("College Education Vs Total Poverty by Each Midwest State") +
xlab("Percent College Educated") + ylab("Percentage of Total poverty")
The table below examines the probability that a person to be college educated and living below poverty within each state. The calculation was simply the multiplication of percentage of college educated residents and percentage of residents below poverty. As we can see the highest percentage of outlier are in Michigan as shown by our boxplot in R.
The college level education and poverty level are negatively correlated across 5 states in Midwest US. Although there are a few outlier counties which have higher college educated population but also higher poverty, the probability of these cases are lower than 0.02%.
To summarize all the analysis in simple query based application, we have tried a POC by developing a Shiny Application which can be integrated with any portal to provide insight on MidWest Demography. This can easily be extended to all the other possible FAQs for MidWest Region. Currently we have implemented the state wise demograpghic details around population and education level of the Midwest region.
library(shiny)
library(sqldf)
library(reshape2)
library(ggplot2)
library(gridExtra)
library(grid)
wd.datapath = paste0(getwd(),"/data")
wd.init = getwd()
setwd(wd.datapath)
midwest2 = read.csv("midwest.csv", header = TRUE)
midwestNew <- sqldf(c("Alter table midwest2 ADD state_idMW varchar(15)",
"Update midwest2 set state_idMW='ILLINOIS' where midwest2.state='IL'",
"Update midwest2 set state_idMW='INDIANA' where midwest2.state='IN'",
"Update midwest2 set state_idMW='MICHIGAN' where midwest2.state='MI'",
"Update midwest2 set state_idMW='OHIO' where midwest2.state='OH'",
"Update midwest2 set state_idMW='WISCONSIN' where midwest2.state='WI'",
"Select * from midwest2"))
setwd(wd.init)
# ui.R files for front end of shinyApp
library(shiny)
wd.datapath = paste0(getwd(),"/data")
wd.init = getwd()
setwd(wd.datapath)
midwest2 = read.csv("midwest.csv", header = TRUE)
setwd(wd.init)
#df.shiny = read.csv("C:/Users/sanjivek/Desktop/shine/data/midwest.csv")
#print(midwest2)
# Define UI for application that draws a histogram
shinyUI(fluidPage(
# Application title
titlePanel("Midwest State Demography Distribution"),
# Sidebar with a slider input for the number of bins
sidebarLayout(
sidebarPanel(
selectInput(inputId = "Stat",
label = h4("Select Midwest State for Demographic details"),
choices = list("ILLINOIS", "MICHIGAN",
"INDIANA", "WISCONSIN","OHIO"),
selected = "ILLINOIS")
),
# Show a plot of the generated distribution
mainPanel(
#tabsetPanel(tabPanel("Main",plotOutput("distPlot", height = 1000, width = 1000))
plotOutput("distPlot")
))
)
)
# Define server logic required to draw a histogram
shinyServer(function(input, output) {
output$distPlot <- renderPlot({
popQuery <- sqldf(paste0( "select sum(popwhite),sum(popblack),sum(popamerindian), sum(popasian), sum(popother),state_idMW from midwestNew
where state_idMW ='", input$Stat, "';" ) )
print(popQuery)
popQuery2<-sqldf(paste0( "select percpovertyknown,percollege,state_idMW from midwestNew
where state_idMW ='", input$Stat, "';" ) )
popQuery3 <- sqldf(paste0( "select avg(percwhite),avg(percblack),avg(percamerindan), avg(percasian),
avg(percother),state_idMW from midwestNew
where state_idMW ='", input$Stat, "';" ) )
m <- melt(popQuery,id.vars="state_idMW", variable.name = "Race", value.name = "Population_by_Race")
print(m)
n <- melt(popQuery3,id.vars="state_idMW", variable.name = "Race", value.name = "Percentage_by_Population")
print(n)
w<-ggplot(data = m, aes(x=state_idMW ,y=Population_by_Race,fill=Race)) + geom_histogram(stat="identity",position="dodge")
p <- ggplot(data = popQuery2, aes(y = percbelowpoverty, x = percollege)) + geom_point((aes(color = state_idMW))) + ggtitle("College Education Vs Total Poverty") +
xlab("Percent College Educated") + ylab("Percentage of Total poverty")
z <- ggplot(data = n, aes(x = "", y = Percentage_by_Population, fill = Race)) +
geom_bar(stat = "identity", color = 'black') + coord_polar(theta="y") +
guides(fill=guide_legend(override.aes=list(colour=NA)))
pushViewport(viewport(layout = grid.layout(3, 1),width=0.75,height=1))
print(w, vp = viewport(layout.pos.row = 1, layout.pos.col = 1 ))
print(p, vp = viewport(layout.pos.row = 2, layout.pos.col = 1))
print(z, vp = viewport(layout.pos.row = 3, layout.pos.col = 1))
})
})