Mid-west States in USA, the orginal dataset contains demographics and poverty data for 5 mid-west states in USA, In our analysis we are focusing on area, education, population,diversity and how it has impact on poverty level
   Fetching data from github using RCurl library
   Performed and captured the summary and statistics of mid west states using summary and Hmisc libraries
   Data profiling and its observation
   Subsetting, aggregation of data using sqldf library
   Modifying/Added the new attribute from lookup file (StateName)
   ggplot2 and reshape2 libraries used for data visualization
   corrplot for making correlation plot
2.Data Exploration
   Performing data summary and data profiling to understand the data
2.1 Getting data from github
library(RCurl)
## Loading required package: bitops
#Getting data from github repository
#midwest -> dataset with all data of midwest demographics population statistics
midwest <-read.csv(file = 'C:/Users/aisha/Dropbox/CUNY/CUNY_BridgeClass/midwest/midwest.csv', header = TRUE)
#states_lkp -> dataset having states and its abbrevation
states_lkp <-read.csv(file = 'C:/Users/aisha/Dropbox/CUNY/CUNY_BridgeClass/midwest/states_lkp.csv')
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
#### SQL join with lookup table to make a master dataset
#Performing left outer join to get the StateName from state_lkp.csv file
midwest_states <- sqldf("SELECT midwest.*,states_lkp.StateName from midwest
LEFT JOIN states_lkp on midwest.State=states_lkp.StateCode")
2.2 Data profiling using summary and hmist libraries
summary(midwest_states)
## X PID county state area
## Min. : 1 Min. : 561 CRAWFORD: 5 IL:102 Min. :0.00500
## 1st Qu.:110 1st Qu.: 670 JACKSON : 5 IN: 92 1st Qu.:0.02400
## Median :219 Median :1221 MONROE : 5 MI: 83 Median :0.03000
## Mean :219 Mean :1437 ADAMS : 4 OH: 88 Mean :0.03317
## 3rd Qu.:328 3rd Qu.:2059 BROWN : 4 WI: 72 3rd Qu.:0.03800
## Max. :437 Max. :3052 CLARK : 4 Max. :0.11000
## (Other) :410
## poptotal popdensity popwhite popblack
## Min. : 1701 Min. : 85.05 Min. : 416 Min. : 0
## 1st Qu.: 18840 1st Qu.: 622.41 1st Qu.: 18630 1st Qu.: 29
## Median : 35324 Median : 1156.21 Median : 34471 Median : 201
## Mean : 96130 Mean : 3097.74 Mean : 81840 Mean : 11024
## 3rd Qu.: 75651 3rd Qu.: 2330.00 3rd Qu.: 72968 3rd Qu.: 1291
## Max. :5105067 Max. :88018.40 Max. :3204947 Max. :1317147
##
## popamerindian popasian popother percwhite
## Min. : 4.0 Min. : 0 Min. : 0 Min. :10.69
## 1st Qu.: 44.0 1st Qu.: 35 1st Qu.: 20 1st Qu.:94.89
## Median : 94.0 Median : 102 Median : 66 Median :98.03
## Mean : 343.1 Mean : 1310 Mean : 1613 Mean :95.56
## 3rd Qu.: 288.0 3rd Qu.: 401 3rd Qu.: 345 3rd Qu.:99.07
## Max. :10289.0 Max. :188565 Max. :384119 Max. :99.82
##
## percblack percamerindan percasian percother
## Min. : 0.0000 Min. : 0.05623 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.1157 1st Qu.: 0.15793 1st Qu.:0.1737 1st Qu.:0.09102
## Median : 0.5390 Median : 0.21502 Median :0.2972 Median :0.17844
## Mean : 2.6763 Mean : 0.79894 Mean :0.4872 Mean :0.47906
## 3rd Qu.: 2.6014 3rd Qu.: 0.38362 3rd Qu.:0.5212 3rd Qu.:0.48050
## Max. :40.2100 Max. :89.17738 Max. :5.0705 Max. :7.52427
##
## popadults perchsd percollege percprof
## Min. : 1287 Min. :46.91 Min. : 7.336 Min. :-184.130
## 1st Qu.: 12271 1st Qu.:71.33 1st Qu.:14.114 1st Qu.: 2.998
## Median : 22188 Median :74.25 Median :16.798 Median : 3.814
## Mean : 60973 Mean :73.97 Mean :18.273 Mean : 4.020
## 3rd Qu.: 47541 3rd Qu.:77.20 3rd Qu.:20.550 3rd Qu.: 4.949
## Max. :3291995 Max. :88.90 Max. :48.079 Max. : 20.791
##
## poppovertyknown percpovertyknown percbelowpoverty percchildbelowpovert
## Min. : 1696 Min. :80.90 Min. : 2.180 Min. : 1.919
## 1st Qu.: 18364 1st Qu.:96.89 1st Qu.: 9.199 1st Qu.:11.624
## Median : 33788 Median :98.17 Median :11.822 Median :15.270
## Mean : 93642 Mean :97.11 Mean :12.511 Mean :16.447
## 3rd Qu.: 72840 3rd Qu.:98.60 3rd Qu.:15.133 3rd Qu.:20.352
## Max. :5023523 Max. :99.86 Max. :48.691 Max. :64.308
##
## percadultpoverty percelderlypoverty inmetro category
## Min. : 1.938 Min. : 3.547 Min. :0.0000 AAR :193
## 1st Qu.: 7.668 1st Qu.: 8.912 1st Qu.:0.0000 AAU : 77
## Median :10.008 Median :10.869 Median :0.0000 LAR : 30
## Mean :10.919 Mean :11.389 Mean :0.3432 LHR : 28
## 3rd Qu.:13.182 3rd Qu.:13.412 3rd Qu.:1.0000 HLU : 26
## Max. :43.312 Max. :31.162 Max. :1.0000 ALU : 20
## (Other): 63
## StateName
## Illinois :102
## Indiana : 92
## Ohio : 88
## Michigan : 83
## Wisconsin: 72
## Alabama : 0
## (Other) : 0
podensity_mean <- mean(midwest_states$popdensity) #Mean of population density
print(paste0("Mean of Population density ",podensity_mean))
## [1] "Mean of Population density 3097.74298489245"
podensity_median <- median(midwest_states$popdensity) #Median of population density
print(paste0("Median of Population density ",podensity_median))
## [1] "Median of Population density 1156.20833"
quartile_popdensity = midwest_states$inmetro
print("Quartile of Population Density")
## [1] "Quartile of Population Density"
quantile(quartile_popdensity)
## 0% 25% 50% 75% 100%
## 0 0 0 1 1
#####Describe the key attributes used in the analysis
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked _by_ '.GlobalEnv':
##
## midwest
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
Created a subset of data using few attributes with aggragated value based on States
Changed Column names for aggragated value
Reshaping data using reshape2 library to plot histogram
midwest_states_prof_std <- sqldf("SELECT StateName,avg(percollege) as College_Percentage,avg(percprof) as Professor_Percentage from midwest_states group by StateName")
#midwest_states_prof_std
library(reshape2)
reshape_prof_college <- melt(midwest_states_prof_std)
## Using StateName as id variables
#reshape_prof_college
library(ggplot2)
ggplot(reshape_prof_college, aes(x=StateName, y= value , fill=variable)) + geom_bar(stat="identity",position="dodge", colour="black", hjust = 0.5, vjust = 0.3) + scale_fill_discrete(name = "Legend") + labs( y = "Percentage", x = "Mid-west States") + ggtitle("Ratio of College and Professor in Mid west States") +
theme(plot.title = element_text(hjust = 0.5))