library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the ACS 2017 census-tract file. The path is built with file.path()
# rather than by calling setwd(), so reading the data does not change the
# session's working directory as a side effect.
data_dir <- "C:/Users/User/Documents/Data_Science/Datasets"
us_census <- read_csv(file.path(data_dir, "acs2017_census_tract_data.csv"))
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## State = col_character(),
## County = col_character()
## )
## i Use `spec()` for the full column specifications.
# Preview the first six rows of the freshly loaded table.
head(us_census, n = 6L)
## # A tibble: 6 x 37
## TractId State County TotalPop Men Women Hispanic White Black Native Asian
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.00e9 Alaba~ Autaug~ 1845 899 946 2.4 86.3 5.2 0 1.2
## 2 1.00e9 Alaba~ Autaug~ 2172 1167 1005 1.1 41.6 54.5 0 1
## 3 1.00e9 Alaba~ Autaug~ 3385 1533 1852 8 61.4 26.5 0.6 0.7
## 4 1.00e9 Alaba~ Autaug~ 4267 2001 2266 9.6 80.3 7.1 0.5 0.2
## 5 1.00e9 Alaba~ Autaug~ 9965 5054 4911 0.9 77.5 16.4 0 3.1
## 6 1.00e9 Alaba~ Autaug~ 3620 1765 1855 3 70.7 25.1 0 0
## # ... with 26 more variables: Pacific <dbl>, VotingAgeCitizen <dbl>,
## # Income <dbl>, IncomeErr <dbl>, IncomePerCap <dbl>, IncomePerCapErr <dbl>,
## # Poverty <dbl>, ChildPoverty <dbl>, Professional <dbl>, Service <dbl>,
## # Office <dbl>, Construction <dbl>, Production <dbl>, Drive <dbl>,
## # Carpool <dbl>, Transit <dbl>, Walk <dbl>, OtherTransp <dbl>,
## # WorkAtHome <dbl>, MeanCommute <dbl>, Employed <dbl>, PrivateWork <dbl>,
## # PublicWork <dbl>, SelfEmployed <dbl>, FamilyWork <dbl>, Unemployment <dbl>
# Per-column summary statistics (including NA counts) for the whole table.
us_census %>%
  summary()
## TractId State County TotalPop
## Min. :1.001e+09 Length:74001 Length:74001 Min. : 0
## 1st Qu.:1.304e+10 Class :character Class :character 1st Qu.: 2903
## Median :2.805e+10 Mode :character Mode :character Median : 4105
## Mean :2.839e+10 Mean : 4385
## 3rd Qu.:4.200e+10 3rd Qu.: 5506
## Max. :7.215e+10 Max. :65528
##
## Men Women Hispanic White
## Min. : 0 Min. : 0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1416 1st Qu.: 1465 1st Qu.: 2.60 1st Qu.: 38.00
## Median : 2007 Median : 2082 Median : 7.40 Median : 70.40
## Mean : 2158 Mean : 2227 Mean : 17.27 Mean : 61.31
## 3rd Qu.: 2707 3rd Qu.: 2803 3rd Qu.: 21.10 3rd Qu.: 87.70
## Max. :32266 Max. :33262 Max. :100.00 Max. :100.00
## NA's :696 NA's :696
## Black Native Asian Pacific
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.80 1st Qu.: 0.000 1st Qu.: 0.200 1st Qu.: 0.0000
## Median : 3.80 Median : 0.000 Median : 1.500 Median : 0.0000
## Mean : 13.29 Mean : 0.734 Mean : 4.754 Mean : 0.1473
## 3rd Qu.: 14.60 3rd Qu.: 0.400 3rd Qu.: 5.000 3rd Qu.: 0.0000
## Max. :100.00 Max. :100.000 Max. :100.000 Max. :71.9000
## NA's :696 NA's :696 NA's :696 NA's :696
## VotingAgeCitizen Income IncomeErr IncomePerCap
## Min. : 0 Min. : 2692 Min. : 728 Min. : 32
## 1st Qu.: 2061 1st Qu.: 40357 1st Qu.: 5735 1st Qu.: 20557
## Median : 2905 Median : 54375 Median : 8267 Median : 27216
## Mean : 3103 Mean : 61087 Mean : 9697 Mean : 30652
## 3rd Qu.: 3906 3rd Qu.: 74659 3rd Qu.: 11913 3rd Qu.: 36408
## Max. :39389 Max. :249750 Max. :153365 Max. :220253
## NA's :1116 NA's :1116 NA's :745
## IncomePerCapErr Poverty ChildPoverty Professional
## Min. : 20 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 2503 1st Qu.: 6.90 1st Qu.: 6.20 1st Qu.: 24.70
## Median : 3404 Median : 12.60 Median : 16.30 Median : 33.30
## Mean : 4270 Mean : 16.12 Mean : 21.16 Mean : 35.56
## 3rd Qu.: 4968 3rd Qu.: 21.80 3rd Qu.: 31.60 3rd Qu.: 44.90
## Max. :134017 Max. :100.00 Max. :100.00 Max. :100.00
## NA's :745 NA's :842 NA's :1110 NA's :811
## Service Office Construction Production
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.0
## 1st Qu.: 13.30 1st Qu.: 19.70 1st Qu.: 5.000 1st Qu.: 7.1
## Median : 17.70 Median : 23.20 Median : 8.300 Median : 11.8
## Mean : 18.87 Mean : 23.43 Mean : 9.239 Mean : 12.9
## 3rd Qu.: 23.30 3rd Qu.: 26.90 3rd Qu.: 12.500 3rd Qu.: 17.5
## Max. :100.00 Max. :100.00 Max. :100.000 Max. :100.0
## NA's :811 NA's :811 NA's :811 NA's :811
## Drive Carpool Transit Walk
## Min. : 0.0 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 72.2 1st Qu.: 5.700 1st Qu.: 0.000 1st Qu.: 0.400
## Median : 79.9 Median : 8.500 Median : 1.000 Median : 1.400
## Mean : 75.7 Mean : 9.308 Mean : 5.395 Mean : 3.043
## 3rd Qu.: 84.9 3rd Qu.: 11.900 3rd Qu.: 4.600 3rd Qu.: 3.300
## Max. :100.0 Max. :100.000 Max. :100.000 Max. :100.000
## NA's :801 NA's :801 NA's :801 NA's :801
## OtherTransp WorkAtHome MeanCommute Employed
## Min. : 0.000 Min. : 0.000 Min. : 1.00 Min. : 0
## 1st Qu.: 0.400 1st Qu.: 2.000 1st Qu.:21.10 1st Qu.: 1276
## Median : 1.200 Median : 3.800 Median :25.40 Median : 1895
## Mean : 1.895 Mean : 4.662 Mean :26.06 Mean : 2049
## 3rd Qu.: 2.500 3rd Qu.: 6.300 3rd Qu.:30.30 3rd Qu.: 2635
## Max. :100.000 Max. :100.000 Max. :73.90 Max. :28945
## NA's :801 NA's :801 NA's :946
## PrivateWork PublicWork SelfEmployed FamilyWork
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 75.20 1st Qu.: 9.30 1st Qu.: 3.500 1st Qu.: 0.0000
## Median : 80.60 Median : 13.00 Median : 5.500 Median : 0.0000
## Mean : 79.49 Mean : 14.16 Mean : 6.171 Mean : 0.1712
## 3rd Qu.: 85.00 3rd Qu.: 17.60 3rd Qu.: 8.000 3rd Qu.: 0.0000
## Max. :100.00 Max. :100.00 Max. :100.000 Max. :22.3000
## NA's :811 NA's :811 NA's :811 NA's :811
## Unemployment
## Min. : 0.000
## 1st Qu.: 3.900
## Median : 6.000
## Mean : 7.247
## 3rd Qu.: 9.000
## Max. :100.000
## NA's :810
# Convert every column name to lowercase, then preview the renamed table.
us_census <- setNames(us_census, tolower(names(us_census)))
head(us_census, n = 6L)
## # A tibble: 6 x 37
## tractid state county totalpop men women hispanic white black native asian
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.00e9 Alaba~ Autaug~ 1845 899 946 2.4 86.3 5.2 0 1.2
## 2 1.00e9 Alaba~ Autaug~ 2172 1167 1005 1.1 41.6 54.5 0 1
## 3 1.00e9 Alaba~ Autaug~ 3385 1533 1852 8 61.4 26.5 0.6 0.7
## 4 1.00e9 Alaba~ Autaug~ 4267 2001 2266 9.6 80.3 7.1 0.5 0.2
## 5 1.00e9 Alaba~ Autaug~ 9965 5054 4911 0.9 77.5 16.4 0 3.1
## 6 1.00e9 Alaba~ Autaug~ 3620 1765 1855 3 70.7 25.1 0 0
## # ... with 26 more variables: pacific <dbl>, votingagecitizen <dbl>,
## # income <dbl>, incomeerr <dbl>, incomepercap <dbl>, incomepercaperr <dbl>,
## # poverty <dbl>, childpoverty <dbl>, professional <dbl>, service <dbl>,
## # office <dbl>, construction <dbl>, production <dbl>, drive <dbl>,
## # carpool <dbl>, transit <dbl>, walk <dbl>, othertransp <dbl>,
## # workathome <dbl>, meancommute <dbl>, employed <dbl>, privatework <dbl>,
## # publicwork <dbl>, selfemployed <dbl>, familywork <dbl>, unemployment <dbl>
# Census tracts for Maryland, the District of Columbia and Virginia, ordered
# from the highest to the lowest poverty value.
#
# No group_by() is used: nothing is summarised per group, arrange() ignores
# grouping unless .by_group = TRUE, and grouping on the continuous totalpop
# column only produced thousands of near-unique groups while leaving the
# result grouped for every later step.
poverty_by_state <- us_census %>%
  filter(state %in% c("Maryland", "District of Columbia", "Virginia")) %>%
  arrange(desc(poverty))
poverty_by_state
## # A tibble: 3,492 x 37
## tractid state county totalpop men women hispanic white black native asian
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.10e10 Distr~ Distr~ 3682 1691 1991 11.8 63.9 7.4 0.2 13.4
## 2 2.40e10 Maryl~ Balti~ 5004 2109 2895 6.8 63.6 20.8 0 5.3
## 3 5.10e10 Virgi~ Albem~ 3483 1563 1920 5.6 64.6 7.6 0.1 16.2
## 4 2.40e10 Maryl~ Balti~ 3059 1732 1327 6.1 58 24.3 0 7.1
## 5 5.17e10 Virgi~ Harri~ 6325 2951 3374 8.1 75.8 8.4 0 4.4
## 6 5.17e10 Virgi~ Norfo~ 2157 774 1383 3.8 5 86 0 0
## 7 5.18e10 Virgi~ Richm~ 2754 935 1819 0.9 2.4 94.6 0 0
## 8 5.17e10 Virgi~ Norfo~ 1279 468 811 5.5 0.4 89 0 0
## 9 5.17e10 Virgi~ Norfo~ 1691 642 1049 0 3.5 96.5 0 0
## 10 5.18e10 Virgi~ Richm~ 4559 1988 2571 1.4 2 95.8 0 0
## # ... with 3,482 more rows, and 26 more variables: pacific <dbl>,
## # votingagecitizen <dbl>, income <dbl>, incomeerr <dbl>, incomepercap <dbl>,
## # incomepercaperr <dbl>, poverty <dbl>, childpoverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # othertransp <dbl>, workathome <dbl>, meancommute <dbl>, employed <dbl>,
## # privatework <dbl>, publicwork <dbl>, selfemployed <dbl>, familywork <dbl>,
## # unemployment <dbl>
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Scatterplot with totalpop on the x axis and poverty on the y axis; points
# are grouped and coloured by state.
poverty_by_state_plot <- ggplot(
  poverty_by_state,
  aes(x = totalpop, y = poverty, group = state, color = state)
) +
  geom_point() +
  labs(
    x = "Population",
    y = "Poverty",
    title = "Scatterplot of poverty level by population"
  )
poverty_by_state_plot
## Warning: Removed 55 rows containing missing values (geom_point).
# Boxplot of poverty for each state.
#
# state is mapped to x so every box sits on its own labelled category.
# Without an x aesthetic the boxes are only dodged around one position on a
# continuous axis whose numeric ticks carry no meaning, and the states can be
# told apart only through the fill legend.
# Note: this reassigns poverty_by_state_plot, replacing the scatterplot object.
poverty_by_state_plot <- poverty_by_state %>%
  ggplot(aes(x = state, y = poverty, fill = state)) +
  geom_boxplot() +
  labs(x = "State", y = "Poverty") +
  ggtitle("Boxplot of poverty level by state")
poverty_by_state_plot
## Warning: Removed 55 rows containing non-finite values (stat_boxplot).
The US census data comes from Kaggle. The dataset contains 2017 US census information from the American Community Survey (ACS), which the US Census Bureau conducts every year, and it can be used to gain insight into the poverty level or population of a county. The US Census Bureau was founded in 1902, but the first census was taken in 1790. The census started as a population count, but as time progressed there was a need for demographic and economic information. The variables in this dataset include state names, poverty level, population, and gender. I cleaned the dataset by converting all capitalized variable names to lowercase and organized the data by state, population, and poverty level.
The visualization shows the poverty level for each state. From this visualization we can see that the District of Columbia has the most extreme outlier and the highest level of poverty. I also noticed from the scatterplot that Virginia had the highest population. Maryland falls just below the District of Columbia in poverty level in this visual.
One thing I wish I could have shown is a streamgraph of poverty against population, which could have shown more of the population. I also could have used more descriptive variable names, like population instead of totalpop, and done more cleaning of the dataset. Overall, I liked the graphs and thought the visualizations gave the data meaning.