library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Read the ACS 2017 census-tract file by its full path instead of calling
# setwd(), so the script does not change the session's working directory.
data_dir <- "C:/Users/User/Documents/Data_Science/Datasets"
us_census <- read_csv(file.path(data_dir, "acs2017_census_tract_data.csv"))
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   State = col_character(),
##   County = col_character()
## )
## i Use `spec()` for the full column specifications.

Look at the data

# Preview the first six rows to confirm the file parsed as expected.
us_census %>%
  head(n = 6)
## # A tibble: 6 x 37
##    TractId State  County  TotalPop   Men Women Hispanic White Black Native Asian
##      <dbl> <chr>  <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1   1.00e9 Alaba~ Autaug~     1845   899   946      2.4  86.3   5.2    0     1.2
## 2   1.00e9 Alaba~ Autaug~     2172  1167  1005      1.1  41.6  54.5    0     1  
## 3   1.00e9 Alaba~ Autaug~     3385  1533  1852      8    61.4  26.5    0.6   0.7
## 4   1.00e9 Alaba~ Autaug~     4267  2001  2266      9.6  80.3   7.1    0.5   0.2
## 5   1.00e9 Alaba~ Autaug~     9965  5054  4911      0.9  77.5  16.4    0     3.1
## 6   1.00e9 Alaba~ Autaug~     3620  1765  1855      3    70.7  25.1    0     0  
## # ... with 26 more variables: Pacific <dbl>, VotingAgeCitizen <dbl>,
## #   Income <dbl>, IncomeErr <dbl>, IncomePerCap <dbl>, IncomePerCapErr <dbl>,
## #   Poverty <dbl>, ChildPoverty <dbl>, Professional <dbl>, Service <dbl>,
## #   Office <dbl>, Construction <dbl>, Production <dbl>, Drive <dbl>,
## #   Carpool <dbl>, Transit <dbl>, Walk <dbl>, OtherTransp <dbl>,
## #   WorkAtHome <dbl>, MeanCommute <dbl>, Employed <dbl>, PrivateWork <dbl>,
## #   PublicWork <dbl>, SelfEmployed <dbl>, FamilyWork <dbl>, Unemployment <dbl>
# Column-by-column summary: quartiles for numeric columns plus NA counts.
us_census %>%
  summary()
##     TractId             State              County             TotalPop    
##  Min.   :1.001e+09   Length:74001       Length:74001       Min.   :    0  
##  1st Qu.:1.304e+10   Class :character   Class :character   1st Qu.: 2903  
##  Median :2.805e+10   Mode  :character   Mode  :character   Median : 4105  
##  Mean   :2.839e+10                                         Mean   : 4385  
##  3rd Qu.:4.200e+10                                         3rd Qu.: 5506  
##  Max.   :7.215e+10                                         Max.   :65528  
##                                                                           
##       Men            Women          Hispanic          White       
##  Min.   :    0   Min.   :    0   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 1416   1st Qu.: 1465   1st Qu.:  2.60   1st Qu.: 38.00  
##  Median : 2007   Median : 2082   Median :  7.40   Median : 70.40  
##  Mean   : 2158   Mean   : 2227   Mean   : 17.27   Mean   : 61.31  
##  3rd Qu.: 2707   3rd Qu.: 2803   3rd Qu.: 21.10   3rd Qu.: 87.70  
##  Max.   :32266   Max.   :33262   Max.   :100.00   Max.   :100.00  
##                                  NA's   :696      NA's   :696     
##      Black            Native            Asian            Pacific       
##  Min.   :  0.00   Min.   :  0.000   Min.   :  0.000   Min.   : 0.0000  
##  1st Qu.:  0.80   1st Qu.:  0.000   1st Qu.:  0.200   1st Qu.: 0.0000  
##  Median :  3.80   Median :  0.000   Median :  1.500   Median : 0.0000  
##  Mean   : 13.29   Mean   :  0.734   Mean   :  4.754   Mean   : 0.1473  
##  3rd Qu.: 14.60   3rd Qu.:  0.400   3rd Qu.:  5.000   3rd Qu.: 0.0000  
##  Max.   :100.00   Max.   :100.000   Max.   :100.000   Max.   :71.9000  
##  NA's   :696      NA's   :696       NA's   :696       NA's   :696      
##  VotingAgeCitizen     Income         IncomeErr       IncomePerCap   
##  Min.   :    0    Min.   :  2692   Min.   :   728   Min.   :    32  
##  1st Qu.: 2061    1st Qu.: 40357   1st Qu.:  5735   1st Qu.: 20557  
##  Median : 2905    Median : 54375   Median :  8267   Median : 27216  
##  Mean   : 3103    Mean   : 61087   Mean   :  9697   Mean   : 30652  
##  3rd Qu.: 3906    3rd Qu.: 74659   3rd Qu.: 11913   3rd Qu.: 36408  
##  Max.   :39389    Max.   :249750   Max.   :153365   Max.   :220253  
##                   NA's   :1116     NA's   :1116     NA's   :745     
##  IncomePerCapErr     Poverty        ChildPoverty     Professional   
##  Min.   :    20   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  2503   1st Qu.:  6.90   1st Qu.:  6.20   1st Qu.: 24.70  
##  Median :  3404   Median : 12.60   Median : 16.30   Median : 33.30  
##  Mean   :  4270   Mean   : 16.12   Mean   : 21.16   Mean   : 35.56  
##  3rd Qu.:  4968   3rd Qu.: 21.80   3rd Qu.: 31.60   3rd Qu.: 44.90  
##  Max.   :134017   Max.   :100.00   Max.   :100.00   Max.   :100.00  
##  NA's   :745      NA's   :842      NA's   :1110     NA's   :811     
##     Service           Office        Construction       Production   
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Min.   :  0.0  
##  1st Qu.: 13.30   1st Qu.: 19.70   1st Qu.:  5.000   1st Qu.:  7.1  
##  Median : 17.70   Median : 23.20   Median :  8.300   Median : 11.8  
##  Mean   : 18.87   Mean   : 23.43   Mean   :  9.239   Mean   : 12.9  
##  3rd Qu.: 23.30   3rd Qu.: 26.90   3rd Qu.: 12.500   3rd Qu.: 17.5  
##  Max.   :100.00   Max.   :100.00   Max.   :100.000   Max.   :100.0  
##  NA's   :811      NA's   :811      NA's   :811       NA's   :811    
##      Drive          Carpool           Transit             Walk        
##  Min.   :  0.0   Min.   :  0.000   Min.   :  0.000   Min.   :  0.000  
##  1st Qu.: 72.2   1st Qu.:  5.700   1st Qu.:  0.000   1st Qu.:  0.400  
##  Median : 79.9   Median :  8.500   Median :  1.000   Median :  1.400  
##  Mean   : 75.7   Mean   :  9.308   Mean   :  5.395   Mean   :  3.043  
##  3rd Qu.: 84.9   3rd Qu.: 11.900   3rd Qu.:  4.600   3rd Qu.:  3.300  
##  Max.   :100.0   Max.   :100.000   Max.   :100.000   Max.   :100.000  
##  NA's   :801     NA's   :801       NA's   :801       NA's   :801      
##   OtherTransp        WorkAtHome       MeanCommute       Employed    
##  Min.   :  0.000   Min.   :  0.000   Min.   : 1.00   Min.   :    0  
##  1st Qu.:  0.400   1st Qu.:  2.000   1st Qu.:21.10   1st Qu.: 1276  
##  Median :  1.200   Median :  3.800   Median :25.40   Median : 1895  
##  Mean   :  1.895   Mean   :  4.662   Mean   :26.06   Mean   : 2049  
##  3rd Qu.:  2.500   3rd Qu.:  6.300   3rd Qu.:30.30   3rd Qu.: 2635  
##  Max.   :100.000   Max.   :100.000   Max.   :73.90   Max.   :28945  
##  NA's   :801       NA's   :801       NA's   :946                    
##   PrivateWork       PublicWork      SelfEmployed       FamilyWork     
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.000   Min.   : 0.0000  
##  1st Qu.: 75.20   1st Qu.:  9.30   1st Qu.:  3.500   1st Qu.: 0.0000  
##  Median : 80.60   Median : 13.00   Median :  5.500   Median : 0.0000  
##  Mean   : 79.49   Mean   : 14.16   Mean   :  6.171   Mean   : 0.1712  
##  3rd Qu.: 85.00   3rd Qu.: 17.60   3rd Qu.:  8.000   3rd Qu.: 0.0000  
##  Max.   :100.00   Max.   :100.00   Max.   :100.000   Max.   :22.3000  
##  NA's   :811      NA's   :811      NA's   :811       NA's   :811      
##   Unemployment    
##  Min.   :  0.000  
##  1st Qu.:  3.900  
##  Median :  6.000  
##  Mean   :  7.247  
##  3rd Qu.:  9.000  
##  Max.   :100.000  
##  NA's   :810

Lowercase all variable names

# Replace every column name with its lowercase form.
us_census <- setNames(us_census, tolower(names(us_census)))

Review the change

# Print the first six rows again to confirm the column names are now lowercase.
head(x = us_census, n = 6)
## # A tibble: 6 x 37
##    tractid state  county  totalpop   men women hispanic white black native asian
##      <dbl> <chr>  <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1   1.00e9 Alaba~ Autaug~     1845   899   946      2.4  86.3   5.2    0     1.2
## 2   1.00e9 Alaba~ Autaug~     2172  1167  1005      1.1  41.6  54.5    0     1  
## 3   1.00e9 Alaba~ Autaug~     3385  1533  1852      8    61.4  26.5    0.6   0.7
## 4   1.00e9 Alaba~ Autaug~     4267  2001  2266      9.6  80.3   7.1    0.5   0.2
## 5   1.00e9 Alaba~ Autaug~     9965  5054  4911      0.9  77.5  16.4    0     3.1
## 6   1.00e9 Alaba~ Autaug~     3620  1765  1855      3    70.7  25.1    0     0  
## # ... with 26 more variables: pacific <dbl>, votingagecitizen <dbl>,
## #   income <dbl>, incomeerr <dbl>, incomepercap <dbl>, incomepercaperr <dbl>,
## #   poverty <dbl>, childpoverty <dbl>, professional <dbl>, service <dbl>,
## #   office <dbl>, construction <dbl>, production <dbl>, drive <dbl>,
## #   carpool <dbl>, transit <dbl>, walk <dbl>, othertransp <dbl>,
## #   workathome <dbl>, meancommute <dbl>, employed <dbl>, privatework <dbl>,
## #   publicwork <dbl>, selfemployed <dbl>, familywork <dbl>, unemployment <dbl>
# Keep only the tracts in Maryland, the District of Columbia, and Virginia,
# ordered from the highest poverty value to the lowest (NA values sort last).
# No group_by() here: grouping by totalpop and state produced 3,022 mostly
# single-row groups that no later step used, and left the result grouped.
poverty_by_state <- us_census %>%
  filter(state %in% c("Maryland", "District of Columbia", "Virginia")) %>%
  arrange(desc(poverty))
poverty_by_state
## # A tibble: 3,492 x 37
##     tractid state  county totalpop   men women hispanic white black native asian
##       <dbl> <chr>  <chr>     <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl> <dbl>
##  1  1.10e10 Distr~ Distr~     3682  1691  1991     11.8  63.9   7.4    0.2  13.4
##  2  2.40e10 Maryl~ Balti~     5004  2109  2895      6.8  63.6  20.8    0     5.3
##  3  5.10e10 Virgi~ Albem~     3483  1563  1920      5.6  64.6   7.6    0.1  16.2
##  4  2.40e10 Maryl~ Balti~     3059  1732  1327      6.1  58    24.3    0     7.1
##  5  5.17e10 Virgi~ Harri~     6325  2951  3374      8.1  75.8   8.4    0     4.4
##  6  5.17e10 Virgi~ Norfo~     2157   774  1383      3.8   5    86      0     0  
##  7  5.18e10 Virgi~ Richm~     2754   935  1819      0.9   2.4  94.6    0     0  
##  8  5.17e10 Virgi~ Norfo~     1279   468   811      5.5   0.4  89      0     0  
##  9  5.17e10 Virgi~ Norfo~     1691   642  1049      0     3.5  96.5    0     0  
## 10  5.18e10 Virgi~ Richm~     4559  1988  2571      1.4   2    95.8    0     0  
## # ... with 3,482 more rows, and 26 more variables: pacific <dbl>,
## #   votingagecitizen <dbl>, income <dbl>, incomeerr <dbl>, incomepercap <dbl>,
## #   incomepercaperr <dbl>, poverty <dbl>, childpoverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   othertransp <dbl>, workathome <dbl>, meancommute <dbl>, employed <dbl>,
## #   privatework <dbl>, publicwork <dbl>, selfemployed <dbl>, familywork <dbl>,
## #   unemployment <dbl>

Call in more libraries

library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Create the visual

# Scatterplot of poverty against population, with one colour per state.
poverty_by_state_plot <- ggplot(
  data = poverty_by_state,
  mapping = aes(x = totalpop, y = poverty, group = state, colour = state)
) +
  geom_point() +
  labs(
    title = "Scatterplot of poverty level by population",
    x = "Population",
    y = "Poverty"
  )
poverty_by_state_plot
## Warning: Removed 55 rows containing missing values (geom_point).

# Boxplot of poverty for each of the three states.
# state is mapped to x so every box sits at its own labelled axis position;
# with only y/group/fill mapped, the boxes are drawn on a continuous x axis
# whose numeric ticks carry no meaning.
# NOTE: this reuses the name poverty_by_state_plot, replacing the scatterplot
# object assigned earlier in the script.
poverty_by_state_plot <- ggplot(
  data = poverty_by_state,
  mapping = aes(x = state, y = poverty, fill = state)
) +
  geom_boxplot() +
  labs(
    title = "Boxplot of poverty level by state",
    x = "State",
    y = "Poverty"
  )
poverty_by_state_plot
## Warning: Removed 55 rows containing non-finite values (stat_boxplot).

The US census data come from Kaggle. The dataset contains 2017 census-tract information from the American Community Survey (ACS), which the US Census Bureau conducts every year (the full census is taken once every ten years), and it can be used to gain insight into the poverty level or population of a county. The US Census Bureau was founded in 1902, but the first census was taken in 1790. The census started as a population count, but as time progressed there was a need for demographic and economic information. The variables in this dataset include state names, poverty level, population, and gender. I cleaned the dataset by converting all of the variable names to lowercase, then kept only the tracts for Maryland, the District of Columbia, and Virginia and sorted them by poverty level so the data could be compared by state and population.

The visualizations show the poverty level for each state. From them we can see that the District of Columbia has the most extreme outlier and the highest level of poverty. One thing I also noticed from the scatterplot is that Virginia had the highest population. Maryland falls right under the District of Columbia for levels of poverty in this visual.

One thing I wish I could have shown was a streamgraph of poverty against population, which could show more of the population. Another thing I could have done was use more descriptive variable names, like population instead of totalpop. I could have also done more cleaning of the dataset. Overall I liked the graphs and thought the visualizations gave the data meaning.