Bar Graph, Cluster Analysis and Map

Objectives

The objective of this project is to examine and visualize the Asian immigrant population in the United States and to identify which states are similar to each other in terms of their Asian immigrant populations.

We utilized the data set of Asian American population of 2017 retrieved from Census.gov.

Let us visualize the data using a bar graph:

if (!require (tidyverse)) {install.packages ("tidyverse")}
if (!require (patchwork)) {install.packages ("patchwork")}
if (!require (scales)) {install.packages ("scales")}

# For dendrogram visualization
if (!require (ggdendro)) {install.packages ("ggdendro")}

library(tidyverse)  # For data manipulation
library(readr)
library(dplyr)
library(ggplot2)
library(readxl)
library(readr)
library (ggdendro)  # For dendrogram visualization
library(patchwork)  # To join graphs 
library(scales)
library(plotly)

# 2017 Census data on Asian immigrants
CensusData2017 <- read_csv(file = "CensusData2017.csv")

# Lookup table of short state names; used to label the figures
StateShort <- read_excel(path = "StateShort.xlsx")
# View(StateShort)

# 2017 total population of each state. The first line of the file is skipped
# and the identifier / margin-of-error columns are dropped while reading.
TotalPop <- read_csv(
  file = "TotalPop.csv",
  skip = 1,
  col_types = cols(
    GEO.id = col_skip(),
    GEO.id2 = col_skip(),
    Id = col_skip(),
    Id2 = col_skip(),
    `Margin of Error; Total` = col_skip(),
    X1 = col_skip()
  )
)

Adding all the Asian Immigrant population together

# Sum every count column (all columns except State) into TotalAsian, one
# total per row. rowSums() is vectorised and replaces the superseded
# rowwise() %>% do() idiom, which built a one-row data frame per state.
Total <- CensusData2017 %>%
  select(-State) %>%
  # `.` is the piped data, i.e. only the count columns selected above
  mutate(TotalAsian = rowSums(.)) %>%
  # Re-attach the state names; cbind() names the new column after the
  # expression it receives, i.e. "CensusData2017$State"
  cbind(CensusData2017$State)

Renaming the column and preparing the data for the analysis

# The state column still carries the name cbind() generated for it;
# give it a plain name so it can serve as a join key.
Total <- rename(Total, State = "CensusData2017$State")

# Combine total population, Asian immigrant totals and state abbreviations
# (both joins are keyed on the state name), then express the Asian immigrant
# total as a percentage.
# NOTE(review): `Total` inside mutate() presumably resolves to a population
# column coming from TotalPop rather than to the `Total` data frame - confirm.
Total1 <- TotalPop %>%
  inner_join(Total, by = c("Geography" = "State")) %>%
  inner_join(StateShort, by = c("Geography" = "State")) %>%
  mutate(percent = TotalAsian / Total * 100)

## Warning: Column `Geography`/`State` joining character vector and factor,
## coercing into character vector

Visualizing data

# Bar chart of the Asian immigrant percentage per state, bars ordered from
# the highest to the lowest percentage. Fill is mapped per state but the
# legend is hidden.
p <- ggplot(
  data = Total1,
  mapping = aes(
    x = reorder(Abbreviation, -percent),
    y = percent,
    fill = reorder(Geography, -percent)
  )
) +
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = seq(0, 35, by = 5)) +
  labs(x = "States", y = " Percent of Asian Immigrants ") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_text(face = "italic", size = 8),
    # Size 8 plus the bold / colour / angle settings: the combined result of
    # the successive axis.text.x settings
    axis.text.x = element_text(
      face = "bold", size = 8, color = "#993333", angle = 45
    )
  )

# Interactive version of the chart
ggplotly(p)

Figure: Asian Immigrant Population Percentage in 50 States and Puerto Rico in 2017

# Bar chart of the total Asian immigrant count per state, bars ordered from
# the largest to the smallest total. Fill is still ordered by percentage.
t <- ggplot(
  data = Total1,
  mapping = aes(
    x = reorder(Abbreviation, -TotalAsian),
    y = TotalAsian,
    fill = reorder(Geography, -percent)
  )
) +
  geom_bar(stat = "identity") +
  # Ticks every million, printed with thousands separators
  scale_y_continuous(breaks = seq(0, 6000000, by = 1000000), labels = comma) +
  labs(x = "States", y = " Total immigrant ") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_text(face = "italic", size = 8),
    # Size 8 plus the bold / colour / angle settings: the combined result of
    # the successive axis.text.x settings
    axis.text.x = element_text(
      face = "bold", size = 8, color = "#993333", angle = 45
    )
  )
# print (t)

# Interactive version of the chart
ggplotly(t)

Figure: Total Asian Immigrant Population in 50 States and Puerto Rico in 2017

California has the highest total number of Asian immigrants. However, when the Asian immigrant total is divided by the total population of the state, Hawaii has the highest percentage of Asian immigrants.

Now, let’s switch to cluster analysis.

# Clustering input: a single `percent` column, with the state names moved
# from the Geography column into the row names
h.clust <- column_to_rownames(
  select(Total1, Geography, percent),
  var = "Geography"
)

# Inspect the structure of the clustering input
str(h.clust)

## 'data.frame':    52 obs. of  1 variable:
##  $ percent: num  1.27 6.02 3.02 1.38 13.65 ...

# Check whether any value is missing (NA). is.null() only tests whether the
# object itself is NULL, which is never the case for a data frame.
anyNA(h.clust)

## [1] FALSE

# Complete-linkage hierarchical clustering on the pairwise distances between
# the states' percentages
h.clust1 <- hclust(d = dist(h.clust), method = "complete")

# ggplot2 rendering of the dendrogram. labs(title = ...) is applied after
# ggtitle(), so the title that ends up on the plot is "Dendrogram".
ggdendrogram(h.clust1) +
  ggtitle(" Dendrogram of States with Asian Population") +
  labs(title = "Dendrogram", x = " States", y = "Euclidean Distance")

# Base-graphics rendering of the same tree
plot(
  h.clust1,
  main = "Cluster dendrogram of Asian population of States - 2017 Census",
  xlab = " States",
  ylab = "Euclidean Distance",
  hang = -1,
  ps = 10
)

# Outline the clusters obtained by cutting the tree at height 2.
# NOTE(review): the groups used later come from cutree(k = 7); confirm that
# a cut at h = 2 is meant to show the same grouping.
rect.hclust(h.clust1, h = 2)

# Cut the tree into 7 clusters; the result is a cluster id per row name
# (state) of the clustering input
sub_grp <- cutree(tree = h.clust1, k = 7)

# Two-column table (State, sub_grp), with the state taken from the row names
groups <- rownames_to_column(as.data.frame(sub_grp), var = "State")

Now, let’s make a map

if (! require (rgdal)) {install.packages ("rgdal")}
if (! require (geojsonio)) {install.packages ("geojsonio")}
if (! require (RColorBrewer)) {install.packages ("RColorBrewer")}
if (! require (broom)) {install.packages ("broom")}
if (! require (rgeos)) {install.packages ("rgeos")}
if (! require (mapproj)) {install.packages ("mapproj")}

library (rgdal)
library (geojsonio)
library (broom)
library (RColorBrewer)
library (mapproj)

The advantages of a hexmap are as follows: 1. It gives every constituency (or state) the same visual weight. 2. It eliminates discrepancies in US state sizes.

The disadvantages are: 1. It is not appropriate for statistical analysis. 2. Size is significantly distorted.

# Read the hexagon grid of US states as an sp object
map1 <- geojson_read("us_states_hexgrid.geojson", what = "sp")
plot(map1) # Quick visual check that the file is what we need

# Strip the " (United States)" suffix so that google_name holds the plain
# state name. The original ran this statement twice; the second run had
# nothing left to replace, so it is done once here.
map1@data <- map1@data %>%
  mutate(google_name = gsub(" \\(United States\\)", "", google_name))

# Flatten the polygons into a data frame of vertices, using the cleaned
# state name as the region id
map1_fortified <- tidy(map1, region = "google_name")
head(map1_fortified)

## # A tibble: 6 x 7
##    long   lat order hole  piece group     id     
##   <dbl> <dbl> <int> <lgl> <fct> <chr>     <chr>  
## 1 -94.4  35.8     1 FALSE 1     Alabama.1 Alabama
## 2 -91.7  34.5     2 FALSE 1     Alabama.1 Alabama
## 3 -91.7  31.9     3 FALSE 1     Alabama.1 Alabama
## 4 -94.4  30.5     4 FALSE 1     Alabama.1 Alabama
## 5 -97.1  31.9     5 FALSE 1     Alabama.1 Alabama
## 6 -97.1  34.5     6 FALSE 1     Alabama.1 Alabama

library(rgeos)

# Centroid of each polygon, paired with the `iso3166_2` value of the map
# data; used to place the text labels on the map
centers <- data.frame(
  gCentroid(map1, byid = TRUE),
  id = map1@data$iso3166_2
)

# Attach the cluster id of each state to its polygon vertices; states with
# no match in `groups` are dropped by the inner join
groups1 <- inner_join(map1_fortified, groups, by = c("id" = "State"))

# Hexagon map of the states, filled by cluster membership and labelled at
# each polygon centroid
a <- ggplot() +
  geom_polygon(
    data = groups1,
    mapping = aes(
      x = long, y = lat,
      fill = as.factor(sub_grp), group = id
    ),
    color = "white"
  ) +
  geom_text(data = centers, mapping = aes(x = x, y = y, label = id)) +
  coord_map() +
  theme_bw() +
  # Light background, and no axis titles, text or ticks on either axis
  theme(
    panel.background = element_rect(fill = "#f5f5f2", color = NA),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  ggtitle("Map of the United States in Hexbin") +
  labs(
    fill = "Clusters/Groups",
    caption = "Map created by \n Rajesh Sigdel"
  )

print(a)

Bar Graph, Cluster Analysis and Map

Rajesh Sigdel

5/6/2020

Objectives