The objective of this project is to examine and visualize the Asian immigrant population in the United States and to determine which states are similar to each other in terms of their Asian immigrant populations.
We utilized the data set of Asian American population of 2017 retrieved from Census.gov.
Let us visualize the data using a bar graph:
if (!require (tidyverse)) {install.packages ("tidyverse")}
if (!require (patchwork)) {install.packages ("patchwork")}
if (!require (scales)) {install.packages ("scales")}
# For dendrogram visualization
if (!require (ggdendro)) {install.packages ("ggdendro")}
library(tidyverse) # For data manipulation
library(readr)
library(dplyr)
library(ggplot2)
library(readxl)
library(readr)
library (ggdendro) # For dendrogram visualization
library(patchwork) # To join graphs
library(scales)
library(plotly)
# Asian immigrant data from the 2017 Census, read from a local CSV file
CensusData2017 <- read_csv("CensusData2017.csv")

# State abbreviations; used further down to label the figures
StateShort <- read_excel("StateShort.xlsx")
# View(StateShort)

# Total population of each state in 2017. The first line of the file is
# skipped and the identifier / margin-of-error columns listed below are
# dropped while reading.
TotalPop <- read_csv(
  "TotalPop.csv",
  skip = 1,
  col_types = cols(
    GEO.id = col_skip(),
    GEO.id2 = col_skip(),
    Id = col_skip(),
    Id2 = col_skip(),
    `Margin of Error; Total` = col_skip(),
    X1 = col_skip()
  )
)
Summing the Asian immigrant population counts across all columns for each state
# Per-state total of every count column (all columns except State).
# rowSums() is vectorised over rows and replaces the superseded
# rowwise() + do() idiom, which built a separate data frame for each row.
# The State column is re-attached with cbind(), so it comes back under the
# literal column name "CensusData2017$State".
Total <- CensusData2017 %>%
  select(-State) %>%
  mutate(TotalAsian = rowSums(.)) %>% # `.` is the State-less table piped in
  cbind(CensusData2017$State)
Renaming the column and preparing the data for the analysis
# Give the re-attached state column a usable name
Total <- rename(Total, State = "CensusData2017$State")

# Combine state population, Asian immigrant totals and state abbreviations
# (all matched on the state name), then express the Asian total as a
# percentage.
# NOTE(review): `Total` inside mutate() is presumably the total-population
# column coming from TotalPop -- confirm against the CSV header.
Total1 <- TotalPop %>%
  inner_join(Total, by = c("Geography" = "State")) %>%
  inner_join(StateShort, by = c("Geography" = "State")) %>%
  mutate(percent = TotalAsian / Total * 100)
## Warning: Column `Geography`/`State` joining character vector and factor,
## coercing into character vector
Visualizing data
# Bar chart of the Asian immigrant percentage per state. Bars are ordered
# from the highest to the lowest percentage and labelled with the state
# abbreviation; the fill legend is hidden.
p <- ggplot(
  data = Total1,
  mapping = aes(
    x = reorder(Abbreviation, -percent),
    y = percent,
    fill = reorder(Geography, -percent)
  )
) +
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = c(0, 5, 10, 15, 20, 25, 30, 35)) +
  labs(x = 'States', y = " Percent of Asian Immigrants ") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_text(face = 'italic', size = 8),
    # Effective x-axis text settings: bold face, size 8, coloured, rotated
    axis.text.x = element_text(
      face = "bold", size = 8, color = "#993333", angle = 45
    )
  )

# Interactive (plotly) rendering of the chart
ggplotly(p)
Figure: Asian Immigrant Population Percentage in 50 States and Puerto Rico in 2017
# Bar chart of the absolute Asian immigrant count per state. Bars are ordered
# by the count, while the fill factor is still ordered by percentage.
# The object name `t` coincides with base R's t() function; it is kept
# unchanged so existing references continue to work.
t <- ggplot(
  data = Total1,
  mapping = aes(
    x = reorder(Abbreviation, -TotalAsian),
    y = TotalAsian,
    fill = reorder(Geography, -percent)
  )
) +
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = seq(0, 6000000, 1000000), labels = comma) +
  labs(x = 'States', y = " Total immigrant ") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_text(face = 'italic', size = 8),
    # Effective x-axis text settings: bold face, size 8, coloured, rotated
    axis.text.x = element_text(
      face = "bold", size = 8, color = "#993333", angle = 45
    )
  )
# print (t)

# Interactive (plotly) rendering of the chart
ggplotly(t)
Figure: Total Asian Immigrant Population in 50 States and Puerto Rico in 2017
California has the highest total number of Asian immigrants. However, when the total number of Asian immigrants is divided by the total population of the state, Hawaii has the highest percentage of Asian immigrants.
Now, let’s switch to cluster analysis.
# Clustering input: a single `percent` column with the state names moved
# into the row names
h.clust <- Total1 %>%
  select(Geography, percent) %>%
  column_to_rownames(var = "Geography")

# Inspect the structure of the clustering input
str(h.clust)
## 'data.frame': 52 obs. of 1 variable:
## $ percent: num 1.27 6.02 3.02 1.38 13.65 ...
# Check for missing values. is.null() only tests whether the object itself
# is NULL and never looks at the data; any(is.na()) scans every cell.
any(is.na(h.clust))
## [1] FALSE
# Complete-linkage hierarchical clustering on the pairwise distances between
# the states' percentages (dist() uses its default distance measure)
h.clust1 <- hclust(dist(h.clust), method = "complete")

# ggplot2-based dendrogram. labs(title = ) is added after ggtitle(), so the
# title that ends up on the plot is "Dendrogram".
ggdendrogram(h.clust1) +
  ggtitle(" Dendrogram of States with Asian Population") +
  labs(title = "Dendrogram", x = " States", y = "Euclidean Distance")

# Base-graphics dendrogram with all leaves hanging from the same level
plot(
  h.clust1,
  hang = -1,
  ps = 10,
  main = "Cluster dendrogram of Asian population of States - 2017 Census",
  xlab = " States",
  ylab = "Euclidean Distance"
)

# Outline the clusters obtained by cutting the tree at height 2.
# NOTE(review): the rectangles use h = 2 while the groups below use k = 7;
# the two cuts need not coincide -- confirm this is intended.
rect.hclust(h.clust1, h = 2)

# Assign every state to one of 7 clusters and keep the state name as a column
sub_grp <- cutree(h.clust1, k = 7)
groups <- rownames_to_column(as.data.frame(sub_grp), var = "State")
Now, let’s make a map.
if (! require (rgdal)) {install.packages ("rgdal")}
if (! require (geojsonio)) {install.packages ("geojsonio")}
if (! require (RColorBrewer)) {install.packages ("RColorBrewer")}
if (! require (broom)) {install.packages ("broom")}
if (! require (rgeos)) {install.packages ("rgeos")}
if (! require (mapproj)) {install.packages ("mapproj")}
library (rgdal)
library (geojsonio)
library (broom)
library (RColorBrewer)
library (mapproj)
The advantages of a hexmap are as follows: 1. It gives every constituency (or state) the same visual weight. 2. It eliminates discrepancies in US state sizes.
The disadvantages are: 1. It is not appropriate for statistical analysis. 2. Size is significantly distorted.
# Read the hexagon-grid GeoJSON of the US states as an sp object
map1 <- geojson_read("us_states_hexgrid.geojson", what = "sp")
plot(map1) # To see if the file is what we need

# Strip the " (United States)" suffix from google_name so that it holds the
# plain state name. The clean-up is applied once; repeating the same gsub()
# would not change the result.
map1@data <- map1@data %>%
  mutate(google_name = gsub(" \\(United States\\)", "", google_name))

# Flatten the polygons into a data frame whose `id` is the state name
map1_fortified <- tidy(map1, region = "google_name")
head(map1_fortified)
## # A tibble: 6 x 7
## long lat order hole piece group id
## <dbl> <dbl> <int> <lgl> <fct> <chr> <chr>
## 1 -94.4 35.8 1 FALSE 1 Alabama.1 Alabama
## 2 -91.7 34.5 2 FALSE 1 Alabama.1 Alabama
## 3 -91.7 31.9 3 FALSE 1 Alabama.1 Alabama
## 4 -94.4 30.5 4 FALSE 1 Alabama.1 Alabama
## 5 -97.1 31.9 5 FALSE 1 Alabama.1 Alabama
## 6 -97.1 34.5 6 FALSE 1 Alabama.1 Alabama
library(rgeos)
# Centroid of every hexagon paired with the iso3166_2 code from the map
# data; used as the text label positions on the map
centers <- data.frame(
  gCentroid(map1, byid = TRUE),
  id = map1@data$iso3166_2
)

# Attach each state's cluster membership to its polygon coordinates
groups1 <- inner_join(map1_fortified, groups, by = c("id" = "State"))
# Hex map of the states: each hexagon is filled by its cluster and labelled
# with the state code at its centroid. Axis titles, text and ticks are
# removed.
a <- ggplot() +
  geom_polygon(
    data = groups1,
    mapping = aes(
      x = long, y = lat,
      group = id, fill = as.factor(sub_grp)
    ),
    color = "white"
  ) +
  geom_text(data = centers, mapping = aes(x = x, y = y, label = id)) +
  coord_map() +
  ggtitle("Map of the United States in Hexbin") +
  labs(
    fill = 'Clusters/Groups',
    caption = "Map created by \n Rajesh Sigdel"
  ) +
  theme_bw() +
  theme(
    panel.background = element_rect(fill = "#f5f5f2", color = NA),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

print(a)