About

We will create a map that shows, for each US state, the most commonly spoken language other than English and Spanish. The map is essentially a recreation of the one found at the following link, but built with the “leaflet” library:

http://www.slate.com/articles/arts/culturebox/2014/05/language_map_what_s_the_most_popular_language_in_your_state.html

Data

The data is taken from the United States Census Bureau:

https://www.census.gov/programs-surveys/acs/

The information about longitude and latitude of the states is taken from here:

https://inkplant.com/code/state-latitudes-longitudes

Map creation

library(leaflet)

Overview of the geographic information used to place the pins:

# Pin coordinates, one row per state. The CSV has no header row, so the
# column names are assigned by hand.
ll <- read.csv("state_latlon_inkplant.csv", header = FALSE)
names(ll) <- c("Geography", "Latitude", "Longitude")

# Drop the entries that are not shown on the map.
ll <- ll[
  ll$Geography != "District of Columbia" &
    ll$Geography != "Alaska" &
    ll$Geography != "Hawaii",
]

head(ll)
##     Geography Latitude  Longitude
## 1     Alabama 32.80667  -86.79113
## 3     Arizona 33.72976 -111.43122
## 4    Arkansas 34.96970  -92.37312
## 5  California 36.11620 -119.68156
## 6    Colorado 39.05981 -105.31110
## 7 Connecticut 41.59778  -72.75537

Relevant languages (each name is matched against the column headers of the census table):

# Candidate language names; each one is later used as a pattern against the
# column headers of the census table.
languages <- c(
  "Spanish",
  "French",
  "Italian",
  "Portuguese",
  "German",
  "Yiddish",
  "Other West Germanic",
  "Scandinavian",
  "Greek",
  "Russian",
  "Polish",
  "Serbo-Croatian",
  "Other Slavic",
  "Armenian",
  "Persian",
  "Gujarati",
  "Hindi",
  "Urdu",
  "Other Indic",
  "Other Indo-European",
  "Chinese",
  "Japanese",
  "Korean",
  "Hmong",
  "Thai",
  "Laotian",
  "Vietnamese",
  "Other Asian",
  "Tagalog",
  "Other Pacific Island",
  "Navajo",
  "Other Native North American",
  "Hungarian",
  "Arabic",
  "Hebrew",
  "African",
  "Other and unspecified"
)
# skip = 1 discards the first line of the file, so the second line supplies
# the column headers.
data <- read.csv("ACS_14_5YR_B16001_with_ann.csv", skip = 1)

# TRUE for every column whose header does not contain "well"
# (presumably the "speak English less than very well" sub-counts -- confirm
# against the table layout).
wells <- !grepl("well", colnames(data))

# For each language, the index of the first "Estimate..." column whose header
# contains the language name and is not a "well" column; NA if there is none.
idc <- sapply(languages, function(lang) {
  hits <- grep(paste0("^Estimate.*", lang), colnames(data))
  hits <- hits[wells[hits]]
  if (length(hits) > 0) hits[[1]] else NA
})

# Languages without a matching column are dropped.
idc <- idc[!is.na(idc)]

# Keep column 3 (renamed to "Geography") plus one column per matched language,
# and label the language columns with the plain language names.
data <- data[c(3, unname(idc))]
colnames(data) <- c("Geography", names(idc))
# actually used languages:
colnames(data)
##  [1] "Geography"    "Spanish"      "French"       "Italian"     
##  [5] "Portuguese"   "German"       "Yiddish"      "Scandinavian"
##  [9] "Greek"        "Russian"      "Polish"       "Armenian"    
## [13] "Persian"      "Gujarati"     "Hindi"        "Urdu"        
## [17] "Chinese"      "Japanese"     "Korean"       "Hmong"       
## [21] "Thai"         "Laotian"      "Vietnamese"   "Tagalog"     
## [25] "Navajo"       "Hungarian"    "Arabic"       "Hebrew"      
## [29] "African"
# Drop the entries that are not shown on the map.
data <- data[
  data$Geography != "District of Columbia" &
    data$Geography != "Puerto Rico" &
    data$Geography != "Alaska" &
    data$Geography != "Hawaii",
]
# For every row (state), pick the language column with the largest count.
# Columns are selected by name rather than by the fixed range 3:29, so the
# result stays correct if the set of matched language columns changes.
# "Spanish" is left out because the map shows the top language other than
# English and Spanish.
language_cols <- setdiff(colnames(data), c("Geography", "Spanish"))
language_winner <- apply(data[, language_cols, drop = FALSE], 1, function(x) {
  tmp_idx <- which.max(x)
  # which.max() returns integer(0) for an all-NA row; return NA so that the
  # result keeps exactly one value per row.
  if (length(tmp_idx) == 0) NA_character_ else language_cols[tmp_idx]
})
library(leaflet)

# Pair every pin with its popup by state name. `ll` and `data` come from
# different files, so relying on identical row order would silently mislabel
# pins if either file were sorted differently or missed a state.
winner_by_state <- setNames(
  as.character(language_winner),
  as.character(data$Geography)
)
pin_popups <- unname(winner_by_state[as.character(ll$Geography)])
if (anyNA(pin_popups)) {
  warning("No language found for some states in `ll`.", call. = FALSE)
}

ll %>%
  leaflet() %>%
  addTiles() %>%
  addMarkers(popup = pin_popups)
## Assuming 'Longitude' and 'Latitude' are longitude and latitude, respectively