Import data
# Load the workbook into `data`, then print it to preview the contents.
data <- read_excel(path = "My_Data.xlsx")
print(data)
## # A tibble: 1,302 × 9
## Language Endonym `World Region` Country `Global Speakers` `Language Family`
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Abakuá Abakuá Caribbean "Cuba" NA <NA>
## 2 Abaza Абаза Western Asia "Turke… 49800 Abkhaz-Adyge
## 3 Abruzzese… Abruzz… Southern Euro… "Italy" NA Indo-European
## 4 Abruzzese… Abruzz… Southern Euro… "Italy" NA Indo-European
## 5 Acehnese Bahsa … Southeastern … "Indon… 3500000 Austronesian
## 6 Acehnese Bahsa … Southeastern … "Indon… 3500000 Austronesian
## 7 Adjoukrou <NA> Western Africa "Ivory… 140000 Atlantic-Congo
## 8 Adyghe <NA> Western Asia "Turke… 117500 Abkhaz-Adyge
## 9 Afenmai Afenmai Western Africa "Niger… 270000 Atlantic-Congo
## 10 African-A… Black … Northern Amer… "Unite… 45109521 Indo-European
## # ℹ 1,292 more rows
## # ℹ 3 more variables: Location <chr>, Size <chr>, Status <chr>
Introduction
Questions
Variation
# Scatter of speaker counts by language family for rows with fewer than 300
# global speakers (filter() also drops rows whose count is NA).
data %>%
  filter(`Global Speakers` < 300) %>%
  ggplot(mapping = aes(x = `Language Family`, y = `Global Speakers`)) +
  # Jitter belongs to the layer: ggplot() ignores a `position` argument, so
  # setting it there has no effect. Only x is jittered (height = 0) so the
  # speaker counts stay exact.
  geom_point(position = position_jitter(width = 0.2, height = 0)) +
  scale_x_discrete(drop = FALSE) +
  # Slant the family names so long labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Language Family", y = "Global Speakers")

Visualizing distributions
# Frequency polygons of speaker counts (below 300), one line per family.
data %>%
  filter(`Global Speakers` < 300) %>%
  ggplot(aes(x = `Global Speakers`)) +
  geom_freqpoly(aes(colour = `Language Family`))

Typical values
# Histogram of speaker counts below 100,000, in bins of 200 speakers.
data %>%
  filter(`Global Speakers` < 100000) %>%
  ggplot(mapping = aes(x = `Global Speakers`)) +
  geom_histogram(binwidth = 200) +
  # Bar heights count rows of `data` (language entries), not language
  # families, so the y axis is labelled accordingly.
  labs(x = "Global Speakers", y = "Number of Languages")

Unusual values
# Histogram of the smallest speaker counts (below 300), default binning.
data %>%
  filter(`Global Speakers` < 300) %>%
  ggplot(mapping = aes(x = `Global Speakers`)) +
  geom_histogram() +
  # Bar heights count rows of `data` (language entries), not language
  # families, so the y axis is labelled accordingly.
  labs(x = "Global Speakers", y = "Number of Languages")

Missing values
# Speaker counts below 30,000 by language family, flagging missing counts.
data %>%
  # filter() drops rows where the condition is NA, so missing counts must be
  # kept explicitly; otherwise `missing` is FALSE for every remaining row.
  filter(is.na(`Global Speakers`) | `Global Speakers` < 30000) %>%
  mutate(
    missing = is.na(`Global Speakers`),
    # A point with an NA x value cannot be drawn; place missing counts at 0
    # so they remain visible and are told apart by colour.
    speakers_shown = ifelse(missing, 0, `Global Speakers`)
  ) %>%
  ggplot(aes(x = speakers_shown, y = `Language Family`, color = missing)) +
  geom_point(na.rm = TRUE) +
  labs(x = "Global Speakers")

Covariation
A categorical and continuous variable
# Frequency polygons of speaker counts below 3,000 in bins of 50 speakers,
# with one coloured line per language family.
data %>%
  filter(`Global Speakers` < 3000) %>%
  ggplot(aes(x = `Global Speakers`, colour = `Language Family`)) +
  geom_freqpoly(binwidth = 50)

Two categorical variables
# Tile plot of how many rows each language has among entries whose Country
# is exactly "China".
data %>%
  filter(Country == "China") %>%
  count(Language, Country) %>%
  ggplot(mapping = aes(x = Country, y = Language)) +
  # Fill by the count produced by count(); filling by Language only repeats
  # the y axis and leaves `n` unused.
  geom_tile(mapping = aes(fill = n)) +
  labs(x = "Country", y = "Languages in China")

Two continuous variables
# Speaker counts above 100,000 plotted against Size. The printed tibble lists
# Size as <chr>, so the x axis is discrete.
data %>%
  filter(`Global Speakers` > 100000) %>%
  ggplot(aes(x = Size, y = `Global Speakers`)) +
  # Mostly transparent points so overlapping observations show as darker.
  geom_point(alpha = 0.1) +
  scale_y_continuous(labels = scales::comma)

Patterns and models
# Fit log(speakers) against Size, then attach the model residuals to the data.
library(modelr)
mod <- lm(log(`Global Speakers`) ~ Size, data = data)
# exp() turns the log-scale residuals back into multiplicative ratios.
data2 <- mutate(modelr::add_residuals(data, mod), resid = exp(resid))
# NOTE(review): this plot shows the raw speaker counts; `resid` is computed
# above but not used here - confirm whether y was meant to be `resid`.
ggplot(data2, aes(x = Size, y = `Global Speakers`)) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Size of NYC Locations", y = "Global Speakers per Size")
