Import data

# Load the workbook into `data`, then print it to preview the
# columns and first rows.
data <- read_excel(path = "My_Data.xlsx")
data
## # A tibble: 1,302 × 9
##    Language   Endonym `World Region` Country `Global Speakers` `Language Family`
##    <chr>      <chr>   <chr>          <chr>               <dbl> <chr>            
##  1 Abakuá     Abakuá  Caribbean      "Cuba"                 NA <NA>             
##  2 Abaza      Абаза   Western Asia   "Turke…             49800 Abkhaz-Adyge     
##  3 Abruzzese… Abruzz… Southern Euro… "Italy"                NA Indo-European    
##  4 Abruzzese… Abruzz… Southern Euro… "Italy"                NA Indo-European    
##  5 Acehnese   Bahsa … Southeastern … "Indon…           3500000 Austronesian     
##  6 Acehnese   Bahsa … Southeastern … "Indon…           3500000 Austronesian     
##  7 Adjoukrou  <NA>    Western Africa "Ivory…            140000 Atlantic-Congo   
##  8 Adyghe     <NA>    Western Asia   "Turke…            117500 Abkhaz-Adyge     
##  9 Afenmai    Afenmai Western Africa "Niger…            270000 Atlantic-Congo   
## 10 African-A… Black … Northern Amer… "Unite…          45109521 Indo-European    
## # ℹ 1,292 more rows
## # ℹ 3 more variables: Location <chr>, Size <chr>, Status <chr>

Introduction

Questions

Variation

# Scatter of speaker counts by language family, restricted to languages with
# fewer than 300 global speakers. filter() also drops rows whose speaker count
# is NA.
data %>%
  filter(`Global Speakers` < 300) %>%
  ggplot(mapping = aes(x = `Language Family`, y = `Global Speakers`)) +
  # The jitter must be set on the layer: ggplot() ignores a `position`
  # argument, so the points were previously drawn without any jitter.
  # Only horizontal jitter is applied (height = 0) so the y values stay exact.
  geom_point(position = position_jitter(width = 0.2, height = 0)) +
  scale_x_discrete(drop = FALSE) +
  # Rotate the family names so they do not overlap on the x axis.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Language Family", y = "Global Speakers")

Visualizing distributions

# Frequency polygons of speaker counts below 300, one line per language
# family. No binwidth is given, so the default binning is used.
ggplot(
  data = filter(data, `Global Speakers` < 300),
  mapping = aes(x = `Global Speakers`, colour = `Language Family`)
) +
  geom_freqpoly()

Typical values

# Histogram of speaker counts for languages with fewer than 100,000 speakers,
# in bins of 200 speakers.
data %>%
  filter(`Global Speakers` < 100000) %>%
  ggplot(mapping = aes(x = `Global Speakers`)) +
  geom_histogram(binwidth = 200) +
  # The bar height is the number of rows (language entries) in each bin,
  # not the number of language families.
  labs(x = "Global Speakers", y = "Number of Languages")

Unusual values

# Histogram of speaker counts for languages with fewer than 300 speakers.
# No binwidth is given, so the default binning is used.
data %>%
  filter(`Global Speakers` < 300) %>%
  ggplot(mapping = aes(x = `Global Speakers`)) +
  geom_histogram() +
  # The bar height is the number of rows (language entries) in each bin,
  # not the number of language families.
  labs(x = "Global Speakers", y = "Number of Languages")

Missing Values

# Show, per language family, how many entries lack a speaker count.
#
# filter() drops rows where the condition evaluates to NA, so a plain
# `Global Speakers` < 30000 removes every missing value before `missing` is
# computed and the flag is FALSE for all rows. Missing counts are therefore
# kept explicitly alongside the entries below 30,000 speakers.
data %>%
  filter(is.na(`Global Speakers`) | `Global Speakers` < 30000) %>%
  mutate(missing = is.na(`Global Speakers`)) %>%
  # A point layer cannot place rows whose x value is NA, so count the rows
  # in each family instead and split each bar by the `missing` flag.
  ggplot(aes(y = `Language Family`, fill = missing)) +
  geom_bar() +
  labs(x = "Number of Languages", y = "Language Family")

Covariation

A categorical and continuous variable

# Speaker counts below 3,000 drawn as one frequency polygon per language
# family, using bins of 50 speakers.
data %>%
  filter(`Global Speakers` < 3000) %>%
  ggplot(aes(x = `Global Speakers`, colour = `Language Family`)) +
  geom_freqpoly(binwidth = 50)

Two categorical variables

# Tile plot of how many rows each language has for China.
# NOTE(review): the printed preview shows Country values in quotes, which may
# mean the strings carry extra whitespace or list several countries - confirm
# that an exact match on "China" selects the intended rows.
data %>%
  filter(Country == "China") %>%
  # count() adds a column `n` with the number of rows per Language/Country.
  count(Language, Country) %>%
  ggplot(mapping = aes(x = Country, y = Language)) +
  # Fill by the computed count; filling by Language only repeated the y axis
  # and left `n` unused.
  geom_tile(mapping = aes(fill = n)) +
  labs(x = "Country", y = "Languages in China")

Two continuous variables

# Speaker counts against Size for languages with more than 100,000 speakers.
# Points are drawn mostly transparent so overlapping values stay visible.
# NOTE(review): the tibble preview lists Size as <chr>, so the x axis is
# discrete rather than continuous - confirm this is the intended comparison.
data %>%
  filter(`Global Speakers` > 100000) %>%
  ggplot(aes(x = Size, y = `Global Speakers`)) +
  geom_point(alpha = 0.1) +
  scale_y_continuous(labels = scales::comma)

Patterns and models

library(modelr)

# Linear model of the log speaker count explained by Size.
mod <- lm(log(`Global Speakers`) ~ Size, data = data)

# Attach the model residuals and back-transform them from the log scale.
data2 <- mutate(modelr::add_residuals(data, mod), resid = exp(resid))

# Boxplot of speaker counts for each Size value.
# NOTE(review): the plot uses the raw `Global Speakers` column; the `resid`
# column computed above is not drawn - confirm which one is intended.
ggplot(data2, aes(x = Size, y = `Global Speakers`)) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Size of NYC Locations", y = "Global Speakers per Size")