Import your data

# Read the workbook through an explicit path instead of changing the working
# directory: setwd() alters global state for everything that runs afterwards
# (and knitr restores the directory at the end of the chunk anyway).
data_dir <- "~/Desktop/PSU_DAT3000_IntroToDA/05_module8/data"
data <- read_excel(path.expand(file.path(data_dir, "My_Data.xlsx")))
data
## # A tibble: 1,302 × 9
##    Language   Endonym `World Region` Country `Global Speakers` `Language Family`
##    <chr>      <chr>   <chr>          <chr>               <dbl> <chr>            
##  1 Abakuá     Abakuá  Caribbean      "Cuba"                 NA <NA>             
##  2 Abaza      Абаза   Western Asia   "Turke…             49800 Abkhaz-Adyge     
##  3 Abruzzese… Abruzz… Southern Euro… "Italy"                NA Indo-European    
##  4 Abruzzese… Abruzz… Southern Euro… "Italy"                NA Indo-European    
##  5 Acehnese   Bahsa … Southeastern … "Indon…           3500000 Austronesian     
##  6 Acehnese   Bahsa … Southeastern … "Indon…           3500000 Austronesian     
##  7 Adjoukrou  <NA>    Western Africa "Ivory…            140000 Atlantic-Congo   
##  8 Adyghe     <NA>    Western Asia   "Turke…            117500 Abkhaz-Adyge     
##  9 Afenmai    Afenmai Western Africa "Niger…            270000 Atlantic-Congo   
## 10 African-A… Black … Northern Amer… "Unite…          45109521 Indo-European    
## # ℹ 1,292 more rows
## # ℹ 3 more variables: Location <chr>, Size <chr>, Status <chr>

Chapter 15

Create a factor

# Tally how many rows fall under each distinct Country string.
count(data, Country)
## # A tibble: 354 × 2
##    Country                                                     n
##    <chr>                                                   <int>
##  1 "Afghanistan"                                               2
##  2 "Afghanistan,\r\nPakistan"                                  4
##  3 "Afghanistan,\r\nTajikistan"                                1
##  4 "Albania"                                                   5
##  5 "Albania,\r\nKosovo,\r\nMontenegro,\r\nNorth Macedonia"     5
##  6 "Algeria"                                                   1
##  7 "Algeria,\r\nEgypt"                                         2
##  8 "Algeria,\r\nFrance,\r\nIsrael"                             1
##  9 "Algeria,\r\nTunisia"                                       1
## 10 "Angola"                                                    2
## # ℹ 344 more rows
# Countries of interest, in the order the factor levels should take.
country_levels <- c("Mexico", "China", "France")

# Any Country value that is not one of `country_levels` becomes NA here.
data_rev <- mutate(data, Country = factor(Country, levels = country_levels))

Modify factor order

Make two bar charts here: one before ordering and another after.

## Unordered
# Mean number of global speakers for rows whose Country is exactly one of the
# selected countries (missing speaker counts are ignored).
data_summary <- data %>%
  filter(Country %in% country_levels) %>%
  group_by(Country) %>%
  summarise(`Global Speakers` = mean(`Global Speakers`, na.rm = TRUE))
data_summary
## # A tibble: 3 × 2
##   Country `Global Speakers`
##   <chr>               <dbl>
## 1 China           20617737 
## 2 France            279500 
## 3 Mexico          17647579.
# Point plot of the per-country means; Country is still a character column,
# so the axis uses the default (alphabetical) ordering.
ggplot(data_summary, aes(x = Country, y = `Global Speakers`)) +
  geom_point()

data_summary
## # A tibble: 3 × 2
##   Country `Global Speakers`
##   <chr>               <dbl>
## 1 China           20617737 
## 2 France            279500 
## 3 Mexico          17647579.
## Ordered
# Reorder Country by its mean speaker count before plotting so the axis runs
# from the smallest to the largest value.
data_summary %>%
  filter(Country %in% country_levels) %>%
  mutate(Country = fct_reorder(Country, `Global Speakers`)) %>%
  ggplot(aes(x = `Global Speakers`, y = Country)) +
  geom_point() +
  labs(y = "Country", x = "Global Speakers")

data_summary
## # A tibble: 3 × 2
##   Country `Global Speakers`
##   <chr>               <dbl>
## 1 China           20617737 
## 2 France            279500 
## 3 Mexico          17647579.

Modify factor levels

Show examples of three functions:

  • fct_recode
# fct_recode: rename each level individually (new name = "old name").
data_summary %>%
  filter(Country %in% country_levels) %>%
  mutate(
    Country = fct_recode(
      Country,
      "NorthAmerica" = "Mexico",
      "Asia" = "China",
      "Europe" = "France"
    )
  ) %>%
  count(Country)
## # A tibble: 3 × 2
##   Country          n
##   <fct>        <int>
## 1 Asia             1
## 2 Europe           1
## 3 NorthAmerica     1
  • fct_collapse
# fct_collapse: merge several existing levels into one new level.
data_summary %>%
  filter(Country %in% country_levels) %>%
  mutate(
    Country = fct_collapse(
      Country,
      NorthAmerica = "Mexico",
      Other = c("China", "France")
    )
  ) %>%
  count(Country)
## # A tibble: 2 × 2
##   Country          n
##   <fct>        <int>
## 1 Other            2
## 2 NorthAmerica     1
  • fct_lump
# fct_lump: lump the least frequent levels together. Every country appears
# exactly once in data_summary, so no level is lumped in this example.
data_summary %>%
  filter(Country %in% country_levels) %>%
  mutate(Country = fct_lump(Country)) %>%
  count(Country)
## # A tibble: 3 × 2
##   Country     n
##   <fct>   <int>
## 1 China       1
## 2 France      1
## 3 Mexico      1

Chapter 16

No need to do anything here.