# Load the language data set from an explicit path instead of calling
# setwd(), so reading the file does not change the session's working
# directory as a side effect.
data_dir <- path.expand("~/Desktop/PSU_DAT3000_IntroToDA/05_module8/data")
data <- read_excel(file.path(data_dir, "My_Data.xlsx"))
# Print the imported tibble to inspect its columns.
data
## # A tibble: 1,302 × 9
## Language Endonym `World Region` Country `Global Speakers` `Language Family`
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 Abakuá Abakuá Caribbean "Cuba" NA <NA>
## 2 Abaza Абаза Western Asia "Turke… 49800 Abkhaz-Adyge
## 3 Abruzzese… Abruzz… Southern Euro… "Italy" NA Indo-European
## 4 Abruzzese… Abruzz… Southern Euro… "Italy" NA Indo-European
## 5 Acehnese Bahsa … Southeastern … "Indon… 3500000 Austronesian
## 6 Acehnese Bahsa … Southeastern … "Indon… 3500000 Austronesian
## 7 Adjoukrou <NA> Western Africa "Ivory… 140000 Atlantic-Congo
## 8 Adyghe <NA> Western Asia "Turke… 117500 Abkhaz-Adyge
## 9 Afenmai Afenmai Western Africa "Niger… 270000 Atlantic-Congo
## 10 African-A… Black … Northern Amer… "Unite… 45109521 Indo-European
## # ℹ 1,292 more rows
## # ℹ 3 more variables: Location <chr>, Size <chr>, Status <chr>
# Tally how many rows share each distinct Country value.
count(data, Country)
## # A tibble: 354 × 2
## Country n
## <chr> <int>
## 1 "Afghanistan" 2
## 2 "Afghanistan,\r\nPakistan" 4
## 3 "Afghanistan,\r\nTajikistan" 1
## 4 "Albania" 5
## 5 "Albania,\r\nKosovo,\r\nMontenegro,\r\nNorth Macedonia" 5
## 6 "Algeria" 1
## 7 "Algeria,\r\nEgypt" 2
## 8 "Algeria,\r\nFrance,\r\nIsrael" 1
## 9 "Algeria,\r\nTunisia" 1
## 10 "Angola" 2
## # ℹ 344 more rows
# The three countries used in the examples below; the vector order is also
# the factor level order.
country_levels <- c("Mexico", "China", "France")
# Convert Country to a factor restricted to those levels; any value that is
# not one of the levels becomes NA.
data_rev <- mutate(data, Country = factor(Country, levels = country_levels))
Make two bar charts here: one before ordering and another after.
## Unordered
# Mean number of global speakers per country, limited to rows whose Country
# is exactly one of the selected countries; missing speaker counts are
# ignored when averaging.
data_summary <- data %>%
  filter(Country %in% country_levels) %>%
  group_by(Country) %>%
  summarise(`Global Speakers` = mean(`Global Speakers`, na.rm = TRUE))
# Show the resulting summary table.
data_summary
## # A tibble: 3 × 2
## Country `Global Speakers`
## <chr> <dbl>
## 1 China 20617737
## 2 France 279500
## 3 Mexico 17647579.
# Unordered bar chart of mean global speakers per country. The task calls
# for bar charts, so geom_col() is used to draw one bar per row of the
# summary. Country is a character column here, so the bars follow ggplot2's
# default (alphabetical) ordering.
data_summary %>%
  ggplot(aes(x = Country, y = `Global Speakers`)) +
  geom_col()
# Print the summary table the chart is based on.
data_summary
## # A tibble: 3 × 2
## Country `Global Speakers`
## <chr> <dbl>
## 1 China 20617737
## 2 France 279500
## 3 Mexico 17647579.
## Ordered
# Ordered bar chart: fct_reorder() sorts the countries by their mean number
# of global speakers, and geom_col() draws one horizontal bar per country
# (the task calls for bar charts rather than points).
data_summary %>%
  filter(Country %in% country_levels) %>%
  ggplot(
    aes(
      x = `Global Speakers`,
      y = fct_reorder(.f = Country, .x = `Global Speakers`)
    )
  ) +
  geom_col() +
  # Replace the fct_reorder() expression with a readable axis title.
  labs(y = "Country", x = "Global Speakers")
# Print the summary table the chart is based on.
data_summary
## # A tibble: 3 × 2
## Country `Global Speakers`
## <chr> <dbl>
## 1 China 20617737
## 2 France 279500
## 3 Mexico 17647579.
Show examples of three functions:
# fct_recode(): rename each country level to a region label
# (new name = old level), then count rows per new level.
data_summary %>%
  filter(Country %in% country_levels) %>%
  mutate(
    Country = fct_recode(
      Country,
      NorthAmerica = "Mexico",
      Asia = "China",
      Europe = "France"
    )
  ) %>%
  count(Country)
## # A tibble: 3 × 2
## Country n
## <fct> <int>
## 1 Asia 1
## 2 Europe 1
## 3 NorthAmerica 1
# fct_collapse(): merge several levels into one. Mexico becomes
# "NorthAmerica" while China and France are combined into "Other"; the
# collapsed factor is computed directly inside count().
data_summary %>%
  filter(Country %in% country_levels) %>%
  count(
    Country = fct_collapse(
      Country,
      "NorthAmerica" = "Mexico",
      "Other" = c("China", "France")
    )
  )
## # A tibble: 2 × 2
## Country n
## <fct> <int>
## 1 Other 2
## 2 NorthAmerica 1
# fct_lump(): lump infrequent levels together, then count rows per level.
# Called with its default arguments only.
data_summary %>%
  filter(Country %in% country_levels) %>%
  mutate(Country = Country %>% fct_lump()) %>%
  count(Country)
## # A tibble: 3 × 2
## Country n
## <fct> <int>
## 1 China 1
## 2 France 1
## 3 Mexico 1
No need to do anything here.