In this section, I will combine both data wrangling and data visualisation.
# packages installations
my_packages <- c("tidyverse", "ggthemes", "ggplot2", "extrafont", "stringr")
install.packages(my_packages, repos = "http://cran.rstudio.com")## package 'tidyverse' successfully unpacked and MD5 sums checked
## package 'ggthemes' successfully unpacked and MD5 sums checked
## package 'ggplot2' successfully unpacked and MD5 sums checked
## package 'extrafont' successfully unpacked and MD5 sums checked
## package 'stringr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mlcl.local\AppData\Local\Temp\Rtmp6h8gw0\downloaded_packages
# Load some packages for scrapping data and data manipulation:
rm(list = ls())
x <- c("rvest", "tidyverse", "magrittr")
lapply(x, library, character.only = TRUE)## [[1]]
## [1] "rvest" "xml2" "stats" "graphics" "grDevices" "utils"
## [7] "datasets" "methods" "base"
##
## [[2]]
## [1] "forcats" "stringr" "dplyr" "purrr" "readr"
## [6] "tidyr" "tibble" "ggplot2" "tidyverse" "rvest"
## [11] "xml2" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[3]]
## [1] "magrittr" "forcats" "stringr" "dplyr" "purrr"
## [6] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [11] "rvest" "xml2" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
# Loading data from the internet
link1 <- "https://college.harvard.edu/admissions/admissions-statistics"
field <- read_html("https://college.harvard.edu/admissions/admissions-statistics") %>%
html_table(fill = TRUE) %>%
.[[4]]
head(field)## Intended field of concentration
## 1 Humanities 14.5%
## 2 Social Sciences 23.8%
## 3 Biological Sciences 20.5%
## 4 Physical Sciences 7.6%
## 5 Engineering 11.7%
## 6 Computer Science 8.4%
This data doesn’t look clean enough.
# Re-names for variables
names(field) <- c("field", "rate")
head(field)## field rate
## 1 Humanities 14.5%
## 2 Social Sciences 23.8%
## 3 Biological Sciences 20.5%
## 4 Physical Sciences 7.6%
## 5 Engineering 11.7%
## 6 Computer Science 8.4%
# Extract "%" out
library(stringr)
# Creat my own function to pull out any strings rather than number
get_num <- function(x) {
x %>%
str_replace_all("[^0-9]" , "") %>%
as.numeric() %>%
return()
}
field %<>% mutate(rate = get_num(rate) / 10)
head(field)## field rate
## 1 Humanities 14.5
## 2 Social Sciences 23.8
## 3 Biological Sciences 20.5
## 4 Physical Sciences 7.6
## 5 Engineering 11.7
## 6 Computer Science 8.4
Now it’s time to visualize data!!!
theme_set(theme_minimal())
field %>%
ggplot(aes(field, rate)) +
geom_col()It looks boring, Let’s add more colours!
library(ggthemes)
p <- field %>%
ggplot(aes(reorder(field, rate), rate, fill = field)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(x = NULL,
y = NULL,
title = "Number of Students by Fields at Harvard University",
subtitle = "Created by Jenny Nguyen",
caption = "Data Source: https://college.harvard.edu/admissions/admissions-statistics")
p# I want to use the conomist theme, then,,,
p +
theme_economist(horizontal = FALSE) +
scale_fill_economist()p +
theme_wsj()# If I Want to add statistical numbers
p +
geom_text(aes(label = rate), hjust = 1.2, color = "white", size = 5)## And add "% symbol"
p +
geom_text(aes(label = paste(rate, "%", sep = " ")), hjust = 1.1, color = "white", size = 5) +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) -> p1
p1## Use the economist theme
p1 +
theme_economist(horizontal = FALSE) +
scale_fill_economist() +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())## Now, if I want to highlight one field over 8?
field %>%
ggplot(aes(reorder(field, rate), rate)) +
geom_col(show.legend = FALSE, fill = c("#27408B")) +
geom_col(data = field %>% filter(field %in% c( "Humanities")), fill = "red") +
coord_flip() +
labs(x = NULL,
y = NULL,
title = "Number of Students by Fields at Harvard University",
subtitle = "Created by Jenny Nguyen",
caption = "Data Source: https://college.harvard.edu/admissions/admissions-statistics")# A function for collecting population data:
get_population <- function(countryCode_selected, year_selected) {
base_url <- "https://www.census.gov/data-tools/demo/idb/region.php?N=%20Results%20&T=10&A=separate&RT=0&Y="
link <- paste0(paste0(paste0(base_url, year_selected), "&R=-1&C="), countryCode_selected)
link %>%
read_html() -> html_content
html_content %>%
html_table(fill = TRUE) %>%
.[[1]] -> pop_df
names(pop_df) <- str_replace_all(names(pop_df), " ", "_")
country_name <- html_content %>%
html_nodes('.query_source a') %>%
html_text()
if (length(country_name) == 2) {
country_name <- country_name[2]
}
final_df <- pop_df %>%
select(-Year, -Age, -contains("Percent"), -Sex_Ratio) %>%
mutate_all(function(x) {str_replace_all(x, "[^0-9]", "") %>% as.numeric()}) %>%
mutate(Year = pop_df$Year, Country = country_name[1], Age = pop_df$Age) %>%
select(Year, Country, Age, everything()) %>%
mutate(Percent_Both_Sexes = Both_Sexes_Population / Both_Sexes_Population[1],
Percent_Male = Male_Population / Male_Population[1],
Percent_Female = Female_Population / Female_Population[1],
Sex_Ratio = Male_Population / Female_Population,
Age = case_when(Age == "0-4" ~ "00-04", Age == "5-9" ~ "05-09", TRUE ~ Age))
return(final_df)
}#==============
# Style 1
#==============
# Use above function for Australia in 2016:
au_2016_pop <- get_population("AS", 2016)
head(au_2016_pop)## Year Country Age Both_Sexes_Population Male_Population
## 1 2016 Australia Total 22992654 11529136
## 2 2016 Australia 00-04 1376790 706537
## 3 2016 Australia 05-09 1369540 702678
## 4 2016 Australia 10-14 1356536 696218
## 5 2016 Australia 15-19 1403667 720507
## 6 2016 Australia 20-24 1576666 808486
## Female_Population Percent_Both_Sexes Percent_Male Percent_Female
## 1 11463518 1.00000000 1.00000000 1.00000000
## 2 670253 0.05987956 0.06128274 0.05846835
## 3 666862 0.05956424 0.06094802 0.05817254
## 4 660318 0.05899867 0.06038770 0.05760169
## 5 683160 0.06104850 0.06249445 0.05959427
## 6 768180 0.06857260 0.07012546 0.06701084
## Sex_Ratio
## 1 1.005724
## 2 1.054135
## 3 1.053708
## 4 1.054368
## 5 1.054668
## 6 1.052469
# Make a draft plot:
theme_set(theme_minimal())
au_2016_pop %>%
slice(-c(1, 21:22)) %>%
select(Age, Male_Population, Female_Population) %>%
mutate(Male_Population = -1*Male_Population) %>%
gather(Gender, Value, -Age) %>%
ggplot(aes(Age, Value, fill = Gender)) +
geom_col(position = "stack") +
coord_flip() +
theme(legend.position = "top") +
scale_y_continuous(breaks = seq(-1000000, 1000000, 200000),
limits = c(-1000000, 1000000),
labels = c(paste0(seq(10, 0, -2), ""), paste0(1:5, "")))This graph looks okay but not charming. We can make some improvements for making a more beautiful one by changing colours, adjusting axeses, adding titile and subtitles…
# Get more data from 1995 to 2017
total_df <- bind_rows(get_population("AS", 2017) %>%
slice(-c(1, 20:22)) %>%
select(Year, Age, Male_Population, Female_Population) %>%
mutate(Female_Population = -1*Female_Population) %>%
gather(Gender, Value, -Age, -Year),
get_population("AS", 2005) %>%
slice(-c(1, 20:22)) %>%
select(Year, Age, Male_Population, Female_Population) %>%
mutate(Female_Population = -1*Female_Population) %>%
gather(Gender, Value, -Age, -Year),
get_population("AS", 2000) %>%
slice(-c(1, 20:22)) %>%
select(Year, Age, Male_Population, Female_Population) %>%
mutate(Female_Population = -1*Female_Population) %>%
gather(Gender, Value, -Age, -Year),
get_population("AS", 1995) %>%
slice(-c(1, 20:22)) %>%
select(Year, Age, Male_Population, Female_Population) %>%
mutate(Female_Population = -1*Female_Population) %>%
gather(Gender, Value, -Age, -Year))# See the first 6 obs
head(total_df)## Year Age Gender Value
## 1 2017 00-04 Male_Population 711818
## 2 2017 05-09 Male_Population 706653
## 3 2017 10-14 Male_Population 703668
## 4 2017 15-19 Male_Population 718381
## 5 2017 20-24 Male_Population 805987
## 6 2017 25-29 Male_Population 851478
# See the last 6 obs
tail(total_df)## Year Age Gender Value
## 139 1995 60-64 Female_Population -345753
## 140 1995 65-69 Female_Population -349981
## 141 1995 70-74 Female_Population -319157
## 142 1995 75-79 Female_Population -239631
## 143 1995 80-84 Female_Population -174500
## 144 1995 85-89 Female_Population -92423
library(extrafont)
# Make final plot:
total_df %>%
ggplot(aes(x = Age, color = Gender))+
geom_linerange(data = total_df %>% filter(Gender == "Male_Population"),
aes(ymin = -0.35, ymax = -0.35 + Value),
size = 3.5, alpha = 0.8) +
geom_linerange(data = total_df %>% filter(Gender != "Male_Population"),
aes(ymin = 0.35, ymax = 0.35 + Value),
size = 3.5, alpha = 0.8) +
geom_label(aes(x = Age, y = 0, label = Age), inherit.aes = TRUE,
family = "OfficinaSansITC", size = 4, label.padding = unit(0, "lines"),
label.size = 0, label.r = unit(0.0, "lines"), fill = "#f5f5f2",
alpha = 1, color = "gray20") +
coord_flip() +
facet_wrap( ~ Year) +
labs(x = NULL, y = NULL,
title = "Population Pyramids of Australia from 1995 to 2017",
subtitle = "A population pyramid illustrates the age-sex structure of a country's population and may provide insights\nabout political and social stability, as well as economic development. Countries with young populations\nneed to invest more in schools, while countries with older populations need to invest more in the health sector.",
caption = "Data Source: https://www.census.gov") +
scale_y_continuous(breaks = seq(-1000000, 1000000, 200000),
limits = c(-1000000, 1000000),
labels = c(paste0(seq(10, 0, -2), ""), paste0(1:5, ""))) +
scale_color_manual(name = "", values = c(Female_Population = "#3E606F", Male_Population = "#8C3F4D"),
labels = c("Male", "Female")) +
theme(panel.grid.major.y = element_blank()) +
theme(plot.title = element_text(face = "bold", size = 24, family = "OfficinaSansITC", margin = margin(b = 9), hjust = 0, color = "grey20")) +
theme(plot.subtitle = element_text(size =12, margin = margin(b = 20), hjust = 0, family = "OfficinaSansITC", color = "grey30")) +
theme(plot.caption = element_text(size = 13, color = "grey50", family = "OfficinaSansITC")) +
theme(axis.text.y = element_blank()) +
theme(axis.text.x = element_text(size = 12, color = "grey20", family = "OfficinaSansITC", face = "bold")) +
theme(plot.background = element_rect(fill = "#EFF2F4", color = NA)) +
theme(panel.grid.major.x = element_line(linetype = "dotted", size = 0.2, color = "grey40")) +
theme(plot.margin = unit(c(1, 1, 1, 1), "cm")) +
theme(legend.position = "top") +
theme(legend.text = element_text(size = 10, face = "bold", color = "grey20")) +
theme(legend.text.align = 1) +
theme(strip.text = element_text(color = "grey20", size = 14, face = "bold", hjust = 0.026, family = "OfficinaSansITC"))