Load the required packages

library(dplyr)
library(readr)
library(ggplot2)
library(tidyr)
library(stringr)

Assignment 1

Question 1:

Why is the argument skip = 1 used to remove the first row of the data set when reading the data?

# Read the survey responses. skip = 1 makes read.csv() ignore the first line
# of the file, so the header is taken from the file's second line.
df_raw <- read.csv(
  file = "D:/2020/R course/Day_3/multiple_choice_responses.csv",
  skip = 1
)

The first row contains unnecessary information that disturbs the process of data manipulation.

Question 2:

Replace the spaces in the column names with underscores

# Replace every literal dot in the column names with an underscore.
# fixed(".") is required: as a regular expression an unescaped "." matches
# ANY character, which would turn each whole name into a run of underscores.
# The cleaned names are assigned directly instead of being pasted in front of
# the old names.
colnames(df_raw) <- str_replace_all(colnames(df_raw), fixed("."), "_")

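I had a problem while reading the data: all the spaces in the column names were converted to dots. The code that I used to replace the dots/spaces with *"_"* returned weird outcomes, and I had not figured out what the problem was. The likely cause is that an unescaped `.` in a regular expression matches any character, so the pattern has to be escaped (`"\\."`) or wrapped in `fixed(".")`.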

Question 3

Create a new data frame which contains only two variables: What_is_your_gender?_-_Selected_Choice and In_which_country_do_you_currently_reside?. Rename them as gender and nation, respectively.

Since I failed at question 2, I read the data again in this step to work on the other questions.

# Re-read the raw survey so the column names produced by read.csv() (with
# dots) are available again.
df_raw <- read.csv(
  file = "D:/2020/R course/Day_3/multiple_choice_responses.csv",
  skip = 1
)

# Keep only the gender and country-of-residence answers, under short names.
df_profession <- select(
  df_raw,
  gender = What.is.your.gender....Selected.Choice,
  nation = In.which.country.do.you.currently.reside.
)

Question 4

Calculate the proportion of females working in Data Science for each country

# Total number of respondents per country, largest country first.
df_profession_tot <- df_profession %>%
  group_by(nation) %>%
  summarise(tot = n()) %>%
  arrange(desc(tot))
## `summarise()` ungrouping output (override with `.groups` argument)
# Number of respondents per country who answered "Female".
df_profession_female <- df_profession %>%
  filter(gender == "Female") %>%
  group_by(nation) %>%
  count() %>%
  ungroup() %>%
  rename(female = n)

# Number of respondents per country who answered "Male".
df_profession_male <- df_profession %>%
  filter(gender == "Male") %>%
  group_by(nation) %>%
  count() %>%
  ungroup() %>%
  rename(male = n)

# Attach the female counts, then the male counts, to the per-country totals.
# full_join() keeps every country; a country missing from one of the count
# tables gets NA in that column.
df_profession_female <- df_profession_tot %>%
  full_join(df_profession_female, by = "nation")
df_profession_final <- df_profession_female %>%
  full_join(df_profession_male, by = "nation")

# Express each gender's count as a percentage of the country's respondents.
df_rate <- df_profession_final %>%
  transmute(
    nation,
    female_rate = (female / tot) * 100,
    male_rate = (male / tot) * 100
  )

head(df_rate)
## # A tibble: 6 x 3
##   nation                   female_rate male_rate
##   <chr>                          <dbl>     <dbl>
## 1 India                          16.1       82.7
## 2 United States of America       20.2       77.4
## 3 Other                          15.1       79.9
## 4 Brazil                         10.9       88.3
## 5 Japan                           8.02      90.9
## 6 Russia                         12.3       86.7

Question 5

Extract the list of 19 countries which possess the largest proportions of female Data Scientists, and add Vietnam to make a list of 20 countries

# Viet Nam is always part of the final list, whatever its ranking.
df_vn <- df_rate %>%
  filter(nation == "Viet Nam")

# Take the 19 highest female rates among the OTHER countries. Excluding
# Viet Nam here guarantees it cannot appear twice in the combined list; a
# duplicated country would later produce duplicated factor levels when the
# plot orders the nations. Note that top_n() keeps ties, so more than 19 rows
# can be returned when several countries share the 19th-highest rate.
df_top19 <- df_rate %>%
  filter(nation != "Viet Nam") %>%
  top_n(19, female_rate)

# Viet Nam first, followed by the top-19 countries in their original order.
list_country <- bind_rows(df_vn, df_top19)

head(list_country)
## # A tibble: 6 x 3
##   nation                                               female_rate male_rate
##   <chr>                                                      <dbl>     <dbl>
## 1 Viet Nam                                                    10.9      88.3
## 2 United States of America                                    20.2      77.4
## 3 United Kingdom of Great Britain and Northern Ireland        18.9      78.6
## 4 Canada                                                      22        74.2
## 5 Turkey                                                      22.9      75.7
## 6 South Korea                                                 19.8      79.1

Question 6

Reproduce a bar plot illustrating the proportion of female Data Scientists for the list of 20 countries in Q5

# Reshape the country table to long format (one row per nation and gender),
# build the percentage label text and shorten the long country names.
df_nation <- list_country %>%
  rename(female = female_rate, male = male_rate) %>%
  pivot_longer(
    cols = c("female", "male"),
    names_to = "gender",
    values_to = "rate"
  ) %>%
  mutate(
    # format() pads all values to a common width, e.g. " 8.02%" / "22.00%".
    per = paste0(format(round(rate, 2), nsmall = 2), "%"),
    nation = case_when(
      str_detect(nation, "United States") ~ "United States",
      str_detect(nation, "United Kingdom") ~ "United Kingdom",
      str_detect(nation, "Islam") ~ "Iran",
      TRUE ~ nation
    )
  )

# Label text of the female rows (kept for reference; the plot does not use it).
per_female <- df_nation %>%
  filter(gender == "female") %>%
  pull(per)

# Country names ordered by increasing female rate; used as factor levels so
# the bars are sorted by female share.
nation_levels <- df_nation %>%
  filter(gender == "female") %>%
  arrange(rate) %>%
  pull(nation)

df_nation %>%
  mutate(nation = factor(nation, levels = nation_levels)) %>%
  ggplot(aes(x = nation, y = rate, fill = gender)) +
  geom_col(position = "fill") +
  # Explicit breaks keep the five hard-coded labels aligned with the axis.
  scale_y_continuous(
    breaks = seq(0, 1, 0.25),
    labels = paste0(seq(0, 100, 25), "%"),
    expand = c(0, 0)
  ) +
  xlab(NULL) +
  ylab(NULL) +
  labs(
    title = "Fact 1: Women in Machine Learning and Data Science Comunity",
    caption = "Source: 2019 Kaggle ML & DS Survey"
  ) +
  # Annotate the female share only. The label layer previously received both
  # the male and the female row of every country and drew both texts at the
  # same position, so they overlapped. Passing a function as `data` filters
  # the plot data for this layer alone.
  geom_text(
    data = function(d) filter(d, gender == "female"),
    aes(x = nation, y = 0.97, label = per),
    color = "white"
  ) +
  coord_flip() +
  scale_fill_manual(values = c("firebrick2", "dodgerblue2")) +
  theme(
    legend.title = element_blank(),
    legend.text = element_text(size = 11),
    legend.background = element_rect(fill = "lightgrey"),
    legend.position = "top",
    legend.direction = "horizontal",
    legend.key.size = unit(0.3, "cm"),
    legend.key.width = unit(0.5, "cm"),
    axis.ticks = element_blank(),
    axis.line.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    plot.caption = element_text(face = "italic"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    plot.background = element_rect(fill = "lightgrey"),
    plot.margin = unit(c(1, 1, 1, 1), "cm")
  )

The text annotation on the bars was difficult to implement properly. When `geom_text()` receives both the male and the female rows, the annotated texts for the two genders overlap. I struggled with the code to remove the male labels and had not figured out the appropriate way to do it; passing only the female rows to `geom_text()` through its `data` argument avoids the overlap.

Assignment 2

Question 1

library(rvest)
library(stringr)

# Vietnam index page on citypopulation.de.
all_links <- "https://www.citypopulation.de/Vietnam.html"
pg <- read_html(all_links)

# Every anchor element on the page and its href attribute.
m <- pg %>% html_nodes("a")
k <- m %>% html_attr("href")

# Drop the first six hrefs (presumably not province links; confirm against the
# page) and prefix the remaining ones with the base URL.
all_links_communes_level <- str_c(
  "https://www.citypopulation.de/en/vietnam/",
  tail(k, -6)
)

The object all_links_communes_level is a character vector. Each element of the object is a link to the page with population data at district and commune levels for one province of Vietnam.

Question 2

library(dplyr)

# Work with the first link of the list.
specific_link <- all_links_communes_level[1]

# Parse the page, select the node(s) with id "tl" and convert them to tables.
df <- specific_link %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="tl"]') %>%
  html_table(fill = TRUE)

# html_table() returns a list of tables; keep the first one.
df <- df[[1]]

head(df)
##                   Name         Status PopulationCensus2009-04-01         
## 1     Bình Th<U+1EE7>y Urban District                    113,565 <U+2192>
## 2       An Th<U+1EDB>i           Ward                     18,499 <U+2192>
## 3     Bình Th<U+1EE7>y           Ward                     18,307 <U+2192>
## 4 Bùi H<U+1EEF>u Nghia           Ward                     11,745 <U+2192>
## 5             Long Hòa           Ward                     16,450 <U+2192>
## 6    Long Tuy<U+1EC1>n           Ward                     15,232 <U+2192>

When replacing the number 1 with the number 2 in all_links_communes_level[1], specific_link provides the link to the data for Danang instead of Can Tho as in the initial link.

It is not possible to extract the 100th element of all_links_communes_level since the object contains only 63 elements. Therefore all_links_communes_level[100] returns NA, which subsequently results in an error for the whole code.

Question 3

Write a function which accepts an HTML link as input and returns a data frame with the information on population at district and commune levels corresponding to that link

# Download the population table (the element with id "tl") of one or more
# pages and return it as a single data frame.
#
# link_list: character vector with at least one page URL.
# Returns:   for a single link, that page's table; for several links, the
#            tables of all pages stacked in the order of link_list.
# Errors:    if link_list is not a non-empty character vector, or if a page
#            cannot be read / contains no matching table.
pop_func <- function(link_list) {
  if (!is.character(link_list) || length(link_list) == 0L) {
    stop(
      "`link_list` must be a character vector with at least one link.",
      call. = FALSE
    )
  }

  # Scrape the "tl" table from one page.
  read_population_table <- function(link) {
    page <- read_html(link)
    nodes <- html_nodes(page, xpath = '//*[@id="tl"]')
    html_table(nodes, fill = TRUE)[[1]]
  }

  # Visit EVERY link. The previous loop, `for (i in length(link_list))`,
  # iterated over the single value length(link_list) and therefore scraped
  # only the last page.
  tables <- lapply(link_list, read_population_table)

  if (length(tables) == 1L) {
    return(tables[[1]])
  }

  # Assumes all pages share the same table columns; rbind() stops with an
  # error otherwise.
  do.call(rbind, tables)
}

# Scrape the population data for the collected links and preview the result.
population_data <- pop_func(all_links_communes_level)

head(population_data)
##               Name        Status PopulationCensus2009-04-01         
## 1   L<U+1EE5>c Yên      District                    102,946 <U+2192>
## 2    An L<U+1EA1>c Rural Commune                      2,546 <U+2192>
## 3           An Phú Rural Commune                      4,599 <U+2192>
## 4 Ð<U+1ED9>ng Quan Rural Commune                      5,809 <U+2192>
## 5       Khai Trung Rural Commune                      1,142 <U+2192>
## 6        Khánh Hoà Rural Commune                      2,964 <U+2192>

Question 4

Generate a choropleth map illustrating population density at different levels: (a) province, (b) district, assuming that the population data acquired is the population density.