I had a unique opportunity to give the first tutorial organized by Abuja R User Group on their Inaugural Meetup. The tutorial was meant for aspiring Data Scientist looking forward to using R as they journey to realise their dream.
A lot has happened and reported in the media on the 2019 Presidential Election result. So, what dataset could be more interesting than this result. It would make the class more vibrant to take a look at the result from a Data Analyst perspective. The 2019 Presidential Election results data was obtained from BBC website, https://www.bbc.co.uk/news/resources/idt-f0b25208-4a1d-4068-a204-940cbe88d1d3
# Load Libraries ----
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library(leaflet)
library(sp)
The working directory or folder where files related to the project or document are saved
setwd("~/AbujaRUG")
The results are available in html table and json format. To understand the structure of the HTML a little understanding of HTML and CSS would be needed. We can take a look at the webpage by using the Google’s Developer tools to get But we would concentrate on the html table format. We would be using the rvest package.
# 1.0 Web Scraping using rvest ----
url <- read_html("https://www.bbc.co.uk/news/resources/idt-f0b25208-4a1d-4068-a204-940cbe88d1d3")
#available_tables <- html_nodes(url, "table") #we have 4 html tables on the page
election_results_df <-
#Get out all the html tables on the page
html_nodes(url, "table") %>%
#we have 4 html tables on the page
#After examination of all the tables, our table of interest is the fourth one,
#so we get out our data into a datatable
# get the fourth table
nth(4) %>%
# convert to data frame
html_table(header = FALSE) %>%
glimpse()
## Observations: 38
## Variables: 6
## $ X1 <chr> "Abia", "Adamawa", "Akwa Ibom", "Anambra", "Bauchi", "Bayel...
## $ X2 <chr> "85,058", "378,078", "175,429", "33,298", "798,428", "118,8...
## $ X3 <chr> "219,698", "410,266", "395,832", "524,738", "209,313", "197...
## $ X4 <chr> "1,489", "3,670", "1,902", "4,374", "2,104", "1,584", "2,79...
## $ X5 <chr> "336", "3,989", "230", "227", "296", "1,078", "554", "301",...
## $ X6 <chr> "9,638", "159", "61", "30,034", "149", "53", "4,582", "187"...
A look at the data.frame column displays some weird X1,X2 … X6 names. We need to replace these names with meaningful column names. Also, the unwanted last row which contains the total number of votes for each party in all the states needs to be removed.
# Assign the columns names as follow
names(election_results_df) <- c("State","APC","PDP","PCP","ADC","APGA")
# Remove the unwanted last row
election_results_df <- election_results_df[-38,]
# Display the data.frame
head(election_results_df)
## State APC PDP PCP ADC APGA
## 1 Abia 85,058 219,698 1,489 336 9,638
## 2 Adamawa 378,078 410,266 3,670 3,989 159
## 3 Akwa Ibom 175,429 395,832 1,902 230 61
## 4 Anambra 33,298 524,738 4,374 227 30,034
## 5 Bauchi 798,428 209,313 2,104 296 149
## 6 Bayelsa 118,821 197,933 1,584 1,078 53
# Categorizing each states into 6 different geopolitical zones
North_Central <- c("Benue", "FCT", "Kogi", "Kwara", "Nasarawa", "Niger", "Plateau")
North_East <- c("Adamawa", "Bauchi", "Borno", "Gombe", "Taraba", "Yobe")
North_West <- c("Kaduna", "Katsina", "Kano", "Kebbi", "Sokoto", "Jigawa","Zamfara")
South_East <- c("Abia", "Anambra", "Ebonyi", "Enugu", "Imo")
South_South <- c("Akwa Ibom", "Bayelsa", "Cross River", "Delta", "Edo", "Rivers")
South_West <- c("Ekiti", "Lagos", "Osun", "Ondo", "Ogun", "Oyo")
# Add new features i.e. new column for the geopolitical zones
election_results_df <-
election_results_df %>%
mutate(
geopolitical_zone = case_when(
State %in% North_Central ~ "North Central",
State %in% North_East ~ "North East",
State %in% North_West ~ "North west",
State %in% South_East ~ "South East",
State %in% South_South ~ "South South",
State %in% South_West ~ "South West"
)
) %>%
glimpse() #All columns data are character type
## Observations: 37
## Variables: 7
## $ State <chr> "Abia", "Adamawa", "Akwa Ibom", "Anambra", "...
## $ APC <chr> "85,058", "378,078", "175,429", "33,298", "7...
## $ PDP <chr> "219,698", "410,266", "395,832", "524,738", ...
## $ PCP <chr> "1,489", "3,670", "1,902", "4,374", "2,104",...
## $ ADC <chr> "336", "3,989", "230", "227", "296", "1,078"...
## $ APGA <chr> "9,638", "159", "61", "30,034", "149", "53",...
## $ geopolitical_zone <chr> "South East", "North East", "South South", "...
# Convert all the party columns from character type to numeric
# Create a function to convert to numeric
convert_to_numeric <- function(x){as.numeric(gsub("\\,", "", x))}
# map the function to each party column
election_results_df <- election_results_df %>%
select(c(2:6)) %>%
map(convert_to_numeric) %>%
as_tibble() %>%
cbind(select(election_results_df, c(State, geopolitical_zone))) %>%
# A new look at the data frame
glimpse()
## Observations: 37
## Variables: 7
## $ APC <dbl> 85058, 378078, 175429, 33298, 798428, 118821...
## $ PDP <dbl> 219698, 410266, 395832, 524738, 209313, 1979...
## $ PCP <dbl> 1489, 3670, 1902, 4374, 2104, 1584, 2793, 15...
## $ ADC <dbl> 336, 3989, 230, 227, 296, 1078, 554, 301, 32...
## $ APGA <dbl> 9638, 159, 61, 30034, 149, 53, 4582, 187, 43...
## $ State <chr> "Abia", "Adamawa", "Akwa Ibom", "Anambra", "...
## $ geopolitical_zone <chr> "South East", "North East", "South South", "...
# Reoder the columns to add more meaning to what we see
election_results_df <- election_results_df[c(6:7,1:5)]
head(election_results_df)
## State geopolitical_zone APC PDP PCP ADC APGA
## 1 Abia South East 85058 219698 1489 336 9638
## 2 Adamawa North East 378078 410266 3670 3989 159
## 3 Akwa Ibom South South 175429 395832 1902 230 61
## 4 Anambra South East 33298 524738 4374 227 30034
## 5 Bauchi North East 798428 209313 2104 296 149
## 6 Bayelsa South South 118821 197933 1584 1078 53
# Calculate total number of votes for each party in each state(each row)
total_votes_per_party <- election_results_df%>%
select(c(3:7)) %>%
map(sum)%>%
as.data.frame()
# Gather the total votes for each party to have key-value pairs
total_votes_per_party <- gather(total_votes_per_party, key = "Party", value = "Votes")
glimpse(total_votes_per_party)
## Observations: 5
## Variables: 2
## $ Party <chr> "APC", "PDP", "PCP", "ADC", "APGA"
## $ Votes <dbl> 15191847, 11262978, 107286, 97874, 66851
# Plot the graph for the total votes per state
ggplot(total_votes_per_party, aes(x=fct_reorder(Party, Votes),
y=Votes, color=Party, fill=Party)) +
geom_bar(stat="identity")
# 3.0 Examine votes spread within the geopolitical zones ----
votes_per_gp_zones <- election_results_df %>%
group_by(geopolitical_zone)%>%
summarise(ADC = sum(ADC), APC = sum(APC),APGA = sum(APGA),
PCP = sum(PCP),PDP = sum(PDP)) %>%
# Removes any groups that could be left in the tibble or data frame.
# Inadvertently leaving groups will be a source difficult to detect
ungroup()
votes_per_gp_zones%>%
rename('Geopolitical Zone' = geopolitical_zone)
## # A tibble: 6 x 6
## `Geopolitical Zone` ADC APC APGA PCP PDP
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North Central 7142 2465599 7316 19028 2023769
## 2 North East 5207 3238783 1916 11444 1255357
## 3 North west 2449 5995651 3048 17953 2280465
## 4 South East 1665 403968 52392 14720 1693485
## 5 South South 4156 1051396 1059 14752 2233232
## 6 South West 77255 2036450 1120 29389 1776670
# Visualizing the votes as spread over the geopolitical zones
gather(votes_per_gp_zones, Party, Votes, -geopolitical_zone)%>%
ggplot(aes(x=Party, y=Votes, fill=Party)) +
#geom_bar(stat = "identity") +
geom_col(aes(fill = Party), position = "dodge") +
# Create different baxes for each geopolitical zone
facet_wrap(~geopolitical_zone)+ #add number of columns or rows setting values for ncol and nrow respectively
# Change the scientific notaion of the Votes value to normal digits
# Add comma to separate the digits using comma function of the scales package
# by passing only the name of the function without the parenthesis, ()
scale_y_continuous(label = scales::comma)+
# Add comma to separate the digits using comma function of the scales package
geom_text(aes(label = scales::comma(Votes)), position = "dodge")+
# Add labels for title, subtitle to the graph
labs(
title = "2019 Nigeria Presidential Election",
subtitle = "Votes Distribution within the Six Geopolitical Zones",
x = "Political Party",
y = "Number of Votes Per Geopolitical Zone"
)
# highest votes and the party that has the highest votes for each state
highest_votes_per_state <- gather(election_results_df, key="party", value = "votes", -c(1,2)) %>% # create new columns - party and votes
# group by state
group_by(State) %>%
# summarize to get the highest scorer and scores for each state
summarise('Highest Votes Count for the State' = max(votes), Winner = party[votes == max(votes)])
highest_votes_per_state
## # A tibble: 37 x 3
## State `Highest Votes Count for the State` Winner
## <chr> <dbl> <chr>
## 1 Abia 219698 PDP
## 2 Adamawa 410266 PDP
## 3 Akwa Ibom 395832 PDP
## 4 Anambra 524738 PDP
## 5 Bauchi 798428 APC
## 6 Bayelsa 197933 PDP
## 7 Benue 356817 PDP
## 8 Borno 836496 APC
## 9 Cross River 295737 PDP
## 10 Delta 594068 PDP
## # ... with 27 more rows
# Visualization
# Simple barplot with ggplot2
ggplot(highest_votes_per_state, aes(Winner))+
# Number of cars in each class:
geom_bar(aes(fill=Winner))+
ylim(0,20)
# So let's get the percentage value of the result between APC and PDP
highest_votes_per_state%>%
summarise('Percent Votes for APC'=round(mean(Winner=="APC"),2), 'Percent Votes for PDP'=round(mean(Winner=="PDP"),2))
## # A tibble: 1 x 2
## `Percent Votes for APC` `Percent Votes for PDP`
## <dbl> <dbl>
## 1 0.51 0.49
# Percentage representation of votes for each party
percent_votes_per_state <- election_results_df[,3:7] %>%
# We are calculating based on each row
apply(1, function(x){x*100/sum(x,na.rm=T)})%>%
t()%>%
# Applied to each column
apply(2, function(x){round(x,1)})%>%
as.data.frame()
# Assign names to the columns
names(percent_votes_per_state) <- c("APC_Percent_Votes","PDP_Percent_Votes","PCP_Percent_Votes","ADC_Percent_Votes","APGA_Percent_Votes")
#Add the state column from the data dataframe
percent_votes_per_state <-percent_votes_per_state %>%
# Add State column as the first column
add_column(State = election_results_df[,1], .before = 1)
percent_votes_per_state %>%
set_names(c("State","APC Percent Votes","PDP Percent Votes","PCP Percent Votes","ADC Percent Votes","APGA Percent Votes")) %>% head()
## State APC Percent Votes PDP Percent Votes PCP Percent Votes
## 1 Abia 26.9 69.5 0.5
## 2 Adamawa 47.5 51.5 0.5
## 3 Akwa Ibom 30.6 69.0 0.3
## 4 Anambra 5.6 88.5 0.7
## 5 Bauchi 79.0 20.7 0.2
## 6 Bayelsa 37.2 62.0 0.5
## ADC Percent Votes APGA Percent Votes
## 1 0.1 3.0
## 2 0.5 0.0
## 3 0.0 0.0
## 4 0.0 5.1
## 5 0.0 0.0
## 6 0.3 0.0
#Combine all generated data frames
election_results_df <- election_results_df %>%
left_join(percent_votes_per_state, by="State") %>%
left_join(highest_votes_per_state, by="State")
# Change the States corresponding to FCT and Nasarawa to have the same name as is in the SpatialPolygonsDataFame
election_results_df$State[election_results_df$State=="FCT"] = "Federal Capital Territory"
election_results_df$State[election_results_df$State=="Nasarawa"] = "Nassarawa"
head(election_results_df)
## State geopolitical_zone APC PDP PCP ADC APGA
## 1 Abia South East 85058 219698 1489 336 9638
## 2 Adamawa North East 378078 410266 3670 3989 159
## 3 Akwa Ibom South South 175429 395832 1902 230 61
## 4 Anambra South East 33298 524738 4374 227 30034
## 5 Bauchi North East 798428 209313 2104 296 149
## 6 Bayelsa South South 118821 197933 1584 1078 53
## APC_Percent_Votes PDP_Percent_Votes PCP_Percent_Votes ADC_Percent_Votes
## 1 26.9 69.5 0.5 0.1
## 2 47.5 51.5 0.5 0.5
## 3 30.6 69.0 0.3 0.0
## 4 5.6 88.5 0.7 0.0
## 5 79.0 20.7 0.2 0.0
## 6 37.2 62.0 0.5 0.3
## APGA_Percent_Votes Highest Votes Count for the State Winner
## 1 3.0 219698 PDP
## 2 0.0 410266 PDP
## 3 0.0 395832 PDP
## 4 5.1 524738 PDP
## 5 0.0 798428 APC
## 6 0.0 197933 PDP
#Saving your wrangled data for future use
# Save dataframe to Excel csv
write_csv(election_results_df, "election_results.csv")
# Save a single object to a file
saveRDS(election_results_df, "election_results_df.rds")
# Restore it under a back
election_results_df <- readRDS("election_results_df.rds")
# Load the SpatialPolygonsDataFame
mypolygons <- readRDS("r_sp/gadm36_NGA_1_sp.rds")
# Next, I merged the mapping data with the Nigeria administrative level I SpatialPolygonsDataFrame.
# Since I am using administrative level I, I match the state name in my data with Name_1
# in the SpatialPolygonsDataFrame.
mypolygons@data <-
data.frame(mypolygons@data,
election_results_df[match(mypolygons@data[,"NAME_1"],
election_results_df[,"State"]),])
# Assign colors to 5 parties
palColor <- c("#e66101","#fdb863", "#f7f7f7","#b2abd2","#5e3c99")
factpal <- colorFactor(palColor, mypolygons@data$Winner)
# Create a popup to display election information for each state
popup <-
paste0(
"<strong>STATE: </strong>", mypolygons@data$STATE,
"<br><strong>WINNER: </strong>", mypolygons@data$Winner,
"<br><strong>Results: </strong>",
"<br><strong>APC: </strong>", paste(formatC(mypolygons@data$APC,format = "d",big.mark = ","), sep = ""),
"<br><strong>PDP: </strong>",paste(formatC(mypolygons@data$PDP,format = "d",big.mark = ","), sep = ""),
"<br><strong>ADC: </strong>",paste(formatC(mypolygons@data$ADC,format = "d",big.mark = ","), sep = ""),
"<br><strong>PCP: </strong>",paste(formatC(mypolygons@data$PCP,format = "d",big.mark = ","), sep = ""),
"<br><strong>APGA: </strong>",paste(formatC(mypolygons@data$APGA,format = "d",big.mark = ","), sep = ""))
#Displays information for each state only when each state polygon is clicked
leaflet() %>%
addPolygons(
data=mypolygons, weight = 2, color = "white",
fillOpacity = 0.8, fillColor = ~factpal(Winner),
popup = popup)
labels <-
paste0(
"<h1>STATE: </h1>", mypolygons@data$State,
"<br><strong>WINNER: </strong>", mypolygons@data$Winner,
"<br><strong>Results: </strong>",
"<br><strong>APC: </strong>", paste(formatC(mypolygons@data$APC,format = "d",big.mark = ","), sep = ""),
"<br><strong>PDP: </strong>",paste(formatC(mypolygons@data$PDP,format = "d",big.mark = ","), sep = ""),
"<br><strong>ADC: </strong>",paste(formatC(mypolygons@data$ADC,format = "d",big.mark = ","), sep = ""),
"<br><strong>PCP: </strong>",paste(formatC(mypolygons@data$PCP,format = "d",big.mark = ","), sep = ""),
"<br><strong>APGA: </strong>",paste(formatC(mypolygons@data$APGA,format = "d",big.mark = ","), sep = ""))%>%
lapply(htmltools::HTML)
#With Labels
leaflet() %>%
addPolygons(
data=mypolygons, weight = 2, color = "white",
fillOpacity = 0.8, fillColor = ~factpal(Winner),
label = labels)
labels_details <- sprintf("<h2>%s</h2>
<table style='font-family: arial, sans-serif; border-collapse: collapse; width: 350px;font-size: 14px;'>
<tr>
<th style='border: 1px solid #dddddd;text-align: left;padding: 8px;'>Candidate</th>
<th style='border: 1px solid #dddddd;text-align: left;padding: 8px;'>Votes</th>
<th style='border: 1px solid #dddddd;text-align: left;padding: 8px;'>%%</th>
</tr>
<tr style='background-color: #dddddd;'>
<td>Muhammadu Buhari (APC)</td>
<td>%s</td>
<td>%s</td>
</tr>
<tr>
<td>Atiku Abubakar (PDP)</td>
<td>%s</td>
<td>%s</td>
</tr>
<tr style='background-color: #dddddd;'>
<td>Obadiah Mailafia (ADC)</td>
<td>%s</td>
<td>%s</td>
</tr>
<tr>
<td>Felix Nicolas (PCP)</td>
<td>%s</td>
<td>%s</td>
</tr>
<tr style='background-color: #dddddd;'>
<td>Gbor John Wilson Terwase (APGA)</td>
<td>%s</td>
<td>%s</td>
</tr>
</table>",
mypolygons@data$State,paste(formatC(mypolygons@data$APC,format = "d",big.mark = ","), sep =""),
mypolygons@data$APC_Percent_Votes,paste(formatC(mypolygons@data$PDP,format = "d",big.mark = ","), sep = ""),
mypolygons@data$PDP_Percent_Votes,paste(formatC(mypolygons@data$ADC,format = "d",big.mark = ","), sep = ""),
mypolygons@data$ADC_Percent_Votes,paste(formatC(mypolygons@data$PCP,format = "d",big.mark = ","), sep = ""),
mypolygons@data$PCP_Percent_Votes,paste(formatC(mypolygons@data$APGA,format = "d",big.mark = ","), sep = ""),
mypolygons@data$APGA_Percent_Votes)%>%
lapply(htmltools::HTML)
#Display detail information Label
leaflet() %>%
addPolygons(
data=mypolygons, weight = 2, color = "white",
fillOpacity = 0.8, fillColor = ~factpal(Winner),
label = labels_details)