Executive Summary

I had a unique opportunity to give the first tutorial organized by Abuja R User Group on their Inaugural Meetup. The tutorial was meant for aspiring Data Scientist looking forward to using R as they journey to realise their dream.

A lot has happened and reported in the media on the 2019 Presidential Election result. So, what dataset could be more interesting than this result. It would make the class more vibrant to take a look at the result from a Data Analyst perspective. The 2019 Presidential Election results data was obtained from BBC website, https://www.bbc.co.uk/news/resources/idt-f0b25208-4a1d-4068-a204-940cbe88d1d3

Loading Libraries

# Load Libraries ----
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(leaflet)
library(sp)

Set working directory

The working directory or folder where files related to the project or document are saved

setwd("~/AbujaRUG")

Importing Data - Web Scrapping

The results are available in html table and json format. To understand the structure of the HTML a little understanding of HTML and CSS would be needed. We can take a look at the webpage by using the Google’s Developer tools to get But we would concentrate on the html table format. We would be using the rvest package.

# 1.0 Web Scraping using rvest ----

url <- read_html("https://www.bbc.co.uk/news/resources/idt-f0b25208-4a1d-4068-a204-940cbe88d1d3")


#available_tables <- html_nodes(url, "table") #we have 4 html tables on the page

election_results_df <- 
    #Get out all the html tables on the page
    html_nodes(url, "table") %>%
    #we have 4 html tables on the page
    #After examination of all the tables, our table of interest is the fourth one, 
    #so we get out our data into a datatable
    # get the fourth table
    nth(4) %>%
    # convert to data frame
    html_table(header = FALSE) %>% 
    glimpse()
## Observations: 38
## Variables: 6
## $ X1 <chr> "Abia", "Adamawa", "Akwa Ibom", "Anambra", "Bauchi", "Bayel...
## $ X2 <chr> "85,058", "378,078", "175,429", "33,298", "798,428", "118,8...
## $ X3 <chr> "219,698", "410,266", "395,832", "524,738", "209,313", "197...
## $ X4 <chr> "1,489", "3,670", "1,902", "4,374", "2,104", "1,584", "2,79...
## $ X5 <chr> "336", "3,989", "230", "227", "296", "1,078", "554", "301",...
## $ X6 <chr> "9,638", "159", "61", "30,034", "149", "53", "4,582", "187"...

Tidying up the Data

Assign the new columns names

A look at the data.frame column displays some weird X1,X2 … X6 names. We need to replace these names with meaningful column names. Also, the unwanted last row which contains the total number of votes for each party in all the states needs to be removed.

# Assign the columns names as follow
names(election_results_df) <- c("State","APC","PDP","PCP","ADC","APGA")


# Remove the unwanted last row
election_results_df <- election_results_df[-38,]

# Display the data.frame
head(election_results_df)
##       State     APC     PDP   PCP   ADC   APGA
## 1      Abia  85,058 219,698 1,489   336  9,638
## 2   Adamawa 378,078 410,266 3,670 3,989    159
## 3 Akwa Ibom 175,429 395,832 1,902   230     61
## 4   Anambra  33,298 524,738 4,374   227 30,034
## 5    Bauchi 798,428 209,313 2,104   296    149
## 6   Bayelsa 118,821 197,933 1,584 1,078     53

Categorizing each states into 6 different geopolitical zones

# Categorizing each states into 6 different geopolitical zones

North_Central <- c("Benue", "FCT", "Kogi", "Kwara", "Nasarawa", "Niger", "Plateau")

North_East <- c("Adamawa", "Bauchi", "Borno", "Gombe", "Taraba", "Yobe") 

North_West <- c("Kaduna", "Katsina", "Kano", "Kebbi", "Sokoto", "Jigawa","Zamfara") 

South_East <- c("Abia", "Anambra", "Ebonyi", "Enugu", "Imo")

South_South <- c("Akwa Ibom", "Bayelsa", "Cross River", "Delta", "Edo", "Rivers") 

South_West <- c("Ekiti", "Lagos", "Osun", "Ondo", "Ogun", "Oyo")


# Add new features i.e. new column for the geopolitical zones
election_results_df <-
    election_results_df %>% 
    mutate(
        geopolitical_zone = case_when(
            State %in% North_Central ~ "North Central",
            State %in% North_East ~ "North East",
            State %in% North_West ~ "North west",
            State %in% South_East ~ "South East",
            State %in% South_South ~ "South South",
            State %in% South_West ~ "South West"
        )
    ) %>% 
    glimpse() #All columns data are character type
## Observations: 37
## Variables: 7
## $ State             <chr> "Abia", "Adamawa", "Akwa Ibom", "Anambra", "...
## $ APC               <chr> "85,058", "378,078", "175,429", "33,298", "7...
## $ PDP               <chr> "219,698", "410,266", "395,832", "524,738", ...
## $ PCP               <chr> "1,489", "3,670", "1,902", "4,374", "2,104",...
## $ ADC               <chr> "336", "3,989", "230", "227", "296", "1,078"...
## $ APGA              <chr> "9,638", "159", "61", "30,034", "149", "53",...
## $ geopolitical_zone <chr> "South East", "North East", "South South", "...

Convert all the party columns from character type to numeric

# Convert all the party columns from character type to numeric

# Create a function to convert to numeric
convert_to_numeric <- function(x){as.numeric(gsub("\\,", "", x))}

# map the function to each party column
election_results_df <- election_results_df %>% 
    select(c(2:6)) %>% 
    map(convert_to_numeric) %>% 
    as_tibble() %>% 
    cbind(select(election_results_df, c(State, geopolitical_zone))) %>% 
    # A new look at the data frame
    glimpse()
## Observations: 37
## Variables: 7
## $ APC               <dbl> 85058, 378078, 175429, 33298, 798428, 118821...
## $ PDP               <dbl> 219698, 410266, 395832, 524738, 209313, 1979...
## $ PCP               <dbl> 1489, 3670, 1902, 4374, 2104, 1584, 2793, 15...
## $ ADC               <dbl> 336, 3989, 230, 227, 296, 1078, 554, 301, 32...
## $ APGA              <dbl> 9638, 159, 61, 30034, 149, 53, 4582, 187, 43...
## $ State             <chr> "Abia", "Adamawa", "Akwa Ibom", "Anambra", "...
## $ geopolitical_zone <chr> "South East", "North East", "South South", "...
# Reoder the columns to add more meaning to what we see
election_results_df <- election_results_df[c(6:7,1:5)]

head(election_results_df)
##       State geopolitical_zone    APC    PDP  PCP  ADC  APGA
## 1      Abia        South East  85058 219698 1489  336  9638
## 2   Adamawa        North East 378078 410266 3670 3989   159
## 3 Akwa Ibom       South South 175429 395832 1902  230    61
## 4   Anambra        South East  33298 524738 4374  227 30034
## 5    Bauchi        North East 798428 209313 2104  296   149
## 6   Bayelsa       South South 118821 197933 1584 1078    53

Calculate total number of votes for each party in each state(each row)

# Calculate total number of votes for each party in each state(each row)

total_votes_per_party <- election_results_df%>% 
    select(c(3:7)) %>% 
    map(sum)%>%
    as.data.frame()
# Gather the total votes for each party to have key-value pairs
total_votes_per_party <- gather(total_votes_per_party, key = "Party", value = "Votes")

glimpse(total_votes_per_party)
## Observations: 5
## Variables: 2
## $ Party <chr> "APC", "PDP", "PCP", "ADC", "APGA"
## $ Votes <dbl> 15191847, 11262978, 107286, 97874, 66851

Visualize the total votes per state

# Plot the graph for the total votes per state
ggplot(total_votes_per_party, aes(x=fct_reorder(Party, Votes), 
 y=Votes, color=Party, fill=Party)) +
    geom_bar(stat="identity")

Examine votes spread within the geopolitical zones

# 3.0 Examine votes spread within the geopolitical zones ----
votes_per_gp_zones <- election_results_df %>%
    group_by(geopolitical_zone)%>%
    summarise(ADC = sum(ADC), APC = sum(APC),APGA = sum(APGA),
              PCP = sum(PCP),PDP = sum(PDP)) %>% 
    # Removes any groups that could be left in the tibble or data frame.
    # Inadvertently leaving groups will be a source difficult to detect
    ungroup() 

votes_per_gp_zones%>% 
    rename('Geopolitical Zone' = geopolitical_zone)
## # A tibble: 6 x 6
##   `Geopolitical Zone`   ADC     APC  APGA   PCP     PDP
##   <chr>               <dbl>   <dbl> <dbl> <dbl>   <dbl>
## 1 North Central        7142 2465599  7316 19028 2023769
## 2 North East           5207 3238783  1916 11444 1255357
## 3 North west           2449 5995651  3048 17953 2280465
## 4 South East           1665  403968 52392 14720 1693485
## 5 South South          4156 1051396  1059 14752 2233232
## 6 South West          77255 2036450  1120 29389 1776670

Visualizing the votes as spread over the geopolitical zones

# Visualizing the votes as spread over the geopolitical zones
gather(votes_per_gp_zones, Party, Votes, -geopolitical_zone)%>%
    ggplot(aes(x=Party, y=Votes, fill=Party)) +
    #geom_bar(stat = "identity") +
   geom_col(aes(fill = Party), position = "dodge") +
    
    # Create different baxes for each geopolitical zone
    facet_wrap(~geopolitical_zone)+ #add number of columns or rows setting values for ncol and nrow respectively
    
    # Change the scientific notaion of the Votes value to normal digits
    # Add comma to separate the digits using comma function of the scales package
    # by passing only the name of the function without the parenthesis, ()
    scale_y_continuous(label = scales::comma)+
    
    # Add comma to separate the digits using comma function of the scales package
    geom_text(aes(label = scales::comma(Votes)), position = "dodge")+
    
    # Add labels for title, subtitle to the graph
    labs(
        title = "2019 Nigeria Presidential Election",
        subtitle = "Votes Distribution within the Six Geopolitical Zones",
        x = "Political Party",
        y = "Number of Votes Per Geopolitical Zone"
    )

Determine the highest votes and the party that has the highest votes for each state

# highest votes and the party that has the highest votes for each state
highest_votes_per_state <- gather(election_results_df, key="party", value = "votes", -c(1,2)) %>% # create new columns - party and votes
    # group by state  
    group_by(State) %>% 
    # summarize to get the highest scorer and scores for each state
    summarise('Highest Votes Count for the State' = max(votes), Winner = party[votes == max(votes)]) 

highest_votes_per_state
## # A tibble: 37 x 3
##    State       `Highest Votes Count for the State` Winner
##    <chr>                                     <dbl> <chr> 
##  1 Abia                                     219698 PDP   
##  2 Adamawa                                  410266 PDP   
##  3 Akwa Ibom                                395832 PDP   
##  4 Anambra                                  524738 PDP   
##  5 Bauchi                                   798428 APC   
##  6 Bayelsa                                  197933 PDP   
##  7 Benue                                    356817 PDP   
##  8 Borno                                    836496 APC   
##  9 Cross River                              295737 PDP   
## 10 Delta                                    594068 PDP   
## # ... with 27 more rows
# Visualization
# Simple barplot with ggplot2
ggplot(highest_votes_per_state, aes(Winner))+
    # Number of cars in each class:
    geom_bar(aes(fill=Winner))+
    ylim(0,20)

# So let's get the percentage value of the result between APC and PDP
highest_votes_per_state%>%
    summarise('Percent Votes for APC'=round(mean(Winner=="APC"),2), 'Percent Votes for PDP'=round(mean(Winner=="PDP"),2))
## # A tibble: 1 x 2
##   `Percent Votes for APC` `Percent Votes for PDP`
##                     <dbl>                   <dbl>
## 1                    0.51                    0.49

Calculate the Percentage representation of votes for each party

# Percentage representation of votes for each party
percent_votes_per_state <- election_results_df[,3:7] %>%
    # We are calculating based on each row
    apply(1, function(x){x*100/sum(x,na.rm=T)})%>%
    t()%>%
    # Applied to each column
    apply(2, function(x){round(x,1)})%>%
    as.data.frame()

# Assign names to the columns
names(percent_votes_per_state) <- c("APC_Percent_Votes","PDP_Percent_Votes","PCP_Percent_Votes","ADC_Percent_Votes","APGA_Percent_Votes")

#Add the state column from the data dataframe
percent_votes_per_state <-percent_votes_per_state %>%
    # Add State column as the first column
    add_column(State = election_results_df[,1], .before = 1)

percent_votes_per_state %>% 
    set_names(c("State","APC Percent Votes","PDP Percent Votes","PCP Percent Votes","ADC Percent Votes","APGA Percent Votes")) %>% head()
##       State APC Percent Votes PDP Percent Votes PCP Percent Votes
## 1      Abia              26.9              69.5               0.5
## 2   Adamawa              47.5              51.5               0.5
## 3 Akwa Ibom              30.6              69.0               0.3
## 4   Anambra               5.6              88.5               0.7
## 5    Bauchi              79.0              20.7               0.2
## 6   Bayelsa              37.2              62.0               0.5
##   ADC Percent Votes APGA Percent Votes
## 1               0.1                3.0
## 2               0.5                0.0
## 3               0.0                0.0
## 4               0.0                5.1
## 5               0.0                0.0
## 6               0.3                0.0

Combine all generated data frames

#Combine all generated data frames
election_results_df <- election_results_df %>% 
    left_join(percent_votes_per_state, by="State") %>% 
    left_join(highest_votes_per_state, by="State")



# Change the States corresponding to FCT and Nasarawa to have the same name as is in the SpatialPolygonsDataFame
election_results_df$State[election_results_df$State=="FCT"] = "Federal Capital Territory"
election_results_df$State[election_results_df$State=="Nasarawa"] = "Nassarawa"

head(election_results_df)
##       State geopolitical_zone    APC    PDP  PCP  ADC  APGA
## 1      Abia        South East  85058 219698 1489  336  9638
## 2   Adamawa        North East 378078 410266 3670 3989   159
## 3 Akwa Ibom       South South 175429 395832 1902  230    61
## 4   Anambra        South East  33298 524738 4374  227 30034
## 5    Bauchi        North East 798428 209313 2104  296   149
## 6   Bayelsa       South South 118821 197933 1584 1078    53
##   APC_Percent_Votes PDP_Percent_Votes PCP_Percent_Votes ADC_Percent_Votes
## 1              26.9              69.5               0.5               0.1
## 2              47.5              51.5               0.5               0.5
## 3              30.6              69.0               0.3               0.0
## 4               5.6              88.5               0.7               0.0
## 5              79.0              20.7               0.2               0.0
## 6              37.2              62.0               0.5               0.3
##   APGA_Percent_Votes Highest Votes Count for the State Winner
## 1                3.0                            219698    PDP
## 2                0.0                            410266    PDP
## 3                0.0                            395832    PDP
## 4                5.1                            524738    PDP
## 5                0.0                            798428    APC
## 6                0.0                            197933    PDP

Saving Generated datasets for future use

#Saving your wrangled data for future use
# Save dataframe to Excel csv
write_csv(election_results_df, "election_results.csv")

# Save a single object to a file
saveRDS(election_results_df, "election_results_df.rds")
# Restore it under a back
election_results_df <- readRDS("election_results_df.rds")

Spatial Analysis of the Election data on Spatial map

# Load the SpatialPolygonsDataFame
mypolygons <- readRDS("r_sp/gadm36_NGA_1_sp.rds")

# Next, I merged the mapping data with the Nigeria administrative level I SpatialPolygonsDataFrame.
# Since I am using administrative level I, I match the state name in my data with Name_1 
# in the SpatialPolygonsDataFrame.

mypolygons@data <- 
    data.frame(mypolygons@data, 
               election_results_df[match(mypolygons@data[,"NAME_1"], 
                              election_results_df[,"State"]),])

Option 1 - Election result per state when each of the state polygon is clicked

# Assign colors to 5 parties

palColor <- c("#e66101","#fdb863", "#f7f7f7","#b2abd2","#5e3c99")

factpal <- colorFactor(palColor, mypolygons@data$Winner)

# Create a popup to display election information for each state
popup <- 
    paste0(
        "<strong>STATE: </strong>", mypolygons@data$STATE, 
        "<br><strong>WINNER: </strong>", mypolygons@data$Winner,
        "<br><strong>Results: </strong>",
        "<br><strong>APC: </strong>", paste(formatC(mypolygons@data$APC,format = "d",big.mark = ","), sep = ""),
        "<br><strong>PDP: </strong>",paste(formatC(mypolygons@data$PDP,format = "d",big.mark = ","), sep = ""),
        "<br><strong>ADC: </strong>",paste(formatC(mypolygons@data$ADC,format = "d",big.mark = ","), sep = ""),
        "<br><strong>PCP: </strong>",paste(formatC(mypolygons@data$PCP,format = "d",big.mark = ","), sep = ""),
        "<br><strong>APGA: </strong>",paste(formatC(mypolygons@data$APGA,format = "d",big.mark = ","), sep = ""))

#Displays information for each state only when each state polygon is clicked
leaflet() %>% 
    addPolygons(
        data=mypolygons, weight = 2, color = "white", 
        fillOpacity = 0.8,  fillColor =   ~factpal(Winner),
        popup = popup)

Option 2 - Election result per state which pops up on rollover of each state polygon.

labels <-  
    paste0(
        "<h1>STATE: </h1>", mypolygons@data$State, 
        "<br><strong>WINNER: </strong>", mypolygons@data$Winner,
        "<br><strong>Results: </strong>",
        "<br><strong>APC: </strong>", paste(formatC(mypolygons@data$APC,format = "d",big.mark = ","), sep = ""),
        "<br><strong>PDP: </strong>",paste(formatC(mypolygons@data$PDP,format = "d",big.mark = ","), sep = ""),
        "<br><strong>ADC: </strong>",paste(formatC(mypolygons@data$ADC,format = "d",big.mark = ","), sep = ""),
        "<br><strong>PCP: </strong>",paste(formatC(mypolygons@data$PCP,format = "d",big.mark = ","), sep = ""),
        "<br><strong>APGA: </strong>",paste(formatC(mypolygons@data$APGA,format = "d",big.mark = ","), sep = ""))%>% 
    lapply(htmltools::HTML)

#With Labels
leaflet() %>% 
    addPolygons(
        data=mypolygons, weight = 2, color = "white", 
        fillOpacity = 0.8,  fillColor =   ~factpal(Winner),
        label = labels)

Option 2 - More Details on Election result per state which pops up on rollover of each state polygon in table format.

labels_details <- sprintf("<h2>%s</h2>
    <table style='font-family: arial, sans-serif; border-collapse: collapse; width: 350px;font-size: 14px;'>
        <tr>
            <th style='border: 1px solid #dddddd;text-align: left;padding: 8px;'>Candidate</th>
            <th style='border: 1px solid #dddddd;text-align: left;padding: 8px;'>Votes</th>
            <th style='border: 1px solid #dddddd;text-align: left;padding: 8px;'>%%</th>
        </tr>
        <tr style='background-color: #dddddd;'>
            <td>Muhammadu Buhari (APC)</td>
            <td>%s</td>
            <td>%s</td>
        </tr>
        <tr>
            <td>Atiku Abubakar (PDP)</td>
            <td>%s</td>
            <td>%s</td>
        </tr>
        <tr style='background-color: #dddddd;'>
            <td>Obadiah Mailafia (ADC)</td>
            <td>%s</td>
            <td>%s</td>
        </tr>
        <tr>
            <td>Felix Nicolas (PCP)</td>
            <td>%s</td>
            <td>%s</td>
        </tr>
        <tr style='background-color: #dddddd;'>
            <td>Gbor John Wilson Terwase (APGA)</td>
            <td>%s</td>
            <td>%s</td>
        </tr>
   </table>", 
   mypolygons@data$State,paste(formatC(mypolygons@data$APC,format = "d",big.mark = ","), sep =""),
   mypolygons@data$APC_Percent_Votes,paste(formatC(mypolygons@data$PDP,format = "d",big.mark = ","), sep = ""),
   mypolygons@data$PDP_Percent_Votes,paste(formatC(mypolygons@data$ADC,format = "d",big.mark = ","), sep = ""),
   mypolygons@data$ADC_Percent_Votes,paste(formatC(mypolygons@data$PCP,format = "d",big.mark = ","), sep = ""),
   mypolygons@data$PCP_Percent_Votes,paste(formatC(mypolygons@data$APGA,format = "d",big.mark = ","), sep = ""),
   mypolygons@data$APGA_Percent_Votes)%>% 
    lapply(htmltools::HTML)

#Display detail information Label
leaflet() %>% 
    addPolygons(
        data=mypolygons, weight = 2, color = "white", 
        fillOpacity = 0.8,  fillColor =   ~factpal(Winner),
        label = labels_details)

Thank you for your audience. Thank you!