Student Details

Data Source

Load R packages

library(XML)
library(RCurl)
library(ggplot2)
library(dplyr)
library(magrittr)

Load datasets

# Save url of the page holding the cancer-frequency-by-country tables
# NOTE(review): the variable is called wikipediaPage but the URL points to
# wcrf.org; the name is kept in case other code refers to it
wikipediaPage <- "https://www.wcrf.org/int/cancer-facts-figures/data-cancer-frequency-country"
# Download the raw HTML once; every table below is parsed from this copy
htmlContent <- getURL(wikipediaPage)
# Save web page tables to result (all tables found on the page)
result <- readHTMLTable(htmlContent)
# Save tables: the 2nd and 3rd tables on the page are used as the men's and
# women's tables; keep text as character so it can be converted explicitly
tableMenCancer <- readHTMLTable(htmlContent, which = 2, stringsAsFactors = FALSE)
tableWomenCancer <- readHTMLTable(htmlContent, which = 3, stringsAsFactors = FALSE)
# Remove index column
tableMenCancer <- tableMenCancer[, -1]
# Add column headers
colnames(tableMenCancer) <- c("Country", "Men_Cancers")
# Sort table according to Country column
tableMenCancer <- tableMenCancer[order(tableMenCancer$Country), ]
# Remove index column
tableWomenCancer <- tableWomenCancer[, -1]
# Add column headers
colnames(tableWomenCancer) <- c("Country", "Women_Cancers")
# Sort table according to Country column
tableWomenCancer <- tableWomenCancer[order(tableWomenCancer$Country), ]
# Combine tables and omit rows containing missing values
# (countries present in the men's table but absent from the women's table)
tableCombine <- left_join(tableMenCancer, tableWomenCancer, by = "Country") %>%
  na.omit()
# Read in table containing country regions; keep text columns as character so
# the join key has the same type as tableCombine$Country on every R version
region <- read.csv("region.csv", stringsAsFactors = FALSE)
# Combine region and cancer tables
tableCombineRegion <- left_join(tableCombine, region, by = "Country")
# Convert the rate columns from character to numeric. The columns are
# addressed by name rather than by position so the conversion does not
# silently hit the wrong column if the column order ever changes.
tableCombineRegion[["Women_Cancers"]] <- as.numeric(tableCombineRegion[["Women_Cancers"]])
tableCombineRegion[["Men_Cancers"]] <- as.numeric(tableCombineRegion[["Men_Cancers"]])

Code and Visualisation

# Build the scatter plot of male against female cancer rates as one layered
# expression. Layers are drawn in the order they are added, so the line of
# best fit comes first and ends up underneath the points.
plot1 <- ggplot(tableCombineRegion, aes(x = Women_Cancers, y = Men_Cancers)) +
  # Linear line of best fit
  geom_smooth(alpha = 0.15, method = "lm", color = "light grey") +
  # Scatter layer, one point per row, coloured by region
  geom_point(aes(colour = Region), size = 3) +
  theme_minimal(base_size = 16) +
  # Title, subtitle and axis labels
  labs(
    title = "Global Cancer Rates for Women and Men",
    subtitle = "by country, 2012",
    x = "Female Cancers \n(Age-standardised rate per 100,000 people)",
    y = "Male Cancers \n(Age-standardised rate per 100,000 people)"
  ) +
  # Bold, centred main title and bold legend title for better clarity
  theme(
    plot.title = element_text(lineheight = .8, face = "bold", hjust = 0.5),
    legend.title = element_text(lineheight = .8, face = "bold")
  ) +
  # Annotation for Australia (country of the target audience), default colour
  annotate("text", x = 291.5, y = 376, label = "Australia", size = 5) +
  # Annotations for the extremes, all in the same muted colour
  annotate(
    "text",
    x = c(257, 318, 232, 210, 207),
    y = c(385, 350, 260, 306, 313),
    label = c("France", "Denmark", "Bulgaria", "Portugal", "Spain"),
    size = 5,
    colour = "lightsteelblue4"
  )
# Print the finished plot
plot1