# Import libraries
library(readr)
library(janitor) # Clean data
library(plyr) # MUST be loaded before dpylr
library(dplyr) # Wrangle data
Attaching package: ‘dplyr’
The following objects are masked from ‘package:plyr’:
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(RMySQL) # Import MySQL database to R
Loading required package: DBI
library(ggvis) # Visualisations
library(maps) # Map visualisation
Attaching package: ‘maps’
The following object is masked from ‘package:plyr’:
ozone
library(ggplot2) # visualisations
Attaching package: ‘ggplot2’
The following object is masked from ‘package:ggvis’:
resolution
library(ggmap) # Map visualisations
Google Maps API Terms of Service: http://developers.google.com/maps/terms.
Please cite ggmap if you use it: see citation('ggmap') for details.
library(mapproj) # Map visulisations
library(reshape2) # Shape data
# Connect to the MySQL database
conn <- dbConnect(
RMySQL::MySQL(),
dbname='global-warming',
host='127.0.0.1',
port= 3306,
user='root',
password='TempPassword')
# Import required tables into R data.frames
df_co2 <- dbGetQuery(conn, "select * from co2_total;")
df_obs <- dbGetQuery(conn, "select * from observation_joined;")
Decimal MySQL column 4 imported as numericDecimal MySQL column 5 imported as numericDecimal MySQL column 9 imported as numericDecimal MySQL column 10 imported as numericDecimal MySQL column 11 imported as numeric
# Disconnect from the database
dbDisconnect(conn)
[1] TRUE
Order the data by CO2 emissions and plot top 10 and lowest 10 emitting countries
# Order the data by co2_emissions and split into top and lowest emitting countries
ordered <- df_co2[order(-df_co2$co2_emissions),]
top_co2_countries <- head(ordered, 10)
lowest_co2_countries <- tail(ordered,10)
# Plot the top co2 emitting countries
top_co2_countries %>%
ggvis(~country_name, ~co2_emissions) %>%
layer_bars() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Top 10 CO2 Emitting Countries", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Country", title_offset = 100, properties = axis_props(labels = list(angle = 310, align = "right", baseline = "middle"))) %>%
add_axis(type = 'y', title="Total Lifetime CO2 Output (in million tonnes)", title_offset = 100, format='####')
Results
The USA, China and the Russia are the largest contributors to global CO2 emissions for the cleaned dataset
However, the dataset does not include the EU28 as a consolidated region, which would likely reveal the consolidated EU region as a top emitter as well.
The top contributors of the dataset appear in line with expectations.
# Plot the lowest co2 emitting countries
lowest_co2_countries %>%
ggvis(~country_name, ~co2_emissions) %>%
layer_bars() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Lowest 10 CO2 Emitting Countries", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Country", title_offset = 100, properties = axis_props(labels = list(angle = 310, align = "right", baseline = "middle"))) %>%
add_axis(type = 'y', title="Total Lifetime CO2 Output (in million tonnes)", title_offset = 100, format='####')
Results
All of the lowest emitting countries appear to be islands, or extremely small nations, which is to be expected due to their size.
# Split data into 60s and 10s
df_60 <- filter(df_obs, year <=1965)
df_10 <- filter(df_obs, year >1965)
# Calculate the unique number of weather stations prior to removing of stations without adequate records for data analysis
dist_stations_60 <- length(unique(df_60$stn_id))
dist_stations_10 <- length(unique(df_10$stn_id))
Results
1603 unqiue stations in the 1960’s data
1603 unqiue stations in the 2010’s data
Remove any weather which does not have recordings for at least 6 of the 10 years
# Calculate the count of unique stations in 60s and 10s based on std_id and year
unique_60_stn <- count(unique(select(df_60, stn_id, year)), stn_id)
unique_10_stn <- count(unique(select(df_10, stn_id, year)), stn_id)
# Filter stations which have less than 3 years of data in either period
low_recordings_60 <- filter(unique_60_stn, n < 3)
low_recordings_10 <- filter(unique_10_stn, n < 3)
# Join the list of stations with low recordings in either period together for a final list of stations across both periods with low recordings
low_years_recordings <- full_join(low_recordings_60, low_recordings_10, by = 'stn_id')
# Display the number of unique stations with low recordings
count(unique(low_years_recordings))
Results
609 weather stations have less than 3 years worth of recordings in either the 1960s or 2010s.
As these stations do not have an adequate number of years worth of data in one or both periods, they are to be dropped from the dataset
# Number of stns before dropping
stn_before <- count(unique(select(df_obs, stn_id)))
# Drop weather stations with low number of annual recordings
drop_rows <- low_years_recordings$stn_id
df_obs <- subset(df_obs, !(stn_id %in% drop_rows))
df_60 <- subset(df_60, !(stn_id %in% drop_rows))
df_10 <- subset(df_10, !(stn_id %in% drop_rows))
# Number of stns after dropping
stn_after <- count(unique(select(df_obs, stn_id)))
Results
609 weather stations dropped
994 weather stations remaining
Check that each weather station has recordings for each month for least 3 out of the 5 years in each period
# Calculate the count of unique stations in 60s and 10s based on std_id and month
unique_60_mth_stn <- count(select(df_60, stn_id, month), stn_id, month)
unique_10_mth_stn <- count(select(df_10, stn_id, month), stn_id, month)
# Filter to those stations which have less than 6
low_recordings_60 <- filter(unique_60_mth_stn, n < 3)
low_recordings_10 <- filter(unique_10_mth_stn, n < 3)
# Filter out low_recordings to the unique number of stantions
low_recordings_60 <- unique(select(low_recordings_60, stn_id))
low_recordings_10 <- unique(select(low_recordings_10, stn_id))
# Join the list of stations with low recordings in either period together for a final list of stations across both periods with low recordings
low_month_recordings <- full_join(low_recordings_60, low_recordings_10, by = 'stn_id')
# Display the number of unique stations with low recordings
count(unique(low_month_recordings))
Results
15 unique stations have months with less than 3 recordings in either or both period
As the data coverage for these stations is inadequte, conclude to drop all of these weather stations
# Number of stns before dropping rows
stn_before_mth <- count(unique(select(df_obs, stn_id)))
# List of stations to drop
drop_rows_mth <- low_month_recordings$stn_id
# Drop all weather stations with any month with less than 6 years worth of data
df_obs <- subset(df_obs, !(stn_id %in% drop_rows_mth))
# Number of stns before dropping rows
stn_after_mth <- count(unique(select(df_obs, stn_id)))
Results
15 weather stations dropped
979 weather stations remaining
58 countries covered by the remaining data
Identify any records with missing weather stations data
# Identify any weather stations with missing data (location data in particular)
sapply(df_obs, function(y) sum(length(which(is.na(y)))))
stn_id wban month year temp temp_points station_name country_code country_name
0 0 0 0 0 0 120 120 192
lat long elev begin end
120 120 120 120 120
Results
120 records with missing station name, country code, country name, lat, long, elev, begin and end Total of 192 missing country names
Check how many weather stations impacted by missing weather station data
# Group by stn_id to identify how many stations are impacted by missing values
na_values <- df_obs %>%
filter(is.na(df_obs$country_name)|
is.na(df_obs$country_code) |
is.na(df_obs$station_name) |
is.na(df_obs$lat) |
is.na(df_obs$long) |
is.na(df_obs$elev) |
is.na(df_obs$begin) |
is.na(df_obs$end)) %>%
group_by(stn_id) %>%
summarise(n = n())
na_values
Results
The missing values relate to a total of 2 weather stations, which is not considered a material number of stations. Further, one of these weather stations is invalid, as it is represented by the missing data value of 999999 as its stn_id.
Conclude to drop these stations from the dataset to preserve data integrity.
# Set drop rows to be the int values of na_values
drop_rows <- na_values[['stn_id']]
# Apply subset to df_obs to exclude any rows with theses stn_id values
df_obs <- subset(df_obs, !(stn_id %in% drop_rows))
# Group by country_name and count the number of stn_ids
stn_per_country <- df_obs %>%
group_by(country_name) %>%
summarise(count = n_distinct(stn_id))
# Visualise number of stations per country
stn_per_country %>%
ggvis(~count) %>%
# Set to bar chart
layer_histograms(center = 35, fill:= "#E74C3C") %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Histogram of No. of Weather Stations per Country", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="No. of Weather Stations", title_offset = 50) %>%
add_axis('y', title="Frequency", title_offset = 50)
Guessing width = 10 # range / 25
Results
There appears to be a significant number of countries with less thn 10 weather stations. Whilst this is not ideal, unfortunately this is what we are left with after thoughly cleaning the data to ensure the temperature measurements will be as accurate as possible.
Next review the spread of weather stations per country for countries where the number of stations is less than 20
# Visualise number of stations per country
stn_per_country %>%
# Countries with less than 20 stations
filter(count <=20) %>%
ggvis(~count) %>%
# Set to bar chart
layer_histograms(center = 35, fill:= "#E74C3C") %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Histogram of No. of Weather Stations per Country (less than 20 stations)", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="No. of Weather Stations", title_offset = 50) %>%
add_axis('y', title="Frequency", title_offset = 50)
Guessing width = 0.5 # range / 38
Results
Further investigation reveals that approximately 25 of the 58 countries have only one weather station.
As a result of this finding, it is anticipated that any differences of regions within a country will not be possible to calculate.
Visualise which countries have a high or low number of weather stations
# Create a function to plot weather stations
plot_map <- function(comparison, no_stn, dot_colour='blue', dot_size=1){
# Create conditional setup of comparison
if (comparison==">") {
stn_list <- subset(stn_per_country, count > no_stn)
} else if (comparison==">=") {
stn_list <- subset(stn_per_country, count >= no_stn)
} else if (comparison=="<") {
stn_list <- subset(stn_per_country, count < no_stn)
} else if (comparison=="<=") {
stn_list <- subset(stn_per_country, count <= no_stn)
} else if (comparison=="==") {
stn_list <- subset(stn_per_country, count == no_stn)
} else {
print("Error")
}
# Create coordinates for unique weather stations in list
coords <- df_obs %>%
filter((country_name %in% stn_list[['country_name']])) %>%
select(stn_id, lat, long) %>%
unique()
# Create lat and long coordinates for each station
lat <- unlist(select(coords, lat))
lon <- unlist(select(coords, long))
# Create world map
map_world <- borders("world", colour='gray50',fill="gray50")
mp <- ggplot() + map_world
# Plot coords on world map
mp <- mp + geom_point(aes(x=lon, y=lat) ,color=dot_colour, size=dot_size)
return(mp)
}
# Plot weather stations where country total stations are less than or equal to 10
plot_map("<=", 10)
# Plot weather stations where country total stations are greater than 10
plot_map(">", 10, 'red')
Results
The major georgraphical regions with 10 or less weather stations are found in the generally less developed, geographically smaller or less inhabited nations, such as Africa, South East Asia, Middle America, South America, Eastern Europe, Greenland and New Zealand.
However, notably, many countries in Eastern Europe have very few stations - perhaps due to the relative size of each country and the wealth of these countries.
The highest density of weather stations appear in the United States, Australia, China and many Western and wealthier European countries.
One notable country with no weather station data is India, which is in the top 10 highest CO2 emitting group. However, the India data was removed due to missing data in the weather station recordings. Despite India not being represented, other high CO2 emitting countries are included (China, USA etc), therefore the data is still considered appropriate.
Calculation of annual average temperatures and temperature differences between the two periods.
Calculated as an absolute difference, as well as a percentage difference of the base 1960s temperatures.
# Create a decade column and append to df_obs
df_obs$decade <- ifelse(df_obs$year > 1965, "10s", "60s")
# Calculate the average annual temperature for each station for the 60s & 10s
annual_avg_temp <- df_obs %>%
group_by(country_name, decade) %>%
summarise(mean = mean(temp), n= n()) %>%
acast(country_name~decade, value.var="mean") %>%
as.data.frame()
# Add column for temp_diff between two periods and % increase
annual_avg_temp$temp_diff <- annual_avg_temp$`10s` - annual_avg_temp$`60s`
annual_avg_temp$temp_percent <- annual_avg_temp$temp_diff / annual_avg_temp$`60s` * 100
# Set country_name to a column and reset indexes
annual_avg_temp$country_name <- rownames(annual_avg_temp)
rownames(annual_avg_temp) <- NULL
# Plot annual temp difference (as %)
annual_avg_temp %>%
ggvis(~country_name, ~temp_percent) %>%
layer_bars() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference as a Percentage by Country", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Country", title_offset = 100, properties = axis_props(labels = list(angle = 280, align = "right", baseline = "middle"))) %>%
add_axis(type = 'y', title="Percentage Temperature Increase over 1960s", title_offset = 80, format='####')
Results
The vast majority of countries have an average increase in temperature of less than 50%.
However, there are some notable outliers, in particular Canada which has a reported percenatage increase of over 450%. This could be an error, or a result of their harsh, cold climate.
Further, there are several countries with decreases in average temperature.
Each of these will be explored below.
# Plot countries with negative temperature changes
annual_avg_temp %>%
filter(temp_percent < 0) %>%
ggvis(~country_name, ~-temp_percent) %>%
layer_bars() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Countries with negative temperature difference percentage", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Country", title_offset = 100, properties = axis_props(labels = list(angle = 280, align = "right", baseline = "middle"))) %>%
add_axis(type = 'y', title="Percentage Temperature Decrease over 1960s", title_offset = 80, format='####')
Results
Each of the above countries, except for indonesia, typically have very cold climates. Perhaps, the displayed percentage decrease in temperatures for these countries is due to the annual average actually being negative.
# Filter DF to countries under investigation
c_to_review = c('greenland','indonesia','mongolia','russia', 'canada')
filter(annual_avg_temp, country_name %in% c_to_review)
Results
As expected, greenland, mongolia and russian appear to be showing as negative as their annual average temperatures have a negative baseline in the 1960s.
Canada is showing a radical percentage increase due to its baseline being almost 0, therefore any change at all will provide a radical result.
Indonesia appears to have approx the same temperature, ever so slightly less than it was in the 1960s.
# Check for other countries with negative or close to zero average annual temperatures in the 1960's
filter(annual_avg_temp, `60s` < 1)
Results
A further check confirms that only these countries have a negative baseline or a baseline very close to 0. Therefore, consider these temperature changes as accurate.
To ensure the percentage results are not distored when plotting or reporting, the negative temp_percent values are to be changed to absolute positive values.
# Convert temp_diff values to absolute
annual_avg_temp$temp_percent <- abs(annual_avg_temp$temp_percent)
# Replot
annual_avg_temp %>%
ggvis(~country_name, ~temp_percent) %>%
layer_bars() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference as a Percentage by Country", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Country", title_offset = 100, properties = axis_props(labels = list(angle = 280, align = "right", baseline = "middle"))) %>%
add_axis(type = 'y', title="Percentage Temperature Increase over 1960s", title_offset = 80, format='####')
# Plot only those countries with temperature increases of less than 100% to more clearly identify trends and patterns
annual_avg_temp %>%
filter(temp_percent <100) %>%
ggvis(~country_name, ~temp_percent) %>%
layer_bars() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference as a Percentage by Country (less than 100%)", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Country", title_offset = 100, properties = axis_props(labels = list(angle = 280, align = "right", baseline = "middle"))) %>%
add_axis(type = 'y', title="Percentage Temperature Increase over 1960s", title_offset = 80, format='####')
No interesting trends or patterns noted.
# Plot temperature difference for each country in degrees Celsius
annual_avg_temp %>%
ggvis(~country_name, ~temp_diff) %>%
layer_bars() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference in Degrees Celsius by Country", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Country", title_offset = 100, properties = axis_props(labels = list(angle = 280, align = "right", baseline = "middle"))) %>%
add_axis(type = 'y', title="Percentage Temperature Increase over 1960s", title_offset = 80, format='####')
With the exception of Indonesia, noted previously, all countries appear to have had an average annual increase in temperature since the 1960s of between 0.2 to 2.4 degrees.
Join the CO2 and Observation datasets based on the country_name attribute to facilitate the answering of the research question
df <- join(x=annual_avg_temp, y=df_co2, by='country_name')
Since 1960, has the average annual temperature of all countries of the globe increased by approximately the same amount, or have those countries with historically higher CO2 emission levels experienced a higher average temperature rise?
Create scatterplots of CO2 vs Temperature increase (absolute and percentage change) to identify any potential correlations
# Plot percentage temperature difference
df %>%
ggvis(~co2_emissions, ~temp_percent) %>%
# Set to bar chart
layer_points() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference (%) VS CO2 output by Country", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Total CO2 Emissions", title_offset = 50) %>%
add_axis('y', title="Rise in Temperature as Percentage of 1960s", title_offset = 50)
# Plot abolsute temperature difference
df %>%
ggvis(~co2_emissions, ~temp_diff) %>%
# Set to bar chart
layer_points() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference (degree Celsius) VS CO2 output by Country", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Total CO2 Emissions", title_offset = 50) %>%
add_axis('y', title="Rise in Temperature in degrees Celsius", title_offset = 50)
Results
Significant outliers 450% increase in temperature (Canada) and 400,000 CO2 emissions (USA) may be skewing the results.
Based on initial review of the absolute and percentage temperature differeces there is a lack of linear relationship between a countries CO2 output and the rate at which global warming is occurring.
Resolve to review the same plots, but with these outliers removed.
# Create scatter plots with USA and canada removed
df %>%
filter(country_name != 'canada') %>%
filter(country_name != 'united states') %>%
ggvis(~co2_emissions, ~temp_percent) %>%
# Set to bar chart
layer_points() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference (%) VS CO2 output by Country (excluding USA and Canada)", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Total CO2 Emissions", title_offset = 50) %>%
add_axis('y', title="Rise in Temperature as Percentage of 1960s Base", title_offset = 50)
# Create scatter plots with USA and canada removed
df %>%
filter(country_name != 'canada') %>%
filter(country_name != 'united states') %>%
ggvis(~co2_emissions, ~temp_diff) %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference (degree Celsius) VS CO2 output by Country (excluding USA and Canada)", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set to bar chart
layer_points() %>%
# Set x and y axis labels
add_axis(type = 'x', title="Total CO2 Emissions", title_offset = 50) %>%
add_axis('y', title="Rise in Temperature as Percentage of 1960s Base", title_offset = 50)
Results
Removal of outlier further indicates that there is not a linear relationship between the total lifetine CO2 emissions of a country and the rate at which they are experiencing global warming.
As the CO2 values vary significantly, the logged CO2 and temperature values may provide better insights into if there truly is a correlation or not.
# Calculate the natrual log values of CO2, temp_diff and temp_percent
df$co2_emissions_log <- log(df$co2_emissions)
df$temp_diff_log <- log(df$temp_diff)
NaNs produced
df$temp_percent_log <- log(df$temp_percent)
# Omit any NAN values created by attempting to take log of negative values (indonesia)
na.omit(df)
# Create scatter plots with logged values
df %>%
ggvis(~co2_emissions_log, ~temp_percent_log) %>%
# Set to bar chart
layer_points() %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference (%) VS CO2 output by Country (natuual log)", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set x and y axis labels
add_axis(type = 'x', title="Total CO2 Emissions", title_offset = 50) %>%
add_axis('y', title="Rise in Temperature as Percentage of 1960s Base", title_offset = 50)
# Create scatter plots with logged values
df %>%
ggvis(~co2_emissions_log, ~temp_diff_log) %>%
# Add plot title
add_axis("x", orient = "top", ticks = 0, title = "Temperature Difference (degree Celsius) VS CO2 output by Country (natrual log)", properties = axis_props(axis = list(stroke = "white"),labels = list(fontSize = 0))) %>%
# Set to bar chart
layer_points() %>%
# Set x and y axis labels
add_axis(type = 'x', title="Total CO2 Emissions", title_offset = 50) %>%
add_axis('y', title="Rise in Temperature as Percentage of 1960s Base", title_offset = 50)
Results
Logged scatterplots reveal a slighly more linear relationship, however this does not appear to be a strong relationship.
Resolved to check the correlation coefficients of the logged data to confirm weak correlation observed in the scatterplots
# Create dataframes with co2 and logged temp values
cor_temp_diff <- data.frame(df$co2_emissions_log, df$temp_diff_log)
cor_temp_percent <- data.frame(df$co2_emissions_log, df$temp_percent_log)
# Generate correlation coefficients
cor(cor_temp_diff, use="complete.obs")
df.co2_emissions_log df.temp_diff_log
df.co2_emissions_log 1.0000000 0.2002022
df.temp_diff_log 0.2002022 1.0000000
cor(cor_temp_percent, use="complete.obs")
df.co2_emissions_log df.temp_percent_log
df.co2_emissions_log 1.0000000 0.2898778
df.temp_percent_log 0.2898778 1.0000000
Results
Both correlation coefficents are below 0.3, indicating a weak linear relationship between a countries CO2 emissions and the level of global warming.
Based on the above, it is concluded that there is no statistically significant correlation between the historical CO2 emissions of a country and the rate at which they are experiencing global warming.
T here is no evidence to suggest that historically high CO2 emitting countries are experiencing global waring at a faster rate than low CO2 emitting countries.