In this report, we use the R programming language to analyze climate data collected from four sites in Ethiopia: Bahir Dar, Chagni, Debre Tabor, and Gondar. The data consist of extreme minimum temperature records for each month between 1951 and 2014, although each site has its own start and end year. We present the analysis in the following steps.
We used the readxl package in R to read the data from Excel files. We created a data frame for each site and removed the leading rows that contained file metadata rather than observations. We also removed any rows in which every variable was missing. We renamed the columns to the months of the year, added a column for the site name, and then combined the data from all sites into one data frame.
# Set the working directory when rendering (adjust this path to your machine)
if (knitr::is_html_output()) {
  setwd("H:/Course Material/6 Masters Document/First Year/Second Semster/Time Series Analysis/Lab Work")
}
library(readxl)
library(tidyr)
library(knitr)
library(zoo)
library(dplyr)
library(ggplot2)
library(lubridate)
library(forecast)
library(tseries)
options(warn = -1)
# Read the Excel data for each site
Bahir_Dar <-
read_excel("Datasets/climate/B.Dar.xls", sheet = "Extreme Min")
Chagni <-
read_excel("Datasets/climate/Chagni.xls", sheet = "extreme Min.temp")
Debre_Tabor <-
read_excel("Datasets/climate/D.Tabor.xls", sheet = "extreme Min.Temp")
Gondar <-
read_excel("Datasets/climate/Gondar.xls", sheet = "Extreme Min.Temp")
create_site_df <- function(df, site_name) {
  # Drop the site-specific header rows that precede the observations
  if (site_name == "Bahir Dar") {
df <- df[-c(1:2), ]
} else if (site_name == "Chagni") {
df <- df[-1, ]
} else if (site_name == "Debre Tabor") {
df <- df[-c(1:3), ]
} else {
df <- df[-c(1:2), ]
}
  # Drop rows in which every value is missing
  df <- df[!apply(is.na(df), 1, all), ]
  colnames(df) <- c("Year", month.abb)
df$site_name <- site_name
df <- df[!is.na(df$Year), ]
cat("Dimension of", site_name, "Site\t", dim(df), "\n")
return(df)
}
data_list <- list(Bahir_Dar, Chagni, Debre_Tabor, Gondar)
site_names <- c("Bahir Dar", "Chagni", "Debre Tabor", "Gondar")
for (i in seq_along(data_list)) {
data_list[[i]] <- create_site_df(data_list[[i]], site_names[i])
}
## Dimension of Bahir Dar Site 54 14
## Dimension of Chagni Site 26 14
## Dimension of Debre Tabor Site 61 14
## Dimension of Gondar Site 60 14
# Combine all site data frames into one (Bahir Dar, Chagni, Debre Tabor, Gondar)
combined_data <- do.call(rbind, data_list)
cat("Dimension of The Combined Dataset: ", dim(combined_data))
## Dimension of The Combined Dataset: 201 14
We checked for missing values in the combined data frame and replaced the asterisks used in the source files with NA. We then dropped any rows in which more than about 15% of the monthly values were missing, and filled the remaining missing values with the mean of the corresponding month, computed separately for each site.
colSums(is.na(combined_data)) # Count the number of cells that have NA
colSums(combined_data == "*") # Count the number of cells that have *
table(is.na(combined_data))
table(combined_data == "*")
combined_data[combined_data == "*"] <- NA # replace * with NA
remove_missing_rows <- function(df, check_cols) {
  # Drop rows in which more than 15% of the 12 monthly values are missing
  missing_threshold <- round(0.15 * 12)
  df[rowSums(is.na(df[check_cols])) <= missing_threshold, ]
}
combined_data <-
  remove_missing_rows(combined_data,
                      !(names(combined_data) %in% c("Year", "site_name")))
fill_columns <- month.abb # "Jan" through "Dec"
fill_mean <- function(df) {
  # Replace missing values in each monthly column with that column's mean
  for (col in fill_columns) {
    df[[col]] <- as.numeric(df[[col]])
    df[[col]][is.na(df[[col]])] <- mean(df[[col]], na.rm = TRUE)
  }
  return(df)
}
sites <- c("Bahir Dar", "Chagni", "Gondar", "Debre Tabor")
site_dfs <- lapply(sites, function(site) {
filter(combined_data, site_name == site)
})
site_dfs_filled <-
lapply(site_dfs, fill_mean) # Fill missing values in each data frame with the mean of its own values
combined_data_filled <-
do.call(rbind, site_dfs_filled) # Combine the filled data frames back into one data frame
combined_data <-
combined_data_filled # Replace the original combined_data with the filled one
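As a quick sanity check (an addition to the original script), we can verify that no missing values remain in the monthly columns after the fill:
# Sanity check: the monthly columns should now contain no NAs
colSums(is.na(combined_data[fill_columns]))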
We calculated the mean, minimum, maximum, and variance of the monthly temperature data for each site, together with the 25th, 50th (median), and 75th percentiles using the quantile function. We looped over the sites and months to compute these statistics for each column.
for (site in sites) {
  # Note: subset(combined_data, site_name == site_name) would compare the
  # column with itself, so we filter explicitly on the loop variable
  site_df <- combined_data[combined_data$site_name == site, fill_columns]
for (col in fill_columns) {
col_data <- site_df[[col]]
col_min <- min(col_data, na.rm = TRUE)
col_max <- max(col_data, na.rm = TRUE)
col_var <- var(col_data, na.rm = TRUE)
col_quantiles <-
quantile(col_data,
probs = c(0.25, 0.50, 0.75),
na.rm = TRUE)
col_mean <- mean(col_data, na.rm = TRUE)
# Print the statistics
    cat("Site:\t\t", site, "Column:", col, "\n")
cat("Mean:\t\t", col_mean, "\n")
cat("Minimum:\t", col_min, "\n")
cat("Maximum:\t", col_max, "\n")
cat("Variance:\t", col_var, "\n")
cat("Quantiles:\t", col_quantiles, "\n\n")
}
}
We calculated the covariance and correlation matrices for the monthly temperature data (columns 2 to 13) of the Bahir Dar site using the cov() and cor() functions, respectively. The round() function is used to round the output to four decimal places.
bahirdar_data <- filter(combined_data, site_name == "Bahir Dar")
chagni_data <- filter(combined_data, site_name == "Chagni")
gondar_data <- filter(combined_data, site_name == "Gondar")
debretabor_data <- filter(combined_data, site_name == "Debre Tabor")
# Sample covariance and correlation for the Bahir Dar temperature data;
# the same can be done for the other sites
bahirdar_covariance <- round(cov(bahirdar_data[, 2:13]), 4)
bahirdar_covariance
bahirdar_correlation <- round(cor(bahirdar_data[, 2:13]), 4)
bahirdar_correlation
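A heatmap makes this correlation structure easier to scan than the raw 12 x 12 matrix. The sketch below is our own illustrative addition, reusing the already-loaded tidyr and ggplot2 packages:
# Illustrative heatmap of the Bahir Dar correlation matrix
corr_long <- as.data.frame(bahirdar_correlation) %>%
  mutate(Month1 = rownames(bahirdar_correlation)) %>%
  pivot_longer(-Month1, names_to = "Month2", values_to = "Correlation")
ggplot(corr_long, aes(x = Month1, y = Month2, fill = Correlation)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  labs(title = "Correlation of Monthly Temperatures, Bahir Dar")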
We plotted the yearly average temperature for each site, computed as the mean of the twelve monthly values in each year. We also present monthly and quarterly plots, shown here for the Bahir Dar site.
# Yearly Temperature
site_year_data <- combined_data %>%
group_by(site_name, Year) %>%
summarise(avg_temp = mean(c_across(Jan:Dec)))
# Plot yearly time series by site with trend line
ggplot(site_year_data, aes(x = as.numeric(Year), y = avg_temp)) +
geom_line(colour = "blue", size = 0.9) +
facet_wrap( ~ site_name, ncol = 2) +
  labs(x = "Year", y = "Average Temperature",
       title = "Yearly Average Temperature by Site") +
theme(
axis.text = element_text(face = "bold"),
axis.title = element_text(face = "bold"),
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold"),
panel.grid.major = element_line(colour = "#E5E5E5"),
panel.background = element_rect(fill = "white") # White background
)
# Monthly Temperature
bahir_data_long <- bahirdar_data %>%
pivot_longer(cols = Jan:Dec,
names_to = "Month",
values_to = "Temperature")
# Convert Year to a numeric calendar year
bahir_data_long <- bahir_data_long %>%
  mutate(Year = year(as.Date(paste0(Year, "-01-01"))))
# Plot monthly time series for Bahir Dar site
ggplot(bahir_data_long, aes(x = Year, y = Temperature)) +
geom_line(colour = "blue", size = 0.9) +
facet_wrap( ~ Month, ncol = 3, scales = "free_y") +
labs(x = "Year", y = "Temperature (Celsius)",
title = "Monthly Temperature for Bahir Dar Site") +
theme(
axis.text = element_text(face = "bold"),
axis.title = element_text(face = "bold"),
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold"),
panel.grid.major = element_line(colour = "#E5E5E5"),
panel.background = element_rect(fill = "white") # White background
)
# Quarterly Temperature
bahir_data_long <- bahirdar_data %>%
pivot_longer(cols = Jan:Dec,
names_to = "Month",
values_to = "Temperature")
bahir_data_quarter <- bahir_data_long %>%
  # Months are grouped into the Ethiopian seasons, where Sep-Nov is spring
  mutate(Quarter = ifelse(
Month %in% c("Sep", "Oct", "Nov"),
"Spring",
ifelse(
Month %in% c("Dec", "Jan", "Feb"),
"Winter",
ifelse(
Month %in% c("Mar", "Apr", "May"),
"Autumn",
ifelse(Month %in% c("Jun", "Jul", "Aug"), "Summer", NA)
)
)
))
bahir_data_quarter <-
bahir_data_quarter[!is.na(bahir_data_quarter$Quarter), ] # remove rows with NA
bahir_data_quarter <- bahir_data_quarter %>%
  mutate(Year = as.numeric(Year)) %>% # ensure numeric years for the axis breaks
  group_by(Year, Quarter) %>%
  summarise(Quarter_Temp = mean(Temperature))
ggplot(bahir_data_quarter,
aes(x = as.numeric(Year), y = Quarter_Temp, group = Quarter)) +
geom_line(colour = "blue", size = 0.9) +
facet_wrap( ~ Quarter, ncol = 2, scales = "free_y") +
labs(x = "Year", y = "Average Temperature (Celsius)",
title = "Quarterly Temperature for Bahir Dar Site") +
scale_x_continuous(breaks = seq(
min(bahir_data_quarter$Year),
max(bahir_data_quarter$Year),
10
)) +
theme(
axis.text = element_text(face = "bold"),
axis.title = element_text(face = "bold"),
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold"),
panel.grid.major = element_line(colour = "#E5E5E5"),
panel.background = element_rect(fill = "white") # White background
)
We checked the stationarity of the data using the Augmented Dickey-Fuller (ADF) test. We first aggregated the monthly temperature data into yearly averages by site and plotted each yearly series with a loess trend line. We then performed the ADF test for each site and judged a series stationary when its p-value was below 0.05.
# Aggregate monthly temperature data by site and year
site_year_data <- combined_data %>%
group_by(site_name, Year) %>%
summarise(avg_temp = mean(c_across(Jan:Dec)))
# Plot yearly time series by site with trend line
ggplot(site_year_data, aes(x = as.numeric(Year), y = avg_temp)) +
geom_line(colour = "blue", size = 0.9) +
facet_wrap( ~ site_name, ncol = 2) +
labs(x = "Year", y = "Average Temperature",
title = "Stationarity of Each Site") +
geom_smooth(
method = "loess",
formula = "y ~ x",
se = FALSE,
col = "green"
) +
theme(
axis.text = element_text(face = "bold"),
axis.title = element_text(face = "bold"),
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold"),
panel.grid.major = element_line(colour = "#E5E5E5"),
panel.background = element_rect(fill = "white") # White background
)
# Check stationarity using the ADF test for each site
adf_results <- site_year_data %>%
group_by(site_name) %>%
summarise(p_value = adf.test(avg_temp)$p.value)
# Print p-values for each site
print(adf_results)
## # A tibble: 4 × 2
## site_name p_value
## <chr> <dbl>
## 1 Bahir Dar 0.527
## 2 Chagni 0.99
## 3 Debre Tabor 0.0706
## 4 Gondar 0.0719
for (i in 1:nrow(adf_results)) {
if (adf_results$p_value[i] < 0.05) {
print(paste0(adf_results$site_name[i], " is stationary"))
} else {
print(paste0(adf_results$site_name[i], " is not stationary"))
}
}
## [1] "Bahir Dar is not stationary"
## [1] "Chagni is not stationary"
## [1] "Debre Tabor is not stationary"
## [1] "Gondar is not stationary"
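Since none of the series passes the ADF test in levels, a natural follow-up sketch (our addition; auto.arima in the forecasting step below performs any needed differencing automatically) is to difference each yearly series once and re-test:
# Re-run the ADF test on the first-differenced yearly series
adf_diff_results <- site_year_data %>%
  group_by(site_name) %>%
  summarise(p_value_diff = adf.test(diff(avg_temp))$p.value)
print(adf_diff_results)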
We plotted the autocorrelation function (ACF) and partial autocorrelation function (PACF) of each site's yearly series to help identify plausible orders for an ARIMA model, using the acf() and pacf() functions.
# Calculate the ACF and PACF for each site
site_year_data <- combined_data %>%
group_by(site_name, Year) %>%
summarise(avg_temp = mean(c_across(Jan:Dec)))
par(mfrow = c(4, 2), mar = c(3, 3, 3, 0.5))
for (site in unique(site_year_data$site_name)) {
# Subset the data for the current site
site_data <- site_year_data %>% filter(site_name == site)
# Get the starting year for the current site
start_year <- min(site_data$Year)
# Convert the site data to a time series object
site_ts <-
ts(site_data$avg_temp,
start = c(start_year, 1),
frequency = 1)
# Calculate the ACF and PACF
acf_values <- acf(site_ts, lag.max = 24, plot = FALSE)
pacf_values <- pacf(site_ts, lag.max = 24, plot = FALSE)
#print(acf_values)
# Create a plot for the ACF and PACF
plot(acf_values, main = paste("ACF for", site))
plot(pacf_values, main = paste("PACF for", site))
}
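The forecast package also offers ggplot2 versions of these plots, ggAcf and ggPacf. As a sketch for a single site (using Bahir Dar's 1961 start year from the forecasting step below):
# ggplot2-based ACF/PACF for the Bahir Dar yearly series
bahirdar_yearly_ts <- ts(filter(site_year_data, site_name == "Bahir Dar")$avg_temp,
                         start = 1961, frequency = 1)
ggAcf(bahirdar_yearly_ts, lag.max = 24) + ggtitle("ACF for Bahir Dar")
ggPacf(bahirdar_yearly_ts, lag.max = 24) + ggtitle("PACF for Bahir Dar")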
We checked the normality of the yearly average temperature data using the Shapiro-Wilk test. We plotted a histogram for each site and ran the test site by site to judge whether each series is normally distributed.
ggplot(site_year_data, aes(x = avg_temp)) +
geom_histogram(fill = "lightblue", color = "black", size = 0.9) +
facet_wrap( ~ site_name, ncol = 2) +
labs(x = "Average Temperature", y = "Frequency",
title = "Distribution of Yearly Temperature for each Site") +
theme(
axis.text = element_text(face = "bold"),
axis.title = element_text(face = "bold"),
plot.title = element_text(face = "bold", hjust = 0.5),
strip.text = element_text(face = "bold"),
panel.grid.major = element_line(colour = "#E5E5E5"),
panel.background = element_rect(fill = "white") # White background
)
# Perform Shapiro-Wilk test for normality for each site
site_year_data_summary <- site_year_data %>%
  group_by(site_name) %>%
  summarize(shapiro_test = shapiro.test(avg_temp)$p.value)
# Label each site as normal or not normal based on the Shapiro-Wilk test
for (i in 1:nrow(site_year_data_summary)) {
  if (site_year_data_summary$shapiro_test[i] >= 0.05) {
    print(paste0(site_year_data_summary$site_name[i], " is Normal"))
  } else {
    print(paste0(site_year_data_summary$site_name[i], " is not Normal"))
  }
}
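A normal Q-Q plot complements the Shapiro-Wilk test visually; the sketch below (our addition) draws one panel per site:
# Visual normality check: normal Q-Q plot per site
ggplot(site_year_data, aes(sample = avg_temp)) +
  stat_qq() +
  stat_qq_line(colour = "blue") +
  facet_wrap(~ site_name, ncol = 2, scales = "free") +
  labs(title = "Normal Q-Q Plots of Yearly Average Temperature")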
We tested for pairwise dependence between the monthly temperature variables using Pearson correlation tests, labelling each pair of months as dependent or independent at the 5% significance level.
for (i in 2:12) {
  for (j in (i + 1):13) {
    temp_col1 <- bahirdar_data[[i]]
    temp_col2 <- bahirdar_data[[j]]
    # Compute the complete-case mask before filtering either column,
    # otherwise the second filter would use the already-shortened first column
    complete <- !is.na(temp_col1) & !is.na(temp_col2)
    temp_col1 <- temp_col1[complete]
    temp_col2 <- temp_col2[complete]
    # Perform correlation test and print results
    corr_test <- cor.test(temp_col1, temp_col2, method = "pearson")
cat("Month 1:", names(bahirdar_data)[i], "\n")
cat("Month 2:", names(bahirdar_data)[j], "\n")
cat("Correlation Coefficient:", corr_test$estimate, "\n")
cat("P-value:", corr_test$p.value, "\n")
# Determine whether variables are dependent or independent
if (corr_test$p.value < 0.05) {
cat("Result: Dependent\n\n")
} else {
cat("Result: Independent\n\n")
}
}
}
We forecasted the yearly average temperature of each site for the next five years using ARIMA models. Each model was fitted with the auto.arima function and the five-year forecast was produced with the forecast function; the plots below show the observed series and the forecast for each site.
# Observed start and end years for each site, in the same order as `sites`
start_years <- c(1961, 1984, 1953, 1952)
end_years <- c(2014, 2009, 2011, 2011)
par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))
for (i in seq_along(sites)) {
# Select data for site
site_data <- filter(combined_data, site_name == sites[i])
# Compute yearly averages
yearly_avg <- site_data %>%
group_by(Year) %>%
summarise(avg_temp = mean(c_across(Jan:Dec)))
#print(yearly_avg) # Create time series object
ts_data <-
ts(yearly_avg[, 2],
end = c(end_years[i]),
frequency = 1)
# Fit ARIMA model
arima_model <- auto.arima(ts_data)
# Forecast future 5 years
arima_forecast <- forecast(arima_model, h = 5)
#print(arima_forecast)
# Plot data and forecast
plot(
arima_forecast,
main = sites[i],
xlab = "Year",
ylab = "Temperature",
col = "blue",
lwd = 2,
xlim = c(start_years[i], end_years[i] + 5)
)
lines(ts_data, col = "red", lwd = 2)
legend(
"topleft",
legend = c("Observed", "Forecast"),
col = c("red", "blue"),
lty = 1,
lwd = 2
)
}
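To gauge how much to trust these forecasts, one option (a sketch we add here, not part of the original analysis) is a hold-out evaluation: refit on all but the last five observed years and compare the forecasts with the held-out values via forecast::accuracy. For Bahir Dar:
# Hold-out evaluation sketch for the Bahir Dar yearly series
bahirdar_yearly <- combined_data %>%
  filter(site_name == "Bahir Dar") %>%
  group_by(Year) %>%
  summarise(avg_temp = mean(c_across(Jan:Dec)))
bd_ts <- ts(bahirdar_yearly$avg_temp, end = 2014, frequency = 1)
train <- window(bd_ts, end = 2009) # all but the last five years
test <- window(bd_ts, start = 2010)
holdout_fit <- auto.arima(train)
accuracy(forecast(holdout_fit, h = 5), test)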
We also performed a seasonal analysis of the Bahir Dar series. The data are analysed separately at monthly and quarterly intervals, using several methods to estimate the seasonal variation: the Simple Averages Method, the Ratio-to-Moving-Average Method, the Ratio-to-Trend Method, and (for the monthly data) a simplified Link Relative Method. It is hard to single out a best method, but from the output the Ratio-to-Moving-Average Method describes the variation in this dataset well. The same steps can be followed for the other sites.
# By Monthly
# Flatten the twelve monthly columns into one univariate series
# (row-major: Jan-Dec of the first year, then Jan-Dec of the next, ...)
monthly_values <- as.numeric(t(as.matrix(bahirdar_data[, 2:13])))
ts_data <- ts(monthly_values,
              start = c(as.numeric(bahirdar_data$Year[1]), 1),
              frequency = 12)
overall_mean <- mean(ts_data, na.rm = TRUE)
# Simple Averages Method: mean of each month relative to the overall mean
simple_avg_seasonal_variations <- sapply(1:12, function(m)
  mean(ts_data[cycle(ts_data) == m], na.rm = TRUE) / overall_mean)
# Ratio-to-Moving-Average Method: 2x12 centered moving average,
# then the observed/MA ratios are averaged within each month
ma12 <- stats::filter(ts_data, c(0.5, rep(1, 11), 0.5) / 12, sides = 2)
ma_ratios <- ts_data / ma12
ratio_to_ma_seasonal_variations <- sapply(1:12, function(m)
  mean(ma_ratios[cycle(ma_ratios) == m], na.rm = TRUE))
# Ratio-to-Trend Method: fit a linear trend to the whole series,
# then average the observed/trend ratios within each month
trend <- predict(lm(as.numeric(ts_data) ~ time(ts_data)))
trend_ratios <- as.numeric(ts_data) / trend
ratio_to_trend_seasonal_variations <- sapply(1:12, function(m)
  mean(trend_ratios[cycle(ts_data) == m], na.rm = TRUE))
# Link Relative Method (simplified): average the month-over-month
# link relatives within each month, expressed as percentages
link_rel <- ts_data / stats::lag(ts_data, -1)
link_relative_seasonal_variations <- sapply(1:12, function(m)
  mean(link_rel[cycle(link_rel) == m], na.rm = TRUE) * 100)
# Print the results
print("Seasonal Variation for each month")
cat("Simple Averages Method:", round(simple_avg_seasonal_variations, 3), "\n")
cat("Ratio-to-Moving-Average Method:", round(ratio_to_ma_seasonal_variations, 3), "\n")
cat("Ratio-to-Trend Method:", round(ratio_to_trend_seasonal_variations, 3), "\n")
cat("Link Relative Method:", round(link_relative_seasonal_variations, 1), "\n\n")
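As a cross-check (our addition), R's built-in decompose() implements essentially the ratio-to-moving-average idea for a multiplicative model (sensible only while the series stays positive); its seasonal figure can be compared with the monthly indices above:
# Classical multiplicative decomposition of the monthly Bahir Dar series
decomposed <- decompose(ts_data, type = "multiplicative")
round(decomposed$figure, 3) # seasonal indices for Jan through Dec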
# By Quarterly
# Aggregate the monthly series into calendar-quarter averages
# (Jan-Mar, Apr-Jun, Jul-Sep, Oct-Dec), giving a frequency-4 series
ts_quarterly <- aggregate(ts_data, nfrequency = 4, FUN = mean)
quarterly_mean <- mean(ts_quarterly, na.rm = TRUE)
# Simple Averages Method: mean of each quarter relative to the overall mean
simple_avg_seasonal_variations <- sapply(1:4, function(q)
  mean(ts_quarterly[cycle(ts_quarterly) == q], na.rm = TRUE) / quarterly_mean)
# Ratio-to-Moving-Average Method: 2x4 centered moving average,
# then the observed/MA ratios are averaged within each quarter
ma4 <- stats::filter(ts_quarterly, c(0.5, 1, 1, 1, 0.5) / 4, sides = 2)
ma_ratios_q <- ts_quarterly / ma4
ratio_to_ma_seasonal_variations <- sapply(1:4, function(q)
  mean(ma_ratios_q[cycle(ma_ratios_q) == q], na.rm = TRUE))
# Ratio-to-Trend Method: linear trend fitted to the quarterly series
trend_q <- predict(lm(as.numeric(ts_quarterly) ~ time(ts_quarterly)))
trend_ratios_q <- as.numeric(ts_quarterly) / trend_q
ratio_to_trend_seasonal_variations <- sapply(1:4, function(q)
  mean(trend_ratios_q[cycle(ts_quarterly) == q], na.rm = TRUE))
# Print the results
cat("\nSeasonal Variation for each Quarter\n")
cat("Simple Averages Method:",
    round(simple_avg_seasonal_variations, 3),
    "\n")
cat("Ratio-to-Moving-Average Method:",
    round(ratio_to_ma_seasonal_variations, 3),
    "\n")
cat("Ratio-to-Trend Method:",
    round(ratio_to_trend_seasonal_variations, 3),
    "\n")
In conclusion, we analyzed climate data for four sites in Ethiopia using the R programming language. We read and cleaned the data, handled missing values, calculated summary statistics, checked for stationarity, and forecasted the temperature for the next five years. Our work showed that the yearly temperature series for all four sites (Bahir Dar, Chagni, Gondar, and Debre Tabor) were not stationary. The normality check indicated that the data for Gondar and Bahir Dar were normally distributed, while the data for the other sites were not. Finally, the forecasts suggest a slight increase over the next five years for the Debre Tabor site, a decrease for Gondar, and roughly constant behaviour for the other sites. These findings can inform policy makers and stakeholders in Ethiopia about current and future climate trends in these regions and help them make informed decisions about adaptation and mitigation strategies.