title: "FUEL DATA ANALYSIS"
author: "JOYCE GATHONI"
date: "2025-03-18"
output: html_document
#Load required libraries.
library(readxl)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df <- read_excel("Fuel_Data.xlsx")
df$DATE<-as.Date(df$DATE)
df<-read_excel("Fuel_Data.xlsx")
df$`Station Assigned`<- as.factor(df$`Station Assigned`)
df$`Date Requested`<- as.factor(df$`Date Requested`)
df$`Fuel Type` <- as.factor(df$`Fuel Type`)
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df" "tbl" "data.frame"
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggplot2"
## [4] "package:readxl" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
fuel_per_station <- fuel_data %>%
group_by(`Station Assigned`) %>%
summarise(Total_Liters = sum(as.numeric(`Liters Fueled`), na.rm = TRUE))
## Warning: There were 17 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Total_Liters = sum(as.numeric(`Liters Fueled`), na.rm = TRUE)`.
## ℹ In group 2: `Station Assigned = "Central Region"`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 16 remaining warnings.
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df" "tbl" "data.frame"
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggplot2"
## [4] "package:readxl" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
fuel_comparison <- fuel_data %>%
group_by(`Fuel Type`) %>%
summarise(Total_Liters = sum(as.numeric(`Liters Fueled`), na.rm = TRUE), .groups = "drop") %>%
arrange(desc(Total_Liters))
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Total_Liters = sum(as.numeric(`Liters Fueled`), na.rm = TRUE)`.
## ℹ In group 1: `Fuel Type = "Diesel"`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
print(fuel_comparison)
## # A tibble: 2 × 2
## `Fuel Type` Total_Liters
## <chr> <dbl>
## 1 Diesel 305790.
## 2 Petrol 12821.
ggplot(fuel_comparison, aes(x = reorder(`Fuel Type`, -Total_Liters), y = Total_Liters, fill = `Fuel Type`)) +
geom_bar(stat = "identity", width = 0.5) +
geom_text(aes(label = round(Total_Liters, 1)), vjust = -0.5, size = 5) +
labs(title = "Diesel vs Petrol Consumption", x = "Fuel Type", y = "Total Liters") +
theme_minimal() +
theme(legend.position = "none")
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df" "tbl" "data.frame"
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggplot2"
## [4] "package:readxl" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
fuel_expenditure <- fuel_data %>%
group_by(DATE) %>%
summarise(Total_Expenditure = sum(as.numeric(`Cost (Kes)`), na.rm = TRUE)) %>%
arrange(DATE)
print(fuel_expenditure)
## # A tibble: 1 × 2
## DATE Total_Expenditure
## <lgl> <dbl>
## 1 NA 53546639.
ggplot(fuel_expenditure, aes(x = DATE, y = Total_Expenditure)) +
geom_line(color = "blue") +
geom_point(color = "red") +
labs(title = "Total Fuel Expenditure Over Time", x = "Date", y = "Total Cost (KES)") +
theme_minimal()
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df" "tbl" "data.frame"
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggplot2"
## [4] "package:readxl" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
monthly_savings <- fuel_data %>%
group_by(YEAR, MONTH) %>%
summarise(Total_Discount = sum(as.numeric(`Discount (Kes)`), na.rm = TRUE), .groups = "drop") %>%
arrange(YEAR, match(MONTH, month.abb))
print(monthly_savings)
## # A tibble: 6 × 3
## YEAR MONTH Total_Discount
## <dbl> <chr> <dbl>
## 1 2024 Oct 2769.
## 2 2024 Nov 2728
## 3 2024 Dec 2588.
## 4 2024 Sept 748
## 5 2025 Jan 3296
## 6 2025 Feb 3216.
ggplot(monthly_savings, aes(x = MONTH, y = Total_Discount, fill = as.factor(YEAR))) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Monthly Savings Trends from Fuel Discounts", x = "Month", y = "Total Discount (KES)") +
theme_minimal()
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df" "tbl" "data.frame"
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggplot2"
## [4] "package:readxl" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
total_savings <- sum(as.numeric(fuel_data$`Discount (Kes)`), na.rm = TRUE)
cat("Total Savings from Discounts: ", total_savings, "KES\n")
## Total Savings from Discounts: 15345.25 KES
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df" "tbl" "data.frame"
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggplot2"
## [4] "package:readxl" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
vehicles_per_station <- fuel_data %>%
group_by(`Station Assigned`) %>%
summarise(Number_of_Vehicles = n_distinct(`Registration Number`),
Total_Fuel_Used = sum(as.numeric(`Liters Fueled`), na.rm = TRUE))
## Warning: There were 17 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Total_Fuel_Used = sum(as.numeric(`Liters Fueled`), na.rm =
## TRUE)`.
## ℹ In group 2: `Station Assigned = "Central Region"`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 16 remaining warnings.
print(vehicles_per_station)
## # A tibble: 20 × 3
## `Station Assigned` Number_of_Vehicles Total_Fuel_Used
## <chr> <int> <dbl>
## 1 Cameo 1 998.
## 2 Central Region 31 26510.
## 3 Dagoretti 24 18889.
## 4 ECD 14 37505.
## 5 Headquarters 68 97759.
## 6 Informal Settlements Region 9 15316.
## 7 Kabete Laboratory 4 2905.
## 8 Kabete Treatment Works 1 1525.
## 9 Kasarani 25 14323.
## 10 Langata 29 24327.
## 11 Lower Embakasi 20 14949.
## 12 NRW 5 2613.
## 13 Ngethu Treatment Works 3 2003.
## 14 Roysambu 27 18538.
## 15 Ruai 1 262.
## 16 Ruiru Dam 3 989.
## 17 Sasumua Treatment Works 4 1618.
## 18 Thika Dam 3 2147.
## 19 Upper Embakasi 22 14926.
## 20 Westlands 18 20509.
ggplot(vehicles_per_station, aes(x = Number_of_Vehicles, y = Total_Fuel_Used, label = `Station Assigned`)) +
geom_point(color = "blue", size = 5) +
geom_text(vjust = -5) +
labs(title = "Fuel Used vs. Number of Vehicles per Station", x = "Number of Vehicles", y = "Total Fuel Used (Liters)") +
theme_minimal()
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df" "tbl" "data.frame"
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggplot2"
## [4] "package:readxl" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
fuel_data$`Liters Fueled` <- as.numeric(gsub("[^0-9.]", "", fuel_data$`Liters Fueled`))
# Remove NA values from 'Liters Fueled'
fuel_cleaned <- fuel_data %>% filter(!is.na(`Liters Fueled`))
# Define the threshold for unusually high fuel consumption (90th percentile)
threshold <- quantile(fuel_cleaned$`Liters Fueled`, 0.90, na.rm = TRUE)
fuel_wastage <- fuel_cleaned %>%
filter(`Liters Fueled` > threshold) %>%
arrange(desc(`Liters Fueled`))
print(fuel_wastage)
## # A tibble: 403 × 24
## `Registration Number` `Station Assigned` Vendor `Date Requested` `Fuel Type`
## <chr> <chr> <chr> <chr> <chr>
## 1 KDA 758P Headquarters Shell Jan. 3, 2025, 8:… Diesel
## 2 KDA 758P Headquarters Shell Jan. 22, 2025, 1… Diesel
## 3 KDA 758P Headquarters Shell Jan. 27, 2025, 1… Diesel
## 4 KDA 758P Headquarters Shell Jan. 30, 2025, 1… Diesel
## 5 KDA 758P Headquarters Shell Feb. 3, 2025, 10… Diesel
## 6 KDA 758P Headquarters Shell Oct. 25, 2024, 8… Diesel
## 7 KDA 758P Headquarters Shell Nov. 13, 2024, 3… Diesel
## 8 KDA 758P Headquarters Shell Nov. 22, 2024, 8… Diesel
## 9 KDA 758P Headquarters Shell Dec. 6, 2024, 8:… Diesel
## 10 KDA 758P Headquarters Shell Dec. 4, 2024, 8:… Diesel
## # ℹ 393 more rows
## # ℹ 19 more variables: `Liters Fueled` <dbl>, `Price per Liter` <dbl>,
## # `Cost (Kes)` <dbl>, `Discount (Kes)` <dbl>, `D.p per liter` <dbl>,
## # `Discount Total (Kes)` <dbl>, `Attended by` <chr>, `Station Fueled` <chr>,
## # `FUEL CAPACITY` <lgl>, REMARK <lgl>, MONTH <chr>, DAY <dbl>, YEAR <dbl>,
## # DATE <lgl>, QUARTER <lgl>, WEEKDAY <lgl>, DAY2 <lgl>, TIME <lgl>,
## # `MONTH NUM` <lgl>
ggplot(fuel_wastage, aes(x = `Registration Number`, y = `Liters Fueled`, fill = `Station Assigned`)) +
geom_bar(stat = "identity") +
labs(title = "Potential Fuel Wastage - Vehicles with Unusually High Fueling",
x = "Vehicle Registration", y ="Fuel Consumed (Liters)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df" "tbl" "data.frame"
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggplot2"
## [4] "package:readxl" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
print(colnames(fuel_data))
## [1] "Registration Number" "Station Assigned" "Vendor"
## [4] "Date Requested" "Fuel Type" "Liters Fueled"
## [7] "Price per Liter" "Cost (Kes)" "Discount (Kes)"
## [10] "D.p per liter" "Discount Total (Kes)" "Attended by"
## [13] "Station Fueled" "FUEL CAPACITY" "REMARK"
## [16] "MONTH" "DAY" "YEAR"
## [19] "DATE" "QUARTER" "WEEKDAY"
## [22] "DAY2" "TIME" "MONTH NUM"
fuel_data <- fuel_data %>%
rename_with(~ trimws(.)) # Remove any leading/trailing spaces in column names
fuel_data$`Cost (Kes)` <- as.numeric(gsub("[^0-9.]", "", fuel_data$`Cost (Kes)`))
fuel_data$`Discount Total (Kes)` <- as.numeric(gsub("[^0-9.]", "", fuel_data$`Discount Total (Kes)`))
if (any(is.na(fuel_data$`Cost (Kes)`)) || any(is.na(fuel_data$`Discount Total (Kes)`))) {
warning("Warning: Some values in Cost (Kes) or Discount Total (Kes) are missing after conversion.")
}
correlation_value <- cor(fuel_data$`Cost (Kes)`, fuel_data$`Discount Total (Kes)`, use = "complete.obs")
cat("Correlation between Fuel Price and Total Cost: ", correlation_value, "\n")
## Correlation between Fuel Price and Total Cost: 0.9999974
ggplot(fuel_data, aes(x = `Cost (Kes)`, y = `Discount Total (Kes)`)) +
geom_point(color = "blue", alpha = 0.6) +
geom_smooth(method = "lm", col = "red", se = TRUE) +
labs(title = "Correlation Between Fuel price and Total cost ",
x = "Cost (KES)", y = "Discount Total (KES)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#### Since the correlation value is 0.999997366874709, which is close to
1, #### we say there is a perfect positive correlation between the fuel
price #### and the total cost.
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df" "tbl" "data.frame"
search()
## [1] ".GlobalEnv" "package:dplyr" "package:ggplot2"
## [4] "package:readxl" "package:stats" "package:graphics"
## [7] "package:grDevices" "package:utils" "package:datasets"
## [10] "package:methods" "Autoloads" "package:base"
fuel_price_trends <- fuel_data %>%
group_by(DATE) %>%
summarise(Average_Fuel_Price = mean(`Cost (Kes)`, na.rm = TRUE)) %>%
arrange(DATE)
print(fuel_price_trends)
## # A tibble: 1 × 2
## DATE Average_Fuel_Price
## <lgl> <dbl>
## 1 NA 10256.
# Plot fuel price trends over time
ggplot(fuel_price_trends, aes(x = DATE, y = Average_Fuel_Price)) +
geom_line(color = "blue") +
geom_point(color = "red") +
labs(title = "Fuel Price Trends Over Time", x = "Date", y = "mean Cost (Kes)") +
theme_minimal()
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
library(readxl)
library(dplyr)
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tsibble)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:tsibble':
##
## interval
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Load the data
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
# Convert Date column to Date format
fuel_data$Date_Requested <- as.Date(fuel_data$DATE)
# Aggregate fuel prices by month to ensure a proper time series
fuel_price_ts_data <- fuel_data %>%
mutate(Month = floor_date(Date_Requested, "month")) %>%
group_by(Month) %>%
summarise(Avg_Price = mean(`Price per Liter`, na.rm = TRUE))
# Forecasting Future Fuel Prices
fuel_data <- read_excel("Fuel_Data.xlsx")
fuel_forecast<-fuel_data
fuel_price_ts <-fuel_data
fuel_price_ts <-ts(df$`Price per Liter`,start = c(2024,1),frequency = 12)
forecast_model <-auto.arima(fuel_price_ts)
forecast_prices <-forecast(forecast_model,6)
plot(forecast_prices)