title: "FUEL DATA ANALYSIS"
author: "JOYCE GATHONI"
date: "2025-03-18"
output: html_document

#Load required libraries.

library(readxl)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load data.

df <- read_excel("Fuel_Data.xlsx")

Convert Date column to proper Date format

df$DATE<-as.Date(df$DATE)

Data Cleaning.

df<-read_excel("Fuel_Data.xlsx")
df$`Station Assigned`<- as.factor(df$`Station Assigned`)
df$`Date Requested`<- as.factor(df$`Date Requested`)
df$`Fuel Type` <- as.factor(df$`Fuel Type`) 

1. Total fuel consumption per station

fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df"     "tbl"        "data.frame"
search()
##  [1] ".GlobalEnv"        "package:dplyr"     "package:ggplot2"  
##  [4] "package:readxl"    "package:stats"     "package:graphics" 
##  [7] "package:grDevices" "package:utils"     "package:datasets" 
## [10] "package:methods"   "Autoloads"         "package:base"
  fuel_per_station <- fuel_data %>%
    group_by(`Station Assigned`) %>%
    summarise(Total_Liters = sum(as.numeric(`Liters Fueled`), na.rm = TRUE))
## Warning: There were 17 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Total_Liters = sum(as.numeric(`Liters Fueled`), na.rm = TRUE)`.
## ℹ In group 2: `Station Assigned = "Central Region"`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 16 remaining warnings.

2. Comparison of Diesel vs Petrol consumption

fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df"     "tbl"        "data.frame"
search()
##  [1] ".GlobalEnv"        "package:dplyr"     "package:ggplot2"  
##  [4] "package:readxl"    "package:stats"     "package:graphics" 
##  [7] "package:grDevices" "package:utils"     "package:datasets" 
## [10] "package:methods"   "Autoloads"         "package:base"
fuel_comparison <- fuel_data %>% 
  group_by(`Fuel Type`) %>% 
  summarise(Total_Liters = sum(as.numeric(`Liters Fueled`), na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Total_Liters))
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Total_Liters = sum(as.numeric(`Liters Fueled`), na.rm = TRUE)`.
## ℹ In group 1: `Fuel Type = "Diesel"`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
  print(fuel_comparison)
## # A tibble: 2 × 2
##   `Fuel Type` Total_Liters
##   <chr>              <dbl>
## 1 Diesel           305790.
## 2 Petrol            12821.
ggplot(fuel_comparison, aes(x = reorder(`Fuel Type`, -Total_Liters), y = Total_Liters, fill = `Fuel Type`)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(aes(label = round(Total_Liters, 1)), vjust = -0.5, size = 5) +
  labs(title = "Diesel vs Petrol Consumption", x = "Fuel Type", y = "Total Liters") +
  theme_minimal() +
  theme(legend.position = "none")

3. Total fuel expenditure over time

fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df"     "tbl"        "data.frame"
search()
##  [1] ".GlobalEnv"        "package:dplyr"     "package:ggplot2"  
##  [4] "package:readxl"    "package:stats"     "package:graphics" 
##  [7] "package:grDevices" "package:utils"     "package:datasets" 
## [10] "package:methods"   "Autoloads"         "package:base"
fuel_expenditure <- fuel_data %>%
  group_by(DATE) %>%
  summarise(Total_Expenditure = sum(as.numeric(`Cost (Kes)`), na.rm = TRUE)) %>%
  arrange(DATE)
  print(fuel_expenditure)
## # A tibble: 1 × 2
##   DATE  Total_Expenditure
##   <lgl>             <dbl>
## 1 NA            53546639.

Plot total fuel expenditure over time

ggplot(fuel_expenditure, aes(x = DATE, y = Total_Expenditure)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  labs(title = "Total Fuel Expenditure Over Time", x = "Date", y = "Total Cost (KES)") +
  theme_minimal()
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

DISCOUNT ANALYSIS

5. Total savings from discount

fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df"     "tbl"        "data.frame"
search()
##  [1] ".GlobalEnv"        "package:dplyr"     "package:ggplot2"  
##  [4] "package:readxl"    "package:stats"     "package:graphics" 
##  [7] "package:grDevices" "package:utils"     "package:datasets" 
## [10] "package:methods"   "Autoloads"         "package:base"
total_savings <- sum(as.numeric(fuel_data$`Discount (Kes)`), na.rm = TRUE)
cat("Total Savings from Discounts: ", total_savings, "KES\n")
## Total Savings from Discounts:  15345.25 KES

Cost efficiency and vehicle optimization.

6. Fuel used vs number of vehicles per station

fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df"     "tbl"        "data.frame"
search()
##  [1] ".GlobalEnv"        "package:dplyr"     "package:ggplot2"  
##  [4] "package:readxl"    "package:stats"     "package:graphics" 
##  [7] "package:grDevices" "package:utils"     "package:datasets" 
## [10] "package:methods"   "Autoloads"         "package:base"
vehicles_per_station <- fuel_data %>%
  group_by(`Station Assigned`) %>%
  summarise(Number_of_Vehicles = n_distinct(`Registration Number`),
            Total_Fuel_Used = sum(as.numeric(`Liters Fueled`), na.rm = TRUE))
## Warning: There were 17 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `Total_Fuel_Used = sum(as.numeric(`Liters Fueled`), na.rm =
##   TRUE)`.
## ℹ In group 2: `Station Assigned = "Central Region"`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 16 remaining warnings.
print(vehicles_per_station)
## # A tibble: 20 × 3
##    `Station Assigned`          Number_of_Vehicles Total_Fuel_Used
##    <chr>                                    <int>           <dbl>
##  1 Cameo                                        1            998.
##  2 Central Region                              31          26510.
##  3 Dagoretti                                   24          18889.
##  4 ECD                                         14          37505.
##  5 Headquarters                                68          97759.
##  6 Informal Settlements Region                  9          15316.
##  7 Kabete Laboratory                            4           2905.
##  8 Kabete Treatment Works                       1           1525.
##  9 Kasarani                                    25          14323.
## 10 Langata                                     29          24327.
## 11 Lower Embakasi                              20          14949.
## 12 NRW                                          5           2613.
## 13 Ngethu Treatment Works                       3           2003.
## 14 Roysambu                                    27          18538.
## 15 Ruai                                         1            262.
## 16 Ruiru Dam                                    3            989.
## 17 Sasumua Treatment Works                      4           1618.
## 18 Thika Dam                                    3           2147.
## 19 Upper Embakasi                              22          14926.
## 20 Westlands                                   18          20509.

Plot fuel used vs. number of vehicles per station

ggplot(vehicles_per_station, aes(x = Number_of_Vehicles, y = Total_Fuel_Used, label = `Station Assigned`)) +
  geom_point(color = "blue", size = 5) +
  geom_text(vjust = -5) +
  labs(title = "Fuel Used vs. Number of Vehicles per Station", x = "Number of Vehicles", y = "Total Fuel Used (Liters)") +
  theme_minimal()

Identifying potential fuel wastage

Define a threshold (e.g., 90th percentile of fuel usage)

fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df"     "tbl"        "data.frame"
search()
##  [1] ".GlobalEnv"        "package:dplyr"     "package:ggplot2"  
##  [4] "package:readxl"    "package:stats"     "package:graphics" 
##  [7] "package:grDevices" "package:utils"     "package:datasets" 
## [10] "package:methods"   "Autoloads"         "package:base"
fuel_data$`Liters Fueled` <- as.numeric(gsub("[^0-9.]", "", fuel_data$`Liters Fueled`))
# Remove NA values from 'Liters Fueled'
fuel_cleaned <- fuel_data %>% filter(!is.na(`Liters Fueled`))
# Define the threshold for unusually high fuel consumption (90th percentile)
threshold <- quantile(fuel_cleaned$`Liters Fueled`, 0.90, na.rm = TRUE)

7. Identify vehicles fueling unusually high amounts

fuel_wastage <- fuel_cleaned %>%
  filter(`Liters Fueled` > threshold) %>%
  arrange(desc(`Liters Fueled`))
print(fuel_wastage)
## # A tibble: 403 × 24
##    `Registration Number` `Station Assigned` Vendor `Date Requested`  `Fuel Type`
##    <chr>                 <chr>              <chr>  <chr>             <chr>      
##  1 KDA 758P              Headquarters       Shell  Jan. 3, 2025, 8:… Diesel     
##  2 KDA 758P              Headquarters       Shell  Jan. 22, 2025, 1… Diesel     
##  3 KDA 758P              Headquarters       Shell  Jan. 27, 2025, 1… Diesel     
##  4 KDA 758P              Headquarters       Shell  Jan. 30, 2025, 1… Diesel     
##  5 KDA 758P              Headquarters       Shell  Feb. 3, 2025, 10… Diesel     
##  6 KDA 758P              Headquarters       Shell  Oct. 25, 2024, 8… Diesel     
##  7 KDA 758P              Headquarters       Shell  Nov. 13, 2024, 3… Diesel     
##  8 KDA 758P              Headquarters       Shell  Nov. 22, 2024, 8… Diesel     
##  9 KDA 758P              Headquarters       Shell  Dec. 6, 2024, 8:… Diesel     
## 10 KDA 758P              Headquarters       Shell  Dec. 4, 2024, 8:… Diesel     
## # ℹ 393 more rows
## # ℹ 19 more variables: `Liters Fueled` <dbl>, `Price per Liter` <dbl>,
## #   `Cost (Kes)` <dbl>, `Discount (Kes)` <dbl>, `D.p per liter` <dbl>,
## #   `Discount Total (Kes)` <dbl>, `Attended by` <chr>, `Station Fueled` <chr>,
## #   `FUEL CAPACITY` <lgl>, REMARK <lgl>, MONTH <chr>, DAY <dbl>, YEAR <dbl>,
## #   DATE <lgl>, QUARTER <lgl>, WEEKDAY <lgl>, DAY2 <lgl>, TIME <lgl>,
## #   `MONTH NUM` <lgl>

Plot vehicles with high fuel consumption

ggplot(fuel_wastage, aes(x = `Registration Number`, y = `Liters Fueled`, fill = `Station Assigned`)) +
  geom_bar(stat = "identity") +
  labs(title = "Potential Fuel Wastage - Vehicles with Unusually High Fueling", 
       x = "Vehicle Registration", y ="Fuel Consumed (Liters)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Impact of fuel price fluctuations

8. Correlation between fuel price and total cost

fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")
class(fuel_data)
## [1] "tbl_df"     "tbl"        "data.frame"
search()
##  [1] ".GlobalEnv"        "package:dplyr"     "package:ggplot2"  
##  [4] "package:readxl"    "package:stats"     "package:graphics" 
##  [7] "package:grDevices" "package:utils"     "package:datasets" 
## [10] "package:methods"   "Autoloads"         "package:base"
print(colnames(fuel_data))
##  [1] "Registration Number"  "Station Assigned"     "Vendor"              
##  [4] "Date Requested"       "Fuel Type"            "Liters Fueled"       
##  [7] "Price per Liter"      "Cost (Kes)"           "Discount (Kes)"      
## [10] "D.p per liter"        "Discount Total (Kes)" "Attended by"         
## [13] "Station Fueled"       "FUEL CAPACITY"        "REMARK"              
## [16] "MONTH"                "DAY"                  "YEAR"                
## [19] "DATE"                 "QUARTER"              "WEEKDAY"             
## [22] "DAY2"                 "TIME"                 "MONTH NUM"
fuel_data <- fuel_data %>%
  rename_with(~ trimws(.))  # Remove any leading/trailing spaces in column names
fuel_data$`Cost (Kes)` <- as.numeric(gsub("[^0-9.]", "", fuel_data$`Cost (Kes)`))
fuel_data$`Discount Total (Kes)` <- as.numeric(gsub("[^0-9.]", "", fuel_data$`Discount Total (Kes)`))
if (any(is.na(fuel_data$`Cost (Kes)`)) || any(is.na(fuel_data$`Discount Total (Kes)`))) {
  warning("Warning: Some values in Cost (Kes) or Discount Total (Kes) are missing after conversion.")
}
correlation_value <- cor(fuel_data$`Cost (Kes)`, fuel_data$`Discount Total (Kes)`, use = "complete.obs")

cat("Correlation between Fuel Price and Total Cost: ", correlation_value, "\n")
## Correlation between Fuel Price and Total Cost:  0.9999974

Scatter plot to visualize relationship

ggplot(fuel_data, aes(x = `Cost (Kes)`, y = `Discount Total (Kes)`)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", col = "red", se = TRUE) +
  labs(title = "Correlation Between Fuel price and Total cost ", 
       x = "Cost (KES)", y = "Discount Total (KES)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#### Since the correlation value is 0.999997366874709, which is close to 1, #### we say there is a perfect positive correlation between the fuel price #### and the total cost.

10. Convert into time series object

Load necessary libraries

library(readxl)
library(dplyr)
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tsibble)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:tsibble':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Load the data
fuel_data <- read_excel("Fuel_Data.xlsx", sheet = "Sheet1")

# Convert Date column to Date format
fuel_data$Date_Requested <- as.Date(fuel_data$DATE)

# Aggregate fuel prices by month to ensure a proper time series
fuel_price_ts_data <- fuel_data %>%
  mutate(Month = floor_date(Date_Requested, "month")) %>%
  group_by(Month) %>%
  summarise(Avg_Price = mean(`Price per Liter`, na.rm = TRUE))
# Forecasting Future Fuel Prices
fuel_data <- read_excel("Fuel_Data.xlsx")
fuel_forecast<-fuel_data
fuel_price_ts <-fuel_data
fuel_price_ts <-ts(df$`Price per Liter`,start = c(2024,1),frequency = 12)
forecast_model <-auto.arima(fuel_price_ts)
forecast_prices <-forecast(forecast_model,6)
plot(forecast_prices)