Introduction Data analysis is often hampered by the lack of properly structured datasets. Before meaningful insights can be derived, data must be tidied to follow a structured and consistent format, as advocated by Hadley Wickham. This report showcases the transformation and analysis of three datasets: Product Sales, Store Sales, and Emissions Data.
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidyr")
library("ggplot2")
# 3 URLs
store_df <- read.csv("~/R-documents/store_sales.csv")
# Column Names
store_col_names <- colnames(store_df)
print(store_col_names)
## [1] "Store.ID" "Store" "Country" "January" "February" "March"
## [7] "April" "May" "June" "July" "August" "September"
## [13] "October" "November"
# Preview first 5 rows of each
print(head(store_df))
## Store.ID Store Country January February March April May June
## 1 1 Palisades US 371700 435950 372460 192260 157550 332550
## 2 2 Billings US 97530 324140 454480 36810 219790 210970
## 3 3 Laguardia US 346130 157510 288990 358190 96860 461950
## 4 4 Cheeseburger US 442010 212390 183580 308650 184340 156540
## 5 5 Detroit US 33250 36840 320170 242650 350300 421980
## 6 6 Towns US 16632 25372 38178 13222 16031 37162
## July August September October November
## 1 89630 372090 421670 173010 173220
## 2 84840 175440 283710 275320 401940
## 3 80440 404990 450630 327270 370100
## 4 328180 281430 498150 473150 23740
## 5 307190 16900 443990 346230 312670
## 6 22541 16700 10475 1897 10493
# Filter for a store
## To apply the filter, make sure to add a , after the condition
store_example <- store_df[store_df$Store.ID == 1, ]
print(store_example)
## Store.ID Store Country January February March April May June July
## 1 1 Palisades US 371700 435950 372460 192260 157550 332550 89630
## August September October November
## 1 372090 421670 173010 173220
# Chosen column names
month_col_names <- c()
for (i in 4:length(store_col_names)) {
# Get specific month column
month_col <- store_col_names[i]
print(month_col)
# Append to the month_col_names
month_col_names <- c(month_col_names, month_col)
}
## [1] "January"
## [1] "February"
## [1] "March"
## [1] "April"
## [1] "May"
## [1] "June"
## [1] "July"
## [1] "August"
## [1] "September"
## [1] "October"
## [1] "November"
print(month_col_names)
## [1] "January" "February" "March" "April" "May" "June"
## [7] "July" "August" "September" "October" "November"
# Select only the month columns for the store example
selected_store_example <- store_example[month_col_names]
print(selected_store_example)
## January February March April May June July August September October
## 1 371700 435950 372460 192260 157550 332550 89630 372090 421670 173010
## November
## 1 173220
# Apply a Transpose so then the columns become the rows
selected_store_transpose <- t(selected_store_example)
print(selected_store_transpose)
## 1
## January 371700
## February 435950
## March 372460
## April 192260
## May 157550
## June 332550
## July 89630
## August 372090
## September 421670
## October 173010
## November 173220
# Reset the index to be a column and then rename the columns to Month, Sale Amount
print(rownames(selected_store_transpose))
## [1] "January" "February" "March" "April" "May" "June"
## [7] "July" "August" "September" "October" "November"
print(selected_store_transpose)
## 1
## January 371700
## February 435950
## March 372460
## April 192260
## May 157550
## June 332550
## July 89630
## August 372090
## September 421670
## October 173010
## November 173220
store_ids <- c()
store_names <- c()
sales_values <- c()
for (i in 1:length(selected_store_transpose)) {
sales_value <- selected_store_transpose[i]
sales_values <- c(sales_values, sales_value)
store_names <- c(store_names, "Palisades")
store_ids <- c(store_ids, 1)
}
print(sales_values)
## [1] 371700 435950 372460 192260 157550 332550 89630 372090 421670 173010
## [11] 173220
tidy_store_df <- data.frame(
Store.ID = store_ids,
Store = store_names,
Month = rownames(selected_store_transpose),
Sales_Value = sales_values
)
print(tidy_store_df)
## Store.ID Store Month Sales_Value
## 1 1 Palisades January 371700
## 2 1 Palisades February 435950
## 3 1 Palisades March 372460
## 4 1 Palisades April 192260
## 5 1 Palisades May 157550
## 6 1 Palisades June 332550
## 7 1 Palisades July 89630
## 8 1 Palisades August 372090
## 9 1 Palisades September 421670
## 10 1 Palisades October 173010
## 11 1 Palisades November 173220
month_tidy <- function(single_df, id_col, store_col, selected_cols) {
# single DataFrame selecting only the columns of interest
month_df <- single_df[selected_cols]
# Month Column Names
month_col_names <- colnames(month_df)
# Month Values
month_values <- c()
# Store ID
store_ids <- c()
# Store Name
store_names <- c()
# Create a transpose
months_transposed <- t(month_df)
# Iterate through each row
for (i in 1:length(months_transposed)) {
# Gets the store value
store_value <- months_transposed[i]
# Store the ID and Name
store_ids <- c(store_ids, id_col)
store_names <- c(store_names, store_col)
}
# Create the DataFrame
tidy_df <- data.frame(
Store.ID = store_ids,
Store = store_names,
Month = month_col_names,
Sales_Amount = sales_values
)
# Return the tidy_df
return (tidy_df)
}
month_tidy_v2 <- function(single_df, id_col, store_col, selected_cols) {
# Select only the columns of interest
month_df <- single_df %>%
select(all_of(selected_cols))
# Reshape the data from wide to long format
tidy_df <- month_df %>%
pivot_longer(
cols = everything(), # Pivot all selected columns
names_to = "Month", # New column for months (previously column names)
values_to = "Sales_Amount" # New column for sales values
) %>%
mutate(Store.ID = id_col, Store = store_col) %>% # Add store details
select(Store.ID, Store, Month, Sales_Amount) # Reorder columns
return(tidy_df)
}
# Test the function for store_example
function_test <- month_tidy_v2(store_example, 1, "Palisades", month_col_names)
print(function_test)
## # A tibble: 11 × 4
## Store.ID Store Month Sales_Amount
## <dbl> <chr> <chr> <int>
## 1 1 Palisades January 371700
## 2 1 Palisades February 435950
## 3 1 Palisades March 372460
## 4 1 Palisades April 192260
## 5 1 Palisades May 157550
## 6 1 Palisades June 332550
## 7 1 Palisades July 89630
## 8 1 Palisades August 372090
## 9 1 Palisades September 421670
## 10 1 Palisades October 173010
## 11 1 Palisades November 173220
# List of DataFrames
store_df_frames <- list()
# Loop through each store
for (i in 1:length(store_df$Store.ID)) {
# Store ID
store_id <- store_df$Store.ID[i]
# Store Name
store_name <- store_df$Store[i]
# Single row
single_row <- store_df[store_df$Store.ID == i, ]
# Tidy that row
store_tidy_df <- month_tidy_v2(single_row, store_id, store_name, month_col_names)
# Store in the array
store_df_frames[[i]] <- store_tidy_df
}
# Combine the DataFrames into a single one by the rows
## bind_rows from dplyr
final_df <- bind_rows(store_df_frames)
print(final_df)
## # A tibble: 220 × 4
## Store.ID Store Month Sales_Amount
## <int> <chr> <chr> <int>
## 1 1 Palisades January 371700
## 2 1 Palisades February 435950
## 3 1 Palisades March 372460
## 4 1 Palisades April 192260
## 5 1 Palisades May 157550
## 6 1 Palisades June 332550
## 7 1 Palisades July 89630
## 8 1 Palisades August 372090
## 9 1 Palisades September 421670
## 10 1 Palisades October 173010
## # ℹ 210 more rows
# Analysis
# Assume final_df is the dataset containing store sales
# Total sales per store across all months
total_sales_store <- final_df %>%
group_by(Store, Store.ID) %>%
summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
arrange(desc(Total_Sales))
## `summarise()` has grouped output by 'Store'. You can override using the
## `.groups` argument.
# Identify the store with the highest total sales
best_selling_store <- total_sales_store %>%
slice_max(Total_Sales, n = 1)
# Identify the store with the lowest total sales
worst_selling_store <- total_sales_store %>%
slice_min(Total_Sales, n = 1)
# Total sales per month across all stores
total_sales_month <- final_df %>%
group_by(Month) %>%
summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
arrange(desc(Total_Sales))
# Identify the month with the highest sales
best_month <- total_sales_month %>%
slice_max(Total_Sales, n = 1)
# Identify the month with the lowest sales
worst_month <- total_sales_month %>%
slice_min(Total_Sales, n = 1)
# Print insights
print("Store with the highest total sales:")
## [1] "Store with the highest total sales:"
print(best_selling_store)
## # A tibble: 20 × 3
## # Groups: Store [20]
## Store Store.ID Total_Sales
## <chr> <int> <int>
## 1 Billings 2 2564970
## 2 Blimey 13 240649
## 3 Camembert 20 1047658
## 4 Cheeseburger 4 3092160
## 5 Chelsea 11 191157
## 6 Detroit 5 2832170
## 7 Eh 8 367044
## 8 Fuji 16 283811
## 9 Hokkaido 15 244173
## 10 Innit 14 315480
## 11 Laguardia 3 3343060
## 12 Maple 9 362619
## 13 Nanjing 18 219850
## 14 Palisades 1 3092090
## 15 Paolo 19 330761
## 16 Steve 17 308982
## 17 Tim Horton's 7 258845
## 18 Towns 6 208703
## 19 Victoria 10 264917
## 20 Wimbledon 12 199778
print("Store with the lowest total sales:")
## [1] "Store with the lowest total sales:"
print(worst_selling_store)
## # A tibble: 20 × 3
## # Groups: Store [20]
## Store Store.ID Total_Sales
## <chr> <int> <int>
## 1 Billings 2 2564970
## 2 Blimey 13 240649
## 3 Camembert 20 1047658
## 4 Cheeseburger 4 3092160
## 5 Chelsea 11 191157
## 6 Detroit 5 2832170
## 7 Eh 8 367044
## 8 Fuji 16 283811
## 9 Hokkaido 15 244173
## 10 Innit 14 315480
## 11 Laguardia 3 3343060
## 12 Maple 9 362619
## 13 Nanjing 18 219850
## 14 Palisades 1 3092090
## 15 Paolo 19 330761
## 16 Steve 17 308982
## 17 Tim Horton's 7 258845
## 18 Towns 6 208703
## 19 Victoria 10 264917
## 20 Wimbledon 12 199778
print("Month with the highest total sales:")
## [1] "Month with the highest total sales:"
print(best_month)
## # A tibble: 1 × 2
## Month Total_Sales
## <chr> <int>
## 1 September 2568518
print("Month with the lowest total sales:")
## [1] "Month with the lowest total sales:"
print(worst_month)
## # A tibble: 1 × 2
## Month Total_Sales
## <chr> <int>
## 1 July 1244149
# Visualization of total sales by store
ggplot(total_sales_store, aes(x = reorder(Store, Total_Sales), y = Total_Sales, fill = Store)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Total Sales by Store",
x = "Store",
y = "Total Sales") +
theme_minimal()
# Visualization of total sales by month
ggplot(total_sales_month, aes(x = reorder(Month, Total_Sales), y = Total_Sales, fill = Month)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Total Sales by Month",
x = "Month",
y = "Total Sales") +
theme_minimal()
library('dplyr')
library('tidyr')
product_df <- read.csv("~/R-documents/product_sales.csv")
print(head(product_df))
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 1 Product A North 100 110 120 130 140
## 2 Product A South 200 210 220 230 240
## 3 Product A East 300 310 320 330 340
## 4 Product B North 150 160 170 180 190
## 5 Product B South 250 260 270 280 290
## 6 Product B East 350 360 370 380 390
## Jun.Sales
## 1 150
## 2 250
## 3 350
## 4 200
## 5 300
## 6 400
month_tidy_v2 <- function(single_df, product_name_col, region_col) {
# Actually get the product name and region values
product_name <- single_df$product_name_col
region_name <- single_df$region_col
# Reshape the data from wide to long format
tidy_df <- single_df %>%
pivot_longer(
cols = month_col_names, # Pivot all selected columns
names_to = "Month", # New column for months (previously column names)
values_to = "Sales_Amount" # New column for sales values
) %>%
mutate(product_name_col = product_name, region_col = region_name) # Add store details
return(tidy_df)
}
# List of R DataFrames
frame_list <- list()
# Month Columns
product_col_names <- colnames(product_df)
print(product_col_names)
## [1] "Product.Name" "Region" "Jan.Sales" "Feb.Sales" "Mar.Sales"
## [6] "Apr.Sales" "May.Sales" "Jun.Sales"
month_col_names <- product_col_names[3:length(product_col_names)]
print(month_col_names)
## [1] "Jan.Sales" "Feb.Sales" "Mar.Sales" "Apr.Sales" "May.Sales" "Jun.Sales"
# Iterate through the product df
for (i in 1:length(product_df$Product.Name)) {
# Single row
single_row <- product_df[i,]
print(single_row)
# Apply the tidy transformation
tidy_df <- month_tidy_v2(single_row, "Product.Name", "Region")
# Apply the tidy_df to your list
frame_list[[i]] <- tidy_df
}
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 1 Product A North 100 110 120 130 140
## Jun.Sales
## 1 150
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(month_col_names)
##
## # Now:
## data %>% select(all_of(month_col_names))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 2 Product A South 200 210 220 230 240
## Jun.Sales
## 2 250
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 3 Product A East 300 310 320 330 340
## Jun.Sales
## 3 350
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 4 Product B North 150 160 170 180 190
## Jun.Sales
## 4 200
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 5 Product B South 250 260 270 280 290
## Jun.Sales
## 5 300
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 6 Product B East 350 360 370 380 390
## Jun.Sales
## 6 400
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 7 Product C North 50 55 60 65 70
## Jun.Sales
## 7 75
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 8 Product C South 100 105 110 115 120
## Jun.Sales
## 8 125
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 9 Product C East 150 155 160 165 170
## Jun.Sales
## 9 175
# Bind the rows
final_df <- bind_rows(frame_list)
print(final_df)
## # A tibble: 54 × 4
## Product.Name Region Month Sales_Amount
## <chr> <chr> <chr> <int>
## 1 Product A North Jan.Sales 100
## 2 Product A North Feb.Sales 110
## 3 Product A North Mar.Sales 120
## 4 Product A North Apr.Sales 130
## 5 Product A North May.Sales 140
## 6 Product A North Jun.Sales 150
## 7 Product A South Jan.Sales 200
## 8 Product A South Feb.Sales 210
## 9 Product A South Mar.Sales 220
## 10 Product A South Apr.Sales 230
## # ℹ 44 more rows
# Analysis
# Assume final_df is the dataset containing product sales
# Total sales per product across all regions
total_sales_product <- final_df %>%
group_by(Product.Name) %>%
summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
arrange(desc(Total_Sales))
# Identify the product with the highest sales
best_selling_product <- total_sales_product %>%
slice_max(Total_Sales, n = 1)
# Identify the product with the lowest sales
worst_selling_product <- total_sales_product %>%
slice_min(Total_Sales, n = 1)
# Average sales per region across all products
avg_sales_region <- final_df %>%
group_by(Region) %>%
summarise(Average_Sales = mean(Sales_Amount, na.rm = TRUE)) %>%
arrange(desc(Average_Sales))
# Identify the region with the highest average sales
best_performing_region <- avg_sales_region %>%
slice_max(Average_Sales, n = 1)
# Identify the region with the lowest average sales
worst_performing_region <- avg_sales_region %>%
slice_min(Average_Sales, n = 1)
# Print insights
print("Product with the highest sales:")
## [1] "Product with the highest sales:"
print(best_selling_product)
## # A tibble: 1 × 2
## Product.Name Total_Sales
## <chr> <int>
## 1 Product B 4950
print("Product with the lowest sales:")
## [1] "Product with the lowest sales:"
print(worst_selling_product)
## # A tibble: 1 × 2
## Product.Name Total_Sales
## <chr> <int>
## 1 Product C 2025
print("Region with the highest average sales:")
## [1] "Region with the highest average sales:"
print(best_performing_region)
## # A tibble: 1 × 2
## Region Average_Sales
## <chr> <dbl>
## 1 East 288.
print("Region with the lowest average sales:")
## [1] "Region with the lowest average sales:"
print(worst_performing_region)
## # A tibble: 1 × 2
## Region Average_Sales
## <chr> <dbl>
## 1 North 121.
# Visualization of total sales per product
ggplot(total_sales_product, aes(x = reorder(Product.Name, Total_Sales), y = Total_Sales, fill = Product.Name)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Total Sales by Product",
x = "Product Name",
y = "Total Sales") +
theme_minimal()
# Visualization of average sales per region
ggplot(avg_sales_region, aes(x = reorder(Region, Average_Sales), y = Average_Sales, fill = Region)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Average Sales by Region",
x = "Region",
y = "Average Sales") +
theme_minimal()
library("dplyr")
library("tidyr")
library("stringr")
library("ggplot2")
# DataFrame
df <- read.csv("~/R-documents/emissions_data.csv")
# Function
tidy_transform <- function(single_row, col_names) {
tidy_df <- single_row %>%
pivot_longer(
cols = col_names, # Pivot all selected columns
names_to = "Emission Year", # New column for years of emission of a specific chemical
values_to = "Element Emission Value" # New column for Emissions Values from a specific year
)
return (tidy_df)
}
# Specific emission columns
df_columns <- colnames(df)
emission_columns <- df_columns[5:length(df_columns)]
print(emission_columns)
## [1] "X2000" "X2001" "X2002" "X2003" "X2004" "X2005" "X2006" "X2007" "X2008"
# Save all the frames
frames <- list()
# Length of the df$Area
area_length <- length(df$Area)
print(area_length)
## [1] 11
# Create a loop that goes through each row in the DataFrame
for (i in 1:area_length) {
# Single row
single_row <- df[i,]
# Apply tidy transform
frames[[i]] <- tidy_transform(single_row, emission_columns)
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(col_names)
##
## # Now:
## data %>% select(all_of(col_names))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Apply bind rows
final_frame <- bind_rows(frames)
print(final_frame)
## # A tibble: 99 × 6
## Area Item Element Unit `Emission Year` Element Emission Val…¹
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Afghanistan Crop Residu… Direct… kilo… X2000 0.52
## 2 Afghanistan Crop Residu… Direct… kilo… X2001 0.527
## 3 Afghanistan Crop Residu… Direct… kilo… X2002 0.82
## 4 Afghanistan Crop Residu… Direct… kilo… X2003 0.999
## 5 Afghanistan Crop Residu… Direct… kilo… X2004 0.822
## 6 Afghanistan Crop Residu… Direct… kilo… X2005 1.18
## 7 Afghanistan Crop Residu… Direct… kilo… X2006 1.03
## 8 Afghanistan Crop Residu… Direct… kilo… X2007 1.24
## 9 Afghanistan Crop Residu… Direct… kilo… X2008 0.887
## 10 Afghanistan Crop Residu… Indire… kilo… X2000 0.117
## # ℹ 89 more rows
## # ℹ abbreviated name: ¹​`Element Emission Value`
# Analytics
# Assume final_frame is the input dataset
# Compute average exposure for each area and element
avg_exposure <- final_frame %>%
group_by(Area, Element) %>%
summarise(Average_Emission = mean(`Element Emission Value`, na.rm = TRUE), .groups = "drop")
# Identify areas with highest exposure for each chemical element
highest_exposure <- avg_exposure %>%
group_by(Element) %>%
filter(Average_Emission == max(Average_Emission)) %>%
ungroup()
# Compute Indirect/Direct emissions ratio per area
direct_emissions <- final_frame %>%
filter(str_detect(Element, "Direct")) %>%
group_by(Area) %>%
summarise(Total_Direct = sum(`Element Emission Value`, na.rm = TRUE))
indirect_emissions <- final_frame %>%
filter(str_detect(Element, "Indirect")) %>%
group_by(Area) %>%
summarise(Total_Indirect = sum(`Element Emission Value`, na.rm = TRUE))
# Join the datasets and calculate the ratio
emissions_ratio <- direct_emissions %>%
left_join(indirect_emissions, by = "Area") %>%
mutate(Indirect_Direct_Ratio = Total_Indirect / Total_Direct) %>%
replace_na(list(Indirect_Direct_Ratio = 0)) %>%
arrange(desc(Indirect_Direct_Ratio))
# Display results
print("Areas with the Highest Average Exposure for Each Chemical")
## [1] "Areas with the Highest Average Exposure for Each Chemical"
print(highest_exposure)
## # A tibble: 6 × 3
## Area Element Average_Emission
## <chr> <chr> <dbl>
## 1 Afghanistan Direct emissions (N2O) 0.892
## 2 Afghanistan Emissions (CH4) 105.
## 3 Afghanistan Emissions (CO2eq) from CH4 (AR5) 2933.
## 4 Afghanistan Emissions (CO2eq) from N2O (AR5) 290.
## 5 Afghanistan Emissions (N2O) 0.585
## 6 Afghanistan Indirect emissions (N2O) 0.201
print("Areas Sorted by Indirect/Direct Emissions Ratio")
## [1] "Areas Sorted by Indirect/Direct Emissions Ratio"
print(emissions_ratio)
## # A tibble: 1 × 4
## Area Total_Direct Total_Indirect Indirect_Direct_Ratio
## <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 8.03 1.81 0.225
# Visualization of highest exposure areas
ggplot(highest_exposure, aes(x = Average_Emission, y = Area, fill = Element)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Areas with Highest Average Exposure for Each Chemical",
x = "Average Emission Value (kilotonnes)",
y = "Area") +
theme_minimal()
# Visualization of indirect/direct emissions ratio
ggplot(emissions_ratio, aes(x = Indirect_Direct_Ratio, y = reorder(Area, Indirect_Direct_Ratio))) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Areas Sorted by Indirect to Direct Emissions Ratio",
x = "Indirect/Direct Emissions Ratio",
y = "Area") +
theme_minimal()
Data Transformation 1. Transforming Store Sales Data The store sales data originally had a wide format, where each month was a separate column. To make the data more accessible for analysis, it was transformed using the pivot_longer function. The transformation involved:
Extracting the month-wise sales values into a single column. Creating a tidy tibble with columns: Store.ID, Store, Month, and Sales_Amount. Key Code Snippet for Store Data Tidying
fina_df <- store_df %>%
pivot_longer(
cols = c("January","February","March","April","May","June","July","August","September","October","November"),
names_to = "Month",
values_to = "Sales_Amount"
)
print(fina_df)
## # A tibble: 220 × 5
## Store.ID Store Country Month Sales_Amount
## <int> <chr> <chr> <chr> <int>
## 1 1 Palisades US January 371700
## 2 1 Palisades US February 435950
## 3 1 Palisades US March 372460
## 4 1 Palisades US April 192260
## 5 1 Palisades US May 157550
## 6 1 Palisades US June 332550
## 7 1 Palisades US July 89630
## 8 1 Palisades US August 372090
## 9 1 Palisades US September 421670
## 10 1 Palisades US October 173010
## # ℹ 210 more rows
Product.Name, Region, Month, Sales_Amount. Key Code Snippet for Product Data Tidying
fin_df <- product_df %>%
pivot_longer(
cols = c("Jan.Sales","Feb.Sales","Mar.Sales","Apr.Sales","May.Sales","Jun.Sales"),
names_to = "Month",
values_to = "Sales_Amount"
)
print(fin_df)
## # A tibble: 54 × 4
## Product.Name Region Month Sales_Amount
## <chr> <chr> <chr> <int>
## 1 Product A North Jan.Sales 100
## 2 Product A North Feb.Sales 110
## 3 Product A North Mar.Sales 120
## 4 Product A North Apr.Sales 130
## 5 Product A North May.Sales 140
## 6 Product A North Jun.Sales 150
## 7 Product A South Jan.Sales 200
## 8 Product A South Feb.Sales 210
## 9 Product A South Mar.Sales 220
## 10 Product A South Apr.Sales 230
## # ℹ 44 more rows
Reshaping yearly emissions into a single Emission Year column. Storing emission values in a new column: Element Emission Value. Key Code Snippet for Emissions Data Tidying
final_frame <- df %>%
pivot_longer(
cols = c("X2000","X2001","X2002","X2003","X2004","X2005","X2006","X2007","X2008"),
names_to = "Emission Year",
values_to = "Element Emission Value"
)
print(final_frame)
## # A tibble: 99 × 6
## Area Item Element Unit `Emission Year` Element Emission Val…¹
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Afghanistan Crop Residu… Direct… kilo… X2000 0.52
## 2 Afghanistan Crop Residu… Direct… kilo… X2001 0.527
## 3 Afghanistan Crop Residu… Direct… kilo… X2002 0.82
## 4 Afghanistan Crop Residu… Direct… kilo… X2003 0.999
## 5 Afghanistan Crop Residu… Direct… kilo… X2004 0.822
## 6 Afghanistan Crop Residu… Direct… kilo… X2005 1.18
## 7 Afghanistan Crop Residu… Direct… kilo… X2006 1.03
## 8 Afghanistan Crop Residu… Direct… kilo… X2007 1.24
## 9 Afghanistan Crop Residu… Direct… kilo… X2008 0.887
## 10 Afghanistan Crop Residu… Indire… kilo… X2000 0.117
## # ℹ 89 more rows
## # ℹ abbreviated name: ¹​`Element Emission Value`
Analytics and Insights 1. Store Sales Analysis Key Insights:
The store with the highest total sales was identified. The best and worst-performing months in terms of sales were determined. A visualization of sales trends across months was generated. Code Snippet for Store Sales Analysis
best_selling_store <- fina_df %>%
group_by(Store) %>%
summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
slice_max(Total_Sales, n = 1)
# Visualization: Sales by Store
ggplot(fina_df, aes(x = Store, y = Sales_Amount, fill = Store)) +
geom_bar(stat = "identity") +
theme_minimal()
The best-selling product across regions was identified. The worst-selling product was also highlighted. A comparison of average sales across different regions was performed. Code Snippet for Product Sales Analysis
total_sales_product <- fin_df %>%
group_by(Product.Name) %>%
summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
arrange(desc(Total_Sales))
# Visualization: Sales by Product
ggplot(total_sales_product, aes(x = Product.Name, y = Total_Sales, fill = Product.Name)) +
geom_bar(stat = "identity") +
theme_minimal()
Identified the areas with the highest average exposure for each chemical. Determined areas with the highest Indirect/Direct emissions ratio. Code Snippet for Emissions Analysis
highest_exposure <- avg_exposure %>%
group_by(Element) %>%
filter(Average_Emission == max(Average_Emission))
# Visualization: Emissions Exposure by Area
ggplot(highest_exposure, aes(x = Average_Emission, y = Area, fill = Element)) +
geom_bar(stat = "identity") +
theme_minimal()
Conclusion The process of tidying data was essential to enable meaningful analysis. Without the restructuring of these datasets:
Store and product sales data would be difficult to summarize due to their wide format. Emissions data would not allow for trend analysis over time. By leveraging tidyr and dplyr, I transformed the data into a long format, allowing for easy filtering, grouping, and visualization. This structured approach enables deeper insights, improving decision-making and analytical efficiency.