project2

Code Assets

Store Sales Analysis Code

library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("tidyr")
library("ggplot2")


# 3 URLs
store_df <- read.csv("~/R-documents/store_sales.csv")

# Column Names
store_col_names <- colnames(store_df)

print(store_col_names)

##  [1] "Store.ID"  "Store"     "Country"   "January"   "February"  "March"    
##  [7] "April"     "May"       "June"      "July"      "August"    "September"
## [13] "October"   "November"

# Preview first 5 rows of each
print(head(store_df))

##   Store.ID        Store Country January February  March  April    May   June
## 1        1    Palisades      US  371700   435950 372460 192260 157550 332550
## 2        2     Billings      US   97530   324140 454480  36810 219790 210970
## 3        3    Laguardia      US  346130   157510 288990 358190  96860 461950
## 4        4 Cheeseburger      US  442010   212390 183580 308650 184340 156540
## 5        5      Detroit      US   33250    36840 320170 242650 350300 421980
## 6        6        Towns      US   16632    25372  38178  13222  16031  37162
##     July August September October November
## 1  89630 372090    421670  173010   173220
## 2  84840 175440    283710  275320   401940
## 3  80440 404990    450630  327270   370100
## 4 328180 281430    498150  473150    23740
## 5 307190  16900    443990  346230   312670
## 6  22541  16700     10475    1897    10493

# Filter for a store
## To apply the filter, make sure to add a , after the condition
store_example <- store_df[store_df$Store.ID == 1, ]

print(store_example)

##   Store.ID     Store Country January February  March  April    May   June  July
## 1        1 Palisades      US  371700   435950 372460 192260 157550 332550 89630
##   August September October November
## 1 372090    421670  173010   173220

# Chosen column names
month_col_names <- c()

for (i in 4:length(store_col_names)) {
    # Get specific month column
    month_col <- store_col_names[i]

    print(month_col)

    # Append to the month_col_names
    month_col_names <- c(month_col_names, month_col)
}

## [1] "January"
## [1] "February"
## [1] "March"
## [1] "April"
## [1] "May"
## [1] "June"
## [1] "July"
## [1] "August"
## [1] "September"
## [1] "October"
## [1] "November"

print(month_col_names)

##  [1] "January"   "February"  "March"     "April"     "May"       "June"     
##  [7] "July"      "August"    "September" "October"   "November"

# Select only the month columns for the store example
selected_store_example <- store_example[month_col_names]

print(selected_store_example)

##   January February  March  April    May   June  July August September October
## 1  371700   435950 372460 192260 157550 332550 89630 372090    421670  173010
##   November
## 1   173220

# Apply a Transpose so then the columns become the rows
selected_store_transpose <- t(selected_store_example)

print(selected_store_transpose)

##                1
## January   371700
## February  435950
## March     372460
## April     192260
## May       157550
## June      332550
## July       89630
## August    372090
## September 421670
## October   173010
## November  173220

# Reset the index to be a column and then rename the columns to Month, Sale Amount
print(rownames(selected_store_transpose))

##  [1] "January"   "February"  "March"     "April"     "May"       "June"     
##  [7] "July"      "August"    "September" "October"   "November"

print(selected_store_transpose)

##                1
## January   371700
## February  435950
## March     372460
## April     192260
## May       157550
## June      332550
## July       89630
## August    372090
## September 421670
## October   173010
## November  173220

store_ids <- c()
store_names <- c()
sales_values <- c()

for (i in 1:length(selected_store_transpose)) {
    sales_value <- selected_store_transpose[i]

    sales_values <- c(sales_values, sales_value)

    store_names <- c(store_names, "Palisades")

    store_ids <- c(store_ids, 1)
}

print(sales_values)

##  [1] 371700 435950 372460 192260 157550 332550  89630 372090 421670 173010
## [11] 173220

tidy_store_df <- data.frame(
    Store.ID = store_ids,
    Store = store_names,
    Month = rownames(selected_store_transpose),
    Sales_Value = sales_values
)

print(tidy_store_df)

##    Store.ID     Store     Month Sales_Value
## 1         1 Palisades   January      371700
## 2         1 Palisades  February      435950
## 3         1 Palisades     March      372460
## 4         1 Palisades     April      192260
## 5         1 Palisades       May      157550
## 6         1 Palisades      June      332550
## 7         1 Palisades      July       89630
## 8         1 Palisades    August      372090
## 9         1 Palisades September      421670
## 10        1 Palisades   October      173010
## 11        1 Palisades  November      173220

month_tidy <- function(single_df, id_col, store_col, selected_cols) {

    # single DataFrame selecting only the columns of interest
    month_df <- single_df[selected_cols]

    # Month Column Names
    month_col_names <- colnames(month_df)

    # Month Values
    month_values <- c()

    # Store ID
    store_ids <- c()

    # Store Name
    store_names <- c()

    # Create a transpose
    months_transposed <- t(month_df)

    # Iterate through each row
    for (i in 1:length(months_transposed)) {
        # Gets the store value
        store_value <- months_transposed[i]

        # Store the ID and Name
        store_ids <- c(store_ids, id_col)

        store_names <- c(store_names, store_col)

    }

    # Create the DataFrame
    tidy_df <- data.frame(
        Store.ID = store_ids,
        Store = store_names,
        Month = month_col_names,
        Sales_Amount = sales_values
    )

    # Return the tidy_df
    return (tidy_df)

}

month_tidy_v2 <- function(single_df, id_col, store_col, selected_cols) {
  
  # Select only the columns of interest
  month_df <- single_df %>%
    select(all_of(selected_cols))
  
  # Reshape the data from wide to long format
  tidy_df <- month_df %>%
    pivot_longer(
      cols = everything(),  # Pivot all selected columns
      names_to = "Month",   # New column for months (previously column names)
      values_to = "Sales_Amount" # New column for sales values
    ) %>%
    mutate(Store.ID = id_col, Store = store_col) %>% # Add store details
    select(Store.ID, Store, Month, Sales_Amount)  # Reorder columns

  return(tidy_df)
}


# Test the function for store_example
function_test <- month_tidy_v2(store_example, 1, "Palisades", month_col_names)

print(function_test)

## # A tibble: 11 × 4
##    Store.ID Store     Month     Sales_Amount
##       <dbl> <chr>     <chr>            <int>
##  1        1 Palisades January         371700
##  2        1 Palisades February        435950
##  3        1 Palisades March           372460
##  4        1 Palisades April           192260
##  5        1 Palisades May             157550
##  6        1 Palisades June            332550
##  7        1 Palisades July             89630
##  8        1 Palisades August          372090
##  9        1 Palisades September       421670
## 10        1 Palisades October         173010
## 11        1 Palisades November        173220

# List of DataFrames
store_df_frames <- list()

# Loop through each store
for (i in 1:length(store_df$Store.ID)) {
    # Store ID
    store_id <- store_df$Store.ID[i]

    # Store Name
    store_name <- store_df$Store[i]

    # Single row
    single_row <- store_df[store_df$Store.ID == i, ]

    # Tidy that row
    store_tidy_df <- month_tidy_v2(single_row, store_id, store_name, month_col_names)

    # Store in the array
    store_df_frames[[i]] <- store_tidy_df
}

# Combine the DataFrames into a single one by the rows

## bind_rows from dplyr
final_df <- bind_rows(store_df_frames)

print(final_df)

## # A tibble: 220 × 4
##    Store.ID Store     Month     Sales_Amount
##       <int> <chr>     <chr>            <int>
##  1        1 Palisades January         371700
##  2        1 Palisades February        435950
##  3        1 Palisades March           372460
##  4        1 Palisades April           192260
##  5        1 Palisades May             157550
##  6        1 Palisades June            332550
##  7        1 Palisades July             89630
##  8        1 Palisades August          372090
##  9        1 Palisades September       421670
## 10        1 Palisades October         173010
## # ℹ 210 more rows

# Analysis

# Assume final_df is the dataset containing store sales

# Total sales per store across all months
total_sales_store <- final_df %>%
  group_by(Store, Store.ID) %>%
  summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
  arrange(desc(Total_Sales))

## `summarise()` has grouped output by 'Store'. You can override using the
## `.groups` argument.

# Identify the store with the highest total sales
best_selling_store <- total_sales_store %>%
  slice_max(Total_Sales, n = 1)

# Identify the store with the lowest total sales
worst_selling_store <- total_sales_store %>%
  slice_min(Total_Sales, n = 1)

# Total sales per month across all stores
total_sales_month <- final_df %>%
  group_by(Month) %>%
  summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
  arrange(desc(Total_Sales))

# Identify the month with the highest sales
best_month <- total_sales_month %>%
  slice_max(Total_Sales, n = 1)

# Identify the month with the lowest sales
worst_month <- total_sales_month %>%
  slice_min(Total_Sales, n = 1)

# Print insights
print("Store with the highest total sales:")

## [1] "Store with the highest total sales:"

print(best_selling_store)

## # A tibble: 20 × 3
## # Groups:   Store [20]
##    Store        Store.ID Total_Sales
##    <chr>           <int>       <int>
##  1 Billings            2     2564970
##  2 Blimey             13      240649
##  3 Camembert          20     1047658
##  4 Cheeseburger        4     3092160
##  5 Chelsea            11      191157
##  6 Detroit             5     2832170
##  7 Eh                  8      367044
##  8 Fuji               16      283811
##  9 Hokkaido           15      244173
## 10 Innit              14      315480
## 11 Laguardia           3     3343060
## 12 Maple               9      362619
## 13 Nanjing            18      219850
## 14 Palisades           1     3092090
## 15 Paolo              19      330761
## 16 Steve              17      308982
## 17 Tim Horton's        7      258845
## 18 Towns               6      208703
## 19 Victoria           10      264917
## 20 Wimbledon          12      199778

print("Store with the lowest total sales:")

## [1] "Store with the lowest total sales:"

print(worst_selling_store)

## # A tibble: 20 × 3
## # Groups:   Store [20]
##    Store        Store.ID Total_Sales
##    <chr>           <int>       <int>
##  1 Billings            2     2564970
##  2 Blimey             13      240649
##  3 Camembert          20     1047658
##  4 Cheeseburger        4     3092160
##  5 Chelsea            11      191157
##  6 Detroit             5     2832170
##  7 Eh                  8      367044
##  8 Fuji               16      283811
##  9 Hokkaido           15      244173
## 10 Innit              14      315480
## 11 Laguardia           3     3343060
## 12 Maple               9      362619
## 13 Nanjing            18      219850
## 14 Palisades           1     3092090
## 15 Paolo              19      330761
## 16 Steve              17      308982
## 17 Tim Horton's        7      258845
## 18 Towns               6      208703
## 19 Victoria           10      264917
## 20 Wimbledon          12      199778

print("Month with the highest total sales:")

## [1] "Month with the highest total sales:"

print(best_month)

## # A tibble: 1 × 2
##   Month     Total_Sales
##   <chr>           <int>
## 1 September     2568518

print("Month with the lowest total sales:")

## [1] "Month with the lowest total sales:"

print(worst_month)

## # A tibble: 1 × 2
##   Month Total_Sales
##   <chr>       <int>
## 1 July      1244149

# Visualization of total sales by store
ggplot(total_sales_store, aes(x = reorder(Store, Total_Sales), y = Total_Sales, fill = Store)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Total Sales by Store",
       x = "Store",
       y = "Total Sales") +
  theme_minimal()

# Visualization of total sales by month
ggplot(total_sales_month, aes(x = reorder(Month, Total_Sales), y = Total_Sales, fill = Month)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Total Sales by Month",
       x = "Month",
       y = "Total Sales") +
  theme_minimal()

Product Sales Analysis Code

library('dplyr')
library('tidyr')

product_df <- read.csv("~/R-documents/product_sales.csv")

print(head(product_df))

##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 1    Product A  North       100       110       120       130       140
## 2    Product A  South       200       210       220       230       240
## 3    Product A   East       300       310       320       330       340
## 4    Product B  North       150       160       170       180       190
## 5    Product B  South       250       260       270       280       290
## 6    Product B   East       350       360       370       380       390
##   Jun.Sales
## 1       150
## 2       250
## 3       350
## 4       200
## 5       300
## 6       400

month_tidy_v2 <- function(single_df, product_name_col, region_col) {
  
    # Actually get the product name and region values
    product_name <- single_df$product_name_col
    region_name <- single_df$region_col
  
  # Reshape the data from wide to long format
  tidy_df <- single_df %>%
    pivot_longer(
      cols = month_col_names,  # Pivot all selected columns
      names_to = "Month",   # New column for months (previously column names)
      values_to = "Sales_Amount" # New column for sales values
    ) %>%
    mutate(product_name_col = product_name, region_col = region_name) # Add store details
  return(tidy_df)
}

# List of R DataFrames
frame_list <- list()

# Month Columns
product_col_names <- colnames(product_df)

print(product_col_names)

## [1] "Product.Name" "Region"       "Jan.Sales"    "Feb.Sales"    "Mar.Sales"   
## [6] "Apr.Sales"    "May.Sales"    "Jun.Sales"

month_col_names <- product_col_names[3:length(product_col_names)]

print(month_col_names)

## [1] "Jan.Sales" "Feb.Sales" "Mar.Sales" "Apr.Sales" "May.Sales" "Jun.Sales"

# Iterate through the product df
for (i in 1:length(product_df$Product.Name)) {
    # Single row
    single_row <- product_df[i,]

    print(single_row)

    # Apply the tidy transformation
    tidy_df <- month_tidy_v2(single_row, "Product.Name", "Region")

    # Apply the tidy_df to your list
    frame_list[[i]] <- tidy_df
}

##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 1    Product A  North       100       110       120       130       140
##   Jun.Sales
## 1       150

## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(month_col_names)
## 
##   # Now:
##   data %>% select(all_of(month_col_names))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 2    Product A  South       200       210       220       230       240
##   Jun.Sales
## 2       250
##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 3    Product A   East       300       310       320       330       340
##   Jun.Sales
## 3       350
##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 4    Product B  North       150       160       170       180       190
##   Jun.Sales
## 4       200
##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 5    Product B  South       250       260       270       280       290
##   Jun.Sales
## 5       300
##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 6    Product B   East       350       360       370       380       390
##   Jun.Sales
## 6       400
##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 7    Product C  North        50        55        60        65        70
##   Jun.Sales
## 7        75
##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 8    Product C  South       100       105       110       115       120
##   Jun.Sales
## 8       125
##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 9    Product C   East       150       155       160       165       170
##   Jun.Sales
## 9       175

# Bind the rows
final_df <- bind_rows(frame_list)

print(final_df)

## # A tibble: 54 × 4
##    Product.Name Region Month     Sales_Amount
##    <chr>        <chr>  <chr>            <int>
##  1 Product A    North  Jan.Sales          100
##  2 Product A    North  Feb.Sales          110
##  3 Product A    North  Mar.Sales          120
##  4 Product A    North  Apr.Sales          130
##  5 Product A    North  May.Sales          140
##  6 Product A    North  Jun.Sales          150
##  7 Product A    South  Jan.Sales          200
##  8 Product A    South  Feb.Sales          210
##  9 Product A    South  Mar.Sales          220
## 10 Product A    South  Apr.Sales          230
## # ℹ 44 more rows

# Analysis

# Assume final_df is the dataset containing product sales

# Total sales per product across all regions
total_sales_product <- final_df %>%
  group_by(Product.Name) %>%
  summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
  arrange(desc(Total_Sales))

# Identify the product with the highest sales
best_selling_product <- total_sales_product %>%
  slice_max(Total_Sales, n = 1)

# Identify the product with the lowest sales
worst_selling_product <- total_sales_product %>%
  slice_min(Total_Sales, n = 1)

# Average sales per region across all products
avg_sales_region <- final_df %>%
  group_by(Region) %>%
  summarise(Average_Sales = mean(Sales_Amount, na.rm = TRUE)) %>%
  arrange(desc(Average_Sales))

# Identify the region with the highest average sales
best_performing_region <- avg_sales_region %>%
  slice_max(Average_Sales, n = 1)

# Identify the region with the lowest average sales
worst_performing_region <- avg_sales_region %>%
  slice_min(Average_Sales, n = 1)

# Print insights
print("Product with the highest sales:")

## [1] "Product with the highest sales:"

print(best_selling_product)

## # A tibble: 1 × 2
##   Product.Name Total_Sales
##   <chr>              <int>
## 1 Product B           4950

print("Product with the lowest sales:")

## [1] "Product with the lowest sales:"

print(worst_selling_product)

## # A tibble: 1 × 2
##   Product.Name Total_Sales
##   <chr>              <int>
## 1 Product C           2025

print("Region with the highest average sales:")

## [1] "Region with the highest average sales:"

print(best_performing_region)

## # A tibble: 1 × 2
##   Region Average_Sales
##   <chr>          <dbl>
## 1 East            288.

print("Region with the lowest average sales:")

## [1] "Region with the lowest average sales:"

print(worst_performing_region)

## # A tibble: 1 × 2
##   Region Average_Sales
##   <chr>          <dbl>
## 1 North           121.

# Visualization of total sales per product
ggplot(total_sales_product, aes(x = reorder(Product.Name, Total_Sales), y = Total_Sales, fill = Product.Name)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Total Sales by Product",
       x = "Product Name",
       y = "Total Sales") +
  theme_minimal()

# Visualization of average sales per region
ggplot(avg_sales_region, aes(x = reorder(Region, Average_Sales), y = Average_Sales, fill = Region)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Average Sales by Region",
       x = "Region",
       y = "Average Sales") +
  theme_minimal()

Emissions Data Analysis Code

library("dplyr")
library("tidyr")
library("stringr")
library("ggplot2")

# DataFrame
df <- read.csv("~/R-documents/emissions_data.csv")

# Function
tidy_transform <- function(single_row, col_names) {

    tidy_df <- single_row %>%
        pivot_longer(
            cols = col_names,  # Pivot all selected columns
      names_to = "Emission Year",   # New column for years of emission of a specific chemical
      values_to = "Element Emission Value" # New column for Emissions Values from a specific year
    )

    return (tidy_df)
}

# Specific emission columns
df_columns <- colnames(df)

emission_columns <- df_columns[5:length(df_columns)]

print(emission_columns)

## [1] "X2000" "X2001" "X2002" "X2003" "X2004" "X2005" "X2006" "X2007" "X2008"

# Save all the frames
frames <- list()

# Length of the df$Area
area_length <- length(df$Area)

print(area_length)

## [1] 11

# Create a loop that goes through each row in the DataFrame
for (i in 1:area_length) {
    # Single row
    single_row <- df[i,]

    # Apply tidy transform
    frames[[i]] <- tidy_transform(single_row, emission_columns)
}

## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(col_names)
## 
##   # Now:
##   data %>% select(all_of(col_names))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Apply bind rows
final_frame <- bind_rows(frames)

print(final_frame)

## # A tibble: 99 × 6
##    Area        Item         Element Unit  `Emission Year` Element Emission Val…¹
##    <chr>       <chr>        <chr>   <chr> <chr>                            <dbl>
##  1 Afghanistan Crop Residu… Direct… kilo… X2000                            0.52 
##  2 Afghanistan Crop Residu… Direct… kilo… X2001                            0.527
##  3 Afghanistan Crop Residu… Direct… kilo… X2002                            0.82 
##  4 Afghanistan Crop Residu… Direct… kilo… X2003                            0.999
##  5 Afghanistan Crop Residu… Direct… kilo… X2004                            0.822
##  6 Afghanistan Crop Residu… Direct… kilo… X2005                            1.18 
##  7 Afghanistan Crop Residu… Direct… kilo… X2006                            1.03 
##  8 Afghanistan Crop Residu… Direct… kilo… X2007                            1.24 
##  9 Afghanistan Crop Residu… Direct… kilo… X2008                            0.887
## 10 Afghanistan Crop Residu… Indire… kilo… X2000                            0.117
## # ℹ 89 more rows
## # ℹ abbreviated name: ¹`Element Emission Value`

# Analytics

# Assume final_frame is the input dataset

# Compute average exposure for each area and element
avg_exposure <- final_frame %>%
  group_by(Area, Element) %>%
  summarise(Average_Emission = mean(`Element Emission Value`, na.rm = TRUE), .groups = "drop")

# Identify areas with highest exposure for each chemical element
highest_exposure <- avg_exposure %>%
  group_by(Element) %>%
  filter(Average_Emission == max(Average_Emission)) %>%
  ungroup()

# Compute Indirect/Direct emissions ratio per area
direct_emissions <- final_frame %>%
  filter(str_detect(Element, "Direct")) %>%
  group_by(Area) %>%
  summarise(Total_Direct = sum(`Element Emission Value`, na.rm = TRUE))

indirect_emissions <- final_frame %>%
  filter(str_detect(Element, "Indirect")) %>%
  group_by(Area) %>%
  summarise(Total_Indirect = sum(`Element Emission Value`, na.rm = TRUE))

# Join the datasets and calculate the ratio
emissions_ratio <- direct_emissions %>%
  left_join(indirect_emissions, by = "Area") %>%
  mutate(Indirect_Direct_Ratio = Total_Indirect / Total_Direct) %>%
  replace_na(list(Indirect_Direct_Ratio = 0)) %>%
  arrange(desc(Indirect_Direct_Ratio))

# Display results
print("Areas with the Highest Average Exposure for Each Chemical")

## [1] "Areas with the Highest Average Exposure for Each Chemical"

print(highest_exposure)

## # A tibble: 6 × 3
##   Area        Element                          Average_Emission
##   <chr>       <chr>                                       <dbl>
## 1 Afghanistan Direct emissions (N2O)                      0.892
## 2 Afghanistan Emissions (CH4)                           105.   
## 3 Afghanistan Emissions (CO2eq) from CH4 (AR5)         2933.   
## 4 Afghanistan Emissions (CO2eq) from N2O (AR5)          290.   
## 5 Afghanistan Emissions (N2O)                             0.585
## 6 Afghanistan Indirect emissions (N2O)                    0.201

print("Areas Sorted by Indirect/Direct Emissions Ratio")

## [1] "Areas Sorted by Indirect/Direct Emissions Ratio"

print(emissions_ratio)

## # A tibble: 1 × 4
##   Area        Total_Direct Total_Indirect Indirect_Direct_Ratio
##   <chr>              <dbl>          <dbl>                 <dbl>
## 1 Afghanistan         8.03           1.81                 0.225

# Visualization of highest exposure areas
ggplot(highest_exposure, aes(x = Average_Emission, y = Area, fill = Element)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Areas with Highest Average Exposure for Each Chemical",
       x = "Average Emission Value (kilotonnes)",
       y = "Area") +
  theme_minimal()

# Visualization of indirect/direct emissions ratio
ggplot(emissions_ratio, aes(x = Indirect_Direct_Ratio, y = reorder(Area, Indirect_Direct_Ratio))) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Areas Sorted by Indirect to Direct Emissions Ratio",
       x = "Indirect/Direct Emissions Ratio",
       y = "Area") +
  theme_minimal()

Data Transformation 1. Transforming Store Sales Data The store sales data originally had a wide format, where each month was a separate column. To make the data more accessible for analysis, it was transformed using the pivot_longer function. The transformation involved:

Extracting the month-wise sales values into a single column. Creating a tidy tibble with columns: Store.ID, Store, Month, and Sales_Amount. Key Code Snippet for Store Data Tidying

fina_df <- store_df %>%
  pivot_longer(
    cols = c("January","February","March","April","May","June","July","August","September","October","November"),
    names_to = "Month",
    values_to = "Sales_Amount"
  )
print(fina_df)

## # A tibble: 220 × 5
##    Store.ID Store     Country Month     Sales_Amount
##       <int> <chr>     <chr>   <chr>            <int>
##  1        1 Palisades US      January         371700
##  2        1 Palisades US      February        435950
##  3        1 Palisades US      March           372460
##  4        1 Palisades US      April           192260
##  5        1 Palisades US      May             157550
##  6        1 Palisades US      June            332550
##  7        1 Palisades US      July             89630
##  8        1 Palisades US      August          372090
##  9        1 Palisades US      September       421670
## 10        1 Palisades US      October         173010
## # ℹ 210 more rows

Transforming Product Sales Data Similar to store sales, the product sales dataset was also in a wide format. Using pivot_longer, sales across months were reshaped into a single column, resulting in a tidy tibble with:

Product.Name, Region, Month, Sales_Amount. Key Code Snippet for Product Data Tidying

fin_df <- product_df %>%
  pivot_longer(
    cols = c("Jan.Sales","Feb.Sales","Mar.Sales","Apr.Sales","May.Sales","Jun.Sales"),
    names_to = "Month",
    values_to = "Sales_Amount"
  )
print(fin_df)

## # A tibble: 54 × 4
##    Product.Name Region Month     Sales_Amount
##    <chr>        <chr>  <chr>            <int>
##  1 Product A    North  Jan.Sales          100
##  2 Product A    North  Feb.Sales          110
##  3 Product A    North  Mar.Sales          120
##  4 Product A    North  Apr.Sales          130
##  5 Product A    North  May.Sales          140
##  6 Product A    North  Jun.Sales          150
##  7 Product A    South  Jan.Sales          200
##  8 Product A    South  Feb.Sales          210
##  9 Product A    South  Mar.Sales          220
## 10 Product A    South  Apr.Sales          230
## # ℹ 44 more rows

Transforming Emissions Data The emissions dataset initially had yearly emissions data spread across multiple columns. The transformation process involved:

Reshaping yearly emissions into a single Emission Year column. Storing emission values in a new column: Element Emission Value. Key Code Snippet for Emissions Data Tidying

final_frame <- df %>%
  pivot_longer(
    cols = c("X2000","X2001","X2002","X2003","X2004","X2005","X2006","X2007","X2008"),
    names_to = "Emission Year",
    values_to = "Element Emission Value"
  )
print(final_frame)

## # A tibble: 99 × 6
##    Area        Item         Element Unit  `Emission Year` Element Emission Val…¹
##    <chr>       <chr>        <chr>   <chr> <chr>                            <dbl>
##  1 Afghanistan Crop Residu… Direct… kilo… X2000                            0.52 
##  2 Afghanistan Crop Residu… Direct… kilo… X2001                            0.527
##  3 Afghanistan Crop Residu… Direct… kilo… X2002                            0.82 
##  4 Afghanistan Crop Residu… Direct… kilo… X2003                            0.999
##  5 Afghanistan Crop Residu… Direct… kilo… X2004                            0.822
##  6 Afghanistan Crop Residu… Direct… kilo… X2005                            1.18 
##  7 Afghanistan Crop Residu… Direct… kilo… X2006                            1.03 
##  8 Afghanistan Crop Residu… Direct… kilo… X2007                            1.24 
##  9 Afghanistan Crop Residu… Direct… kilo… X2008                            0.887
## 10 Afghanistan Crop Residu… Indire… kilo… X2000                            0.117
## # ℹ 89 more rows
## # ℹ abbreviated name: ¹`Element Emission Value`

Analytics and Insights 1. Store Sales Analysis Key Insights:

The store with the highest total sales was identified. The best and worst-performing months in terms of sales were determined. A visualization of sales trends across months was generated. Code Snippet for Store Sales Analysis

best_selling_store <- fina_df %>%
  group_by(Store) %>%
  summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
  slice_max(Total_Sales, n = 1)
  
# Visualization: Sales by Store
ggplot(fina_df, aes(x = Store, y = Sales_Amount, fill = Store)) +
  geom_bar(stat = "identity") +
  theme_minimal()

Product Sales Analysis Key Insights:

The best-selling product across regions was identified. The worst-selling product was also highlighted. A comparison of average sales across different regions was performed. Code Snippet for Product Sales Analysis

total_sales_product <- fin_df %>%
  group_by(Product.Name) %>%
  summarise(Total_Sales = sum(Sales_Amount, na.rm = TRUE)) %>%
  arrange(desc(Total_Sales))
# Visualization: Sales by Product
ggplot(total_sales_product, aes(x = Product.Name, y = Total_Sales, fill = Product.Name)) +
  geom_bar(stat = "identity") +
  theme_minimal()

Emissions Data Analysis Key Insights:

Identified the areas with the highest average exposure for each chemical. Determined areas with the highest Indirect/Direct emissions ratio. Code Snippet for Emissions Analysis

highest_exposure <- avg_exposure %>%
  group_by(Element) %>%
  filter(Average_Emission == max(Average_Emission))
# Visualization: Emissions Exposure by Area
ggplot(highest_exposure, aes(x = Average_Emission, y = Area, fill = Element)) +
  geom_bar(stat = "identity") +
  theme_minimal()

Conclusion The process of tidying data was essential to enable meaningful analysis. Without the restructuring of these datasets:

Store and product sales data would be difficult to summarize due to their wide format. Emissions data would not allow for trend analysis over time. By leveraging tidyr and dplyr, I transformed the data into a long format, allowing for easy filtering, grouping, and visualization. This structured approach enables deeper insights, improving decision-making and analytical efficiency.

project2

Ali Ahmed

2025-03-10

Data Tidying and Analysis of Product Sales, Store Sales, and Emissions Data

Code Assets

Store Sales Analysis Code

Product Sales Analysis Code

Emissions Data Analysis Code