Library

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load Dataset #1

dataset1 <- read.csv("https://raw.githubusercontent.com/JaydeeJan/Data-607-Project-2/refs/heads/main/products%20across%20different%20regions.csv")

print(dataset1)

##   Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 1    Product A North        100       110       120       130       140
## 2    Product A  South       200       210       220       230       240
## 3    Product A   East       300       310       320       330       340
## 4    Product B North        150       160       170       180       190
## 5    Product B  South       250       260       270       280       290
## 6    Product B   East       350       360       370       380       390
## 7    Product C North         50        55        60        65        70
## 8    Product C  South       100       105       110       115       120
## 9    Product C   East       150       155       160       165       170
##   Jun.Sales
## 1       150
## 2       250
## 3       350
## 4       200
## 5       300
## 6       400
## 7        75
## 8       125
## 9       175

Tidy and Transform the Data

# converting the data from wide format to long format for easier analysis
long_data1 <- dataset1 %>%
  pivot_longer(cols = starts_with("Jan") :starts_with("Jun"),
               names_to = "Month",
               values_to = "Sales")

head(long_data1)

## # A tibble: 6 × 4
##   Product.Name Region   Month     Sales
##   <chr>        <chr>    <chr>     <int>
## 1 Product A    "North " Jan.Sales   100
## 2 Product A    "North " Feb.Sales   110
## 3 Product A    "North " Mar.Sales   120
## 4 Product A    "North " Apr.Sales   130
## 5 Product A    "North " May.Sales   140
## 6 Product A    "North " Jun.Sales   150

Analysis

# calculating total sales per product and per region
total_sales_product <- long_data1 %>%
  group_by(Product.Name) %>%
  summarise(Total_Sales = sum(Sales))

total_sales_region <- long_data1 %>%
  group_by(Region) %>%
  summarise(Total_Sales = sum(Sales))

print(total_sales_product)

## # A tibble: 3 × 2
##   Product.Name Total_Sales
##   <chr>              <int>
## 1 Product A           4050
## 2 Product B           4950
## 3 Product C           2025

print(total_sales_region)

## # A tibble: 3 × 2
##   Region   Total_Sales
##   <chr>          <int>
## 1 "East"          5175
## 2 "North "        2175
## 3 "South"         3675

Conclusion

The analysis shows the total sales for each product and region. Product B has the highest total sales, and the East region has the highest total sales across all products.

Load Dataset #2

dataset2 <- read.csv("https://raw.githubusercontent.com/JaydeeJan/Data-607-Project-2/refs/heads/main/rolling_stone.csv")

head(dataset2)

##        sort_name     clean_name                      album rank_2003 rank_2012
## 1 Sinatra, Frank  Frank Sinatra     In the Wee Small Hours       100       101
## 2    Diddley, Bo     Bo Diddley Bo Diddley / Go Bo Diddley       214       216
## 3 Presley, Elvis  Elvis Presley              Elvis Presley        55        56
## 4 Sinatra, Frank  Frank Sinatra Songs for Swingin' Lovers!       306       308
## 5 Little Richard Little Richard      Here's Little Richard        50        50
## 6        Beyonce        Beyonce                   Lemonade        NA        NA
##   rank_2020 differential release_year                       genre   type
## 1       282         -182         1955               Big Band/Jazz Studio
## 2       455         -241         1955 Rock n' Roll/Rhythm & Blues Studio
## 3       332         -277         1956 Rock n' Roll/Rhythm & Blues Studio
## 4        NA         -195         1956               Big Band/Jazz Studio
## 5       227         -177         1957                             Studio
## 6        32          469         2016                             Studio
##   weeks_on_billboard peak_billboard_position spotify_popularity
## 1                 14                       2                 48
## 2                 NA                     201                 50
## 3                100                       1                 58
## 4                 NA                       2                 62
## 5                  5                      13                 64
## 6                 87                       1                 73
##                            spotify_url artist_member_count artist_gender
## 1 spotify:album:3GmwKB1tgPZgXeRJZSm9WX                   1          Male
## 2 spotify:album:1cbtDEwxCjMhglb49OgNBR                   1          Male
## 3 spotify:album:7GXP5OhYyPVLmcVfO9Iqin                   1          Male
## 4 spotify:album:4kca7vXd1Wo5GE2DMafvMc                   1          Male
## 5 spotify:album:18tV6PLXYvVjsdOVk0S7M8                   1          Male
## 6 spotify:album:7dK54iZuOxXFarGhXwEXfF                   1        Female
##   artist_birth_year_sum debut_album_release_year ave_age_at_top_500
## 1                  1915                     1946                 40
## 2                  1928                     1955                 27
## 3                  1935                     1956                 21
## 4                  1915                     1946                 41
## 5                  1932                     1957                 25
## 6                  1981                     2003                 35
##   years_between               album_id
## 1             9 3GmwKB1tgPZgXeRJZSm9WX
## 2             0 1cbtDEwxCjMhglb49OgNBR
## 3             0 7GXP5OhYyPVLmcVfO9Iqin
## 4            10 4kca7vXd1Wo5GE2DMafvMc
## 5             0 18tV6PLXYvVjsdOVk0S7M8
## 6            13 7dK54iZuOxXFarGhXwEXfF

Tidy and Transform the Data

dataset2 <- dataset2 %>%
  select(album, rank_2003, rank_2012, rank_2020)

long_data2 <- dataset2 %>%
  pivot_longer(
    cols = starts_with("rank_"),
    names_to = "Rank_Year",
    values_to = "Rank")

head(long_data2)

## # A tibble: 6 × 3
##   album                      Rank_Year  Rank
##   <chr>                      <chr>     <int>
## 1 In the Wee Small Hours     rank_2003   100
## 2 In the Wee Small Hours     rank_2012   101
## 3 In the Wee Small Hours     rank_2020   282
## 4 Bo Diddley / Go Bo Diddley rank_2003   214
## 5 Bo Diddley / Go Bo Diddley rank_2012   216
## 6 Bo Diddley / Go Bo Diddley rank_2020   455

Analysis

# Calculate the average ranking for each album
average_ranking <- long_data2 %>%
  group_by(album) %>%
  summarise(average_rank = mean(Rank, na.rm = TRUE)) %>%
  arrange(average_rank)

head(average_ranking, 10)

## # A tibble: 10 × 2
##    album                                 average_rank
##    <chr>                                        <dbl>
##  1 Pet Sounds                                    2   
##  2 What's Going On                               4.33
##  3 Revolver                                      5.67
##  4 Highway 61 Revisited                          8.67
##  5 Sgt. Pepper's Lonely Hearts Club Band         8.67
##  6 Exile on Main St.                             9.33
##  7 London Calling                               10.7 
##  8 Abbey Road                                   11   
##  9 Nevermind                                    13.3 
## 10 Blood on the Tracks                          13.7

Conclusion

The analysis shows the top 10 albums for average ranking over 2003, 2012, and 2020.

Load Dataset #3

dataset3 <- read.csv("https://raw.githubusercontent.com/JaydeeJan/Data-607-Project-2/refs/heads/main/Sales%20Data.csv")

head(dataset3)

##   Score.ID        Store Country       January      February         March
## 1        1    Palisades      US  $371,700.00   $435,950.00   $372,460.00 
## 2        2     Billings      US   $97,530.00   $324,140.00   $454,480.00 
## 3        3    Laguardia      US  $346,130.00   $157,510.00   $288,990.00 
## 4        4 Cheeseburger      US  $442,010.00   $212,390.00   $183,580.00 
## 5        5      Detroit      US   $33,250.00    $36,840.00   $320,170.00 
## 6        6        Towns      US   $16,632.00    $25,372.00    $38,178.00 
##           April           May          June          July        August
## 1  $192,260.00   $157,550.00   $332,550.00    $89,630.00   $372,090.00 
## 2   $36,810.00   $219,790.00   $210,970.00    $84,840.00   $175,440.00 
## 3  $358,190.00    $96,860.00   $461,950.00    $80,440.00   $404,990.00 
## 4  $308,650.00   $184,340.00   $156,540.00   $328,180.00   $281,430.00 
## 5  $242,650.00   $350,300.00   $421,980.00   $307,190.00    $16,900.00 
## 6   $13,222.00    $16,031.00    $17,162.00    $22,541.00    $16,700.00 
##       September       October      November
## 1  $421,670.00   $173,010.00   $173,220.00 
## 2  $283,710.00   $275,320.00   $401,940.00 
## 3  $450,830.00   $327,270.00   $370,100.00 
## 4  $498,150.00   $473,150.00    $23,740.00 
## 5  $443,990.00   $346,230.00   $312,670.00 
## 6   $10,475.00     $1,897.00    $10,493.00

Tidy and Transform the Data

long_data3 <- dataset3 %>%
  pivot_longer(cols = January:November, names_to = "Month", values_to = "Sales") %>%
  mutate(Sales = as.numeric(gsub("[\\$,]", "", Sales)))

head(long_data3)

## # A tibble: 6 × 5
##   Score.ID Store     Country Month     Sales
##      <int> <chr>     <chr>   <chr>     <dbl>
## 1        1 Palisades US      January  371700
## 2        1 Palisades US      February 435950
## 3        1 Palisades US      March    372460
## 4        1 Palisades US      April    192260
## 5        1 Palisades US      May      157550
## 6        1 Palisades US      June     332550

Analysis

# Calculate total sales per store
total_sales_per_store <- long_data3 %>%
  group_by(Store) %>%
  summarise(Total_Sales = sum(Sales)) %>%
  arrange(desc(Total_Sales))

print(total_sales_per_store)

## # A tibble: 21 × 2
##    Store        Total_Sales
##    <chr>              <dbl>
##  1 Laguardia        3343260
##  2 Cheeseburger     3092160
##  3 Palisades        3092090
##  4 Detroit          2832170
##  5 Billings         2564970
##  6 Camembert         688990
##  7 Maple             362989
##  8 Paolo             330761
##  9 Innit,            315480
## 10 Steve             308982
## # ℹ 11 more rows

Conclusion

The analysis shows the 5 stores in US market has the highest total sales amount comparing to other regions.

Data 607 Project 2

Jayden Jiang

2025-03-10

Library

Load Dataset #1

Tidy and Transform the Data

Analysis

Conclusion

Load Dataset #2

Tidy and Transform the Data

Analysis

Conclusion

Load Dataset #3

Tidy and Transform the Data

Analysis

Conclusion