library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dataset1 <- read.csv("https://raw.githubusercontent.com/JaydeeJan/Data-607-Project-2/refs/heads/main/products%20across%20different%20regions.csv")
print(dataset1)
## Product.Name Region Jan.Sales Feb.Sales Mar.Sales Apr.Sales May.Sales
## 1 Product A North 100 110 120 130 140
## 2 Product A South 200 210 220 230 240
## 3 Product A East 300 310 320 330 340
## 4 Product B North 150 160 170 180 190
## 5 Product B South 250 260 270 280 290
## 6 Product B East 350 360 370 380 390
## 7 Product C North 50 55 60 65 70
## 8 Product C South 100 105 110 115 120
## 9 Product C East 150 155 160 165 170
## Jun.Sales
## 1 150
## 2 250
## 3 350
## 4 200
## 5 300
## 6 400
## 7 75
## 8 125
## 9 175
# converting the data from wide format to long format for easier analysis
long_data1 <- dataset1 %>%
pivot_longer(cols = starts_with("Jan") :starts_with("Jun"),
names_to = "Month",
values_to = "Sales")
head(long_data1)
## # A tibble: 6 × 4
## Product.Name Region Month Sales
## <chr> <chr> <chr> <int>
## 1 Product A "North " Jan.Sales 100
## 2 Product A "North " Feb.Sales 110
## 3 Product A "North " Mar.Sales 120
## 4 Product A "North " Apr.Sales 130
## 5 Product A "North " May.Sales 140
## 6 Product A "North " Jun.Sales 150
# calculating total sales per product and per region
total_sales_product <- long_data1 %>%
group_by(Product.Name) %>%
summarise(Total_Sales = sum(Sales))
total_sales_region <- long_data1 %>%
group_by(Region) %>%
summarise(Total_Sales = sum(Sales))
print(total_sales_product)
## # A tibble: 3 × 2
## Product.Name Total_Sales
## <chr> <int>
## 1 Product A 4050
## 2 Product B 4950
## 3 Product C 2025
print(total_sales_region)
## # A tibble: 3 × 2
## Region Total_Sales
## <chr> <int>
## 1 "East" 5175
## 2 "North " 2175
## 3 "South" 3675
The analysis shows the total sales for each product and region. Product B has the highest total sales, and the East region has the highest total sales across all products.
dataset2 <- read.csv("https://raw.githubusercontent.com/JaydeeJan/Data-607-Project-2/refs/heads/main/rolling_stone.csv")
head(dataset2)
## sort_name clean_name album rank_2003 rank_2012
## 1 Sinatra, Frank Frank Sinatra In the Wee Small Hours 100 101
## 2 Diddley, Bo Bo Diddley Bo Diddley / Go Bo Diddley 214 216
## 3 Presley, Elvis Elvis Presley Elvis Presley 55 56
## 4 Sinatra, Frank Frank Sinatra Songs for Swingin' Lovers! 306 308
## 5 Little Richard Little Richard Here's Little Richard 50 50
## 6 Beyonce Beyonce Lemonade NA NA
## rank_2020 differential release_year genre type
## 1 282 -182 1955 Big Band/Jazz Studio
## 2 455 -241 1955 Rock n' Roll/Rhythm & Blues Studio
## 3 332 -277 1956 Rock n' Roll/Rhythm & Blues Studio
## 4 NA -195 1956 Big Band/Jazz Studio
## 5 227 -177 1957 Studio
## 6 32 469 2016 Studio
## weeks_on_billboard peak_billboard_position spotify_popularity
## 1 14 2 48
## 2 NA 201 50
## 3 100 1 58
## 4 NA 2 62
## 5 5 13 64
## 6 87 1 73
## spotify_url artist_member_count artist_gender
## 1 spotify:album:3GmwKB1tgPZgXeRJZSm9WX 1 Male
## 2 spotify:album:1cbtDEwxCjMhglb49OgNBR 1 Male
## 3 spotify:album:7GXP5OhYyPVLmcVfO9Iqin 1 Male
## 4 spotify:album:4kca7vXd1Wo5GE2DMafvMc 1 Male
## 5 spotify:album:18tV6PLXYvVjsdOVk0S7M8 1 Male
## 6 spotify:album:7dK54iZuOxXFarGhXwEXfF 1 Female
## artist_birth_year_sum debut_album_release_year ave_age_at_top_500
## 1 1915 1946 40
## 2 1928 1955 27
## 3 1935 1956 21
## 4 1915 1946 41
## 5 1932 1957 25
## 6 1981 2003 35
## years_between album_id
## 1 9 3GmwKB1tgPZgXeRJZSm9WX
## 2 0 1cbtDEwxCjMhglb49OgNBR
## 3 0 7GXP5OhYyPVLmcVfO9Iqin
## 4 10 4kca7vXd1Wo5GE2DMafvMc
## 5 0 18tV6PLXYvVjsdOVk0S7M8
## 6 13 7dK54iZuOxXFarGhXwEXfF
dataset2 <- dataset2 %>%
select(album, rank_2003, rank_2012, rank_2020)
long_data2 <- dataset2 %>%
pivot_longer(
cols = starts_with("rank_"),
names_to = "Rank_Year",
values_to = "Rank")
head(long_data2)
## # A tibble: 6 × 3
## album Rank_Year Rank
## <chr> <chr> <int>
## 1 In the Wee Small Hours rank_2003 100
## 2 In the Wee Small Hours rank_2012 101
## 3 In the Wee Small Hours rank_2020 282
## 4 Bo Diddley / Go Bo Diddley rank_2003 214
## 5 Bo Diddley / Go Bo Diddley rank_2012 216
## 6 Bo Diddley / Go Bo Diddley rank_2020 455
# Calculate the average ranking for each album
average_ranking <- long_data2 %>%
group_by(album) %>%
summarise(average_rank = mean(Rank, na.rm = TRUE)) %>%
arrange(average_rank)
head(average_ranking, 10)
## # A tibble: 10 × 2
## album average_rank
## <chr> <dbl>
## 1 Pet Sounds 2
## 2 What's Going On 4.33
## 3 Revolver 5.67
## 4 Highway 61 Revisited 8.67
## 5 Sgt. Pepper's Lonely Hearts Club Band 8.67
## 6 Exile on Main St. 9.33
## 7 London Calling 10.7
## 8 Abbey Road 11
## 9 Nevermind 13.3
## 10 Blood on the Tracks 13.7
The analysis shows the top 10 albums for average ranking over 2003, 2012, and 2020.
dataset3 <- read.csv("https://raw.githubusercontent.com/JaydeeJan/Data-607-Project-2/refs/heads/main/Sales%20Data.csv")
head(dataset3)
## Score.ID Store Country January February March
## 1 1 Palisades US $371,700.00 $435,950.00 $372,460.00
## 2 2 Billings US $97,530.00 $324,140.00 $454,480.00
## 3 3 Laguardia US $346,130.00 $157,510.00 $288,990.00
## 4 4 Cheeseburger US $442,010.00 $212,390.00 $183,580.00
## 5 5 Detroit US $33,250.00 $36,840.00 $320,170.00
## 6 6 Towns US $16,632.00 $25,372.00 $38,178.00
## April May June July August
## 1 $192,260.00 $157,550.00 $332,550.00 $89,630.00 $372,090.00
## 2 $36,810.00 $219,790.00 $210,970.00 $84,840.00 $175,440.00
## 3 $358,190.00 $96,860.00 $461,950.00 $80,440.00 $404,990.00
## 4 $308,650.00 $184,340.00 $156,540.00 $328,180.00 $281,430.00
## 5 $242,650.00 $350,300.00 $421,980.00 $307,190.00 $16,900.00
## 6 $13,222.00 $16,031.00 $17,162.00 $22,541.00 $16,700.00
## September October November
## 1 $421,670.00 $173,010.00 $173,220.00
## 2 $283,710.00 $275,320.00 $401,940.00
## 3 $450,830.00 $327,270.00 $370,100.00
## 4 $498,150.00 $473,150.00 $23,740.00
## 5 $443,990.00 $346,230.00 $312,670.00
## 6 $10,475.00 $1,897.00 $10,493.00
long_data3 <- dataset3 %>%
pivot_longer(cols = January:November, names_to = "Month", values_to = "Sales") %>%
mutate(Sales = as.numeric(gsub("[\\$,]", "", Sales)))
head(long_data3)
## # A tibble: 6 × 5
## Score.ID Store Country Month Sales
## <int> <chr> <chr> <chr> <dbl>
## 1 1 Palisades US January 371700
## 2 1 Palisades US February 435950
## 3 1 Palisades US March 372460
## 4 1 Palisades US April 192260
## 5 1 Palisades US May 157550
## 6 1 Palisades US June 332550
# Calculate total sales per store
total_sales_per_store <- long_data3 %>%
group_by(Store) %>%
summarise(Total_Sales = sum(Sales)) %>%
arrange(desc(Total_Sales))
print(total_sales_per_store)
## # A tibble: 21 × 2
## Store Total_Sales
## <chr> <dbl>
## 1 Laguardia 3343260
## 2 Cheeseburger 3092160
## 3 Palisades 3092090
## 4 Detroit 2832170
## 5 Billings 2564970
## 6 Camembert 688990
## 7 Maple 362989
## 8 Paolo 330761
## 9 Innit, 315480
## 10 Steve 308982
## # ℹ 11 more rows
The analysis shows the 5 stores in US market has the highest total sales amount comparing to other regions.