library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
#1. Loading the dataset
library(readr)
# Location of the dataset archive. Defaults to the original path; set the
# ONLINE_RETAIL_PATH environment variable to run the analysis on another
# machine without editing this file.
data_path <- Sys.getenv(
  "ONLINE_RETAIL_PATH",
  unset = "C:/Users/HP/OneDrive/Desktop/CA 3 Project of R/online retail.zip"
)
# Read the comma-delimited data into the tibble used by every later step.
online_retail <- read_csv(data_path)
## Rows: 1000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): category_name, product_name, payment_method, city, gender
## dbl (7): customer_id, product_id, category_id, quantity, price, review_scor...
## date (1): order_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data viewer only in an interactive session; View() needs a GUI
# data viewer and is not useful when the script is run in batch mode.
if (interactive()) {
  View(online_retail)
}
There are 1,000 rows and 13 columns in the dataset
#2.What is the structure of dataset?
# Compact overview of every column: name, type and first few values.
online_retail %>%
  str()
## spc_tbl_ [1,000 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ customer_id : num [1:1000] 13542 23188 55098 65208 63872 ...
## $ order_date : Date[1:1000], format: "2024-12-17" "2024-06-01" ...
## $ product_id : num [1:1000] 784 682 684 204 202 829 706 405 549 545 ...
## $ category_id : num [1:1000] 10 50 50 40 20 10 10 50 30 30 ...
## $ category_name : chr [1:1000] "Electronics" "Sports & Outdoors" "Sports & Outdoors" "Books & Stationery" ...
## $ product_name : chr [1:1000] "Smartphone" "Soccer Ball" "Tent" "Story Book" ...
## $ quantity : num [1:1000] 2 5 5 2 4 4 5 2 3 4 ...
## $ price : num [1:1000] 373 299 23 230 177 ...
## $ payment_method: chr [1:1000] "Credit Card" "Credit Card" "Credit Card" "Bank Transfer" ...
## $ city : chr [1:1000] "New Oliviaberg" "Port Matthew" "West Sarah" "Hernandezburgh" ...
## $ review_score : num [1:1000] 1 NA 5 5 1 5 NA NA 3 5 ...
## $ gender : chr [1:1000] "F" "M" "F" "M" ...
## $ age : num [1:1000] 56 59 64 34 33 21 57 60 69 34 ...
## - attr(*, "spec")=
## .. cols(
## .. customer_id = col_double(),
## .. order_date = col_date(format = ""),
## .. product_id = col_double(),
## .. category_id = col_double(),
## .. category_name = col_character(),
## .. product_name = col_character(),
## .. quantity = col_double(),
## .. price = col_double(),
## .. payment_method = col_character(),
## .. city = col_character(),
## .. review_score = col_double(),
## .. gender = col_character(),
## .. age = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
This dataset contains detailed transactional data from an e-commerce platform, with 1,000 rows and 13 columns.
#3.Are there any missing values?
# Number of missing values in each column.
online_retail %>%
  is.na() %>%
  colSums()
## customer_id order_date product_id category_id category_name
## 0 0 0 0 0
## product_name quantity price payment_method city
## 0 0 0 0 0
## review_score gender age
## 201 103 0
Yes, there are missing values: 201 in review_score and 103 in gender.
#4.How to Remove Missing values?
# Drop every row that is missing a review score or a gender, then re-count
# the missing values per column to confirm none remain.
online_retail <- drop_na(online_retail, review_score, gender)
online_retail %>%
  is.na() %>%
  colSums()
## customer_id order_date product_id category_id category_name
## 0 0 0 0 0
## product_name quantity price payment_method city
## 0 0 0 0 0
## review_score gender age
## 0 0 0
Now, there are no missing values in the dataset.
#5.Are there any duplicate values?
# Count rows that are exact duplicates of an earlier row.
online_retail %>%
  duplicated() %>%
  sum()
## [1] 0
There are no duplicate rows in the dataset.
#6.What is the distribution of transactions by City?
# Frequency table: number of transactions recorded for each city.
table(online_retail[["city"]])
##
## Alexanderborough Allisonland Alvaradoborough
## 1 1 1
## Alvaradobury Amandaborough Amandamouth
## 1 1 1
## Amyhaven Andersonchester Andersonfort
## 1 1 1
## Andersonshire Andersonside Andersonview
## 1 1 1
## Andreatown Andrewmouth Annaborough
## 1 1 1
## Anthonyborough Anthonybury Anthonyport
## 1 1 1
## Anthonyside Anthonyville Aprilton
## 1 1 1
## Armstrongberg Ashleychester Ashleyfort
## 1 1 1
## Ashleyside Audreystad Austinbury
## 1 1 1
## Bakerburgh Bakerside Barberside
## 1 1 1
## Barryton Bateshaven Beckerview
## 1 1 1
## Benjaminburgh Blackchester Blakeshire
## 1 1 1
## Bowmanmouth Bradleyview Bradleyville
## 1 1 1
## Brandichester Brandonbury Brandyborough
## 1 1 1
## Brendaton Brentbury Brianburgh
## 1 1 1
## Brianfort Brooksfort Brookshaven
## 1 1 1
## Brownbury Brownchester Brownport
## 2 1 1
## Brownshire Butlerton Cabrerashire
## 1 1 1
## Calebmouth Cameronside Cameronstad
## 1 1 1
## Cantumouth Carlville Cartermouth
## 1 1 1
## Cassandramouth Cassandraville Cervantesland
## 1 1 1
## Charlesview Chenmouth Christopherburgh
## 1 1 1
## Christopherport Christophertown Cindyberg
## 1 1 1
## Clayfort Codyburgh Colemanchester
## 1 1 1
## Colemanmouth Colemouth Connerburgh
## 1 1 1
## Connerstad Conniebury Coreyfort
## 1 1 1
## Coxfurt Craigmouth Crawfordport
## 1 1 1
## Crosbyfort Cynthiaburgh Cynthiaport
## 1 1 1
## Danielstad Danielton Davidborough
## 1 1 1
## Davidbury Davidton Davistown
## 1 2 1
## Deborahview Dennisburgh Derekside
## 1 1 1
## Devonmouth Donaldhaven Donaldside
## 1 1 1
## Douglasport Douglastown Dukeborough
## 1 1 1
## East Alexandriashire East Alyssa East Amandaside
## 1 1 1
## East Ana East Annettemouth East Anthony
## 1 1 1
## East Blakeborough East Brittany East Bryanport
## 1 1 1
## East Christopher East Christopherfurt East Codyport
## 1 1 1
## East David East Dawn East Donna
## 1 1 1
## East Ericview East Erikton East Frankhaven
## 1 1 1
## East Hannahhaven East Jason East Jennifershire
## 1 1 1
## East Johnton East Joseph East Justin
## 1 1 1
## East Kevinchester East Kimberlyshire East Lisaview
## 1 1 1
## East Lucas East Mariebury East Martin
## 1 1 1
## East Maryland East Marymouth East Matthew
## 1 1 1
## East Michaelborough East Moniquefort East Nicole
## 1 1 1
## East Robert East Robynmouth East Russellville
## 2 1 1
## East Sarah East Tonyaberg East Veronicaville
## 1 1 1
## East William Eddiefurt Edwardview
## 1 1 1
## Elizabethmouth Elliottfort Elliottmouth
## 1 1 1
## Ericfurt Ericside Ericville
## 1 1 1
## Fernandezside Fishertown Floydchester
## 1 1 1
## Frankbury Frazierbury Freemanport
## 1 1 1
## Fritzchester Fryeberg Fullerland
## 1 1 1
## Garciamouth Garciaville Gardnerberg
## 1 1 1
## Garrisonberg Garzachester Georgestad
## 1 1 1
## Georgeville Gonzalezshire Grahamhaven
## 1 1 1
## Graystad Greenborough Gregoryville
## 1 1 1
## Griffintown Gutierreztown Hardingfort
## 1 1 1
## Hawkinschester Henryborough Henrystad
## 1 2 1
## Hernandezburgh Hernandezfort Hernandezmouth
## 1 1 1
## Hickmanside Hobbston Huffmanshire
## 1 1 1
## Hughesfort Hughesmouth Huntberg
## 1 1 1
## Huntershire Jackmouth Jackshire
## 1 1 1
## Jaclynview Jacobburgh Jacobfurt
## 1 1 1
## Jacobsside Jacobville Jamesfort
## 1 1 1
## Jamesmouth Jamesside Jamesview
## 1 1 1
## Jamiefort Jasonbury Jasonfort
## 1 1 1
## Jasontown Jaychester Jefferyborough
## 1 1 1
## Jefferyshire Jeffreyview Jeffreyville
## 1 1 1
## Jenkinshaven Jennachester Jennifermouth
## 1 1 1
## Jenniferville Jensenstad Jesseburgh
## 1 1 1
## Jessicastad Joetown Johnfort
## 1 1 1
## Johnhaven Johnland Johnmouth
## 1 1 1
## Johnnyton Johnsonborough Jonathanport
## 1 1 1
## Jonathanstad Jonesport Jonesville
## 1 1 1
## Jordanmouth Josephton Joshuabury
## 1 1 1
## Joshuafort Joshuamouth Joshuastad
## 1 2 1
## Joshuaton Julieborough Justinport
## 1 1 1
## Karenside Karenstad Kathleenton
## 1 1 1
## Kathleenview Kathrynton Kathyton
## 1 1 1
## Kellymouth Kelseyton Kennethstad
## 1 1 1
## Kevinland Kevinville Kiddmouth
## 1 1 1
## Kimberlymouth Kristenberg Kristenland
## 1 1 1
## Kruegerton Kurtfurt Kurtmouth
## 1 1 1
## Lake Alexandriachester Lake Alison Lake Annburgh
## 1 1 1
## Lake Bethmouth Lake Brent Lake Brentfurt
## 1 1 1
## Lake Brianbury Lake Brittanyberg Lake Brittanyburgh
## 2 1 1
## Lake Carla Lake Catherine Lake Charlestown
## 1 1 1
## Lake Cody Lake Cynthiastad Lake Ellen
## 1 1 1
## Lake Emily Lake Emilymouth Lake Ian
## 1 1 1
## Lake Jasmineport Lake Jason Lake Jennifer
## 1 2 1
## Lake Jessefort Lake Jillton Lake Joanna
## 1 1 1
## Lake Joseph Lake Kelly Lake Kristy
## 1 1 1
## Lake Linda Lake Lorichester Lake Mallory
## 1 1 1
## Lake Michael Lake Michaelfort Lake Michaelhaven
## 2 1 1
## Lake Nicholas Lake Paulastad Lake Rachaelside
## 1 1 1
## Lake Ryanland Lake Sara Lake Sarah
## 1 1 1
## Lake Scottfurt Lake Sharonstad Lake Spencer
## 1 1 1
## Lake Teresafurt Lake Tiffanyfurt Lake Tinahaven
## 1 1 1
## Lake Toddside Lake Tracy Lake Triciaburgh
## 1 1 1
## Lake Victoriafort Lake Vincentview Lake Williamfort
## 1 1 1
## Larsonburgh Lauraberg Lauraburgh
## 1 1 1
## Laurietown Lesliemouth Levyport
## 1 1 1
## Lewisburgh Lewisfort Lisafurt
## 2 1 1
## Luketon Lynnland Maddenview
## 1 1 1
## Marcusville Mariaborough Marieborough
## 1 1 1
## Markborough Markchester Markton
## 1 1 1
## Martinburgh Martinbury Martineztown
## 1 1 1
## Martinside Matthewmouth Maxwellhaven
## 1 1 1
## Mayland Mccallhaven Mcdanielchester
## 1 1 1
## Mcdonaldmouth Mcintyreburgh Melanieberg
## 1 1 1
## Melanieborough Melanieton Mendezburgh
## 1 1 1
## Mendozatown Meyerchester Michaelchester
## 1 1 2
## Michaelside Michaelstad Michellebury
## 1 1 1
## Michellefort Millerborough Millermouth
## 1 1 1
## Mitchellfort Mitchellhaven Monicaland
## 1 1 1
## Moniquemouth Moraleshaven Morrisburgh
## 1 1 1
## Morrisport Myershaven Nancytown
## 1 1 1
## Navarrostad New Alicia New Amy
## 1 1 2
## New Amyfurt New Brandon New Brian
## 1 1 1
## New Brooke New Carol New Christopherside
## 1 1 1
## New Danielberg New David New Edwardville
## 1 2 1
## New Felicia New Jeremy New Jonathan
## 1 1 1
## New Jose New Josephland New Joshua
## 1 1 1
## New Kelly New Kimberly New Kristen
## 1 1 1
## New Kurtmouth New Kyle New Lauramouth
## 1 1 1
## New Louis New Melissa New Michaeltown
## 1 1 2
## New Michealtown New Mindymouth New Miranda
## 1 1 1
## New Natalie New Norma New Oliviaberg
## 1 1 1
## New Patriciaville New Raymond New Rhondaborough
## 1 1 1
## New Ronniebury New Sean New Seth
## 1 1 1
## New Shawn New Steven New Terrancetown
## 1 1 1
## New Terri Nguyenfort Nguyenmouth
## 1 1 1
## Nicolebury Nicoletown North Alexis
## 1 1 1
## North Amandaland North Angela North Bradleymouth
## 1 1 1
## North Carrie North Christinaview North Dana
## 1 1 1
## North Danielburgh North David North Davidchester
## 1 1 1
## North Debraland North Diane North Elizabethbury
## 1 1 1
## North Erikbury North Heatherborough North Jacob
## 1 1 1
## North James North Jamesside North Jason
## 1 1 1
## North Jeannemouth North Jessica North Jimmyborough
## 1 1 1
## North Johnland North Juliaborough North Kathryn
## 1 1 1
## North Kelsey North Mary North Michaelborough
## 1 1 1
## North Michelletown North Stevenborough North Stevenshire
## 1 1 1
## North Thomas North Wendyberg North Whitneytown
## 1 1 1
## North Zacharytown Osborneland Patriciaberg
## 1 1 1
## Patriciaville Pattersonfort Paulabury
## 1 1 1
## Paulton Payneside Penningtonberg
## 1 1 1
## Perezmouth Pettymouth Phelpsstad
## 1 1 1
## Phillipsborough Phillipsport Pierceport
## 1 1 1
## Popeport Popeview Port Adrianhaven
## 1 1 1
## Port Alexanderton Port Allisonfort Port Andre
## 1 1 1
## Port Briana Port Christine Port Christopher
## 1 1 1
## Port Christopherbury Port Craigfort Port Cristianfort
## 1 1 1
## Port Dana Port Danielleview Port Darren
## 1 1 1
## Port Darrylstad Port Dawn Port Denisemouth
## 1 1 1
## Port Derek Port Diana Port Edward
## 1 1 1
## Port Elizabeth Port Emily Port Ginaside
## 1 1 1
## Port Hectorbury Port Jackview Port Jameston
## 1 1 1
## Port Jasmin Port Jasmine Port Jason
## 1 1 1
## Port Jenniferchester Port Jeremyfurt Port Jo
## 1 1 1
## Port Joseph Port Josephview Port Joseton
## 2 1 1
## Port Justinbury Port Kellyview Port Kenneth
## 1 1 1
## Port Laurenfort Port Lindaton Port Lisa
## 1 1 1
## Port Luke Port Marcusmouth Port Markport
## 1 1 1
## Port Marvinmouth Port Matthew Port Matthewfort
## 1 2 1
## Port Maxwell Port Melissa Port Melissaborough
## 1 1 2
## Port Michaelville Port Pamelatown Port Patriciashire
## 1 1 1
## Port Paul Port Richardfurt Port Robertchester
## 1 1 1
## Port Ryanville Port Samanthafort Port Sandra
## 1 1 1
## Port Sarahfort Port Steve Port Tammyberg
## 1 1 1
## Port Taraburgh Port Thomas Port Timothy
## 1 1 1
## Port Timothyshire Port Tony Port Tracystad
## 1 1 1
## Port Vincenttown Port Wandashire Priceburgh
## 1 1 1
## Pughton Ramirezmouth Ramirezton
## 1 1 1
## Ramosport Randallburgh Ravenland
## 1 1 1
## Raymondchester Raymondland Raytown
## 1 1 1
## Rebeccaborough Rebeccahaven Reidchester
## 1 1 1
## Reidville Reynoldsborough Rhodesfurt
## 1 1 1
## Riveraview Robertfort Robertmouth
## 1 1 2
## Robertton Robinhaven Rodgersfurt
## 1 1 1
## Rodgershaven Rodriguezport Romerostad
## 1 1 1
## Roychester Sanchezside Sarahtown
## 1 1 1
## Saunderston Savannahmouth Schroederland
## 1 1 1
## Scottfort Scottmouth Scottville
## 1 1 1
## Shanetown Shawnburgh Shawnview
## 1 1 1
## Sheltonmouth Shortmouth Simmonsview
## 1 1 1
## Simonfort Smithburgh Smithfort
## 1 1 1
## Smithstad Snyderborough South Anthony
## 1 1 1
## South Antonioton South Brianna South Brianstad
## 1 1 1
## South Carolynchester South Carriefurt South Christine
## 1 1 1
## South Christineberg South Christopherburgh South Coltonport
## 1 1 1
## South Douglasland South Edgartown South Elizabeth
## 1 1 2
## South Elizabethport South Huntermouth South Jacquelineside
## 1 1 1
## South Jeffrey South Jeffreystad South Jerry
## 1 1 1
## South Jessica South John South Jonathan
## 1 1 1
## South Katiefurt South Lauramouth South Lisaland
## 1 1 1
## South Mark South Matthew South Melinda
## 1 1 1
## South Melodystad South Michael South Omarport
## 1 2 1
## South Robert South Roger South Sharonburgh
## 1 1 1
## South Susanstad South Taylorchester South Thomasmouth
## 1 1 1
## South Tonya South Veronica South William
## 2 1 1
## Spencerside Stacieberg Stephanieshire
## 1 1 1
## Stevensfurt Stevensonside Susanchester
## 1 1 1
## Suttonfort Tatemouth Taylorview
## 1 1 1
## Teresaville Thomastown Tiffanyport
## 1 1 1
## Tiffanyshire Tiffanystad Timothychester
## 1 1 1
## Timothyland Toddmouth Toddton
## 1 1 1
## Tonyamouth Townsendville Traciburgh
## 1 1 1
## Valerieview Veronicaland Vickimouth
## 1 1 1
## Villanuevaburgh Walkerland Washingtonmouth
## 1 1 1
## Watsonchester Wayneside Webbburgh
## 1 1 1
## Webbfort Webermouth Wendyborough
## 1 1 1
## Wendychester Wesleyborough Wesleytown
## 1 1 1
## West Amanda West Amandamouth West Amyhaven
## 1 1 1
## West Anne West Antonio West April
## 1 1 1
## West Bradleymouth West Carlosberg West Courtneychester
## 1 1 1
## West Crystal West Daniel West Deanna
## 1 1 1
## West Denisemouth West Geraldhaven West Jacob
## 1 1 1
## West James West Jeffreyview West Jerome
## 1 1 1
## West Jerry West Jessica West Jesusville
## 1 1 1
## West Jimmystad West Jonathanmouth West Jonathon
## 1 1 1
## West Joshuahaven West Justin West Kendrafort
## 1 1 1
## West Kristinehaven West Larrymouth West Lisa
## 1 1 1
## West Mariaton West Masonton West Matthewton
## 1 1 1
## West Michellemouth West Molly West Nicholas
## 1 1 1
## West Patriciachester West Rebekahland West Reginald
## 1 1 1
## West Reginaldbury West Richard West Robertmouth
## 1 1 1
## West Rogerberg West Samuelhaven West Sarah
## 1 1 1
## West Stevenburgh West Tamiville West Thomas
## 1 1 1
## West Tyler Whitakerview Whiteborough
## 1 1 1
## Williamsburgh Williamsfort Williamston
## 1 1 1
## Williamton Woodshaven Woodsport
## 1 1 1
## Wrightfort Wrightmouth
## 1 1
Most cities have a single transaction; a few cities have two.
#7.What is the average price of products by category?
# Mean product price within each category.
aggregate(
  price ~ category_name,
  data = online_retail,
  FUN = mean
)
## category_name price
## 1 Books & Stationery 266.3931
## 2 Electronics 270.6829
## 3 Fashion 250.9762
## 4 Home & Living 244.2209
## 5 Sports & Outdoors 253.0276
Electronics have the highest average price, while Home & Living has the lowest.
#8.Which payment mode is preferred by customers?
# Frequency table: number of orders per payment method.
table(online_retail[["payment_method"]])
##
## Bank Transfer Cash on Delivery Credit Card
## 227 274 220
Cash on Delivery is the most common payment method (274 orders), ahead of Bank Transfer (227) and Credit Card (220).
#9.Which product category has the highest sales quantity?
# Total units sold per category, largest first.
online_retail %>%
  group_by(category_name) %>%
  summarise(
    total_quantity = sum(quantity),
    .groups = "drop"
  ) %>%
  arrange(desc(total_quantity))
## # A tibble: 5 × 2
## category_name total_quantity
## <chr> <dbl>
## 1 Electronics 481
## 2 Sports & Outdoors 424
## 3 Fashion 419
## 4 Home & Living 408
## 5 Books & Stationery 361
Electronics products are sold the most.
#10.What is the average review score for each product category?
# Mean review score within each product category.
online_retail %>%
  group_by(category_name) %>%
  summarise(
    avg_review_score = mean(review_score),
    .groups = "drop"
  )
## # A tibble: 5 × 2
## category_name avg_review_score
## <chr> <dbl>
## 1 Books & Stationery 3.93
## 2 Electronics 4.01
## 3 Fashion 3.97
## 4 Home & Living 3.91
## 5 Sports & Outdoors 4.05
The Sports & Outdoors category has the highest average review score.
#11.What is the gender distribution of customers?
# Frequency table: number of records per gender.
table(online_retail[["gender"]])
##
## F M
## 356 365
There are slightly more male customers (365) than female customers (356).
#12.Which city has the highest number of orders?
# Number of orders per city, busiest cities first.
online_retail %>%
  count(city, name = "total_orders", sort = TRUE)
## # A tibble: 701 × 2
## city total_orders
## <chr> <int>
## 1 Brownbury 2
## 2 Davidton 2
## 3 East Robert 2
## 4 Henryborough 2
## 5 Joshuamouth 2
## 6 Lake Brianbury 2
## 7 Lake Jason 2
## 8 Lake Michael 2
## 9 Lewisburgh 2
## 10 Michaelchester 2
## # ℹ 691 more rows
No city has more than two orders: a few cities have two orders each, and the rest have only one.
#13. What is the distribution of review scores?
# Frequency table: number of reviews at each score.
table(online_retail[["review_score"]])
##
## 1 2 3 4 5
## 56 49 94 179 343
Most customers gave a review score of 5, indicating high customer satisfaction.
#14.What is the total revenue generated from each product category?
# Revenue (price x quantity) summed per category, largest first.
online_retail %>%
  group_by(category_name) %>%
  summarise(
    total_revenue = sum(price * quantity),
    .groups = "drop"
  ) %>%
  arrange(desc(total_revenue))
## # A tibble: 5 × 2
## category_name total_revenue
## <chr> <dbl>
## 1 Electronics 128907.
## 2 Sports & Outdoors 104871.
## 3 Fashion 102328.
## 4 Home & Living 101717.
## 5 Books & Stationery 97485.
Highest revenue is generated from electronics category.
#15.Which payment method generates the highest revenue?
# Revenue (price x quantity) summed per payment method, largest first.
online_retail %>%
  group_by(payment_method) %>%
  summarise(
    total_revenue = sum(price * quantity),
    .groups = "drop"
  ) %>%
  arrange(desc(total_revenue))
## # A tibble: 3 × 2
## payment_method total_revenue
## <chr> <dbl>
## 1 Cash on Delivery 205032.
## 2 Bank Transfer 168060.
## 3 Credit Card 162216.
Highest revenue is generated via cash on delivery.
#16.Which customer has placed the highest number of orders?
# Number of orders per customer, most active customers first.
online_retail %>%
  count(customer_id, name = "total_orders", sort = TRUE)
## # A tibble: 721 × 2
## customer_id total_orders
## <dbl> <int>
## 1 10211 1
## 2 10254 1
## 3 10299 1
## 4 10486 1
## 5 10539 1
## 6 10792 1
## 7 10825 1
## 8 10848 1
## 9 11008 1
## 10 11021 1
## # ℹ 711 more rows
Every customer placed exactly one order.
#17.Which product is sold the most?
# Total units sold for each product (listed alphabetically by product name).
aggregate(
  quantity ~ product_name,
  data = online_retail,
  FUN = sum
)
## product_name quantity
## 1 Basketball 63
## 2 Blanket 93
## 3 Carpet 69
## 4 Dress 77
## 5 Eraser 82
## 6 Headphones 89
## 7 Laptop 113
## 8 Notebook 81
## 9 Novel 48
## 10 Painting 75
## 11 Pants 95
## 12 Pen 71
## 13 Pillow 84
## 14 Running Shoes 81
## 15 Shirt 50
## 16 Skirt 85
## 17 Smartphone 112
## 18 Smartwatch 79
## 19 Soccer Ball 96
## 20 Story Book 79
## 21 T-shirt 112
## 22 Tablet 88
## 23 Tent 86
## 24 Vase 87
## 25 Yoga Mat 98
Laptop is the best-selling product (113 units), while Novel sold the least (48 units).
#18.Creating New Column
# Add the line total (units x unit price) as a new column.
online_retail <- mutate(
  online_retail,
  TotalPrice = quantity * price
)
A new column is created named TotalPrice.
#19.Bar Chart: What is the count of orders in each product category?
# Orders per category; count() stores the tally in column `n`.
category_counts <- count(online_retail, category_name)

# Bars ordered from the most to the least ordered category; x labels are
# rotated 45 degrees.
ggplot(category_counts, aes(x = reorder(category_name, -n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Orders per Product Category",
    x = "Category",
    y = "Order Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The bar chart shows that Electronics is the most popular product
category, followed by Sports & Outdoors and Fashion. These
categories have the highest order counts, indicating strong customer
preference. Home & Living and Books & Stationery have slightly
lower order volumes, suggesting moderate demand. This insight can help
prioritize marketing and inventory planning.
#20.What is the gender distribution of customers in the dataset?
# Bar chart of record counts per gender (geom_bar() counts the rows itself).
ggplot(online_retail, aes(x = gender)) +
  geom_bar(fill = "mediumseagreen") +
  labs(
    title = "Customer Gender Distribution",
    x = "Gender",
    y = "Number of Customers"
  ) +
  theme_minimal()
Based on the observation, the bar for “Male” is slightly higher than
that for “Female,” indicating that the dataset has a larger number of
male customers.
#21.Scatter Plot: Is there a relationship between age and review_score?
# Scatter plot of review score against age; semi-transparent points make
# overlapping observations visible.
ggplot(online_retail, aes(x = age, y = review_score)) +
  geom_point(alpha = 0.5, color = "darkgreen") +
  ggtitle("Age vs Review Score") +
  xlab("Age") +
  ylab("Review Score")
The scatter plot of age versus review score shows no clear
correlation between age and review score, suggesting that age does
not significantly influence customer satisfaction. The ratings are
not polarized: as the review-score table above shows, most customers
gave a 4 or 5, and comparatively few gave a 1 or 2. Because the
ratings are skewed toward the high end at every age, this pattern
may indicate that factors other than age influence review scores
more strongly.
#22.Pair Plot: What relationships exist among age, quantity, and review_score?
# Pairwise plot matrix for the three numeric variables of interest.
online_retail %>%
  select(age, quantity, review_score) %>%
  GGally::ggpairs()
The ggpairs plot shows very weak correlations among age, quantity,
and review_score, all close to zero. This indicates no significant
linear relationship between any of the variables. The review scores are
concentrated at the high end (4 and 5), with comparatively few low
ratings. Overall, these variables do not strongly influence each other,
suggesting other factors may impact review behavior.
#23.Histogram: What is the distribution of product prices?
# Histogram of product prices using bins 5 price units wide.
ggplot(online_retail, aes(x = price)) +
  geom_histogram(binwidth = 5, fill = "orange", color = "black") +
  ggtitle("Distribution of Product Prices") +
  xlab("Price") +
  ylab("Count")
The histogram shows that product prices are distributed fairly
evenly across the range, with some noticeable spikes. There is no strong
skew, indicating a wide variety of product pricing. A few price ranges,
especially near ₹500, appear more popular. Overall, the product prices
are well spread out without heavy concentration in a specific
range.
#24.Box Plot: How do review scores differ across gender?
# Box plot of review scores, one box per gender, filled by gender.
ggplot(online_retail, aes(x = gender, y = review_score, fill = gender)) +
  geom_boxplot() +
  ggtitle("Review Score by Gender") +
  xlab("Gender") +
  ylab("Review Score") +
  theme_minimal()
The boxplot shows that both male and female customers generally gave
high review scores, with medians around 4. The spread of scores is
similar for both genders, ranging mostly between 3 and 5. There are no
significant outliers, and the distribution appears symmetric. Overall,
both genders seem equally satisfied based on their review
scores.
#25.How does the quantity of products purchased vary by customer age and review score?
# Box plot 1: overall distribution of quantity purchased.
# NOTE(review): this plot draws a single box and does not use `age`, even
# though the section heading asks about age.
plot1 <- ggplot(online_retail, aes(x = "", y = quantity)) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Distribution of Quantity Purchased",
    x = "",
    y = "Quantity"
  ) +
  theme_minimal()

# Box plot 2: quantity purchased, one box per review score.
plot2 <- ggplot(
  online_retail,
  aes(x = as.factor(review_score), y = quantity)
) +
  geom_boxplot(fill = "salmon") +
  labs(
    title = "Quantity vs Review Score",
    x = "Review Score",
    y = "Quantity"
  ) +
  theme_minimal()

# Show both plots next to each other in a two-column layout.
grid.arrange(plot1, plot2, ncol = 2)
The first box plot shows that the quantity purchased typically
ranges between 2 and 4 units, with a median of 3, suggesting most
customers buy small quantities. The second plot indicates that the
quantity purchased remains relatively consistent across different review
scores, showing no significant relationship between the quantity bought
and the customer satisfaction level. Overall, customer reviews don’t
seem to be influenced by the size of their order.
#26.Is there a correlation between the customer’s age and the quantity of products they purchase?
# Pearson correlation test between customer age and quantity purchased.
# cor.test() has no `use` argument (that belongs to cor()); the value passed
# before was silently absorbed by `...`. cor.test() itself drops incomplete
# pairs, so the result is unchanged.
cor.test(online_retail$age, online_retail$quantity)
##
## Pearson's product-moment correlation
##
## data: online_retail$age and online_retail$quantity
## t = -1.3932, df = 719, p-value = 0.164
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.12443067 0.02120834
## sample estimates:
## cor
## -0.05188704
The Pearson correlation coefficient between age and quantity is approximately -0.052, indicating a very weak negative correlation. The p-value is 0.164, which is greater than 0.05, meaning the result is not statistically significant. Therefore, we cannot conclude that there’s a meaningful relationship between a customer’s age and the number of products they purchase. The confidence interval also includes zero, reinforcing this finding.
#27.Is there a significant correlation between product price and review score?
# Pearson correlation test between product price and review score.
# cor.test() has no `use` argument (that belongs to cor()); the value passed
# before was silently absorbed by `...`. cor.test() itself drops incomplete
# pairs, so the result is unchanged.
cor.test(online_retail$price, online_retail$review_score)
##
## Pearson's product-moment correlation
##
## data: online_retail$price and online_retail$review_score
## t = 2.2243, df = 719, p-value = 0.02644
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.009711257 0.154748654
## sample estimates:
## cor
## 0.08266768
The Pearson correlation coefficient between product price and review score is 0.083, indicating a very weak positive correlation. The p-value is 0.026, which is less than 0.05, suggesting that this relationship is statistically significant. Although the correlation is weak, the result implies that as product price increases, review scores tend to slightly increase. However, the effect size is minimal and may not have strong practical significance.
#28.Can a customer’s age predict the quantity of products they purchase?
# Simple linear regression of quantity on age, followed by the fit summary
# (coefficients, residuals, R-squared, F-test).
model <- lm(
  formula = quantity ~ age,
  data = online_retail
)
summary(model)
##
## Call:
## lm(formula = quantity ~ age, data = online_retail)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02996 -0.99004 0.04544 1.13414 2.22285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.109790 0.157552 19.738 <2e-16 ***
## age -0.004435 0.003183 -1.393 0.164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.414 on 719 degrees of freedom
## Multiple R-squared: 0.002692, Adjusted R-squared: 0.001305
## F-statistic: 1.941 on 1 and 719 DF, p-value: 0.164
The regression model shows that age is not a significant predictor of the quantity of products purchased, as the p-value is 0.164 (greater than 0.05). The coefficient for age is -0.004, indicating a very small negative relationship, but it is not statistically meaningful. The R-squared value is 0.0027, meaning the model explains less than 1% of the variance in quantity. Thus, customer age does not effectively predict purchase quantity in this dataset.
#29.How do age, gender, and product price together influence the quantity of products purchased?
# Multiple linear regression of quantity on age, gender and price.
# Note: this reuses the name `model`, replacing any earlier fit stored there.
model <- lm(
  formula = quantity ~ age + gender + price,
  data = online_retail
)
summary(model)
##
## Call:
## lm(formula = quantity ~ age + gender + price, data = online_retail)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0650 -0.9789 0.0402 1.1436 2.2571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1445972 0.1945435 16.164 <2e-16 ***
## age -0.0043891 0.0031901 -1.376 0.169
## genderM 0.0287562 0.1055458 0.272 0.785
## price -0.0002003 0.0003769 -0.532 0.595
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.415 on 717 degrees of freedom
## Multiple R-squared: 0.003193, Adjusted R-squared: -0.0009773
## F-statistic: 0.7657 on 3 and 717 DF, p-value: 0.5135
The multiple regression model suggests that age, gender, and product price do not significantly influence the quantity of products purchased. All p-values are greater than 0.05, indicating none of the predictors are statistically significant. The R-squared value is only 0.0032, meaning the model explains less than 1% of the variance in quantity. Overall, this model has very weak explanatory power and does not provide useful predictions for product quantity based on these variables.
#30.Is there a significant difference in quantity purchased across different product categories?
# One-way ANOVA: does mean quantity differ between product categories?
anova_result <- aov(
  formula = quantity ~ category_name,
  data = online_retail
)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## category_name 4 8.1 2.035 1.017 0.398
## Residuals 716 1433.1 2.002
The ANOVA test evaluates whether the mean quantity purchased differs across product categories. The p-value is 0.398, which is much greater than 0.05, indicating that the difference in mean quantities across categories is not statistically significant. The F-value of 1.017 also supports this conclusion. Thus, we cannot conclude that product category has a significant effect on the quantity of products purchased.