Today we are gonna discuss a little about a dataset of a shipping company that takes orders from the US and Puerto Rico to a variety of markets around the world including themselves. Some customers fraud the system, this will be discussed shortly too.
*Wrangling data
*Checking for null values/na’s/infinities
*removing unnecessary attributes from data
*Questions
*Some plots/graphs
*Statistics and Modelling
Ahmed Abdallah Mohammed –1727138
Alaa Ahmed Yousef –1727040
Dalia Sarhan Mahmoud –1627252
Nada Shaaban –1727066
Omar Mohammed Abdel-Salam –1727234
Sarah Mahmoud Abdel-Aziz –1627262
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(datasets)
library(ggplot2)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.1
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 4.1.1
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
od<- read.csv('data/DataCoSupplyChainDataset.csv')
d <- od # changing name to put the whole work on the data frame d
summary(d)
## Type Days.for.shipping..real. Days.for.shipment..scheduled.
## Length:180519 Min. :0.000 Min. :0.000
## Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Median :3.000 Median :4.000
## Mean :3.498 Mean :2.932
## 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :6.000 Max. :4.000
##
## Benefit.per.order Sales.per.customer Delivery.Status Late_delivery_risk
## Min. :-4274.98 Min. : 7.49 Length:180519 Min. :0.0000
## 1st Qu.: 7.00 1st Qu.: 104.38 Class :character 1st Qu.:0.0000
## Median : 31.52 Median : 163.99 Mode :character Median :1.0000
## Mean : 21.98 Mean : 183.11 Mean :0.5483
## 3rd Qu.: 64.80 3rd Qu.: 247.40 3rd Qu.:1.0000
## Max. : 911.80 Max. :1939.99 Max. :1.0000
##
## Category.Id Category.Name Customer.City Customer.Country
## Min. : 2.00 Length:180519 Length:180519 Length:180519
## 1st Qu.:18.00 Class :character Class :character Class :character
## Median :29.00 Mode :character Mode :character Mode :character
## Mean :31.85
## 3rd Qu.:45.00
## Max. :76.00
##
## Customer.Email Customer.Fname Customer.Id Customer.Lname
## Length:180519 Length:180519 Min. : 1 Length:180519
## Class :character Class :character 1st Qu.: 3258 Class :character
## Mode :character Mode :character Median : 6457 Mode :character
## Mean : 6691
## 3rd Qu.: 9779
## Max. :20757
##
## Customer.Password Customer.Segment Customer.State Customer.Street
## Length:180519 Length:180519 Length:180519 Length:180519
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Customer.Zipcode Department.Id Department.Name Latitude
## Min. : 603 Min. : 2.000 Length:180519 Min. :-33.94
## 1st Qu.: 725 1st Qu.: 4.000 Class :character 1st Qu.: 18.27
## Median :19380 Median : 5.000 Mode :character Median : 33.14
## Mean :35921 Mean : 5.443 Mean : 29.72
## 3rd Qu.:78207 3rd Qu.: 7.000 3rd Qu.: 39.28
## Max. :99205 Max. :12.000 Max. : 48.78
## NA's :3
## Longitude Market Order.City Order.Country
## Min. :-158.03 Length:180519 Length:180519 Length:180519
## 1st Qu.: -98.45 Class :character Class :character Class :character
## Median : -76.85 Mode :character Mode :character Mode :character
## Mean : -84.92
## 3rd Qu.: -66.37
## Max. : 115.26
##
## Order.Customer.Id order.date..DateOrders. Order.Id
## Min. : 1 Length:180519 Min. : 1
## 1st Qu.: 3258 Class :character 1st Qu.:18057
## Median : 6457 Mode :character Median :36140
## Mean : 6691 Mean :36222
## 3rd Qu.: 9779 3rd Qu.:54144
## Max. :20757 Max. :77204
##
## Order.Item.Cardprod.Id Order.Item.Discount Order.Item.Discount.Rate
## Min. : 19.0 Min. : 0.00 Min. :0.0000
## 1st Qu.: 403.0 1st Qu.: 5.40 1st Qu.:0.0400
## Median : 627.0 Median : 14.00 Median :0.1000
## Mean : 692.5 Mean : 20.66 Mean :0.1017
## 3rd Qu.:1004.0 3rd Qu.: 29.99 3rd Qu.:0.1600
## Max. :1363.0 Max. :500.00 Max. :0.2500
##
## Order.Item.Id Order.Item.Product.Price Order.Item.Profit.Ratio
## Min. : 1 Min. : 9.99 Min. :-2.7500
## 1st Qu.: 45131 1st Qu.: 50.00 1st Qu.: 0.0800
## Median : 90260 Median : 59.99 Median : 0.2700
## Mean : 90260 Mean : 141.23 Mean : 0.1206
## 3rd Qu.:135390 3rd Qu.: 199.99 3rd Qu.: 0.3600
## Max. :180519 Max. :1999.99 Max. : 0.5000
##
## Order.Item.Quantity Sales Order.Item.Total Order.Profit.Per.Order
## Min. :1.000 Min. : 9.99 Min. : 7.49 Min. :-4274.98
## 1st Qu.:1.000 1st Qu.: 119.98 1st Qu.: 104.38 1st Qu.: 7.00
## Median :1.000 Median : 199.92 Median : 163.99 Median : 31.52
## Mean :2.128 Mean : 203.77 Mean : 183.11 Mean : 21.98
## 3rd Qu.:3.000 3rd Qu.: 299.95 3rd Qu.: 247.40 3rd Qu.: 64.80
## Max. :5.000 Max. :1999.99 Max. :1939.99 Max. : 911.80
##
## Order.Region Order.State Order.Status Order.Zipcode
## Length:180519 Length:180519 Length:180519 Min. : 1040
## Class :character Class :character Class :character 1st Qu.:23464
## Mode :character Mode :character Mode :character Median :59405
## Mean :55426
## 3rd Qu.:90008
## Max. :99301
## NA's :155679
## Product.Card.Id Product.Category.Id Product.Description Product.Image
## Min. : 19.0 Min. : 2.00 Mode:logical Length:180519
## 1st Qu.: 403.0 1st Qu.:18.00 NA's:180519 Class :character
## Median : 627.0 Median :29.00 Mode :character
## Mean : 692.5 Mean :31.85
## 3rd Qu.:1004.0 3rd Qu.:45.00
## Max. :1363.0 Max. :76.00
##
## Product.Name Product.Price Product.Status shipping.date..DateOrders.
## Length:180519 Min. : 9.99 Min. :0 Length:180519
## Class :character 1st Qu.: 50.00 1st Qu.:0 Class :character
## Mode :character Median : 59.99 Median :0 Mode :character
## Mean : 141.23 Mean :0
## 3rd Qu.: 199.99 3rd Qu.:0
## Max. :1999.99 Max. :0
##
## Shipping.Mode
## Length:180519
## Class :character
## Mode :character
##
##
##
##
Wrangling data
d<-mutate_at(d,vars(Type,Delivery.Status,
Late_delivery_risk,Category.Id,
Category.Name,
Customer.City,
Customer.Country,Customer.Id,
Customer.Segment,Customer.State,
Customer.Street,
Customer.Zipcode,
Department.Id,
Department.Name,Market,
Order.City,Order.Country,
Order.Customer.Id,Order.Id,
Order.Item.Cardprod.Id,
Order.Item.Id,
Order.Region,Order.State,
Order.Status,Order.Zipcode,
Product.Card.Id,Product.Category.Id,
Product.Name,Shipping.Mode),as.factor)
##Product.Status, all zeroes means that all prouducts were available
## as 0 means available and 1 means unavailable
d$order.date..DateOrders.= as.POSIXct(d$order.date..DateOrders.,format = "%m/%d/%Y %H:%M")
d$shipping.date..DateOrders.= as.POSIXct(d$shipping.date..DateOrders.,format = "%m/%d/%Y %H:%M")
==Checking for null values/na’s/infinities
is.null(d)
## [1] FALSE
apply(d, 2, function(x) any(is.na(x)))
## Type Days.for.shipping..real.
## FALSE FALSE
## Days.for.shipment..scheduled. Benefit.per.order
## FALSE FALSE
## Sales.per.customer Delivery.Status
## FALSE FALSE
## Late_delivery_risk Category.Id
## FALSE FALSE
## Category.Name Customer.City
## FALSE FALSE
## Customer.Country Customer.Email
## FALSE FALSE
## Customer.Fname Customer.Id
## FALSE FALSE
## Customer.Lname Customer.Password
## FALSE FALSE
## Customer.Segment Customer.State
## FALSE FALSE
## Customer.Street Customer.Zipcode
## FALSE TRUE
## Department.Id Department.Name
## FALSE FALSE
## Latitude Longitude
## FALSE FALSE
## Market Order.City
## FALSE FALSE
## Order.Country Order.Customer.Id
## FALSE FALSE
## order.date..DateOrders. Order.Id
## FALSE FALSE
## Order.Item.Cardprod.Id Order.Item.Discount
## FALSE FALSE
## Order.Item.Discount.Rate Order.Item.Id
## FALSE FALSE
## Order.Item.Product.Price Order.Item.Profit.Ratio
## FALSE FALSE
## Order.Item.Quantity Sales
## FALSE FALSE
## Order.Item.Total Order.Profit.Per.Order
## FALSE FALSE
## Order.Region Order.State
## FALSE FALSE
## Order.Status Order.Zipcode
## FALSE TRUE
## Product.Card.Id Product.Category.Id
## FALSE FALSE
## Product.Description Product.Image
## TRUE FALSE
## Product.Name Product.Price
## FALSE FALSE
## Product.Status shipping.date..DateOrders.
## FALSE FALSE
## Shipping.Mode
## FALSE
apply(d, 2, function(x) any(is.infinite(x)))
## Type Days.for.shipping..real.
## FALSE FALSE
## Days.for.shipment..scheduled. Benefit.per.order
## FALSE FALSE
## Sales.per.customer Delivery.Status
## FALSE FALSE
## Late_delivery_risk Category.Id
## FALSE FALSE
## Category.Name Customer.City
## FALSE FALSE
## Customer.Country Customer.Email
## FALSE FALSE
## Customer.Fname Customer.Id
## FALSE FALSE
## Customer.Lname Customer.Password
## FALSE FALSE
## Customer.Segment Customer.State
## FALSE FALSE
## Customer.Street Customer.Zipcode
## FALSE FALSE
## Department.Id Department.Name
## FALSE FALSE
## Latitude Longitude
## FALSE FALSE
## Market Order.City
## FALSE FALSE
## Order.Country Order.Customer.Id
## FALSE FALSE
## order.date..DateOrders. Order.Id
## FALSE FALSE
## Order.Item.Cardprod.Id Order.Item.Discount
## FALSE FALSE
## Order.Item.Discount.Rate Order.Item.Id
## FALSE FALSE
## Order.Item.Product.Price Order.Item.Profit.Ratio
## FALSE FALSE
## Order.Item.Quantity Sales
## FALSE FALSE
## Order.Item.Total Order.Profit.Per.Order
## FALSE FALSE
## Order.Region Order.State
## FALSE FALSE
## Order.Status Order.Zipcode
## FALSE FALSE
## Product.Card.Id Product.Category.Id
## FALSE FALSE
## Product.Description Product.Image
## FALSE FALSE
## Product.Name Product.Price
## FALSE FALSE
## Product.Status shipping.date..DateOrders.
## FALSE FALSE
## Shipping.Mode
## FALSE
as for duplicates, the only unique’d value column is Order.Item.Id
apply(d, 2, function(x) any((duplicated(x))))
## Type Days.for.shipping..real.
## TRUE TRUE
## Days.for.shipment..scheduled. Benefit.per.order
## TRUE TRUE
## Sales.per.customer Delivery.Status
## TRUE TRUE
## Late_delivery_risk Category.Id
## TRUE TRUE
## Category.Name Customer.City
## TRUE TRUE
## Customer.Country Customer.Email
## TRUE TRUE
## Customer.Fname Customer.Id
## TRUE TRUE
## Customer.Lname Customer.Password
## TRUE TRUE
## Customer.Segment Customer.State
## TRUE TRUE
## Customer.Street Customer.Zipcode
## TRUE TRUE
## Department.Id Department.Name
## TRUE TRUE
## Latitude Longitude
## TRUE TRUE
## Market Order.City
## TRUE TRUE
## Order.Country Order.Customer.Id
## TRUE TRUE
## order.date..DateOrders. Order.Id
## TRUE TRUE
## Order.Item.Cardprod.Id Order.Item.Discount
## TRUE TRUE
## Order.Item.Discount.Rate Order.Item.Id
## TRUE FALSE
## Order.Item.Product.Price Order.Item.Profit.Ratio
## TRUE TRUE
## Order.Item.Quantity Sales
## TRUE TRUE
## Order.Item.Total Order.Profit.Per.Order
## TRUE TRUE
## Order.Region Order.State
## TRUE TRUE
## Order.Status Order.Zipcode
## TRUE TRUE
## Product.Card.Id Product.Category.Id
## TRUE TRUE
## Product.Description Product.Image
## TRUE TRUE
## Product.Name Product.Price
## TRUE TRUE
## Product.Status shipping.date..DateOrders.
## TRUE TRUE
## Shipping.Mode
## TRUE
Now, removing unnecessary attributes from data
d=select(d,-Customer.Email,-Customer.Fname,
-Customer.Lname,-Customer.Password,
-Customer.Street,
-Product.Description,-Product.Image,
-Order.Customer.Id,
-Product.Status, -Order.Zipcode,
-Order.Item.Product.Price, -Order.Profit.Per.Order,
-Order.Item.Cardprod.Id,
-Category.Id, -Department.Id,
-Product.Card.Id, -Product.Category.Id)
#dividing data to have a better look
d_long_and_lat <- filter(d, Latitude > 17 & Latitude < 50 & Longitude > -160 & Longitude < -65 )
C <- select(d, Customer.Id, Customer.Segment, Sales.per.customer, Customer.Country, Customer.State, Customer.City, Customer.Zipcode, Longitude ,Latitude)
O <- select(d, Order.Id, order.date..DateOrders., Type, Order.Status, Order.Item.Quantity, Order.Item.Id, Department.Name, Category.Name, Sales, Benefit.per.order, Product.Name, Product.Price, Order.Item.Discount.Rate, Order.Item.Discount, Order.Item.Total, Order.Item.Profit.Ratio, Market, Order.Region, Order.Country, Order.State, Order.City,)
Delivery <- select(d, Delivery.Status, order.date..DateOrders., shipping.date..DateOrders., Shipping.Mode, Late_delivery_risk, Days.for.shipment..scheduled., Days.for.shipping..real.)
summary(d)
## Type Days.for.shipping..real. Days.for.shipment..scheduled.
## CASH :19616 Min. :0.000 Min. :0.000
## DEBIT :69295 1st Qu.:2.000 1st Qu.:2.000
## PAYMENT :41725 Median :3.000 Median :4.000
## TRANSFER:49883 Mean :3.498 Mean :2.932
## 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :6.000 Max. :4.000
##
## Benefit.per.order Sales.per.customer Delivery.Status
## Min. :-4274.98 Min. : 7.49 Advance shipping :41592
## 1st Qu.: 7.00 1st Qu.: 104.38 Late delivery :98977
## Median : 31.52 Median : 163.99 Shipping canceled: 7754
## Mean : 21.98 Mean : 183.11 Shipping on time :32196
## 3rd Qu.: 64.80 3rd Qu.: 247.40
## Max. : 911.80 Max. :1939.99
##
## Late_delivery_risk Category.Name Customer.City
## 0:81542 Cleats :24551 Caguas :66770
## 1:98977 Men's Footwear :22246 Chicago : 3885
## Women's Apparel :21035 Los Angeles : 3417
## Indoor/Outdoor Games:19298 Brooklyn : 3412
## Fishing :17325 New York : 1816
## Water Sports :15540 Philadelphia: 1577
## (Other) :60524 (Other) :99642
## Customer.Country Customer.Id Customer.Segment Customer.State
## EE. UU. :111146 5654 : 47 Consumer :93504 PR :69373
## Puerto Rico: 69373 5004 : 45 Corporate :54789 CA :29223
## 10591 : 45 Home Office:32226 NY :11327
## 3708 : 44 TX : 9103
## 5715 : 44 IL : 7631
## 9371 : 44 FL : 5456
## (Other):180250 (Other):48406
## Customer.Zipcode Department.Name Latitude Longitude
## 725 : 66770 Fan Shop:66861 Min. :-33.94 Min. :-158.03
## 921 : 337 Apparel :48998 1st Qu.: 18.27 1st Qu.: -98.45
## 23455 : 334 Golf :33220 Median : 33.14 Median : -76.85
## 957 : 297 Footwear:14525 Mean : 29.72 Mean : -84.92
## 79109 : 292 Outdoors: 9686 3rd Qu.: 39.28 3rd Qu.: -66.37
## (Other):112486 Fitness : 2479 Max. : 48.78 Max. : 115.26
## NA's : 3 (Other) : 4750
## Market Order.City Order.Country
## Africa :11614 Santo Domingo: 2211 Estados Unidos: 24840
## Europe :50252 New York City: 2202 Francia : 13222
## LATAM :51594 Los Angeles : 1845 México : 13172
## Pacific Asia:41260 Tegucigalpa : 1783 Alemania : 9564
## USCA :25799 Managua : 1682 Australia : 8497
## Mexico City : 1484 Brasil : 7987
## (Other) :169312 (Other) :103237
## order.date..DateOrders. Order.Id Order.Item.Discount
## Min. :2015-01-01 00:00:00 5 : 5 Min. : 0.00
## 1st Qu.:2015-09-21 13:49:00 10 : 5 1st Qu.: 5.40
## Median :2016-06-11 13:06:00 11 : 5 Median : 14.00
## Mean :2016-06-12 17:47:04 12 : 5 Mean : 20.66
## 3rd Qu.:2017-03-01 08:42:00 15 : 5 3rd Qu.: 29.99
## Max. :2018-01-31 23:38:00 17 : 5 Max. :500.00
## (Other):180489
## Order.Item.Discount.Rate Order.Item.Id Order.Item.Profit.Ratio
## Min. :0.0000 1 : 1 Min. :-2.7500
## 1st Qu.:0.0400 2 : 1 1st Qu.: 0.0800
## Median :0.1000 3 : 1 Median : 0.2700
## Mean :0.1017 4 : 1 Mean : 0.1206
## 3rd Qu.:0.1600 5 : 1 3rd Qu.: 0.3600
## Max. :0.2500 6 : 1 Max. : 0.5000
## (Other):180513
## Order.Item.Quantity Sales Order.Item.Total
## Min. :1.000 Min. : 9.99 Min. : 7.49
## 1st Qu.:1.000 1st Qu.: 119.98 1st Qu.: 104.38
## Median :1.000 Median : 199.92 Median : 163.99
## Mean :2.128 Mean : 203.77 Mean : 183.11
## 3rd Qu.:3.000 3rd Qu.: 299.95 3rd Qu.: 247.40
## Max. :5.000 Max. :1999.99 Max. :1939.99
##
## Order.Region Order.State
## Central America:28341 Inglaterra : 6722
## Western Europe :27109 California : 4966
## South America :14935 Isla de Francia : 4580
## Oceania :10148 Renania del Norte-Westfalia: 3303
## Northern Europe: 9792 San Salvador : 3055
## Southeast Asia : 9539 Nueva York : 2753
## (Other) :80655 (Other) :155140
## Order.Status Product.Name
## COMPLETE :59491 Perfect Fitness Perfect Rip Deck :24515
## PENDING_PAYMENT:39832 Nike Men's CJ Elite 2 TD Football Cleat :22246
## PROCESSING :21902 Nike Men's Dri-FIT Victory Golf Polo :21035
## PENDING :20227 O'Brien Men's Neoprene Life Vest :19298
## CLOSED :19616 Field & Stream Sportsman 16 Gun Fire Safe:17325
## ON_HOLD : 9804 Pelican Sunstream 100 Kayak :15500
## (Other) : 9647 (Other) :60600
## Product.Price shipping.date..DateOrders. Shipping.Mode
## Min. : 9.99 Min. :2015-01-03 00:00:00 First Class : 27814
## 1st Qu.: 50.00 1st Qu.:2015-09-25 06:59:00 Same Day : 9737
## Median : 59.99 Median :2016-06-15 08:32:00 Second Class : 35216
## Mean : 141.23 Mean :2016-06-16 05:45:23 Standard Class:107752
## 3rd Qu.: 199.99 3rd Qu.:2017-03-04 21:29:00
## Max. :1999.99 Max. :2018-02-06 22:14:00
##
str(d)
## 'data.frame': 180519 obs. of 36 variables:
## $ Type : Factor w/ 4 levels "CASH","DEBIT",..: 2 4 1 2 3 4 2 4 1 1 ...
## $ Days.for.shipping..real. : int 3 5 4 3 2 6 2 2 3 2 ...
## $ Days.for.shipment..scheduled.: int 4 4 4 4 4 4 1 1 2 1 ...
## $ Benefit.per.order : num 91.2 -249.1 -247.8 22.9 134.2 ...
## $ Sales.per.customer : num 315 311 310 305 298 ...
## $ Delivery.Status : Factor w/ 4 levels "Advance shipping",..: 1 2 4 1 1 3 2 2 2 2 ...
## $ Late_delivery_risk : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 2 2 2 2 ...
## $ Category.Name : Factor w/ 50 levels "Accessories",..: 41 41 41 41 41 41 41 41 41 41 ...
## $ Customer.City : Factor w/ 563 levels "Aguadilla","Alameda",..: 67 67 453 286 67 506 67 319 67 458 ...
## $ Customer.Country : Factor w/ 2 levels "EE. UU.","Puerto Rico": 2 2 1 1 2 1 2 1 2 1 ...
## $ Customer.Id : Factor w/ 20652 levels "1","2","3","4",..: 20650 19387 19386 19385 19384 19383 19382 19381 19380 19379 ...
## $ Customer.Segment : Factor w/ 3 levels "Consumer","Corporate",..: 1 1 1 3 2 1 3 2 2 2 ...
## $ Customer.State : Factor w/ 46 levels "91732","95758",..: 37 37 6 6 37 32 37 11 37 6 ...
## $ Customer.Zipcode : Factor w/ 995 levels "603","612","674",..: 8 8 942 721 8 174 8 341 8 923 ...
## $ Department.Name : Factor w/ 11 levels "Apparel","Book Shop",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Latitude : num 18.3 18.3 37.3 34.1 18.3 ...
## $ Longitude : num -66 -66 -122 -118 -66 ...
## $ Market : Factor w/ 5 levels "Africa","Europe",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Order.City : Factor w/ 3597 levels "Aachen","Aalen",..: 334 395 395 3230 3230 3211 1242 1242 1242 1242 ...
## $ Order.Country : Factor w/ 164 levels "Afganistán","Albania",..: 71 70 70 9 9 9 32 32 32 32 ...
## $ order.date..DateOrders. : POSIXct, format: "2018-01-31 22:56:00" "2018-01-13 12:27:00" ...
## $ Order.Id : Factor w/ 65752 levels "1","2","4","5",..: 65750 64487 64486 64485 64484 64483 64482 64481 64480 64479 ...
## $ Order.Item.Discount : num 13.1 16.4 18 22.9 29.5 ...
## $ Order.Item.Discount.Rate : num 0.04 0.05 0.06 0.07 0.09 ...
## $ Order.Item.Id : Factor w/ 180519 levels "1","2","3","4",..: 180517 179254 179253 179252 179251 179250 179249 179248 179247 179246 ...
## $ Order.Item.Profit.Ratio : num 0.29 -0.8 -0.8 0.08 0.45 ...
## $ Order.Item.Quantity : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Sales : num 328 328 328 328 328 ...
## $ Order.Item.Total : num 315 311 310 305 298 ...
## $ Order.Region : Factor w/ 23 levels "Canada","Caribbean",..: 16 14 14 12 12 12 8 8 8 8 ...
## $ Order.State : Factor w/ 1089 levels "Abia","Abruzos",..: 479 844 844 838 838 838 390 390 390 390 ...
## $ Order.Status : Factor w/ 9 levels "CANCELED","CLOSED",..: 3 6 2 3 7 1 3 8 2 2 ...
## $ Product.Name : Factor w/ 118 levels "adidas Brazuca 2014 Official Match Ball",..: 83 83 83 83 83 83 83 83 83 83 ...
## $ Product.Price : num 328 328 328 328 328 ...
## $ shipping.date..DateOrders. : POSIXct, format: "2018-02-03 22:56:00" "2018-01-18 12:27:00" ...
## $ Shipping.Mode : Factor w/ 4 levels "First Class",..: 4 4 4 4 4 4 1 1 3 1 ...
#1# who is the most effective/achieving sales in country/city/segment?
summary(C%>%
group_by(Customer.Country)%>%
summarise(Sales.per.customer))
## `summarise()` has grouped output by 'Customer.Country'. You can override using the `.groups` argument.
## Customer.Country Sales.per.customer
## EE. UU. :111146 Min. : 7.49
## Puerto Rico: 69373 1st Qu.: 104.38
## Median : 163.99
## Mean : 183.11
## 3rd Qu.: 247.40
## Max. :1939.99
summary(C%>%
group_by(Customer.City)%>%
summarise(Sales.per.customer))
## `summarise()` has grouped output by 'Customer.City'. You can override using the `.groups` argument.
## Customer.City Sales.per.customer
## Caguas :66770 Min. : 7.49
## Chicago : 3885 1st Qu.: 104.38
## Los Angeles : 3417 Median : 163.99
## Brooklyn : 3412 Mean : 183.11
## New York : 1816 3rd Qu.: 247.40
## Philadelphia: 1577 Max. :1939.99
## (Other) :99642
summary(C%>%
group_by(Customer.Segment)%>%
summarise(Sales.per.customer))
## `summarise()` has grouped output by 'Customer.Segment'. You can override using the `.groups` argument.
## Customer.Segment Sales.per.customer
## Consumer :93504 Min. : 7.49
## Corporate :54789 1st Qu.: 104.38
## Home Office:32226 Median : 163.99
## Mean : 183.11
## 3rd Qu.: 247.40
## Max. :1939.99
######
#Delivery#
#2#what time of the year is it a rush time of orders?
monthly_count=d%>%
group_by(month=floor_date(order.date..DateOrders., "month"))%>%
count(Order.Item.Id)
mc=monthly_count%>%
group_by(month=floor_date(date(month), "month"))%>%
summarise(n=sum(n))
barplot(mc$n,names = mc$month, las=2)
#we see that orders used to have a stable rate during the
#long term, but it dropped down in the latter 4 months
#3#why is that so
monthly_count=d%>%
group_by(month=floor_date(order.date..DateOrders., "month"))%>%
summarise(s.Bpo=sum(Benefit.per.order))
barplot(monthly_count$s.Bpo,names = monthly_count$month, las=2)
#nothing
monthly_count=d%>%
group_by(month=floor_date(order.date..DateOrders., "month"))%>%
summarise(Bpo=mean(Benefit.per.order))
#from here we could see that the company was going on a stable ratio for the benefit for a long period of time, and then it suddenly raise the benifit which caused a drop in the amount of orders as customers didn't like it(obviously), and so the company was trying to get its reach back again which forced them to even lower the benifit way less than ever
barplot(monthly_count$Bpo,names = monthly_count$month, las=2)
#########
#4#what is the most used payment type and by who is it used?
f=d%>%
group_by(Customer.Segment)%>%
count(Type)
p1<-filter(f,Customer.Segment=='Consumer')
p1$percent = round(100*p1$n/sum(p1$n), digits = 1)
p1$label = paste(p1$Type," (", p1$percent,"%)", sep = "")
pie(p1$n,labels=p1$label,radius = 1,main = "Consumer's Payments")
p2<-filter(f,Customer.Segment=='Home Office')
p2$percent = round(100*p2$n/sum(p2$n), digits = 1)
p2$label = paste(p2$Type," (", p2$percent,"%)", sep = "")
pie(p2$n,labels = p2$label,radius = 1,main = "Home office's Payments")
p3<-filter(f,Customer.Segment=='Corporate')
p3$percent = round(100*p3$n/sum(p3$n), digits = 1)
p3$label = paste(p3$Type," (", p3$percent,"%)", sep = "")
pie(p3$n,labels = p3$label,radius = 1,main = "Corporate's Payments")
#how about each region?
fr=select(d,Type,Order.Region)
fr=fr%>%
group_by(Order.Region)%>%
count(Type)
ggplot(fr, aes(fill=Type, y=n, x=Order.Region)) +
geom_bar(position="dodge", stat="identity")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
##############
#5#Which country has the most suspected fraud?
fraud<-filter(d,Order.Status == "SUSPECTED_FRAUD")
x=fraud%>%
group_by(Order.Country)%>%
count(Order.Item.Id)
x%>%
group_by(Order.Country)%>%
summarise(n=sum(n))%>%
arrange(... = desc(n),n)
## # A tibble: 102 x 2
## Order.Country n
## <fct> <int>
## 1 Estados Unidos 573
## 2 Francia 362
## 3 México 317
## 4 Alemania 227
## 5 Australia 197
## 6 Brasil 196
## 7 Reino Unido 140
## 8 China 124
## 9 India 116
## 10 Italia 107
## # ... with 92 more rows
#if we just draw this it would have an awful look.. so let's take a step back and bring it from a regional prospective
x=fraud%>%
group_by(Order.Region)%>%
count(Order.Item.Id)
x=x%>%
group_by(Order.Region)%>%
summarise(n=sum(n))%>%
arrange(... = desc(n),n)
x$percent = round(100*x$n/sum(x$n), digits = 1)
x$label= paste(x$Order.Region," (", x$percent,"%)", sep = "")
barplot(x$n, names=x$label, las=2, cex.names = 0.7, xlab = " Region", ylab ="Total number of frauds per order" )
##What segment takes a bigger amount of discount?
d%>%
group_by(Customer.Segment)%>%
summarise(mean(Order.Item.Discount.Rate)) ##Bala7 bombala7
## # A tibble: 3 x 2
## Customer.Segment `mean(Order.Item.Discount.Rate)`
## <fct> <dbl>
## 1 Consumer 0.102
## 2 Corporate 0.102
## 3 Home Office 0.102
#6#What is the best shipping mode? and how to optimize a model to enhance the process
dp=d%>%
group_by(Delivery.Status)%>%
count(Shipping.Mode)%>%
arrange(.,Shipping.Mode)
ggplot(dp, aes(fill=Delivery.Status, y=n, x=Shipping.Mode)) +
geom_bar(position="stack", stat="identity")
dp1<-filter(dp, Shipping.Mode=="First Class")
dp1$percent = round(100*dp1$n/sum(dp1$n), digits = 1)
dp2<-filter(dp, Shipping.Mode=="Same Day")
dp2$percent = round(100*dp2$n/sum(dp2$n), digits = 1)
dp3<-filter(dp, Shipping.Mode=="Second Class")
dp3$percent = round(100*dp3$n/sum(dp3$n), digits = 1)
dp4<-filter(dp, Shipping.Mode=="Standard Class")
dp4$percent = round(100*dp4$n/sum(dp4$n), digits = 1)
mi=rbind(dp1,dp2,dp3,dp4)
ggplot(mi, aes(fill=Delivery.Status, y=percent, x=Shipping.Mode)) +
geom_bar(position="stack", stat="identity")
##**scheduled days
#first class ==>2
#second class==>4
#Standard==>4/5
#same==same==0
############
#The following is a clarification of the difference of the mean of the real and the mean of the scheduled days
d%>%
group_by(Shipping.Mode)%>%
summarise(mean(Days.for.shipping..real.),mean(Days.for.shipment..scheduled.))
## # A tibble: 4 x 3
## Shipping.Mode `mean(Days.for.shipping..real.)` `mean(Days.for.shipment..sche~
## <fct> <dbl> <dbl>
## 1 First Class 2 1
## 2 Same Day 0.478 0
## 3 Second Class 3.99 2
## 4 Standard Class 4.00 4
#this explains the number of days each shipping mode actually takes
d%>%
group_by(Shipping.Mode)%>%
count(Days.for.shipping..real.)
## # A tibble: 13 x 3
## # Groups: Shipping.Mode [4]
## Shipping.Mode Days.for.shipping..real. n
## <fct> <int> <int>
## 1 First Class 2 27814
## 2 Same Day 0 5080
## 3 Same Day 1 4657
## 4 Second Class 2 7138
## 5 Second Class 3 7065
## 6 Second Class 4 6978
## 7 Second Class 5 7052
## 8 Second Class 6 6983
## 9 Standard Class 2 21666
## 10 Standard Class 3 21700
## 11 Standard Class 4 21535
## 12 Standard Class 5 21111
## 13 Standard Class 6 21740
ggplot(d,aes(Days.for.shipping..real., Days.for.shipment..scheduled.,
color = as.factor(Delivery.Status))) +
geom_point(size = 3) +
theme_minimal()
Some plots/graphs
##Total Sales for all markets, regions
tsm=d%>%
group_by(Market)%>%
summarise(sales=sum(Sales.per.customer))
barplot(tsm$sales,names=tsm$Market,las=2,col="orange",main = "Total Sales for all Markets")
tsr=d%>%
group_by(Order.Region)%>%
summarise(sales=sum(Sales.per.customer))
barplot(tsr$sales,names=tsr$Order.Region,las=2,col = "purple",main = "Total Sales for all regions")
#Which catergory of products has highest sales?
cat.sal=d%>%
group_by(Category.Name)%>%
summarise(sales=sum(Sales.per.customer))
barplot(cat.sal$sales,names=cat.sal$Category.Name,las=2,col = "green",cex.names = 0.45,width = 0.7)
#We see that the highest sales category is Fishing followed by cleats, however the upcoming graph which indicates the total sales accomplished by each category name
gf=d%>%
group_by(Category.Name)%>%
summarise(s=mean(Sales.per.customer))%>%
arrange(... =desc(s),s)
barplot(gf$s, names=gf$Category.Name, las=2,cex.names = 0.45, col = "pink")
#Since correlation was high between Price and Sales it will be intresting to see how price is impacting the sales for all the products to see the trend.
ggplot(data = d,aes(x = Product.Price,y=Sales.per.customer))+
geom_point(color="blue")
#therefore we can say that prices have a linear relation with sales
############
#Which department of products has highest sales?
dep.sal=d%>%
group_by(Department.Name)%>%
summarise(sales=sum(Sales.per.customer))
barplot(dep.sal$sales,names=dep.sal$Department.Name,las=2,col = "violet",cex.names = 0.8,width = 0.7 ,ylab = "Sales amount")
Statistics and Modelling
with(d, plot(Sales, Benefit.per.order, main = "Sales and Benifit per order", pch = 20))
model <- lm(Benefit.per.order ~ Sales, d)
abline(model, lwd =2)
Some models
#GLM Benefits/Order by Shipping Mode
lmtest <- glm(Benefit.per.order ~ Shipping.Mode,data = d)
summary(lmtest)
##
## Call:
## glm(formula = Benefit.per.order ~ Shipping.Mode, data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4296.3 -15.0 9.5 42.8 888.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.1222 0.6262 36.925 <2e-16 ***
## Shipping.ModeSame Day -2.2720 1.2297 -1.848 0.0647 .
## Shipping.ModeSecond Class -1.8163 0.8377 -2.168 0.0301 *
## Shipping.ModeStandard Class -1.1231 0.7024 -1.599 0.1098
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 10906.18)
##
## Null deviance: 1968794529 on 180518 degrees of freedom
## Residual deviance: 1968729773 on 180515 degrees of freedom
## AIC: 2190597
##
## Number of Fisher Scoring iterations: 2
#BenefitPerOrder = (-2.2720 (Same Day)) + (-1.8163 (Second Class)) + (-1.1231 (Standard Class))
Shipping_D <- table(d$Shipping.Mode, d$Late_delivery_risk)
Shipping_D
##
## 0 1
## First Class 1301 26513
## Same Day 5283 4454
## Second Class 8229 26987
## Standard Class 66729 41023
#GLM Late Delivery
lmTest <- glm(Late_delivery_risk ~ Days.for.shipment..scheduled. +
Days.for.shipping..real. + Order.Region,
data = d, family = binomial)
summary(lmTest)
##
## Call:
## glm(formula = Late_delivery_risk ~ Days.for.shipment..scheduled. +
## Days.for.shipping..real. + Order.Region, family = binomial,
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.6132 -0.0382 0.0055 0.4314 0.6402
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.21612 0.15293 -14.491 <2e-16 ***
## Days.for.shipment..scheduled. -4.67669 0.02441 -191.599 <2e-16 ***
## Days.for.shipping..real. 4.48078 0.02362 189.698 <2e-16 ***
## Order.RegionCaribbean 0.14509 0.15905 0.912 0.3616
## Order.RegionCentral Africa 0.25801 0.18694 1.380 0.1675
## Order.RegionCentral America 0.21613 0.15340 1.409 0.1589
## Order.RegionCentral Asia 0.22661 0.24222 0.936 0.3495
## Order.RegionEast Africa 0.43903 0.18605 2.360 0.0183 *
## Order.RegionEast of USA 0.27574 0.16084 1.714 0.0865 .
## Order.RegionEastern Asia 0.20410 0.16075 1.270 0.2042
## Order.RegionEastern Europe 0.35432 0.16857 2.102 0.0356 *
## Order.RegionNorth Africa 0.27857 0.17226 1.617 0.1058
## Order.RegionNorthern Europe 0.21429 0.15812 1.355 0.1753
## Order.RegionOceania 0.26374 0.15775 1.672 0.0946 .
## Order.RegionSouth America 0.15016 0.15545 0.966 0.3341
## Order.RegionSouth Asia 0.30105 0.16012 1.880 0.0601 .
## Order.RegionSouth of USA 0.25496 0.16796 1.518 0.1290
## Order.RegionSoutheast Asia 0.25886 0.15822 1.636 0.1018
## Order.RegionSouthern Africa 0.11991 0.20346 0.589 0.5556
## Order.RegionSouthern Europe 0.29379 0.15822 1.857 0.0633 .
## Order.RegionUS Center 0.34156 0.16224 2.105 0.0353 *
## Order.RegionWest Africa 0.25333 0.16848 1.504 0.1327
## Order.RegionWest Asia 0.28949 0.16229 1.784 0.0744 .
## Order.RegionWest of USA 0.15508 0.15953 0.972 0.3310
## Order.RegionWestern Europe 0.21440 0.15347 1.397 0.1624
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 248566 on 180518 degrees of freedom
## Residual deviance: 61005 on 180494 degrees of freedom
## AIC: 61055
##
## Number of Fisher Scoring iterations: 8
t.test(d$Sales.per.customer,d$Benefit.per.order)
##
## Welch Two Sample t-test
##
## data: d$Sales.per.customer and d$Benefit.per.order
## t = 430.27, df = 354249, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 160.3986 161.8666
## sample estimates:
## mean of x mean of y
## 183.10761 21.97499
#as we see, the p-value is significantly small which tells that the relation is strong