# Load the Superstore sample CSV into a data frame and preview its first
# six rows (six is head()'s default, spelled out here).
df <- read.csv(file = "Data/Sample - Superstore.csv")
head(df, n = 6L)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 1 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 2 2 CA-2016-152156 11/8/2016 11/11/2016 Second Class CG-12520
## 3 3 CA-2016-138688 6/12/2016 6/16/2016 Second Class DV-13045
## 4 4 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 5 5 US-2015-108966 10/11/2015 10/18/2015 Standard Class SO-20335
## 6 6 CA-2014-115812 6/9/2014 6/14/2014 Standard Class BH-11710
## Customer.Name Segment Country City State
## 1 Claire Gute Consumer United States Henderson Kentucky
## 2 Claire Gute Consumer United States Henderson Kentucky
## 3 Darrin Van Huff Corporate United States Los Angeles California
## 4 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 5 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 6 Brosina Hoffman Consumer United States Los Angeles California
## Postal.Code Region Product.ID Category Sub.Category
## 1 42420 South FUR-BO-10001798 Furniture Bookcases
## 2 42420 South FUR-CH-10000454 Furniture Chairs
## 3 90036 West OFF-LA-10000240 Office Supplies Labels
## 4 33311 South FUR-TA-10000577 Furniture Tables
## 5 33311 South OFF-ST-10000760 Office Supplies Storage
## 6 90032 West FUR-FU-10001487 Furniture Furnishings
## Product.Name Sales
## 1 Bush Somerset Collection Bookcase 261.9600
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3 Self-Adhesive Address Labels for Typewriters by Universal 14.6200
## 4 Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5 Eldon Fold 'N Roll Cart System 22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood 48.8600
## Quantity Discount Profit
## 1 2 0.00 41.9136
## 2 3 0.00 219.5820
## 3 2 0.00 6.8714
## 4 5 0.45 -383.0310
## 5 2 0.20 2.5164
## 6 7 0.00 14.1694
2.2 Funciones y paquetes de R para análisis de datos
2.2.1 arrange()
# arrange() sorts numeric, text and date variables.
# Here both date columns are parsed with mdy() (month/day/year text) in a
# single mutate() call, and the rows are then sorted by Order.Date.
df <- df %>%
  mutate(
    Order.Date = mdy(Order.Date),
    Ship.Date = mdy(Ship.Date)
  ) %>%
  arrange(Order.Date)
head(df, n = 6L)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 7981 CA-2014-103800 2014-01-03 2014-01-07 Standard Class DP-13000
## 2 740 CA-2014-112326 2014-01-04 2014-01-08 Standard Class PO-19195
## 3 741 CA-2014-112326 2014-01-04 2014-01-08 Standard Class PO-19195
## 4 742 CA-2014-112326 2014-01-04 2014-01-08 Standard Class PO-19195
## 5 1760 CA-2014-141817 2014-01-05 2014-01-12 Standard Class MB-18085
## 6 5328 CA-2014-130813 2014-01-06 2014-01-08 Second Class LS-17230
## Customer.Name Segment Country City State
## 1 Darren Powers Consumer United States Houston Texas
## 2 Phillina Ober Home Office United States Naperville Illinois
## 3 Phillina Ober Home Office United States Naperville Illinois
## 4 Phillina Ober Home Office United States Naperville Illinois
## 5 Mick Brown Consumer United States Philadelphia Pennsylvania
## 6 Lycoris Saunders Consumer United States Los Angeles California
## Postal.Code Region Product.ID Category Sub.Category
## 1 77095 Central OFF-PA-10000174 Office Supplies Paper
## 2 60540 Central OFF-LA-10003223 Office Supplies Labels
## 3 60540 Central OFF-ST-10002743 Office Supplies Storage
## 4 60540 Central OFF-BI-10004094 Office Supplies Binders
## 5 19143 East OFF-AR-10003478 Office Supplies Art
## 6 90049 West OFF-PA-10002005 Office Supplies Paper
## Product.Name
## 1 Message Book, Wirebound, Four 5 1/2" X 4" Forms/Pg., 200 Dupl. Sets/Book
## 2 Avery 508
## 3 SAFCO Boltless Steel Shelving
## 4 GBC Standard Plastic Binding Systems Combs
## 5 Avery Hi-Liter EverBold Pen Style Fluorescent Highlighters, 4/Pack
## 6 Xerox 225
## Sales Quantity Discount Profit
## 1 16.448 2 0.2 5.5512
## 2 11.784 3 0.2 4.2717
## 3 272.736 3 0.2 -64.7748
## 4 3.540 2 0.8 -5.4870
## 5 19.536 3 0.2 4.8840
## 6 19.440 3 0.0 9.3312
2.2.2 as.character()
# Store Postal.Code as text rather than as a number.
# NOTE(review): if read.csv() already parsed this column as integer, any
# leading zeros were dropped before this conversion -- confirm.
df <- mutate(df, Postal.Code = as.character(Postal.Code))
head(df, n = 6L)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 7981 CA-2014-103800 2014-01-03 2014-01-07 Standard Class DP-13000
## 2 740 CA-2014-112326 2014-01-04 2014-01-08 Standard Class PO-19195
## 3 741 CA-2014-112326 2014-01-04 2014-01-08 Standard Class PO-19195
## 4 742 CA-2014-112326 2014-01-04 2014-01-08 Standard Class PO-19195
## 5 1760 CA-2014-141817 2014-01-05 2014-01-12 Standard Class MB-18085
## 6 5328 CA-2014-130813 2014-01-06 2014-01-08 Second Class LS-17230
## Customer.Name Segment Country City State
## 1 Darren Powers Consumer United States Houston Texas
## 2 Phillina Ober Home Office United States Naperville Illinois
## 3 Phillina Ober Home Office United States Naperville Illinois
## 4 Phillina Ober Home Office United States Naperville Illinois
## 5 Mick Brown Consumer United States Philadelphia Pennsylvania
## 6 Lycoris Saunders Consumer United States Los Angeles California
## Postal.Code Region Product.ID Category Sub.Category
## 1 77095 Central OFF-PA-10000174 Office Supplies Paper
## 2 60540 Central OFF-LA-10003223 Office Supplies Labels
## 3 60540 Central OFF-ST-10002743 Office Supplies Storage
## 4 60540 Central OFF-BI-10004094 Office Supplies Binders
## 5 19143 East OFF-AR-10003478 Office Supplies Art
## 6 90049 West OFF-PA-10002005 Office Supplies Paper
## Product.Name
## 1 Message Book, Wirebound, Four 5 1/2" X 4" Forms/Pg., 200 Dupl. Sets/Book
## 2 Avery 508
## 3 SAFCO Boltless Steel Shelving
## 4 GBC Standard Plastic Binding Systems Combs
## 5 Avery Hi-Liter EverBold Pen Style Fluorescent Highlighters, 4/Pack
## 6 Xerox 225
## Sales Quantity Discount Profit
## 1 16.448 2 0.2 5.5512
## 2 11.784 3 0.2 4.2717
## 3 272.736 3 0.2 -64.7748
## 4 3.540 2 0.8 -5.4870
## 5 19.536 3 0.2 4.8840
## 6 19.440 3 0.0 9.3312
2.2.3 as.factor()
# Converting columns to factors makes it easier to split the data set into
# groups. The original note also states that this speeds up computation.
# All eight categorical columns are converted in one mutate() call.
df <- df %>%
  mutate(
    Ship.Mode = as.factor(Ship.Mode),
    Segment = as.factor(Segment),
    Country = as.factor(Country),
    City = as.factor(City),
    State = as.factor(State),
    Region = as.factor(Region),
    Category = as.factor(Category),
    Sub.Category = as.factor(Sub.Category)
  )
head(df, n = 6L)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 7981 CA-2014-103800 2014-01-03 2014-01-07 Standard Class DP-13000
## 2 740 CA-2014-112326 2014-01-04 2014-01-08 Standard Class PO-19195
## 3 741 CA-2014-112326 2014-01-04 2014-01-08 Standard Class PO-19195
## 4 742 CA-2014-112326 2014-01-04 2014-01-08 Standard Class PO-19195
## 5 1760 CA-2014-141817 2014-01-05 2014-01-12 Standard Class MB-18085
## 6 5328 CA-2014-130813 2014-01-06 2014-01-08 Second Class LS-17230
## Customer.Name Segment Country City State
## 1 Darren Powers Consumer United States Houston Texas
## 2 Phillina Ober Home Office United States Naperville Illinois
## 3 Phillina Ober Home Office United States Naperville Illinois
## 4 Phillina Ober Home Office United States Naperville Illinois
## 5 Mick Brown Consumer United States Philadelphia Pennsylvania
## 6 Lycoris Saunders Consumer United States Los Angeles California
## Postal.Code Region Product.ID Category Sub.Category
## 1 77095 Central OFF-PA-10000174 Office Supplies Paper
## 2 60540 Central OFF-LA-10003223 Office Supplies Labels
## 3 60540 Central OFF-ST-10002743 Office Supplies Storage
## 4 60540 Central OFF-BI-10004094 Office Supplies Binders
## 5 19143 East OFF-AR-10003478 Office Supplies Art
## 6 90049 West OFF-PA-10002005 Office Supplies Paper
## Product.Name
## 1 Message Book, Wirebound, Four 5 1/2" X 4" Forms/Pg., 200 Dupl. Sets/Book
## 2 Avery 508
## 3 SAFCO Boltless Steel Shelving
## 4 GBC Standard Plastic Binding Systems Combs
## 5 Avery Hi-Liter EverBold Pen Style Fluorescent Highlighters, 4/Pack
## 6 Xerox 225
## Sales Quantity Discount Profit
## 1 16.448 2 0.2 5.5512
## 2 11.784 3 0.2 4.2717
## 3 272.736 3 0.2 -64.7748
## 4 3.540 2 0.8 -5.4870
## 5 19.536 3 0.2 4.8840
## 6 19.440 3 0.0 9.3312
2.2.4 as.numeric()
# Keep only the four measure columns, coerce each of them with as.numeric(),
# and preview the first rows. The result is printed rather than assigned,
# so df itself is left unchanged.
df %>%
  select(Sales, Quantity, Discount, Profit) %>%
  # across(everything(), ...) replaces the superseded mutate_all().
  mutate(across(everything(), as.numeric)) %>%
  # head() receives the piped result as its first argument.
  head()
## Sales Quantity Discount Profit
## 1 16.448 2 0.2 5.5512
## 2 11.784 3 0.2 4.2717
## 3 272.736 3 0.2 -64.7748
## 4 3.540 2 0.8 -5.4870
## 5 19.536 3 0.2 4.8840
## 6 19.440 3 0.0 9.3312
# head(.) aplica la función head() al resultado de los pasos anteriores