Alumno: Giancarlo Kuway Chero
Curso: SGBD2
Link del data set: data - dirty
library(readxl)
datos<-read_excel("Ditry Data Sample.xlsx")
## New names:
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
Observamos que se ven totalmente desorganizados.
#install.packages("DT")
library(DT)
datos<-as.data.frame(datos)
datatable(datos)
Esto nos sirve para la creacion de nuevas columnas
colnames(datos)
## [1] "Segment>>" "Consumer" "...3"
## [4] "...4" "...5" "Consumer Total"
## [7] "Corporate" "...8" "...9"
## [10] "...10" "Corporate Total" "Home Office"
## [13] "...13" "...14" "...15"
## [16] "Home Office Total"
Empezamos cambiando los nombres de las columnas.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
datos<-datos %>% rename(order="Segment>>",
Consumer_First_Class="Consumer",
Consumer_Same_Day="...3",
Consumer_Second_Class="...4",
Consumer_Standard_Class="...5",
Consumer_Total="Consumer Total",
Corporate_First_Class="Corporate",
Corporate_Same_Day="...8",
Corporate_Second_Class="...9",
Corporate_Standard_Class="...10",
Corporate_Total="Corporate Total",
Home_Office_First_Class="Home Office",
Home_Office_Same_Day="...13",
Home_Office_Second_Class="...14",
Home_Office_Standard_Class="...15",
Home_Office_Total="Home Office Total")
rbind(head(datos,5),tail(datos,5))
## order Consumer_First_Class Consumer_Same_Day
## 1 Ship Mode>> First Class Same Day
## 2 Order ID <NA> <NA>
## 3 CA-2011-100293 <NA> <NA>
## 4 CA-2011-100706 <NA> <NA>
## 5 CA-2011-100895 <NA> <NA>
## 821 US-2014-166611 <NA> <NA>
## 822 US-2014-167920 <NA> <NA>
## 823 US-2014-168116 <NA> <NA>
## 824 US-2014-168690 <NA> <NA>
## 825 Grand Total 20802.172999999992 8132.4089999999987
## Consumer_Second_Class Consumer_Standard_Class Consumer_Total
## 1 Second Class Standard Class NA
## 2 <NA> <NA> NA
## 3 <NA> <NA> NA
## 4 129.44 <NA> 129.440
## 5 <NA> 605.47 605.470
## 821 <NA> <NA> NA
## 822 1827.51 <NA> 1827.510
## 823 <NA> <NA> NA
## 824 <NA> 2.8080000000000007 2.808
## 825 49724.254500000003 116922.13449999994 195580.971
## Corporate_First_Class Corporate_Same_Day Corporate_Second_Class
## 1 First Class Same Day Second Class
## 2 <NA> <NA> <NA>
## 3 <NA> <NA> <NA>
## 4 <NA> <NA> <NA>
## 5 <NA> <NA> <NA>
## 821 <NA> <NA> <NA>
## 822 <NA> <NA> <NA>
## 823 <NA> 8167.4199999999992 <NA>
## 824 <NA> <NA> <NA>
## 825 20792.607000000004 9907.3079999999991 35243.231000000007
## Corporate_Standard_Class Corporate_Total Home_Office_First_Class
## 1 Standard Class NA First Class
## 2 <NA> NA <NA>
## 3 <NA> NA <NA>
## 4 <NA> NA <NA>
## 5 <NA> NA <NA>
## 821 68.742000000000019 68.742 <NA>
## 822 <NA> NA <NA>
## 823 <NA> 8167.420 <NA>
## 824 <NA> NA <NA>
## 825 55942.786500000009 121885.932 7737.7859999999982
## Home_Office_Same_Day Home_Office_Second_Class Home_Office_Standard_Class
## 1 Same Day Second Class Standard Class
## 2 <NA> <NA> <NA>
## 3 <NA> <NA> 91.055999999999997
## 4 <NA> <NA> <NA>
## 5 <NA> <NA> <NA>
## 821 <NA> <NA> <NA>
## 822 <NA> <NA> <NA>
## 823 <NA> <NA> <NA>
## 824 <NA> <NA> <NA>
## 825 2977.4560000000001 8791.1270000000004 54748.632500000007
## Home_Office_Total
## 1 NA
## 2 NA
## 3 91.056
## 4 NA
## 5 NA
## 821 NA
## 822 NA
## 823 NA
## 824 NA
## 825 74255.001
Nos quedamos con la cantidad de filas que nos interesan, ya que las 2 primeras filas y la ultima sobran.
datos<-datos[3:824,]
Creamos nuevas columnas, las cuales contienen de forma inicial el numero de orden y y las ventas (sales), posteriormente le agrego dos columnas con el nonbre de modo de envio (ship mode) y el segmento al que pertecen.
Repito el proceso para cada segmento y modo de envio con los siguientes codigos.
a1<-select(datos,"order","Consumer_First_Class")
a1<-na.omit(a1)
a1<-mutate(a1,Segment="Consumer")
a1<-mutate(a1,"Ship Mode" ="First Class")
a1<-rename(a1,"Sales"="Consumer_First_Class")
a2<-select(datos,"order","Consumer_Same_Day")
a2<-na.omit(a2)
a2<-mutate(a2,Segment="Consumer")
a2<-mutate(a2,"Ship Mode" ="Same Day")
a2<-rename(a2,"Sales"="Consumer_Same_Day")
a3<-select(datos,"order","Consumer_Second_Class")
a3<-na.omit(a3)
a3<-mutate(a3,Segment="Consumer")
a3<-mutate(a3,"Ship Mode" ="Second Class")
a3<-rename(a3,"Sales"="Consumer_Second_Class")
a4<-select(datos,"order","Consumer_Standard_Class")
a4<-na.omit(a4)
a4<-mutate(a4,Segment="Consumer")
a4<-mutate(a4,"Ship Mode" ="Standard Class")
a4<-rename(a4,"Sales"="Consumer_Standard_Class")
a<-rbind(a1,a2,a3,a4)
b1<-select(datos,"order","Corporate_First_Class")
b1<-na.omit(b1)
b1<-mutate(b1,Segment="Corporate")
b1<-mutate(b1,"Ship Mode" ="First Class")
b1<-rename(b1,"Sales"="Corporate_First_Class")
b2<-select(datos,"order","Corporate_Same_Day")
b2<-na.omit(b2)
b2<-mutate(b2,Segment="Corporate")
b2<-mutate(b2,"Ship Mode" ="Same Day")
b2<-rename(b2,"Sales"="Corporate_Same_Day")
b3<-select(datos,"order","Corporate_Second_Class")
b3<-na.omit(b3)
b3<-mutate(b3,Segment="Corporate")
b3<-mutate(b3,"Ship Mode" ="Second Class")
b3<-rename(b3,"Sales"="Corporate_Second_Class")
b4<-select(datos,"order","Corporate_Standard_Class")
b4<-na.omit(b4)
b4<-mutate(b4,Segment="Corporate")
b4<-mutate(b4,"Ship Mode" ="Standard Class")
b4<-rename(b4,"Sales"="Corporate_Standard_Class")
b<-rbind(b1,b2,b3,b4)
c1<-select(datos,"order","Home_Office_First_Class")
c1<-na.omit(c1)
c1<-mutate(c1,Segment="Home_Office")
c1<-mutate(c1,"Ship Mode" ="First Class")
c1<-rename(c1,"Sales"="Home_Office_First_Class")
c2<-select(datos,"order","Home_Office_Same_Day")
c2<-na.omit(c2)
c2<-mutate(c2,Segment="Home_Office")
c2<-mutate(c2,"Ship Mode" ="Same Day")
c2<-rename(c2,"Sales"="Home_Office_Same_Day")
c3<-select(datos,"order","Home_Office_Second_Class")
c3<-na.omit(c3)
c3<-mutate(c3,Segment="Home_Office")
c3<-mutate(c3,"Ship Mode" ="Second Class")
c3<-rename(c3,"Sales"="Home_Office_Second_Class")
c4<-select(datos,"order","Home_Office_Standard_Class")
c4<-na.omit(c4)
c4<-mutate(c4,Segment="Home_Office")
c4<-mutate(c4,"Ship Mode" ="Standard Class")
c4<-rename(c4,"Sales"="Home_Office_Standard_Class")
c<-rbind(c1,c2,c3,c4)
final<-rbind(a,b,c)
final<-mutate(final,Sales = round(as.numeric(Sales),3))
str(final)
## 'data.frame': 822 obs. of 4 variables:
## $ order : chr "CA-2011-103366" "CA-2011-109043" "CA-2011-113166" "CA-2011-124023" ...
## $ Sales : num 149.95 243.6 9.57 8.96 34.2 ...
## $ Segment : chr "Consumer" "Consumer" "Consumer" "Consumer" ...
## $ Ship Mode: chr "First Class" "First Class" "First Class" "First Class" ...
## - attr(*, "na.action")= 'omit' Named int [1:750] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:750] "3" "4" "5" "6" ...
Finalmente nos quedaria la siguiente tabla.
final<-as.data.frame(final)
datatable(final)
Posiblemente existan formas mas eficientes de limpiar esa data, si la conoces, comenta en el trabajo.