library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
salesbike<- read_xlsx("Sales.xlsx")
#library(readr) #Para leer archivo .csv
#salesbike<-read_csv("Sales.csv")
head(salesbike,10) #Muestra el encabezado de la data.frame por listas de diez filas
## # A tibble: 10 × 18
## Date Day Month Year Customer_Age Age_Group Customer_Gender
## <dttm> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2013-11-26 00:00:00 26 Novem… 2013 19 Youth (<… M
## 2 2015-11-26 00:00:00 26 Novem… 2015 19 Youth (<… M
## 3 2014-03-23 00:00:00 23 March 2014 49 Adults (… M
## 4 2016-03-23 00:00:00 23 March 2016 49 Adults (… M
## 5 2014-05-15 00:00:00 15 May 2014 47 Adults (… F
## 6 2016-05-15 00:00:00 15 May 2016 47 Adults (… F
## 7 2014-05-22 00:00:00 22 May 2014 47 Adults (… F
## 8 2016-05-22 00:00:00 22 May 2016 47 Adults (… F
## 9 2014-02-22 00:00:00 22 Febru… 2014 35 Adults (… M
## 10 2016-02-22 00:00:00 22 Febru… 2016 35 Adults (… M
## # ℹ 11 more variables: Country <chr>, State <chr>, Product_Category <chr>,
## # Sub_Category <chr>, Product <chr>, Order_Quantity <dbl>, Unit_Cost <dbl>,
## # Unit_Price <dbl>, Profit <dbl>, Cost <dbl>, Revenue <dbl>
salesbike%>%head(10)
## # A tibble: 10 × 18
## Date Day Month Year Customer_Age Age_Group Customer_Gender
## <dttm> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2013-11-26 00:00:00 26 Novem… 2013 19 Youth (<… M
## 2 2015-11-26 00:00:00 26 Novem… 2015 19 Youth (<… M
## 3 2014-03-23 00:00:00 23 March 2014 49 Adults (… M
## 4 2016-03-23 00:00:00 23 March 2016 49 Adults (… M
## 5 2014-05-15 00:00:00 15 May 2014 47 Adults (… F
## 6 2016-05-15 00:00:00 15 May 2016 47 Adults (… F
## 7 2014-05-22 00:00:00 22 May 2014 47 Adults (… F
## 8 2016-05-22 00:00:00 22 May 2016 47 Adults (… F
## 9 2014-02-22 00:00:00 22 Febru… 2014 35 Adults (… M
## 10 2016-02-22 00:00:00 22 Febru… 2016 35 Adults (… M
## # ℹ 11 more variables: Country <chr>, State <chr>, Product_Category <chr>,
## # Sub_Category <chr>, Product <chr>, Order_Quantity <dbl>, Unit_Cost <dbl>,
## # Unit_Price <dbl>, Profit <dbl>, Cost <dbl>, Revenue <dbl>
10%>%head(salesbike, .)
## # A tibble: 10 × 18
## Date Day Month Year Customer_Age Age_Group Customer_Gender
## <dttm> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2013-11-26 00:00:00 26 Novem… 2013 19 Youth (<… M
## 2 2015-11-26 00:00:00 26 Novem… 2015 19 Youth (<… M
## 3 2014-03-23 00:00:00 23 March 2014 49 Adults (… M
## 4 2016-03-23 00:00:00 23 March 2016 49 Adults (… M
## 5 2014-05-15 00:00:00 15 May 2014 47 Adults (… F
## 6 2016-05-15 00:00:00 15 May 2016 47 Adults (… F
## 7 2014-05-22 00:00:00 22 May 2014 47 Adults (… F
## 8 2016-05-22 00:00:00 22 May 2016 47 Adults (… F
## 9 2014-02-22 00:00:00 22 Febru… 2014 35 Adults (… M
## 10 2016-02-22 00:00:00 22 Febru… 2016 35 Adults (… M
## # ℹ 11 more variables: Country <chr>, State <chr>, Product_Category <chr>,
## # Sub_Category <chr>, Product <chr>, Order_Quantity <dbl>, Unit_Cost <dbl>,
## # Unit_Price <dbl>, Profit <dbl>, Cost <dbl>, Revenue <dbl>
salesbike %>%
select(Date, Customer_Age, Customer_Gender, Country, 'Unit_Price')
## # A tibble: 113,036 × 5
## Date Customer_Age Customer_Gender Country Unit_Price
## <dttm> <dbl> <chr> <chr> <dbl>
## 1 2013-11-26 00:00:00 19 M Canada 120
## 2 2015-11-26 00:00:00 19 M Canada 120
## 3 2014-03-23 00:00:00 49 M Australia 120
## 4 2016-03-23 00:00:00 49 M Australia 120
## 5 2014-05-15 00:00:00 47 F Australia 120
## 6 2016-05-15 00:00:00 47 F Australia 120
## 7 2014-05-22 00:00:00 47 F Australia 120
## 8 2016-05-22 00:00:00 47 F Australia 120
## 9 2014-02-22 00:00:00 35 M Australia 120
## 10 2016-02-22 00:00:00 35 M Australia 120
## # ℹ 113,026 more rows
salesbike %>%
select(Date:Country, State='Unit_Price')
## # A tibble: 113,036 × 9
## Date Day Month Year Customer_Age Age_Group Customer_Gender
## <dttm> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2013-11-26 00:00:00 26 Novem… 2013 19 Youth (<… M
## 2 2015-11-26 00:00:00 26 Novem… 2015 19 Youth (<… M
## 3 2014-03-23 00:00:00 23 March 2014 49 Adults (… M
## 4 2016-03-23 00:00:00 23 March 2016 49 Adults (… M
## 5 2014-05-15 00:00:00 15 May 2014 47 Adults (… F
## 6 2016-05-15 00:00:00 15 May 2016 47 Adults (… F
## 7 2014-05-22 00:00:00 22 May 2014 47 Adults (… F
## 8 2016-05-22 00:00:00 22 May 2016 47 Adults (… F
## 9 2014-02-22 00:00:00 22 Febru… 2014 35 Adults (… M
## 10 2016-02-22 00:00:00 22 Febru… 2016 35 Adults (… M
## # ℹ 113,026 more rows
## # ℹ 2 more variables: Country <chr>, State <dbl>
salesbike %>%
select(-'Unit_Cost', -'Unit_Price')
## # A tibble: 113,036 × 16
## Date Day Month Year Customer_Age Age_Group Customer_Gender
## <dttm> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2013-11-26 00:00:00 26 Novem… 2013 19 Youth (<… M
## 2 2015-11-26 00:00:00 26 Novem… 2015 19 Youth (<… M
## 3 2014-03-23 00:00:00 23 March 2014 49 Adults (… M
## 4 2016-03-23 00:00:00 23 March 2016 49 Adults (… M
## 5 2014-05-15 00:00:00 15 May 2014 47 Adults (… F
## 6 2016-05-15 00:00:00 15 May 2016 47 Adults (… F
## 7 2014-05-22 00:00:00 22 May 2014 47 Adults (… F
## 8 2016-05-22 00:00:00 22 May 2016 47 Adults (… F
## 9 2014-02-22 00:00:00 22 Febru… 2014 35 Adults (… M
## 10 2016-02-22 00:00:00 22 Febru… 2016 35 Adults (… M
## # ℹ 113,026 more rows
## # ℹ 9 more variables: Country <chr>, State <chr>, Product_Category <chr>,
## # Sub_Category <chr>, Product <chr>, Order_Quantity <dbl>, Profit <dbl>,
## # Cost <dbl>, Revenue <dbl>
salesbike %>%
select(Date:Product, Unit_Price='Unit_Price') %>%
mutate(is_collab = grepl('Bike', Sub_Category) & grepl('Hitch Rack', Product)) %>%
select(Sub_Category,Product, is_collab, everything()) #La función grepl() se utiliza para buscar patrones en texto, y en este caso, está buscando la presencia de la palabra 'Featuring' en la columna "Product".
## # A tibble: 113,036 × 14
## Sub_Category Product is_collab Date Day Month Year
## <chr> <chr> <lgl> <dttm> <dbl> <chr> <dbl>
## 1 Bike Racks Hitch Rack - 4-… TRUE 2013-11-26 00:00:00 26 Nove… 2013
## 2 Bike Racks Hitch Rack - 4-… TRUE 2015-11-26 00:00:00 26 Nove… 2015
## 3 Bike Racks Hitch Rack - 4-… TRUE 2014-03-23 00:00:00 23 March 2014
## 4 Bike Racks Hitch Rack - 4-… TRUE 2016-03-23 00:00:00 23 March 2016
## 5 Bike Racks Hitch Rack - 4-… TRUE 2014-05-15 00:00:00 15 May 2014
## 6 Bike Racks Hitch Rack - 4-… TRUE 2016-05-15 00:00:00 15 May 2016
## 7 Bike Racks Hitch Rack - 4-… TRUE 2014-05-22 00:00:00 22 May 2014
## 8 Bike Racks Hitch Rack - 4-… TRUE 2016-05-22 00:00:00 22 May 2016
## 9 Bike Racks Hitch Rack - 4-… TRUE 2014-02-22 00:00:00 22 Febr… 2014
## 10 Bike Racks Hitch Rack - 4-… TRUE 2016-02-22 00:00:00 22 Febr… 2016
## # ℹ 113,026 more rows
## # ℹ 7 more variables: Customer_Age <dbl>, Age_Group <chr>,
## # Customer_Gender <chr>, Country <chr>, State <chr>, Product_Category <chr>,
## # Unit_Price <dbl>
salesbike %>%
select(Date, Customer_Age, Customer_Gender, Country, Unit_Price='Unit_Price') %>%
filter(Unit_Price >= 110, Customer_Age >= '30' & Customer_Gender == 'F')
## # A tibble: 9,324 × 5
## Date Customer_Age Customer_Gender Country Unit_Price
## <dttm> <dbl> <chr> <chr> <dbl>
## 1 2014-05-15 00:00:00 47 F Australia 120
## 2 2016-05-15 00:00:00 47 F Australia 120
## 3 2014-05-22 00:00:00 47 F Australia 120
## 4 2016-05-22 00:00:00 47 F Australia 120
## 5 2013-07-30 00:00:00 32 F Australia 120
## 6 2015-07-30 00:00:00 32 F Australia 120
## 7 2014-01-02 00:00:00 48 F Canada 120
## 8 2016-01-02 00:00:00 48 F Canada 120
## 9 2014-03-13 00:00:00 48 F Canada 120
## 10 2016-03-13 00:00:00 48 F Canada 120
## # ℹ 9,314 more rows
salesbike %>%
select(Date:Customer_Gender, Unit_Price='Unit_Price') %>%
filter(Customer_Gender == 'M')
## # A tibble: 58,312 × 8
## Date Day Month Year Customer_Age Age_Group Customer_Gender
## <dttm> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2013-11-26 00:00:00 26 Novem… 2013 19 Youth (<… M
## 2 2015-11-26 00:00:00 26 Novem… 2015 19 Youth (<… M
## 3 2014-03-23 00:00:00 23 March 2014 49 Adults (… M
## 4 2016-03-23 00:00:00 23 March 2016 49 Adults (… M
## 5 2014-02-22 00:00:00 22 Febru… 2014 35 Adults (… M
## 6 2016-02-22 00:00:00 22 Febru… 2016 35 Adults (… M
## 7 2013-07-15 00:00:00 15 July 2013 34 Young Ad… M
## 8 2015-07-15 00:00:00 15 July 2015 34 Young Ad… M
## 9 2013-08-02 00:00:00 2 August 2013 29 Young Ad… M
## 10 2015-08-02 00:00:00 2 August 2015 29 Young Ad… M
## # ℹ 58,302 more rows
## # ℹ 1 more variable: Unit_Price <dbl>
distinct <- salesbike %>%
select(Date:Customer_Gender, Unit_Price='Unit_Price') %>%
filter(Customer_Gender == 'M') %>%
distinct(Customer_Age)
salesbike %>%
select(Date:Customer_Gender, Product, Unit_Price='Unit_Price') %>%
filter(Customer_Gender == 'M'| Customer_Gender =='F') %>%
group_by(Product) %>% #solo funciono con esta variable group_by
summarise(total_Unit_Price = mean(Unit_Price))
## # A tibble: 133 × 2
## Product total_Unit_Price
## <chr> <dbl>
## 1 AWC Logo Cap 9
## 2 All-Purpose Bike Stand 159
## 3 Bike Wash - Dissolver 8
## 4 Classic Vest, L 64
## 5 Classic Vest, M 64
## 6 Classic Vest, S 64
## 7 Fender Set - Mountain 22
## 8 HL Mountain Tire 35
## 9 HL Road Tire 33
## 10 Half-Finger Gloves L 24
## # ℹ 123 more rows
salesbike %>%
select(Date:Customer_Gender, Unit_Price='Unit_Price') %>%
filter(Customer_Gender == 'M') %>%
group_by(Year) %>%
summarise(total_Unit_Price = mean(Unit_Price)) %>%
arrange(desc(total_Unit_Price), Year) %>%
head(10)
## # A tibble: 6 × 2
## Year total_Unit_Price
## <dbl> <dbl>
## 1 2011 1890.
## 2 2012 1890.
## 3 2013 456.
## 4 2015 456.
## 5 2014 304.
## 6 2016 304.
salesbike %>%
select(Date:Customer_Gender, Unit_Price='Unit_Price') %>%
count(Customer_Gender) %>%
arrange(desc(n))
## # A tibble: 2 × 2
## Customer_Gender n
## <chr> <int>
## 1 M 58312
## 2 F 54724
#Calcular el promedio y la desviación estándar por genero
salesbike %>%
select(Date, Customer_Age, Customer_Gender, Country, 'Revenue')
## # A tibble: 113,036 × 5
## Date Customer_Age Customer_Gender Country Revenue
## <dttm> <dbl> <chr> <chr> <dbl>
## 1 2013-11-26 00:00:00 19 M Canada 950
## 2 2015-11-26 00:00:00 19 M Canada 950
## 3 2014-03-23 00:00:00 49 M Australia 2401
## 4 2016-03-23 00:00:00 49 M Australia 2088
## 5 2014-05-15 00:00:00 47 F Australia 418
## 6 2016-05-15 00:00:00 47 F Australia 522
## 7 2014-05-22 00:00:00 47 F Australia 379
## 8 2016-05-22 00:00:00 47 F Australia 190
## 9 2014-02-22 00:00:00 35 M Australia 2086
## 10 2016-02-22 00:00:00 35 M Australia 1991
## # ℹ 113,026 more rows
# Male
sm<-salesbike %>% filter(Customer_Gender=="M") %>%
summarise(mean=mean(Revenue),sd=sd(Revenue))
sm
## # A tibble: 1 × 2
## mean sd
## <dbl> <dbl>
## 1 743. 1314.
# Femane
sf<-salesbike %>% filter(Customer_Gender=="F") %>%
summarise(mean=mean(Revenue),sd=sd(Revenue))
sf
## # A tibble: 1 × 2
## mean sd
## <dbl> <dbl>
## 1 766. 1303.
#Promedio de ganancias por país
salesbike %>%
select(Date:Country, Revenue='Revenue') %>%
group_by(Country) %>%
summarise(total_Revenue = mean(Revenue)) %>%
arrange(desc(total_Revenue), Country) %>%
head(10)
## # A tibble: 6 × 2
## Country total_Revenue
## <chr> <dbl>
## 1 Australia 890.
## 2 Germany 809.
## 3 United Kingdom 782.
## 4 France 767.
## 5 United States 714.
## 6 Canada 560.
#Promedio ganancias por año
salesbike %>%
select(Date:Year, Revenue='Revenue') %>%
group_by(Year) %>%
summarise(total_Revenue = mean(Revenue)) %>%
arrange(desc(total_Revenue), Year) %>%
head(10)
## # A tibble: 6 × 2
## Year total_Revenue
## <dbl> <dbl>
## 1 2012 3428.
## 2 2011 3349.
## 3 2015 819.
## 4 2013 623.
## 5 2016 603.
## 6 2014 481.
salesbike
## # A tibble: 113,036 × 18
## Date Day Month Year Customer_Age Age_Group Customer_Gender
## <dttm> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2013-11-26 00:00:00 26 Novem… 2013 19 Youth (<… M
## 2 2015-11-26 00:00:00 26 Novem… 2015 19 Youth (<… M
## 3 2014-03-23 00:00:00 23 March 2014 49 Adults (… M
## 4 2016-03-23 00:00:00 23 March 2016 49 Adults (… M
## 5 2014-05-15 00:00:00 15 May 2014 47 Adults (… F
## 6 2016-05-15 00:00:00 15 May 2016 47 Adults (… F
## 7 2014-05-22 00:00:00 22 May 2014 47 Adults (… F
## 8 2016-05-22 00:00:00 22 May 2016 47 Adults (… F
## 9 2014-02-22 00:00:00 22 Febru… 2014 35 Adults (… M
## 10 2016-02-22 00:00:00 22 Febru… 2016 35 Adults (… M
## # ℹ 113,026 more rows
## # ℹ 11 more variables: Country <chr>, State <chr>, Product_Category <chr>,
## # Sub_Category <chr>, Product <chr>, Order_Quantity <dbl>, Unit_Cost <dbl>,
## # Unit_Price <dbl>, Profit <dbl>, Cost <dbl>, Revenue <dbl>