Your Exercise

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v readr   1.3.1
## -- Conflicts --------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

In this section, you are expected to be more confident to create your own function. Here I advise you to create a function for each tasks bellow:

A. Univariate variable (one dimension)

Here we assign “a” as the data we’re going to test

a<- c(2, 3 ,5 ,4 ,6 ,7 ,4 ,3 ,2 ,5 ,40 ,15 ,4 ,4)
a
##  [1]  2  3  5  4  6  7  4  3  2  5 40 15  4  4

1. average

\[ Average={{x_1+x_2+...+x_n} \over n}={\Sigma x_i \over n} \]

Average <- function(x)
{
  sum(x)/length(x)
}
Average(a)
## [1] 7.428571

2. middle_value

\[Untuk\space data \space ganjil: Median=x_{{n+1 \over 2}} \\ Untuk\space data\space genap:Median={x_{n\over 2}+x_{{n\over 2}+1}}\]

middle_value <- function(x){
  # Order Vector Ascending
  sorted <- sort(x)
  # assign the length of x as n
  n <- length(x)
  
  ifelse(n%%2==0, 
         Average(sorted[(n/2):(n/2+1)]), 
         sorted[(n+1)/2])}
middle_value(a)
## [1] 4

3. most_frequent

most_frequent <- function(x) {                   
  unique_x <- unique(x)
  tabulate_x <- tabulate(match(x, unique_x))
  unique_x[tabulate_x == max(tabulate_x)]}
most_frequent(a)
## [1] 4

4. max_value

library(dplyr)
max_value <- function(x){
  sort(x) %>% 
    last()
}
max_value(a)
## [1] 40

5. min_value

library(dplyr)
min_value <- function(x){
  sort(x) %>% 
    first()
}
min_value(a)
## [1] 2

6. variance

\[S^2={{\Sigma (x_i-\bar x)^2 \over {n-1}}}\]

variance <- function(x) {
  n <- length(x)
  (sum((x-Average(x))^2))/(n-1)
}
variance(a)
## [1] 98.26374

7. standard_deviation

\[S=\sqrt{S^2}\]

standard_deviation <- function(x) {
  n <- length(x)
  ((sum((x-Average(x))^2))/(n-1))^(1/2)
}
standard_deviation(a)
## [1] 9.912807

8. Outliers

Outliers <- function(x){
  sorted <- sort(x)
  n <- length(x)
  Q_1 <- ifelse(n%%2==1&&(n+1)%%4==0,
                sorted[(n+1)/4],
                ifelse(n%%2==1&&(n+1)%%4==2,
                       mean(sorted[((n-1)/4):((n+3)/4)]),
                       ifelse(n%%2==0 && n%%4==0,
                              mean(sorted[((n)/4):((n/4)+1)]),
                              sorted[(n+2)/4])))
  Q_3 <- ifelse(n%%2==1&&(n+1)%%4==0,
                sorted[3*(n+1)/4],
                ifelse(n%%2==1&&(n+1)%%4==2,
                       mean(sorted[((3*n+1)/4):((3*n+5)/4)]),
                       ifelse(n%%2==0 && n%%4==0,
                              mean(sorted[((3*n)/4):((3*n/4)+1)]),
                              sorted[(3*n+2)/4])))
  IQR <- Q_3-Q_1
  Gate_Minor<- c(Q_1- IQR * 1.5, Q_3 + IQR * 1.5)
  Gate_Major<- c(Q_1- IQR * 3, Q_3 + IQR * 3)
  Outlier_Minor <- x[(x<Gate_Minor[1] & x>=Gate_Major[1]) | 
                       (x>Gate_Minor[2] & x<=Gate_Major[2])]
  Outlier_Major <- x[(x<Gate_Major[1] | x>Gate_Major[2])]
  result <- paste("Outlier Major =", sep = " ", Outlier_Major, " and Outlier Minor = ", Outlier_Minor) 

  return(result)
}
Outliers(a)
## [1] "Outlier Major = 40  and Outlier Minor =  15"

9. summary

library(dplyr)
Own_Summary <- function(x){
  n <- length(x)
  sorted <- sort(x)
  Average <- round(sum(x)/n, 2)
  Middle_Value <- ifelse(n%%2==0,
                         Average(sorted[(n/2):(n/2+1)]),
                         sorted[(n+1)/2])
  unique_x <- unique(x)
  tabulate_x <- tabulate(match(x, unique_x))
  Most_Frequent <- unique_x[tabulate_x == max(tabulate_x)]
  Max_Value <- last(sorted)
  Min_Value <- first(sorted)
  Variance_Sample <- round((sum((x-Average(x))^2))/(n-1),2)
  Variance_Population <- round((sum((x-Average(x))^2))/(n),2)
  Standard_Deviation_Sample <- round(sqrt((sum((x-Average(x))^2))/(n-1)),2)
  Standard_Deviation_Population <- round(sqrt((sum((x-Average(x))^2))/(n)),2)
  Q_1 <- ifelse(n%%2==1&&(n+1)%%4==0,
                sorted[(n+1)/4],
                ifelse(n%%2==1&&(n+1)%%4==2,
                       mean(sorted[((n-1)/4):((n+3)/4)]),
                       ifelse(n%%2==0 && n%%4==0,
                              mean(sorted[((n)/4):((n/4)+1)]),
                              sorted[(n+2)/4])))
  Q_3 <- ifelse(n%%2==1&&(n+1)%%4==0,
                sorted[3*(n+1)/4],
                ifelse(n%%2==1&&(n+1)%%4==2,
                       mean(sorted[((3*n+1)/4):((3*n+5)/4)]),
                       ifelse(n%%2==0 && n%%4==0,
                              mean(sorted[((3*n)/4):((3*n/4)+1)]),
                              sorted[(3*n+2)/4])))
  IQR <- Q_3-Q_1
  Gate_Minor<- c(Q_1- IQR * 1.5, Q_3 + IQR * 1.5)
  Gate_Major<- c(Q_1- IQR * 3, Q_3 + IQR * 3)
  Outlier_Minor <- x[(x<Gate_Minor[1] & x>=Gate_Major[1]) | 
                       (x>Gate_Minor[2] & x<=Gate_Major[2])]
  Outlier_Major <- x[(x<Gate_Major[1] | x>Gate_Major[2])]
  
  result <- matrix(c(Average, 
                     Middle_Value, 
                     Most_Frequent,
                     Max_Value, 
                     Min_Value, 
                     Variance_Sample, 
                     Variance_Population, 
                     Standard_Deviation_Sample, 
                     Standard_Deviation_Population, 
                     Outlier_Minor,
                     Outlier_Major), 
                   1, 11, 
                   dimnames = list("Value", 
                                   c("Mean", 
                                     "Med", 
                                     "Mode", 
                                     "Max", 
                                     "Min",
                                     "Var.S",
                                     "Var.P",
                                     "StDev.S",
                                     "StDev.P",
                                     "Outlier.Min",
                                     "Outlier.Maj")))
  return(Most_Frequent)
}  
Own_Summary(a)
## [1] 4

B. Multivariate variable (more dimension)

here we assign x_i as the data value, and f_i as the data frequency

x_i <- c(1,2,3,4,5,6)
f_i <- c(4,2,6,5,4,7)
rbind(x_i, f_i)
##     [,1] [,2] [,3] [,4] [,5] [,6]
## x_i    1    2    3    4    5    6
## f_i    4    2    6    5    4    7

1. average

\[Average={{\Sigma (f_i \bullet x_i)}\over{\Sigma f_i}}\]

Average_Freq <- function(xi, fi){
  sum(xi*fi)/(sum(fi))
}
Average_Freq(x_i, f_i)
## [1] 3.857143
  • middle_value
  • most_frequent

2. max_value

library(dplyr)
max_value_freq <- function(x,y){
  sort(x) %>% 
    last()
}
max_value_freq(x_i, f_i)
## [1] 6

3. min_value

library(dplyr)
min_value_freq <- function(x,y){
  sort(x) %>% 
    first()
}
min_value_freq(x_i, f_i)
## [1] 1

4. variance

\[S^2={{\Sigma(f_i (x_i-\bar x)^2) \over {n-1}}}, \space \space for \space n=\Sigma f_i\]

variance_freq <- function(x,y) {
  n <- sum(y)
  var_s <- round(sum(((x-Average_Freq(x,y))^2)*y)/(n-1),2)
  var_p <- round(sum(((x-Average_Freq(x,y))^2)*y)/(n),2)
  result <- paste(var_s, " is sample variance and ", 
                  var_p, " is population variance.")
  return(result)
}
variance_freq(x_i,f_i)
## [1] "3.02  is sample variance and  2.91  is population variance."

5. standard_deviation

\[S=\sqrt{S^2}\]

standard_deviation_freq <- function(x,y) {
  n <- sum(y)
  stdev_s <- round(sqrt(sum(((x-Average_Freq(x,y))^2)*y)/(n-1)),2)
  stdev_p <- round(sqrt(sum(((x-Average_Freq(x,y))^2)*y)/(n)),2)
  result <- paste(stdev_s, "is sample standard deviation and ", 
                  stdev_p, " is population standard deviation.")
  return(result)
}
standard_deviation_freq(x_i, f_i)
## [1] "1.74 is sample standard deviation and  1.71  is population standard deviation."
  • Outliers

6. summary

Summary_Freq <- function(x,y){
  n <- sum(y)
  average <- round(sum(x*y)/n,2)
  max <- tail(sort(x),1)
  min <- head(sort(x),1)
  var_s <- round(sum(((x-Average_Freq(x,y))^2)*y)/(n-1),2)
  var_p <- round(sum(((x-Average_Freq(x,y))^2)*y)/(n),2)
  stdev_s <- round(sqrt(sum(((x-Average_Freq(x,y))^2)*y)/(n-1)),2)
  stdev_p <- round(sqrt(sum(((x-Average_Freq(x,y))^2)*y)/(n)),2)
  result <- matrix(c(average, 
                     max, 
                     min, 
                     var_s, 
                     var_p, 
                     stdev_s, 
                     stdev_p), 
                   1, 7,
                   dimnames = list("Value", c("Mean", 
                                "Max", 
                                "Min", 
                                "Var.S", 
                                "Var.P", 
                                "StDev.S", 
                                "StDev.P")))
  return(result)
}
Summary_Freq(x_i, f_i)
##       Mean Max Min Var.S Var.P StDev.S StDev.P
## Value 3.86   6   1  3.02  2.91    1.74    1.71

C. Simple Case Example

Id       <- (1:5000)
Date     <- seq(as.Date("2018/01/01"), by = "day", length.out = 5000)

Name     <- sample(c("Angel","Sherly","Vanessa","Irene","Julian","Jeffry","Nikita","Kefas","Siana","Lala",
               "Fallen","Ardifo","Kevin","Michael","Felisha","Calisha","Patricia","Naomi","Eric","Jacob"),
               5000, replace = T)

City     <- sample(rep(c("Jakarta","Bogor","Depok","Tangerang","Bekasi"), times = 1000))

Outlet   <- sample(c("Outlet 1","Outlet 2","Outlet 3","Outlet 4","Outlet 5"),5000, replace = T)

Menu     <- c("Cappucino","Es Kopi Susu","Hot Caramel Latte","Hot Chocolate","Hot Red Velvet Latte","Ice Americano",
              "Ice Berry Coffee","Ice Cafe Latte","Ice Caramel Latte","Ice Coffee Avocado","Ice Coffee Lite",
              "Ice Matcha Espresso","Ice Matcha Latte","Ice Red Velvet Latte")
all_menu <- sample(Menu, 5000, replace = T)
Price    <- sample(18000:45000,14, replace = T)
DFPrice  <- data.frame(Menu, Price)
library(dplyr)
Menu_Price <- left_join(data.frame(Menu = all_menu),DFPrice)
## Joining, by = "Menu"
KopiKenangan <- cbind(data.frame(Id,
                                 Date,
                                 Name,
                                 City,
                                 Outlet),
                                 Menu_Price)
head(KopiKenangan,5)
##   Id       Date     Name      City   Outlet                 Menu Price
## 1  1 2018-01-01    Kefas   Jakarta Outlet 3      Ice Coffee Lite 25536
## 2  2 2018-01-02   Nikita Tangerang Outlet 2 Hot Red Velvet Latte 43454
## 3  3 2018-01-03 Patricia     Depok Outlet 2         Es Kopi Susu 33074
## 4  4 2018-01-04  Calisha Tangerang Outlet 5 Hot Red Velvet Latte 43454
## 5  5 2018-01-05 Patricia Tangerang Outlet 3         Es Kopi Susu 33074

Let’s say, you have a data set already in your hand as you can see above. Please create a function to calculate the following tasks:

1. The percentage of sales for each city.

Percentage <- function(x){
  percent <- round(x*100, 1)
  result <- paste(percent, sep = "", "%")
  return(result)
}
City_Sales <- aggregate(Price ~ City, data = KopiKenangan, sum)
Total_Sales <- sum(City_Sales$Price)
City_Sales$Percentage_City_Sales <- Percentage(City_Sales$Price/Total_Sales)
rename(City_Sales, "Total_Sales"="Price")
##        City Total_Sales Percentage_City_Sales
## 1    Bekasi    30848156                 19.8%
## 2     Bogor    30959415                 19.9%
## 3     Depok    30998148                 19.9%
## 4   Jakarta    31390724                 20.2%
## 5 Tangerang    31367534                 20.2%

2. The frequency of Name and Menu.

table(KopiKenangan$Name,KopiKenangan$Menu)
##           
##            Cappucino Es Kopi Susu Hot Caramel Latte Hot Chocolate
##   Angel           15           21                15            10
##   Ardifo          22           20                18            12
##   Calisha         17           13                20            19
##   Eric            16           11                13            16
##   Fallen          19           16                19            15
##   Felisha         23           19                22            11
##   Irene           17           20                18            18
##   Jacob           13           16                13            21
##   Jeffry          18           18                19            16
##   Julian          24           14                16            17
##   Kefas           13           22                25            16
##   Kevin           23           18                16            18
##   Lala            12           12                14            15
##   Michael         20           24                15            23
##   Naomi           16           14                27            12
##   Nikita          18           25                24            10
##   Patricia        17           17                12            21
##   Sherly          20           21                15            16
##   Siana           20           18                18            19
##   Vanessa         19           11                17            18
##           
##            Hot Red Velvet Latte Ice Americano Ice Berry Coffee Ice Cafe Latte
##   Angel                      14            15               25             11
##   Ardifo                     12            18               26             15
##   Calisha                    18            12               21             14
##   Eric                       20            21               17             15
##   Fallen                     13            14               25             17
##   Felisha                    27            14               18             14
##   Irene                      10            26               16             21
##   Jacob                      15            20               19             19
##   Jeffry                     20            23               13             22
##   Julian                     18            17               15             16
##   Kefas                      20            14               18             15
##   Kevin                      19            16               18             14
##   Lala                       14            16               21              7
##   Michael                    26            14               12             12
##   Naomi                      17            17               17             13
##   Nikita                     20            13               17             14
##   Patricia                   21            14               16             15
##   Sherly                     16            24               23             14
##   Siana                      29            20               23             17
##   Vanessa                    19            19               18             16
##           
##            Ice Caramel Latte Ice Coffee Avocado Ice Coffee Lite
##   Angel                   18                 23              19
##   Ardifo                  23                 27              16
##   Calisha                 15                 20              16
##   Eric                    18                 18              21
##   Fallen                  13                 21              17
##   Felisha                 20                 15              18
##   Irene                   12                 23              22
##   Jacob                   15                 18              19
##   Jeffry                  16                 19              19
##   Julian                  17                 24              15
##   Kefas                   20                 18              11
##   Kevin                   16                 11              20
##   Lala                    13                 16              16
##   Michael                 25                 21              18
##   Naomi                   14                 17              27
##   Nikita                  16                 12              19
##   Patricia                20                 13              17
##   Sherly                  22                 17              15
##   Siana                   21                  8              21
##   Vanessa                 22                 23              20
##           
##            Ice Matcha Espresso Ice Matcha Latte Ice Red Velvet Latte
##   Angel                     15               17                   23
##   Ardifo                    20               18                   20
##   Calisha                   21               15                   15
##   Eric                      14               15                   17
##   Fallen                    16               12                   20
##   Felisha                   22               19                   25
##   Irene                     15               15                   20
##   Jacob                     18               17                   30
##   Jeffry                    18               17                   23
##   Julian                    15               19                   17
##   Kefas                     18               26                   16
##   Kevin                     21               24                   21
##   Lala                      20               20                   19
##   Michael                   16               30                   12
##   Naomi                     16               20                   10
##   Nikita                    24               10                   26
##   Patricia                  19               22                   21
##   Sherly                    17               14                   22
##   Siana                     24               16                   17
##   Vanessa                   24               17                   19

3. The Average of monthly sales per-menu item.

library(tidyverse)
library(dplyr)
yearmonth <- KopiKenangan%>%
  separate(Date, c("Year", "Month", "Day"), sep = "-") %>%
  select(Year, Month)
number_of_months <- paste(yearmonth$Year, sep="-", yearmonth$Month) %>%
  table() %>%
  length()
Menu_Freq1 <- aggregate(Price~Menu, data = KopiKenangan, sum)
Menu_Freq1$Monthly_Avg_Sales <- Menu_Freq1$Price/number_of_months
Menu_Freq <- as.data.frame(table(KopiKenangan$Menu))
Menu_Freq$Monthly_Avg_Qty <- round(Menu_Freq$Freq/number_of_months,0)
Menu_Freq <- rename(Menu_Freq, "Menu"="Var1", "Qty"="Freq")
Menu_Monthly_Sales <- left_join(Menu_Freq, Menu_Freq1, by = "Menu")
Menu_Monthly_Sales
##                    Menu Qty Monthly_Avg_Qty    Price Monthly_Avg_Sales
## 1             Cappucino 362               2 14541540          88130.55
## 2          Es Kopi Susu 350               2 11575900          70156.97
## 3     Hot Caramel Latte 356               2 13544020          82084.97
## 4         Hot Chocolate 323               2  9866681          59798.07
## 5  Hot Red Velvet Latte 368               2 15991072          96915.59
## 6         Ice Americano 347               2 13598236          82413.55
## 7      Ice Berry Coffee 378               2 13868820          84053.45
## 8        Ice Cafe Latte 301               2  6438390          39020.55
## 9     Ice Caramel Latte 356               2  9629800          58362.42
## 10   Ice Coffee Avocado 364               2  7300748          44246.96
## 11      Ice Coffee Lite 366               2  9346176          56643.49
## 12  Ice Matcha Espresso 373               2  8156391          49432.67
## 13     Ice Matcha Latte 363               2 12453804          75477.60
## 14 Ice Red Velvet Latte 393               2  9252399          56075.15