##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.0
## v readr 1.3.1
## -- Conflicts --------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
In this section, you are expected to become more confident in writing your own functions. I advise you to create a function for each of the tasks below:
Here we assign “a” as the data we’re going to test
## [1] 2 3 5 4 6 7 4 3 2 5 40 15 4 4
\[ Average={{x_1+x_2+...+x_n} \over n}={\Sigma x_i \over n} \]
## [1] 7.428571
\[For\space odd\space n: Median=x_{{n+1 \over 2}} \\ For\space even\space n: Median={{x_{n\over 2}+x_{{n\over 2}+1}} \over 2}\]
# Median ("middle value") of a numeric vector.
#
# x: numeric vector of observations.
#
# Returns the middle element of the sorted data when the number of sorted
# observations is odd, and the Average() of the two middle elements when it
# is even.
#
# NOTE(review): Average() is defined elsewhere in this document (presumably
# the arithmetic mean) - confirm it is in scope before calling.
middle_value <- function(x) {
  # Order the vector ascending so that positions correspond to ranks.
  sorted <- sort(x)
  # Count the sorted values rather than `x`: sort() drops NAs, so using
  # length(x) would shift the middle position whenever `x` contains NAs.
  n <- length(sorted)
  # Plain if/else: the condition is a single TRUE/FALSE, not a vector.
  if (n %% 2 == 0) {
    Average(sorted[(n / 2):(n / 2 + 1)])
  } else {
    sorted[(n + 1) / 2]
  }
}
# Apply middle_value() to the sample vector `a`; the echoed console output
# is kept on the line below.
middle_value(a)
## [1] 4
# Mode(s) of a vector: every value that occurs the maximum number of times.
#
# x: vector of observations.
#
# Returns the most frequent value(s), in order of first appearance in `x`;
# ties are all returned.
most_frequent <- function(x) {
  # Distinct values, kept in the order they first appear.
  candidates <- unique(x)
  # Translate each observation into the position of its distinct value,
  # then count how often each position occurs.
  positions <- match(x, candidates)
  hits <- tabulate(positions)
  # Keep every candidate whose count equals the highest count.
  is_top <- hits == max(hits)
  candidates[is_top]
}
# Apply most_frequent() to the sample vector `a`; the echoed console output
# is kept on the line below.
most_frequent(a)
## [1] 4
\[S^2={{\Sigma (x_i-\bar x)^2 \over {n-1}}}\]
## [1] 98.26374
\[S=\sqrt{S^2}\]
# Sample standard deviation: square root of the sum of squared deviations
# from Average(x), divided by (n - 1).
#
# x: numeric vector of observations.
#
# NOTE(review): Average() is defined elsewhere in this document (presumably
# the arithmetic mean) - confirm it is in scope before calling.
standard_deviation <- function(x) {
  deviations <- x - Average(x)
  sum_of_squares <- sum(deviations^2)
  sample_variance <- sum_of_squares / (length(x) - 1)
  # Square root written as a power, as in the formula S = sqrt(S^2).
  sample_variance^(1 / 2)
}
# Apply standard_deviation() to the sample vector `a`; the echoed console
# output is kept on the line below.
standard_deviation(a)
## [1] 9.912807
# Report minor and major outliers of a numeric vector using quartile fences.
#
# x: numeric vector of observations (missing values are ignored).
#
# Q1 and Q3 are the medians of the lower and upper halves of the sorted data
# (the overall median is left out of both halves when the length is odd).
# Minor fences lie 1.5 * IQR beyond the quartiles, major fences 3 * IQR.
# A value outside the major fences is a major outlier; a value outside the
# minor fences but not outside the major fences is a minor outlier.
#
# Returns a single character string of the form
#   "Outlier Major = <values>  and Outlier Minor =  <values>"
# where several values are separated by ", " and an empty slot means that no
# outlier of that kind was found.
Outliers <- function(x) {
  # sort() drops NAs, so drop them from `x` too; otherwise the length and the
  # rank positions used below would disagree.
  x <- x[!is.na(x)]
  sorted <- sort(x)
  n <- length(sorted)

  # Median of an already sorted vector (NA when the vector is empty).
  middle_of <- function(v) {
    k <- length(v)
    if (k == 0) {
      NA_real_
    } else if (k %% 2 == 0) {
      mean(v[k / 2 + 0:1])
    } else {
      v[(k + 1) / 2]
    }
  }

  # Each half holds floor(n / 2) values.
  half <- n %/% 2
  q1 <- middle_of(sorted[seq_len(half)])
  q3 <- middle_of(sorted[n - half + seq_len(half)])
  # Lower-case name so that stats::IQR() is not shadowed.
  iqr <- q3 - q1

  gate_minor <- c(q1 - iqr * 1.5, q3 + iqr * 1.5)
  gate_major <- c(q1 - iqr * 3, q3 + iqr * 3)

  # which() discards NA comparisons, so inputs too short to have quartiles
  # simply report no outliers instead of "NA".
  minor_at <- which((x < gate_minor[1] & x >= gate_major[1]) |
                      (x > gate_minor[2] & x <= gate_major[2]))
  major_at <- which(x < gate_major[1] | x > gate_major[2])

  # Collapse each group first so that exactly one string is returned no
  # matter how many outliers were found.
  major_text <- paste(x[major_at], collapse = ", ")
  minor_text <- paste(x[minor_at], collapse = ", ")
  paste("Outlier Major =", major_text, " and Outlier Minor = ", minor_text,
        sep = " ")
}
# Apply Outliers() to the sample vector `a`; the echoed console output is
# kept on the line below.
Outliers(a)
## [1] "Outlier Major = 40 and Outlier Minor = 15"
library(dplyr)
# One-row summary of a numeric vector.
#
# x: numeric vector of observations (missing values are ignored).
#
# Returns a 1 x 11 matrix with row name "Value" and the columns
#   Mean, Med, Mode, Max, Min, Var.S, Var.P, StDev.S, StDev.P,
#   Outlier.Min, Outlier.Maj
# Mean, variances and standard deviations are rounded to two decimals.
# Quartiles are the medians of the lower and upper halves of the sorted data;
# minor fences lie 1.5 * IQR and major fences 3 * IQR beyond the quartiles.
#
# A cell that has no value (e.g. no outliers) is NA. A cell that has several
# values (tied modes, several outliers) holds them as one ", "-separated
# string, in which case the whole matrix is character instead of numeric.
Own_Summary <- function(x) {
  # sort() drops NAs, so drop them from `x` too; otherwise the length and the
  # rank positions used below would disagree.
  x <- x[!is.na(x)]
  n <- length(x)
  sorted <- sort(x)

  # Median of an already sorted vector (NA when the vector is empty).
  middle_of <- function(v) {
    k <- length(v)
    if (k == 0) {
      NA_real_
    } else if (k %% 2 == 0) {
      mean(v[k / 2 + 0:1])
    } else {
      v[(k + 1) / 2]
    }
  }

  # Squeeze a result of any length into exactly one matrix cell, so that the
  # eleven columns can never drift out of alignment.
  as_cell <- function(v) {
    if (length(v) == 0) {
      NA
    } else if (length(v) == 1) {
      v
    } else {
      paste(v, collapse = ", ")
    }
  }

  # Centre and spread. The mean is computed locally (sum / n) and kept
  # unrounded for the deviations; only the reported figures are rounded.
  mean_x <- sum(x) / n
  sum_of_squares <- sum((x - mean_x)^2)

  # Mode(s): every distinct value that reaches the highest count.
  distinct <- unique(x)
  counts <- tabulate(match(x, distinct))
  modes <- distinct[counts == max(counts)]

  # Quartiles from the two halves of the sorted data, then the fences.
  half <- n %/% 2
  q1 <- middle_of(sorted[seq_len(half)])
  q3 <- middle_of(sorted[n - half + seq_len(half)])
  iqr <- q3 - q1
  gate_minor <- c(q1 - iqr * 1.5, q3 + iqr * 1.5)
  gate_major <- c(q1 - iqr * 3, q3 + iqr * 3)

  # which() discards NA comparisons, so inputs too short to have quartiles
  # report no outliers.
  minor_at <- which((x < gate_minor[1] & x >= gate_major[1]) |
                      (x > gate_minor[2] & x <= gate_major[2]))
  major_at <- which(x < gate_major[1] | x > gate_major[2])

  cells <- c(
    as_cell(round(mean_x, 2)),
    as_cell(middle_of(sorted)),
    as_cell(modes),
    as_cell(sorted[n]),
    as_cell(sorted[1]),
    round(sum_of_squares / (n - 1), 2),
    round(sum_of_squares / n, 2),
    round(sqrt(sum_of_squares / (n - 1)), 2),
    round(sqrt(sum_of_squares / n), 2),
    as_cell(x[minor_at]),
    as_cell(x[major_at])
  )
  labels <- c("Mean", "Med", "Mode", "Max", "Min",
              "Var.S", "Var.P", "StDev.S", "StDev.P",
              "Outlier.Min", "Outlier.Maj")

  # Return the assembled summary itself rather than only one of its parts.
  matrix(cells, nrow = 1, ncol = 11, dimnames = list("Value", labels))
}
# Apply Own_Summary() to the sample vector `a`; the echoed console output is
# kept on the line below.
Own_Summary(a)
## [1] 4
Here we assign x_i as the data values and f_i as their frequencies.
## [,1] [,2] [,3] [,4] [,5] [,6]
## x_i 1 2 3 4 5 6
## f_i 4 2 6 5 4 7
\[Average={{\Sigma (f_i \bullet x_i)}\over{\Sigma f_i}}\]
## [1] 3.857143
## [1] 6
## [1] 1
\[S^2={{\Sigma(f_i (x_i-\bar x)^2) \over {n-1}}}, \space \space for \space n=\Sigma f_i\]
# Sample and population variance of frequency-table data.
#
# x: data values; y: the frequency of each value in `x`.
#
# Returns one sentence (character string) quoting both variances, each
# rounded to two decimals.
#
# NOTE(review): Average_Freq() is defined elsewhere in this document
# (presumably the frequency-weighted mean) - confirm it is in scope.
variance_freq <- function(x, y) {
  # Total number of observations is the sum of the frequencies.
  total <- sum(y)
  # Frequency-weighted sum of squared deviations from the mean.
  weighted_ss <- sum(((x - Average_Freq(x, y))^2) * y)
  sample_var <- round(weighted_ss / (total - 1), 2)
  population_var <- round(weighted_ss / total, 2)
  paste(sample_var, " is sample variance and ",
        population_var, " is population variance.")
}
# Apply variance_freq() to the values x_i and frequencies f_i; the echoed
# console output is kept on the line below.
variance_freq(x_i,f_i)
## [1] "3.02 is sample variance and 2.91 is population variance."
\[S=\sqrt{S^2}\]
# Sample and population standard deviation of frequency-table data.
#
# x: data values; y: the frequency of each value in `x`.
#
# Returns one sentence (character string) quoting both standard deviations,
# each rounded to two decimals.
#
# NOTE(review): Average_Freq() is defined elsewhere in this document
# (presumably the frequency-weighted mean) - confirm it is in scope.
standard_deviation_freq <- function(x, y) {
  # Total number of observations is the sum of the frequencies.
  total <- sum(y)
  # Frequency-weighted sum of squared deviations from the mean.
  weighted_ss <- sum(((x - Average_Freq(x, y))^2) * y)
  sample_sd <- round(sqrt(weighted_ss / (total - 1)), 2)
  population_sd <- round(sqrt(weighted_ss / total), 2)
  paste(sample_sd, "is sample standard deviation and ",
        population_sd, " is population standard deviation.")
}
# Apply standard_deviation_freq() to the values x_i and frequencies f_i; the
# echoed console output is kept on the line below.
standard_deviation_freq(x_i, f_i)
## [1] "1.74 is sample standard deviation and 1.71 is population standard deviation."
# One-row summary of frequency-table data.
#
# x: data values; y: the frequency of each value in `x`.
#
# Returns a 1 x 7 matrix with row name "Value" and the columns
#   Mean, Max, Min, Var.S, Var.P, StDev.S, StDev.P
# where the mean, variances and standard deviations are rounded to two
# decimals.
#
# NOTE(review): Average_Freq() is defined elsewhere in this document
# (presumably the frequency-weighted mean) - confirm it is in scope.
Summary_Freq <- function(x, y) {
  # Total number of observations is the sum of the frequencies.
  total <- sum(y)
  # Sorted once; the largest value is its tail and the smallest its head.
  ordered <- sort(x)
  # Frequency-weighted sum of squared deviations, shared by all four
  # spread measures.
  weighted_ss <- sum(((x - Average_Freq(x, y))^2) * y)

  figures <- c(
    round(sum(x * y) / total, 2),
    tail(ordered, 1),
    head(ordered, 1),
    round(weighted_ss / (total - 1), 2),
    round(weighted_ss / total, 2),
    round(sqrt(weighted_ss / (total - 1)), 2),
    round(sqrt(weighted_ss / total), 2)
  )
  labels <- c("Mean", "Max", "Min", "Var.S", "Var.P", "StDev.S", "StDev.P")

  matrix(figures, nrow = 1, ncol = 7, dimnames = list("Value", labels))
}
# Apply Summary_Freq() to the values x_i and frequencies f_i; the echoed
# console output starts on the line below.
Summary_Freq(x_i, f_i)
## Mean Max Min Var.S Var.P StDev.S StDev.P
## Value 3.86 6 1 3.02 2.91 1.74 1.71
# Simulate a sales data set: one transaction per day, each with a random
# customer name, city, outlet and menu item, plus a random price per menu
# item.
# No seed is set, so every run draws different values.
n_rows <- 5000L

Id <- seq_len(n_rows)
Date <- seq(as.Date("2018/01/01"), by = "day", length.out = n_rows)
Name <- sample(c("Angel", "Sherly", "Vanessa", "Irene", "Julian", "Jeffry",
                 "Nikita", "Kefas", "Siana", "Lala", "Fallen", "Ardifo",
                 "Kevin", "Michael", "Felisha", "Calisha", "Patricia",
                 "Naomi", "Eric", "Jacob"),
               n_rows, replace = TRUE)
# Every city appears equally often; only the order is shuffled.
City <- sample(rep(c("Jakarta", "Bogor", "Depok", "Tangerang", "Bekasi"),
                   length.out = n_rows))
Outlet <- sample(c("Outlet 1", "Outlet 2", "Outlet 3", "Outlet 4", "Outlet 5"),
                 n_rows, replace = TRUE)
Menu <- c("Cappucino", "Es Kopi Susu", "Hot Caramel Latte", "Hot Chocolate",
          "Hot Red Velvet Latte", "Ice Americano", "Ice Berry Coffee",
          "Ice Cafe Latte", "Ice Caramel Latte", "Ice Coffee Avocado",
          "Ice Coffee Lite", "Ice Matcha Espresso", "Ice Matcha Latte",
          "Ice Red Velvet Latte")
all_menu <- sample(Menu, n_rows, replace = TRUE)
# One random price per menu item (prices may repeat between items).
Price <- sample(18000:45000, length(Menu), replace = TRUE)
DFPrice <- data.frame(Menu, Price)
library(dplyr)
# Look up the price of every sold item. The join key is named explicitly, so
# dplyr no longer has to guess it and print a "Joining, by" message.
Menu_Price <- left_join(data.frame(Menu = all_menu), DFPrice, by = "Menu")
## Id Date Name City Outlet Menu Price
## 1 1 2018-01-01 Kefas Jakarta Outlet 3 Ice Coffee Lite 25536
## 2 2 2018-01-02 Nikita Tangerang Outlet 2 Hot Red Velvet Latte 43454
## 3 3 2018-01-03 Patricia Depok Outlet 2 Es Kopi Susu 33074
## 4 4 2018-01-04 Calisha Tangerang Outlet 5 Hot Red Velvet Latte 43454
## 5 5 2018-01-05 Patricia Tangerang Outlet 3 Es Kopi Susu 33074
Suppose you already have the data set shown above. Please create functions to carry out the following tasks:
# Format proportions as percentage strings.
#
# x: numeric vector of proportions (for example 0.198 for 19.8%).
#
# Returns a character vector: each value multiplied by 100, rounded to one
# decimal place and followed by "%".
Percentage <- function(x) {
  percent <- round(x * 100, 1)
  # paste0() joins without a separator, replacing paste(..., sep = "").
  paste0(percent, "%")
}
# Total sales per city and each city's share of the overall total.
# NOTE(review): KopiKenangan is not created in the visible code - presumably
# the combined transaction data frame shown above; confirm.
City_Sales <- aggregate(Price ~ City, data = KopiKenangan, FUN = sum)
Total_Sales <- sum(City_Sales[["Price"]])
City_Sales[["Percentage_City_Sales"]] <-
  Percentage(City_Sales[["Price"]] / Total_Sales)
# Print the table with the summed price column shown as "Total_Sales".
rename(City_Sales, Total_Sales = Price)
## City Total_Sales Percentage_City_Sales
## 1 Bekasi 30848156 19.8%
## 2 Bogor 30959415 19.9%
## 3 Depok 30998148 19.9%
## 4 Jakarta 31390724 20.2%
## 5 Tangerang 31367534 20.2%