Tujuan analisis ini adalah memahami perilaku pelanggan (customer behaviour) dan menemukan calon pelanggan potensial sehingga target pasar dapat ditentukan dengan tepat sasaran.
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
# Load the workbook into `df` and preview its first six rows.
df <- read_xlsx(path = "Sample Data.xlsx")
head(df, n = 6)
## # A tibble: 6 × 12
## NamaLo…¹ HariL…² JamLogin Nama Gender Email NoTelp Tahun…³ MerkHp
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 Univers… Senin 1899-12-31 09:25:00 Adel Female adel… 8.51e10 2003 Samsu…
## 2 SMA 15 Senin 1899-12-31 16:30:00 Lisa Female lisa… 8.12e10 1995 Xiaomi
## 3 SMA 15 Senin 1899-12-31 12:30:00 Seso… Male seso… 8.19e10 2007 Oppo
## 4 SMA 15 Selasa 1899-12-31 09:05:00 Harum Male haru… 8.14e10 2005 Realme
## 5 Univers… Selasa 1899-12-31 10:30:00 Icha Female icha… 8.53e10 2004 Iphone
## 6 DP Mall Selasa 1899-12-31 12:30:00 Aldo Male aldo… 8.14e10 1992 Iphone
## # … with 3 more variables: DigitalInterest <chr>, LocationType <chr>,
## # Status <chr>, and abbreviated variable names ¹NamaLokasi, ²HariLogin,
## # ³TahunLahir
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Column-by-column overview of df: name, type and leading values.
df %>% glimpse()
## Rows: 51
## Columns: 12
## $ NamaLokasi <chr> "Universitas Muhammadiyah Semarang", "SMA 15", "SMA 15…
## $ HariLogin <chr> "Senin", "Senin", "Senin", "Selasa", "Selasa", "Selasa…
## $ JamLogin <dttm> 1899-12-31 09:25:00, 1899-12-31 16:30:00, 1899-12-31 …
## $ Nama <chr> "Adel", "Lisa", "Sesotya", "Harum", "Icha", "Aldo", "P…
## $ Gender <chr> "Female", "Female", "Male", "Male", "Female", "Male", …
## $ Email <chr> "adelia.fatma@gmail.com", "lisa.ningsih@gmail.com", "s…
## $ NoTelp <dbl> 85138901234, 81245819261, 81902831901, 81379634790, 85…
## $ TahunLahir <dbl> 2003, 1995, 2007, 2005, 2004, 1992, 2004, 1979, 2000, …
## $ MerkHp <chr> "Samsung", "Xiaomi", "Oppo", "Realme", "Iphone", "Ipho…
## $ DigitalInterest <chr> "Health&Care", "Game", "Education", "Education", "Big …
## $ LocationType <chr> "Kampus", "Halte", "Sekolah", "Sekolah", "Kampus", "Pu…
## $ Status <chr> "Student", "Student", "Student", "Student", "Student",…
Terdapat 51 sampel dengan 12 kolom yang memuat informasi perilaku pelanggan (customer behaviour). Data diambil selama satu minggu untuk mengetahui karakteristik dan pola pengguna WiFi.
# Print the raw birth-year column to inspect its values.
df[["TahunLahir"]]
## [1] 2003 1995 2007 2005 2004 1992 2004 1979 2000 1978 2005 2001 2006 1981 1993
## [16] 1996 1992 1993 1990 1989 1997 1999 2007 1993 1990 1994 1997 1995 1994 2009
## [31] 1992 1987 1993 2001 2000 1988 1999 1979 2003 2004 2004 2007 1989 1997 1998
## [46] 1995 1994 1993 2001 1996 1993
# Convert the categorical columns to factors and store the phone number as
# text. `across()` replaces the superseded `mutate_at()`; the resulting
# columns and their order are unchanged.
df <- df %>%
  mutate(
    across(
      c(HariLogin, Gender, MerkHp, DigitalInterest, LocationType, Status),
      as.factor
    ),
    NoTelp = as.character(NoTelp)
  )
# Confirm the new column types.
str(df)
## tibble [51 × 12] (S3: tbl_df/tbl/data.frame)
## $ NamaLokasi : chr [1:51] "Universitas Muhammadiyah Semarang" "SMA 15" "SMA 15" "SMA 15" ...
## $ HariLogin : Factor w/ 7 levels "Jumat","Kamis",..: 7 7 7 6 6 6 6 4 4 4 ...
## $ JamLogin : POSIXct[1:51], format: "1899-12-31 09:25:00" "1899-12-31 16:30:00" ...
## $ Nama : chr [1:51] "Adel" "Lisa" "Sesotya" "Harum" ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 1 2 1 1 1 2 ...
## $ Email : chr [1:51] "adelia.fatma@gmail.com" "lisa.ningsih@gmail.com" "sesotyabudi@gmail.com" "harumprabu@gmail.com" ...
## $ NoTelp : chr [1:51] "85138901234" "81245819261" "81902831901" "81379634790" ...
## $ TahunLahir : num [1:51] 2003 1995 2007 2005 2004 ...
## $ MerkHp : Factor w/ 6 levels "Iphone","Oppo",..: 4 6 2 3 1 1 2 5 4 1 ...
## $ DigitalInterest: Factor w/ 13 levels "Big Data","Content Digital",..: 11 9 7 7 1 5 1 6 7 5 ...
## $ LocationType : Factor w/ 11 levels "Cafe","Cowork Space",..: 5 4 8 8 5 7 6 7 5 2 ...
## $ Status : Factor w/ 6 levels "Business Man",..: 5 5 5 5 5 2 5 4 5 1 ...
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.2.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Keep only the hour-of-day component of the login timestamp (the minutes are
# discarded), then print the updated table.
df <- df %>%
  mutate(JamLogin = hour(JamLogin))
df
## # A tibble: 51 × 12
## NamaLokasi HariL…¹ JamLo…² Nama Gender Email NoTelp Tahun…³ MerkHp Digit…⁴
## <chr> <fct> <int> <chr> <fct> <chr> <chr> <dbl> <fct> <fct>
## 1 Universitas… Senin 9 Adel Female adel… 85138… 2003 Samsu… Health…
## 2 SMA 15 Senin 16 Lisa Female lisa… 81245… 1995 Xiaomi Game
## 3 SMA 15 Senin 12 Seso… Male seso… 81902… 2007 Oppo Educat…
## 4 SMA 15 Selasa 9 Harum Male haru… 81379… 2005 Realme Educat…
## 5 Universitas… Selasa 10 Icha Female icha… 85297… 2004 Iphone Big Da…
## 6 DP Mall Selasa 12 Aldo Male aldo… 81427… 1992 Iphone Digita…
## 7 Candisari Selasa 16 Putri Female putr… 85109… 2004 Oppo Big Da…
## 8 Pasa Johar Rabu 8 Indah Female inda… 85183… 1979 Vivo E-Comm…
## 9 Universitas… Rabu 13 Laras Female lara… 85083… 2000 Samsu… Educat…
## 10 Gajah Mada Rabu 13 Arif Male arif… 81457… 1978 Iphone Digita…
## # … with 41 more rows, 2 more variables: LocationType <fct>, Status <fct>, and
## # abbreviated variable names ¹HariLogin, ²JamLogin, ³TahunLahir,
## # ⁴DigitalInterest
# Derive age in years from the birth year, using 2023 as the reference year.
df <- mutate(df, Usia = 2023 - TahunLahir)
# Number of users per gender and each gender's share of the total, as a
# percentage rounded to two decimals.
freq_gender <- df %>%
  count(Gender, name = "Freq") %>%
  mutate(Persentase = round(Freq / sum(Freq) * 100, digits = 2))
freq_gender
## # A tibble: 2 × 3
## Gender Freq Persentase
## <fct> <int> <dbl>
## 1 Female 27 52.9
## 2 Male 24 47.1
library(ggplot2)
# Bar chart of the percentage of WiFi users per gender, with the percentage
# printed above each bar.
ggplot(freq_gender, aes(x = Gender, y = Persentase)) +
  geom_col(aes(fill = Gender)) +
  geom_text(
    aes(label = paste(format(Persentase, nsmall = 2), "%"), group = Gender),
    position = position_dodge(width = 0.9),
    vjust = -0.5
  ) +
  labs(
    title = "Frequensi Pengguna WiFi berdasarkan Gender",
    x = "Gender",
    y = "Persentase",
    fill = "Gender"
  ) +
  theme_minimal() +
  # Only the vertical grid lines are hidden; the horizontal ones
  # (panel.grid.major.y / panel.grid.minor.y) are left visible.
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5),
    legend.position = "right",
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )
Insight:
#' Flag outliers with the 1.5 * IQR rule
#'
#' @param x A numeric vector.
#' @return A logical vector the same length as `x`: `TRUE` where the value lies
#'   below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR. Missing values in `x` are
#'   ignored when computing the quartiles and yield `NA` in the result.
findoutlier <- function(x) {
  # Compute both quartiles in a single call; na.rm = TRUE keeps quantile()
  # from erroring when x contains NA.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence <- 1.5 * (quartiles[2] - quartiles[1])
  x < quartiles[1] - fence | x > quartiles[2] + fence
}
# Add an `outlier` column that holds the age for rows flagged by
# findoutlier() and NA for all other rows; it is used to label outliers.
df_box <- df %>%
  mutate(outlier = if_else(findoutlier(Usia), Usia, NA_real_))
df_box
## # A tibble: 51 × 14
## NamaLokasi HariL…¹ JamLo…² Nama Gender Email NoTelp Tahun…³ MerkHp Digit…⁴
## <chr> <fct> <int> <chr> <fct> <chr> <chr> <dbl> <fct> <fct>
## 1 Universitas… Senin 9 Adel Female adel… 85138… 2003 Samsu… Health…
## 2 SMA 15 Senin 16 Lisa Female lisa… 81245… 1995 Xiaomi Game
## 3 SMA 15 Senin 12 Seso… Male seso… 81902… 2007 Oppo Educat…
## 4 SMA 15 Selasa 9 Harum Male haru… 81379… 2005 Realme Educat…
## 5 Universitas… Selasa 10 Icha Female icha… 85297… 2004 Iphone Big Da…
## 6 DP Mall Selasa 12 Aldo Male aldo… 81427… 1992 Iphone Digita…
## 7 Candisari Selasa 16 Putri Female putr… 85109… 2004 Oppo Big Da…
## 8 Pasa Johar Rabu 8 Indah Female inda… 85183… 1979 Vivo E-Comm…
## 9 Universitas… Rabu 13 Laras Female lara… 85083… 2000 Samsu… Educat…
## 10 Gajah Mada Rabu 13 Arif Male arif… 81457… 1978 Iphone Digita…
## # … with 41 more rows, 4 more variables: LocationType <fct>, Status <fct>,
## # Usia <dbl>, outlier <dbl>, and abbreviated variable names ¹HariLogin,
## # ²JamLogin, ³TahunLahir, ⁴DigitalInterest
#' Build a mean/median text label for a boxplot
#'
#' Returns the one-row data frame format expected by
#' `stat_summary(fun.data = ..., geom = "text")`.
#'
#' @param y Numeric vector of the plotted values.
#' @param upper_limit Reference height for the label; the label is placed at
#'   95% of it. The default is derived from the global `df$Usia`.
#' @return A data frame with columns `y` (label position) and `label` (text
#'   containing the mean and median of `y`, rounded to two decimals).
get_box_stats <- function(y, upper_limit = max(df$Usia) * 1.15) {
  mean_y <- round(mean(y), 2)
  median_y <- round(median(y), 2)
  label_text <- paste("Mean =", mean_y, "\n", "Median =", median_y, "\n")
  data.frame(y = 0.95 * upper_limit, label = label_text)
}
# Boxplot of user age: the red marker shows the mean, outliers are labelled
# with their value, and the mean/median are printed as text.
ggplot(df_box, aes(x = 0, y = Usia)) +
  geom_boxplot(fill = "#0099f8") +
  stat_summary(
    fun = "mean", geom = "point",
    shape = 13, size = 3, color = "red"
  ) +
  geom_text(aes(label = outlier), na.rm = TRUE, hjust = -0.1) +
  stat_summary(
    fun.data = get_box_stats, geom = "text",
    hjust = 0.5, vjust = 5.3
  ) +
  scale_x_discrete(labels = NULL) +
  labs(title = "Boxplot Usia Pengguna Wifi") +
  theme_classic() +
  theme(
    axis.title.x = element_blank(),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5)
  )
# Smallest and largest age in the data.
range(df$Usia)
## [1] 14 45
Insight:
# Number of logins recorded for each day of the week.
freq_hari <- df %>%
  count(HariLogin, name = "Freq")
freq_hari
## # A tibble: 7 × 2
## HariLogin Freq
## <fct> <int>
## 1 Jumat 9
## 2 Kamis 3
## 3 Minggu 16
## 4 Rabu 3
## 5 Sabtu 13
## 6 Selasa 4
## 7 Senin 3
# Horizontal bar chart of logins per day, with days sorted by frequency.
ggplot(freq_hari, aes(x = Freq, y = reorder(HariLogin, Freq))) +
  geom_col(aes(fill = HariLogin)) +
  labs(
    title = "Frekuensi Pengguna WiFI berdasarkan Hari",
    fill = "Hari Login"
  ) +
  theme_classic() +
  theme(
    axis.title.x = element_blank(),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5)
  )
Insight:
# Distribution of the login hour for each day, one boxplot per day.
ggplot(df, aes(x = HariLogin, y = JamLogin)) +
  geom_boxplot(aes(fill = HariLogin)) +
  labs(
    title = "Persebaran Jam Login Berdasarkan Hari",
    y = "Jam Login",
    fill = "Hari Login"
  ) +
  theme_classic() +
  theme(
    axis.title.x = element_blank(),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5)
  )
Insight:
# Number of users at each age, most frequent ages first.
freq_usia <- df %>%
  count(Usia, name = "Freq") %>%
  arrange(desc(Freq))
# Bar chart of the number of users per age.
ggplot(freq_usia, aes(x = Usia, y = Freq)) +
  geom_col(aes(fill = Usia))
Insight :
Mari kita lihat keterkaitan antara usia pengguna dan frekuensi akses WiFi: apakah semakin tinggi usia, semakin tinggi pula frekuensi akses WiFi?
library(GGally)
## Warning: package 'GGally' was built under R version 4.2.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pearson correlation matrix between age (Usia) and its frequency (Freq).
cor(freq_usia, method = "pearson")
## Usia Freq
## Usia 1.000000 -0.182002
## Freq -0.182002 1.000000
# Correlation heat map with the coefficient printed in each cell.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
ggcorr(freq_usia, label = TRUE)
# Scatter plot of frequency against age.
plot(freq_usia$Usia, freq_usia$Freq)
# Overlay the fitted linear regression line in red (the linear model `lm` is
# covered later in the Machine Learning material).
abline(lm(Freq ~ Usia, data = freq_usia), col = "red")
Insight:
# Number of logins per location type, most frequent first.
freq_lokasi <- df %>%
  count(LocationType, name = "Freq") %>%
  arrange(desc(Freq))
freq_lokasi
## # A tibble: 11 × 2
## LocationType Freq
## <fct> <int>
## 1 Cafe 14
## 2 Tempat Umum 7
## 3 Pusat Perbelanjaan 6
## 4 Cowork Space 5
## 5 Gymp 4
## 6 Tempat Wisata 4
## 7 Halte 3
## 8 Kampus 3
## 9 Perpusnas 2
## 10 Sekolah 2
## 11 Tempat Ibadah 1
# Horizontal bar chart of logins per location type, sorted by frequency.
ggplot(freq_lokasi, aes(x = Freq, y = reorder(LocationType, Freq))) +
  geom_col(aes(fill = LocationType)) +
  labs(
    title = "Frekuensi Penggunaan Wifi berdasarkan Tipe Lokasi",
    x = "Frekuensi",
    y = "Location Type"
  )
Insight:
# Number of logins for every observed day / location-type combination.
# The result stays grouped by HariLogin, as reported in the message below.
multivv <- df %>%
  group_by(HariLogin, LocationType) %>%
  summarise(Freq = n())
## `summarise()` has grouped output by 'HariLogin'. You can override using the
## `.groups` argument.
multivv
## # A tibble: 29 × 3
## # Groups: HariLogin [7]
## HariLogin LocationType Freq
## <fct> <fct> <int>
## 1 Jumat Cafe 3
## 2 Jumat Cowork Space 2
## 3 Jumat Gymp 1
## 4 Jumat Tempat Umum 3
## 5 Kamis Cowork Space 1
## 6 Kamis Halte 2
## 7 Minggu Cafe 6
## 8 Minggu Cowork Space 1
## 9 Minggu Gymp 1
## 10 Minggu Pusat Perbelanjaan 3
## # … with 19 more rows
# Base-R equivalent of the count above: number of names (rows) for each
# day / location-type combination, returned as a plain data frame.
multiv <- aggregate(Nama ~ HariLogin + LocationType, data = df, FUN = length)
multiv
## HariLogin LocationType Nama
## 1 Jumat Cafe 3
## 2 Minggu Cafe 6
## 3 Sabtu Cafe 5
## 4 Jumat Cowork Space 2
## 5 Kamis Cowork Space 1
## 6 Minggu Cowork Space 1
## 7 Rabu Cowork Space 1
## 8 Jumat Gymp 1
## 9 Minggu Gymp 1
## 10 Sabtu Gymp 2
## 11 Kamis Halte 2
## 12 Senin Halte 1
## 13 Rabu Kampus 1
## 14 Selasa Kampus 1
## 15 Senin Kampus 1
## 16 Sabtu Perpusnas 1
## 17 Selasa Perpusnas 1
## 18 Minggu Pusat Perbelanjaan 3
## 19 Rabu Pusat Perbelanjaan 1
## 20 Sabtu Pusat Perbelanjaan 1
## 21 Selasa Pusat Perbelanjaan 1
## 22 Selasa Sekolah 1
## 23 Senin Sekolah 1
## 24 Minggu Tempat Ibadah 1
## 25 Jumat Tempat Umum 3
## 26 Minggu Tempat Umum 2
## 27 Sabtu Tempat Umum 2
## 28 Minggu Tempat Wisata 2
## 29 Sabtu Tempat Wisata 2
# Dodged bar chart of logins per day, with one bar per location type.
multivv %>%
  ggplot(aes(x = Freq, y = reorder(HariLogin, Freq))) +
  geom_col(aes(fill = LocationType), position = "dodge") +
  labs(
    title = "Frekuensi Pengunjung berdasarkan Lokasi dan Hari",
    y = "Hari",
    x = "Frekuensi"
  )
Insight :