Setup

# Cache chunk results so re-knitting can reuse them.
knitr::opts_chunk$set(cache = TRUE)
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 9999)
# The workspace is deliberately not wiped with rm(list = ls()): that call
# deletes the caller's objects when the script is run interactively, yet it
# does not detach packages or reset options, so it never gave a truly clean
# session. Restart R to get a clean state.

Library

Bagian untuk memasukkan library yang dibutuhkan.

library(ggplot2)
library(GGally)
library(ggthemes)
library(ggpubr)
## Loading required package: magrittr
library(leaflet) #visualisasi geo-spasial
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

Dataset dan Melihat Gambaran Data

Mengambil dataset dari https://www.kaggle.com/lava18/google-play-store-apps dan menginputkannya.

# Read the Google Play Store apps table from the working directory.
playstore <- read.csv(file = "googleplaystore_edit_dollar.csv")
# Show the column names of the loaded table.
colnames(playstore)
##  [1] "App"            "Category"       "Rating"         "Reviews"       
##  [5] "Size"           "Installs"       "Type"           "Price"         
##  [9] "Content.Rating" "Genres"         "Last.Updated"   "Current.Ver"   
## [13] "Android.Ver"
str(playstore)
## 'data.frame':    10840 obs. of  13 variables:
##  $ App           : Factor w/ 9659 levels "\"i DT\" Fútbol. Todos Somos Técnicos.",..: 5493 6575 3628 5505 5511 5512 5517 5518 5514 5509 ...
##  $ Category      : Factor w/ 33 levels "ART_AND_DESIGN",..: 19 12 19 19 12 13 13 13 12 13 ...
##  $ Rating        : num  3.6 4.3 3.8 3.8 4 4.1 3.8 3.5 4.4 4.1 ...
##  $ Reviews       : int  275 6 718 3547 856 1867 93 472 201 129 ...
##  $ Size          : Factor w/ 461 levels "1.0M","1.1M",..: 312 6 105 9 367 172 81 446 62 62 ...
##  $ Installs      : Factor w/ 21 levels "0","0+","1,000,000,000+",..: 8 12 8 11 8 17 5 14 14 5 ...
##  $ Type          : Factor w/ 3 levels "Free","NaN","Paid": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Price         : num  400 400 400 400 400 ...
##  $ Content.Rating: Factor w/ 6 levels "Adults only 18+",..: 2 2 2 2 2 2 2 2 2 5 ...
##  $ Genres        : Factor w/ 119 levels "Action","Action;Action & Adventure",..: 68 53 68 68 53 61 61 61 53 61 ...
##  $ Last.Updated  : Factor w/ 1377 levels "April 1, 2016",..: 1043 605 841 468 998 1084 239 953 1046 327 ...
##  $ Current.Ver   : Factor w/ 2785 levels "","0.0.0.2","0.0.1",..: 138 110 117 997 1500 680 110 1701 672 997 ...
##  $ Android.Ver   : Factor w/ 34 levels "1.0 and up","1.5 and up",..: 18 30 21 16 21 14 18 21 3 16 ...
#table(playstore$Installs)
#table(playstore$Price)

#Gak kepake

#user_review_playstore = read.csv("googleplaystore_user_reviews.csv")
#names(user_review_playstore)
#str(user_review_playstore)

Benerin Tipe Data & Price

Memperbaiki tipe data yang ada dalam file .csv

# Keep only the first row found for each app name.
playstore <- playstore[!duplicated(playstore$App), ]
# Store the text columns as plain character vectors.
playstore[c("App", "Category", "Genres")] <-
  lapply(playstore[c("App", "Category", "Genres")], as.character)
# Store rating and review count as doubles.
playstore[c("Rating", "Reviews")] <-
  lapply(playstore[c("Rating", "Reviews")], as.numeric)

str(playstore)
## 'data.frame':    9659 obs. of  13 variables:
##  $ App           : chr  "I'm Rich - Trump Edition" "most expensive app (H)" "💎 I'm rich" "I am rich" ...
##  $ Category      : chr  "LIFESTYLE" "FAMILY" "LIFESTYLE" "LIFESTYLE" ...
##  $ Rating        : num  3.6 4.3 3.8 3.8 4 4.1 3.8 3.5 4.4 4.1 ...
##  $ Reviews       : num  275 6 718 3547 856 ...
##  $ Size          : Factor w/ 461 levels "1.0M","1.1M",..: 312 6 105 9 367 172 81 446 62 62 ...
##  $ Installs      : Factor w/ 21 levels "0","0+","1,000,000,000+",..: 8 12 8 11 8 17 5 14 14 5 ...
##  $ Type          : Factor w/ 3 levels "Free","NaN","Paid": 3 3 3 3 3 3 3 3 3 3 ...
##  $ Price         : num  400 400 400 400 400 ...
##  $ Content.Rating: Factor w/ 6 levels "Adults only 18+",..: 2 2 2 2 2 2 2 2 2 5 ...
##  $ Genres        : chr  "Lifestyle" "Entertainment" "Lifestyle" "Lifestyle" ...
##  $ Last.Updated  : Factor w/ 1377 levels "April 1, 2016",..: 1043 605 841 468 998 1084 239 953 1046 327 ...
##  $ Current.Ver   : Factor w/ 2785 levels "","0.0.0.2","0.0.1",..: 138 110 117 997 1500 680 110 1701 672 997 ...
##  $ Android.Ver   : Factor w/ 34 levels "1.0 and up","1.5 and up",..: 18 30 21 16 21 14 18 21 3 16 ...
head(playstore)
##                        App  Category Rating Reviews Size Installs Type
## 1 I'm Rich - Trump Edition LIFESTYLE    3.6     275 7.3M  10,000+ Paid
## 2   most expensive app (H)    FAMILY    4.3       6 1.5M     100+ Paid
## 3            💎 I'm rich LIFESTYLE    3.8     718  26M  10,000+ Paid
## 4                I am rich LIFESTYLE    3.8    3547 1.8M 100,000+ Paid
## 5           I am Rich Plus    FAMILY    4.0     856 8.7M  10,000+ Paid
## 6        I Am Rich Premium   FINANCE    4.1    1867 4.7M  50,000+ Paid
##    Price Content.Rating        Genres      Last.Updated Current.Ver
## 1 400.00       Everyone     Lifestyle       May 3, 2018       1.0.1
## 2 399.99       Everyone Entertainment     July 16, 2018           1
## 3 399.99       Everyone     Lifestyle    March 11, 2018       1.0.0
## 4 399.99       Everyone     Lifestyle  January 12, 2018           2
## 5 399.99       Everyone Entertainment      May 19, 2018           3
## 6 399.99       Everyone       Finance November 12, 2017         1.6
##    Android.Ver
## 1   4.1 and up
## 2   7.0 and up
## 3   4.4 and up
## 4 4.0.3 and up
## 5   4.4 and up
## 6   4.0 and up
#table(playstore$Installs)
#table(playstore$Price)

Kategori, Jenis (Berbayar atau Tidak), Review, Harga dan Rating.

Memilih 6 kolom (Category, Type, Installs, Rating, Reviews, Price) yang akan dijadikan acuan dalam pengerjaan tugas. Tujuan awalnya adalah melihat kategori apa yang paling digemari orang-orang yang membeli aplikasi berbayar, dengan pertimbangan jumlah review >1000.

# Select the six columns used in the analysis.
ps_CatTyInsRat <- playstore[
  ,
  c("Category", "Type", "Installs", "Rating", "Reviews", "Price")
]

# Keep paid apps with more than 1000 reviews (so the rating is more credible)
# and a non-missing rating. is.na() is TRUE for both NA and NaN, and which()
# drops rows whose condition evaluates to NA instead of returning all-NA rows.
baris_valid <- which(
  ps_CatTyInsRat$Reviews > 1000 &
    ps_CatTyInsRat$Type == "Paid" &
    !is.na(ps_CatTyInsRat$Rating)
)
ps_filter1 <- ps_CatTyInsRat[baris_valid, ]
# Type and Reviews were only needed for filtering; keep the rest by name so
# the result does not depend on column positions.
ps_filter1 <- ps_filter1[, c("Category", "Installs", "Rating", "Price")]
head(ps_filter1)
##     Category Installs Rating  Price
## 4  LIFESTYLE 100,000+    3.8 399.99
## 6    FINANCE  50,000+    4.1 399.99
## 36    SPORTS  50,000+    4.1  29.99
## 41   MEDICAL 100,000+    4.5  24.99
## 53    FAMILY  50,000+    4.5  19.99
## 57      GAME  10,000+    4.6  17.99
# Inspect the structure of the filtered data
str(ps_filter1)
## 'data.frame':    200 obs. of  4 variables:
##  $ Category: chr  "LIFESTYLE" "FINANCE" "SPORTS" "MEDICAL" ...
##  $ Installs: Factor w/ 21 levels "0","0+","1,000,000,000+",..: 11 17 17 11 17 8 8 8 11 11 ...
##  $ Rating  : num  3.8 4.1 4.1 4.5 4.5 4.6 4.7 4.7 4.4 4.3 ...
##  $ Price   : num  400 400 30 25 20 ...
table(ps_filter1$Price)
## 
##   0.99    1.2   1.49   1.99   2.49    2.5   2.59    2.9   2.95   2.99 
##     21      1      5     21      5      1      1      1      1     49 
##   3.28   3.49    3.9   3.95   3.99   4.49   4.77   4.99   5.99   6.99 
##      1      3      1      1     16      5      1     32      6      3 
##   7.99   8.99      9   9.99  11.99  12.99  13.99  14.99  17.99  19.99 
##      3      2      1      6      2      1      1      3      1      1 
##  24.99  29.99 399.99 
##      1      1      2
#ps_category <- table(ps_filter1$Category)
#ps_category
#ps_installs <- table(ps_filter1$Installs)
#ps_installs

#ps_CatRat <- playstore[,c("Category","Rating")]

Mengkategori Harga Barang

Melakukan segmentasi harga ke dalam beberapa rentang (<1, 1-5, 5-10, 10-15, 15-30, dan 30-400 dollar) agar memudahkan visualisasi, lalu mengurutkan rentang tersebut dari yang termurah.

#' Map a price in dollars to its price-segment label.
#'
#' Segments are "<1" for 0 < x < 1, then "1-5", "5-10", "10-15" and "15-30",
#' each including its lower bound and excluding its upper bound, and "30-400"
#' for 30 <= x <= 400. Prices that sit exactly on a boundary therefore always
#' receive a label.
#'
#' @param x Numeric price, or a numeric vector of prices.
#' @return Character vector the same length as `x`; `NA` where the price is
#'   missing, not positive, or above 400.
segmen_harga <- function(x) {
  batas <- c(0, 1, 5, 10, 15, 30, 400)
  label <- c("<1", "1-5", "5-10", "10-15", "15-30", "30-400")

  # rightmost.closed = TRUE puts exactly 400 into the last segment.
  idx <- findInterval(x, batas, rightmost.closed = TRUE)
  # findInterval() places 0 in the first segment and values above 400 past
  # the last one; neither belongs to any segment.
  idx[is.na(x) | x <= 0 | x > 400] <- NA
  label[idx]
}

# Replace each price with its segment label and store the result as an
# ordered factor running from the cheapest to the most expensive segment.
ps_filter1$Price <- factor(
  as.character(sapply(ps_filter1$Price, segmen_harga)),
  levels = c("<1", "1-5", "5-10", "10-15", "15-30", "30-400"),
  ordered = TRUE
)

summary(ps_filter1$Rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.700   4.275   4.500   4.428   4.600   4.900

Plot dari data yang sudah di Filter

Akan dilakukan visualisasi menggunakan ggplot.

# Jittered scatter of rating against price segment, coloured by category.
ggplot(data = ps_filter1, mapping = aes(x = Price, y = Rating)) +
  geom_jitter(mapping = aes(colour = Category)) +
  labs(
    title = "Price to Rating",
    subtitle = "Google Play Apps, 2018",
    x = "Price (dollar)",
    y = "Rating",
    caption = "24 Jan 2019"
  ) +
  theme(legend.position = "right")

Dari hasil visualisasi, didapatkan:

  1. Tidak ada aplikasi berbayar (dengan jumlah review >1000) yang memiliki rating di bawah 3.7
  2. Mayoritas Aplikasi berbayar ada di range harga 1-5 dollar
  3. Hijau (Entertainment, Family, Finance) dan Ungu (Tools, Video Players) mendominasi aplikasi berbayar