# Cache knitr chunk results so unchanged chunks are not re-evaluated on re-knit.
knitr::opts_chunk$set(cache=TRUE)
# A large scipen penalty makes R print numbers in fixed rather than scientific notation.
options(scipen = 9999)
# Remove every (non-hidden) object from the current environment before the analysis starts.
# NOTE(review): wiping the workspace from inside a script/report is usually discouraged -- confirm it is still wanted.
rm(list=ls())
Bagian untuk memasukkan library yang dibutuhkan.
library(ggplot2)
library(GGally)
library(ggthemes)
library(ggpubr)
## Loading required package: magrittr
library(leaflet) #visualisasi geo-spasial
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
Mengambil dataset dari https://www.kaggle.com/lava18/google-play-store-apps dan menginputkannya.
# Load the Google Play Store apps dataset from a CSV file (path is relative to the working directory).
playstore <- read.csv("googleplaystore_edit_dollar.csv")
# List the column names of the loaded data frame.
names(playstore)
## [1] "App" "Category" "Rating" "Reviews"
## [5] "Size" "Installs" "Type" "Price"
## [9] "Content.Rating" "Genres" "Last.Updated" "Current.Ver"
## [13] "Android.Ver"
# Show the type and the first values of every column.
str(playstore)
## 'data.frame': 10840 obs. of 13 variables:
## $ App : Factor w/ 9659 levels "\"i DT\" Fútbol. Todos Somos Técnicos.",..: 5493 6575 3628 5505 5511 5512 5517 5518 5514 5509 ...
## $ Category : Factor w/ 33 levels "ART_AND_DESIGN",..: 19 12 19 19 12 13 13 13 12 13 ...
## $ Rating : num 3.6 4.3 3.8 3.8 4 4.1 3.8 3.5 4.4 4.1 ...
## $ Reviews : int 275 6 718 3547 856 1867 93 472 201 129 ...
## $ Size : Factor w/ 461 levels "1.0M","1.1M",..: 312 6 105 9 367 172 81 446 62 62 ...
## $ Installs : Factor w/ 21 levels "0","0+","1,000,000,000+",..: 8 12 8 11 8 17 5 14 14 5 ...
## $ Type : Factor w/ 3 levels "Free","NaN","Paid": 3 3 3 3 3 3 3 3 3 3 ...
## $ Price : num 400 400 400 400 400 ...
## $ Content.Rating: Factor w/ 6 levels "Adults only 18+",..: 2 2 2 2 2 2 2 2 2 5 ...
## $ Genres : Factor w/ 119 levels "Action","Action;Action & Adventure",..: 68 53 68 68 53 61 61 61 53 61 ...
## $ Last.Updated : Factor w/ 1377 levels "April 1, 2016",..: 1043 605 841 468 998 1084 239 953 1046 327 ...
## $ Current.Ver : Factor w/ 2785 levels "","0.0.0.2","0.0.1",..: 138 110 117 997 1500 680 110 1701 672 997 ...
## $ Android.Ver : Factor w/ 34 levels "1.0 and up","1.5 and up",..: 18 30 21 16 21 14 18 21 3 16 ...
#table(playstore$Installs)
#table(playstore$Price)
# Not used: the user-review dataset below is left commented out
#user_review_playstore = read.csv("googleplaystore_user_reviews.csv")
#names(user_review_playstore)
#str(user_review_playstore)
Memperbaiki tipe data yang ada dalam file .csv
# Keep only the first row recorded for each app name.
playstore <- playstore[!duplicated(playstore$App), ]

# Store the text columns as character vectors.
playstore$App <- as.character(playstore$App)
playstore$Category <- as.character(playstore$Category)
playstore$Genres <- as.character(playstore$Genres)

# Store the rating and the review count as doubles.
playstore$Rating <- as.numeric(playstore$Rating)
playstore$Reviews <- as.numeric(playstore$Reviews)

# Show the column types after the conversion.
str(playstore)
## 'data.frame': 9659 obs. of 13 variables:
## $ App : chr "I'm Rich - Trump Edition" "most expensive app (H)" "ð I'm rich" "I am rich" ...
## $ Category : chr "LIFESTYLE" "FAMILY" "LIFESTYLE" "LIFESTYLE" ...
## $ Rating : num 3.6 4.3 3.8 3.8 4 4.1 3.8 3.5 4.4 4.1 ...
## $ Reviews : num 275 6 718 3547 856 ...
## $ Size : Factor w/ 461 levels "1.0M","1.1M",..: 312 6 105 9 367 172 81 446 62 62 ...
## $ Installs : Factor w/ 21 levels "0","0+","1,000,000,000+",..: 8 12 8 11 8 17 5 14 14 5 ...
## $ Type : Factor w/ 3 levels "Free","NaN","Paid": 3 3 3 3 3 3 3 3 3 3 ...
## $ Price : num 400 400 400 400 400 ...
## $ Content.Rating: Factor w/ 6 levels "Adults only 18+",..: 2 2 2 2 2 2 2 2 2 5 ...
## $ Genres : chr "Lifestyle" "Entertainment" "Lifestyle" "Lifestyle" ...
## $ Last.Updated : Factor w/ 1377 levels "April 1, 2016",..: 1043 605 841 468 998 1084 239 953 1046 327 ...
## $ Current.Ver : Factor w/ 2785 levels "","0.0.0.2","0.0.1",..: 138 110 117 997 1500 680 110 1701 672 997 ...
## $ Android.Ver : Factor w/ 34 levels "1.0 and up","1.5 and up",..: 18 30 21 16 21 14 18 21 3 16 ...
# Preview the first rows after de-duplication and type conversion.
head(playstore)
## App Category Rating Reviews Size Installs Type
## 1 I'm Rich - Trump Edition LIFESTYLE 3.6 275 7.3M 10,000+ Paid
## 2 most expensive app (H) FAMILY 4.3 6 1.5M 100+ Paid
## 3 ð I'm rich LIFESTYLE 3.8 718 26M 10,000+ Paid
## 4 I am rich LIFESTYLE 3.8 3547 1.8M 100,000+ Paid
## 5 I am Rich Plus FAMILY 4.0 856 8.7M 10,000+ Paid
## 6 I Am Rich Premium FINANCE 4.1 1867 4.7M 50,000+ Paid
## Price Content.Rating Genres Last.Updated Current.Ver
## 1 400.00 Everyone Lifestyle May 3, 2018 1.0.1
## 2 399.99 Everyone Entertainment July 16, 2018 1
## 3 399.99 Everyone Lifestyle March 11, 2018 1.0.0
## 4 399.99 Everyone Lifestyle January 12, 2018 2
## 5 399.99 Everyone Entertainment May 19, 2018 3
## 6 399.99 Everyone Finance November 12, 2017 1.6
## Android.Ver
## 1 4.1 and up
## 2 7.0 and up
## 3 4.4 and up
## 4 4.0.3 and up
## 5 4.4 and up
## 6 4.0 and up
#table(playstore$Installs)
#table(playstore$Price)
Memilih 6 kolom yang akan dijadikan acuan dalam pengerjaan tugas. Tujuan awalnya adalah melihat kategori apa yang paling digemari oleh orang-orang yang membeli aplikasi berbayar, dengan pertimbangan jumlah review > 1000.
# Keep the six columns used in the analysis.
ps_CatTyInsRat <- playstore[, c("Category", "Type", "Installs", "Rating", "Reviews", "Price")]
# Keep paid apps that have more than 1000 reviews (so the rating is more
# credible) and a known rating.
# - is.na() is TRUE for both NA and NaN, so the numeric Rating column is no
#   longer compared against the string "NaN".
# - which() drops rows whose condition evaluates to NA; plain logical
#   subsetting would turn those into rows filled with NA.
ps_filter1 <- ps_CatTyInsRat[
  which(
    ps_CatTyInsRat$Reviews > 1000 &
      ps_CatTyInsRat$Type == "Paid" &
      !is.na(ps_CatTyInsRat$Rating)
  ),
]
# Type and Reviews were only needed for filtering; select the remaining
# columns by name so the step does not depend on column positions.
ps_filter1 <- ps_filter1[, c("Category", "Installs", "Rating", "Price")]
head(ps_filter1)
## Category Installs Rating Price
## 4 LIFESTYLE 100,000+ 3.8 399.99
## 6 FINANCE 50,000+ 4.1 399.99
## 36 SPORTS 50,000+ 4.1 29.99
## 41 MEDICAL 100,000+ 4.5 24.99
## 53 FAMILY 50,000+ 4.5 19.99
## 57 GAME 10,000+ 4.6 17.99
# Inspect the structure of the filtered data.
str(ps_filter1)
## 'data.frame': 200 obs. of 4 variables:
## $ Category: chr "LIFESTYLE" "FINANCE" "SPORTS" "MEDICAL" ...
## $ Installs: Factor w/ 21 levels "0","0+","1,000,000,000+",..: 11 17 17 11 17 8 8 8 11 11 ...
## $ Rating : num 3.8 4.1 4.1 4.5 4.5 4.6 4.7 4.7 4.4 4.3 ...
## $ Price : num 400 400 30 25 20 ...
# Count how many of the filtered apps have each distinct price.
table(ps_filter1$Price)
##
## 0.99 1.2 1.49 1.99 2.49 2.5 2.59 2.9 2.95 2.99
## 21 1 5 21 5 1 1 1 1 49
## 3.28 3.49 3.9 3.95 3.99 4.49 4.77 4.99 5.99 6.99
## 1 3 1 1 16 5 1 32 6 3
## 7.99 8.99 9 9.99 11.99 12.99 13.99 14.99 17.99 19.99
## 3 2 1 6 2 1 1 3 1 1
## 24.99 29.99 399.99
## 1 1 2
#ps_category <- table(ps_filter1$Category)
#ps_category
#ps_installs <- table(ps_filter1$Installs)
#ps_installs
#ps_CatRat <- playstore[,c("Category","Rating")]
Mengelompokkan harga ke dalam beberapa rentang (dalam dolar) agar memudahkan visualisasi, lalu mengurutkan rentang tersebut dari yang termurah.
# Map app prices (in dollars) to the label of their price segment.
#
# Segments, with every boundary price belonging to exactly one of them:
#   "<1"     : 0 <  x <  1
#   "1-5"    : 1 <= x <  5
#   "5-10"   : 5 <= x < 10
#   "10-15"  : 10 <= x < 15
#   "15-30"  : 15 <= x < 30
#   "30-400" : 30 <= x <= 400
#
# Args:
#   x: numeric price; a single value or a whole vector.
#
# Returns:
#   A character vector the same length as `x` holding the segment labels.
#   Prices that are missing, not positive, or above 400 give NA.
segmen_harga <- function(x) {
  lower_bounds <- c(1, 5, 10, 15, 30)
  labels <- c("<1", "1-5", "5-10", "10-15", "15-30", "30-400")

  # findInterval() counts how many lower bounds are <= x, which is the
  # zero-based position of the segment the price falls into.
  segment <- labels[findInterval(x, lower_bounds) + 1L]

  # Prices outside (0, 400] do not belong to any segment.
  segment[is.na(x) | x <= 0 | x > 400] <- NA_character_
  segment
}
# Replace each numeric price with its segment label and store the result as an
# ordered factor whose levels run from the cheapest to the most expensive range.
ps_filter1$Price <- factor(
  as.character(sapply(ps_filter1$Price, segmen_harga)),
  levels = c("<1", "1-5", "5-10", "10-15", "15-30", "30-400"),
  ordered = TRUE
)
# Five-number summary (plus mean) of the ratings in the filtered data.
summary(ps_filter1$Rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.700 4.275 4.500 4.428 4.600 4.900
Akan dilakukan visualisasi menggunakan ggplot.
# Jittered scatter plot of rating against price segment, coloured by category.
ggplot(data = ps_filter1, mapping = aes(x = Price, y = Rating)) +
  geom_jitter(mapping = aes(colour = Category)) +
  labs(
    title = "Price to Rating",
    subtitle = "Google Play Apps, 2018",
    x = "Price (dollar)",
    y = "Rating",
    caption = "24 Jan 2019"
  ) +
  theme(legend.position = "right")
Dari hasil visualisasi, didapatkan: