Setup

패키지 불러오기

library(tidyverse) # 전처리하기
library(highcharter)  # EDA시 그래프 그리기
library(lubridate) # 날짜변수 다루기
library(stringr) # 문자형 변수 다루기
library(xts) # 시계열 데이터 다루기

데이터 불러오기

아래 데이터를 살펴보도록한다.

## 'data.frame':    10841 obs. of  13 variables:
##  $ App           : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7206 2551 8970 8089 7272 7103 8149 5568 4926 5806 ...
##  $ Category      : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating        : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
##  $ Size          : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
##  $ Installs      : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
##  $ Type          : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price         : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
##  $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres        : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated  : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
##  $ Current.Ver   : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 122 1020 468 2827 280 116 280 2393 1457 1431 ...
##  $ Android.Ver   : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...

Factor형 변수가 많이 있는데 실제로는 Numeric형 변수인 것들도 있다. 약간의 데이터 전처리를 수행하도록 한다.

# Build the cleaned data set: strip formatting characters and convert the
# factor columns that actually hold numbers or dates into proper types.
data.clean <- data %>%
  mutate(
    # Remove the "+" sign and thousands separators, then convert to numeric.
    Installs = gsub("\\+", "", as.character(Installs)),
    Installs = as.numeric(gsub(",", "", Installs)),

    # Drop the "M" suffix.
    Size = gsub("M", "", Size),
    # Values containing "k" are set to 0 (they are below 1 MB); every other
    # value is parsed as a number, and non-numeric text becomes NA.
    Size = ifelse(grepl("k", Size), 0, as.numeric(Size)),

    # Reviews is a factor: convert through character first. Calling
    # as.numeric() directly on a factor returns the internal level codes,
    # not the review counts.
    Reviews = as.numeric(as.character(Reviews)),

    # Remove the "$" sign, then convert to numeric.
    Price = as.numeric(gsub("\\$", "", as.character(Price))),

    # Parse Last.Updated as a date (month-day-year).
    Last.Updated = mdy(Last.Updated),

    # "Varies with device" does not tell us the version, so replace it with NA.
    Min.Android.Ver = gsub("Varies with device", NA, Android.Ver),
    # Min.Android.Ver is the minimum Android version needed to run the app;
    # keep only the first three characters (major.minor) of Android.Ver.
    Min.Android.Ver = as.numeric(substr(Min.Android.Ver, start = 1, stop = 3)),

    # Drop old Android version column
    Android.Ver = NULL
  ) %>%
  filter(
    # Two apps have a Type other than Free/Paid; remove them.
    Type %in% c("Free", "Paid")
  )

str(data.clean)

## 'data.frame':    10839 obs. of  13 variables:
##  $ App            : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7206 2551 8970 8089 7272 7103 8149 5568 4926 5806 ...
##  $ Category       : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Rating         : num  4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews        : num  1183 5924 5681 1947 5924 ...
##  $ Size           : num  19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
##  $ Installs       : num  1e+04 5e+05 5e+06 5e+07 1e+05 5e+04 5e+04 1e+06 1e+06 1e+04 ...
##  $ Type           : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Price          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Content.Rating : Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
##  $ Genres         : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
##  $ Last.Updated   : Date, format: "2018-01-07" "2018-01-15" ...
##  $ Current.Ver    : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 122 1020 468 2827 280 116 280 2393 1457 1431 ...
##  $ Min.Android.Ver: num  4 4 4 4.2 4.4 2.3 4 4.2 3 4 ...

탐색적 자료분석

중복된 행

첫번째로 우리는 중복된 행이 있는지 확인한다.

# Number of rows left once exact duplicate rows are collapsed.
data.clean %>%
  distinct() %>%
  nrow()

## [1] 10356

전처리 후 데이터셋(data.clean)의 행의 개수는 10839개이고, 그중 483개의 행이 중복된다.

따라서 중복제거를 수행했다.

결측치 분석

데이터 전처리를 수행했으니 각 변수별로 NA값이 얼마나 있는지 확인해보자.

# Count the missing values in every column and chart the columns that have any.
data.clean %>%
  # A plain function is used instead of the deprecated funs() helper.
  summarise_all(function(column) sum(is.na(column))) %>%
  gather() %>%
  # Only show columns with NA (at least one missing value).
  filter(value > 0) %>%
  arrange(-value) %>%
  hchart('column', hcaes(x = 'key', y = 'value', color = 'key')) %>%
  hc_add_theme(hc_theme_elementary()) %>%
  hc_title(text = "Columns with NA values")

위와 같이 우리는 NA 값을 지닌 변수가 3개가 있다는 걸 확인했으며, 이러한 상황은 Rating 변수를 제외한 나머지 2개는 Factor형을 Numeric형으로 바꾸는 과정에서 NA가 발생해서 그렇다.

각 변수별로 NA값에 대한 분석을 수행해보자.

Rating에서 NA 값

Rating이 NA인 경우 유저들이 앱의 점수를 매기지 않은 상황이다.

그렇다면 이러한 경우 얼마나 다운로드 및 설치했는지 빈도를 확인해보자.

# For apps without a rating, chart how many apps fall under each install count.
data.clean %>%
  filter(is.na(Rating)) %>%
  count(Installs) %>%
  # Most frequent install counts first.
  arrange(desc(n)) %>%
  hchart(
    'column',
    hcaes(x = "Installs", y = "n")
  ) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_title(text = "Installations with no rating")

대부분 Rating이 없는 앱들은 다운로드 및 설치 횟수가 거의 없다. 이는 우리가 이해 할 수 있는 상황이다.

만약 다운로드 및 설치를 하지 않은 경우는 그 앱은 사람들의 리뷰 및 점수가 그만큼 없을 것이다.

하지만 약 백만번의 다운로드 및 설치가 일어났는데도 불구하고, Rating이 없는 경우도 존재한다.

Web Scraping이 잘못된 것일까? (아직 확인X)

Size에서 NA 값

데이터 전처리 과정에서, “Varies with device”라는 값을 Size에서 발견했다.

이 값은 총 1695 개의 앱에서 발견되었고 변환과정에서 NA가 발생했다.

# Count the raw rows whose Size is the placeholder "Varies with device".
count(
  filter(data, Size %in% "Varies with device")
)

## # A tibble: 1 x 1
##       n
##   <int>
## 1  1695

Minimum Android Version에서 NA 값

데이터 전처리 과정에서, “Varies with device”라는 값을 Android.Ver에서 발견했다.

이 값은 총 1362 개의 앱에서 발견되었고 변환과정에서 NA가 발생했다.

# Count the raw rows whose Android.Ver is the placeholder "Varies with device".
count(
  filter(data, Android.Ver %in% "Varies with device")
)

## # A tibble: 1 x 1
##       n
##   <int>
## 1  1362

Application size

이제 MB단위로 앱의 크기별 빈도수를 살펴보자.

# Area chart of how many apps exist for each size value (in MB).
data.clean %>%
  count(Size) %>%
  hchart(
    'area',
    hcaes(x = "Size", y = "n")
  ) %>%
  hc_colors("#fb4901") %>%
  hc_add_theme(hc_theme_ffx()) %>%
  hc_title(text = "Distribution of application size (in MB)")

# Box plot of application size, one box per application type.
with(
  data.clean,
  hcboxplot(
    x = Size,
    var = Type,
    outliers = TRUE,
    color = "#fb4901",
    fillColor = "lightblue"
  )
) %>%
  hc_chart(type = "column") %>%
  hc_add_theme(hc_theme_ffx()) %>%
  hc_title(text = "Application size range (in MB) by Application Type")

10MB 이하의 앱이 많다. 일반적으로 앱은 5MB에서 30MB정도의 크기를 갖는다.

유료앱이 무료앱보다 크기가 약간 작다.

Installs

이 데이터셋에서 앱은 0부터 10^{9} 까지 다운로드 및 설치횟수의 범위를 갖는다.

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 1.000e+03 1.000e+05 1.416e+07 1.000e+06 1.000e+09

만일 10k씩 그룹을 나눈다면 다음과 같은 분포를 볼 수 있다.

# Bucket the install counts into groups of 10,000 and count apps per bucket.
tmp <- data.clean %>%
  group_by(
    # include.lowest = TRUE keeps apps with 0 installs in the first bucket;
    # with the default they fall outside (0, 10000] and end up in an NA group.
    Installs.Group = cut(
      Installs,
      breaks = seq(0, 1000000000, by = 10000),
      include.lowest = TRUE
    )
  ) %>%
  summarise(
    n = n()
  )

# Pie chart with one slice per install bucket.
highchart() %>%
  hc_chart(
    type = "pie"
    ) %>%
  hc_add_series_labels_values(
    labels = tmp$Installs.Group, values = tmp$n
    ) %>%
  hc_title(
    text="Number of installs (groups per 10k)"
    ) %>%
  hc_add_theme(hc_theme_economist())

가장 큰 그룹은 10k까지 다운로드된 앱들이다.

100k까지 다운로드된 앱들이 전체의 반이상을 차지한다.

Type

유료앱과 무료앱별로 살펴보도록한다.

먼저 유료앱과 무료앱간 비율은 아래와 같다.

# Share of each app type as a rounded percentage, largest share first.
tmp <- data.clean %>%
  count(Type) %>%
  mutate(
    perc = round(n / sum(n) * 100)
  ) %>%
  arrange(desc(perc))

# Icon array: one icon per percentage point.
hciconarray(
  tmp$Type,
  tmp$perc,
  icons = "android",
  size = 5
) %>%
  hc_title(text = "Percentage of paid vs free apps")

100개의 앱에 대해서, 7개는 유료이다.

이번에는 category별로 무료앱과 유료앱의 비율을 살펴보도록한다.

# Percentage of free vs paid apps inside every category, as a stacked bar chart.
data.clean %>%
  group_by(Category, Type) %>%
  summarise(n = n()) %>%
  # perc is the rounded share of each app count in the sum of n.
  mutate(
    perc = round(n / sum(n) * 100)
  ) %>%
  hchart(
    'bar',
    hcaes(x = 'Category', y = 'perc', group = 'Type')
  ) %>%
  hc_plotOptions(series = list(stacking = 'normal')) %>%
  hc_title(text = "Percentage of Free vs Paid by Category") %>%
  hc_add_theme(hc_theme_flat())

“medical” 과 “personalization”가 유료앱의 구성비가 가장 높다. (약 20%)

Price

모든 앱에 대해 소비된 금액은 총 약 3.6747185 × 10^{8} 달러이다.

category별로 가격을 살펴보도록한다.

outlier의 영향으로 인한 치우침을 방지하기 위해 평균값이 아닌 중앙값을 사용한다.

중앙값을 기준으로 어떤 category가 가장 비싼지 확인해보도록한다.

# Treemap of the median price of paid apps in each category.
data.clean %>%
  filter(Type == "Paid") %>%
  group_by(Category) %>%
  summarise(Price = median(Price)) %>%
  # Most expensive categories first.
  arrange(desc(Price)) %>%
  hchart(
    'treemap',
    hcaes(x = 'Category', value = 'Price', color = 'Price')
  ) %>%
  hc_add_theme(hc_theme_elementary()) %>%
  hc_title(text = "Median price per category")

“Events”가 가장 크긴하지만 오로지 1개 앱만 유료이다. 때문에 “Finance”가 가장 비싼 앱이 되겠으며, 약 29달러 정도 된다.

각 category별로 돈을 얼마나 벌어들이는지 확인해보도록한다.

# Treemap of the money spent per category, estimated as price times installs
# and summed over the paid apps of each category.
data.clean %>%
  filter(Type == "Paid") %>%
  mutate(Total.Paid = Price * Installs) %>%
  group_by(Category) %>%
  summarise(
    USD.Paid = sum(Total.Paid)
  ) %>%
  # Highest totals first.
  arrange(desc(USD.Paid)) %>%
  hchart(
    'treemap',
    hcaes(x = 'Category', value = 'USD.Paid', color = 'USD.Paid')
  ) %>%
  hc_add_theme(hc_theme_elementary()) %>%
  hc_title(text = "Total amount spent by category (installs * price)")

“Family”가 가장 높은 이익을 만들어내는데 아마도 마인크래프트의 큰 인기 때문일 것이다.

Content Rating

“content rating”별로 얼마나 설치했는지 확인해보도록한다.

# Total installs per content rating, largest total first.
tmp <- data.clean %>%
  group_by(Content.Rating) %>%
  summarise(
    Total.Installs = sum(Installs)
  ) %>%
  arrange(desc(Total.Installs))

# Pyramid chart with one layer per content rating.
highchart() %>%
  hc_chart(type = "pyramid") %>%
  hc_add_series_labels_values(
    labels = tmp$Content.Rating,
    values = tmp$Total.Installs
  ) %>%
  hc_title(text = "Number of Installs by Content Rating") %>%
  hc_add_theme(hc_theme_flat())

“Everyone”이 가장 설치횟수가 많았고, 그 다음이 “Teen” 등급이다.

Genres

유니크한 “Genres” 갯수는 총 119 개이다.

설치횟수 기준으로 가장 인기있는 것이 무엇인지 확인하도록한다.

# Total installs per genre.
# NOTE(review): the `tmp` built for the content-rating chart has no Genres
# column, so the per-genre summary this chart reads is computed here. The
# descending order is an assumption; confirm it matches the intended chart.
tmp <- data.clean %>%
  group_by(Genres) %>%
  summarize(Total.Installs = sum(Installs)) %>%
  arrange(-Total.Installs)

# 3D column chart with one column per genre.
highchart() %>% 
  hc_chart(type ="column",
           options3d = list(enabled = TRUE, alpha = 15, beta = 15)) %>%
  hc_xAxis(categories = tmp$Genres) %>% 
  hc_add_series(data = tmp$Total.Installs, name = "Total.Installs") %>%
  hc_add_theme(hc_theme_smpl()) %>%
  hc_title(
    text="Number of Installs by Genre (all apps)"
    ) %>%
    # Border and background gradient of the chart area.
    hc_chart(
      borderColor = '#EBBA95',
      borderRadius = 10,
      borderWidth = 1,
      backgroundColor = list(
        linearGradient = c(0, 0, 500, 500), stops = list(
               list(0, 'rgb(255, 255, 255)'),
               list(1, 'rgb(200, 200, 255)')
             )))

“Communication”이 가장 인기있는 장르이다.

가장 인기있는 communication apps은 어떤게 있을까?

# First ten rows of (App, Installs) after sorting by installs, highest first.
head(
  arrange(
    select(data.clean, App, Installs),
    desc(Installs)
  ),
  10
)

##                                         App Installs
## 1                         Google Play Books    1e+09
## 2  Messenger – Text and Video Chat for Free    1e+09
## 3                        WhatsApp Messenger    1e+09
## 4              Google Chrome: Fast & Secure    1e+09
## 5                                     Gmail    1e+09
## 6                                  Hangouts    1e+09
## 7  Messenger – Text and Video Chat for Free    1e+09
## 8                                  Hangouts    1e+09
## 9             Skype - free IM & video calls    1e+09
## 10             Google Chrome: Fast & Secure    1e+09

설치횟수가 같은 앱은 10개이상이지만, 처음 상위 10개만 가져왔다.

이 다음에는 장르별로 top10 유료앱과 무료앱이 무엇이 있는지 살펴본다.

# Total installs per genre and app type.
# NOTE(review): no visible `tmp` carries both Genres and Type, so the summary
# the two charts below filter on is computed here. The descending order is an
# assumption made so that head(10) yields the top ten; confirm.
tmp <- data.clean %>%
  group_by(Genres, Type) %>%
  summarize(Total.Installs = sum(Installs)) %>%
  ungroup() %>%
  arrange(-Total.Installs)

# Ten most installed genres among paid apps.
tmp.paid <- tmp %>% filter(Type == "Paid") %>% head(10)

highchart() %>% 
  hc_chart(type ="column",
           options3d = list(enabled = TRUE, alpha = 15, beta = 15)) %>%
  hc_xAxis(categories = tmp.paid$Genres) %>% 
  hc_add_series(data = tmp.paid$Total.Installs, name = "Total.Installs") %>%
  hc_add_theme(hc_theme_smpl()) %>%
  hc_title(
    text = "Top 10 number of installs by genre"
    ) %>%
  hc_subtitle(
    text = "(paid apps only)"
  )

rm(tmp.paid)

# Ten most installed genres among free apps.
tmp.free <- tmp %>%
  filter(Type == "Free") %>%
  head(10)

highchart() %>% 
  hc_chart(type ="column",
           options3d = list(enabled = TRUE, alpha = 15, beta = 15)) %>%
  hc_xAxis(categories = tmp.free$Genres) %>% 
  hc_add_series(data = tmp.free$Total.Installs, name = "Total.Installs") %>%
  hc_add_theme(hc_theme_smpl()) %>%
  hc_title(
    text="Top 10 number of installs by genre"
    ) %>%
  hc_subtitle(
    text = "(free apps only)"
  )

rm(tmp.free)

top 10 앱 장르는 유료인지 무료인지에 따라 약간의 변동이 있다.

Top 무료 앱은 “social”이었고 유료인 경우 “games”이었다.

Last updated

시계열 차트를 만듦으로써 앱의 최근 업데이트 날짜가 언제인지 확인해보자.

# Get number of apps by last updated date
# Number of apps for each last-updated date.
tmp <- count(data.clean, Last.Updated)

# Wrap the daily counts in an xts time series indexed by date.
time_series <- xts(
  x = tmp$n,
  order.by = tmp$Last.Updated
)

# Stock-style chart of the number of apps by date of last update.
highchart(type = "stock") %>%
  hc_title(text = "Last updated date") %>%
  hc_subtitle(text = "Number of applications by date of last update") %>%
  hc_add_series(time_series) %>%
  hc_add_theme(hc_theme_gridlight())

# The series is only needed for the chart above.
rm(time_series)

대부분 앱은 최근 6개월이내로 업데이트가 되어왔다.

그러나 5년간 업데이트가 안된 어플도 존재한다.

Android Version

The column “Android version” actually relates to the minimum Android version the application supports. Let’s analyze minimum android version by number of installations.

“Android version”는 실제로 해당 앱을 실행할 때 필요한 최소 안드로이드 버전과 관련이 있다.

설치횟수에 따른 최소 안드로이드 버전을 확인해보도록한다.

# Heatmap is working on my computer but not on Kaggle. I'll put here the code and comment it out so I don't get an error.
#data.clean %>%
#  filter(Min.Android.Ver > 0, Type %in% c("Free", "Paid")
#    ) %>%
#  group_by(as.factor(Min.Android.Ver), Type) %>%
#  rename(Minimum.Android.Version = "as.factor(Min.Android.Ver)") %>%
#  summarize(Total.Installs = sum(Installs)) %>%
#  hchart(
#    type = "heatmap", hcaes(x = Minimum.Android.Version, y = Type, value = Total.Installs)
#    ) %>%
#  hc_title(text = "Minimum Android Version (by number of installs)")

4.1 이상의 안드로이드 버전에 앱이 많이 존재한다.

오래된 장치에 대해서 좋은 소식이다.

References and more information

Google Play EDA with Highchart : 한글번역

원저자 - Danilo da Silva, 역자 - 배병선