3 3.感興趣的問題

3.1 3.1哪些因素會影響APP的訂價

這裡我們配適Tobit model中的corner soution 模型，以Price作為outcome，以Review,Rating,Installs,size_mb,small_app作為feature ，這裡使用Tobit model是因為Price有很多價格都等於0

note: 這裡我是參考別人配適婚外情的data，我不太確定left cersored在0是不是就等價 Tobit model中的corner soution

fm.tobit <- tobit(Price ~Reviews+Rating+Installs+size_mb+small_app,
data = datause2)

summary(fm.tobit)

## 
## Call:
## tobit(formula = Price ~ Reviews + Rating + Installs + size_mb + 
##     small_app, data = datause2)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7729           7150            579              0 
## 
## Coefficients:
##                  Estimate    Std. Error z value   Pr(>|z|)    
## (Intercept) -148.91289953   13.84169619 -10.758    < 2e-16 ***
## Reviews        0.00012537    0.00001232  10.176    < 2e-16 ***
## Rating        11.95001211    3.03144860   3.942 0.00008080 ***
## Installs      -0.00005735    0.00000531 -10.801    < 2e-16 ***
## size_mb        0.27169760    0.08312430   3.269    0.00108 ** 
## small_app     32.62328616    7.25582737   4.496 0.00000692 ***
## Log(scale)     4.30739311    0.03168942 135.925    < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 74.25 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 11 
## Log-likelihood: -4333 on 7 Df
## Wald-statistic: 142.4 on 5 Df, p-value: < 2.22e-16

3.2 3.2哪些因素會影響APP的Rating

set.seed(1)
rf2<- ranger(Rating~Reviews+Installs+size_mb+small_app+Price+Type+Category+Genres+Content_Rating ,datause2, quantreg = TRUE,importance='impurity')

rf2$variable.importance %>% 
  as.matrix() %>% 
  as.data.frame() %>% 
  add_rownames() %>% 
  `colnames<-`(c("varname","imp")) %>%
  arrange(desc(imp)) %>% 
  top_n(25,wt = imp) %>% 
  ggplot(mapping = aes(x = reorder(varname, imp), y = imp)) +
  geom_col() +
  coord_flip() +
  ggtitle(label = "Top 9 important variables") +
  theme(
    axis.title = element_blank()
  )

## Warning: Deprecated, use tibble::rownames_to_column() instead.

df_numeric<- datause2
df_numeric[,11] <- ifelse(df_numeric$Price>0,1,0  )

df_numeric

## # A tibble: 7,729 x 11
##    Category Rating Reviews Installs Type  Price Content_Rating Genres size_mb
##    <fct>     <dbl>   <dbl>    <dbl> <fct> <dbl> <fct>          <fct>    <dbl>
##  1 ART_AND~    4.1     159    10000 Free      0 Everyone       Art &~    19  
##  2 ART_AND~    3.9     967   500000 Free      0 Everyone       Art &~    14  
##  3 ART_AND~    4.7   87510  5000000 Free      0 Everyone       Art &~     8.7
##  4 ART_AND~    4.5  215644 50000000 Free      0 Teen           Art &~    25  
##  5 ART_AND~    4.3     967   100000 Free      0 Everyone       Art &~     2.8
##  6 ART_AND~    4.4     167    50000 Free      0 Everyone       Art &~     5.6
##  7 ART_AND~    3.8     178    50000 Free      0 Everyone       Art &~    19  
##  8 ART_AND~    4.1   36815  1000000 Free      0 Everyone       Art &~    29  
##  9 ART_AND~    4.4   13791  1000000 Free      0 Everyone       Art &~    33  
## 10 ART_AND~    4.7     121    10000 Free      0 Everyone       Art &~     3.1
## # ... with 7,719 more rows, and 2 more variables: small_app <dbl>, V11 <dbl>

df_numeric<- 
df_numeric %>% rename(paidornot=V11)
df_numeric <- df_numeric[,-c(1,5,7,8)]

#require(e1071)
#library(rminer)

#M <- fit(Price ~., data=df_numeric, model="svm", kpar=list(sigma=0.10))
#summary(M)
#svm.imp <- Importance(M, data=df_numeric)

#sum(is.na.data.frame(df_numeric))

#model<-svm(Price ~ . , df_numeric)
#summary(model)

#model$SV
#svr.pred = predict(model, df_numeric)

#sqrt(  mean((df_numeric$Price - svr.pred)^2 ))

#data(churn)
#model <- fit(Rating~Reviews, data = df_numeric, model = 'svm')

#df<- as.matrix(df_numeric)

3.3 3.5針對app name去做文字雲

mixseg<-worker()
seg <- mixseg[datause$App]
segA<-data.frame(table(seg))

segC<-data.frame(table(seg[nchar(seg)>1]))#data.frame
segC_top50<-head(segC[order(segC$Freq,decreasing = TRUE),],50)

library(wordcloud)

## Loading required package: RColorBrewer

par(family=("Heiti TC Light"))
wordcloud(
  words = segC_top50[,1], # 或segC_top50$Var1
  freq =  segC_top50$Freq, 
  scale = c(4,.1), # 給定文字尺寸的區間（向量）
  random.order = FALSE,# 關閉文字隨機顯示 按順序
  ordered.colors = FALSE,#關閉配色順序
  rot.per = FALSE,#關閉文字轉角度
  min.freq = 7,# 定義最小freq數字 
  colors = brewer.pal(8,"Dark2")
)

3.4 3.6針對app 的評論去做正負面分析判斷

1.先刪掉沒有評論和中立性的發言
2.先做BOW matrix
3.將BOW的模型處理的 matrix當成input 丟入 logsitic regression中

3.4.1 3.6.1先刪掉沒有評論和中立性的發言

library(text2vec)
review2<-review[-which(is.na(review$Sentiment_Polarity)),]
review2<- review2[which(complete.cases(review2)),]
review2 <- review2[-which(review2$Sentiment=="Neutral"),]
review2$id <- seq(1:dim(review2)[1])
review2$sentiment <- ifelse(review2$Sentiment=="Positive",1,0)
review2 <- review2[,-c(3,4,5)]

review2 <- review2 [,c( 3,1,2,4    )]

3.4.2 3.6.2製作BOW矩陣

這裡簡單介紹何謂BOW matrix

這個矩陣會列出每個詞彙在每則評論的出現頻率

好處是即使測試及沒有相對應的詞彙依然有用

setDT(review2)
setkey(review2, id)
set.seed(2016L)
all_ids = review2$id
train_ids = sample(all_ids, 20000)
test_ids = setdiff(all_ids, train_ids)
train = review2[train_ids,]
test = review2[test_ids,]

prep_fun = tolower
tok_fun = word_tokenizer
it_train = itoken(train$Translated_Review, 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = train$id, 
             progressbar = FALSE)
vocab = create_vocabulary(it_train)

vectorizer = vocab_vectorizer(vocab)
dtm_train = create_dtm(it_train, vectorizer)
dim(dtm_train)

## [1] 20000 17448

m <- as.matrix(dtm_train)
#identical(rownames(dtm_train), train$id)
#check there is no wrong with id
sum(as.numeric(rownames(dtm_train))!=train$id)

## [1] 0

#which(colnames(m)=="good")

show the bow matrix

m[50:60 ,17444:17448   ]

##       good the it game i
## 31770    1   0  1    2 1
## 10343    0   0  0    0 0
## 27168    0   0  0    0 1
## 16964    0   0  0    0 0
## 24512    1   0  0    0 0
## 26942    0   0  0    0 1
## 24315    0   0  1    0 1
## 3751     0   0  0    0 1
## 9237     0   1  0    0 0
## 24872    0   0  0    0 4
## 10504    0   0  0    4 1

dim(m)

## [1] 20000 17448

3.4.3 3.6.3建立logistic regression

將bow matrix當成feature給logistic regression

這裡使用l1 panalty 和 5 fold cross validation去挑選出lambda

NFOLDS = 5
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['sentiment']], 
                              family = 'binomial', 
                              # L1 penalty
                              alpha = 1,
                              # interested in the area under ROC curve
                              type.measure = "auc",
                              # 5-fold cross-validation
                              nfolds = NFOLDS,
                              # high value is less accurate, but has faster training
                              thresh = 1e-3,
                              # again lower number of iterations for faster training
                              maxit = 1e3)

3.4.4 3.6.4訓練集的混淆矩陣

train_real<- ifelse(train$sentiment>0,"positive"," negative"   )
train_real <- as.factor(train_real)
preds = predict(glmnet_classifier, dtm_train, type = 'response')[,1]
predictions<- ifelse(preds>0.5,"positive"," negative"   )
predictions <- as.factor(predictions)
confusionMatrix( predictions,train_real)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction   negative positive
##    negative      4641       96
##   positive        462    14801
##                                           
##                Accuracy : 0.9721          
##                  95% CI : (0.9697, 0.9743)
##     No Information Rate : 0.7448          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9248          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9095          
##             Specificity : 0.9936          
##          Pos Pred Value : 0.9797          
##          Neg Pred Value : 0.9697          
##              Prevalence : 0.2551          
##          Detection Rate : 0.2321          
##    Detection Prevalence : 0.2369          
##       Balanced Accuracy : 0.9515          
##                                           
##        'Positive' Class :  negative       
##

3.4.5 3.6.5ROC curve of training data

plot(roc(train$sentiment,preds , direction="<"), 
     col="red", lwd=1, main="ROC curve")

## Setting levels: control = 0, case = 1

3.4.6 3.6.5Auc of training data

glmnet:::auc(train$sentiment, preds)

## [1] 0.9940932

it_test = test$Translated_Review %>% 
  prep_fun %>% 
  tok_fun %>% 
  itoken(ids = test$id, 
         # turn off progressbar because it won't look nice in rmd
         progressbar = FALSE)

dtm_test = create_dtm(it_test, vectorizer)

3.4.7 3.6.6測試集的混淆矩陣

test_real<- ifelse(test$sentiment>0,"positive"," negative"   )
test_real<- as.factor(test_real)
preds = predict(glmnet_classifier, dtm_test, type = 'response')[,1]
predictions<- ifelse(preds>0.5,"positive"," negative"   )
predictions <- as.factor(predictions)
confusionMatrix( predictions      ,test_real)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction   negative positive
##    negative      2628      186
##   positive        540     8915
##                                           
##                Accuracy : 0.9408          
##                  95% CI : (0.9365, 0.9449)
##     No Information Rate : 0.7418          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8397          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8295          
##             Specificity : 0.9796          
##          Pos Pred Value : 0.9339          
##          Neg Pred Value : 0.9429          
##              Prevalence : 0.2582          
##          Detection Rate : 0.2142          
##    Detection Prevalence : 0.2294          
##       Balanced Accuracy : 0.9046          
##                                           
##        'Positive' Class :  negative       
##

3.4.8 3.6.7Roc curve of testing data

plot(roc(test$sentiment,preds , direction="<"), 
     col="red", lwd=1, main="ROC curve")

## Setting levels: control = 0, case = 1

### 3.6.7Auc of testing data

glmnet:::auc(test$sentiment, preds)

## [1] 0.9789596

3.5 deal the install as categorical or ordinal

table(datause2$Installs)

## 
##          1          5         10         50        100        500       1000 
##          3          9         67         56        303        197        690 
##       5000      10000      50000     100000     500000    1000000    5000000 
##        420        969        437       1037        491       1302        535 
##   10000000   50000000  100000000  500000000 1000000000 
##        825        147        201         30         10

knitr::kable(table(datause2$Installs)   )

Var1	Freq
1	3
5	9
10	67
50	56
100	303
500	197
1000	690
5000	420
10000	969
50000	437
100000	1037
500000	491
1000000	1302
5000000	535
10000000	825
50000000	147
100000000	201
500000000	30
1000000000	10

可看出此變數以每5倍，每2倍之順序在進行統計
此變數雖然是數值型，但是由於不是非常連續，所以我們把它分成三種type，分別是low,medium,high

low(1-10000)

medium(10000-1000000)

high(10^6 up)

sum( table(datause2$Installs)  ) # just for check

## [1] 7729

#ifelse(datause2$Installs<10000 ,"low" ,"medium"     ) 
a<- datause2$Installs

ordinal<- cut(a, c(-Inf,10000,1000000,Inf), c("low","medium","high"),right=T)

datause2$ordinal <- ordinal

df <- datause2[,-c( 1,4,8  )]
#a[which(1000000<=a),1] <- "high"
#a[which(a<10000),1]<-"low"
#a[which(10000<=a&a<1000000),1] <- "medium"

m <- polr(ordinal~ Price+Rating+size_mb+small_app, data = df, Hess=TRUE)
summary(m)

## Call:
## polr(formula = ordinal ~ Price + Rating + size_mb + small_app, 
##     data = df, Hess = TRUE)
## 
## Coefficients:
##              Value Std. Error t value
## Price     -0.06542   0.012292  -5.322
## Rating     0.36387   0.041137   8.845
## size_mb    0.02600   0.000997  26.084
## small_app -0.58709   0.129758  -4.524
## 
## Intercepts:
##             Value   Std. Error t value
## low|medium   1.3699  0.1726     7.9375
## medium|high  3.4158  0.1763    19.3716
## 
## Residual Deviance: 15498.17 
## AIC: 15510.17

(ctable <- coef(summary(m)))

##                   Value   Std. Error   t value
## Price       -0.06542291 0.0122922065 -5.322308
## Rating       0.36386696 0.0411366445  8.845324
## size_mb      0.02600456 0.0009969589 26.083883
## small_app   -0.58708625 0.1297579377 -4.524473
## low|medium   1.36991449 0.1725880531  7.937482
## medium|high  3.41579831 0.1763302744 19.371593

p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
(ctable <- cbind(ctable, "p value" = p))

##                   Value   Std. Error   t value       p value
## Price       -0.06542291 0.0122922065 -5.322308  1.024589e-07
## Rating       0.36386696 0.0411366445  8.845324  9.126290e-19
## size_mb      0.02600456 0.0009969589 26.083883 5.555030e-150
## small_app   -0.58708625 0.1297579377 -4.524473  6.054626e-06
## low|medium   1.36991449 0.1725880531  7.937482  2.063277e-15
## medium|high  3.41579831 0.1763302744 19.371593  1.340454e-83

ctable

##                   Value   Std. Error   t value       p value
## Price       -0.06542291 0.0122922065 -5.322308  1.024589e-07
## Rating       0.36386696 0.0411366445  8.845324  9.126290e-19
## size_mb      0.02600456 0.0009969589 26.083883 5.555030e-150
## small_app   -0.58708625 0.1297579377 -4.524473  6.054626e-06
## low|medium   1.36991449 0.1725880531  7.937482  2.063277e-15
## medium|high  3.41579831 0.1763302744 19.371593  1.340454e-83

googleplaydataset

Chen Ning Kuan

2020年1月27日

1 1.變數介紹:

2 2.資料處理方式

2.1 2.1刪掉的變數

2.2 2.2變數處理方式