1 1.變數介紹:

我們有13個變數,包含10841個樣本

2 2.資料處理方式

2.1 2.1刪掉的變數

我們刪掉 App,Last UpdatedDate,Current VerCurrent,Android Ver這4個變數

2.2 2.2變數處理方式

  • 將Size變數全部轉成MB大小,並且刪除Size變數中 大小隨裝置改變的APP樣本去除
  • 將Installs變數的“+”號去除
  • 將Rating變數NA的樣本去除
  • 新增一個變數叫做small app ,容量小於1MB的叫做small app

最後剩下7729個樣本和10個變數

3 3.感興趣的問題

3.1 3.1哪些因素會影響APP的訂價

這裡我們配適Tobit model中的corner soution 模型,以Price作為outcome,以Review,Rating,Installs,size_mb,small_app作為feature ,這裡使用Tobit model是因為Price有很多價格都等於0

note: 這裡我是參考別人配適婚外情的data,我不太確定left cersored在0是不是就等價 Tobit model中的corner soution

fm.tobit <- tobit(Price ~Reviews+Rating+Installs+size_mb+small_app,
data = datause2)

summary(fm.tobit)
## 
## Call:
## tobit(formula = Price ~ Reviews + Rating + Installs + size_mb + 
##     small_app, data = datause2)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7729           7150            579              0 
## 
## Coefficients:
##                  Estimate    Std. Error z value   Pr(>|z|)    
## (Intercept) -148.91289953   13.84169619 -10.758    < 2e-16 ***
## Reviews        0.00012537    0.00001232  10.176    < 2e-16 ***
## Rating        11.95001211    3.03144860   3.942 0.00008080 ***
## Installs      -0.00005735    0.00000531 -10.801    < 2e-16 ***
## size_mb        0.27169760    0.08312430   3.269    0.00108 ** 
## small_app     32.62328616    7.25582737   4.496 0.00000692 ***
## Log(scale)     4.30739311    0.03168942 135.925    < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 74.25 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 11 
## Log-likelihood: -4333 on 7 Df
## Wald-statistic: 142.4 on 5 Df, p-value: < 2.22e-16

3.2 3.2哪些因素會影響APP的Rating

set.seed(1)
rf2<- ranger(Rating~Reviews+Installs+size_mb+small_app+Price+Type+Category+Genres+Content_Rating ,datause2, quantreg = TRUE,importance='impurity')
rf2$variable.importance %>% 
  as.matrix() %>% 
  as.data.frame() %>% 
  add_rownames() %>% 
  `colnames<-`(c("varname","imp")) %>%
  arrange(desc(imp)) %>% 
  top_n(25,wt = imp) %>% 
  ggplot(mapping = aes(x = reorder(varname, imp), y = imp)) +
  geom_col() +
  coord_flip() +
  ggtitle(label = "Top 9 important variables") +
  theme(
    axis.title = element_blank()
  )
## Warning: Deprecated, use tibble::rownames_to_column() instead.

df_numeric<- datause2
df_numeric[,11] <- ifelse(df_numeric$Price>0,1,0  )

df_numeric
## # A tibble: 7,729 x 11
##    Category Rating Reviews Installs Type  Price Content_Rating Genres size_mb
##    <fct>     <dbl>   <dbl>    <dbl> <fct> <dbl> <fct>          <fct>    <dbl>
##  1 ART_AND~    4.1     159    10000 Free      0 Everyone       Art &~    19  
##  2 ART_AND~    3.9     967   500000 Free      0 Everyone       Art &~    14  
##  3 ART_AND~    4.7   87510  5000000 Free      0 Everyone       Art &~     8.7
##  4 ART_AND~    4.5  215644 50000000 Free      0 Teen           Art &~    25  
##  5 ART_AND~    4.3     967   100000 Free      0 Everyone       Art &~     2.8
##  6 ART_AND~    4.4     167    50000 Free      0 Everyone       Art &~     5.6
##  7 ART_AND~    3.8     178    50000 Free      0 Everyone       Art &~    19  
##  8 ART_AND~    4.1   36815  1000000 Free      0 Everyone       Art &~    29  
##  9 ART_AND~    4.4   13791  1000000 Free      0 Everyone       Art &~    33  
## 10 ART_AND~    4.7     121    10000 Free      0 Everyone       Art &~     3.1
## # ... with 7,719 more rows, and 2 more variables: small_app <dbl>, V11 <dbl>
df_numeric<- 
df_numeric %>% rename(paidornot=V11)
df_numeric <- df_numeric[,-c(1,5,7,8)]
#require(e1071)
#library(rminer)

#M <- fit(Price ~., data=df_numeric, model="svm", kpar=list(sigma=0.10))
#summary(M)
#svm.imp <- Importance(M, data=df_numeric)

#sum(is.na.data.frame(df_numeric))

#model<-svm(Price ~ . , df_numeric)
#summary(model)

#model$SV
#svr.pred = predict(model, df_numeric)

#sqrt(  mean((df_numeric$Price - svr.pred)^2 ))

#data(churn)
#model <- fit(Rating~Reviews, data = df_numeric, model = 'svm')

#df<- as.matrix(df_numeric)

3.3 3.5針對app name去做文字雲

mixseg<-worker()
seg <- mixseg[datause$App]
segA<-data.frame(table(seg))

segC<-data.frame(table(seg[nchar(seg)>1]))#data.frame
segC_top50<-head(segC[order(segC$Freq,decreasing = TRUE),],50)

library(wordcloud)
## Loading required package: RColorBrewer
par(family=("Heiti TC Light"))
wordcloud(
  words = segC_top50[,1], # 或segC_top50$Var1
  freq =  segC_top50$Freq, 
  scale = c(4,.1), # 給定文字尺寸的區間(向量)
  random.order = FALSE,# 關閉文字隨機顯示 按順序
  ordered.colors = FALSE,#關閉配色順序
  rot.per = FALSE,#關閉文字轉角度
  min.freq = 7,# 定義最小freq數字 
  colors = brewer.pal(8,"Dark2")
)

3.4 3.6針對app 的評論去做正負面分析判斷

  • 1.先刪掉沒有評論和中立性的發言
  • 2.先做BOW matrix
  • 3.將BOW的模型處理的 matrix當成input 丟入 logsitic regression中

3.4.1 3.6.1先刪掉沒有評論和中立性的發言

library(text2vec)
review2<-review[-which(is.na(review$Sentiment_Polarity)),]
review2<- review2[which(complete.cases(review2)),]
review2 <- review2[-which(review2$Sentiment=="Neutral"),]
review2$id <- seq(1:dim(review2)[1])
review2$sentiment <- ifelse(review2$Sentiment=="Positive",1,0)
review2 <- review2[,-c(3,4,5)]

review2 <- review2 [,c( 3,1,2,4    )]

3.4.2 3.6.2製作BOW矩陣

這裡簡單介紹何謂BOW matrix

這個矩陣會列出每個詞彙在每則評論的出現頻率

好處是即使測試及沒有相對應的詞彙依然有用

setDT(review2)
setkey(review2, id)
set.seed(2016L)
all_ids = review2$id
train_ids = sample(all_ids, 20000)
test_ids = setdiff(all_ids, train_ids)
train = review2[train_ids,]
test = review2[test_ids,]
prep_fun = tolower
tok_fun = word_tokenizer
it_train = itoken(train$Translated_Review, 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = train$id, 
             progressbar = FALSE)
vocab = create_vocabulary(it_train)

vectorizer = vocab_vectorizer(vocab)
dtm_train = create_dtm(it_train, vectorizer)
dim(dtm_train)
## [1] 20000 17448
m <- as.matrix(dtm_train)
#identical(rownames(dtm_train), train$id)
#check there is no wrong with id
sum(as.numeric(rownames(dtm_train))!=train$id)
## [1] 0
#which(colnames(m)=="good")

show the bow matrix

m[50:60 ,17444:17448   ]
##       good the it game i
## 31770    1   0  1    2 1
## 10343    0   0  0    0 0
## 27168    0   0  0    0 1
## 16964    0   0  0    0 0
## 24512    1   0  0    0 0
## 26942    0   0  0    0 1
## 24315    0   0  1    0 1
## 3751     0   0  0    0 1
## 9237     0   1  0    0 0
## 24872    0   0  0    0 4
## 10504    0   0  0    4 1
dim(m)
## [1] 20000 17448

3.4.3 3.6.3建立logistic regression

將bow matrix當成feature給logistic regression

這裡使用l1 panalty 和 5 fold cross validation去挑選出lambda

NFOLDS = 5
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['sentiment']], 
                              family = 'binomial', 
                              # L1 penalty
                              alpha = 1,
                              # interested in the area under ROC curve
                              type.measure = "auc",
                              # 5-fold cross-validation
                              nfolds = NFOLDS,
                              # high value is less accurate, but has faster training
                              thresh = 1e-3,
                              # again lower number of iterations for faster training
                              maxit = 1e3)

3.4.4 3.6.4訓練集的混淆矩陣

train_real<- ifelse(train$sentiment>0,"positive"," negative"   )
train_real <- as.factor(train_real)
preds = predict(glmnet_classifier, dtm_train, type = 'response')[,1]
predictions<- ifelse(preds>0.5,"positive"," negative"   )
predictions <- as.factor(predictions)
confusionMatrix( predictions,train_real)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction   negative positive
##    negative      4641       96
##   positive        462    14801
##                                           
##                Accuracy : 0.9721          
##                  95% CI : (0.9697, 0.9743)
##     No Information Rate : 0.7448          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9248          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9095          
##             Specificity : 0.9936          
##          Pos Pred Value : 0.9797          
##          Neg Pred Value : 0.9697          
##              Prevalence : 0.2551          
##          Detection Rate : 0.2321          
##    Detection Prevalence : 0.2369          
##       Balanced Accuracy : 0.9515          
##                                           
##        'Positive' Class :  negative       
## 

3.4.5 3.6.5ROC curve of training data

plot(roc(train$sentiment,preds , direction="<"), 
     col="red", lwd=1, main="ROC curve") 
## Setting levels: control = 0, case = 1

3.4.6 3.6.5Auc of training data

glmnet:::auc(train$sentiment, preds)
## [1] 0.9940932
it_test = test$Translated_Review %>% 
  prep_fun %>% 
  tok_fun %>% 
  itoken(ids = test$id, 
         # turn off progressbar because it won't look nice in rmd
         progressbar = FALSE)

dtm_test = create_dtm(it_test, vectorizer)

3.4.7 3.6.6測試集的混淆矩陣

test_real<- ifelse(test$sentiment>0,"positive"," negative"   )
test_real<- as.factor(test_real)
preds = predict(glmnet_classifier, dtm_test, type = 'response')[,1]
predictions<- ifelse(preds>0.5,"positive"," negative"   )
predictions <- as.factor(predictions)
confusionMatrix( predictions      ,test_real)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction   negative positive
##    negative      2628      186
##   positive        540     8915
##                                           
##                Accuracy : 0.9408          
##                  95% CI : (0.9365, 0.9449)
##     No Information Rate : 0.7418          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8397          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8295          
##             Specificity : 0.9796          
##          Pos Pred Value : 0.9339          
##          Neg Pred Value : 0.9429          
##              Prevalence : 0.2582          
##          Detection Rate : 0.2142          
##    Detection Prevalence : 0.2294          
##       Balanced Accuracy : 0.9046          
##                                           
##        'Positive' Class :  negative       
## 

3.4.8 3.6.7Roc curve of testing data

plot(roc(test$sentiment,preds , direction="<"), 
     col="red", lwd=1, main="ROC curve") 
## Setting levels: control = 0, case = 1

### 3.6.7Auc of testing data

glmnet:::auc(test$sentiment, preds)
## [1] 0.9789596

3.5 deal the install as categorical or ordinal

table(datause2$Installs)
## 
##          1          5         10         50        100        500       1000 
##          3          9         67         56        303        197        690 
##       5000      10000      50000     100000     500000    1000000    5000000 
##        420        969        437       1037        491       1302        535 
##   10000000   50000000  100000000  500000000 1000000000 
##        825        147        201         30         10
knitr::kable(table(datause2$Installs)   )
Var1 Freq
1 3
5 9
10 67
50 56
100 303
500 197
1000 690
5000 420
10000 969
50000 437
100000 1037
500000 491
1000000 1302
5000000 535
10000000 825
50000000 147
100000000 201
500000000 30
1000000000 10
  • 可看出此變數以每5倍,每2倍之順序在進行統計

  • 此變數雖然是數值型,但是由於不是非常連續,所以我們把它分成三種type,分別是low,medium,high

low(1-10000)

medium(10000-1000000)

high(10^6 up)

sum( table(datause2$Installs)  ) # just for check
## [1] 7729
#ifelse(datause2$Installs<10000 ,"low" ,"medium"     ) 
a<- datause2$Installs

ordinal<- cut(a, c(-Inf,10000,1000000,Inf), c("low","medium","high"),right=T)

datause2$ordinal <- ordinal

df <- datause2[,-c( 1,4,8  )]
#a[which(1000000<=a),1] <- "high"
#a[which(a<10000),1]<-"low"
#a[which(10000<=a&a<1000000),1] <- "medium"

m <- polr(ordinal~ Price+Rating+size_mb+small_app, data = df, Hess=TRUE)
summary(m)
## Call:
## polr(formula = ordinal ~ Price + Rating + size_mb + small_app, 
##     data = df, Hess = TRUE)
## 
## Coefficients:
##              Value Std. Error t value
## Price     -0.06542   0.012292  -5.322
## Rating     0.36387   0.041137   8.845
## size_mb    0.02600   0.000997  26.084
## small_app -0.58709   0.129758  -4.524
## 
## Intercepts:
##             Value   Std. Error t value
## low|medium   1.3699  0.1726     7.9375
## medium|high  3.4158  0.1763    19.3716
## 
## Residual Deviance: 15498.17 
## AIC: 15510.17
(ctable <- coef(summary(m)))
##                   Value   Std. Error   t value
## Price       -0.06542291 0.0122922065 -5.322308
## Rating       0.36386696 0.0411366445  8.845324
## size_mb      0.02600456 0.0009969589 26.083883
## small_app   -0.58708625 0.1297579377 -4.524473
## low|medium   1.36991449 0.1725880531  7.937482
## medium|high  3.41579831 0.1763302744 19.371593
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
(ctable <- cbind(ctable, "p value" = p))
##                   Value   Std. Error   t value       p value
## Price       -0.06542291 0.0122922065 -5.322308  1.024589e-07
## Rating       0.36386696 0.0411366445  8.845324  9.126290e-19
## size_mb      0.02600456 0.0009969589 26.083883 5.555030e-150
## small_app   -0.58708625 0.1297579377 -4.524473  6.054626e-06
## low|medium   1.36991449 0.1725880531  7.937482  2.063277e-15
## medium|high  3.41579831 0.1763302744 19.371593  1.340454e-83
ctable
##                   Value   Std. Error   t value       p value
## Price       -0.06542291 0.0122922065 -5.322308  1.024589e-07
## Rating       0.36386696 0.0411366445  8.845324  9.126290e-19
## size_mb      0.02600456 0.0009969589 26.083883 5.555030e-150
## small_app   -0.58708625 0.1297579377 -4.524473  6.054626e-06
## low|medium   1.36991449 0.1725880531  7.937482  2.063277e-15
## medium|high  3.41579831 0.1763302744 19.371593  1.340454e-83