We have 13 variables and 10,841 observations:
App: Application name
Category: Category the app belongs to
Rating: Overall user rating of the app (as when scraped)
Reviews: Number of user reviews for the app (as when scraped)
Size: Size of the app (as when scraped)
Installs: Number of user downloads/installs for the app (as when scraped)
Type: Paid or Free
Price: Price of the app (as when scraped)
Content Rating: Age group the app is targeted at - Children / Mature 21+ / Adult
Genres: An app can belong to multiple genres (apart from its main category). For example, a musical family game will belong to the Music, Game, and Family genres.
Last Updated: Date when the app was last updated on Play Store (as when scraped)
Current Ver: Current version of the app available on Play Store (as when scraped)
Android Ver: Minimum required Android version (as when scraped)
We drop four of these variables: App, Last Updated, Current Ver, and Android Ver.
After cleaning, 7,729 observations and 10 variables remain (Size reappears below as the derived size_mb and small_app columns, which is why ten variables remain rather than nine).
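A minimal sketch of this cleaning step (my reconstruction; the raw-data object name `data_raw`, the exact column names, and the NA handling are assumptions, and the derivation of size_mb/small_app from Size is not shown):

library(dplyr)
library(tidyr)

# Hypothetical reconstruction of the cleaning: drop the four unused columns,
# then remove incomplete rows, leaving the 7,729 observations used below.
datause2 <- data_raw %>%
  select(-App, -`Last Updated`, -`Current Ver`, -`Android Ver`) %>%
  drop_na()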
Here we fit the corner-solution variant of the Tobit model, with Price as the outcome and Reviews, Rating, Installs, size_mb, and small_app as features. A Tobit model is appropriate because a large share of the observed prices are exactly 0.
Note: I adapted this from an example that fits a Tobit model to extramarital-affairs data; I am not entirely sure that left-censoring at 0 is equivalent to the corner-solution case of the Tobit model.
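For reference, the corner-solution Tobit specification treats the observed price as a censored latent Gaussian variable:

$$
y_i^{*} = x_i^{\top}\beta + \varepsilon_i, \qquad \varepsilon_i \sim N(0, \sigma^2), \qquad y_i = \max(0,\, y_i^{*})
$$

The likelihood is the same as for a model left-censored at 0, which is what tobit() fits by default (left = 0); the corner-solution case differs only in interpretation, since the zeros are genuine prices rather than unobserved values.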
library(AER)  # provides tobit()

fm.tobit <- tobit(Price ~ Reviews + Rating + Installs + size_mb + small_app,
                  data = datause2)
summary(fm.tobit)
##
## Call:
## tobit(formula = Price ~ Reviews + Rating + Installs + size_mb +
## small_app, data = datause2)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7729 7150 579 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -148.91289953 13.84169619 -10.758 < 2e-16 ***
## Reviews 0.00012537 0.00001232 10.176 < 2e-16 ***
## Rating 11.95001211 3.03144860 3.942 0.00008080 ***
## Installs -0.00005735 0.00000531 -10.801 < 2e-16 ***
## size_mb 0.27169760 0.08312430 3.269 0.00108 **
## small_app 32.62328616 7.25582737 4.496 0.00000692 ***
## Log(scale) 4.30739311 0.03168942 135.925 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 74.25
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 11
## Log-likelihood: -4333 on 7 Df
## Wald-statistic: 142.4 on 5 Df, p-value: < 2.22e-16
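To interpret the fit on the observed-price scale, one option (a sketch I am adding, not part of the original analysis) is the censored-normal mean $E[y \mid x] = \Phi(x^{\top}\beta/\sigma)\,x^{\top}\beta + \sigma\,\phi(x^{\top}\beta/\sigma)$:

# Sketch: expected observed price implied by the corner-solution fit above.
# type = "lp" returns the latent linear predictor of the survreg-style fit.
xb <- predict(fm.tobit, type = "lp")
s <- fm.tobit$scale  # estimated sigma (Scale: 74.25 above)
e_price <- pnorm(xb / s) * xb + s * dnorm(xb / s)
summary(e_price)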
library(ranger)
library(dplyr)
library(ggplot2)

set.seed(1)
rf2 <- ranger(Rating ~ Reviews + Installs + size_mb + small_app + Price +
                Type + Category + Genres + Content_Rating,
              data = datause2, quantreg = TRUE, importance = 'impurity')
rf2$variable.importance %>%
  as.matrix() %>%
  as.data.frame() %>%
  tibble::rownames_to_column() %>%
  `colnames<-`(c("varname", "imp")) %>%
  arrange(desc(imp)) %>%
  top_n(25, wt = imp) %>%  # only 9 predictors, so all are kept
  ggplot(mapping = aes(x = reorder(varname, imp), y = imp)) +
  geom_col() +
  coord_flip() +
  ggtitle(label = "Top 9 important variables") +
  theme(
    axis.title = element_blank()
  )
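Because quantreg = TRUE was set, the fitted forest also supports quantile prediction. A short sketch (my addition) of how conditional Rating quantiles could be obtained:

# Sketch: 10th/50th/90th percentile predictions of Rating from the
# quantile regression forest rf2 fitted above.
q_pred <- predict(rf2, data = datause2,
                  type = "quantiles", quantiles = c(0.1, 0.5, 0.9))
head(q_pred$predictions)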
df_numeric <- datause2
df_numeric[, 11] <- ifelse(df_numeric$Price > 0, 1, 0)  # add an 11th column flagging paid apps
df_numeric
## # A tibble: 7,729 x 11
## Category Rating Reviews Installs Type Price Content_Rating Genres size_mb
## <fct> <dbl> <dbl> <dbl> <fct> <dbl> <fct> <fct> <dbl>
## 1 ART_AND~ 4.1 159 10000 Free 0 Everyone Art &~ 19
## 2 ART_AND~ 3.9 967 500000 Free 0 Everyone Art &~ 14
## 3 ART_AND~ 4.7 87510 5000000 Free 0 Everyone Art &~ 8.7
## 4 ART_AND~ 4.5 215644 50000000 Free 0 Teen Art &~ 25
## 5 ART_AND~ 4.3 967 100000 Free 0 Everyone Art &~ 2.8
## 6 ART_AND~ 4.4 167 50000 Free 0 Everyone Art &~ 5.6
## 7 ART_AND~ 3.8 178 50000 Free 0 Everyone Art &~ 19
## 8 ART_AND~ 4.1 36815 1000000 Free 0 Everyone Art &~ 29
## 9 ART_AND~ 4.4 13791 1000000 Free 0 Everyone Art &~ 33
## 10 ART_AND~ 4.7 121 10000 Free 0 Everyone Art &~ 3.1
## # ... with 7,719 more rows, and 2 more variables: small_app <dbl>, V11 <dbl>
df_numeric <- df_numeric %>% rename(paidornot = V11)
df_numeric <- df_numeric[, -c(1, 5, 7, 8)]  # drop Category, Type, Content_Rating, Genres
# Exploratory SVM / SVR attempts, left commented out (not used in what follows):
#require(e1071)
#library(rminer)
#M <- fit(Price ~., data=df_numeric, model="svm", kpar=list(sigma=0.10))
#summary(M)
#svm.imp <- Importance(M, data=df_numeric)
#sum(is.na.data.frame(df_numeric))
#model<-svm(Price ~ . , df_numeric)
#summary(model)
#model$SV
#svr.pred = predict(model, df_numeric)
#sqrt( mean((df_numeric$Price - svr.pred)^2 ))
#data(churn)
#model <- fit(Rating~Reviews, data = df_numeric, model = 'svm')
#df<- as.matrix(df_numeric)
library(jiebaR)

mixseg <- worker()  # default jiebaR segmenter
seg <- mixseg[datause$App]  # segment the app names into tokens
segA <- data.frame(table(seg))
segC <- data.frame(table(seg[nchar(seg) > 1]))  # keep tokens longer than one character
segC_top50 <- head(segC[order(segC$Freq, decreasing = TRUE), ], 50)
library(wordcloud)
## Loading required package: RColorBrewer
par(family = "Heiti TC Light")
wordcloud(
  words = segC_top50[, 1],  # equivalently segC_top50$Var1
  freq = segC_top50$Freq,
  scale = c(4, .1),         # size range for the words
  random.order = FALSE,     # plot in decreasing frequency order, not random
  ordered.colors = FALSE,   # do not tie colors to word order
  rot.per = 0,              # no rotated words
  min.freq = 7,             # minimum frequency for a word to appear
  colors = brewer.pal(8, "Dark2")
)
library(text2vec)
# Keep only complete, non-neutral reviews and binarize the sentiment label.
review2 <- review[-which(is.na(review$Sentiment_Polarity)), ]
review2 <- review2[which(complete.cases(review2)), ]
review2 <- review2[-which(review2$Sentiment == "Neutral"), ]
review2$id <- seq_len(nrow(review2))
review2$sentiment <- ifelse(review2$Sentiment == "Positive", 1, 0)
review2 <- review2[, -c(3, 4, 5)]    # drop columns no longer needed
review2 <- review2[, c(3, 1, 2, 4)]  # reorder so id comes first
Here is a brief introduction to the bag-of-words (BOW) matrix.
Each entry of this matrix counts how often a given token appears in a given review.
A benefit of this representation is that it still works when the test set lacks matching vocabulary: unseen test-set tokens are simply ignored by the vectorizer built on the training vocabulary.
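As a toy illustration (my own example, separate from the analysis), two short documents run through the same text2vec pipeline yield:

# Toy BOW example: two documents, four distinct tokens.
toy <- c("good game good fun", "bad game")
it_toy <- itoken(toy, preprocessor = tolower, tokenizer = word_tokenizer)
dtm_toy <- create_dtm(it_toy, vocab_vectorizer(create_vocabulary(it_toy)))
as.matrix(dtm_toy)  # each row is a document, each column a token count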
library(data.table)

setDT(review2)
setkey(review2, id)
set.seed(2016L)
all_ids = review2$id
train_ids = sample(all_ids, 20000)      # 20,000 reviews for training
test_ids = setdiff(all_ids, train_ids)  # remainder held out for testing
train = review2[train_ids, ]
test = review2[test_ids, ]
prep_fun = tolower
tok_fun = word_tokenizer
it_train = itoken(train$Translated_Review,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = train$id,
progressbar = FALSE)
vocab = create_vocabulary(it_train)
vectorizer = vocab_vectorizer(vocab)
dtm_train = create_dtm(it_train, vectorizer)
dim(dtm_train)
## [1] 20000 17448
m <- as.matrix(dtm_train)
#identical(rownames(dtm_train), train$id)
# check that the document ids of the DTM match train$id (0 = no mismatches)
sum(as.numeric(rownames(dtm_train)) != train$id)
## [1] 0
#which(colnames(m)=="good")
A small slice of the BOW matrix:
m[50:60, 17444:17448]
## good the it game i
## 31770 1 0 1 2 1
## 10343 0 0 0 0 0
## 27168 0 0 0 0 1
## 16964 0 0 0 0 0
## 24512 1 0 0 0 0
## 26942 0 0 0 0 1
## 24315 0 0 1 0 1
## 3751 0 0 0 0 1
## 9237 0 1 0 0 0
## 24872 0 0 0 0 4
## 10504 0 0 0 4 1
dim(m)
## [1] 20000 17448
We now feed the BOW matrix to a logistic regression as features, using an L1 penalty with 5-fold cross-validation to choose lambda.
library(glmnet)

NFOLDS = 5
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['sentiment']],
                              family = 'binomial',
                              # L1 penalty
                              alpha = 1,
                              # interested in the area under ROC curve
                              type.measure = "auc",
                              # 5-fold cross-validation
                              nfolds = NFOLDS,
                              # high value is less accurate, but has faster training
                              thresh = 1e-3,
                              # again lower number of iterations for faster training
                              maxit = 1e3)
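Before evaluating, it helps to inspect the cross-validation result; a small addition of mine using standard cv.glmnet accessors:

plot(glmnet_classifier)       # CV AUC as a function of log(lambda)
glmnet_classifier$lambda.min  # lambda achieving the best CV AUC
glmnet_classifier$lambda.1se  # sparser alternative within one SE of the best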
library(caret)

train_real <- ifelse(train$sentiment > 0, "positive", "negative")
train_real <- as.factor(train_real)
preds = predict(glmnet_classifier, dtm_train, type = 'response')[, 1]
predictions <- ifelse(preds > 0.5, "positive", "negative")
predictions <- as.factor(predictions)
confusionMatrix(predictions, train_real)
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 4641 96
## positive 462 14801
##
## Accuracy : 0.9721
## 95% CI : (0.9697, 0.9743)
## No Information Rate : 0.7448
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9248
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9095
## Specificity : 0.9936
## Pos Pred Value : 0.9797
## Neg Pred Value : 0.9697
## Prevalence : 0.2551
## Detection Rate : 0.2321
## Detection Prevalence : 0.2369
## Balanced Accuracy : 0.9515
##
## 'Positive' Class : negative
##
library(pROC)

plot(roc(train$sentiment, preds, direction = "<"),
     col = "red", lwd = 1, main = "ROC curve")
## Setting levels: control = 0, case = 1
glmnet:::auc(train$sentiment, preds)
## [1] 0.9940932
it_test = test$Translated_Review %>%
prep_fun %>%
tok_fun %>%
itoken(ids = test$id,
# turn off progressbar because it won't look nice in rmd
progressbar = FALSE)
dtm_test = create_dtm(it_test, vectorizer)
test_real <- ifelse(test$sentiment > 0, "positive", "negative")
test_real <- as.factor(test_real)
preds = predict(glmnet_classifier, dtm_test, type = 'response')[, 1]
predictions <- ifelse(preds > 0.5, "positive", "negative")
predictions <- as.factor(predictions)
confusionMatrix(predictions, test_real)
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 2628 186
## positive 540 8915
##
## Accuracy : 0.9408
## 95% CI : (0.9365, 0.9449)
## No Information Rate : 0.7418
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8397
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8295
## Specificity : 0.9796
## Pos Pred Value : 0.9339
## Neg Pred Value : 0.9429
## Prevalence : 0.2582
## Detection Rate : 0.2142
## Detection Prevalence : 0.2294
## Balanced Accuracy : 0.9046
##
## 'Positive' Class : negative
##
plot(roc(test$sentiment, preds, direction = "<"),
     col = "red", lwd = 1, main = "ROC curve")
## Setting levels: control = 0, case = 1
### 3.6.7 AUC of testing data
glmnet:::auc(test$sentiment, preds)
## [1] 0.9789596
table(datause2$Installs)
##
## 1 5 10 50 100 500 1000
## 3 9 67 56 303 197 690
## 5000 10000 50000 100000 500000 1000000 5000000
## 420 969 437 1037 491 1302 535
## 10000000 50000000 100000000 500000000 1000000000
## 825 147 201 30 10
knitr::kable(table(datause2$Installs), col.names = c("Installs", "Freq"))
| Installs | Freq |
|---|---|
| 1 | 3 |
| 5 | 9 |
| 10 | 67 |
| 50 | 56 |
| 100 | 303 |
| 500 | 197 |
| 1000 | 690 |
| 5000 | 420 |
| 10000 | 969 |
| 50000 | 437 |
| 100000 | 1037 |
| 500000 | 491 |
| 1000000 | 1302 |
| 5000000 | 535 |
| 10000000 | 825 |
| 50000000 | 147 |
| 100000000 | 201 |
| 500000000 | 30 |
| 1000000000 | 10 |
The install counts step up by alternating factors of 5 and 2 (1, 5, 10, 50, 100, 500, ...).
Although this variable is numeric, it takes only these discrete values rather than varying continuously, so we bin it into three levels:
low: 1 to 10,000
medium: 10,000 to 1,000,000
high: above 1,000,000
sum(table(datause2$Installs))  # sanity check: matches the 7,729 observations
## [1] 7729
# Bin Installs into a three-level factor (intervals closed on the right):
a <- datause2$Installs
ordinal <- cut(a, c(-Inf, 10000, 1000000, Inf), c("low", "medium", "high"), right = TRUE)
datause2$ordinal <- ordinal
df <- datause2[, -c(1, 4, 8)]  # drop Category, Installs (now binned), Genres
#a[which(1000000<=a),1] <- "high"
#a[which(a<10000),1]<-"low"
#a[which(10000<=a&a<1000000),1] <- "medium"
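We then fit a proportional-odds logistic regression with polr, which parameterizes the model as

$$
\log\frac{P(Y \le j)}{P(Y > j)} = \zeta_j - \eta, \qquad \eta = \beta_1\,\mathrm{Price} + \beta_2\,\mathrm{Rating} + \beta_3\,\mathrm{size\_mb} + \beta_4\,\mathrm{small\_app},
$$

where the cut points $\zeta_{\mathrm{low|medium}}$ and $\zeta_{\mathrm{medium|high}}$ appear as the intercepts in the summary below.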
library(MASS)  # provides polr()

m <- polr(ordinal ~ Price + Rating + size_mb + small_app, data = df, Hess = TRUE)
summary(m)
## Call:
## polr(formula = ordinal ~ Price + Rating + size_mb + small_app,
## data = df, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## Price -0.06542 0.012292 -5.322
## Rating 0.36387 0.041137 8.845
## size_mb 0.02600 0.000997 26.084
## small_app -0.58709 0.129758 -4.524
##
## Intercepts:
## Value Std. Error t value
## low|medium 1.3699 0.1726 7.9375
## medium|high 3.4158 0.1763 19.3716
##
## Residual Deviance: 15498.17
## AIC: 15510.17
(ctable <- coef(summary(m)))
## Value Std. Error t value
## Price -0.06542291 0.0122922065 -5.322308
## Rating 0.36386696 0.0411366445 8.845324
## size_mb 0.02600456 0.0009969589 26.083883
## small_app -0.58708625 0.1297579377 -4.524473
## low|medium 1.36991449 0.1725880531 7.937482
## medium|high 3.41579831 0.1763302744 19.371593
# Two-sided p-values via a normal approximation to the t statistics:
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
(ctable <- cbind(ctable, "p value" = p))
## Value Std. Error t value p value
## Price -0.06542291 0.0122922065 -5.322308 1.024589e-07
## Rating 0.36386696 0.0411366445 8.845324 9.126290e-19
## size_mb 0.02600456 0.0009969589 26.083883 5.555030e-150
## small_app -0.58708625 0.1297579377 -4.524473 6.054626e-06
## low|medium 1.36991449 0.1725880531 7.937482 2.063277e-15
## medium|high 3.41579831 0.1763302744 19.371593 1.340454e-83
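To make the coefficients easier to read, a common follow-up (my addition, following the usual polr workflow) is to convert them to odds ratios with profile-likelihood confidence intervals:

# Sketch: odds ratios for the ordinal model; confint() profiles the
# likelihood, so this can take a moment on 7,729 rows.
ci <- confint(m)
exp(cbind(OR = coef(m), ci))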