1.變數介紹:

我們有13個變數,包含10841個樣本

2.資料處理方式

2.1刪掉的變數

我們刪掉 App、Last Updated、Current Ver、Android Ver 這4個變數

2.2變數處理方式

  • 將Size變數全部轉成以MB為單位,並且將Size為「大小隨裝置改變」的APP樣本去除
  • 將Installs變數的“+”號去除
  • 將Rating變數NA的樣本去除
  • 新增一個變數small_app,用來標記容量小於1MB的APP(small app)

最後剩下7729個樣本和10個變數

3.感興趣的問題

3.1哪些因素會影響APP的訂價

這裡我們配適Tobit model中的corner solution模型,以Price作為outcome,以Reviews、Rating、Installs、size_mb、small_app作為feature。這裡使用Tobit model是因為有很多APP的Price等於0

note: 這裡我是參考別人配適婚外情data的做法,我不太確定left censored在0是不是就等價於Tobit model中的corner solution模型

# Tobit regression of Price on app characteristics. No censoring limits are
# passed, so the function's defaults apply.
fm.tobit <- tobit(
  Price ~ Reviews + Rating + Installs + size_mb + small_app,
  data = datause2
)

# Print the censoring counts, coefficient table and fit statistics.
summary(fm.tobit)
## 
## Call:
## tobit(formula = Price ~ Reviews + Rating + Installs + size_mb + 
##     small_app, data = datause2)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7729           7150            579              0 
## 
## Coefficients:
##                  Estimate    Std. Error z value             Pr(>|z|)    
## (Intercept) -148.91289953   13.84169619 -10.758 < 0.0000000000000002 ***
## Reviews        0.00012537    0.00001232  10.176 < 0.0000000000000002 ***
## Rating        11.95001211    3.03144860   3.942           0.00008080 ***
## Installs      -0.00005735    0.00000531 -10.801 < 0.0000000000000002 ***
## size_mb        0.27169760    0.08312430   3.269              0.00108 ** 
## small_app     32.62328616    7.25582737   4.496           0.00000692 ***
## Log(scale)     4.30739311    0.03168942 135.925 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 74.25 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 11 
## Log-likelihood: -4333 on 7 Df
## Wald-statistic: 142.4 on 5 Df, p-value: < 0.000000000000000222

3.2哪些因素會影響APP的Rating

# Fit a random forest for Rating and plot its impurity-based variable
# importance as a horizontal bar chart.
set.seed(1) # fix the RNG state before fitting so the forest is reproducible
rf2 <- ranger(
  Rating ~ Reviews + Installs + size_mb + small_app + Price + Type +
    Category + Genres + Content_Rating,
  data = datause2,
  quantreg = TRUE,
  importance = "impurity"
)

# `variable.importance` is turned into a two-column table (varname, imp).
# tibble::enframe() replaces the deprecated dplyr::add_rownames() chain that
# went through as.matrix()/as.data.frame() and emitted a deprecation warning.
rf2$variable.importance %>%
  tibble::enframe(name = "varname", value = "imp") %>%
  arrange(desc(imp)) %>%
  # The formula has 9 predictors, so a cap of 25 keeps all of them.
  top_n(25, wt = imp) %>%
  ggplot(mapping = aes(x = reorder(varname, imp), y = imp)) +
  geom_col() +
  coord_flip() +
  ggtitle(label = "Top 9 important variables") +
  theme(
    axis.title = element_blank()
  )
## Warning: Deprecated, use tibble::rownames_to_column() instead.

# Start from a copy of datause2.
df_numeric <- datause2
# Add an 11th column flagging paid apps: 1 when Price > 0, otherwise 0.
# It is assigned by position, so it has no explicit name at this point.
df_numeric[, 11] <- as.numeric(df_numeric$Price > 0)

# Print the table to inspect the result.
df_numeric
## # A tibble: 7,729 x 11
##    Category Rating Reviews Installs Type  Price Content_Rating Genres size_mb
##    <fct>     <dbl>   <dbl>    <dbl> <fct> <dbl> <fct>          <fct>    <dbl>
##  1 ART_AND~    4.1     159    10000 Free      0 Everyone       Art &~    19  
##  2 ART_AND~    3.9     967   500000 Free      0 Everyone       Art &~    14  
##  3 ART_AND~    4.7   87510  5000000 Free      0 Everyone       Art &~     8.7
##  4 ART_AND~    4.5  215644 50000000 Free      0 Teen           Art &~    25  
##  5 ART_AND~    4.3     967   100000 Free      0 Everyone       Art &~     2.8
##  6 ART_AND~    4.4     167    50000 Free      0 Everyone       Art &~     5.6
##  7 ART_AND~    3.8     178    50000 Free      0 Everyone       Art &~    19  
##  8 ART_AND~    4.1   36815  1000000 Free      0 Everyone       Art &~    29  
##  9 ART_AND~    4.4   13791  1000000 Free      0 Everyone       Art &~    33  
## 10 ART_AND~    4.7     121    10000 Free      0 Everyone       Art &~     3.1
## # ... with 7,719 more rows, and 2 more variables: small_app <dbl>, V11 <dbl>
# Give the paid-app flag a descriptive name, then drop the four factor columns
# so only numeric columns remain. The columns are dropped by name rather than
# by position (previously -c(1, 5, 7, 8)), so a change in column order cannot
# silently remove the wrong variables.
df_numeric <- df_numeric %>%
  rename(paidornot = V11) %>%
  dplyr::select(-Category, -Type, -Content_Rating, -Genres)
#require(e1071)
#library(rminer)

#M <- fit(Price ~., data=df_numeric, model="svm", kpar=list(sigma=0.10))
#summary(M)
#svm.imp <- Importance(M, data=df_numeric)

# Count missing cells in df_numeric. Call the is.na() generic and let S3
# dispatch pick the data-frame method instead of calling the method directly.
sum(is.na(df_numeric))
## [1] 0
#model<-svm(Price ~ . , df_numeric)
#summary(model)

#model$SV
#svr.pred = predict(model, df_numeric)

#sqrt(  mean((df_numeric$Price - svr.pred)^2 ))

3.3針對price做預測

# Attach e1071 (provides svm()). library() stops with an error if the package
# is missing, whereas require() only returns FALSE and lets the script run on.
library(e1071)
## Loading required package: e1071
## Warning: package 'e1071' was built under R version 3.6.2
# Linear regression of Price on every other column of df_numeric.
# NOTE(review): the predictors include paidornot, which looks like it is
# derived from Price itself; confirm this is intended.
model <- lm(formula = Price ~ ., data = df_numeric)
# Predictions on the same rows the model was fitted on (in-sample).
lm.pred <- predict(model, newdata = df_numeric)
# Root-mean-squared error of the in-sample predictions.
sqrt(mean((lm.pred - df_numeric$Price)^2))
## [1] 16.92995
# Scatter plot: observed Price on x, linear-model predictions on y.
plot(x = df_numeric$Price, y = lm.pred)

# Support vector model for Price on the same predictors; no tuning arguments
# are passed, so svm() runs with its defaults.
model2 <- svm(Price ~ ., data = df_numeric)
# NOTE(review): this summarises `model` (the lm fit), not `model2` (the SVM).
summary(model)
## 
## Call:
## lm(formula = Price ~ ., data = df_numeric)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17.12  -0.43   0.00   0.37 385.08 
## 
## Coefficients:
##                     Estimate       Std. Error t value             Pr(>|t|)    
## (Intercept)  4.4220948800543  1.5009652230536   2.946              0.00323 ** 
## Rating      -0.9694859078844  0.3565536818036  -2.719              0.00656 ** 
## Reviews      0.0000000634848  0.0000001350393   0.470              0.63828    
## Installs     0.0000000001531  0.0000000049312   0.031              0.97523    
## size_mb     -0.0157995417327  0.0086172007582  -1.833              0.06677 .  
## small_app   -1.3010710291208  1.0954177462113  -1.188              0.23497    
## paidornot   15.1931425351991  0.7364019535454  20.632 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.94 on 7722 degrees of freedom
## Multiple R-squared:  0.05332,    Adjusted R-squared:  0.05258 
## F-statistic: 72.48 on 6 and 7722 DF,  p-value: < 0.00000000000000022
# In-sample predictions from the support vector model for Price.
svr.pred <- predict(model2, newdata = df_numeric)
# Root-mean-squared error of those predictions.
sqrt(mean((svr.pred - df_numeric$Price)^2))
## [1] 17.29564

3.4針對Rating做預測

# Linear regression of Rating on every other column of df_numeric.
# This overwrites the `model` object created earlier.
model <- lm(formula = Rating ~ ., data = df_numeric)
# Predictions on the same rows the model was fitted on (in-sample).
lm.pred <- predict(model, newdata = df_numeric)
# Root-mean-squared error of the in-sample predictions.
sqrt(mean((lm.pred - df_numeric$Rating)^2))
## [1] 0.5400799
# Scatter plot: observed Rating on x, linear-model predictions on y.
plot(x = df_numeric$Rating, y = lm.pred)

# Support vector model for Rating on the same predictors; no tuning arguments
# are passed, so svm() runs with its defaults.
model2 <- svm(Rating ~ ., data = df_numeric)
# NOTE(review): this summarises `model` (the lm fit), not `model2` (the SVM).
summary(model)
## 
## Call:
## lm(formula = Rating ~ ., data = df_numeric)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2715 -0.1806  0.0950  0.3374  1.0192 
## 
## Coefficients:
##                      Estimate        Std. Error t value             Pr(>|t|)
## (Intercept)  4.13355798059408  0.00908803462267 454.835 < 0.0000000000000002
## Reviews      0.00000001863717  0.00000000430271   4.331          0.000014997
## Installs     0.00000000004319  0.00000000015731   0.275              0.78366
## Price       -0.00098661167093  0.00036285212701  -2.719              0.00656
## size_mb      0.00138321739231  0.00027450499862   5.039          0.000000479
## small_app   -0.15283634103311  0.03490465336934  -4.379          0.000012097
## paidornot    0.11877477324580  0.02409276009006   4.930          0.000000840
##                
## (Intercept) ***
## Reviews     ***
## Installs       
## Price       ** 
## size_mb     ***
## small_app   ***
## paidornot   ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5403 on 7722 degrees of freedom
## Multiple R-squared:  0.01627,    Adjusted R-squared:  0.0155 
## F-statistic: 21.28 on 6 and 7722 DF,  p-value: < 0.00000000000000022
# In-sample predictions from the support vector model for Rating.
svr.pred <- predict(model2, newdata = df_numeric)
# Root-mean-squared error of those predictions.
sqrt(mean((svr.pred - df_numeric$Rating)^2))
## [1] 0.5442792
# Scatter plot: observed Rating on x, support-vector predictions on y.
plot(x = df_numeric$Rating, y = svr.pred)