我們有13個變數,包含10841個樣本
App:Application name
Category:Category the app belongs to
Rating:Overall user rating of the app (as when scraped)
Reviews:Number of user reviews for the app (as when scraped)
Size:Size of the app (as when scraped)
Installs:Number of user downloads/installs for the app (as when scraped)
Type:Paid or Free
Price:Price of the app (as when scraped)
Content Rating:Age group the app is targeted at - Children / Mature 21+ / Adult
Genres:An app can belong to multiple genres (apart from its main category). For eg, a musical family game will belong to Music, Game, Family genres.
Last Updated:Date when the app was last updated on Play Store (as when scraped)
Current Ver:Current version of the app available on Play Store (as when scraped)
Android Ver:Min required Android version (as when scraped)
我們刪掉 App,Last Updated,Current Ver,Android Ver這4個變數
最後剩下7729個樣本和10個變數
這裡我們配適Tobit model中的corner solution模型,以Price作為outcome,以Reviews,Rating,Installs,size_mb,small_app作為feature,這裡使用Tobit model是因為Price有很多價格都等於0
note: 這裡我是參考別人配適婚外情data的做法,我不太確定left-censored在0是不是就等價Tobit model中的corner solution(兩者使用相同的概似函數,估計方式相同;差別在於解釋:corner solution關心的是實際觀察到的Price,而不是latent variable)
# Tobit regression of Price on the numeric app characteristics. No censoring
# limits are passed, so tobit() uses its defaults; the summary reports how many
# observations fall on each side of those limits.
fm.tobit <- tobit(
  Price ~ Reviews + Rating + Installs + size_mb + small_app,
  data = datause2
)
summary(fm.tobit)
##
## Call:
## tobit(formula = Price ~ Reviews + Rating + Installs + size_mb +
## small_app, data = datause2)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7729 7150 579 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -148.91289953 13.84169619 -10.758 < 0.0000000000000002 ***
## Reviews 0.00012537 0.00001232 10.176 < 0.0000000000000002 ***
## Rating 11.95001211 3.03144860 3.942 0.00008080 ***
## Installs -0.00005735 0.00000531 -10.801 < 0.0000000000000002 ***
## size_mb 0.27169760 0.08312430 3.269 0.00108 **
## small_app 32.62328616 7.25582737 4.496 0.00000692 ***
## Log(scale) 4.30739311 0.03168942 135.925 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 74.25
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 11
## Log-likelihood: -4333 on 7 Df
## Wald-statistic: 142.4 on 5 Df, p-value: < 0.000000000000000222
# Random forest for Rating on all nine predictors, keeping impurity-based
# variable importance for the bar chart below.
# Fix the RNG state before fitting so the forest is repeatable.
set.seed(1)
rf2 <- ranger(
  Rating ~ Reviews + Installs + size_mb + small_app + Price + Type +
    Category + Genres + Content_Rating,
  data = datause2,
  quantreg = TRUE,
  importance = "impurity"
)

# enframe() converts the named importance vector straight into a two-column
# tibble (varname, imp). It replaces the deprecated add_rownames() chain.
# The formula has 9 predictors, so top_n(25) is not expected to drop any row.
tibble::enframe(rf2$variable.importance, name = "varname", value = "imp") %>%
  arrange(desc(imp)) %>%
  top_n(25, wt = imp) %>%
  ggplot(mapping = aes(x = reorder(varname, imp), y = imp)) +
  geom_col() +
  coord_flip() +
  ggtitle(label = "Top 9 important variables") +
  theme(axis.title = element_blank())
## Warning: Deprecated, use tibble::rownames_to_column() instead.
# Work on a copy of the data and append a 0/1 indicator for paid apps.
# The column is named explicitly instead of being assigned by position
# (`[, 11]`): positional assignment only works when the table has exactly 10
# columns and leaves the new column's name to the data-frame class. The name
# V11 is kept because the code further down renames V11 to paidornot.
df_numeric <- datause2
df_numeric$V11 <- ifelse(df_numeric$Price > 0, 1, 0)
df_numeric
## # A tibble: 7,729 x 11
## Category Rating Reviews Installs Type Price Content_Rating Genres size_mb
## <fct> <dbl> <dbl> <dbl> <fct> <dbl> <fct> <fct> <dbl>
## 1 ART_AND~ 4.1 159 10000 Free 0 Everyone Art &~ 19
## 2 ART_AND~ 3.9 967 500000 Free 0 Everyone Art &~ 14
## 3 ART_AND~ 4.7 87510 5000000 Free 0 Everyone Art &~ 8.7
## 4 ART_AND~ 4.5 215644 50000000 Free 0 Teen Art &~ 25
## 5 ART_AND~ 4.3 967 100000 Free 0 Everyone Art &~ 2.8
## 6 ART_AND~ 4.4 167 50000 Free 0 Everyone Art &~ 5.6
## 7 ART_AND~ 3.8 178 50000 Free 0 Everyone Art &~ 19
## 8 ART_AND~ 4.1 36815 1000000 Free 0 Everyone Art &~ 29
## 9 ART_AND~ 4.4 13791 1000000 Free 0 Everyone Art &~ 33
## 10 ART_AND~ 4.7 121 10000 Free 0 Everyone Art &~ 3.1
## # ... with 7,719 more rows, and 2 more variables: small_app <dbl>, V11 <dbl>
# Give the paid indicator a descriptive name, then drop the four factor
# columns. They are removed by name rather than by position (1, 5, 7, 8) so a
# reordered or extended table cannot silently lose the wrong columns.
df_numeric <- df_numeric %>% rename(paidornot = V11)
df_numeric <- df_numeric[, setdiff(
  names(df_numeric),
  c("Category", "Type", "Content_Rating", "Genres")
)]
#require(e1071)
#library(rminer)
#M <- fit(Price ~., data=df_numeric, model="svm", kpar=list(sigma=0.10))
#summary(M)
#svm.imp <- Importance(M, data=df_numeric)
# Total number of missing cells. Call the is.na() generic and let S3 dispatch
# pick the data-frame method instead of naming the method directly.
sum(is.na(df_numeric))
## [1] 0
#model<-svm(Price ~ . , df_numeric)
#summary(model)
#model$SV
#svr.pred = predict(model, df_numeric)
#sqrt( mean((df_numeric$Price - svr.pred)^2 ))
# library() stops with an error if e1071 is missing; require() would only
# return FALSE and let the later svm() call fail with a less clear message.
library(e1071)
## Loading required package: e1071
## Warning: package 'e1071' was built under R version 3.6.2
# Linear baseline: regress Price on every other column of df_numeric and
# report the in-sample RMSE.
model <- lm(Price ~ ., data = df_numeric)
lm.pred <- predict(model, newdata = df_numeric)
sqrt(mean((df_numeric$Price - lm.pred)^2))
## [1] 16.92995
# Observed Price against the linear model's fitted values.
plot(df_numeric$Price, lm.pred)
# Support vector regression on the same formula and data.
model2 <- svm(Price ~ ., data = df_numeric)
# NOTE: this summarises the lm fit (`model`), not the SVM (`model2`).
summary(model)
##
## Call:
## lm(formula = Price ~ ., data = df_numeric)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.12 -0.43 0.00 0.37 385.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.4220948800543 1.5009652230536 2.946 0.00323 **
## Rating -0.9694859078844 0.3565536818036 -2.719 0.00656 **
## Reviews 0.0000000634848 0.0000001350393 0.470 0.63828
## Installs 0.0000000001531 0.0000000049312 0.031 0.97523
## size_mb -0.0157995417327 0.0086172007582 -1.833 0.06677 .
## small_app -1.3010710291208 1.0954177462113 -1.188 0.23497
## paidornot 15.1931425351991 0.7364019535454 20.632 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.94 on 7722 degrees of freedom
## Multiple R-squared: 0.05332, Adjusted R-squared: 0.05258
## F-statistic: 72.48 on 6 and 7722 DF, p-value: < 0.00000000000000022
# In-sample RMSE of the SVM fit for Price.
svr.pred <- predict(model2, newdata = df_numeric)
sqrt(mean((df_numeric$Price - svr.pred)^2))
## [1] 17.29564
# Linear baseline: regress Rating on every other column of df_numeric and
# report the in-sample RMSE. Overwrites the earlier `model` and `lm.pred`.
model <- lm(Rating ~ ., data = df_numeric)
lm.pred <- predict(model, newdata = df_numeric)
sqrt(mean((df_numeric$Rating - lm.pred)^2))
## [1] 0.5400799
# Observed Rating against the linear model's fitted values.
plot(df_numeric$Rating, lm.pred)
# Support vector regression on the same formula and data.
model2 <- svm(Rating ~ ., data = df_numeric)
# NOTE: this summarises the lm fit (`model`), not the SVM (`model2`).
summary(model)
##
## Call:
## lm(formula = Rating ~ ., data = df_numeric)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2715 -0.1806 0.0950 0.3374 1.0192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.13355798059408 0.00908803462267 454.835 < 0.0000000000000002
## Reviews 0.00000001863717 0.00000000430271 4.331 0.000014997
## Installs 0.00000000004319 0.00000000015731 0.275 0.78366
## Price -0.00098661167093 0.00036285212701 -2.719 0.00656
## size_mb 0.00138321739231 0.00027450499862 5.039 0.000000479
## small_app -0.15283634103311 0.03490465336934 -4.379 0.000012097
## paidornot 0.11877477324580 0.02409276009006 4.930 0.000000840
##
## (Intercept) ***
## Reviews ***
## Installs
## Price **
## size_mb ***
## small_app ***
## paidornot ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5403 on 7722 degrees of freedom
## Multiple R-squared: 0.01627, Adjusted R-squared: 0.0155
## F-statistic: 21.28 on 6 and 7722 DF, p-value: < 0.00000000000000022
# In-sample RMSE of the SVM fit for Rating.
svr.pred <- predict(model2, newdata = df_numeric)
sqrt(mean((df_numeric$Rating - svr.pred)^2))
## [1] 0.5442792
# Observed Rating against the SVM's in-sample predictions.
plot(df_numeric$Rating, svr.pred)