#一 變數介紹:

內容分級(cont_rating)共有四個等級,分別是 4+、9+、12+、17+。

備註:Apple 批量購買計劃(VPP)是一項服務,允許已註冊 Apple VPP 的組織批量購買 iOS 應用,但不能以折扣價購買;主要用於企業的大量採購。此變數(vpp_lic)為二元變數。

#二 變數處理方式

其中id ,app name 都沒包含有用的資訊,而所有app都使用美金計價,ver(版本)部分由於各家版本號過於凌亂所以也予以刪除。

是故我只留下12個變數,其中只有 prime_genre、vpp_lic 和 cont_rating 屬於類別型變數,其他變數都是連續型。

然後新增一個虛擬變數為付費與否

另外由於bytes並非常用的單位,是故把它轉換成MB

非常可惜的是並沒有公布APP的下載量

# Flag every app as "paid" or "free" from its price. The flag is kept both as
# the stand-alone factor `charge` and as the `charge` column of `ios`.
# Assigning the column by name avoids the auto-generated `V13` column (and the
# follow-up rename) that a positional `ios[, 13] <- ...` produces, and it cannot
# overwrite an existing column if the number of columns ever changes.
charge <- as.factor(ifelse(ios$price > 0, "paid", "free"))
ios$charge <- charge

# Treat the content rating and the primary genre as categorical variables.
ios$cont_rating <- as.factor(ios$cont_rating)
ios$prime_genre <- as.factor(ios$prime_genre)

# Express the app size in megabytes (size_bytes / 1e6).
ios$size_MB <- ios$size_bytes / 1e6

# Drop the first column.
# NOTE(review): presumably this is `size_bytes`, now superseded by `size_MB` --
# confirm against the column order of `ios`.
ios <- ios[, -1]

這裡可以看出所有的資料,至此我們總共有13個變數外加7197個觀察值,我們的資料沒有任何遺漏值

秀出我們有的所有資料

# Interactive table of the whole data set: 50 rows per page, horizontal
# scrolling, and a filter box above every column.
DT::datatable(
  ios,
  filter = "top",
  options = list(scrollX = "400px", pageLength = 50)
)
## This version of Shiny is designed to work with 'htmlwidgets' >= 1.5.
##     Please upgrade via install.packages('htmlwidgets').
# Count the missing cells across the whole data frame.
sum(is.na(ios))
## [1] 0

秀出我們的變數的敘述統計

#m<- summary(ios[,4:7])
#class(ios)
#knitr::kable(m)
#colnames(ios)
#stargazer(ios[,1:11],summary = T)
#stargazer(m)

#n <- summary(dff)
#stargazer(dff)

#dim(dff)

# Coerce `dff` to a plain data.frame and confirm the resulting class.
dff <- as.data.frame(x = dff)
class(dff)
## [1] "data.frame"
# Summary-statistics table (LaTeX) built from a data.frame copy of `ios`;
# the 25th and 75th percentiles are left out of the table.
dff2 <- as.data.frame(x = ios)
dim(ios)
## [1] 7197   13
stargazer(
  dff2,
  omit.summary.stat = c("p25", "p75")
)
## 
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: 週三, 七月 29, 2020 - 下午 09:03:16
## \begin{table}[!htbp] \centering 
##   \caption{} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lccccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
## Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Max} \\ 
## \hline \\[-1.8ex] 
## price & 7,197 & 1.726 & 5.833 & 0.000 & 299.990 \\ 
## rating\_count\_tot & 7,197 & 12,892.910 & 75,739.410 & 0 & 2,974,676 \\ 
## rating\_count\_ver & 7,197 & 460.374 & 3,920.455 & 0 & 177,050 \\ 
## user\_rating & 7,197 & 3.527 & 1.518 & 0 & 5 \\ 
## user\_rating\_ver & 7,197 & 3.254 & 1.809 & 0.000 & 5.000 \\ 
## sup\_devices.num & 7,197 & 37.362 & 3.738 & 9 & 47 \\ 
## ipadSc\_urls.num & 7,197 & 3.707 & 1.986 & 0 & 5 \\ 
## lang.num & 7,197 & 5.435 & 7.920 & 0 & 75 \\ 
## vpp\_lic & 7,197 & 0.993 & 0.083 & 0 & 1 \\ 
## size\_MB & 7,197 & 199.134 & 359.207 & 0.590 & 4,025.970 \\ 
## \hline \\[-1.8ex] 
## \end{tabular} 
## \end{table}

#三 資料視覺化

##3.1 畫出correlation matrix

library(corrplot)
## corrplot 0.84 loaded
# Correlation matrix of the numeric variables. Columns are picked by type
# rather than by position (the original dropped columns 6, 7 and 12), so the
# non-numeric columns stay out even if the column order of `ios` changes.
df <- as.matrix(ios[, vapply(ios, is.numeric, logical(1)), drop = FALSE])
M <- cor(df)

# Same matrix drawn twice: once as circles, once as numbers.
corrplot(M, method = "circle")

corrplot(M, method = "number")

# Reuse the matrix computed above instead of calling cor() a second time.
correlation.matrix <- M
stargazer(correlation.matrix, title = "Ios APP的相關係數矩陣")
## 
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: 週三, 七月 29, 2020 - 下午 09:03:17
## \begin{table}[!htbp] \centering 
##   \caption{Ios APP的相關係數矩陣} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}} ccccccccccc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & price & rating\_count\_tot & rating\_count\_ver & user\_rating & user\_rating\_ver & sup\_devices.num & ipadSc\_urls.num & lang.num & vpp\_lic & size\_MB \\ 
## \hline \\[-1.8ex] 
## price & $1$ & $$-$0.039$ & $$-$0.018$ & $0.047$ & $0.025$ & $$-$0.115$ & $0.066$ & $$-$0.007$ & $$-$0.030$ & $0.182$ \\ 
## rating\_count\_tot & $$-$0.039$ & $1$ & $0.164$ & $0.083$ & $0.089$ & $0.009$ & $0.016$ & $0.138$ & $$-$0.001$ & $0.004$ \\ 
## rating\_count\_ver & $$-$0.018$ & $0.164$ & $1$ & $0.069$ & $0.078$ & $0.038$ & $0.024$ & $0.013$ & $0.006$ & $0.006$ \\ 
## user\_rating & $0.047$ & $0.083$ & $0.069$ & $1$ & $0.774$ & $$-$0.042$ & $0.266$ & $0.171$ & $0.070$ & $0.066$ \\ 
## user\_rating\_ver & $0.025$ & $0.089$ & $0.078$ & $0.774$ & $1$ & $$-$0.019$ & $0.276$ & $0.176$ & $0.050$ & $0.086$ \\ 
## sup\_devices.num & $$-$0.115$ & $0.009$ & $0.038$ & $$-$0.042$ & $$-$0.019$ & $1$ & $$-$0.038$ & $$-$0.042$ & $$-$0.037$ & $$-$0.118$ \\ 
## ipadSc\_urls.num & $0.066$ & $0.016$ & $0.024$ & $0.266$ & $0.276$ & $$-$0.038$ & $1$ & $0.088$ & $0.072$ & $0.153$ \\ 
## lang.num & $$-$0.007$ & $0.138$ & $0.013$ & $0.171$ & $0.176$ & $$-$0.042$ & $0.088$ & $1$ & $0.032$ & $0.005$ \\ 
## vpp\_lic & $$-$0.030$ & $$-$0.001$ & $0.006$ & $0.070$ & $0.050$ & $$-$0.037$ & $0.072$ & $0.032$ & $1$ & $$-$0.150$ \\ 
## size\_MB & $0.182$ & $0.004$ & $0.006$ & $0.066$ & $0.086$ & $$-$0.118$ & $0.153$ & $0.005$ & $$-$0.150$ & $1$ \\ 
## \hline \\[-1.8ex] 
## \end{tabular} 
## \end{table}

可以看出變數之間除了目前的user rating和所有版本的user rating外,其餘變數之間並無線性關係

##3.2 大部分APP的使用年齡

# Frequencies before re-levelling: the levels print as 12+, 17+, 4+, 9+.
# (The original also stored this table in `m1`, but that value was overwritten
# below before it was ever used, so the dead assignment is dropped.)
table(ios$cont_rating)
## 
##  12+  17+   4+   9+ 
## 1155  622 4433  987
# Put the content ratings in ascending age order so that the table and the bar
# plot read from 4+ up to 17+.
ios$cont_rating <- factor(ios$cont_rating, levels = c("4+", "9+", "12+", "17+"))
m1 <- table(ios$cont_rating)
barplot(m1)

knitr::kable(m1)
Var1 Freq
4+ 4433
9+ 987
12+ 1155
17+ 622

可以看出大部分的APP都是設計給4歲以上的使用者使用。

#四 感興趣的問題

1.哪些變數會影響APP的評分?

2.付費軟體的評分有比較好嗎?

3.大部分的APP的定價趨勢為何?

##4.1 哪些變數會影響APP的評分?

###4.1.1先使用簡單的線性回歸來看

# Linear regression of the overall user rating on every other column of `ios`.
m1 <- lm(user_rating ~ ., data = ios)
m2 <- summary(m1)

# List the p-values of the coefficients that are significant at the 5% level.
# The stored summary `m2` is reused (the original recomputed summary(m1) twice
# and relied on `$coef` partial matching), the p-value column is addressed by
# name, and the result column gets an explicit, readable name.
p_values <- m2$coefficients[, "Pr(>|t|)"]
data.frame(p_value = p_values[p_values <= 0.05])
##                                              p_value
## (Intercept)                  0.000885712962934446244
## price                        0.017149274322532933462
## user_rating_ver              0.000000000000000000000
## cont_rating12+               0.030125854628497372723
## prime_genreBusiness          0.008332204043162476370
## prime_genreEducation         0.013074676199248123562
## prime_genreEntertainment     0.008985110250370754656
## prime_genreFinance           0.008459562324483195853
## prime_genreFood & Drink      0.000205487435224537907
## prime_genreGames             0.011396798805536760210
## prime_genreHealth & Fitness  0.000255245793004355188
## prime_genreLifestyle         0.008410693519466458032
## prime_genreMusic             0.006334849962495827866
## prime_genreNews              0.001242261154232396098
## prime_genrePhoto & Video     0.000063520404373961997
## prime_genreProductivity      0.000544045272050046173
## prime_genreShopping          0.000000000149900693008
## prime_genreSocial Networking 0.007537393754674120809
## prime_genreSports            0.046849439600335426870
## prime_genreTravel            0.000762921294891544624
## prime_genreUtilities         0.003950537294257992597
## prime_genreWeather           0.008810318304743505746
## sup_devices.num              0.042884258425487840893
## ipadSc_urls.num              0.000000000000001649702
## lang.num                     0.001047715801772405907
## vpp_lic                      0.000409720710488918757
## chargepaid                   0.031491392856383318422

###4.1.2 使用隨機森林來看哪些變數影響rating

set.seed(1)

# Random forest for the overall user rating, recording impurity-based
# variable importance.
rf <- ranger(user_rating ~ ., data = ios, quantreg = TRUE, importance = "impurity")

# Number of variables shown in the chart. The original kept up to 25 rows
# while the title said "Top 12"; one constant now drives both.
n_top <- 12

# Turn the named importance vector into a two-column data frame. This replaces
# the deprecated dplyr::add_rownames() step.
importance_df <- data.frame(
  varname = names(rf$variable.importance),
  imp = unname(rf$variable.importance)
)

# Horizontal bar chart of the most important variables, largest at the top.
importance_df %>%
  arrange(desc(imp)) %>%
  head(n_top) %>%
  ggplot(mapping = aes(x = reorder(varname, imp), y = imp)) +
  geom_col() +
  coord_flip() +
  ggtitle(label = paste("Top", n_top, "important variables")) +
  theme(
    axis.title = element_blank()
  )

從隨機森林的結果可以發現影響使用者目前rating的前三名變數分別是

##4.2 付費軟體的評分有比較好嗎?

# Overlaid, semi-transparent density curves of the user rating for paid and
# free apps. Written with ggplot() because qplot() is deprecated in ggplot2.
ggplot(ios, aes(x = user_rating, fill = charge)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Distribution of App rating",
    x = "Rating",
    y = "Density"
  )

# Mean rating over all apps, then separately for paid and for free apps.
mean(ios$user_rating)
## [1] 3.526956
# The paid/free flag lives in the `charge` column. The original indexed
# `ios$V13`, which no longer exists after the rename, so both group means
# came out as NaN.
mean(ios$user_rating[ios$charge == "paid"])
## [1] 3.720949
mean(ios$user_rating[ios$charge == "free"])
## [1] 3.376726

所有APP的平均評分為3.526956,付費APP的評分為3.720949,免費APP為3.376726

# One-way analysis of variance of the user rating across the paid/free groups.
res.aov <- aov(formula = user_rating ~ charge, data = ios)
# Print the ANOVA table.
summary(res.aov)
##               Df Sum Sq Mean Sq F value              Pr(>F)    
## charge         1    210  209.75   92.18 <0.0000000000000002 ***
## Residuals   7195  16371    2.28                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

單因子變異數分析(one-way ANOVA)表也告訴我們:如果評分代表APP的品質,那麼付費APP的品質在統計上顯著高於免費APP。

##4.3大部分的APP的定價趨勢為何?

# Confirm that no price is missing before plotting.
sum(is.na(ios$price))
## [1] 0

# Empirical cumulative distribution function of the prices.
plot(ecdf(ios$price))

# Number of apps at each distinct price, drawn on a log scale.
price_counts <- table(ios$price)
barplot(log(price_counts))

#plot(sort(unique(applestore$price)) ,log(object)     )

#log(table(applestore$price  ))
#qplot(price,data=applestore,geom="histogram"     )

#qplot(price,data=applestore,geom="histogram",log = "y")

#plot(applestore$price, log="y", type='histogram')

APP的定價顯然是免費居多,而且定價有指數分布的趨勢存在。

##4.4 哪些因素影響APP定價

# Tobit regression of price on all columns of `ios` except the three factor
# columns (content rating, paid/free flag, genre).
fm.tobit <- tobit(
  price ~ . - cont_rating - charge - prime_genre,
  data = ios
)

summary(fm.tobit)
## 
## Call:
## tobit(formula = price ~ . - cont_rating - charge - prime_genre, 
##     data = ios)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7197           4056           3141              0 
## 
## Coefficients:
##                      Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)       1.048136359  2.049136950   0.512              0.60900    
## rating_count_tot -0.000050599  0.000005149  -9.828 < 0.0000000000000002 ***
## rating_count_ver  0.000025065  0.000038847   0.645              0.51879    
## user_rating       0.655692302  0.142783884   4.592          0.000004386 ***
## user_rating_ver   0.141256047  0.117381845   1.203              0.22883    
## sup_devices.num  -0.164158253  0.033108869  -4.958          0.000000712 ***
## ipadSc_urls.num   0.219992391  0.071121613   3.093              0.00198 ** 
## lang.num         -0.046153546  0.017895840  -2.579              0.00991 ** 
## vpp_lic          -2.357705841  1.535457318  -1.536              0.12466    
## size_MB           0.004816440  0.000340466  14.147 < 0.0000000000000002 ***
## Log(scale)        2.257355544  0.013342801 169.182 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 9.558 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4 
## Log-likelihood: -1.365e+04 on 11 Df
## Wald-statistic: 471.4 on 9 Df, p-value: < 0.000000000000000222
# colnames(ios)

# Worth reporting to the instructor; see Wooldridge's textbook.

# The fitted scale equals exp() of the "Log(scale)" estimate (2.257355544)
# printed in the model summary.
fm.tobit$scale
## [1] 9.557781
exp(2.257355544)
## [1] 9.557781
# LaTeX regression table for the tobit fit.
stargazer(fm.tobit)
## 
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: 週三, 七月 29, 2020 - 下午 09:03:21
## \begin{table}[!htbp] \centering 
##   \caption{} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
## \cline{2-2} 
## \\[-1.8ex] & price \\ 
## \hline \\[-1.8ex] 
##  rating\_count\_tot & $-$0.0001$^{***}$ \\ 
##   & (0.00001) \\ 
##   & \\ 
##  rating\_count\_ver & 0.00003 \\ 
##   & (0.00004) \\ 
##   & \\ 
##  user\_rating & 0.656$^{***}$ \\ 
##   & (0.143) \\ 
##   & \\ 
##  user\_rating\_ver & 0.141 \\ 
##   & (0.117) \\ 
##   & \\ 
##  sup\_devices.num & $-$0.164$^{***}$ \\ 
##   & (0.033) \\ 
##   & \\ 
##  ipadSc\_urls.num & 0.220$^{***}$ \\ 
##   & (0.071) \\ 
##   & \\ 
##  lang.num & $-$0.046$^{***}$ \\ 
##   & (0.018) \\ 
##   & \\ 
##  vpp\_lic & $-$2.358 \\ 
##   & (1.535) \\ 
##   & \\ 
##  size\_MB & 0.005$^{***}$ \\ 
##   & (0.0003) \\ 
##   & \\ 
##  Constant & 1.048 \\ 
##   & (2.049) \\ 
##   & \\ 
## \hline \\[-1.8ex] 
## Observations & 7,197 \\ 
## Log Likelihood & $-$13,646.240 \\ 
## Wald Test & 471.405$^{***}$ (df = 9) \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}
# Attach AER with library(), which stops with an error if the package is
# missing; require() would only return FALSE and let later calls fail.
# NOTE(review): tobit() is already called earlier in this document, so AER
# must in fact be attached before this point -- consider moving this to the
# top of the file.
library(AER)

m1

# Model m1: tobit regression of price on the total rating count only.
fm.tobit <- tobit(price ~ rating_count_tot, data = ios)
summary(fm.tobit)
## 
## Call:
## tobit(formula = price ~ rating_count_tot, data = ios)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7197           4056           3141              0 
## 
## Coefficients:
##                      Estimate   Std. Error z value            Pr(>|z|)    
## (Intercept)      -3.156429442  0.157552739 -20.034 <0.0000000000000002 ***
## rating_count_tot -0.000047303  0.000004914  -9.626 <0.0000000000000002 ***
## Log(scale)        2.291481642  0.013407173 170.915 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 9.89 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4 
## Log-likelihood: -1.384e+04 on 3 Df
## Wald-statistic: 92.66 on 1 Df, p-value: < 0.000000000000000222

m2

# Model m2: adds the overall user rating to the total rating count.
fm.tobit <- tobit(price ~ rating_count_tot + user_rating, data = ios)
summary(fm.tobit)
## 
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating, data = ios)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7197           4056           3141              0 
## 
## Coefficients:
##                      Estimate   Std. Error z value            Pr(>|z|)    
## (Intercept)      -6.333061164  0.382270247 -16.567 <0.0000000000000002 ***
## rating_count_tot -0.000052524  0.000005023 -10.456 <0.0000000000000002 ***
## user_rating       0.900530320  0.095359591   9.444 <0.0000000000000002 ***
## Log(scale)        2.287429836  0.013390672 170.823 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 9.85 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4 
## Log-likelihood: -1.38e+04 on 4 Df
## Wald-statistic: 177.1 on 2 Df, p-value: < 0.000000000000000222

m3

# Model m3: adds the number of supported devices.
fm.tobit <- tobit(
  price ~ rating_count_tot + user_rating + sup_devices.num,
  data = ios
)
summary(fm.tobit)
## 
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num, 
##     data = ios)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7197           4056           3141              0 
## 
## Coefficients:
##                      Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)       2.145189433  1.308010573   1.640                0.101    
## rating_count_tot -0.000051630  0.000004971 -10.387 < 0.0000000000000002 ***
## user_rating       0.874390988  0.094839245   9.220 < 0.0000000000000002 ***
## sup_devices.num  -0.223726274  0.033299257  -6.719      0.0000000000183 ***
## Log(scale)        2.280253796  0.013390585 170.288 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 9.779 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4 
## Log-likelihood: -1.377e+04 on 5 Df
## Wald-statistic: 222.8 on 3 Df, p-value: < 0.000000000000000222

m4

# Model m4: adds the `ipadSc_urls.num` predictor.
fm.tobit <- tobit(
  price ~ rating_count_tot + user_rating + sup_devices.num +
    ipadSc_urls.num,
  data = ios
)
summary(fm.tobit)
## 
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num + 
##     ipadSc_urls.num, data = ios)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7197           4056           3141              0 
## 
## Coefficients:
##                      Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)       0.970060845  1.328247333   0.730                0.465    
## rating_count_tot -0.000051226  0.000004943 -10.363 < 0.0000000000000002 ***
## user_rating       0.760460517  0.097264237   7.819  0.00000000000000535 ***
## sup_devices.num  -0.216497004  0.033254242  -6.510  0.00000000007497213 ***
## ipadSc_urls.num   0.352107338  0.071095646   4.953  0.00000073233488690 ***
## Log(scale)        2.277807192  0.013385017 170.176 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 9.755 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4 
## Log-likelihood: -1.376e+04 on 6 Df
## Wald-statistic: 247.2 on 4 Df, p-value: < 0.000000000000000222

m5

# Model m5: adds the `lang.num` predictor.
fm.tobit <- tobit(
  price ~ rating_count_tot + user_rating + sup_devices.num +
    ipadSc_urls.num + lang.num,
  data = ios
)
summary(fm.tobit)
## 
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num + 
##     ipadSc_urls.num + lang.num, data = ios)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7197           4056           3141              0 
## 
## Coefficients:
##                      Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)       1.188983094  1.329700197   0.894               0.3712    
## rating_count_tot -0.000050060  0.000004954 -10.105 < 0.0000000000000002 ***
## user_rating       0.799063897  0.098213341   8.136 0.000000000000000409 ***
## sup_devices.num  -0.219741828  0.033261052  -6.607 0.000000000039329758 ***
## ipadSc_urls.num   0.360603439  0.071149968   5.068 0.000000401560647350 ***
## lang.num         -0.052460415  0.018177759  -2.886               0.0039 ** 
## Log(scale)        2.277583745  0.013384040 170.172 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 9.753 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4 
## Log-likelihood: -1.376e+04 on 7 Df
## Wald-statistic: 255.1 on 5 Df, p-value: < 0.000000000000000222
# Next nested model: adds the `vpp_lic` indicator.
fm.tobit <- tobit(
  price ~ rating_count_tot + user_rating + sup_devices.num +
    ipadSc_urls.num + lang.num + vpp_lic,
  data = ios
)
summary(fm.tobit)
## 
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num + 
##     ipadSc_urls.num + lang.num + vpp_lic, data = ios)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7197           4056           3141              0 
## 
## Coefficients:
##                      Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)       7.509599939  2.003366848   3.748             0.000178 ***
## rating_count_tot -0.000050304  0.000004948 -10.166 < 0.0000000000000002 ***
## user_rating       0.824256015  0.098435952   8.374 < 0.0000000000000002 ***
## sup_devices.num  -0.224713025  0.033228048  -6.763      0.0000000000135 ***
## ipadSc_urls.num   0.379041906  0.071250254   5.320      0.0000001038429 ***
## lang.num         -0.050940974  0.018151990  -2.806             0.005011 ** 
## vpp_lic          -6.342612587  1.512540256  -4.193      0.0000274863327 ***
## Log(scale)        2.276010452  0.013379946 170.106 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 9.738 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4 
## Log-likelihood: -1.375e+04 on 8 Df
## Wald-statistic: 271.6 on 6 Df, p-value: < 0.000000000000000222
# Largest nested model: adds the app size in MB.
fm.tobit <- tobit(
  price ~ rating_count_tot + user_rating + sup_devices.num +
    ipadSc_urls.num + lang.num + vpp_lic + size_MB,
  data = ios
)
summary(fm.tobit)
## 
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num + 
##     ipadSc_urls.num + lang.num + vpp_lic + size_MB, data = ios)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##           7197           4056           3141              0 
## 
## Coefficients:
##                      Estimate   Std. Error z value             Pr(>|z|)    
## (Intercept)       0.936739461  2.046010147   0.458              0.64707    
## rating_count_tot -0.000049306  0.000004851 -10.164 < 0.0000000000000002 ***
## user_rating       0.783468352  0.096980716   8.079 0.000000000000000655 ***
## sup_devices.num  -0.161857072  0.033043714  -4.898 0.000000966835686918 ***
## ipadSc_urls.num   0.228431134  0.070773719   3.228              0.00125 ** 
## lang.num         -0.045151117  0.017844492  -2.530              0.01140 *  
## vpp_lic          -2.358754109  1.534419510  -1.537              0.12424    
## size_MB           0.004836715  0.000340020  14.225 < 0.0000000000000002 ***
## Log(scale)        2.256985701  0.013336824 169.230 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 9.554 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4 
## Log-likelihood: -1.365e+04 on 9 Df
## Wald-statistic: 470.8 on 7 Df, p-value: < 0.000000000000000222
# Attach the packages used by the example below. library() stops with an
# error when a package is missing, whereas require() only returns FALSE and
# lets the script fail later with a less helpful message.
library(AER)
library(wooldridge)
library(npsf)
## Loading required package: Formula
## Loading required package: randtoolbox
## Loading required package: rngWELL
## This is randtoolbox. For an overview, type 'help("randtoolbox")'.
## Loading required package: sfsmisc
## 
## Attaching package: 'sfsmisc'
## The following object is masked from 'package:rminer':
## 
##     factorize
## The following object is masked from 'package:dplyr':
## 
##     last
# Load the `mroz` data set and list its column names.
data("mroz")
names(mroz)
##  [1] "inlf"     "hours"    "kidslt6"  "kidsge6"  "age"      "educ"    
##  [7] "wage"     "repwage"  "hushrs"   "husage"   "huseduc"  "huswage" 
## [13] "faminc"   "mtr"      "motheduc" "fatheduc" "unem"     "city"    
## [19] "exper"    "nwifeinc"
# Reference tobit fit on `mroz`: hours regressed on income, education,
# experience (with a squared term), age and the two child counts.
# Note that this overwrites the `fm.tobit` object fitted on `ios`.
fm.tobit <- tobit(
  hours ~ nwifeinc + educ + exper + I(exper^2) + age + kidslt6 + kidsge6,
  data = mroz
)
summary(fm.tobit)
## 
## Call:
## tobit(formula = hours ~ nwifeinc + educ + exper + I(exper^2) + 
##     age + kidslt6 + kidsge6, data = mroz)
## 
## Observations:
##          Total  Left-censored     Uncensored Right-censored 
##            753            325            428              0 
## 
## Coefficients:
##               Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)  965.30528  446.43614   2.162             0.030599 *  
## nwifeinc      -8.81424    4.45910  -1.977             0.048077 *  
## educ          80.64561   21.58324   3.736             0.000187 ***
## exper        131.56430   17.27939   7.614  0.00000000000002659 ***
## I(exper^2)    -1.86416    0.53766  -3.467             0.000526 ***
## age          -54.40501    7.41850  -7.334  0.00000000000022390 ***
## kidslt6     -894.02174  111.87804  -7.991  0.00000000000000134 ***
## kidsge6      -16.21800   38.64139  -0.420             0.674701    
## Log(scale)     7.02289    0.03706 189.514 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Scale: 1122 
## 
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4 
## Log-likelihood: -3819 on 9 Df
## Wald-statistic: 253.9 on 7 Df, p-value: < 0.000000000000000222

文字雲的結果

# Create a word-segmentation worker with its default settings
# (presumably jiebaR::worker() -- confirm against the package list).
mixseg <- worker()

# Split the app names into words and tabulate every word.
seg <- mixseg[dff$track_name]
segA <- data.frame(table(seg))

# Frequency table restricted to words longer than one character, then the
# 50 most frequent of them.
long_words <- seg[nchar(seg) > 1]
segC <- data.frame(table(long_words))
names(segC)[1] <- "Var1"
segC_top50 <- head(segC[order(segC$Freq, decreasing = TRUE), ], 50)

library(wordcloud)
## Loading required package: RColorBrewer
par(family = "Heiti TC Light")
wordcloud(
  words = segC_top50$Var1,
  freq = segC_top50$Freq,
  scale = c(4, .1),               # range of text sizes (largest, smallest)
  random.order = FALSE,           # place words in decreasing frequency order
  ordered.colors = FALSE,         # colours are not tied to word order
  rot.per = 0,                    # no rotated words
  min.freq = 7,                   # drop words that occur fewer than 7 times
  colors = brewer.pal(8, "Dark2")
)