#一 變數介紹:
我們有16個變數,包含
“id” : App ID
“track_name”: App Name
“size_bytes”: Size (in Bytes)
“currency”: Currency Type
“price”: Price amount
“rating_count_tot”: User Rating counts (for all versions)
“rating_count_ver”: User Rating counts (for current version)
“user_rating” : Average User Rating value (for all versions)
“user_rating_ver”: Average User Rating value (for current version)
“ver” : Latest version code
“cont_rating”: Content Rating 適合哪個年齡層使用 有4個level
分別是 4+,9+,12+,17+
“prime_genre”: Primary Genre
“sup_devices.num”: Number of supporting devices
“ipadSc_urls.num”: Number of screenshots showed for display “可以視為功能的展現”
“lang.num”: Number of supported languages
“vpp_lic”: Vpp Device Based Licensing Enabled
備註: Apple批量購買計劃(VPP)是一項服務,允許已註冊Apple VPP的組織批量購買iOS應用,但不能以折扣價購買。主要應該是用於企業的大量購買,此變數為二元變數
#二 變數處理方式
其中 id 與 track_name(App 名稱)都不包含有用的資訊;所有 App 都使用美金計價,所以 currency 沒有任何變異;ver(版本)部分則由於各家版本號過於凌亂,因此這四個變數都予以刪除。
是故我只留下12個變數,其中只有 prime_genre,vpp_lic 和 cont_rating 屬於類別型變數,其他變數都是連續型
然後新增一個虛擬變數為付費與否
另外由於bytes並非常用的單位,是故把它轉換成MB
非常可惜的是並沒有公布APP的下載量
# Flag each app as "paid" or "free" based on its price.
charge <- as.factor(ifelse(ios$price > 0, "paid", "free"))
# Store the flag under its final name directly. Assigning by name replaces the
# fragile `ios[, 13]` + `rename(charge = V13)` pair, which only works when
# `ios` has exactly 12 columns at this point.
ios$charge <- charge
# Content rating and primary genre are categorical, so keep them as factors.
ios$cont_rating <- as.factor(ios$cont_rating)
ios$prime_genre <- as.factor(ios$prime_genre)
# App size in megabytes (bytes divided by 1,000,000).
ios$size_MB <- ios$size_bytes / 1000000
# Drop the first column now that size_MB has been derived.
# NOTE(review): presumably column 1 is size_bytes -- confirm against the
# column order produced by the earlier variable selection.
ios <- ios[, -1]
這裡可以看出所有的資料,至此我們總共有13個變數外加7197個觀察值,我們的資料沒有任何遺漏值
秀出我們有的所有資料
# Interactive view of the full data set: 50 rows per page, horizontal
# scrolling, and per-column filter boxes above the table.
DT::datatable(
  ios,
  filter = "top",
  options = list(pageLength = 50, scrollX = "400px")
)
## This version of Shiny is designed to work with 'htmlwidgets' >= 1.5.
## Please upgrade via install.packages('htmlwidgets').
# Total number of missing values across the whole data frame.
sum(is.na(ios))
## [1] 0
秀出我們的變數的敘述統計
#m<- summary(ios[,4:7])
#class(ios)
#knitr::kable(m)
#colnames(ios)
#stargazer(ios[,1:11],summary = T)
#stargazer(m)
#n <- summary(dff)
#stargazer(dff)
#dim(dff)
# Coerce `dff` (created outside this section) to a data.frame and check its class.
dff<- as.data.frame(dff)
class(dff)
## [1] "data.frame"
# data.frame copy of `ios` used for the summary table below.
dff2 <- as.data.frame(ios)
# Number of rows and columns in `ios`.
dim(ios)
## [1] 7197 13
# Summary-statistics table from stargazer, omitting the 25th and 75th
# percentile columns.
stargazer(dff2,omit.summary.stat = c("p25", "p75"))
##
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: 週三, 七月 29, 2020 - 下午 09:03:16
## \begin{table}[!htbp] \centering
## \caption{}
## \label{}
## \begin{tabular}{@{\extracolsep{5pt}}lccccc}
## \\[-1.8ex]\hline
## \hline \\[-1.8ex]
## Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Max} \\
## \hline \\[-1.8ex]
## price & 7,197 & 1.726 & 5.833 & 0.000 & 299.990 \\
## rating\_count\_tot & 7,197 & 12,892.910 & 75,739.410 & 0 & 2,974,676 \\
## rating\_count\_ver & 7,197 & 460.374 & 3,920.455 & 0 & 177,050 \\
## user\_rating & 7,197 & 3.527 & 1.518 & 0 & 5 \\
## user\_rating\_ver & 7,197 & 3.254 & 1.809 & 0.000 & 5.000 \\
## sup\_devices.num & 7,197 & 37.362 & 3.738 & 9 & 47 \\
## ipadSc\_urls.num & 7,197 & 3.707 & 1.986 & 0 & 5 \\
## lang.num & 7,197 & 5.435 & 7.920 & 0 & 75 \\
## vpp\_lic & 7,197 & 0.993 & 0.083 & 0 & 1 \\
## size\_MB & 7,197 & 199.134 & 359.207 & 0.590 & 4,025.970 \\
## \hline \\[-1.8ex]
## \end{tabular}
## \end{table}
#三 資料視覺化
##3.1 畫出correlation matrix
library(corrplot)
## corrplot 0.84 loaded
# Correlations are only defined for the numeric variables. Columns are picked
# by type instead of by position (previously `-c(6, 7, 12)`), so the selection
# stays correct if the column order of `ios` ever changes.
df <- as.matrix(ios[, vapply(ios, is.numeric, logical(1)), drop = FALSE])
# Correlation matrix, computed once and shared by both plots and the
# LaTeX table.
M <- cor(df)
correlation.matrix <- M
corrplot(M, method = "circle")
corrplot(M, method = "number")
stargazer(correlation.matrix, title = "Ios APP的相關係數矩陣")
##
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: 週三, 七月 29, 2020 - 下午 09:03:17
## \begin{table}[!htbp] \centering
## \caption{Ios APP的相關係數矩陣}
## \label{}
## \begin{tabular}{@{\extracolsep{5pt}} ccccccccccc}
## \\[-1.8ex]\hline
## \hline \\[-1.8ex]
## & price & rating\_count\_tot & rating\_count\_ver & user\_rating & user\_rating\_ver & sup\_devices.num & ipadSc\_urls.num & lang.num & vpp\_lic & size\_MB \\
## \hline \\[-1.8ex]
## price & $1$ & $$-$0.039$ & $$-$0.018$ & $0.047$ & $0.025$ & $$-$0.115$ & $0.066$ & $$-$0.007$ & $$-$0.030$ & $0.182$ \\
## rating\_count\_tot & $$-$0.039$ & $1$ & $0.164$ & $0.083$ & $0.089$ & $0.009$ & $0.016$ & $0.138$ & $$-$0.001$ & $0.004$ \\
## rating\_count\_ver & $$-$0.018$ & $0.164$ & $1$ & $0.069$ & $0.078$ & $0.038$ & $0.024$ & $0.013$ & $0.006$ & $0.006$ \\
## user\_rating & $0.047$ & $0.083$ & $0.069$ & $1$ & $0.774$ & $$-$0.042$ & $0.266$ & $0.171$ & $0.070$ & $0.066$ \\
## user\_rating\_ver & $0.025$ & $0.089$ & $0.078$ & $0.774$ & $1$ & $$-$0.019$ & $0.276$ & $0.176$ & $0.050$ & $0.086$ \\
## sup\_devices.num & $$-$0.115$ & $0.009$ & $0.038$ & $$-$0.042$ & $$-$0.019$ & $1$ & $$-$0.038$ & $$-$0.042$ & $$-$0.037$ & $$-$0.118$ \\
## ipadSc\_urls.num & $0.066$ & $0.016$ & $0.024$ & $0.266$ & $0.276$ & $$-$0.038$ & $1$ & $0.088$ & $0.072$ & $0.153$ \\
## lang.num & $$-$0.007$ & $0.138$ & $0.013$ & $0.171$ & $0.176$ & $$-$0.042$ & $0.088$ & $1$ & $0.032$ & $0.005$ \\
## vpp\_lic & $$-$0.030$ & $$-$0.001$ & $0.006$ & $0.070$ & $0.050$ & $$-$0.037$ & $0.072$ & $0.032$ & $1$ & $$-$0.150$ \\
## size\_MB & $0.182$ & $0.004$ & $0.006$ & $0.066$ & $0.086$ & $$-$0.118$ & $0.153$ & $0.005$ & $$-$0.150$ & $1$ \\
## \hline \\[-1.8ex]
## \end{tabular}
## \end{table}
可以看出除了目前版本的評分(user_rating_ver)和所有版本的評分(user_rating)之間的相關係數達到0.774之外,其餘變數兩兩之間的相關係數絕對值都低於0.3,並沒有明顯的線性相關
##3.2 大部分APP的使用年齡
# Frequency of each content rating. The default factor levels sort as text,
# which is why "12+" and "17+" are listed before "4+" and "9+" here.
m1 <- table(ios$cont_rating)
m1
##
## 12+ 17+ 4+ 9+
## 1155 622 4433 987
# Re-level the factor so the ratings appear in increasing age order, then
# rebuild the frequency table for the bar chart and the markdown table.
ios$cont_rating <- factor(
  ios$cont_rating,
  levels = c("4+", "9+", "12+", "17+")
)
m1 <- table(ios$cont_rating)
barplot(m1)
knitr::kable(m1)
| Var1 | Freq |
|---|---|
| 4+ | 4433 |
| 9+ | 987 |
| 12+ | 1155 |
| 17+ | 622 |
可以看出大部分的APP都是設計出來給4歲以上使用即可
#四 感興趣的問題
1.哪些變數會影響APP的評分?
2.付費軟體的評分有比較好嗎?
3.大部分的APP的定價趨勢為何?
##4.1 哪些變數會影響APP的評分?
###4.1.1先使用簡單的線性回歸來看
# Linear regression of the all-version rating on every other column of `ios`.
m1 <- lm(user_rating ~ ., data = ios)
m2 <- summary(m1)
# Keep the p-values (4th column of the coefficient table) that are significant
# at the 5% level. The stored summary `m2` is reused instead of recomputing
# summary(m1) three times, the table is read through coef() rather than the
# partially matched `$coef`, and the output column gets a readable name.
data.frame(p_value = coef(m2)[coef(m2)[, 4] <= .05, 4])
## p_value
## (Intercept) 0.000885712962934446244
## price 0.017149274322532933462
## user_rating_ver 0.000000000000000000000
## cont_rating12+ 0.030125854628497372723
## prime_genreBusiness 0.008332204043162476370
## prime_genreEducation 0.013074676199248123562
## prime_genreEntertainment 0.008985110250370754656
## prime_genreFinance 0.008459562324483195853
## prime_genreFood & Drink 0.000205487435224537907
## prime_genreGames 0.011396798805536760210
## prime_genreHealth & Fitness 0.000255245793004355188
## prime_genreLifestyle 0.008410693519466458032
## prime_genreMusic 0.006334849962495827866
## prime_genreNews 0.001242261154232396098
## prime_genrePhoto & Video 0.000063520404373961997
## prime_genreProductivity 0.000544045272050046173
## prime_genreShopping 0.000000000149900693008
## prime_genreSocial Networking 0.007537393754674120809
## prime_genreSports 0.046849439600335426870
## prime_genreTravel 0.000762921294891544624
## prime_genreUtilities 0.003950537294257992597
## prime_genreWeather 0.008810318304743505746
## sup_devices.num 0.042884258425487840893
## ipadSc_urls.num 0.000000000000001649702
## lang.num 0.001047715801772405907
## vpp_lic 0.000409720710488918757
## chargepaid 0.031491392856383318422
###4.1.2 使用隨機森林來看哪些變數影響rating
# Fit a random forest for user_rating on all other columns of `ios`, with a
# fixed seed set beforehand. `importance = "impurity"` asks ranger to record
# a variable-importance score for every predictor.
set.seed(1)
rf <- ranger(user_rating ~ ., ios, quantreg = TRUE, importance = "impurity")
# Horizontal bar chart of variable importance, bars ordered by importance.
# The importance vector is turned into a data frame with base R because
# dplyr::add_rownames() is deprecated, and the cut-off is 12 so that it agrees
# with the plot title (the previous top_n(25, ...) did not).
data.frame(
  varname = names(rf$variable.importance),
  imp = unname(rf$variable.importance)
) %>%
  arrange(desc(imp)) %>%
  head(12) %>%
  ggplot(mapping = aes(x = reorder(varname, imp), y = imp)) +
  geom_col() +
  coord_flip() +
  ggtitle(label = "Top 12 important variables") +
  theme(
    axis.title = element_blank()
  )
從隨機森林的結果可以發現影響使用者目前rating的前三名變數分別是
##4.2 付費軟體的評分有比較好嗎?
# Overlaid density curves of user_rating for paid and free apps, drawn
# half-transparent so both groups stay visible. Written with ggplot() because
# qplot() is deprecated in current ggplot2 and the rest of this file already
# uses ggplot().
ggplot(ios, aes(x = user_rating, fill = charge)) +
  geom_density(alpha = .5) +
  labs(
    title = "Distribution of App rating",
    x = "Rating",
    y = "Density"
  )
# Overall mean rating, followed by the mean within each pricing group.
mean(ios$user_rating)
## [1] 3.526956
# The paid/free flag is stored in `ios$charge`. The former `ios$V13` column no
# longer exists after it was renamed, so subsetting on it selected nothing and
# both group means came out as NaN.
mean(ios$user_rating[ios$charge == "paid"])
## [1] 3.720949
mean(ios$user_rating[ios$charge == "free"])
## [1] 3.376726
所有APP的平均評分為3.526956,付費APP的評分為3.720949,免費APP為3.376726
# One-way ANOVA of user_rating across the two levels of charge (paid / free).
res.aov <- aov(formula = user_rating ~ charge, data = ios)
# Print the ANOVA table.
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## charge 1 210 209.75 92.18 <0.0000000000000002 ***
## Residuals 7195 16371 2.28
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
one-way ANOVA 的結果也告訴我們,如果評分代表著APP的品質的話,那麼付費APP的平均評分確實在統計上顯著高於免費APP
##4.3大部分的APP的定價趨勢為何?
# Check that price has no missing values before looking at its distribution.
sum(is.na(ios$price))
## [1] 0
# There is no NA in price.
# Draw the empirical CDF of price.
plot(ecdf(ios$price ))
# Count how many apps share each distinct price and plot the log of those
# counts (presumably so the rare price points stay visible next to the
# dominant ones).
object<- table(ios$price )
barplot(log(object))
#plot(sort(unique(applestore$price)) ,log(object) )
#log(table(applestore$price ))
#qplot(price,data=applestore,geom="histogram" )
#qplot(price,data=applestore,geom="histogram",log = "y")
#plot(applestore$price, log="y", type='histogram')
APP的訂價顯然是免費居多,而且訂價有指數分布的趨勢存在
# Tobit regression of price on every column of `ios` except the three
# categorical ones (cont_rating, charge, prime_genre).
fm.tobit <- tobit(
  formula = price ~ . - cont_rating - charge - prime_genre,
  data = ios
)
summary(fm.tobit)
##
## Call:
## tobit(formula = price ~ . - cont_rating - charge - prime_genre,
## data = ios)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7197 4056 3141 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.048136359 2.049136950 0.512 0.60900
## rating_count_tot -0.000050599 0.000005149 -9.828 < 0.0000000000000002 ***
## rating_count_ver 0.000025065 0.000038847 0.645 0.51879
## user_rating 0.655692302 0.142783884 4.592 0.000004386 ***
## user_rating_ver 0.141256047 0.117381845 1.203 0.22883
## sup_devices.num -0.164158253 0.033108869 -4.958 0.000000712 ***
## ipadSc_urls.num 0.219992391 0.071121613 3.093 0.00198 **
## lang.num -0.046153546 0.017895840 -2.579 0.00991 **
## vpp_lic -2.357705841 1.535457318 -1.536 0.12466
## size_MB 0.004816440 0.000340466 14.147 < 0.0000000000000002 ***
## Log(scale) 2.257355544 0.013342801 169.182 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 9.558
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4
## Log-likelihood: -1.365e+04 on 11 Df
## Wald-statistic: 471.4 on 9 Df, p-value: < 0.000000000000000222
#colnames(ios)
# Worth reporting to the instructor; consult Wooldridge's textbook on this model.
# Scale estimate stored on the fitted tobit object.
fm.tobit$scale
## [1] 9.557781
# Exponentiating the Log(scale) estimate printed by summary(fm.tobit) gives
# back the same scale value.
exp(2.257355544)
## [1] 9.557781
# LaTeX regression table for the fitted model.
stargazer(fm.tobit )
##
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: 週三, 七月 29, 2020 - 下午 09:03:21
## \begin{table}[!htbp] \centering
## \caption{}
## \label{}
## \begin{tabular}{@{\extracolsep{5pt}}lc}
## \\[-1.8ex]\hline
## \hline \\[-1.8ex]
## & \multicolumn{1}{c}{\textit{Dependent variable:}} \\
## \cline{2-2}
## \\[-1.8ex] & price \\
## \hline \\[-1.8ex]
## rating\_count\_tot & $-$0.0001$^{***}$ \\
## & (0.00001) \\
## & \\
## rating\_count\_ver & 0.00003 \\
## & (0.00004) \\
## & \\
## user\_rating & 0.656$^{***}$ \\
## & (0.143) \\
## & \\
## user\_rating\_ver & 0.141 \\
## & (0.117) \\
## & \\
## sup\_devices.num & $-$0.164$^{***}$ \\
## & (0.033) \\
## & \\
## ipadSc\_urls.num & 0.220$^{***}$ \\
## & (0.071) \\
## & \\
## lang.num & $-$0.046$^{***}$ \\
## & (0.018) \\
## & \\
## vpp\_lic & $-$2.358 \\
## & (1.535) \\
## & \\
## size\_MB & 0.005$^{***}$ \\
## & (0.0003) \\
## & \\
## Constant & 1.048 \\
## & (2.049) \\
## & \\
## \hline \\[-1.8ex]
## Observations & 7,197 \\
## Log Likelihood & $-$13,646.240 \\
## Wald Test & 471.405$^{***}$ (df = 9) \\
## \hline
## \hline \\[-1.8ex]
## \textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\
## \end{tabular}
## \end{table}
# Attach AER with library(), which stops with an error when the package is
# missing; require() would only return FALSE and let the tobit() call fail
# later with a less helpful message.
library(AER)
# Step 1 of the stepwise tobit models: price on the total rating count only.
fm.tobit <- tobit(
  formula = price ~ rating_count_tot,
  data = ios
)
summary(fm.tobit)
##
## Call:
## tobit(formula = price ~ rating_count_tot, data = ios)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7197 4056 3141 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.156429442 0.157552739 -20.034 <0.0000000000000002 ***
## rating_count_tot -0.000047303 0.000004914 -9.626 <0.0000000000000002 ***
## Log(scale) 2.291481642 0.013407173 170.915 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 9.89
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4
## Log-likelihood: -1.384e+04 on 3 Df
## Wald-statistic: 92.66 on 1 Df, p-value: < 0.000000000000000222
# Stepwise tobit model: add user_rating to the previous specification.
fm.tobit <- tobit(
  formula = price ~ rating_count_tot + user_rating,
  data = ios
)
summary(fm.tobit)
##
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating, data = ios)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7197 4056 3141 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.333061164 0.382270247 -16.567 <0.0000000000000002 ***
## rating_count_tot -0.000052524 0.000005023 -10.456 <0.0000000000000002 ***
## user_rating 0.900530320 0.095359591 9.444 <0.0000000000000002 ***
## Log(scale) 2.287429836 0.013390672 170.823 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 9.85
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4
## Log-likelihood: -1.38e+04 on 4 Df
## Wald-statistic: 177.1 on 2 Df, p-value: < 0.000000000000000222
# Stepwise tobit model: add sup_devices.num to the previous specification.
fm.tobit <- tobit(
  formula = price ~ rating_count_tot + user_rating + sup_devices.num,
  data = ios
)
summary(fm.tobit)
##
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num,
## data = ios)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7197 4056 3141 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.145189433 1.308010573 1.640 0.101
## rating_count_tot -0.000051630 0.000004971 -10.387 < 0.0000000000000002 ***
## user_rating 0.874390988 0.094839245 9.220 < 0.0000000000000002 ***
## sup_devices.num -0.223726274 0.033299257 -6.719 0.0000000000183 ***
## Log(scale) 2.280253796 0.013390585 170.288 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 9.779
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4
## Log-likelihood: -1.377e+04 on 5 Df
## Wald-statistic: 222.8 on 3 Df, p-value: < 0.000000000000000222
# Stepwise tobit model: add ipadSc_urls.num to the previous specification.
fm.tobit <- tobit(
  formula = price ~ rating_count_tot + user_rating + sup_devices.num +
    ipadSc_urls.num,
  data = ios
)
summary(fm.tobit)
##
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num +
## ipadSc_urls.num, data = ios)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7197 4056 3141 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.970060845 1.328247333 0.730 0.465
## rating_count_tot -0.000051226 0.000004943 -10.363 < 0.0000000000000002 ***
## user_rating 0.760460517 0.097264237 7.819 0.00000000000000535 ***
## sup_devices.num -0.216497004 0.033254242 -6.510 0.00000000007497213 ***
## ipadSc_urls.num 0.352107338 0.071095646 4.953 0.00000073233488690 ***
## Log(scale) 2.277807192 0.013385017 170.176 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 9.755
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4
## Log-likelihood: -1.376e+04 on 6 Df
## Wald-statistic: 247.2 on 4 Df, p-value: < 0.000000000000000222
# Stepwise tobit model: add lang.num to the previous specification.
fm.tobit <- tobit(
  formula = price ~ rating_count_tot + user_rating + sup_devices.num +
    ipadSc_urls.num + lang.num,
  data = ios
)
summary(fm.tobit)
##
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num +
## ipadSc_urls.num + lang.num, data = ios)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7197 4056 3141 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.188983094 1.329700197 0.894 0.3712
## rating_count_tot -0.000050060 0.000004954 -10.105 < 0.0000000000000002 ***
## user_rating 0.799063897 0.098213341 8.136 0.000000000000000409 ***
## sup_devices.num -0.219741828 0.033261052 -6.607 0.000000000039329758 ***
## ipadSc_urls.num 0.360603439 0.071149968 5.068 0.000000401560647350 ***
## lang.num -0.052460415 0.018177759 -2.886 0.0039 **
## Log(scale) 2.277583745 0.013384040 170.172 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 9.753
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4
## Log-likelihood: -1.376e+04 on 7 Df
## Wald-statistic: 255.1 on 5 Df, p-value: < 0.000000000000000222
# Stepwise tobit model: add vpp_lic to the previous specification.
fm.tobit <- tobit(
  formula = price ~ rating_count_tot + user_rating + sup_devices.num +
    ipadSc_urls.num + lang.num + vpp_lic,
  data = ios
)
summary(fm.tobit)
##
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num +
## ipadSc_urls.num + lang.num + vpp_lic, data = ios)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7197 4056 3141 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.509599939 2.003366848 3.748 0.000178 ***
## rating_count_tot -0.000050304 0.000004948 -10.166 < 0.0000000000000002 ***
## user_rating 0.824256015 0.098435952 8.374 < 0.0000000000000002 ***
## sup_devices.num -0.224713025 0.033228048 -6.763 0.0000000000135 ***
## ipadSc_urls.num 0.379041906 0.071250254 5.320 0.0000001038429 ***
## lang.num -0.050940974 0.018151990 -2.806 0.005011 **
## vpp_lic -6.342612587 1.512540256 -4.193 0.0000274863327 ***
## Log(scale) 2.276010452 0.013379946 170.106 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 9.738
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4
## Log-likelihood: -1.375e+04 on 8 Df
## Wald-statistic: 271.6 on 6 Df, p-value: < 0.000000000000000222
# Stepwise tobit model: add size_MB to the previous specification.
fm.tobit <- tobit(
  formula = price ~ rating_count_tot + user_rating + sup_devices.num +
    ipadSc_urls.num + lang.num + vpp_lic + size_MB,
  data = ios
)
summary(fm.tobit)
##
## Call:
## tobit(formula = price ~ rating_count_tot + user_rating + sup_devices.num +
## ipadSc_urls.num + lang.num + vpp_lic + size_MB, data = ios)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 7197 4056 3141 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.936739461 2.046010147 0.458 0.64707
## rating_count_tot -0.000049306 0.000004851 -10.164 < 0.0000000000000002 ***
## user_rating 0.783468352 0.096980716 8.079 0.000000000000000655 ***
## sup_devices.num -0.161857072 0.033043714 -4.898 0.000000966835686918 ***
## ipadSc_urls.num 0.228431134 0.070773719 3.228 0.00125 **
## lang.num -0.045151117 0.017844492 -2.530 0.01140 *
## vpp_lic -2.358754109 1.534419510 -1.537 0.12424
## size_MB 0.004836715 0.000340020 14.225 < 0.0000000000000002 ***
## Log(scale) 2.256985701 0.013336824 169.230 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 9.554
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4
## Log-likelihood: -1.365e+04 on 9 Df
## Wald-statistic: 470.8 on 7 Df, p-value: < 0.000000000000000222
# Attach the packages with library(), which stops with an error when a package
# is missing; require() only returns FALSE and lets the script carry on until
# a later call fails.
library(AER)
library(wooldridge)
library(npsf)
## Loading required package: Formula
## Loading required package: randtoolbox
## Loading required package: rngWELL
## This is randtoolbox. For an overview, type 'help("randtoolbox")'.
## Loading required package: sfsmisc
##
## Attaching package: 'sfsmisc'
## The following object is masked from 'package:rminer':
##
## factorize
## The following object is masked from 'package:dplyr':
##
## last
# Load the `mroz` example data set (presumably provided by the wooldridge
# package -- confirm) and list its variables.
data(mroz)
names(mroz)
## [1] "inlf" "hours" "kidslt6" "kidsge6" "age" "educ"
## [7] "wage" "repwage" "hushrs" "husage" "huseduc" "huswage"
## [13] "faminc" "mtr" "motheduc" "fatheduc" "unem" "city"
## [19] "exper" "nwifeinc"
# Tobit model of `hours` on income, education, experience (with a squared
# term), age and the two children counts.
fm.tobit <- tobit(
  formula = hours ~ nwifeinc + educ + exper + I(exper^2) + age + kidslt6 +
    kidsge6,
  data = mroz
)
summary(fm.tobit)
##
## Call:
## tobit(formula = hours ~ nwifeinc + educ + exper + I(exper^2) +
## age + kidslt6 + kidsge6, data = mroz)
##
## Observations:
## Total Left-censored Uncensored Right-censored
## 753 325 428 0
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 965.30528 446.43614 2.162 0.030599 *
## nwifeinc -8.81424 4.45910 -1.977 0.048077 *
## educ 80.64561 21.58324 3.736 0.000187 ***
## exper 131.56430 17.27939 7.614 0.00000000000002659 ***
## I(exper^2) -1.86416 0.53766 -3.467 0.000526 ***
## age -54.40501 7.41850 -7.334 0.00000000000022390 ***
## kidslt6 -894.02174 111.87804 -7.991 0.00000000000000134 ***
## kidsge6 -16.21800 38.64139 -0.420 0.674701
## Log(scale) 7.02289 0.03706 189.514 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Scale: 1122
##
## Gaussian distribution
## Number of Newton-Raphson Iterations: 4
## Log-likelihood: -3819 on 9 Df
## Wald-statistic: 253.9 on 7 Df, p-value: < 0.000000000000000222
文字雲的結果
# Create a segmentation worker and apply it to the app names.
# NOTE(review): worker() and the `[` segmentation syntax look like jiebaR --
# confirm which package provides them.
mixseg <- worker()
seg <- mixseg[dff$track_name]
# Frequency table of every token.
segA <- data.frame(table(seg))
# Frequency table restricted to tokens longer than one character.
segC <- data.frame(table(seg[nchar(seg) > 1]))
# The 50 most frequent of those tokens, most frequent first.
segC_top50 <- head(
  segC[order(segC$Freq, decreasing = TRUE), ],
  50
)
library(wordcloud)
## Loading required package: RColorBrewer
# Set the plotting font family (presumably so that CJK words render -- confirm
# the font is available on the target machine).
par(family = "Heiti TC Light")
# Word cloud of the 50 most frequent tokens. `rot.per` is a numeric proportion
# of rotated words, so it is given as 0 rather than relying on FALSE being
# coerced to 0.
wordcloud(
  words = segC_top50[, 1], # equivalently segC_top50$Var1
  freq = segC_top50$Freq,
  scale = c(4, .1), # range of text sizes
  random.order = FALSE, # plot words in order instead of randomly
  ordered.colors = FALSE, # do not assign colors in word order
  rot.per = 0, # no rotated words
  min.freq = 7, # minimum frequency for a word to be drawn
  colors = brewer.pal(8, "Dark2")
)