#1.Loading Data
gg = read.csv("googleplaystore.csv")
review = read.csv("googleplaystore_user_reviews.csv")
library(e1071)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.3 ✓ purrr 0.3.4
✓ tibble 3.0.4 ✓ dplyr 1.0.2
✓ tidyr 1.1.2 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.0
── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
review1 = review %>% select(App, Translated_Review)
head(review1)
knitr::kable(head(review1))
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
| App | Translated_Review |
|---|---|
| 10 Best Foods for You | I like eat delicious food. That’s I’m cooking food myself, case “10 Best Foods” helps lot, also “Best Before (Shelf Life)” |
| 10 Best Foods for You | This help eating healthy exercise regular basis |
| 10 Best Foods for You | nan |
| 10 Best Foods for You | Works great especially going grocery store |
| 10 Best Foods for You | Best idea us |
| 10 Best Foods for You | Best way |
head(review)
head(gg)
#2.Data Preprocessing
str(gg)
'data.frame': 10841 obs. of 13 variables:
$ App : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7206 2551 8970 8089 7272 7103 8149 5568 4926 5806 ...
$ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
$ Reviews : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
$ Size : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
$ Installs : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
$ Type : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Price : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
$ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
$ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
$ Last.Updated : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
$ Current.Ver : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 122 1020 468 2827 280 116 280 2393 1457 1431 ...
$ Android.Ver : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...
There are a lot of factor variables which should actually be converted to numeric variables.
##2.1 Converting variable types(imputation)
library(lubridate)
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
library(tidyverse)
library(dplyr)
gg.new <- gg %>%
mutate(
# Eliminate "+" to transform Installs to numeric variable
# Installs = gsub("\\+", "", as.character(Installs)),
# Installs = as.numeric(gsub(",", "", Installs)),
# Eliminate "M" to transform Size to numeric variable
Size = gsub("M", "", Size),
# For cells with k, divide it by 1024, since 1024kB = 1MB, the unit for size is MB
Size = ifelse(grepl("k", Size),as.numeric(gsub("k", "", Size))/1024, as.numeric(Size)),
# Transform reviews to numeric
Reviews = as.numeric(Reviews),
# Remove "$" from Price to transform it to numeric
Price = as.numeric(gsub("\\$", "", as.character(Price))),
# Convert Last Updated to date format
Last.Updated = mdy(Last.Updated),
# Replace "Varies with device" to NA since it is unknown
Min.Android.Ver = gsub("Varies with device", NA, Android.Ver),
# Keep only version number to 1 decimal as it's most representative
Min.Android.Ver = as.numeric(substr(Min.Android.Ver, start = 1, stop = 3)),
# Drop old Android version column
Android.Ver = NULL
) %>%
filter(
# Two apps had type as 0 or NA, they will be removed
Type %in% c("Free", "Paid")
)
Problem with `mutate()` input `Size`.
ℹ NAs introduced by coercion
ℹ Input `Size` is `ifelse(...)`.NAs introduced by coercionProblem with `mutate()` input `Size`.
ℹ NAs introduced by coercion
ℹ Input `Size` is `ifelse(...)`.NAs introduced by coercionProblem with `mutate()` input `Price`.
ℹ NAs introduced by coercion
ℹ Input `Price` is `as.numeric(gsub("\\$", "", as.character(Price)))`.NAs introduced by coercionProblem with `mutate()` input `Last.Updated`.
ℹ 1 failed to parse.
ℹ Input `Last.Updated` is `mdy(Last.Updated)`. 1 failed to parse.
str(gg.new)
'data.frame': 10839 obs. of 13 variables:
$ App : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7206 2551 8970 8089 7272 7103 8149 5568 4926 5806 ...
$ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
$ Reviews : num 1183 5924 5681 1947 5924 ...
$ Size : num 19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
$ Installs : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
$ Type : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Price : num 0 0 0 0 0 0 0 0 0 0 ...
$ Content.Rating : Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
$ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
$ Last.Updated : Date, format: "2018-01-07" "2018-01-15" ...
$ Current.Ver : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 122 1020 468 2827 280 116 280 2393 1457 1431 ...
$ Min.Android.Ver: num 4 4 4 4.2 4.4 2.3 4 4.2 3 4 ...
options(scipen=999)
table(gg.new$Installs)
0 0+ 1,000,000,000+ 1,000,000+ 1,000+
0 14 58 1579 907
1+ 10,000,000+ 10,000+ 10+ 100,000,000+
67 1252 1054 386 409
100,000+ 100+ 5,000,000+ 5,000+ 5+
1169 719 752 477 82
50,000,000+ 50,000+ 50+ 500,000,000+ 500,000+
289 479 205 72 539
500+ Free
330 0
gg.new$Installs%>%str()%>% print
Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
NULL
gg.new %>% filter(Installs == "500,000") %>% print
library(highcharter)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
Registered S3 method overwritten by 'data.table':
method from
print.data.table
gg.new %>% select(-Min.Android.Ver) %>%
summarise_all(
funs(sum(is.na(.)))
) %>%
gather() %>%
# Only show columns with NA
filter(value> 1) %>%
arrange(-value) %>%
hchart('column', hcaes(x = 'key', y = 'value', color = 'key')) %>%
hc_add_theme(hc_theme_elementary()) %>%
hc_title(text = "Columns with Missing Value")
`funs()` is deprecated as of dplyr 0.8.0.
Please use a list of either functions or lambdas:
# Simple named list:
list(mean = mean, median = median)
# Auto named with `tibble::lst()`:
tibble::lst(mean, median)
# Using lambdas
list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
gg.new1 <- gg %>%
mutate(
# Eliminate "+" to transform Installs to numeric variable
Installs = gsub("\\+", "", as.character(Installs)),
Installs = as.numeric(gsub(",", "", Installs)),
# Eliminate "M" to transform Size to numeric variable
Size = gsub("M", "", Size),
# For cells with k, divide it by 1024, since 1024kB = 1MB, the unit for size is MB
Size = ifelse(grepl("k", Size),as.numeric(gsub("k", "", Size))/1024, as.numeric(Size)),
# Transform reviews to numeric
Reviews = as.numeric(Reviews),
# Remove "$" from Price to transform it to numeric
Price = as.numeric(gsub("\\$", "", as.character(Price))),
# Convert Last Updated to date format
Last.Updated = mdy(Last.Updated),
# Replace "Varies with device" to NA since it is unknown
Min.Android.Ver = gsub("Varies with device", NA, Android.Ver),
# Keep only version number to 1 decimal as it's most representatice
Min.Android.Ver = as.numeric(substr(Min.Android.Ver, start = 1, stop = 3)),
# Drop old Android version column
Android.Ver = NULL
)
Problem with `mutate()` input `Installs`.
ℹ NAs introduced by coercion
ℹ Input `Installs` is `as.numeric(gsub(",", "", Installs))`.NAs introduced by coercionProblem with `mutate()` input `Size`.
ℹ NAs introduced by coercion
ℹ Input `Size` is `ifelse(...)`.NAs introduced by coercionProblem with `mutate()` input `Size`.
ℹ NAs introduced by coercion
ℹ Input `Size` is `ifelse(...)`.NAs introduced by coercionProblem with `mutate()` input `Price`.
ℹ NAs introduced by coercion
ℹ Input `Price` is `as.numeric(gsub("\\$", "", as.character(Price)))`.NAs introduced by coercionProblem with `mutate()` input `Last.Updated`.
ℹ 1 failed to parse.
ℹ Input `Last.Updated` is `mdy(Last.Updated)`. 1 failed to parse.
gg.new2 = gg.new1 %>% mutate(Interval = difftime(time1 = today(), time2 = Last.Updated)) %>% print
ggplot(gg.new2) + geom_line(aes(x = Interval, y = Installs)) + labs(x = "Days Since Last Update", y = "Installments")
gg.new1 %>%
group_by(Category) %>% filter(Category != 1.9) %>%
summarize(
TotalInstalls = sum(as.numeric(Installs))
) %>%
arrange(-TotalInstalls) %>%
hchart('scatter', hcaes(x = "Category", y = "TotalInstalls", size = "TotalInstalls", color = "Category")) %>%
hc_add_theme(hc_theme_538()) %>%
hc_title(text = "Most popular categories")
`summarise()` ungrouping output (override with `.groups` argument)
###Correlation map
head(iris)
library(reshape2)
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
df_cor = iris[,2:3]
cormat <- round(cor(df_cor),2)
melted_cormat <- melt(cormat)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "yellow", high = "purple", mid = "red",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
##2.2Divide Installs into 3 categories
library(tidyverse)
options(scipen=999)
# write function to convert installment
convert_install = function(data, installment) {
#install.levels = factor(c("low", "medium", "high"))
if (installment %in% c("0", "1", "50", "100", "500", "1,000", "5,000", "10,000", "50,000")) {
Installs.cat = "low"
}
else if (installment %in% c ("100,000", "500,000", "1,000,000", "5,000,000")){
Installs.cat = "medium"
}
else {
Installs.cat = "high"
}
}
#gg.new = gg.new %>% filter(!is.na(Installs)) %>% mutate(Installs.cat = factor(convert_install(gg.new, Installs), # levels = c("low", "medium", "high")))
sum((gg.new$Installs) %in% "10,000")
[1] 0
# gg.new = gg.new %>% mutate(Installs.cat = "1")
str(gg.new)
'data.frame': 10839 obs. of 13 variables:
$ App : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7206 2551 8970 8089 7272 7103 8149 5568 4926 5806 ...
$ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
$ Reviews : num 1183 5924 5681 1947 5924 ...
$ Size : num 19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
$ Installs : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
$ Type : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Price : num 0 0 0 0 0 0 0 0 0 0 ...
$ Content.Rating : Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
$ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
$ Last.Updated : Date, format: "2018-01-07" "2018-01-15" ...
$ Current.Ver : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 122 1020 468 2827 280 116 280 2393 1457 1431 ...
$ Min.Android.Ver: num 4 4 4 4.2 4.4 2.3 4 4.2 3 4 ...
table(gg.new$Installs)
0 0+ 1,000,000,000+ 1,000,000+ 1,000+
0 14 58 1579 907
1+ 10,000,000+ 10,000+ 10+ 100,000,000+
67 1252 1054 386 409
100,000+ 100+ 5,000,000+ 5,000+ 5+
1169 719 752 477 82
50,000,000+ 50,000+ 50+ 500,000,000+ 500,000+
289 479 205 72 539
500+ Free
330 0
table(gg.new$Installs.cat)
< table of extent 0 >
gg.new = gg.new %>% filter(Installs != "Free") %>% mutate(
Installs.cat = fct_collapse(Installs,
low = c("Free","0", "0+","1+", "5+", "10+","100+", "50+", "100+", "500+", "1,000+", "5,000+"),
medium = c("10,000+", "50,000+", "100,000+", "500,000+"),
high = c("1,000,000+", "5,000,000+", "1,000,000,000+", "10,000,000+", "100,000,000+", "50,000,000+", "500,000,000+")))
table(gg.new$Installs.cat)
low high medium
3187 4411 3241
ggplot(data = gg.new) +
geom_boxplot(aes(x = reorder(Installs.cat, -Rating), y = Rating)) +
labs(x = "Installment Categories",y = "Rating")
##2.3 Delete duplicated rows
# number of observations before deleting duplicated rows
(original_num_rows = nrow(gg.new))
[1] 10839
gg.new.uniq = gg.new %>% distinct
# number of rows after delete duplicated rows
(uniq_num_rows = nrow(gg.new.uniq))
[1] 10356
# number of duplicated rows
(dup_rows = original_num_rows - uniq_num_rows)
[1] 483
##2.4 Merge Category into 6
# gg.new.uniq %>% filter (!is.na(Category)) %>% print
levels(gg.new.uniq$Category)
[1] "1.9" "ART_AND_DESIGN" "AUTO_AND_VEHICLES"
[4] "BEAUTY" "BOOKS_AND_REFERENCE" "BUSINESS"
[7] "COMICS" "COMMUNICATION" "DATING"
[10] "EDUCATION" "ENTERTAINMENT" "EVENTS"
[13] "FAMILY" "FINANCE" "FOOD_AND_DRINK"
[16] "GAME" "HEALTH_AND_FITNESS" "HOUSE_AND_HOME"
[19] "LIBRARIES_AND_DEMO" "LIFESTYLE" "MAPS_AND_NAVIGATION"
[22] "MEDICAL" "NEWS_AND_MAGAZINES" "PARENTING"
[25] "PERSONALIZATION" "PHOTOGRAPHY" "PRODUCTIVITY"
[28] "SHOPPING" "SOCIAL" "SPORTS"
[31] "TOOLS" "TRAVEL_AND_LOCAL" "VIDEO_PLAYERS"
[34] "WEATHER"
mydata1 = gg.new.uniq %>% filter(Category != 1.9) %>% mutate(Cat.cat = fct_collapse(Category,
Education = c("EDUCATION", "BOOKS_AND_REFERENCE", "LIBRARIES_AND_DEMO", "ART_AND_DESIGN"),
Personalization = c("PERSONALIZATION", "BEAUTY", "SHOPPING", "DATING", "PHOTOGRAPHY"),
Lifestyle = c("HEALTH_AND_FITNESS", "MEDICAL", "LIFESTYLE", "SPORTS", "FOOD_AND_DRINK"),
Family = c("FAMILY", "PARENTING", "HOUSE_AND_HOME", "1.9"),
Entertainment = c("ENTERTAINMENT", "GAME", "COMICS", "VIDEO_PLAYERS"),
Business = c("BUSINESS", "FINANCE", "PRODUCTIVITY", "TOOLS", "NEWS_AND_MAGAZINES", "EVENTS", "SOCIAL", "COMMUNICATION"),
Travel = c("MAPS_AND_NAVIGATION", "AUTO_AND_VEHICLES", "TRAVEL_AND_LOCAL", "WEATHER")))
mydata2 = mydata1 %>% mutate(Interval = difftime(time1 = today(), time2 = Last.Updated))
str(mydata2)
'data.frame': 10356 obs. of 16 variables:
$ App : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7206 2551 8970 8089 7272 7103 8149 5568 4926 5806 ...
$ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
$ Reviews : num 1183 5924 5681 1947 5924 ...
$ Size : num 19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
$ Installs : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
$ Type : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Price : num 0 0 0 0 0 0 0 0 0 0 ...
$ Content.Rating : Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
$ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
$ Last.Updated : Date, format: "2018-01-07" "2018-01-15" ...
$ Current.Ver : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 122 1020 468 2827 280 116 280 2393 1457 1431 ...
$ Min.Android.Ver: num 4 4 4 4.2 4.4 2.3 4 4.2 3 4 ...
$ Installs.cat : Factor w/ 3 levels "low","high","medium": 3 3 2 2 3 3 3 2 2 3 ...
$ Cat.cat : Factor w/ 7 levels "Family","Education",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Interval : 'difftime' num 1094 1086 888 942 ...
..- attr(*, "units")= chr "days"
mydata2 %>% filter(Installs.cat == "low") %>% print
#missForest
library(missForest)
Loading required package: randomForest
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:ggplot2’:
margin
Loading required package: foreach
Attaching package: ‘foreach’
The following objects are masked from ‘package:purrr’:
accumulate, when
Loading required package: itertools
Loading required package: iterators
#impute missing values, using all parameters as default values
gg.new.imp <- missForest(data.matrix(mydata2), maxiter = 5, ntree = 10)
missForest iteration 1 in progress...done!
missForest iteration 2 in progress...done!
missForest iteration 3 in progress...done!
missForest iteration 4 in progress...done!
#check imputed values
# gg.new.imp$ximp
#check imputation error
gg.new.imp$OOBerror
NRMSE
0.0010914
# install.packages("stringr")
# install.packages("tidytext")
library(stringr)
library(tidytext)
# read in user reviews
user_review = read.csv("googleplaystore_user_reviews.csv")
str(user_review)
'data.frame': 64295 obs. of 5 variables:
$ App : Factor w/ 1074 levels "10 Best Foods for You",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Translated_Review : Factor w/ 27996 levels "","___ ___ ___ ___ ___ 0",..: 9279 23853 17229 27355 2076 2168 1032 17229 15968 13280 ...
$ Sentiment : Factor w/ 4 levels "nan","Negative",..: 4 4 1 4 4 4 4 1 3 3 ...
$ Sentiment_Polarity : num 1 0.25 NaN 0.4 1 1 0.6 NaN 0 0 ...
$ Sentiment_Subjectivity: num 0.533 0.288 NaN 0.875 0.3 ...
user_review %>% print
head(user_review)
# get sentiment data frame
sents = get_sentiments("afinn") %>% print
range(sents$score)
Unknown or uninitialised column: `score`.no non-missing arguments to min; returning Infno non-missing arguments to max; returning -Inf
[1] Inf -Inf
# left join the sentiment chart and the user reviews to get score
t1 = user_review %>% mutate(review = as.character(Translated_Review)) %>% unnest_tokens(word, review)
# t2 = user_review[1:500, ]
user_score = left_join(t1, sents) %>% group_by(App) %>% summarise(n = n(), score=sum(t1$score, na.rm=T)) %>% mutate(avg.score = score / n) %>% print
Joining, by = "word"
`summarise()` ungrouping output (override with `.groups` argument)
# range(user_score $ avg.score)
user_review %>% group_by(App) %>% count
t11 = user_score %>% inner_join(gg.new) %>% filter(Installs != 5000) %>% filter(Installs != 1000000000)
Joining, by = "App"
ggplot(t11) + geom_line(aes(x = Installs, y = avg.score))
ggplot(t11) + geom_boxplot(aes(x = reorder(as.factor(Installs), -avg.score), y = avg.score)) + labs(x = "Installments", y = "Average Score") + coord_flip()
# recover app name after data imputation
# add num_row to gg.new
mydata2 = mydata2 %>% mutate(r = row_number())
# split data into training and test data
# change the list to data frame
gg.df = gg.new.imp[[1]] %>% unlist()
gg.data = data.frame(gg.df) %>% mutate(r = row_number())
t1 = left_join(gg.data, mydata2, by = "r") %>%
select(Rating.x, Reviews.y, Size.x, Installs.cat.y, Price.y, Content.Rating.y, Cat.cat.y, Interval.y) %>% print
# split data
(total_row = nrow(t1))
[1] 10356
ins.l= which(t1$Installs.cat.y == "low")
ins.m= which(t1$Installs.cat.y == "medium")
ins.h= which(t1$Installs.cat.y == "high")
train.id = c(sample(ins.l, size = trunc(0.8 *length(ins.l))),
sample(ins.m, size = trunc(0.8 *length(ins.m))),
sample(ins.h, size = trunc(0.8 *length(ins.h))))
train.gg = t1[train.id, ]
test.gg = t1[-train.id, ]
levels(train.gg$`Installs`)
[1] "low" "high" "medium"
table(train.gg$`Installs`)
low high medium
2519 3243 2522
# random forest
set.seed(415)
library(randomForest)
table(factor(train.gg$Installs.cat.y))
low high medium
2519 3243 2522
bag.gg=randomForest(Installs.cat.y~., data=train.gg, mtry = ncol(train.gg) - 1,importance=TRUE)
bag.gg
Call:
randomForest(formula = Installs.cat.y ~ ., data = train.gg, mtry = ncol(train.gg) - 1, importance = TRUE)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 7
OOB estimate of error rate: 33.9%
Confusion matrix:
low high medium class.error
low 1675 259 585 0.3350536
high 130 2501 612 0.2288005
medium 409 813 1300 0.4845361
# plot
yhat.bag = predict(bag.gg, newdata=test.gg)
# test error
(forest.test.err = mean(yhat.bag != test.gg$Installs.cat.y))
[1] 0.359556
# get the importance
importance(bag.gg)
low high medium MeanDecreaseAccuracy MeanDecreaseGini
Rating.x 82.58887 143.36552 36.52999 152.14778 892.6160
Reviews.y 165.44337 132.80628 56.34013 198.64125 1634.3904
Size.x 40.85667 145.63664 23.41362 137.53552 1078.9575
Price.y 59.52072 126.01800 29.53226 115.29368 158.7036
Content.Rating.y 14.39674 12.01644 15.28103 22.98796 131.2132
Cat.cat.y 20.08711 90.17230 28.56530 88.02326 365.6191
Interval.y 45.08460 150.29478 18.44381 136.68670 1219.0267
varImpPlot(bag.gg)
# tree
set.seed(415)
library(tree)
Registered S3 method overwritten by 'tree':
method from
print.tree cli
#train.gg
#colnames(train.gg)[1] = "Rating"
#colnames(train.gg)[2] = "Reviews"
#colnames(train.gg)[3] = "Size"
#colnames(train.gg)[5] = "Price"
#colnames(train.gg)[6] = "Content Rating"
#colnames(train.gg)[7] = "Category"
#colnames(train.gg)[1] = "Time Since Last Update"
#train.gg
train.gg
tree.gg = tree(Installs.cat.y~., data = train.gg)
NAs introduced by coercion
summary(tree.gg)
Classification tree:
tree(formula = Installs.cat.y ~ ., data = train.gg)
Variables actually used in tree construction:
[1] "Reviews.y" "Size.x" "Rating.x" "Price.y"
Number of terminal nodes: 8
Residual mean deviance: 1.684 = 13940 / 8276
Misclassification error rate: 0.4067 = 3369 / 8284
plot(tree.gg)
text(tree.gg, pretty = 1, cex = 1)
yhat.tree = predict(tree.gg, newdata=test.gg)
NAs introduced by coercion
# test error
(tree.test.err = mean(yhat.tree != test.gg$Installs.cat.y))
[1] 1
# prune the tree
cv.gg.tree=cv.tree(tree.gg,FUN=prune.misclass)
NAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercionNAs introduced by coercion
cv.gg.tree
$size
[1] 8 7 6 5 4 3 2 1
$dev
[1] 3385 3454 3550 3608 3670 3797 4289 5041
$k
[1] -Inf 58 70 82 94 127 489 752
$method
[1] "misclass"
attr(,"class")
[1] "prune" "tree.sequence"
# par(mfrow=c(1,2))
# plot(cv.gg.tree$size,cv.gg.tree$dev / length(train.gg),ylab="cv error", xlab="size",type="b")
# plot(cv.gg.tree$k, cv.gg.tree$dev / length(train.gg),ylab="cv error", xlab="k",type="b")
# predict using pruning tree
prune.tree=prune.misclass(tree.gg,best=8)
tree.pred=predict(prune.tree, test.gg,type="class")
NAs introduced by coercion
table(tree.pred, test.gg$Installs.cat.y)
tree.pred low high medium
low 270 6 57
high 60 493 138
medium 300 312 436
(test.tree.err = mean(tree.pred != test.gg$Installs.cat.y))
[1] 0.421332
# plot the tree
plot(prune.tree)
text(prune.tree, pretty = 0, cex = 1)
As we can see in both single tree and random forest, reviews is the most important predictor. When we dig into the reviews, we figure out that approxiamtely 1000 apps have more than 100 relevant text reviews / comments.
set.seed(415)
# get data frame ready to use
train.gg
table(factor(train.gg$Installs.cat.y))
low high medium
2519 3243 2522
costVals = c(1, 5, 10, 50)
# linear kernel
# running too slow, be careful to change predictors
svm1 <- tune(svm, as.factor(Installs.cat.y) ~ ., data = train.gg,
kernel = "linear",
ranges = list("cost" = costVals))
summary(svm1)
Parameter tuning of ‘svm’:
- sampling method: 10-fold cross validation
- best parameters:
- best performance: 0.4482075
- Detailed performance results:
# find the best cost under linear kernel
best_mod_linear = svm1$best.model
summary(best_mod_linear)
Call:
best.tune(method = svm, train.x = as.factor(Installs.cat.y) ~ ., data = train.gg,
ranges = list(cost = costVals), kernel = "linear")
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 5
Number of Support Vectors: 6862
( 2194 2442 2226 )
Number of Classes: 3
Levels:
low high medium
# thus the cost of the best model si 50.
# get the test error of the best model of the linear kernel
test.gg %>% str()
'data.frame': 2072 obs. of 8 variables:
$ Rating.x : num 4.5 4.4 4.4 4.7 4.8 4.2 4.1 4.2 4.7 4.1 ...
$ Reviews.y : num 1947 834 1057 3353 1655 ...
$ Size.x : num 25 28 37 5.5 6 9.2 5.2 11 24 36.7 ...
$ Installs.cat.y : Factor w/ 3 levels "low","high","medium": 2 2 3 3 3 3 3 3 3 2 ...
$ Price.y : num 0 0 0 0 0 0 0 0 0 0 ...
$ Content.Rating.y: Factor w/ 7 levels "","Adults only 18+",..: 6 3 3 3 3 3 3 3 3 3 ...
$ Cat.cat.y : Factor w/ 7 levels "Family","Education",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Interval.y : 'difftime' num 942 1166 886 889 ...
..- attr(*, "units")= chr "days"
pred_test_linear = predict(best_mod_linear, newdata = test.gg)
table(predict = pred_test_linear, truth = test.gg$Installs.cat.y)
truth
predict low high medium
low 349 132 217
high 164 607 235
medium 117 72 179
(test_err_linear = mean(pred_test_linear != test.gg$Installs.cat.y))
[1] 0.4522201
set.seed(415)
# kernel radial
gammaVals = c(1, 2, 3, 4)
svm_radial <-tune(svm, as.factor(Installs.cat.y) ~ ., data = train.gg,
kernel = "radial",
cost = 100,
gamma =gammaVals)
summary(svm_radial)
Error estimation of ‘svm’ using 10-fold cross validation: 0.4356596
best_mod_radial = svm_radial$best.model
summary(best_mod_radial)
Call:
best.tune(method = svm, train.x = as.factor(Installs.cat.y) ~ ., data = train.gg,
kernel = "radial", cost = 100, gamma = gammaVals)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 100
Number of Support Vectors: 5903
( 1732 2164 2007 )
Number of Classes: 3
Levels:
low high medium
# get test error of kernel of the radial
pred_test_radial = predict(best_mod_radial, newdata = test.gg)
(test_err_radial = mean(pred_test_radial != test.gg$Installs.cat.y))
[1] 0.4382239
Is it true that people tends to give text review when they highly positively review the app?
# left join the user_score table and t3
mydata2 = mydata2 %>% mutate(r = row_number()) %>% print
gg.df = gg.new.imp[[1]] %>% unlist()
gg.data = data.frame(gg.df) %>% mutate(r = row_number()) %>% print
t3 = left_join(gg.data, mydata2, by = "r") %>%
select(Rating.x, Reviews.y, App.y, Installs.cat.y) %>% print
colnames(t3)[3] = "App"
t2 = inner_join(user_score, t3, by = "App") %>% print
# raing and avg score
# add main title manually, which is "rating vs aaverage sentimental score"
ggplot(data = t2, aes(x = Rating.x, y = avg.score)) + geom_bar(stat = "identity") + labs(x = "Rating", y = "Average Sentimental Score", title = "Rating vs Average sentimental Score")
ggplot(data = t2, aes(x = as.factor(Installs.cat.y), y = avg.score)) + geom_boxplot() + labs(x = "Installment Category", y = "Average Sentimental Score")
#boxplot(t2$Installs.cat.y ~ t2$avg.score)
# rating vs reviews
ggplot(data = t2, aes(x = Reviews.y, y = avg.score)) + geom_bar(stat = "identity") + labs(x = "Number of #Reviews", y = "Average Sentimental Score", title = "Number of Reviews vs Average sentimental Score")
High avg score tends to concentrated at rating above and including 4.0
final1 = left_join(gg.data, mydata2, by = "r") %>% select(App.y, Reviews.y, Rating.x, Interval.y, Size.x, Price.y, Cat.cat.y, Content.Rating.y) %>% print
colnames(final1)[1] = "App"
colnames(final1)[2] = "Reviews"
colnames(final1)[3] = "Rating"
colnames(final1)[4] = "Interval"
colnames(final1)[5] = "Size"
colnames(final1)[6] = "Price"
colnames(final1)[7] = "Category"
colnames(final1)[8] = "Content"
show((final1))
plot(final1)