This document is a step-by-step guide to making a good visualization and a plot that is suitable for publication. This R Markdown document covers:
Importing font:
library(extrafont)
# font_import()
# loadfonts(device = "win")
# `windowsFonts()` for checking all font type
New theme:
# Custom theme overrides for the publication-style plots. This is a partial
# theme: it is meant to be added on top of a complete theme (the plots that use
# it add it after theme_bw()). Entries set to NULL give no explicit value here.
my_theme <- theme(
  # Panel: white background, gray border, dashed horizontal grid lines only.
  panel.background = element_rect(fill = "white", colour = NA),
  # The border is drawn on top of the plot, so it must stay transparent;
  # fill = NA keeps it from hiding the data on base themes that do not
  # already define a transparent border.
  panel.border = element_rect(fill = NA, colour = "gray"),
  # NOTE(review): on recent ggplot2 releases `linewidth` is the preferred
  # argument for line thickness; `size` is kept for older installations.
  panel.grid.major.y = element_line(colour = "gray", size = 0.2, linetype = 2),
  # panel.spacing.* replaces the deprecated panel.margin.* element names.
  panel.spacing.x = NULL,
  panel.spacing.y = NULL,

  # Legend: placed on the right, anchored at the bottom.
  legend.background = element_rect(colour = NA),
  legend.key = element_rect(fill = "grey95", colour = "white"),
  legend.key.size = unit(1, "lines"),
  legend.key.height = NULL,
  legend.key.width = NULL,
  legend.text = element_text(size = rel(0.8)),
  legend.text.align = NULL,
  legend.title = element_text(hjust = 0),
  legend.title.align = NULL,
  legend.position = "right",
  legend.direction = NULL,
  legend.justification = "bottom",
  legend.box = NULL,

  # Text: each font family must be available to the graphics device
  # (see the font import step at the top of the document).
  plot.title = element_text(
    face = "bold", colour = "chartreuse4",
    family = "Comic Sans MS", size = 17
  ),
  plot.subtitle = element_text(family = "Georgia", size = 10, face = "italic"),
  plot.caption = element_text(family = "Verdana", size = 7, face = "italic"),

  # Axes: no tick marks, slightly smaller tick labels, bold axis titles.
  axis.text.x = element_text(size = 9, margin = margin(t = 1.6), vjust = 1),
  axis.text.y = element_text(size = 8),
  axis.title = element_text(family = "Arial", size = 12, face = "bold"),
  axis.ticks = element_blank()
)
This is the Google Play Store dataset obtained from Kaggle. The Play Store apps data has enormous potential to drive app-making businesses to success: actionable insights can be drawn from it for developers to work on and capture the Android market.
#> 'data.frame': 10841 obs. of 13 variables:
#> $ App : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
#> $ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
#> $ Reviews : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
#> $ Size : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
#> $ Installs : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
#> $ Type : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ Price : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
#> $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
#> $ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
#> $ Last.Updated : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
#> $ Current.Ver : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
#> $ Android.Ver : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...
#> App Category
#> ROBLOX : 9 FAMILY :1972
#> CBS Sports App - Scores, News, Stats & Watch Live: 8 GAME :1144
#> 8 Ball Pool : 7 TOOLS : 843
#> Candy Crush Saga : 7 MEDICAL : 463
#> Duolingo: Learn Languages Free : 7 BUSINESS : 460
#> ESPN : 7 PRODUCTIVITY: 424
#> (Other) :10796 (Other) :5535
#> Rating Reviews Size Installs
#> Min. : 1.000 0 : 596 Varies with device:1695 1,000,000+ :1579
#> 1st Qu.: 4.000 1 : 272 11M : 198 10,000,000+:1252
#> Median : 4.300 2 : 214 12M : 196 100,000+ :1169
#> Mean : 4.193 3 : 175 14M : 194 10,000+ :1054
#> 3rd Qu.: 4.500 4 : 137 13M : 191 1,000+ : 907
#> Max. :19.000 5 : 108 15M : 184 5,000,000+ : 752
#> NA's :1474 (Other):9339 (Other) :8183 (Other) :4128
#> Type Price Content.Rating Genres
#> 0 : 1 0 :10040 : 1 Tools : 842
#> Free:10039 $0.99 : 148 Adults only 18+: 3 Entertainment: 623
#> NaN : 1 $2.99 : 129 Everyone :8714 Education : 549
#> Paid: 800 $1.99 : 73 Everyone 10+ : 414 Medical : 463
#> $4.99 : 72 Mature 17+ : 499 Business : 460
#> $3.99 : 63 Teen :1208 Productivity : 424
#> (Other): 316 Unrated : 2 (Other) :7480
#> Last.Updated Current.Ver Android.Ver
#> August 3, 2018: 326 Varies with device:1459 4.1 and up :2451
#> August 2, 2018: 304 1.0 : 809 4.0.3 and up :1501
#> July 31, 2018 : 294 1.1 : 264 4.0 and up :1375
#> August 1, 2018: 285 1.2 : 178 Varies with device:1362
#> July 30, 2018 : 211 2.0 : 151 4.4 and up : 980
#> July 25, 2018 : 164 1.3 : 145 2.3 and up : 652
#> (Other) :9257 (Other) :7835 (Other) :2520
# Clean the raw columns: every text-like column was read in as a factor, so
# each one is converted to the type the analysis needs.
gps$App <- as.character(gps$App)

# Reviews is a factor: convert through as.character() so we get the review
# counts themselves rather than the integer level codes of the factor.
gps$Reviews <- as.numeric(as.character(gps$Reviews))

# Size: strip the "M" suffix and parse the rest as a number. Entries with a
# "k" suffix are recorded as 0, and non-numeric entries such as
# "Varies with device" become NA (as.numeric() warns about the coercion).
# NOTE(review): recording "k" sizes as 0 discards information - confirm that
# this approximation is intended.
gps$Size <- gsub("M", "", gps$Size)
gps$Size <- ifelse(grepl("k", gps$Size), 0, as.numeric(gps$Size))

# Installs such as "1,000,000+": drop the "+" and the thousands separators.
gps$Installs <- as.numeric(gsub("[+,]", "", as.character(gps$Installs)))

# Price such as "$0.99": drop the dollar sign.
gps$Price <- as.numeric(gsub("\\$", "", as.character(gps$Price)))

# Last.Updated such as "August 3, 2018": parse it, then derive year and month.
gps$Last.Updated <- mdy(as.character(gps$Last.Updated))
gps$Year.Updated <- year(gps$Last.Updated)
gps$Month.Updated <- month(gps$Last.Updated)

# Remove the version columns by name, so the result does not depend on the
# column order.
gps[c("Current.Ver", "Android.Ver")] <- NULL

# Keep only rows with a valid Type (this removes the "0" and "NaN" entries),
# then drop the now-unused factor levels.
gps <- gps[gps$Type %in% c("Free", "Paid"), ]
gps$Type <- droplevels(gps$Type)
str(gps)
#> 'data.frame': 10839 obs. of 13 variables:
#> $ App : chr "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite â\200“ FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
#> $ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
#> $ Reviews : num 1183 5924 5681 1947 5924 ...
#> $ Size : num 19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
#> $ Installs : num 1e+04 5e+05 5e+06 5e+07 1e+05 5e+04 5e+04 1e+06 1e+06 1e+04 ...
#> $ Type : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
#> $ Price : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
#> $ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
#> $ Last.Updated : Date, format: "2018-01-07" "2018-01-15" ...
#> $ Year.Updated : num 2018 2018 2018 2018 2018 ...
#> $ Month.Updated : num 1 1 8 6 6 3 4 6 9 7 ...
#> App Category Rating Reviews
#> Length:10839 FAMILY :1971 Min. :1.000 Min. : 1
#> Class :character GAME :1144 1st Qu.:4.000 1st Qu.:1158
#> Mode :character TOOLS : 843 Median :4.300 Median :2747
#> MEDICAL : 463 Mean :4.192 Mean :2744
#> BUSINESS : 460 3rd Qu.:4.500 3rd Qu.:4320
#> PRODUCTIVITY: 424 Max. :5.000 Max. :6002
#> (Other) :5534 NA's :1473
#> Size Installs Type Price
#> Min. : 0.0 Min. :0.000e+00 Free:10039 Min. : 0.000
#> 1st Qu.: 4.9 1st Qu.:3.000e+03 Paid: 800 1st Qu.: 0.000
#> Median : 13.0 Median :1.000e+05 Median : 0.000
#> Mean : 21.5 Mean :1.547e+07 Mean : 1.028
#> 3rd Qu.: 30.0 3rd Qu.:5.000e+06 3rd Qu.: 0.000
#> Max. :100.0 Max. :1.000e+09 Max. :400.000
#> NA's :1694
#> Content.Rating Genres Last.Updated
#> : 0 Tools : 842 Min. :2010-05-21
#> Adults only 18+: 3 Entertainment: 623 1st Qu.:2017-09-20
#> Everyone :8714 Education : 549 Median :2018-05-24
#> Everyone 10+ : 413 Medical : 463 Mean :2017-11-21
#> Mature 17+ : 499 Business : 460 3rd Qu.:2018-07-20
#> Teen :1208 Productivity : 424 Max. :2018-08-08
#> Unrated : 2 (Other) :7478
#> Year.Updated Month.Updated
#> Min. :2010 Min. : 1.000
#> 1st Qu.:2017 1st Qu.: 5.000
#> Median :2018 Median : 7.000
#> Mean :2017 Mean : 6.422
#> 3rd Qu.:2018 3rd Qu.: 8.000
#> Max. :2018 Max. :12.000
#>
convert number of installs to install category:
# Map a number of installs to an install grade.
#
# x: numeric vector of install counts (a single value works as well).
# Returns a character vector of the same length as x:
#   "Grade C"  for x < 10001
#   "Grade B"  for 10001     <= x < 1000001
#   "Grade A"  for 1000001   <= x < 100000001
#   "Grade A+" for x >= 100000001
# Missing install counts give NA instead of raising an error.
ic <- function(x) {
  grades <- c("Grade C", "Grade B", "Grade A", "Grade A+")
  # Lower bound of Grade B, Grade A and Grade A+ respectively.
  lower.bounds <- c(10001, 1000001, 100000001)
  # findInterval() counts how many lower bounds each value has reached (0-3).
  grades[findInterval(x, lower.bounds) + 1L]
}
# Grade every app by its install count. vapply() guarantees a character
# vector even for empty input, where sapply() would return a list.
gps$Install.cat <- vapply(gps$Installs, ic, character(1))
# Order the levels from the lowest to the highest install grade.
gps$Install.cat <- factor(
  gps$Install.cat,
  levels = c("Grade C", "Grade B", "Grade A", "Grade A+")
)
head(gps) # check install category
#> App Category Rating
#> 1 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1
#> 2 Coloring book moana ART_AND_DESIGN 3.9
#> 3 U Launcher Lite â\200“ FREE Live Cool Themes, Hide Apps ART_AND_DESIGN 4.7
#> 4 Sketch - Draw & Paint ART_AND_DESIGN 4.5
#> 5 Pixel Draw - Number Art Coloring Book ART_AND_DESIGN 4.3
#> 6 Paper flowers instructions ART_AND_DESIGN 4.4
#> Reviews Size Installs Type Price Content.Rating Genres
#> 1 1183 19.0 1e+04 Free 0 Everyone Art & Design
#> 2 5924 14.0 5e+05 Free 0 Everyone Art & Design;Pretend Play
#> 3 5681 8.7 5e+06 Free 0 Everyone Art & Design
#> 4 1947 25.0 5e+07 Free 0 Teen Art & Design
#> 5 5924 2.8 1e+05 Free 0 Everyone Art & Design;Creativity
#> 6 1310 5.6 5e+04 Free 0 Everyone Art & Design
#> Last.Updated Year.Updated Month.Updated Install.cat
#> 1 2018-01-07 2018 1 Grade C
#> 2 2018-01-15 2018 1 Grade B
#> 3 2018-08-01 2018 8 Grade A
#> 4 2018-06-08 2018 6 Grade A
#> 5 2018-06-20 2018 6 Grade B
#> 6 2017-03-26 2017 3 Grade B
NA value
#> App Category Rating Reviews Size
#> 0 0 1464 0 1525
#> Installs Type Price Content.Rating Genres
#> 0 0 0 0 0
#> Last.Updated Year.Updated Month.Updated Install.cat
#> 0 0 0 0
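Rating (1464 NA) and Size (1525 NA) are each missing in more than 10% of our data, so we have to convert the NA values instead of simply dropping those rows.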
Remove/convert NA rating :
# Count the apps without a rating in each install category and plot the counts.
# Alternative kept for reference:
# rating.na <- aggregate.data.frame(is.na(gps$Rating), by=list(gps$Install.cat), sum)
rating.na <- aggregate(
  App ~ Install.cat,
  data = gps[is.na(gps$Rating), ],
  FUN = length
)
plot(rating.na)
NA ratings: most of the missing values are from the Grade C install category, meaning that the apps without a rating are mostly those with 10,000 installs or fewer.
# Fill in the missing ratings. Both rules share the Grade C boundary
# (Installs < 10001), so every missing rating falls under exactly one of them:
#   - apps above that boundary get the mean of the known ratings;
#   - apps at or below it get 0.
rating.missing <- is.na(gps$Rating)
few.installs <- gps$Installs < 10001
# The mean is computed first, before any 0 is written into the column.
gps$Rating[rating.missing & !few.installs] <- mean(gps$Rating, na.rm = TRUE)
gps$Rating[rating.missing & few.installs] <- 0
colSums(is.na(gps)) # there is no more NA Rating
#> App Category Rating Reviews Size
#> 0 0 0 0 1525
#> Installs Type Price Content.Rating Genres
#> 0 0 0 0 0
#> Last.Updated Year.Updated Month.Updated Install.cat
#> 0 0 0 0
We can use `na.omit()` to remove the rows with NA values from our data frame:
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 0.00 4.70 13.00 21.27 29.00 100.00 1525
1525 NA size(Varies with device)
# Number of apps per year of last update, drawn as a line with point markers.
gps_Year <- gps %>%
  group_by(Year.Updated) %>%
  summarise(n = n())

ggplot(gps_Year, aes(x = Year.Updated, y = n)) +
  geom_line(colour = "chartreuse4") +
  geom_point(colour = "red") +
  scale_y_continuous(breaks = seq(0, 7000, by = 500)) +
  ggtitle("GPS Apps Year Updated") +
  ylab("Number of Apps") +
  theme_minimal()
# Number of apps per month of last update (all years pooled together).
gps_Month <- gps %>%
  group_by(Month.Updated) %>%
  summarise(n = n())

ggplot(gps_Month, aes(x = Month.Updated, y = n)) +
  geom_line(colour = "chartreuse4") +
  geom_point(colour = "red") +
  scale_y_continuous(breaks = seq(0, 3000, by = 200)) +
  # month.abb is base R's "Jan" ... "Dec"; using it avoids typing (and
  # mistyping) the twelve labels by hand.
  scale_x_continuous(breaks = 1:12, labels = month.abb) +
  labs(
    title = "GPS Apps Month Updated",
    y = "Number of Apps"
  ) +
  theme_minimal()
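I don't know for certain why the largest share of the apps (around 30 percent) were last updated in July. A likely explanation is that the data was collected in early August 2018 (the latest update date is 2018-08-08), so apps that are updated regularly show a last update in the weeks just before that.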
Top Rated Category
# Drop the unused Category levels (this removes the stray "1.9" level).
gps$Category <- droplevels(gps$Category)

# Horizontal bar chart of the number of apps per category, with the most
# frequent category at the top.
ggplot(gps, aes(x = fct_rev(fct_infreq(Category)))) +
  geom_bar(fill = "chartreuse4") +
  labs(
    title = "GPS Apps Category",
    x = "Category",
    y = "Number of Apps"
  ) +
  coord_flip()
As per the graph, the Family category has the largest number of apps in this dataset, followed by Game and Tools.
overall more than 4.0 Rating
# Mean rating per category; keep the ten categories with the highest mean.
gps.rate <- aggregate(Rating ~ Category, data = gps, FUN = mean)
gps.rate <- head(gps.rate[order(gps.rate$Rating, decreasing = TRUE), ], 10)
# Forget the categories that did not make the top ten.
gps.rate$Category <- droplevels(gps.rate$Category)
using ggplot for better plots:
# Horizontal bars of the mean rating, best category at the top. The rating
# axis is zoomed to 3.5-4.5 so the small differences are visible.
ggplot(gps.rate, aes(x = reorder(Category, Rating), y = Rating)) +
  geom_col(fill = "chartreuse4") +
  ggtitle("Top Rated App Categories") +
  xlab("Category") +
  ylab("Ratings") +
  coord_flip(ylim = c(3.5, 4.5))
Popular : Number of Installs
# Total installs per category; keep the ten categories with the most installs.
gps.pop <- aggregate(Installs ~ Category, data = gps, FUN = sum)
gps.pop <- head(gps.pop[order(gps.pop$Installs, decreasing = TRUE), ], 10)
using ggplot for better plots
# Horizontal bars of the total installs, most installed category at the top.
ggplot(gps.pop, aes(x = reorder(Category, Installs), y = Installs)) +
  geom_col(fill = "chartreuse4") +
  ggtitle("Most Popular Categories") +
  xlab("Category") +
  ylab("Total Number of Installs") +
  coord_flip()
Not so shocking Results!! We could have guessed this. Games are the most popular category of Apps in Play Store. Followed by Communication and Social.
Top Genre
# There are many distinct Genres, so keep only the 15 most frequent ones.
top.genres <- aggregate(App ~ Genres, data = gps, FUN = length)
top.genres <- top.genres[order(top.genres$App, decreasing = TRUE), ]
top.genres <- head(top.genres, 15)
top.genres$Genres <- droplevels(top.genres$Genres)

# Apps that belong to one of the top genres. The row filter is written inline
# so that no variable named `match` shadows base::match().
gps.gen <- gps[gps$Genres %in% top.genres$Genres, ]

# Horizontal bar chart of the number of apps per genre, most frequent genre at
# the top. The axis shows Genres, so it is labelled as such.
ggplot(gps.gen, aes(x = fct_rev(fct_infreq(Genres)))) +
  geom_bar(fill = "chartreuse4") +
  coord_flip() +
  theme(legend.position = "none") +
  labs(
    title = "Top Genres",
    x = "Genres",
    y = "Number of Apps"
  )
Plotting Genre against Category:
# Stacked bar per category, coloured by genre. The legend is hidden because
# there are far too many genres to list.
ggplot(gps, aes(x = Category, fill = Genres)) +
  geom_bar() +
  labs(title = "Category vs Genres") +
  theme(legend.position = "none") +
  coord_flip()
# The 15 most frequent genres inside the FAMILY category.
gps_fam <- gps %>%
  subset(Category == "FAMILY") %>%
  group_by(Genres) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(15)

# Horizontal bars with the app count printed at the end of each bar.
ggplot(gps_fam, aes(x = reorder(Genres, n), y = n)) +
  geom_col(aes(fill = Genres)) +
  geom_text(aes(label = n), hjust = 1) +
  coord_flip() +
  ggtitle("Top Genres in Family Category") +
  xlab("Genres") +
  ylab("Number of Apps") +
  theme_minimal() +
  theme(legend.position = "none")
## Games Category
# The 15 most frequent genres inside the GAME category.
gps_game <- subset(gps, Category == "GAME") %>%
  group_by(Genres) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
gps_game <- head(gps_game, 15)

# Horizontal bars with the app count printed at the end of each bar.
# The title names the Game category, which is what this plot shows.
ggplot(data = gps_game, aes(x = reorder(Genres, n), y = n, fill = Genres)) +
  geom_col() +
  coord_flip() +
  geom_text(aes(label = n), hjust = 1) +
  labs(
    title = "Top Genres in Game Category",
    x = "Genres",
    y = "Number of Apps"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Average rating of the GAME category; used as the reference point when
# choosing the "top rated" cut-off below.
mean(gps.size$Rating[gps.size$Category == "GAME"])
#> [1] 4.094988
# Top rated games (rating above 4.2), most reviewed first.
gps.game <- gps.size[
  gps.size$Category == "GAME" & gps.size$Rating > 4.2,
  c("App", "Rating", "Size", "Type", "Reviews", "Install.cat")
]
gps.game <- gps.game[order(gps.game$Reviews, decreasing = TRUE), ]

# Keep only the large (Size > 80) and the small (Size < 10) games, large ones
# first, and label each row with its storage category.
gps.game <- rbind(
  gps.game[gps.game$Size > 80, ],
  gps.game[gps.game$Size < 10, ]
)
gps.game$Size.cat <- ifelse(gps.game$Size > 80, "Large Storage", "Small Storage")

# First three rows of `df` with the given Type and storage category; `df` is
# already sorted by Reviews, so these are the three most reviewed ones.
top.three <- function(df, type, size.cat) {
  head(df[df$Size.cat == size.cat & df$Type == type, ], 3)
}

# Three games per storage category, separately for free and for paid games.
gps.game.free <- rbind(
  top.three(gps.game, "Free", "Large Storage"),
  top.three(gps.game, "Free", "Small Storage")
)
gps.game.paid <- rbind(
  top.three(gps.game, "Paid", "Large Storage"),
  top.three(gps.game, "Paid", "Small Storage")
)
gps.game <- rbind(gps.game.free, gps.game.paid)
# Plot the selected games: rating on the x axis, one row per game, point colour
# by install grade and point size by number of reviews, faceted by Type (rows)
# and storage category (columns).
ggplot(gps.game, aes(x = Rating, y = App, colour = Install.cat, size = Reviews)) +
  geom_point() +
  facet_grid(Type ~ Size.cat) +
  ggtitle("Which Game to Play?", subtitle = "based on Storage and Type") +
  labs(caption = "Source : Google Play Store") +
  xlab("Rating") +
  ylab("Top Games") +
  # Start from theme_bw() with a small base size, then apply the custom theme.
  theme_bw(
    base_size = 9,
    base_family = "",
    base_line_size = 0.5,
    base_rect_size = 0.5
  ) +
  my_theme
# Top rated apps of every category (rating above 4), most reviewed first.
gps.all <- gps.size[
  gps.size$Rating > 4,
  c("App", "Rating", "Size", "Type", "Reviews", "Category")
]
gps.all <- gps.all[order(gps.all$Reviews, decreasing = TRUE), ]
# Distribution of Size, used to choose the storage cut-offs.
summary(gps.all$Size)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.00 5.60 15.00 24.19 35.00 100.00
# Storage category by Size: below 10 is small, 10 up to (not including) 80 is
# medium, 80 and above is large. findInterval() returns 0, 1 or 2 for these.
storage.labels <- c("Small Storage", "Medium Storage", "Large Storage")
gps.all$Size.cat <- storage.labels[findInterval(gps.all$Size, c(10, 80)) + 1L]

# First three rows of `df` with the given Type and storage category; `df` is
# already sorted by Reviews, so these are the three most reviewed ones.
top.three <- function(df, type, size.cat) {
  head(df[df$Size.cat == size.cat & df$Type == type, ], 3)
}

# Three apps per storage category (nine in total), separately for free and for
# paid apps.
gps.all.free <- rbind(
  top.three(gps.all, "Free", "Large Storage"),
  top.three(gps.all, "Free", "Medium Storage"),
  top.three(gps.all, "Free", "Small Storage")
)
gps.all.paid <- rbind(
  top.three(gps.all, "Paid", "Large Storage"),
  top.three(gps.all, "Paid", "Medium Storage"),
  top.three(gps.all, "Paid", "Small Storage")
)
gps.all <- rbind(gps.all.free, gps.all.paid)
# Plot the selected apps: rating on the x axis, one row per app, point colour
# by category and point size by number of reviews, faceted by Type (rows) and
# storage category (columns).
ggplot(gps.all, aes(x = Rating, y = App)) +
  geom_point(aes(col = Category, size = Reviews)) +
  facet_grid(Type ~ Size.cat) +
  labs(
    title = "Best Google Play Store Apps",
    subtitle = "based on Storage and Type",
    caption = "Source : Google Play Store",
    x = "Rating",
    # This plot covers apps of every category, not only games.
    y = "Top Apps"
  ) +
  # Start from theme_bw() with a small base size, then apply the custom theme.
  theme_bw(
    base_size = 9,
    base_family = "",
    base_line_size = 0.5,
    base_rect_size = 0.5
  ) +
  my_theme