This document is a step-by-step guide to making a good visualization and a plot that is suitable for publication. This R Markdown document will describe:
Importing font:
library(extrafont)
# font_import()
# loadfonts(device = "win")
# `windowsFonts()` lists all available font types
New theme:
# Custom theme overrides for publication-style plots. The plots in this
# document add it on top of theme_bw(). Elements given as NULL are not
# overridden here.
my_theme <- theme(
  panel.background = element_rect(fill = "white", colour = NA),
  # The border is drawn on top of the panel, so it must stay unfilled.
  panel.border = element_rect(fill = NA, colour = "gray"),
  panel.grid.major.y = element_line(colour = "gray", size = 0.2, linetype = 2),
  # `panel.margin.*` was renamed to `panel.spacing.*` in ggplot2 2.2.0.
  panel.spacing.x = NULL,
  panel.spacing.y = NULL,
  legend.background = element_rect(colour = NA),
  legend.key = element_rect(fill = "grey95", colour = "white"),
  legend.key.size = unit(1, "lines"),
  legend.key.height = NULL,
  legend.key.width = NULL,
  legend.text = element_text(size = rel(0.8)),
  legend.text.align = NULL,
  legend.title = element_text(hjust = 0),
  legend.title.align = NULL,
  legend.position = "right",
  legend.direction = NULL,
  legend.justification = "bottom",
  legend.box = NULL,
  # The font families below must be registered with the graphics device
  # (see the extrafont setup at the top of the document).
  plot.title = element_text(
    face = "bold", colour = "chartreuse4",
    family = "Comic Sans MS", size = 17
  ),
  plot.subtitle = element_text(family = "Georgia", size = 10, face = "italic"),
  plot.caption = element_text(family = "Verdana", size = 7, face = "italic"),
  axis.text.x = element_text(size = 9, margin = margin(t = 1.6), vjust = 1),
  axis.text.y = element_text(size = 8),
  axis.title = element_text(family = "Arial", size = 12, face = "bold"),
  axis.ticks = element_blank()
)

This is the Google Play Store dataset obtained from Kaggle. The Play Store apps data has enormous potential to drive app-making businesses to success. Actionable insights can be drawn for developers to work on and capture the Android market.
#> 'data.frame': 10841 obs. of 13 variables:
#> $ App : Factor w/ 9660 levels "- Free Comics - Comic Apps",..: 7229 2563 8998 8113 7294 7125 8171 5589 4948 5826 ...
#> $ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
#> $ Reviews : Factor w/ 6002 levels "0","1","10","100",..: 1183 5924 5681 1947 5924 1310 1464 3385 816 485 ...
#> $ Size : Factor w/ 462 levels "1,000+","1.0M",..: 55 30 368 102 64 222 55 118 146 120 ...
#> $ Installs : Factor w/ 22 levels "0","0+","1,000,000,000+",..: 8 20 13 16 11 17 17 4 4 8 ...
#> $ Type : Factor w/ 4 levels "0","Free","NaN",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ Price : Factor w/ 93 levels "$0.99","$1.00",..: 92 92 92 92 92 92 92 92 92 92 ...
#> $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
#> $ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
#> $ Last.Updated : Factor w/ 1378 levels "1.0.19","April 1, 2016",..: 562 482 117 825 757 901 76 726 1317 670 ...
#> $ Current.Ver : Factor w/ 2834 levels "","0.0.0.2","0.0.1",..: 121 1020 466 2827 279 115 279 2393 1457 1431 ...
#> $ Android.Ver : Factor w/ 35 levels "","1.0 and up",..: 17 17 17 20 22 10 17 20 12 17 ...
#> App Category
#> ROBLOX : 9 FAMILY :1972
#> CBS Sports App - Scores, News, Stats & Watch Live: 8 GAME :1144
#> 8 Ball Pool : 7 TOOLS : 843
#> Candy Crush Saga : 7 MEDICAL : 463
#> Duolingo: Learn Languages Free : 7 BUSINESS : 460
#> ESPN : 7 PRODUCTIVITY: 424
#> (Other) :10796 (Other) :5535
#> Rating Reviews Size Installs
#> Min. : 1.000 0 : 596 Varies with device:1695 1,000,000+ :1579
#> 1st Qu.: 4.000 1 : 272 11M : 198 10,000,000+:1252
#> Median : 4.300 2 : 214 12M : 196 100,000+ :1169
#> Mean : 4.193 3 : 175 14M : 194 10,000+ :1054
#> 3rd Qu.: 4.500 4 : 137 13M : 191 1,000+ : 907
#> Max. :19.000 5 : 108 15M : 184 5,000,000+ : 752
#> NA's :1474 (Other):9339 (Other) :8183 (Other) :4128
#> Type Price Content.Rating Genres
#> 0 : 1 0 :10040 : 1 Tools : 842
#> Free:10039 $0.99 : 148 Adults only 18+: 3 Entertainment: 623
#> NaN : 1 $2.99 : 129 Everyone :8714 Education : 549
#> Paid: 800 $1.99 : 73 Everyone 10+ : 414 Medical : 463
#> $4.99 : 72 Mature 17+ : 499 Business : 460
#> $3.99 : 63 Teen :1208 Productivity : 424
#> (Other): 316 Unrated : 2 (Other) :7480
#> Last.Updated Current.Ver Android.Ver
#> August 3, 2018: 326 Varies with device:1459 4.1 and up :2451
#> August 2, 2018: 304 1.0 : 809 4.0.3 and up :1501
#> July 31, 2018 : 294 1.1 : 264 4.0 and up :1375
#> August 1, 2018: 285 1.2 : 178 Varies with device:1362
#> July 30, 2018 : 211 2.0 : 151 4.4 and up : 980
#> July 25, 2018 : 164 1.3 : 145 2.3 and up : 652
#> (Other) :9257 (Other) :7835 (Other) :2520
# Clean the raw `gps` data frame: convert factor columns to usable types,
# derive the update year/month, and drop unused columns and malformed rows.
gps$App <- as.character(gps$App)
# Convert via character: as.numeric() applied directly to a factor returns
# the internal level codes, not the review counts.
gps$Reviews <- as.numeric(as.character(gps$Reviews))
# Strip the "M" suffix; entries containing "k" (presumably kilobytes) are
# recorded as 0 and non-numeric entries such as "Varies with device" become NA.
gps$Size <- gsub("M", "", gps$Size)
gps$Size <- ifelse(grepl("k", gps$Size), 0, as.numeric(gps$Size))
# "1,000,000+" -> 1000000
gps$Installs <- as.numeric(gsub("[+,]", "", as.character(gps$Installs)))
# "$0.99" -> 0.99
gps$Price <- as.numeric(gsub("\\$", "", as.character(gps$Price)))
gps$Last.Updated <- mdy(gps$Last.Updated)
gps$Year.Updated <- year(gps$Last.Updated)
gps$Month.Updated <- month(gps$Last.Updated)
# Drop the version columns by name rather than by position, so the step does
# not depend on the column order.
gps[c("Current.Ver", "Android.Ver")] <- NULL
# Keep only rows with a valid Type (removes the "0" and "NaN" rows).
gps <- gps[gps$Type %in% c("Free", "Paid"), ]
gps$Type <- droplevels(gps$Type)
str(gps)#> 'data.frame': 10839 obs. of 13 variables:
#> $ App : chr "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite â\200“ FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
#> $ Category : Factor w/ 34 levels "1.9","ART_AND_DESIGN",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
#> $ Reviews : num 1183 5924 5681 1947 5924 ...
#> $ Size : num 19 14 8.7 25 2.8 5.6 19 29 33 3.1 ...
#> $ Installs : num 1e+04 5e+05 5e+06 5e+07 1e+05 5e+04 5e+04 1e+06 1e+06 1e+04 ...
#> $ Type : Factor w/ 2 levels "Free","Paid": 1 1 1 1 1 1 1 1 1 1 ...
#> $ Price : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ Content.Rating: Factor w/ 7 levels "","Adults only 18+",..: 3 3 3 6 3 3 3 3 3 3 ...
#> $ Genres : Factor w/ 120 levels "Action","Action;Action & Adventure",..: 10 13 10 10 12 10 10 10 10 12 ...
#> $ Last.Updated : Date, format: "2018-01-07" "2018-01-15" ...
#> $ Year.Updated : num 2018 2018 2018 2018 2018 ...
#> $ Month.Updated : num 1 1 8 6 6 3 4 6 9 7 ...
#> App Category Rating Reviews
#> Length:10839 FAMILY :1971 Min. :1.000 Min. : 1
#> Class :character GAME :1144 1st Qu.:4.000 1st Qu.:1158
#> Mode :character TOOLS : 843 Median :4.300 Median :2747
#> MEDICAL : 463 Mean :4.192 Mean :2744
#> BUSINESS : 460 3rd Qu.:4.500 3rd Qu.:4320
#> PRODUCTIVITY: 424 Max. :5.000 Max. :6002
#> (Other) :5534 NA's :1473
#> Size Installs Type Price
#> Min. : 0.0 Min. :0.000e+00 Free:10039 Min. : 0.000
#> 1st Qu.: 4.9 1st Qu.:3.000e+03 Paid: 800 1st Qu.: 0.000
#> Median : 13.0 Median :1.000e+05 Median : 0.000
#> Mean : 21.5 Mean :1.547e+07 Mean : 1.028
#> 3rd Qu.: 30.0 3rd Qu.:5.000e+06 3rd Qu.: 0.000
#> Max. :100.0 Max. :1.000e+09 Max. :400.000
#> NA's :1694
#> Content.Rating Genres Last.Updated
#> : 0 Tools : 842 Min. :2010-05-21
#> Adults only 18+: 3 Entertainment: 623 1st Qu.:2017-09-20
#> Everyone :8714 Education : 549 Median :2018-05-24
#> Everyone 10+ : 413 Medical : 463 Mean :2017-11-21
#> Mature 17+ : 499 Business : 460 3rd Qu.:2018-07-20
#> Teen :1208 Productivity : 424 Max. :2018-08-08
#> Unrated : 2 (Other) :7478
#> Year.Updated Month.Updated
#> Min. :2010 Min. : 1.000
#> 1st Qu.:2017 1st Qu.: 5.000
#> Median :2018 Median : 7.000
#> Mean :2017 Mean : 6.422
#> 3rd Qu.:2018 3rd Qu.: 8.000
#> Max. :2018 Max. :12.000
#>
convert number of installs to install category:
#' Convert install counts to an install grade.
#'
#' Grades: "Grade C" for fewer than 10,001 installs, "Grade B" for
#' 10,001 to 1,000,000, "Grade A" for 1,000,001 to 100,000,000 and
#' "Grade A+" above that.
#'
#' @param x Numeric vector of install counts (a single value also works, so
#'   existing `sapply(installs, ic)` calls keep working).
#' @return Character vector of grades, the same length as `x`; `NA` inputs
#'   give `NA` instead of raising an error.
ic <- function(x) {
  grades <- c("Grade C", "Grade B", "Grade A", "Grade A+")
  # Lower bounds of Grade B, Grade A and Grade A+. findInterval() uses
  # left-closed intervals, which matches the `>=` / `<` comparisons.
  lower_bounds <- c(10001, 1000001, 100000001)
  grades[findInterval(x, lower_bounds) + 1L]
}
# Grade every app by its install count and store the result as a factor whose
# levels run from the lowest to the highest grade. vapply() guarantees a
# character vector even when `gps` has no rows.
gps$Install.cat <- factor(
  vapply(gps$Installs, ic, character(1)),
  levels = c("Grade C", "Grade B", "Grade A", "Grade A+")
)
head(gps) #check install category#> App Category Rating
#> 1 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1
#> 2 Coloring book moana ART_AND_DESIGN 3.9
#> 3 U Launcher Lite â\200“ FREE Live Cool Themes, Hide Apps ART_AND_DESIGN 4.7
#> 4 Sketch - Draw & Paint ART_AND_DESIGN 4.5
#> 5 Pixel Draw - Number Art Coloring Book ART_AND_DESIGN 4.3
#> 6 Paper flowers instructions ART_AND_DESIGN 4.4
#> Reviews Size Installs Type Price Content.Rating Genres
#> 1 1183 19.0 1e+04 Free 0 Everyone Art & Design
#> 2 5924 14.0 5e+05 Free 0 Everyone Art & Design;Pretend Play
#> 3 5681 8.7 5e+06 Free 0 Everyone Art & Design
#> 4 1947 25.0 5e+07 Free 0 Teen Art & Design
#> 5 5924 2.8 1e+05 Free 0 Everyone Art & Design;Creativity
#> 6 1310 5.6 5e+04 Free 0 Everyone Art & Design
#> Last.Updated Year.Updated Month.Updated Install.cat
#> 1 2018-01-07 2018 1 Grade C
#> 2 2018-01-15 2018 1 Grade B
#> 3 2018-08-01 2018 8 Grade A
#> 4 2018-06-08 2018 6 Grade A
#> 5 2018-06-20 2018 6 Grade B
#> 6 2017-03-26 2017 3 Grade B
NA value
#> App Category Rating Reviews Size
#> 0 0 1464 0 1525
#> Installs Type Price Content.Rating Genres
#> 0 0 0 0 0
#> Last.Updated Year.Updated Month.Updated Install.cat
#> 0 0 0 0
Missing values make up more than 10% of our data, so we have to convert the NA values.
Remove/convert NA rating :
# Count the apps without a rating in each install grade and plot the counts.
# Alternative:
# rating.na <- aggregate.data.frame(is.na(gps$Rating), by=list(gps$Install.cat), sum)
rating.na <- aggregate(
  App ~ Install.cat,
  data = gps[is.na(gps$Rating), ],
  FUN = length
)
plot(rating.na)

NA ratings: Most of the missing values are in the Grade C install category, meaning that apps with fewer than 10k installs often have no rating.
# Impute missing ratings: apps with more than 10,000 installs get the overall
# mean rating, the rest (Grade C installs) get 0. Both branches use the same
# 10,000 boundary so no install count is left unhandled.
rating_missing <- is.na(gps$Rating)
gps[rating_missing & gps$Installs > 10000, "Rating"] <- mean(gps$Rating, na.rm = TRUE)
gps[rating_missing & gps$Installs <= 10000, "Rating"] <- 0
colSums(is.na(gps)) #there is no more NA Rating#> App Category Rating Reviews Size
#> 0 0 0 0 1525
#> Installs Type Price Content.Rating Genres
#> 0 0 0 0 0
#> Last.Updated Year.Updated Month.Updated Install.cat
#> 0 0 0 0
We can use na.omit : (removing na value in our dataframe)
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 0.00 4.70 13.00 21.27 29.00 100.00 1525
1525 NA size(Varies with device)
# Number of apps by the year of their last update.
gps_Year <- group_by(gps, Year.Updated) %>%
  summarise(n = n())
ggplot(data = gps_Year, aes(x = Year.Updated, y = n)) +
  geom_line(colour = "chartreuse4") +
  geom_point(colour = "red") +
  scale_y_continuous(breaks = seq(0, 7000, by = 500)) +
  labs(
    title = "GPS Apps Year Updated",
    y = "Number of Apps"
  ) +
  theme_minimal()

# Number of apps by the calendar month of their last update.
gps_Month <- group_by(gps, Month.Updated) %>%
  summarise(n = n())
ggplot(data = gps_Month, aes(x = Month.Updated, y = n)) +
  geom_line(colour = "chartreuse4") +
  geom_point(colour = "red") +
  scale_y_continuous(breaks = seq(0, 3000, by = 200)) +
  # month.abb is base R's English "Jan" ... "Dec" vector.
  scale_x_continuous(breaks = 1:12, labels = month.abb) +
  labs(
    title = "GPS Apps Month Updated",
    y = "Number of Apps"
  ) +
  theme_minimal()

I don't know why most of the apps (around 30 percent) were updated in the month of July.
Top Rated Category
# Drop unused Category levels (removes the stray "1.9" level).
gps$Category <- droplevels(gps$Category)

# Horizontal bar chart of app counts per category, most frequent on top.
ggplot(gps, aes(x = fct_rev(fct_infreq(Category)))) +
  geom_bar(fill = "chartreuse4") +
  labs(
    title = "GPS Apps Category",
    x = "Category",
    y = "Number of Apps"
  ) +
  coord_flip()

As per the graph, the Family category has the largest number of apps in this dataset, followed by Games and Tools.
overall more than 4.0 Rating
# Mean rating per category, keeping the ten best-rated categories.
gps.rate <- aggregate(Rating ~ Category, data = gps, FUN = mean)
gps.rate <- head(gps.rate[order(gps.rate$Rating, decreasing = TRUE), ], n = 10)
gps.rate$Category <- droplevels(gps.rate$Category)

using ggplot for better plots:

# The displayed rating range is restricted to 3.5-4.5.
ggplot(gps.rate, aes(x = reorder(Category, Rating), y = Rating)) +
  geom_col(fill = "chartreuse4") +
  labs(
    title = "Top Rated App Categories",
    x = "Category",
    y = "Ratings"
  ) +
  coord_flip(ylim = c(3.5, 4.5))

Popular : Number of Installs
# Total installs per category, keeping the ten most-installed categories.
gps.pop <- aggregate(Installs ~ Category, data = gps, FUN = sum)
gps.pop <- head(gps.pop[order(gps.pop$Installs, decreasing = TRUE), ], n = 10)

using ggplot for better plots

ggplot(gps.pop, aes(x = reorder(Category, Installs), y = Installs)) +
  geom_col(fill = "chartreuse4") +
  labs(
    title = "Most Popular Categories",
    x = "Category",
    y = "Total Number of Installs"
  ) +
  coord_flip()

Not so shocking Results!! We could have guessed this. Games are the most popular category of Apps in Play Store. Followed by Communication and Social.
Top Genre
# There are many genres, so keep only the 15 most frequent ones.
top.genres <- aggregate(App ~ Genres, data = gps, FUN = length)
top.genres <- top.genres[order(top.genres$App, decreasing = TRUE), ]
top.genres <- head(top.genres, 15)
top.genres$Genres <- droplevels(top.genres$Genres)
# Logical mask of the rows of `gps` whose genre is among the top 15.
match <- gps$Genres %in% top.genres$Genres
gps.gen <- gps[match, ]
# Horizontal bar chart of app counts for the top genres. The x aesthetic is
# Genres, so the axis and title are labelled accordingly.
ggplot(gps.gen, aes(x = fct_rev(fct_infreq(Genres)))) +
  geom_bar(fill = "chartreuse4") +
  coord_flip() +
  theme(legend.position = "none") +
  labs(
    title = "Top Genres",
    x = "Genres",
    y = "Number of Apps"
  )

Plotting Genre against Category:
# App counts per category, with each bar split by genre (legend hidden
# because there are too many genres to list).
ggplot(gps, aes(x = Category)) +
  geom_bar(aes(fill = Genres)) +
  coord_flip() +
  theme(legend.position = "none") +
  ggtitle("Category vs Genres")

# The 15 most frequent genres within the FAMILY category.
gps_fam <- subset(gps, Category == "FAMILY") %>%
  group_by(Genres) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
gps_fam <- head(gps_fam, 15)
ggplot(data = gps_fam, aes(x = reorder(Genres, n), y = n, fill = Genres)) +
  geom_col() +
  coord_flip() +
  # Print each count at the end of its bar.
  geom_text(aes(label = n), hjust = 1) +
  labs(
    title = "Top Genres in Family Category",
    x = "Genres",
    y = "Number of Apps"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## Games Category
# The 15 most frequent genres within the GAME category.
gps_game <- subset(gps, Category == "GAME") %>%
  group_by(Genres) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
gps_game <- head(gps_game, 15)
ggplot(data = gps_game, aes(x = reorder(Genres, n), y = n, fill = Genres)) +
  geom_col() +
  coord_flip() +
  # Print each count at the end of its bar.
  geom_text(aes(label = n), hjust = 1) +
  # This plot shows the GAME subset, so the title names that category.
  labs(
    title = "Top Genres in Game Category",
    x = "Genres",
    y = "Number of Apps"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# average rating of GAME category for subsetting top rated
# Mean rating of the GAME category, used to choose the top-rated cut-off.
mean(gps.size$Rating[gps.size$Category == "GAME"])
#> [1] 4.094988

# Top-rated games (rating above 4.2), most-reviewed first.
gps.game <- gps.size[
  gps.size$Category == "GAME" & gps.size$Rating > 4.2,
  c("App", "Rating", "Size", "Type", "Reviews", "Install.cat")
]
gps.game <- gps.game[order(gps.game$Reviews, decreasing = TRUE), ]

# Keep only the large (> 80) and small (< 10) games, large ones first.
gps.game <- rbind(
  gps.game[gps.game$Size > 80, ],
  gps.game[gps.game$Size < 10, ]
)
gps.game$Size.cat <- ifelse(gps.game$Size > 80, "Large Storage", "Small Storage")

# Three most-reviewed games per storage class, for free and for paid games.
gps.game.free <- do.call(rbind, lapply(
  c("Large Storage", "Small Storage"),
  function(size_cat) {
    head(gps.game[gps.game$Size.cat == size_cat & gps.game$Type == "Free", ], 3)
  }
))
gps.game.paid <- do.call(rbind, lapply(
  c("Large Storage", "Small Storage"),
  function(size_cat) {
    head(gps.game[gps.game$Size.cat == size_cat & gps.game$Type == "Paid", ], 3)
  }
))
gps.game <- rbind(gps.game.free, gps.game.paid)
# Plotting
# Selected games by rating, faceted by type (rows) and storage class
# (columns); point colour shows the install grade, point size the reviews.
ggplot(gps.game, aes(x = Rating, y = App)) +
  geom_point(aes(col = Install.cat, size = Reviews)) +
  facet_grid(Type ~ Size.cat) +
  labs(
    title = "Which Game to Play?",
    subtitle = "based on Storage and Type",
    caption = "Source : Google Play Store",
    x = "Rating",
    y = "Top Games"
  ) +
  theme_bw(
    base_size = 9,
    base_family = "",
    base_line_size = 0.5,
    base_rect_size = 0.5
  ) +
  my_theme

# Top-rated apps (rating above 4) across all categories.
gps.all <- gps.size[
  gps.size$Rating > 4,
  c("App", "Rating", "Size", "Type", "Reviews", "Category")
]
# Most-reviewed apps first.
gps.all <- gps.all[order(gps.all$Reviews, decreasing = TRUE), ]
summary(gps.all$Size)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.00 5.60 15.00 24.19 35.00 100.00

# Storage class by size: below 10 is small, below 80 is medium, otherwise large.
gps.all$Size.cat <- c("Small Storage", "Medium Storage", "Large Storage")[
  findInterval(gps.all$Size, c(10, 80)) + 1L
]

# Three most-reviewed apps per storage class, for free and for paid apps
# (up to nine of each).
gps.all.free <- do.call(rbind, lapply(
  c("Large Storage", "Medium Storage", "Small Storage"),
  function(size_cat) {
    head(gps.all[gps.all$Size.cat == size_cat & gps.all$Type == "Free", ], 3)
  }
))
gps.all.paid <- do.call(rbind, lapply(
  c("Large Storage", "Medium Storage", "Small Storage"),
  function(size_cat) {
    head(gps.all[gps.all$Size.cat == size_cat & gps.all$Type == "Paid", ], 3)
  }
))
gps.all <- rbind(gps.all.free, gps.all.paid)
# Plotting
# Selected apps by rating, faceted by type (rows) and storage class
# (columns); point colour shows the category, point size the reviews.
ggplot(gps.all, aes(x = Rating, y = App)) +
  geom_point(aes(col = Category, size = Reviews)) +
  facet_grid(Type ~ Size.cat) +
  # The data covers every category, so the y axis is labelled "Top Apps".
  labs(
    title = "Best Google Play Store Apps",
    subtitle = "based on Storage and Type",
    caption = "Source : Google Play Store",
    x = "Rating",
    y = "Top Apps"
  ) +
  theme_bw(
    base_size = 9,
    base_family = "",
    base_line_size = 0.5,
    base_rect_size = 0.5
  ) +
  my_theme