# Point install.packages() at the HTTPS CRAN mirror so packages are not
# downloaded over plain HTTP; `repos` is given as a named character vector.
options(repos = c(CRAN = "https://cran.rstudio.com/"))

##Insert dataframe and the right packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(lubridate)
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(modelr)
# Install naniar only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("naniar", quietly = TRUE)) {
  install.packages("naniar")
}
## Installing package into 'C:/Users/Costa/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'naniar' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Costa\AppData\Local\Temp\RtmpEBZn2u\downloaded_packages
library(naniar)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact
# NOTE(review): dplyr is already attached by library(tidyverse), so this call
# presumably does not move it ahead of plyr on the search path. If so, plyr's
# arrange/count/desc/mutate/rename/summarise/summarize still mask the dplyr
# versions (the one-row result of the grouped summary later in this script is
# consistent with that). Prefer explicit dplyr::verb() calls where grouping
# matters, or load plyr before the tidyverse.
library(dplyr)

# Load the raw Play Store export; column types are guessed by readr.
googleplaystore <- readr::read_csv(
  file = "D:/ANALYTICS/playstore/googleplaystore.csv"
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 10841 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): App, Category, Size, Installs, Type, Price, Content Rating, Genres...
## dbl  (2): Rating, Reviews
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data viewer only in an interactive session; View() needs a viewer
# window and is of no use when the script is run non-interactively.
if (interactive()) {
  View(googleplaystore)
}

##Next step: compute the number of duplicate rows in the dataset using the duplicated() and sum() functions. The duplicated() function returns a logical vector indicating which rows are duplicates, and the sum() function counts the number of TRUE values in the vector.

Shows the first six rows of the dataset using the head() function. The head() function returns the first n rows of a dataset, where n is a specified number. By default, n is set to 6.

Shows the structure of the dataset using the str() function. The str() function shows the structure of an R object, including the class, length, and first few elements of each component.

Visualizes the missing values in the dataset using the vis_miss() function from the naniar package. The vis_miss() function creates a visualization that shows the distribution of missing values in the dataset, including the number and percentage of missing values for each column. Note that the naniar package must be installed and loaded in the current R session in order to use the vis_miss() function.

# Number of rows that are exact repeats of an earlier row.
googleplaystore %>% duplicated() %>% sum()
## [1] 483
# Preview the first six rows.
googleplaystore %>% head(n = 6)
## # A tibble: 6 × 13
##   App    Categ…¹ Rating Reviews Size  Insta…² Type  Price Conte…³ Genres Last …⁴
##   <chr>  <chr>    <dbl>   <dbl> <chr> <chr>   <chr> <chr> <chr>   <chr>  <chr>  
## 1 Photo… ART_AN…    4.1     159 19M   10,000+ Free  0     Everyo… Art &… Januar…
## 2 Color… ART_AN…    3.9     967 14M   500,00… Free  0     Everyo… Art &… Januar…
## 3 U Lau… ART_AN…    4.7   87510 8.7M  5,000,… Free  0     Everyo… Art &… August…
## 4 Sketc… ART_AN…    4.5  215644 25M   50,000… Free  0     Teen    Art &… June 8…
## 5 Pixel… ART_AN…    4.3     967 2.8M  100,00… Free  0     Everyo… Art &… June 2…
## 6 Paper… ART_AN…    4.4     167 5.6M  50,000+ Free  0     Everyo… Art &… March …
## # … with 2 more variables: `Current Ver` <chr>, `Android Ver` <chr>, and
## #   abbreviated variable names ¹​Category, ²​Installs, ³​`Content Rating`,
## #   ⁴​`Last Updated`
# Show each column's type and first few values.
googleplaystore %>% str()
## spc_tbl_ [10,841 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ App           : chr [1:10841] "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite – FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Category      : chr [1:10841] "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Rating        : num [1:10841] 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : num [1:10841] 159 967 87510 215644 967 ...
##  $ Size          : chr [1:10841] "19M" "14M" "8.7M" "25M" ...
##  $ Installs      : chr [1:10841] "10,000+" "500,000+" "5,000,000+" "50,000,000+" ...
##  $ Type          : chr [1:10841] "Free" "Free" "Free" "Free" ...
##  $ Price         : chr [1:10841] "0" "0" "0" "0" ...
##  $ Content Rating: chr [1:10841] "Everyone" "Everyone" "Everyone" "Teen" ...
##  $ Genres        : chr [1:10841] "Art & Design" "Art & Design;Pretend Play" "Art & Design" "Art & Design" ...
##  $ Last Updated  : chr [1:10841] "January 7, 2018" "January 15, 2018" "August 1, 2018" "June 8, 2018" ...
##  $ Current Ver   : chr [1:10841] "1.0.0" "2.0.0" "1.2.4" "Varies with device" ...
##  $ Android Ver   : chr [1:10841] "4.0.3 and up" "4.0.3 and up" "4.0.3 and up" "4.2 and up" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   App = col_character(),
##   ..   Category = col_character(),
##   ..   Rating = col_double(),
##   ..   Reviews = col_double(),
##   ..   Size = col_character(),
##   ..   Installs = col_character(),
##   ..   Type = col_character(),
##   ..   Price = col_character(),
##   ..   `Content Rating` = col_character(),
##   ..   Genres = col_character(),
##   ..   `Last Updated` = col_character(),
##   ..   `Current Ver` = col_character(),
##   ..   `Android Ver` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Plot where values are missing, column by column.
googleplaystore %>% vis_miss()
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the visdat package.
##   Please report the issue at <https://github.com/ropensci/visdat/issues>.

#dropping duplicate records

# Keep only the first occurrence of every fully identical row.
app_data <- googleplaystore[!duplicated(googleplaystore), ]
app_data %>% glimpse()
## Rows: 10,358
## Columns: 13
## $ App              <chr> "Photo Editor & Candy Camera & Grid & ScrapBook", "Co…
## $ Category         <chr> "ART_AND_DESIGN", "ART_AND_DESIGN", "ART_AND_DESIGN",…
## $ Rating           <dbl> 4.1, 3.9, 4.7, 4.5, 4.3, 4.4, 3.8, 4.1, 4.4, 4.7, 4.4…
## $ Reviews          <dbl> 159, 967, 87510, 215644, 967, 167, 178, 36815, 13791,…
## $ Size             <chr> "19M", "14M", "8.7M", "25M", "2.8M", "5.6M", "19M", "…
## $ Installs         <chr> "10,000+", "500,000+", "5,000,000+", "50,000,000+", "…
## $ Type             <chr> "Free", "Free", "Free", "Free", "Free", "Free", "Free…
## $ Price            <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0"…
## $ `Content Rating` <chr> "Everyone", "Everyone", "Everyone", "Teen", "Everyone…
## $ Genres           <chr> "Art & Design", "Art & Design;Pretend Play", "Art & D…
## $ `Last Updated`   <chr> "January 7, 2018", "January 15, 2018", "August 1, 201…
## $ `Current Ver`    <chr> "1.0.0", "2.0.0", "1.2.4", "Varies with device", "1.1…
## $ `Android Ver`    <chr> "4.0.3 and up", "4.0.3 and up", "4.0.3 and up", "4.2 …
# List the rows that have no usable rating. is.na() is TRUE for NaN as well as
# NA, so a single test covers both cases.
app_data %>%
  select(Rating) %>%
  filter(is.na(Rating))
## # A tibble: 1,465 × 1
##    Rating
##     <dbl>
##  1    NaN
##  2    NaN
##  3    NaN
##  4    NaN
##  5    NaN
##  6    NaN
##  7    NaN
##  8    NaN
##  9    NaN
## 10    NaN
## # … with 1,455 more rows

Replace the missing (NaN) values in “Rating” with 0

# Overwrite every missing rating (is.na() is TRUE for both NA and NaN) with 0.
# NOTE(review): 0 is lower than any rating shown in the data, so these rows
# will pull down later averages of Rating — confirm this is intended.
app_data$Rating[is.na(app_data$Rating)] <- 0
print(app_data)
## # A tibble: 10,358 × 13
##    App   Categ…¹ Rating Reviews Size  Insta…² Type  Price Conte…³ Genres Last …⁴
##    <chr> <chr>    <dbl>   <dbl> <chr> <chr>   <chr> <chr> <chr>   <chr>  <chr>  
##  1 Phot… ART_AN…    4.1     159 19M   10,000+ Free  0     Everyo… Art &… Januar…
##  2 Colo… ART_AN…    3.9     967 14M   500,00… Free  0     Everyo… Art &… Januar…
##  3 U La… ART_AN…    4.7   87510 8.7M  5,000,… Free  0     Everyo… Art &… August…
##  4 Sket… ART_AN…    4.5  215644 25M   50,000… Free  0     Teen    Art &… June 8…
##  5 Pixe… ART_AN…    4.3     967 2.8M  100,00… Free  0     Everyo… Art &… June 2…
##  6 Pape… ART_AN…    4.4     167 5.6M  50,000+ Free  0     Everyo… Art &… March …
##  7 Smok… ART_AN…    3.8     178 19M   50,000+ Free  0     Everyo… Art &… April …
##  8 Infi… ART_AN…    4.1   36815 29M   1,000,… Free  0     Everyo… Art &… June 1…
##  9 Gard… ART_AN…    4.4   13791 33M   1,000,… Free  0     Everyo… Art &… Septem…
## 10 Kids… ART_AN…    4.7     121 3.1M  10,000+ Free  0     Everyo… Art &… July 3…
## # … with 10,348 more rows, 2 more variables: `Current Ver` <chr>,
## #   `Android Ver` <chr>, and abbreviated variable names ¹​Category, ²​Installs,
## #   ³​`Content Rating`, ⁴​`Last Updated`

Now there are no NaN values in “Rating”

# Re-run the missing-rating check; an empty result means nothing is left to fix.
app_data %>%
  select(Rating) %>%
  filter(is.na(Rating))
## # A tibble: 0 × 1
## # … with 1 variable: Rating <dbl>
# Column types after de-duplication and rating clean-up.
app_data %>% str()
## tibble [10,358 × 13] (S3: tbl_df/tbl/data.frame)
##  $ App           : chr [1:10358] "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite – FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Category      : chr [1:10358] "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Rating        : num [1:10358] 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : num [1:10358] 159 967 87510 215644 967 ...
##  $ Size          : chr [1:10358] "19M" "14M" "8.7M" "25M" ...
##  $ Installs      : chr [1:10358] "10,000+" "500,000+" "5,000,000+" "50,000,000+" ...
##  $ Type          : chr [1:10358] "Free" "Free" "Free" "Free" ...
##  $ Price         : chr [1:10358] "0" "0" "0" "0" ...
##  $ Content Rating: chr [1:10358] "Everyone" "Everyone" "Everyone" "Teen" ...
##  $ Genres        : chr [1:10358] "Art & Design" "Art & Design;Pretend Play" "Art & Design" "Art & Design" ...
##  $ Last Updated  : chr [1:10358] "January 7, 2018" "January 15, 2018" "August 1, 2018" "June 8, 2018" ...
##  $ Current Ver   : chr [1:10358] "1.0.0" "2.0.0" "1.2.4" "Varies with device" ...
##  $ Android Ver   : chr [1:10358] "4.0.3 and up" "4.0.3 and up" "4.0.3 and up" "4.2 and up" ...

##Replacing symbols ## Price - Remove “$” sign

# Strip the dollar sign so Price can later be converted to a number.
app_data$Price <- app_data$Price %>%
  str_replace_all("\\$", "")

##Installs - Remove “+” , “,” symbols

# Strip the trailing plus sign and the thousands separators from Installs.
app_data$Installs <- app_data$Installs %>%
  str_replace_all("\\+", "") %>%
  str_replace_all("\\,", "")

##Standardize types: some columns are of character type whereas they should be numeric or integer. Columns converted: Rating, Reviews, Price, Installs

# Convert the four measure columns to numbers.
# Rating and Reviews were already read as doubles, so these two calls only
# keep the conversions together in one place.
app_data$Rating <- as.numeric(app_data$Rating)
app_data$Reviews <- as.numeric(app_data$Reviews)
# Text that is not a number becomes NA here (hence the coercion warning).
app_data$Price <- as.numeric(app_data$Price)
## Warning: NAs introduced by coercion
# Installs is stored as a double rather than an integer. Individual values fit
# in an integer, but category totals do not: the top game rows printed at the
# end of this script alone add up to more than .Machine$integer.max, and
# integer sum()/cumsum() return NA with a warning on overflow.
app_data$Installs <- as.numeric(app_data$Installs)
## Warning: NAs introduced by coercion

Rating is only up to 5

# Display the rows whose Rating lies within the valid 0-5 range.
# NOTE(review): the result is only printed, not assigned, so app_data itself
# still contains any out-of-range row — confirm whether it should be stored.
app_data %>%
  filter(Rating >= 0, Rating <= 5)
## # A tibble: 10,357 × 13
##    App   Categ…¹ Rating Reviews Size  Insta…² Type  Price Conte…³ Genres Last …⁴
##    <chr> <chr>    <dbl>   <dbl> <chr>   <int> <chr> <dbl> <chr>   <chr>  <chr>  
##  1 Phot… ART_AN…    4.1     159 19M       1e4 Free      0 Everyo… Art &… Januar…
##  2 Colo… ART_AN…    3.9     967 14M       5e5 Free      0 Everyo… Art &… Januar…
##  3 U La… ART_AN…    4.7   87510 8.7M      5e6 Free      0 Everyo… Art &… August…
##  4 Sket… ART_AN…    4.5  215644 25M       5e7 Free      0 Teen    Art &… June 8…
##  5 Pixe… ART_AN…    4.3     967 2.8M      1e5 Free      0 Everyo… Art &… June 2…
##  6 Pape… ART_AN…    4.4     167 5.6M      5e4 Free      0 Everyo… Art &… March …
##  7 Smok… ART_AN…    3.8     178 19M       5e4 Free      0 Everyo… Art &… April …
##  8 Infi… ART_AN…    4.1   36815 29M       1e6 Free      0 Everyo… Art &… June 1…
##  9 Gard… ART_AN…    4.4   13791 33M       1e6 Free      0 Everyo… Art &… Septem…
## 10 Kids… ART_AN…    4.7     121 3.1M      1e4 Free      0 Everyo… Art &… July 3…
## # … with 10,347 more rows, 2 more variables: `Current Ver` <chr>,
## #   `Android Ver` <chr>, and abbreviated variable names ¹​Category, ²​Installs,
## #   ³​`Content Rating`, ⁴​`Last Updated`
# Distinct category labels, in order of first appearance.
app_data %>% pull(Category) %>% unique()
##  [1] "ART_AND_DESIGN"      "AUTO_AND_VEHICLES"   "BEAUTY"             
##  [4] "BOOKS_AND_REFERENCE" "BUSINESS"            "COMICS"             
##  [7] "COMMUNICATION"       "DATING"              "EDUCATION"          
## [10] "ENTERTAINMENT"       "EVENTS"              "FINANCE"            
## [13] "FOOD_AND_DRINK"      "HEALTH_AND_FITNESS"  "HOUSE_AND_HOME"     
## [16] "LIBRARIES_AND_DEMO"  "LIFESTYLE"           "GAME"               
## [19] "FAMILY"              "MEDICAL"             "SOCIAL"             
## [22] "SHOPPING"            "PHOTOGRAPHY"         "SPORTS"             
## [25] "TRAVEL_AND_LOCAL"    "TOOLS"               "PERSONALIZATION"    
## [28] "PRODUCTIVITY"        "PARENTING"           "WEATHER"            
## [31] "VIDEO_PLAYERS"       "NEWS_AND_MAGAZINES"  "MAPS_AND_NAVIGATION"
## [34] "1.9"

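We have 33 valid categories (the 34th value, “1.9”, comes from an invalid row) and 9,660 unique apps (9,659 once that invalid row is dropped)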

##Remove duplicates by App

# Keep one row per app name (the first one encountered), then drop the row
# whose Category is the invalid value "1.9".
newapp_data <- app_data %>%
  distinct(App, .keep_all = TRUE) %>%
  filter(Category != "1.9")
# Confirm that only real category labels remain.
newapp_data %>% pull(Category) %>% unique()
##  [1] "ART_AND_DESIGN"      "AUTO_AND_VEHICLES"   "BEAUTY"             
##  [4] "BOOKS_AND_REFERENCE" "BUSINESS"            "COMICS"             
##  [7] "COMMUNICATION"       "DATING"              "EDUCATION"          
## [10] "ENTERTAINMENT"       "EVENTS"              "FINANCE"            
## [13] "FOOD_AND_DRINK"      "HEALTH_AND_FITNESS"  "HOUSE_AND_HOME"     
## [16] "LIBRARIES_AND_DEMO"  "LIFESTYLE"           "GAME"               
## [19] "FAMILY"              "MEDICAL"             "SOCIAL"             
## [22] "SHOPPING"            "PHOTOGRAPHY"         "SPORTS"             
## [25] "TRAVEL_AND_LOCAL"    "TOOLS"               "PERSONALIZATION"    
## [28] "PRODUCTIVITY"        "PARENTING"           "WEATHER"            
## [31] "VIDEO_PLAYERS"       "NEWS_AND_MAGAZINES"  "MAPS_AND_NAVIGATION"
# Column types of the per-app table.
newapp_data %>% str()
## tibble [9,659 × 13] (S3: tbl_df/tbl/data.frame)
##  $ App           : chr [1:9659] "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite – FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
##  $ Category      : chr [1:9659] "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
##  $ Rating        : num [1:9659] 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
##  $ Reviews       : num [1:9659] 159 967 87510 215644 967 ...
##  $ Size          : chr [1:9659] "19M" "14M" "8.7M" "25M" ...
##  $ Installs      : int [1:9659] 10000 500000 5000000 50000000 100000 50000 50000 1000000 1000000 10000 ...
##  $ Type          : chr [1:9659] "Free" "Free" "Free" "Free" ...
##  $ Price         : num [1:9659] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Content Rating: chr [1:9659] "Everyone" "Everyone" "Everyone" "Teen" ...
##  $ Genres        : chr [1:9659] "Art & Design" "Art & Design;Pretend Play" "Art & Design" "Art & Design" ...
##  $ Last Updated  : chr [1:9659] "January 7, 2018" "January 15, 2018" "August 1, 2018" "June 8, 2018" ...
##  $ Current Ver   : chr [1:9659] "1.0.0" "2.0.0" "1.2.4" "Varies with device" ...
##  $ Android Ver   : chr [1:9659] "4.0.3 and up" "4.0.3 and up" "4.0.3 and up" "4.2 and up" ...
# Turn off scientific notation so large counts are printed in full.
options(scipen = 999)
# Horizontal bar chart of Installs per Category; the apps of a category are
# stacked, so each bar's length is the category total.
newapp_data %>%
  ggplot(aes(x = Category, y = Installs)) +
  geom_col(width = 0.7, fill = "red") +
  coord_flip() +
  ggtitle("Total App Installation for Each Category") +
  theme(axis.text.x = element_text(angle = 90))

##The most populars: 1) by Reviews

# Horizontal bar chart of Reviews per Category; the apps of a category are
# stacked, so each bar's length is the category total.
newapp_data %>%
  ggplot(aes(x = Category, y = Reviews)) +
  geom_col(width = 0.7, fill = "indianred") +
  coord_flip() +
  ggtitle("Total App Reviews for Each Category") +
  theme(axis.text.x = element_text(angle = 90))

Games are reviewed more than any other category, with a very big difference from the second one, Communication apps.

2. By the number of installations

# Horizontal bar chart of Installs per Category (same data as the earlier red
# chart, drawn with wider bars and a different fill colour).
newapp_data %>%
  ggplot(aes(x = Category, y = Installs)) +
  geom_col(width = 0.9, fill = "indianred") +
  coord_flip() +
  ggtitle("Total App Installations for Each Category") +
  theme(axis.text.x = element_text(angle = 90))

Again, Games and Communication have the most installations.

What is the average rating?

# Box plot of the Rating column.
# NOTE(review): newapp_data1 is not created anywhere in the visible part of
# this script — confirm where it is defined before running this chunk.
newapp_data1 %>%
  ggplot(aes(y = Rating)) +
  geom_boxplot() +
  labs(title = "App Rating", y = "Rating")

Most of the ratings are between 3.5 and 4, with a maximum of 5.

# Jittered scatter of Rating against Reviews, coloured by Installs.
# NOTE(review): newapp_data1 is not created anywhere in the visible part of
# this script — confirm where it is defined before running this chunk.
newapp_data1 %>%
  ggplot(aes(x = Reviews, y = Rating, color = Installs)) +
  geom_jitter() +
  ggtitle("Relationship between reviews, rating, and installations") +
  xlab("Number of Reviews") +
  ylab("Rating")

The rating stays roughly stable regardless of the number of installations, but as the number of reviews grows, the number of installations increases as well.

##Correlation: below we can see the correlation between those two factors:

# Install GGally only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
## Installing package into 'C:/Users/Costa/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'GGally' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Costa\AppData\Local\Temp\RtmpEBZn2u\downloaded_packages
library (GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Keep the first three columns and draw their pairwise correlations as circles.
# NOTE(review): newapp_data1 is not created in the visible part of this script;
# ggcorr() presumably needs these three columns to be numeric — confirm.
newapp_data1cor <- newapp_data1[, 1:3]
ggcorr(newapp_data1cor, geom = "circle", nbreaks = 5)

##Rating distribution

# Histogram of Rating in half-point bins.
newapp_data1 %>%
  ggplot(aes(x = Rating)) +
  geom_histogram(binwidth = 0.5)

# Mean of the Rating column.
# NOTE(review): if newapp_data1 derives from the cleaned data, apps whose
# missing rating was set to 0 are included and lower this value — confirm.
newapp_data1 %>% pull(Rating) %>% mean()
## [1] 3.541143
# Median of the Rating column.
newapp_data1 %>% pull(Rating) %>% median()
## [1] 4.2

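Most ratings are between 3.5 and 5. The mean (3.54) sits well below the median (4.2), most likely because apps without a rating were recoded to 0 earlier.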

Most downloaded genres in Games

# Per-genre averages of rating, size (in MB), reviews and installs, showing the
# six genres with the highest mean installs.
#
# Two fixes compared with the previous version:
#   * summarise/arrange/desc are called as dplyr::... because plyr, loaded
#     after dplyr, masks them; plyr's summarize ignores group_by() and
#     collapses everything into a single row.
#   * Size is text such as "19M" or "Varies with device", so mean(Size)
#     returned NA; it is converted to a number of megabytes first and values
#     without a recognised unit are treated as missing.
# NOTE(review): kilobyte sizes are assumed to end in "k" — confirm against the
# data. The surrounding text talks about games only, but no Category filter is
# applied here — confirm whether filter(Category == "GAME") is intended.
newapp_data %>%
  select(Genres, Rating, Size, Reviews, Installs) %>%
  distinct() %>%
  dplyr::mutate(
    size_value = as.numeric(str_extract(Size, "^[0-9.]+")),
    size_mb = dplyr::case_when(
      str_detect(Size, "M$") ~ size_value,
      str_detect(Size, "k$") ~ size_value / 1024,
      TRUE ~ NA_real_
    )
  ) %>%
  group_by(Genres) %>%
  dplyr::summarise(
    mean_rating = mean(Rating),
    mean_size = mean(size_mb, na.rm = TRUE),
    mean_reviews = mean(Reviews),
    mean_installs = mean(Installs),
    .groups = "drop"
  ) %>%
  dplyr::arrange(dplyr::desc(mean_installs)) %>%
  head()
## Warning in mean.default(Size): argument is not numeric or logical: returning NA
##   mean_rating mean_size mean_reviews mean_installs
## 1    3.563748        NA     218056.2       7830080

In “Games”, “Adventures” is the most downloaded genre, followed by “Arcade” in second place and “Casual” in third.

##The most installed games

 # Six games with the highest install counts.
 newapp_data %>%
   filter(Category == "GAME") %>%
   select(App, Category, Rating, Installs) %>%
   arrange(desc(Installs)) %>%
   head(n = 6)
## # A tibble: 6 × 4
##   App              Category Rating   Installs
##   <chr>            <chr>     <dbl>      <int>
## 1 Subway Surfers   GAME        4.5 1000000000
## 2 Candy Crush Saga GAME        4.4  500000000
## 3 Temple Run 2     GAME        4.3  500000000
## 4 Pou              GAME        4.3  500000000
## 5 My Talking Tom   GAME        4.5  500000000
## 6 ROBLOX           GAME        4.5  100000000