# Load readr first, then point the session at the data folder and read the
# raw app listing into `apps`.
# NOTE(review): both paths are absolute and specific to one machine; the
# script will not run elsewhere without editing them.
library(readr)
setwd("/Users/mariaseo/Desktop/MSQM/R/mydata")
apps <- read_csv(file = "/Users/mariaseo/Desktop/MSQM/R/mydata/apps.csv")
## New names:
## Rows: 9659 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (10): App, Category, Installs, Type, Price, Content Rating, Genres, Last... dbl
## (4): ...1, Rating, Reviews, Size
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Keep only the rows whose Category is one of the eight categories of interest
apps_subset <- subset(
  apps,
  Category %in% c(
    "BUSINESS", "FINANCE", "DATING", "EDUCATION",
    "EVENTS", "SOCIAL", "FOOD_AND_DRINK", "GAME"
  )
)

Summary statistics

# Per-column summary of the filtered data: quantiles, mean and NA counts for
# the numeric columns; length/class/mode only for the character columns.
summary(apps_subset)
##       ...1           App              Category             Rating     
##  Min.   :  187   Length:2429        Length:2429        Min.   :1.000  
##  1st Qu.: 1667   Class :character   Class :character   1st Qu.:4.000  
##  Median : 5640   Mode  :character   Mode  :character   Median :4.300  
##  Mean   : 5180                                         Mean   :4.199  
##  3rd Qu.: 8129                                         3rd Qu.:4.500  
##  Max.   :10835                                         Max.   :5.000  
##                                                        NA's   :358    
##     Reviews              Size          Installs             Type          
##  Min.   :       0   Min.   :  0.00   Length:2429        Length:2429       
##  1st Qu.:      37   1st Qu.:  6.70   Class :character   Class :character  
##  Median :    2555   Median : 19.00   Mode  :character   Mode  :character  
##  Mean   :  368980   Mean   : 26.65                                        
##  3rd Qu.:   55256   3rd Qu.: 39.00                                        
##  Max.   :78158306   Max.   :100.00                                        
##                     NA's   :299                                           
##     Price           Content Rating        Genres          Last Updated      
##  Length:2429        Length:2429        Length:2429        Length:2429       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Current Ver        Android Ver       
##  Length:2429        Length:2429       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 

Which app type received the highest ratings?

# Consolidate categories: replace each raw Category code with its display
# label. BUSINESS/FINANCE and EVENTS/SOCIAL are merged into combined groups.
category_labels <- c(
  BUSINESS = "Business & Finance",
  FINANCE = "Business & Finance",
  DATING = "Dating",
  EDUCATION = "Education",
  EVENTS = "Events & Social",
  SOCIAL = "Events & Social",
  FOOD_AND_DRINK = "Food",
  GAME = "Game"
)
for (raw_code in names(category_labels)) {
  is_raw <- apps_subset$Category == raw_code
  apps_subset$Category[is_raw] <- category_labels[[raw_code]]
}

# Bar chart of the mean Rating per app category.
library(ggplot2)
app_rating <- ggplot(apps_subset, aes(x = Category, y = Rating)) +
  # na.rm = TRUE drops apps with a missing Rating silently instead of
  # emitting the "Removed ... non-finite values" warning on every render.
  geom_bar(fun = "mean", stat = "summary", fill = "#d6d6d6", na.rm = TRUE) +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black")
  ) +
  # Title and axis labels follow the same pattern as the other charts.
  labs(
    title = "Average Rating by Category",
    x = "App Category",
    y = "Average Rating"
  )

app_rating

Although the differences between categories are small, education apps received higher ratings than the others. Game and events & social apps also received high ratings. Dating apps had the lowest ratings, although the gap wasn’t substantial.

Which app type received the most reviews?

# Load the ggplot2 and scales packages
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ stringr   1.5.0
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ scales::col_factor() masks readr::col_factor()
## ✖ purrr::discard()     masks scales::discard()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::lag()         masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Make sure the review counts are stored as numbers before averaging them
apps_subset$Reviews <- as.numeric(apps_subset$Reviews)

# Disabled: rescale the review counts to thousands
##apps_subset <- apps_subset %>%
##  mutate(Reviews = Reviews / 1000)

# Set the plot size options
options(repr.plot.width = 9, repr.plot.height = 9)

# Bar chart of the mean number of Reviews per app category.
app_reviews <- ggplot(apps_subset, aes(x = Category, y = Reviews)) +
  geom_bar(fun = "mean", stat = "summary", fill = "#d6d6d6") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black")
  ) +
  labs(
    title = "Average Number of Reviews by Category",
    x = "App Category",
    y = "Average Reviews"
  ) +
  # label_number_si() is deprecated since scales 1.2.0; label_number() with
  # scale_cut = cut_short_scale() is the documented replacement and keeps the
  # abbreviated K/M/B axis labels.
  scale_y_continuous(
    labels = label_number(accuracy = 1, scale_cut = cut_short_scale())
  )

# Display the graph
app_reviews

Events & social apps received the highest average number of reviews, followed by game apps, suggesting they are the most popular. Dating apps received the lowest average number of reviews.

What type of apps do people download the most?

# Turn the Installs text (e.g. values with a "+" sign and comma separators)
# into a numeric column, one cleaning step per line.
apps_subset <- apps_subset %>%
  mutate(
    Installs = gsub("\\+", "", Installs), # Step 1: remove the "+" sign
    Installs = gsub(",", "", Installs),   # Step 2: remove commas
    Installs = as.numeric(Installs)       # Step 3: change to numeric
  )

# Bar chart of the mean number of Installs per app category, with the bars
# ordered from the highest to the lowest category mean.
app_download <- ggplot(
  apps_subset,
  aes(x = reorder(Category, -Installs, FUN = mean), y = Installs)
) +
  geom_bar(fun = "mean", stat = "summary", fill = "#d6d6d6") +
  theme(
    panel.background = element_blank(),
    axis.line = element_line(colour = "black")
  ) +
  labs(
    title = "Average Number of Downloads by Category",
    x = "App Category",
    y = "Average Downloads"
  ) +
  # label_number_si() is deprecated since scales 1.2.0; label_number() with
  # scale_cut = cut_short_scale() is the documented replacement and keeps the
  # abbreviated K/M/B axis labels.
  scale_y_continuous(
    labels = label_number(accuracy = 1, scale_cut = cut_short_scale())
  )

app_download

Events & social apps and game apps were downloaded the most on average, further suggesting that they are the most popular. Dating apps were the least downloaded.

Findings

Many people downloaded events & social and game apps, gave them high ratings, and wrote the most reviews for them. Interestingly, business & finance apps had a relatively high number of downloads but received fewer reviews than the other categories.