library('readr')
## Warning: package 'readr' was built under R version 4.3.3
# Load the Google Play Store CSV into a tibble. Column types are guessed by
# readr, which also reports any parsing problems it runs into.
googleplaystore <- read_csv(
  file = "C://Users//hp//Desktop/Google Play Store Apps//googleplaystore.csv"
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 10841 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): App, Category, Size, Installs, Type, Price, Content Rating, Genres...
## dbl (2): Rating, Reviews
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the first rows and the column types of the freshly loaded data.
print(googleplaystore)
## # A tibble: 10,841 × 13
## App Category Rating Reviews Size Installs Type Price `Content Rating`
## <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Photo Ed… ART_AND… 4.1 159 19M 10,000+ Free 0 Everyone
## 2 Coloring… ART_AND… 3.9 967 14M 500,000+ Free 0 Everyone
## 3 U Launch… ART_AND… 4.7 87510 8.7M 5,000,0… Free 0 Everyone
## 4 Sketch -… ART_AND… 4.5 215644 25M 50,000,… Free 0 Teen
## 5 Pixel Dr… ART_AND… 4.3 967 2.8M 100,000+ Free 0 Everyone
## 6 Paper fl… ART_AND… 4.4 167 5.6M 50,000+ Free 0 Everyone
## 7 Smoke Ef… ART_AND… 3.8 178 19M 50,000+ Free 0 Everyone
## 8 Infinite… ART_AND… 4.1 36815 29M 1,000,0… Free 0 Everyone
## 9 Garden C… ART_AND… 4.4 13791 33M 1,000,0… Free 0 Everyone
## 10 Kids Pai… ART_AND… 4.7 121 3.1M 10,000+ Free 0 Everyone
## # ℹ 10,831 more rows
## # ℹ 4 more variables: Genres <chr>, `Last Updated` <chr>, `Current Ver` <chr>,
## # `Android Ver` <chr>
# Question 1: What is the maximum number of reviews among all the apps?
# Keep only complete rows: na.omit() removes every row that has a missing value
# in any column. The cleaned tibble overwrites the original, so all later
# questions work on this reduced data set.
googleplaystore <- na.omit(googleplaystore)
# Largest review count among the remaining apps.
max_review <- max(googleplaystore[["Reviews"]])
cat("Maximum number of reviews among all the apps are : ", max_review)
## Maximum number of reviews among all the apps are : 78158306
# Question 2 : How many unique app categories are there in the dataset?
library('dplyr')
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# One row per distinct value of Category; the number of rows in the result is
# the number of unique categories.
unique_categories <- distinct(googleplaystore, Category)
print(unique_categories)
## # A tibble: 33 × 1
## Category
## <chr>
## 1 ART_AND_DESIGN
## 2 AUTO_AND_VEHICLES
## 3 BEAUTY
## 4 BOOKS_AND_REFERENCE
## 5 BUSINESS
## 6 COMICS
## 7 COMMUNICATION
## 8 DATING
## 9 EDUCATION
## 10 ENTERTAINMENT
## # ℹ 23 more rows
# Question 3 : Which genres have the highest number of apps? List out top 10
library('dplyr')
# Tally the apps in each genre, most common genre first.
genre_counts <- googleplaystore %>%
  count(Genres, name = "App_Count", sort = TRUE)
# Keep the ten genres with the most apps.
top_10_genres <- genre_counts %>%
  slice_head(n = 10)
print(top_10_genres)
## # A tibble: 10 × 2
## Genres App_Count
## <chr> <int>
## 1 Tools 733
## 2 Entertainment 533
## 3 Education 468
## 4 Action 358
## 5 Productivity 351
## 6 Medical 350
## 7 Sports 333
## 8 Communication 328
## 9 Finance 323
## 10 Photography 317
# Question 4 : How many apps are free versus paid?
# Frequency table of the Type column (one count per distinct value).
app_count <- table(googleplaystore[["Type"]])
print(app_count)
##
## Free Paid
## 8718 647
# Question 5 : What is the average price of paid apps in different categories?
library('dplyr')
# Strip the dollar sign from Price and store the column as numeric. The
# converted column replaces the text version in the shared tibble.
googleplaystore <- googleplaystore %>%
  mutate(Price = as.numeric(gsub("\\$", "", Price)))
# Mean price per category, computed over paid apps only.
average_price <- googleplaystore %>%
  filter(Type == "Paid") %>%
  group_by(Category) %>%
  summarise(average_price = mean(Price))
print(average_price)
## # A tibble: 28 × 2
## Category average_price
## <chr> <dbl>
## 1 ART_AND_DESIGN 1.99
## 2 AUTO_AND_VEHICLES 1.99
## 3 BOOKS_AND_REFERENCE 2.98
## 4 BUSINESS 6.76
## 5 COMMUNICATION 2.58
## 6 DATING 5.74
## 7 EDUCATION 4.49
## 8 ENTERTAINMENT 3.99
## 9 FAMILY 14.2
## 10 FINANCE 188.
## # ℹ 18 more rows
# Question 6 : What is the distribution of app sizes (in bytes) across different app categories?
library('dplyr')
library('tidyr')
# Long-format view of app size by category. pivot_longer() replaces the
# superseded gather() and produces the same three columns
# (Category, Size_Category, App_Size) in the same row order.
# NOTE(review): App_Size is still the raw text of the Size column (e.g. "19M");
# nothing here converts it to bytes or summarises it per category.
columns <- googleplaystore %>%
  select(Category, Size)
reshaped_data <- columns %>%
  pivot_longer(
    cols = -Category,
    names_to = "Size_Category",
    values_to = "App_Size"
  )
reshaped_data
## # A tibble: 9,365 × 3
## Category Size_Category App_Size
## <chr> <chr> <chr>
## 1 ART_AND_DESIGN Size 19M
## 2 ART_AND_DESIGN Size 14M
## 3 ART_AND_DESIGN Size 8.7M
## 4 ART_AND_DESIGN Size 25M
## 5 ART_AND_DESIGN Size 2.8M
## 6 ART_AND_DESIGN Size 5.6M
## 7 ART_AND_DESIGN Size 19M
## 8 ART_AND_DESIGN Size 29M
## 9 ART_AND_DESIGN Size 33M
## 10 ART_AND_DESIGN Size 3.1M
## # ℹ 9,355 more rows
# Question 7 : How many apps have a content rating of "Everyone"?
# Rows whose content rating is exactly "Everyone", and how many there are.
everyone_apps <- googleplaystore %>%
  filter(`Content Rating` == "Everyone")
number_of_everyone_apps <- nrow(everyone_apps)
cat(
  "Number of apps with a content rating of 'Everyone':",
  number_of_everyone_apps
)
## Number of apps with a content rating of 'Everyone': 7419
# Question 8 : Do app size categories influence user engagement, as measured by the average rating?
library('dplyr')
library('tidyr')
# Convert the text in the Size column to a number of megabytes.
#
# Size is a character column (values such as "19M"), so it cannot be compared
# with 10 or 50 directly: `Size < 10` would be a string comparison and put apps
# in the wrong bin.
#
# @param size Character vector of size labels.
# @return Numeric vector the same length as `size`; NA where the label is not
#   of the form "<number>M" or "<number>k".
size_to_mb <- function(size) {
  has_number <- grepl("^[0-9]+(\\.[0-9]+)?[Mk]$", size)
  size_mb <- rep(NA_real_, length(size))
  # Convert only the labels that matched, so as.numeric() never has to coerce
  # free text and raise a warning.
  size_mb[has_number] <- as.numeric(sub("[Mk]$", "", size[has_number]))
  # Assumes "M" means megabytes and "k" kilobytes (1024 k per M) -- TODO confirm.
  in_kilobytes <- has_number & endsWith(size, "k")
  size_mb[in_kilobytes] <- size_mb[in_kilobytes] / 1024
  size_mb
}

# Bin apps by numeric size: Small is < 10, Medium is 10 to < 50, Large is >= 50.
# The result is a factor so plots show the bins in size order; apps whose size
# could not be parsed get NA.
googleplaystore <- googleplaystore %>%
  mutate(
    Size_Category = cut(
      size_to_mb(Size),
      breaks = c(-Inf, 10, 50, Inf),
      labels = c("Small", "Medium", "Large"),
      right = FALSE
    )
  )

# Average rating per size bin; apps without a usable size are left out.
rating_summary <- googleplaystore %>%
  filter(!is.na(Size_Category)) %>%
  group_by(Size_Category) %>%
  summarize(Average_Rating = mean(Rating, na.rm = TRUE))

library(ggplot2)
ggplot(
  rating_summary,
  aes(x = Size_Category, y = Average_Rating, fill = Size_Category)
) +
  geom_col() +
  labs(
    title = "Average Rating by App Size Category",
    x = "Size Category",
    y = "Average Rating"
  )

# Question 9 : What is the distribution of apps based on their content rating (e.g., Everyone, Teen, Mature 17+, etc.)?
# Drop apps whose content rating is "Unrated". The filtered tibble overwrites
# the shared one, so the questions that follow no longer see those rows.
googleplaystore <- filter(googleplaystore, `Content Rating` != "Unrated")
# Number of apps per content rating, drawn as a pie with one colour per slice.
content_distribution <- table(googleplaystore[["Content Rating"]])
pie(
  content_distribution,
  labels = names(content_distribution),
  col = rainbow(length(content_distribution)),
  main = "Distribution of Apps by Content Rating"
)

# Question 10 : What is the distribution of app ratings in the dataset?
library('ggplot2')
# Histogram of app ratings using bins that are 0.5 wide.
ggplot(data = googleplaystore, mapping = aes(x = Rating)) +
  geom_histogram(
    binwidth = 0.5,
    color = "black",
    fill = "lightgreen"
  ) +
  ggtitle("Distribution of App Ratings") +
  xlab("Rating") +
  ylab("Frequency")

# Question 11 : Is there any correlation between user reviews and user ratings?
library('corrplot')
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
# Pearson correlation matrix for Reviews and Rating, shown as numbers.
correlation <- googleplaystore %>%
  select(Reviews, Rating) %>%
  cor(method = "pearson")
corrplot(correlation, method = "number")

# Question 12 : How does an app's rating relate to its number of reviews?
library('ggplot2')
library('dplyr')
# Linear regression of Rating on Reviews. The plot below fits its own line via
# geom_smooth() and does not read this object.
model <- lm(formula = Rating ~ Reviews, data = googleplaystore)
# Scatter plot of rating against review count with a fitted straight line.
ggplot(data = googleplaystore, mapping = aes(x = Reviews, y = Rating)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Relationship between App Rating and App Reviews") +
  xlab("Reviews") +
  ylab("Rating")
## `geom_smooth()` using formula = 'y ~ x'

# Question 13: How does the distribution of app ratings vary across different size categories, segmented by content ratings?
library('ggplot2')
# Box plots of rating for each size category, with one box per content rating
# inside every size category.
ggplot(
  data = googleplaystore,
  mapping = aes(x = Size_Category, y = Rating, fill = `Content Rating`)
) +
  geom_boxplot() +
  ggtitle("Distribution of App Ratings Across Different Size Categories") +
  xlab("App Size Category") +
  ylab("App Rating")

# Question 14 : What is the variability in app ratings within each app category?
library('dplyr')
# Standard deviation of Rating within each category, one row per category.
category_sd <- googleplaystore %>%
  group_by(Category) %>%
  summarise(Std_Dev_Rating = sd(Rating))
print(category_sd)
## # A tibble: 33 × 2
## Category Std_Dev_Rating
## <chr> <dbl>
## 1 ART_AND_DESIGN 0.358
## 2 AUTO_AND_VEHICLES 0.544
## 3 BEAUTY 0.363
## 4 BOOKS_AND_REFERENCE 0.429
## 5 BUSINESS 0.624
## 6 COMICS 0.538
## 7 COMMUNICATION 0.426
## 8 DATING 0.631
## 9 EDUCATION 0.252
## 10 ENTERTAINMENT 0.303
## # ℹ 23 more rows
# Question 15 : What is the relationship between app ratings, reviews, and price, and how do these variables vary across different app categories?
library('GGally')
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library('dplyr')
# Take the first 200 rows of the four columns of interest. Price is selected
# here, but the pairs plot below is restricted to Rating and Reviews.
specific_columns <- googleplaystore %>%
  select(Rating, Reviews, Price, Category) %>%
  slice_head(n = 200)
# Pairwise plots of Rating and Reviews, coloured by category.
ggpairs(
  data = specific_columns,
  mapping = aes(color = Category),
  columns = c("Rating", "Reviews")
)
