library('readr')
## Warning: package 'readr' was built under R version 4.3.3
# Load the Google Play Store export into a tibble. The path is an absolute
# location on the original author's Windows machine.
googleplaystore <- read_csv(
  file = "C://Users//hp//Desktop/Google Play Store Apps//googleplaystore.csv"
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 10841 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): App, Category, Size, Installs, Type, Price, Content Rating, Genres...
## dbl  (2): Rating, Reviews
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the tibble to preview the parsed column types and first rows.
googleplaystore
## # A tibble: 10,841 × 13
##    App       Category Rating Reviews Size  Installs Type  Price `Content Rating`
##    <chr>     <chr>     <dbl>   <dbl> <chr> <chr>    <chr> <chr> <chr>           
##  1 Photo Ed… ART_AND…    4.1     159 19M   10,000+  Free  0     Everyone        
##  2 Coloring… ART_AND…    3.9     967 14M   500,000+ Free  0     Everyone        
##  3 U Launch… ART_AND…    4.7   87510 8.7M  5,000,0… Free  0     Everyone        
##  4 Sketch -… ART_AND…    4.5  215644 25M   50,000,… Free  0     Teen            
##  5 Pixel Dr… ART_AND…    4.3     967 2.8M  100,000+ Free  0     Everyone        
##  6 Paper fl… ART_AND…    4.4     167 5.6M  50,000+  Free  0     Everyone        
##  7 Smoke Ef… ART_AND…    3.8     178 19M   50,000+  Free  0     Everyone        
##  8 Infinite… ART_AND…    4.1   36815 29M   1,000,0… Free  0     Everyone        
##  9 Garden C… ART_AND…    4.4   13791 33M   1,000,0… Free  0     Everyone        
## 10 Kids Pai… ART_AND…    4.7     121 3.1M  10,000+  Free  0     Everyone        
## # ℹ 10,831 more rows
## # ℹ 4 more variables: Genres <chr>, `Last Updated` <chr>, `Current Ver` <chr>,
## #   `Android Ver` <chr>
# Question 1: What is the maximum number of reviews among all the apps?
# Keep only complete rows (any row with an NA in any column is dropped). The
# filtered tibble overwrites `googleplaystore`, so every later step works on
# this reduced data set.
googleplaystore <- na.omit(googleplaystore)

# Largest value in the Reviews column of the remaining rows.
max_review <- max(googleplaystore[["Reviews"]])
cat("Maximum number of reviews among all the apps are : ", max_review)
## Maximum number of reviews among all the apps are :  78158306
# Question 2 : How many unique app categories are there in the dataset?
library('dplyr')
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# One row per distinct Category value; the printed tibble header reports how
# many there are.
unique_categories <- distinct(googleplaystore, Category)
unique_categories
## # A tibble: 33 × 1
##    Category           
##    <chr>              
##  1 ART_AND_DESIGN     
##  2 AUTO_AND_VEHICLES  
##  3 BEAUTY             
##  4 BOOKS_AND_REFERENCE
##  5 BUSINESS           
##  6 COMICS             
##  7 COMMUNICATION      
##  8 DATING             
##  9 EDUCATION          
## 10 ENTERTAINMENT      
## # ℹ 23 more rows
# Question 3 : Which genres have the highest number of apps? List the top 10.
library('dplyr')
# Tally the apps in each genre and order the tallies from largest to smallest.
genre_counts <- count(
  googleplaystore,
  Genres,
  name = "App_Count",
  sort = TRUE
)

# Keep the ten genres with the most apps.
top_10_genres <- genre_counts %>% head(n = 10)
top_10_genres
## # A tibble: 10 × 2
##    Genres        App_Count
##    <chr>             <int>
##  1 Tools               733
##  2 Entertainment       533
##  3 Education           468
##  4 Action              358
##  5 Productivity        351
##  6 Medical             350
##  7 Sports              333
##  8 Communication       328
##  9 Finance             323
## 10 Photography         317
# Question 4 : How many apps are free versus paid?

# Frequency table of the Type column (one count per distinct value).
app_count <- table(googleplaystore[["Type"]])
app_count
## 
## Free Paid 
## 8718  647
# Question 5 : What is the average price of paid apps in different categories?

library('dplyr')

# Remove every dollar sign from Price and store the column as numeric. This
# overwrites the Price column of `googleplaystore` in place.
googleplaystore$Price <- as.numeric(gsub("\\$", "", googleplaystore$Price))

# Mean price per category, restricted to rows whose Type is "Paid".
average_price <- googleplaystore %>%
  filter(Type == "Paid") %>%
  group_by(Category) %>%
  summarise(average_price = mean(Price))

average_price
## # A tibble: 28 × 2
##    Category            average_price
##    <chr>                       <dbl>
##  1 ART_AND_DESIGN               1.99
##  2 AUTO_AND_VEHICLES            1.99
##  3 BOOKS_AND_REFERENCE          2.98
##  4 BUSINESS                     6.76
##  5 COMMUNICATION                2.58
##  6 DATING                       5.74
##  7 EDUCATION                    4.49
##  8 ENTERTAINMENT                3.99
##  9 FAMILY                      14.2 
## 10 FINANCE                    188.  
## # ℹ 18 more rows
# Question 6 : What is the distribution of app sizes (in bytes) across different app categories?

library('dplyr')
library('tidyr')

# Keep only the two columns needed for the size-by-category view.
columns <- googleplaystore %>% select(Category, Size)

# Reshape to long form: one row per app, with the source column name in
# Size_Category and the size value in App_Size. pivot_longer() replaces the
# superseded gather() and produces the same three columns in the same order.
# NOTE(review): App_Size is still the original text (e.g. "19M"), not a number
# of bytes, so this lists sizes per category rather than summarising them --
# confirm whether a numeric distribution is wanted here.
reshaped_data <- pivot_longer(
  columns,
  cols = -Category,
  names_to = "Size_Category",
  values_to = "App_Size"
)
reshaped_data
## # A tibble: 9,365 × 3
##    Category       Size_Category App_Size
##    <chr>          <chr>         <chr>   
##  1 ART_AND_DESIGN Size          19M     
##  2 ART_AND_DESIGN Size          14M     
##  3 ART_AND_DESIGN Size          8.7M    
##  4 ART_AND_DESIGN Size          25M     
##  5 ART_AND_DESIGN Size          2.8M    
##  6 ART_AND_DESIGN Size          5.6M    
##  7 ART_AND_DESIGN Size          19M     
##  8 ART_AND_DESIGN Size          29M     
##  9 ART_AND_DESIGN Size          33M     
## 10 ART_AND_DESIGN Size          3.1M    
## # ℹ 9,355 more rows
# Question 7 : How many apps have a content rating of "Everyone"?

# Rows whose content rating is exactly "Everyone"; which() skips rows where
# the comparison is FALSE or NA.
everyone_apps <- googleplaystore[
  which(googleplaystore[["Content Rating"]] == "Everyone"),
]

# The number of matching apps is the row count of that subset.
number_of_everyone_apps <- nrow(everyone_apps)

cat("Number of apps with a content rating of 'Everyone':", number_of_everyone_apps)
## Number of apps with a content rating of 'Everyone': 7419
# Question 8 : Do app size categories influence user engagement, as measured by the average rating?
library('dplyr')
library('tidyr')


# Convert Play Store size strings to megabytes.
#
# @param size Character vector of sizes such as "19M", "8.7M" or "201k".
# @return Numeric vector of the same length giving the size in MB. Entries
#   that are not a plain number followed by "M" or "k" (for example
#   "Varies with device", or NA) are returned as NA.
size_to_mb <- function(size) {
  has_unit <- grepl("^[0-9]+(\\.[0-9]+)?[kKM]$", size)
  size_mb <- rep(NA_real_, length(size))
  # Only well-formed entries are converted, so as.numeric() never has to
  # coerce free text and no coercion warnings are raised.
  size_mb[has_unit] <- as.numeric(sub("[kKM]$", "", size[has_unit]))
  in_kilobytes <- has_unit & grepl("[kK]$", size)
  size_mb[in_kilobytes] <- size_mb[in_kilobytes] / 1024
  size_mb
}

# Size is a character column, so comparing it directly with 10 or 50 compares
# strings (e.g. "8.7M" sorts after "50" and was labelled "Large"). Parse the
# text to a number first and bin that instead.
# NOTE(review): the 10 / 50 thresholds are taken to be megabytes -- confirm.
googleplaystore <- googleplaystore %>%
  mutate(
    Size_Category = as.character(cut(
      size_to_mb(Size),
      breaks = c(-Inf, 10, 50, Inf),
      labels = c("Small", "Medium", "Large"),
      right = FALSE # [lower, upper): < 10 Small, 10 to < 50 Medium, >= 50 Large
    )),
    # Sizes that cannot be parsed get an explicit label instead of NA.
    Size_Category = coalesce(Size_Category, "Unknown")
  )

# Mean rating within each size category.
rating_summary <- googleplaystore %>%
  group_by(Size_Category) %>%
  summarize(Average_Rating = mean(Rating, na.rm = TRUE))


library(ggplot2)
# One bar per size category, with bar height equal to the mean rating.
ggplot(
  rating_summary,
  aes(x = Size_Category, y = Average_Rating, fill = Size_Category)
) +
  geom_col() +
  labs(title = "Average Rating by App Size Category",
       x = "Size Category",
       y = "Average Rating")

# Question 9 : What is the distribution of apps by content rating (e.g., Everyone, Teen, Mature 17+)?


# Drop apps whose content rating is "Unrated". The filtered tibble overwrites
# `googleplaystore`, so later steps no longer see those rows.
googleplaystore <- filter(googleplaystore, `Content Rating` != "Unrated")

# Number of remaining apps per content rating.
content_distribution <- table(googleplaystore[["Content Rating"]])

# One slice per content rating, labelled with the rating name and coloured
# from the rainbow palette.
pie(
  x = content_distribution,
  labels = names(content_distribution),
  col = rainbow(n = length(content_distribution)),
  main = "Distribution of Apps by Content Rating"
)

# Question 10 : What is the distribution of app ratings in the dataset?

library('ggplot2')
# Histogram of the Rating column using bins 0.5 wide.
ggplot(data = googleplaystore, mapping = aes(x = Rating)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "lightgreen") +
  ggtitle("Distribution of App Ratings") +
  xlab("Rating") +
  ylab("Frequency")

# Question 11 : Is there any correlation between user reviews and user ratings?

library('corrplot')
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
# 2 x 2 Pearson correlation matrix of the Reviews and Rating columns.
correlation <- cor(
  x = googleplaystore[c("Reviews", "Rating")],
  method = "pearson"
)

# Draw the matrix with each coefficient shown as a number.
corrplot(corr = correlation, method = "number")

# Question 12 : How do app ratings relate to the number of app reviews?


library('ggplot2')
library('dplyr')

# Linear regression of Rating on Reviews. The fitted object is not used by the
# plot below; geom_smooth(method = "lm") fits its own line.
model <- lm(Rating ~ Reviews, data = googleplaystore)

# Scatter plot of Rating against Reviews with a linear trend line on top.
ggplot(data = googleplaystore, mapping = aes(x = Reviews, y = Rating)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Relationship between App Rating and App Reviews") +
  xlab("Reviews") +
  ylab("Rating")
## `geom_smooth()` using formula = 'y ~ x'

# Question 13:  How does the distribution of app ratings vary across different size categories, segmented by content ratings?

library('ggplot2')

# Box plots of Rating for each Size_Category, with one box per content rating
# inside every size category (distinguished by fill colour).
ggplot(
  data = googleplaystore,
  mapping = aes(x = Size_Category, y = Rating, fill = `Content Rating`)
) +
  geom_boxplot() +
  ggtitle("Distribution of App Ratings Across Different Size Categories") +
  xlab("App Size Category") +
  ylab("App Rating")

# Question 14 : What is the variability in app ratings within each app category?

library('dplyr')
# Standard deviation of Rating within each Category (one row per category).
category_sd <- summarise(
  group_by(googleplaystore, Category),
  Std_Dev_Rating = sd(Rating)
)

category_sd
## # A tibble: 33 × 2
##    Category            Std_Dev_Rating
##    <chr>                        <dbl>
##  1 ART_AND_DESIGN               0.358
##  2 AUTO_AND_VEHICLES            0.544
##  3 BEAUTY                       0.363
##  4 BOOKS_AND_REFERENCE          0.429
##  5 BUSINESS                     0.624
##  6 COMICS                       0.538
##  7 COMMUNICATION                0.426
##  8 DATING                       0.631
##  9 EDUCATION                    0.252
## 10 ENTERTAINMENT                0.303
## # ℹ 23 more rows
# Question 15 : What is the relationship between app ratings, reviews, and price, and how do these variables vary across different app categories?

library('GGally')
## Warning: package 'GGally' was built under R version 4.3.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library('dplyr')

# Take the four columns of interest from the first 200 rows only.
specific_columns <- googleplaystore %>%
  select(Rating, Reviews, Price, Category) %>%
  head(n = 200)

# Pairs plot of Rating and Reviews, coloured by Category.
# NOTE(review): Price is selected above but not included in `columns`, so it
# does not appear in the plot -- confirm whether that is intended.
ggpairs(
  data = specific_columns,
  mapping = aes(color = Category),
  columns = c("Rating", "Reviews")
)