library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor) 
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(highcharter) 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Set the working directory and load the dataset
# NOTE(review): setwd() with a user-specific absolute path changes global
# state and only works on the author's machine; consider a project-relative
# path instead.
project_dir <- "/Users/ayomidealagbada/AYOMIDE'S DATAVISUALITIOM"
setwd(project_dir)

# Read the CSV from the (now current) working directory; column types are
# guessed by readr.
processed_data <- read_csv(file = "processed_data2.csv")
## Rows: 1708 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (23): phone_brand, phone_model, store, dimensions, display_type, displa...
## dbl  (13): price_usd, storage, ram, weight, display_size, nfc, battery, fold...
## date  (2): launch_date, year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick look at the data: print the first six rows
processed_data %>%
  head(n = 6)
## # A tibble: 6 × 38
##   phone_brand phone_model   store price_usd storage   ram launch_date dimensions
##   <chr>       <chr>         <chr>     <dbl>   <dbl> <dbl> <date>      <chr>     
## 1 apple       Apple iPhone… Amaz…     1358.     256     8 2024-09-20  149.6 x 7…
## 2 apple       Apple iPhone… Amaz…     1493.     512     8 2024-09-20  149.6 x 7…
## 3 apple       Apple iPhone… Amaz…     1705.    1000     8 2024-09-20  149.6 x 7…
## 4 apple       Apple iPhone… Amaz…     1565.     512     8 2024-09-20  163 x 77.…
## 5 apple       Apple iPhone… Amaz…      247.     128     4 2020-11-13  131.5 x 6…
## 6 apple       Apple iPhone… Amaz…      320.     256     4 2020-11-13  131.5 x 6…
## # ℹ 30 more variables: weight <dbl>, display_type <chr>, display_size <dbl>,
## #   display_resolution <chr>, os <chr>, nfc <dbl>, usb <chr>, battery <dbl>,
## #   features_sensors <chr>, colors <chr>, video <chr>, chipset <chr>,
## #   cpu <chr>, gpu <chr>, year <date>, foldable <dbl>, ppi_density <dbl>,
## #   quantile_10 <dbl>, quantile_50 <dbl>, quantile_90 <dbl>, price_range <chr>,
## #   os_type <chr>, os_version <chr>, battery_size <chr>,
## #   colors_available <dbl>, chip_company <chr>, cpu_core <chr>, …
# Clean and enrich the data in a single pipeline (dplyr verbs rather than
# na.omit):
#   1. keep only rows where both `ram` and `battery` are present;
#   2. add `price_category`: "High" when price_usd > 500, "Low" otherwise
#      (a missing price yields a missing category).
processed_data <- processed_data %>%
  filter(!is.na(ram), !is.na(battery)) %>%
  mutate(
    price_category = ifelse(price_usd > 500, "High", "Low")
  )
# dplyr Commands:
# 1. Mean RAM per brand, sorted from the highest average to the lowest
ram_by_brand <- group_by(processed_data, phone_brand)

brand_ram_summary <- ram_by_brand %>%
  summarise(avg_ram = mean(ram, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(avg_ram))
# 2. Keep the five brands with the highest average RAM.
# slice_max() keeps ties by default, so it can return more than `n` rows when
# several brands share the same average. `with_ties = FALSE` guarantees
# exactly five rows (or fewer if there are fewer brands), which is what the
# "Top 5" bar charts and their five-colour palettes expect.
top_brands <- brand_ram_summary %>%
  slice_max(order_by = avg_ram, n = 5, with_ties = FALSE)
# 3. Per-brand averages of battery capacity and RAM for visualization.
# across() applies the same NA-tolerant mean to both columns and names the
# results avg_battery and avg_ram (in that order).
battery_ram_summary <- processed_data %>%
  group_by(phone_brand) %>%
  summarise(
    across(
      c(battery, ram),
      function(x) mean(x, na.rm = TRUE),
      .names = "avg_{.col}"
    )
  )
# Scatter plot of RAM against battery capacity, coloured by brand, with one
# overall linear trend line (the fixed black colour on geom_smooth() replaces
# the brand colour mapping for that layer).
#
# The Brewer "Paired" palette defines only 12 colours. When the data contain
# more than 12 brands, scale_color_brewer() leaves the extra brands with an NA
# colour and geom_point() drops those rows from the plot. To keep every
# observation visible, the 12 "Paired" colours are interpolated to exactly as
# many colours as there are brands.
paired_colours <- c(
  "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C",
  "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928"
)
n_brands <- n_distinct(processed_data$phone_brand)
brand_palette <- colorRampPalette(paired_colours)(n_brands)

plot1 <- ggplot(processed_data, aes(x = ram, y = battery, color = phone_brand)) +
  geom_point(alpha = 0.7, size = 3) +  # Add transparency
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "dashed") +  # Add trend line
  labs(
    title = "RAM vs Battery Capacity Across Phone Brands",
    subtitle = "Relationship between device memory and battery performance",
    x = "RAM (GB)",
    y = "Battery Capacity (mAh)",
    color = "Phone Brand",
    caption = "Data sourced from GSM Arena (2024)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right",
    axis.title = element_text(face = "italic")
  ) +
  scale_color_manual(values = brand_palette)  # one colour per brand

plot1
## `geom_smooth()` using formula = 'y ~ x'
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Paired is 12
## Returning the palette you asked for with that many colors
## Warning: Removed 1114 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Horizontal bar chart of the top brands by average RAM. Brands are ordered
# by avg_ram so that, after flipping the axes, the highest value is on top.
top_brand_fills <- c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e")

plot2 <- top_brands %>%
  ggplot(
    aes(
      x = reorder(phone_brand, avg_ram),
      y = avg_ram,
      fill = phone_brand
    )
  ) +
  geom_col(show.legend = FALSE) +  # bar heights come straight from avg_ram
  scale_fill_manual(values = top_brand_fills) +
  coord_flip() +
  labs(
    title = "Top 5 Brands with Highest Average RAM",
    x = "Brand",
    y = "Average RAM (GB)",
    caption = "Data Source: Kaggle & GSM Arena"
  ) +
  theme_classic()

print(plot2)

# Interactivity Example (Highcharter)
# Interactive scatter of RAM against battery capacity, one series per brand.
ram_battery_chart <- hchart(
  processed_data,
  "scatter",
  hcaes(x = ram, y = battery, group = phone_brand)
)

ram_battery_chart <- ram_battery_chart %>%
  hc_title(text = "Interactive RAM vs Battery Capacity") %>%
  hc_xAxis(title = list(text = "RAM (GB)")) %>%
  hc_yAxis(title = list(text = "Battery Capacity (mAh)"))

# Display the widget
ram_battery_chart
# Interactive bar chart of the top brands by average RAM.
top_brand_colours <- c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e")

top_brands_chart <- hchart(
  top_brands,
  "bar",
  hcaes(x = reorder(phone_brand, avg_ram), y = avg_ram, color = phone_brand)
) %>%
  hc_title(text = "Top 5 Brands with Highest Average RAM") %>%
  hc_xAxis(title = list(text = "Brand")) %>%
  hc_yAxis(title = list(text = "Average RAM (GB)")) %>%
  # NOTE(review): stacking presumably has no visible effect with a single
  # series -- confirm before removing.
  hc_plotOptions(bar = list(stacking = "normal")) %>%
  hc_colors(top_brand_colours) %>%
  hc_tooltip(pointFormat = "{point.y} GB") %>%
  # NOTE(review): the chart type is already "bar" in hchart(); this looks
  # redundant -- confirm before removing.
  hc_chart(type = "bar")

# Display the widget
top_brands_chart
# Statistical Component: simple linear regression of battery capacity on RAM
lm_model <- lm(
  formula = battery ~ ram,
  data = processed_data
)

# Coefficients, residual summary and fit statistics
summary(lm_model)
## 
## Call:
## lm(formula = battery ~ ram, data = processed_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2632.0  -286.2   177.8   423.9  5267.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4329.965     41.760  103.69   <2e-16 ***
## ram           41.016      4.808    8.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 680.5 on 1706 degrees of freedom
## Multiple R-squared:  0.04091,    Adjusted R-squared:  0.04035 
## F-statistic: 72.77 on 1 and 1706 DF,  p-value: < 2.2e-16
# Draw the regression diagnostic plots on a 2 x 2 grid with tighter margins.
# par() returns the previous values of the settings it changes; they are saved
# and restored afterwards so that later base-graphics plots are not forced
# into the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))

# Then plot the diagnostic plots:
plot(lm_model)

# Restore the original graphics settings
par(old_par)