
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Load the dataset ----
# Read the CSV through an explicit path instead of calling setwd(): changing
# the working directory is a session-wide side effect that leaks into any code
# run afterwards. Point `data_dir` at the folder holding processed_data2.csv.
data_dir <- "/Users/ayomidealagbada/AYOMIDE'S DATAVISUALITIOM"
processed_data <- read_csv(file.path(data_dir, "processed_data2.csv"))
## Rows: 1708 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): phone_brand, phone_model, store, dimensions, display_type, displa...
## dbl (13): price_usd, storage, ram, weight, display_size, nfc, battery, fold...
## date (2): launch_date, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the dataset
processed_data %>%
  head()
## # A tibble: 6 × 38
## phone_brand phone_model store price_usd storage ram launch_date dimensions
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <date> <chr>
## 1 apple Apple iPhone… Amaz… 1358. 256 8 2024-09-20 149.6 x 7…
## 2 apple Apple iPhone… Amaz… 1493. 512 8 2024-09-20 149.6 x 7…
## 3 apple Apple iPhone… Amaz… 1705. 1000 8 2024-09-20 149.6 x 7…
## 4 apple Apple iPhone… Amaz… 1565. 512 8 2024-09-20 163 x 77.…
## 5 apple Apple iPhone… Amaz… 247. 128 4 2020-11-13 131.5 x 6…
## 6 apple Apple iPhone… Amaz… 320. 256 4 2020-11-13 131.5 x 6…
## # ℹ 30 more variables: weight <dbl>, display_type <chr>, display_size <dbl>,
## # display_resolution <chr>, os <chr>, nfc <dbl>, usb <chr>, battery <dbl>,
## # features_sensors <chr>, colors <chr>, video <chr>, chipset <chr>,
## # cpu <chr>, gpu <chr>, year <date>, foldable <dbl>, ppi_density <dbl>,
## # quantile_10 <dbl>, quantile_50 <dbl>, quantile_90 <dbl>, price_range <chr>,
## # os_type <chr>, os_version <chr>, battery_size <chr>,
## # colors_available <dbl>, chip_company <chr>, cpu_core <chr>, …
# Data cleaning ----
# Keep only the rows where both RAM and battery capacity are recorded (done
# with dplyr rather than na.omit(), which would also drop rows that are missing
# unrelated columns), then label each phone "High" when it costs more than
# 500 USD and "Low" otherwise.
processed_data <- processed_data %>%
  filter(!is.na(ram), !is.na(battery)) %>%
  mutate(price_category = ifelse(price_usd > 500, "High", "Low"))
# dplyr summaries ----
# 1. Mean RAM per brand, ordered from the highest average to the lowest
brand_ram_summary <- processed_data %>%
  group_by(phone_brand) %>%
  summarise(avg_ram = mean(ram, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(avg_ram))
# 2. Top 5 brands by average RAM
# with_ties = FALSE caps the result at exactly five rows. The default keeps
# every brand tied with the fifth one, which would contradict the "Top 5"
# chart titles and leave the five-colour manual fill scale used for the bar
# chart without enough colours.
top_brands <- brand_ram_summary %>%
  slice_max(order_by = avg_ram, n = 5, with_ties = FALSE)
# 3. Per-brand averages of battery capacity and RAM for visualization
battery_ram_summary <- processed_data %>%
  group_by(phone_brand) %>%
  summarise(
    across(c(battery, ram), ~ mean(.x, na.rm = TRUE), .names = "avg_{.col}"),
    .groups = "drop"
  )
# Static scatter plot: RAM vs battery capacity, coloured by brand ----
# Brewer's "Paired" palette only has 12 colours. The data contains more brands
# than that, so scale_color_brewer() left the extra brands without a colour and
# geom_point() dropped their rows (1114 of 1708 points in the previous run).
# Interpolating the 12 base colours to one colour per brand keeps the same look
# while plotting every phone.
brand_palette <- colorRampPalette(
  RColorBrewer::brewer.pal(12, "Paired")
)(n_distinct(processed_data$phone_brand))

plot1 <- ggplot(
  processed_data,
  aes(x = ram, y = battery, color = phone_brand)
) +
  geom_point(alpha = 0.7, size = 3) + # semi-transparent so overlaps stay visible
  # Linear trend line; the fixed black colour overrides the brand mapping for
  # this layer. The formula is spelled out to avoid the default-formula message.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    color = "black",
    linetype = "dashed"
  ) +
  labs(
    title = "RAM vs Battery Capacity Across Phone Brands",
    subtitle = "Relationship between device memory and battery performance",
    x = "RAM (GB)",
    y = "Battery Capacity (mAh)",
    color = "Phone Brand",
    caption = "Data sourced from GSM Arena (2024)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right",
    axis.title = element_text(face = "italic")
  ) +
  scale_color_manual(values = brand_palette)
plot1
## `geom_smooth()` using formula = 'y ~ x'
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Paired is 12
## Returning the palette you asked for with that many colors
## Warning: Removed 1114 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Horizontal bar chart of the top brands by average RAM ----
# Brands are ordered by their average RAM; the legend is hidden because the
# axis labels already name each brand.
plot2 <- top_brands %>%
  ggplot(
    aes(
      x = reorder(phone_brand, avg_ram),
      y = avg_ram,
      fill = phone_brand
    )
  ) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Top 5 Brands with Highest Average RAM",
    x = "Brand",
    y = "Average RAM (GB)",
    caption = "Data Source: Kaggle & GSM Arena"
  ) +
  theme_classic() +
  scale_fill_manual(
    values = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e")
  )
print(plot2)

# Interactive scatter plot (highcharter) ----
# RAM against battery capacity with the points grouped by phone brand; the
# axis titles match the static ggplot version.
processed_data %>%
  hchart(
    "scatter",
    hcaes(x = ram, y = battery, group = phone_brand)
  ) %>%
  hc_title(text = "Interactive RAM vs Battery Capacity") %>%
  hc_xAxis(title = list(text = "RAM (GB)")) %>%
  hc_yAxis(title = list(text = "Battery Capacity (mAh)"))
# Interactive bar chart of the brands with the highest average RAM.
# NOTE(review): the stacking option and the trailing hc_chart(type = "bar")
# look redundant for a single bar series -- confirm before removing them.
top_brands %>%
  hchart(
    "bar",
    hcaes(
      x = reorder(phone_brand, avg_ram),
      y = avg_ram,
      color = phone_brand
    )
  ) %>%
  hc_title(text = "Top 5 Brands with Highest Average RAM") %>%
  hc_xAxis(title = list(text = "Brand")) %>%
  hc_yAxis(title = list(text = "Average RAM (GB)")) %>%
  hc_plotOptions(bar = list(stacking = "normal")) %>%
  hc_colors(c("#1b9e77", "#d95f02", "#7570b3", "#e7298a", "#66a61e")) %>%
  hc_tooltip(pointFormat = "{point.y} GB") %>% # tooltip shows the value in GB
  hc_chart(type = "bar")
# Statistical component: simple linear regression ----
# Model battery capacity as a linear function of RAM, then print the fit
# summary (coefficients, residuals, R-squared).
lm_model <- lm(formula = battery ~ ram, data = processed_data)
summary(lm_model)
##
## Call:
## lm(formula = battery ~ ram, data = processed_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2632.0 -286.2 177.8 423.9 5267.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4329.965 41.760 103.69 <2e-16 ***
## ram 41.016 4.808 8.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 680.5 on 1706 degrees of freedom
## Multiple R-squared: 0.04091, Adjusted R-squared: 0.04035
## F-statistic: 72.77 on 1 and 1706 DF, p-value: < 2.2e-16
# Regression diagnostics ----
# Draw the lm diagnostic plots in a 2 x 2 grid with tighter margins.
# par() returns the previous settings; restore them afterwards so later plots
# in the same session are not stuck in the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))
plot(lm_model)
par(old_par)
