Project#2-Data3

library(tidyr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)
# Location of the raw laptop price CSV in the project's GitHub repository.
url <- "https://raw.githubusercontent.com/AslamF/DATA607-Project-2/refs/heads/main/Laptop%20Price%20Dataset.csv"

# Download the file into a data frame and show a compact column overview.
raw_data <- read.csv(file = url)
raw_data |>
  glimpse()
Rows: 1,303
Columns: 12
$ Unnamed..0       <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
$ Company          <chr> "Apple", "Apple", "HP", "Apple", "Apple", "Acer", "Ap…
$ TypeName         <chr> "Ultrabook", "Ultrabook", "Notebook", "Ultrabook", "U…
$ Inches           <chr> "13.3", "13.3", "15.6", "15.4", "13.3", "15.6", "15.4…
$ ScreenResolution <chr> "IPS Panel Retina Display 2560x1600", "1440x900", "Fu…
$ Cpu              <chr> "Intel Core i5 2.3GHz", "Intel Core i5 1.8GHz", "Inte…
$ Ram              <chr> "8GB", "8GB", "8GB", "16GB", "8GB", "4GB", "16GB", "8…
$ Memory           <chr> "128GB SSD", "128GB Flash Storage", "256GB SSD", "512…
$ Gpu              <chr> "Intel Iris Plus Graphics 640", "Intel HD Graphics 60…
$ OpSys            <chr> "macOS", "macOS", "No OS", "macOS", "macOS", "Windows…
$ Weight           <chr> "1.37kg", "1.34kg", "1.86kg", "1.83kg", "1.37kg", "2.…
$ Price            <dbl> 71378.68, 47895.52, 30636.00, 135195.34, 96095.81, 21…
# Reshape to long format: each specification column becomes one row per
# laptop, with the column name stored in "spec" and its value in "spec_value".
tidy_laptops <- pivot_longer(
  raw_data,
  cols = all_of(c(
    "Inches", "ScreenResolution", "Cpu", "Ram",
    "Memory", "Gpu", "OpSys", "Weight"
  )),
  names_to = "spec",
  values_to = "spec_value"
)


# Standardise column names (lower case, descriptive identifiers) and make sure
# the price column is numeric.
tidy_laptops <- tidy_laptops |>
  rename_with(tolower) |>
  rename(
    laptop_id = unnamed..0,
    brand     = company,
    type      = typename
  ) |>
  mutate(
    # Only strip non-numeric characters when price is not already numeric.
    # Round-tripping a double through as.character() can produce scientific
    # notation (100000 -> "1e+05"); the digit-only regex would then turn that
    # into "105" and silently corrupt the value. It would also drop any sign.
    price = if (is.numeric(price)) {
      price
    } else {
      as.numeric(gsub("[^0-9.]", "", as.character(price)))
    }
  )


# Keep only rows that have a non-missing, non-empty spec value and a price.
tidy_laptops <- filter(
  tidy_laptops,
  !(is.na(spec_value) | spec_value == ""),
  !is.na(price)
)

# Convert price from INR to USD using a fixed rate, rounded to whole cents.
inr_to_usd_rate <- 0.012
tidy_laptops <- mutate(
  tidy_laptops,
  price_usd = round(price * inr_to_usd_rate, 2)
)

# Preview the first 20 rows of the tidied, long-format data.
print(tidy_laptops, n = 20)
# A tibble: 10,184 × 7
   laptop_id brand type       price spec             spec_value        price_usd
       <int> <chr> <chr>      <dbl> <chr>            <chr>                 <dbl>
 1         0 Apple Ultrabook 71379. Inches           13.3                   857.
 2         0 Apple Ultrabook 71379. ScreenResolution IPS Panel Retina…      857.
 3         0 Apple Ultrabook 71379. Cpu              Intel Core i5 2.…      857.
 4         0 Apple Ultrabook 71379. Ram              8GB                    857.
 5         0 Apple Ultrabook 71379. Memory           128GB SSD              857.
 6         0 Apple Ultrabook 71379. Gpu              Intel Iris Plus …      857.
 7         0 Apple Ultrabook 71379. OpSys            macOS                  857.
 8         0 Apple Ultrabook 71379. Weight           1.37kg                 857.
 9         1 Apple Ultrabook 47896. Inches           13.3                   575.
10         1 Apple Ultrabook 47896. ScreenResolution 1440x900               575.
11         1 Apple Ultrabook 47896. Cpu              Intel Core i5 1.…      575.
12         1 Apple Ultrabook 47896. Ram              8GB                    575.
13         1 Apple Ultrabook 47896. Memory           128GB Flash Stor…      575.
14         1 Apple Ultrabook 47896. Gpu              Intel HD Graphic…      575.
15         1 Apple Ultrabook 47896. OpSys            macOS                  575.
16         1 Apple Ultrabook 47896. Weight           1.34kg                 575.
17         2 HP    Notebook  30636  Inches           15.6                   368.
18         2 HP    Notebook  30636  ScreenResolution Full HD 1920x1080      368.
19         2 HP    Notebook  30636  Cpu              Intel Core i5 72…      368.
20         2 HP    Notebook  30636  Ram              8GB                    368.
# ℹ 10,164 more rows

Analysis of average price by RAM

# Mean price (in the original currency column `price`) for each RAM size,
# listed from the most to the least expensive.
ram_avg_price <- tidy_laptops |>
  filter(spec == "Ram") |>
  group_by(spec_value) |>
  summarise(avg_price = round(mean(price, na.rm = TRUE), 2))

print(arrange(ram_avg_price, desc(avg_price)))
# A tibble: 10 × 2
   spec_value avg_price
   <chr>          <dbl>
 1 32GB         181849.
 2 24GB         117553.
 3 64GB         117512.
 4 16GB         103158.
 5 12GB          66037.
 6 8GB           63161.
 7 1GB           53227.
 8 6GB           32826.
 9 4GB           30553.
10 2GB           14757.
# Mean USD price for each RAM size.
ram_avg_usd <- tidy_laptops |>
  filter(spec == "Ram") |>
  group_by(spec_value) |>
  summarise(avg_price = mean(price_usd, na.rm = TRUE))

# Horizontal bar chart with RAM sizes ordered by their mean price.
ggplot(
  ram_avg_usd,
  aes(x = reorder(spec_value, avg_price), y = avg_price)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Average Laptop Price by RAM",
    x = "RAM",
    y = "Average Price (USD)"
  ) +
  theme_minimal()

Conclusion

The dataset required significant cleaning before any analysis could be done. Column names were inconsistent and the price column had non-numeric values. After completing the assignment I was shocked to see a laptop priced at 150,000… This led me to take another look at the data and realize that the prices were being shown in Indian Rupees! I had to go back and mutate the data with an appropriate conversion rate and then show the USD amount, which is easier to interpret and understand. The data is very clear: more RAM means a higher price. 32GB costs a significant premium, and the average user with 8GB of RAM should expect to spend around 750 USD.