library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

# Load the dataset.
# The CSV location can be overridden with the LAPTOP_PRICES_CSV environment
# variable so the script also runs on other machines; when the variable is
# not set, the original absolute path is used.
data_path <- Sys.getenv(
  "LAPTOP_PRICES_CSV",
  unset = "/Users/revathiyajjavarapu/Documents/statistics(1)/laptop_prices.csv"
)
# Fail early with a message that names the path that was tried.
if (!file.exists(data_path)) {
  stop("Dataset not found: ", data_path, call. = FALSE)
}
df <- read.csv(data_path)

Group by Company, Summarize by Price_euros and Weight

#The first grouping is by Company, summarizing Price_euros and Weight.

# Load required libraries
library(dplyr)

# Per-Company summary of `df`: mean, minimum and maximum of Price_euros and
# Weight (missing values ignored) plus the number of rows in each group.
grouped_by_company <- summarise(
  group_by(df, Company),
  Mean_Price  = mean(Price_euros, na.rm = TRUE),
  Min_Price   = min(Price_euros, na.rm = TRUE),
  Max_Price   = max(Price_euros, na.rm = TRUE),
  Mean_Weight = mean(Weight, na.rm = TRUE),
  Min_Weight  = min(Weight, na.rm = TRUE),
  Max_Weight  = max(Weight, na.rm = TRUE),
  Group_Size  = n()
)

# Show the summary table
print(grouped_by_company)

## # A tibble: 19 × 8
##    Company   Mean_Price Min_Price Max_Price Mean_Weight Min_Weight Max_Weight
##    <chr>          <dbl>     <dbl>     <dbl>       <dbl>      <dbl>      <dbl>
##  1 Acer            633.      174      2599         2.16       1.12       4.2 
##  2 Apple          1564.      899.     2858         1.32       0.92       2.04
##  3 Asus           1124.      192.     3975         2.23       0.91       4.7 
##  4 Chuwi           314.      245.      449         1.73       1.4        1.89
##  5 Dell           1199.      275.     3659.        2.14       1.18       4.42
##  6 Fujitsu         729       649       799         2.23       2.2        2.3 
##  7 Google         1678.     1275      2199         1.1        1.1        1.1 
##  8 HP             1080.      209      4389         1.91       0.97       3.78
##  9 Huawei         1424      1349      1499         1.05       1.05       1.05
## 10 LG             2099      1899      2299         1.05       0.98       1.09
## 11 Lenovo         1094.      229      4899         2.01       0.69       4.6 
## 12 MSI            1729.      839      2799         2.66       1.6        4.5 
## 13 Mediacom        295       239       389         1.34       1.16       1.45
## 14 Microsoft      1612.      989      2589         1.25       1.25       1.25
## 15 Razer          3346.     1029      6099         2.20       1.25       3.49
## 16 Samsung        1413.      269      1849         1.32       0.81       2.5 
## 17 Toshiba        1268.      447      2799         1.64       1.05       2.4 
## 18 Vero            217.      196       260         1.33       1.22       1.45
## 19 Xiaomi         1133.      935      1400.        1.62       1.28       1.95
## # ℹ 1 more variable: Group_Size <int>

Group by OS, Summarize by Inches and Ram

#The second grouping is by OS, summarizing Inches (screen size) and Ram.

# Load required libraries
library(dplyr)

# Per-OS summary of `df`: mean, minimum and maximum of Inches and Ram
# (missing values ignored) plus the number of rows in each group.
grouped_by_os <- summarise(
  group_by(df, OS),
  Mean_Inches = mean(Inches, na.rm = TRUE),
  Min_Inches  = min(Inches, na.rm = TRUE),
  Max_Inches  = max(Inches, na.rm = TRUE),
  Mean_Ram    = mean(Ram, na.rm = TRUE),
  Min_Ram     = min(Ram, na.rm = TRUE),
  Max_Ram     = max(Ram, na.rm = TRUE),
  Group_Size  = n()
)

# Show the summary table
print(grouped_by_os)

## # A tibble: 9 × 8
##   OS       Mean_Inches Min_Inches Max_Inches Mean_Ram Min_Ram Max_Ram Group_Size
##   <chr>          <dbl>      <dbl>      <dbl>    <dbl>   <int>   <int>      <int>
## 1 Android         10.1       10.1       10.1     4          4       4          2
## 2 Chrome …        12.7       11.6       15.6     4.81       2      16         27
## 3 Linux           15.6       13.3       17.3     6.28       4      16         58
## 4 Mac OS X        12.7       11.6       15.4     8.5        4      16          8
## 5 No OS           15.7       13.3       17.3     6.42       4      16         66
## 6 Windows…        15.1       10.1       18.4     8.81       2      64       1048
## 7 Windows…        13.3       11.6       14       7          4      16          8
## 8 Windows…        14.5       12.5       17.3     7.91       4      16         45
## 9 macOS           13.6       12         15.4     9.85       8      16         13

Group by TypeName and Touchscreen, Summarize by Price_euros

#The third grouping combines TypeName and Touchscreen, summarizing Price_euros.

# Load required libraries
library(dplyr)

# Summary of `df` for every TypeName / Touchscreen pair: mean, minimum and
# maximum of Price_euros (missing values ignored) plus the row count.
# No `.groups` argument is given, so the result stays grouped by TypeName
# (see the message below).
grouped_by_type_touchscreen <- summarise(
  group_by(df, TypeName, Touchscreen),
  Mean_Price = mean(Price_euros, na.rm = TRUE),
  Min_Price  = min(Price_euros, na.rm = TRUE),
  Max_Price  = max(Price_euros, na.rm = TRUE),
  Group_Size = n()
)

## `summarise()` has grouped output by 'TypeName'. You can override using the
## `.groups` argument.

# Show the summary table
print(grouped_by_type_touchscreen)

## # A tibble: 12 × 6
## # Groups:   TypeName [6]
##    TypeName           Touchscreen Mean_Price Min_Price Max_Price Group_Size
##    <chr>              <chr>            <dbl>     <dbl>     <dbl>      <int>
##  1 2 in 1 Convertible No               1029        899      1159          2
##  2 2 in 1 Convertible Yes              1294.       275      2824        115
##  3 Gaming             No               1700.       699      3975        200
##  4 Gaming             Yes              2999       1029      6099          5
##  5 Netbook            No                636.       174      1908         21
##  6 Netbook            Yes              1062.       475      1650          2
##  7 Notebook           No                776.       196      4899        688
##  8 Notebook           Yes              1266.       439      2639         19
##  9 Ultrabook          No               1499.       499      2858        149
## 10 Ultrabook          Yes              1749.       899      3100         45
## 11 Workstation        No               2284.      1369      4389         27
## 12 Workstation        Yes              2238.      1763      2712          2

Identify and Tag Lowest Probability Groups

For Company Grouping:

# Keep the Company row(s) whose Group_Size equals the smallest Group_Size.
lowest_prob_company <- filter(
  grouped_by_company,
  Group_Size == min(Group_Size)
)
print(lowest_prob_company)

## # A tibble: 1 × 8
##   Company Mean_Price Min_Price Max_Price Mean_Weight Min_Weight Max_Weight
##   <chr>        <dbl>     <dbl>     <dbl>       <dbl>      <dbl>      <dbl>
## 1 Huawei        1424      1349      1499        1.05       1.05       1.05
## # ℹ 1 more variable: Group_Size <int>

For OS Grouping:

# Keep the OS row(s) whose Group_Size equals the smallest Group_Size.
lowest_prob_os <- filter(
  grouped_by_os,
  Group_Size == min(Group_Size)
)
print(lowest_prob_os)

## # A tibble: 1 × 8
##   OS      Mean_Inches Min_Inches Max_Inches Mean_Ram Min_Ram Max_Ram Group_Size
##   <chr>         <dbl>      <dbl>      <dbl>    <dbl>   <int>   <int>      <int>
## 1 Android        10.1       10.1       10.1        4       4       4          2

#For TypeName and Touchscreen Grouping:

# grouped_by_type_touchscreen is still grouped by TypeName (summarise() only
# dropped the last grouping level), so filtering it directly compares each
# row with the minimum inside its own TypeName and returns one row per type.
# Ungroup first so that, as in the Company and OS sections, only the
# smallest group(s) overall are kept.
lowest_prob_type_touchscreen <- grouped_by_type_touchscreen %>%
  ungroup() %>%
  filter(Group_Size == min(Group_Size))
print(lowest_prob_type_touchscreen)

## # A tibble: 3 × 6
##   TypeName           Touchscreen Mean_Price Min_Price Max_Price Group_Size
##   <chr>              <chr>            <dbl>     <dbl>     <dbl>      <int>
## 1 2 in 1 Convertible No               1029        899      1159          2
## 2 Netbook            Yes              1062.       475      1650          2
## 3 Workstation        Yes              2238.      1763      2712          2

# Hypothesis: Smaller groups represent niche products (e.g., specific brands, specialized operating systems, or high-end touchscreen laptops), and these groups may have higher average prices.
# Testable hypothesis: Niche products are rarer and tend to be more expensive due to limited production and specialized features.

Hypothesis

#Correlation between Group_Size and Mean_Price for the ‘Company’ group

# Correlation test between the size of each Company group and its mean price
cor.test(
  x = grouped_by_company$Group_Size,
  y = grouped_by_company$Mean_Price
)

## 
##  Pearson's product-moment correlation
## 
## data:  grouped_by_company$Group_Size and grouped_by_company$Mean_Price
## t = -0.49325, df = 17, p-value = 0.6281
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5436603  0.3545556
## sample estimates:
##        cor 
## -0.1187827

# Linear fit of mean price on group size; printing the fit shows only the
# call and the estimated coefficients.
lm(formula = Mean_Price ~ Group_Size, data = grouped_by_company)

## 
## Call:
## lm(formula = Mean_Price ~ Group_Size, data = grouped_by_company)
## 
## Coefficients:
## (Intercept)   Group_Size  
##   1316.2356      -0.8275

#group by TypeName + Touchscreen

# Correlation test between the size of each TypeName/Touchscreen group and
# its mean price
cor.test(
  x = grouped_by_type_touchscreen$Group_Size,
  y = grouped_by_type_touchscreen$Mean_Price
)

## 
##  Pearson's product-moment correlation
## 
## data:  grouped_by_type_touchscreen$Group_Size and grouped_by_type_touchscreen$Mean_Price
## t = -1.19, df = 10, p-value = 0.2615
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7703893  0.2778566
## sample estimates:
##        cor 
## -0.3522091

# Linear fit of mean price on group size; printing the fit shows only the
# call and the estimated coefficients.
lm(formula = Mean_Price ~ Group_Size, data = grouped_by_type_touchscreen)

## 
## Call:
## lm(formula = Mean_Price ~ Group_Size, data = grouped_by_type_touchscreen)
## 
## Coefficients:
## (Intercept)   Group_Size  
##    1677.197       -1.252

Scatter plot with regression line

# One point per Company (group size vs. mean price) with a straight
# least-squares line and no confidence band.
ggplot(
  data = grouped_by_company,
  mapping = aes(x = Group_Size, y = Mean_Price)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  ggtitle("Relationship between Group Size and Mean Price") +
  xlab("Group Size (Rarity)") +
  ylab("Mean Price (Euros)")

## `geom_smooth()` using formula = 'y ~ x'

# Visualization: helps to show the relationship between rarity (group size) and price.

Visualization for Company Grouping

# Bar per Company with height equal to its mean price; x labels are tilted
# 45 degrees.
ggplot(grouped_by_company, aes(x = Company, y = Mean_Price)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Mean Price by Company",
    x = "Company",
    y = "Mean Price (Euros)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Brands like Apple may have higher average prices and lighter devices.

Visualization for OS Grouping

# Bar per OS with height equal to its mean RAM; x labels are tilted
# 45 degrees.
ggplot(grouped_by_os, aes(x = OS, y = Mean_Ram)) +
  geom_col(fill = "lightgreen") +
  labs(
    title = "Mean RAM by Operating System",
    x = "Operating System",
    y = "Mean RAM (GB)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

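# Operating systems may correspond to different screen size and RAM specifications: in the summary table above, macOS has the highest mean RAM (9.85 GB), while Linux and "No OS" laptops have the largest mean screen sizes (about 15.6-15.7 inches) but lower mean RAM (about 6.3-6.4 GB).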

Visualization for TypeName and Touchscreen Grouping

# Build one x-axis label per TypeName/Touchscreen pair.
# sprintf() is used instead of paste(): paste()'s default separator put a
# stray space before the closing parenthesis ("Gaming (Touchscreen: Yes )").
grouped_by_type_touchscreen$Type_Touch <- sprintf(
  "%s (Touchscreen: %s)",
  grouped_by_type_touchscreen$TypeName,
  grouped_by_type_touchscreen$Touchscreen
)

# Bar per label with height equal to the mean price; labels are rotated
# 90 degrees because they are long.
ggplot(grouped_by_type_touchscreen, aes(x = Type_Touch, y = Mean_Price)) +
  geom_bar(stat = "identity", fill = "coral") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Mean Price by Type and Touchscreen Capability",
    x = "Type and Touchscreen",
    y = "Mean Price (Euros)"
  )

#Here Touchscreen laptops, especially Ultrabooks, tend to be more expensive due to additional features.

# Load required libraries
library(dplyr)
library(ggplot2)
# 1. Pick two categorical variables (column names of `df`).
cat_var1 <- "Company"
cat_var2 <- "TypeName"
# Every later step uses this vector, so changing the two names above is
# enough to analyse a different pair of columns.
cat_vars <- c(cat_var1, cat_var2)

# 2. Build a data frame of all combinations of the observed values.
# The list keeps the column names from `df`; stringsAsFactors = FALSE keeps
# the columns as plain values so they have the same type as in `df` when
# joined below (expand.grid() would otherwise turn strings into factors).
all_combinations <- expand.grid(
  lapply(df[cat_vars], unique),
  stringsAsFactors = FALSE
)

# 3. Find combinations that do not exist in the data.
# Distinct pairs that actually occur in the dataset
actual_combinations <- df %>%
  select(all_of(cat_vars)) %>%
  distinct()

# Rows of all_combinations with no matching pair in actual_combinations
missing_combinations <- anti_join(
  all_combinations,
  actual_combinations,
  by = cat_vars
)

# Display missing combinations (if any)
print("Missing combinations:")

## [1] "Missing combinations:"

print(missing_combinations)

##      Company           TypeName
## 1      Chuwi          Ultrabook
## 2        MSI          Ultrabook
## 3       Vero          Ultrabook
## 4   Mediacom          Ultrabook
## 5    Fujitsu          Ultrabook
## 6      Apple           Notebook
## 7        MSI           Notebook
## 8  Microsoft           Notebook
## 9     Huawei           Notebook
## 10     Razer           Notebook
## 11    Google           Notebook
## 12        LG           Notebook
## 13     Apple            Netbook
## 14     Chuwi            Netbook
## 15       MSI            Netbook
## 16 Microsoft            Netbook
## 17   Toshiba            Netbook
## 18    Huawei            Netbook
## 19    Xiaomi            Netbook
## 20      Vero            Netbook
## 21     Razer            Netbook
## 22  Mediacom            Netbook
## 23    Google            Netbook
## 24   Fujitsu            Netbook
## 25        LG            Netbook
## 26     Apple             Gaming
## 27     Chuwi             Gaming
## 28 Microsoft             Gaming
## 29   Toshiba             Gaming
## 30    Huawei             Gaming
## 31    Xiaomi             Gaming
## 32      Vero             Gaming
## 33  Mediacom             Gaming
## 34   Samsung             Gaming
## 35    Google             Gaming
## 36   Fujitsu             Gaming
## 37        LG             Gaming
## 38     Apple 2 in 1 Convertible
## 39     Chuwi 2 in 1 Convertible
## 40       MSI 2 in 1 Convertible
## 41 Microsoft 2 in 1 Convertible
## 42   Toshiba 2 in 1 Convertible
## 43    Huawei 2 in 1 Convertible
## 44    Xiaomi 2 in 1 Convertible
## 45      Vero 2 in 1 Convertible
## 46     Razer 2 in 1 Convertible
## 47    Google 2 in 1 Convertible
## 48   Fujitsu 2 in 1 Convertible
## 49        LG 2 in 1 Convertible
## 50     Apple        Workstation
## 51      Acer        Workstation
## 52      Asus        Workstation
## 53     Chuwi        Workstation
## 54       MSI        Workstation
## 55 Microsoft        Workstation
## 56   Toshiba        Workstation
## 57    Huawei        Workstation
## 58    Xiaomi        Workstation
## 59      Vero        Workstation
## 60     Razer        Workstation
## 61  Mediacom        Workstation
## 62   Samsung        Workstation
## 63    Google        Workstation
## 64   Fujitsu        Workstation
## 65        LG        Workstation

# 4. Number of rows for every Company/TypeName pair, sorted from the most
# to the least frequent.
combination_counts <- arrange(
  count(group_by(df, Company, TypeName)),
  desc(n)
)

# First rows of the sorted table, i.e. the most frequent pairs
print("Most common combinations:")

## [1] "Most common combinations:"

print(head(combination_counts))

## # A tibble: 6 × 3
## # Groups:   Company, TypeName [6]
##   Company TypeName     n
##   <chr>   <chr>    <int>
## 1 HP      Notebook   180
## 2 Lenovo  Notebook   174
## 3 Dell    Notebook   159
## 4 Acer    Notebook    76
## 5 Asus    Notebook    63
## 6 Asus    Gaming      54

# Last rows of the sorted table, i.e. the least frequent pairs
print("Least common combinations:")

## [1] "Least common combinations:"

print(tail(combination_counts, n = 6L))

## # A tibble: 6 × 3
## # Groups:   Company, TypeName [6]
##   Company  TypeName               n
##   <chr>    <chr>              <int>
## 1 Samsung  2 in 1 Convertible     2
## 2 Xiaomi   Notebook               2
## 3 Xiaomi   Ultrabook              2
## 4 Mediacom 2 in 1 Convertible     1
## 5 Samsung  Netbook                1
## 6 Samsung  Notebook               1

# 5. Visualize the combinations.
# One bar per Company/TypeName pair (joined into a single axis value by
# interaction()), with height equal to the pair's count.
ggplot(
  combination_counts,
  aes(x = interaction(Company, TypeName), y = n)
) +
  geom_col(fill = "steelblue") +
  ggtitle("Frequency of Company-Type Combinations") +
  xlab("Company and TypeName Combinations") +
  ylab("Count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Build combinations: expand.grid() creates a dataframe of all possible combinations of Company and TypeName.

#anti_join() is used to find combinations that are in all_combinations but not in actual_combinations.

#group_by() and count() calculate the frequency of each combination in the dataset. arrange(desc(n)) sorts them by count, so you can see the most and least common combinations.

#ggplot() is used to create a bar plot showing the frequency of each combination. interaction() combines the two categorical variables into a single axis label.

Hypothesis for Missing Combinations:

# Market strategy: Some combinations may be missing due to strategic decisions by the company. For instance, a company might not offer a specific type of product.
# Demand and production: Certain combinations may not exist due to low market demand or production constraints.