library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the dataset.
# The CSV location defaults to the original author's path but can be overridden
# with the LAPTOP_PRICES_CSV environment variable, so the script can run on
# other machines without editing the source.
data_path <- Sys.getenv(
  "LAPTOP_PRICES_CSV",
  unset = "/Users/revathiyajjavarapu/Documents/statistics(1)/laptop_prices.csv"
)
# Fail early with an explicit message naming the path that was tried.
if (!file.exists(data_path)) {
  stop("Laptop price data not found at: ", data_path, call. = FALSE)
}
df <- read.csv(data_path)
#The first grouping is by Company, summarizing Price_euros and Weight.
# Load required libraries
library(dplyr)
# Per-manufacturer summary of df: mean/min/max of Price_euros and Weight
# (missing values ignored) plus the number of rows for each Company.
grouped_by_company <- summarise(
  group_by(df, Company),
  Mean_Price = mean(Price_euros, na.rm = TRUE),
  Min_Price = min(Price_euros, na.rm = TRUE),
  Max_Price = max(Price_euros, na.rm = TRUE),
  Mean_Weight = mean(Weight, na.rm = TRUE),
  Min_Weight = min(Weight, na.rm = TRUE),
  Max_Weight = max(Weight, na.rm = TRUE),
  Group_Size = n()
)
# Show the per-company table
print(grouped_by_company)
## # A tibble: 19 × 8
## Company Mean_Price Min_Price Max_Price Mean_Weight Min_Weight Max_Weight
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Acer 633. 174 2599 2.16 1.12 4.2
## 2 Apple 1564. 899. 2858 1.32 0.92 2.04
## 3 Asus 1124. 192. 3975 2.23 0.91 4.7
## 4 Chuwi 314. 245. 449 1.73 1.4 1.89
## 5 Dell 1199. 275. 3659. 2.14 1.18 4.42
## 6 Fujitsu 729 649 799 2.23 2.2 2.3
## 7 Google 1678. 1275 2199 1.1 1.1 1.1
## 8 HP 1080. 209 4389 1.91 0.97 3.78
## 9 Huawei 1424 1349 1499 1.05 1.05 1.05
## 10 LG 2099 1899 2299 1.05 0.98 1.09
## 11 Lenovo 1094. 229 4899 2.01 0.69 4.6
## 12 MSI 1729. 839 2799 2.66 1.6 4.5
## 13 Mediacom 295 239 389 1.34 1.16 1.45
## 14 Microsoft 1612. 989 2589 1.25 1.25 1.25
## 15 Razer 3346. 1029 6099 2.20 1.25 3.49
## 16 Samsung 1413. 269 1849 1.32 0.81 2.5
## 17 Toshiba 1268. 447 2799 1.64 1.05 2.4
## 18 Vero 217. 196 260 1.33 1.22 1.45
## 19 Xiaomi 1133. 935 1400. 1.62 1.28 1.95
## # ℹ 1 more variable: Group_Size <int>
#The second grouping is by OS, summarizing Inches (screen size) and Ram.
# Load required libraries
library(dplyr)
# Per-operating-system summary of df: mean/min/max of Inches (screen size)
# and Ram (missing values ignored) plus the number of rows for each OS.
os_groups <- group_by(df, OS)
grouped_by_os <- os_groups %>%
  summarise(
    Mean_Inches = mean(Inches, na.rm = TRUE),
    Min_Inches = min(Inches, na.rm = TRUE),
    Max_Inches = max(Inches, na.rm = TRUE),
    Mean_Ram = mean(Ram, na.rm = TRUE),
    Min_Ram = min(Ram, na.rm = TRUE),
    Max_Ram = max(Ram, na.rm = TRUE),
    Group_Size = n()
  )
# Show the per-OS table
print(grouped_by_os)
## # A tibble: 9 × 8
## OS Mean_Inches Min_Inches Max_Inches Mean_Ram Min_Ram Max_Ram Group_Size
## <chr> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 Android 10.1 10.1 10.1 4 4 4 2
## 2 Chrome … 12.7 11.6 15.6 4.81 2 16 27
## 3 Linux 15.6 13.3 17.3 6.28 4 16 58
## 4 Mac OS X 12.7 11.6 15.4 8.5 4 16 8
## 5 No OS 15.7 13.3 17.3 6.42 4 16 66
## 6 Windows… 15.1 10.1 18.4 8.81 2 64 1048
## 7 Windows… 13.3 11.6 14 7 4 16 8
## 8 Windows… 14.5 12.5 17.3 7.91 4 16 45
## 9 macOS 13.6 12 15.4 9.85 8 16 13
#The third grouping combines TypeName and Touchscreen, summarizing Price_euros.
# Load required libraries
library(dplyr)
# Price summary of df for every TypeName/Touchscreen pair: mean/min/max of
# Price_euros (missing values ignored) plus the number of rows in the pair.
# NOTE: summarise() only drops the last grouping level, so the result is still
# grouped by TypeName (see the message below); later per-table computations
# on it run within each TypeName unless ungroup() is applied first.
grouped_by_type_touchscreen <- summarise(
  group_by(df, TypeName, Touchscreen),
  Mean_Price = mean(Price_euros, na.rm = TRUE),
  Min_Price = min(Price_euros, na.rm = TRUE),
  Max_Price = max(Price_euros, na.rm = TRUE),
  Group_Size = n()
)
## `summarise()` has grouped output by 'TypeName'. You can override using the
## `.groups` argument.
# Show the per-type/touchscreen table
print(grouped_by_type_touchscreen)
## # A tibble: 12 × 6
## # Groups: TypeName [6]
## TypeName Touchscreen Mean_Price Min_Price Max_Price Group_Size
## <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 2 in 1 Convertible No 1029 899 1159 2
## 2 2 in 1 Convertible Yes 1294. 275 2824 115
## 3 Gaming No 1700. 699 3975 200
## 4 Gaming Yes 2999 1029 6099 5
## 5 Netbook No 636. 174 1908 21
## 6 Netbook Yes 1062. 475 1650 2
## 7 Notebook No 776. 196 4899 688
## 8 Notebook Yes 1266. 439 2639 19
## 9 Ultrabook No 1499. 499 2858 149
## 10 Ultrabook Yes 1749. 899 3100 45
## 11 Workstation No 2284. 1369 4389 27
## 12 Workstation Yes 2238. 1763 2712 2
# Company group(s) with the fewest rows; all ties for the minimum are kept.
lowest_prob_company <- filter(
  grouped_by_company,
  Group_Size == min(Group_Size)
)
print(lowest_prob_company)
## # A tibble: 1 × 8
## Company Mean_Price Min_Price Max_Price Mean_Weight Min_Weight Max_Weight
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Huawei 1424 1349 1499 1.05 1.05 1.05
## # ℹ 1 more variable: Group_Size <int>
# Operating-system group(s) with the fewest rows; all ties for the minimum
# are kept.
lowest_prob_os <- filter(
  grouped_by_os,
  Group_Size == min(Group_Size)
)
print(lowest_prob_os)
## # A tibble: 1 × 8
## OS Mean_Inches Min_Inches Max_Inches Mean_Ram Min_Ram Max_Ram Group_Size
## <chr> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 Android 10.1 10.1 10.1 4 4 4 2
# For the TypeName and Touchscreen grouping:
# grouped_by_type_touchscreen is still grouped by TypeName, so a bare filter()
# would evaluate min(Group_Size) separately inside each TypeName and return
# one row per type (six rows, including groups of 19 and 45 laptops).
# ungroup() first so the minimum is taken across all 12 combinations.
lowest_prob_type_touchscreen <- grouped_by_type_touchscreen %>%
  ungroup() %>%
  filter(Group_Size == min(Group_Size))
print(lowest_prob_type_touchscreen)
## # A tibble: 3 × 6
## TypeName Touchscreen Mean_Price Min_Price Max_Price Group_Size
## <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 2 in 1 Convertible No 1029 899 1159 2
## 2 Netbook Yes 1062. 475 1650 2
## 3 Workstation Yes 2238. 1763 2712 2
# Hypothesis: Smaller groups represent niche products (e.g., specific brands, specialized operating systems, or high-end touchscreen laptops), and these groups may have higher average prices.
# Testable hypothesis: Niche products are rarer and tend to be more expensive due to limited production and specialized features.
# Correlation between Group_Size and Mean_Price for the 'Company' grouping.
# Pearson correlation test on the per-company summary table: each observation
# is one company (19 rows), not one laptop. The result is auto-printed.
cor.test(grouped_by_company$Group_Size, grouped_by_company$Mean_Price)
##
## Pearson's product-moment correlation
##
## data: grouped_by_company$Group_Size and grouped_by_company$Mean_Price
## t = -0.49325, df = 17, p-value = 0.6281
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5436603 0.3545556
## sample estimates:
## cor
## -0.1187827
# Linear regression of Mean_Price on Group_Size across companies.
# Auto-printing the fit shows only the call and the coefficients; no
# significance test is reported here. Wrap the call in summary() to see
# standard errors and p-values.
lm(Mean_Price ~ Group_Size, data = grouped_by_company)
##
## Call:
## lm(formula = Mean_Price ~ Group_Size, data = grouped_by_company)
##
## Coefficients:
## (Intercept) Group_Size
## 1316.2356 -0.8275
# Same analysis for the TypeName + Touchscreen grouping.
# Pearson correlation test between Group_Size and Mean_Price on the summary
# table: each observation is one TypeName/Touchscreen pair (12 rows).
cor.test(grouped_by_type_touchscreen$Group_Size, grouped_by_type_touchscreen$Mean_Price)
##
## Pearson's product-moment correlation
##
## data: grouped_by_type_touchscreen$Group_Size and grouped_by_type_touchscreen$Mean_Price
## t = -1.19, df = 10, p-value = 0.2615
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7703893 0.2778566
## sample estimates:
## cor
## -0.3522091
# Linear regression of Mean_Price on Group_Size across TypeName/Touchscreen
# pairs. Auto-printing the fit shows only the call and the coefficients; wrap
# the call in summary() to see standard errors and p-values.
lm(Mean_Price ~ Group_Size, data = grouped_by_type_touchscreen)
##
## Call:
## lm(formula = Mean_Price ~ Group_Size, data = grouped_by_type_touchscreen)
##
## Coefficients:
## (Intercept) Group_Size
## 1677.197 -1.252
# Scatter plot of per-company mean price against group size, overlaid with a
# straight-line lm fit drawn in blue without a confidence band.
ggplot(
  data = grouped_by_company,
  mapping = aes(x = Group_Size, y = Mean_Price)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  ggtitle("Relationship between Group Size and Mean Price") +
  xlab("Group Size (Rarity)") +
  ylab("Mean Price (Euros)")
## `geom_smooth()` using formula = 'y ~ x'
# Visualization: helps to visualize the relationship between rarity (group
# size) and price.
# Bar chart of the mean price for each company; x-axis labels are tilted 45
# degrees so the company names do not overlap.
ggplot(grouped_by_company, aes(x = Company, y = Mean_Price)) +
  geom_col(fill = "skyblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Mean Price by Company",
    x = "Company",
    y = "Mean Price (Euros)"
  )
# Brands like Apple may have higher average prices and lighter
# devices.
# Bar chart of the mean RAM for each operating system; x-axis labels are
# tilted 45 degrees so the OS names do not overlap.
ggplot(grouped_by_os, aes(x = OS, y = Mean_Ram)) +
  geom_col(fill = "lightgreen") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Mean RAM by Operating System",
    x = "Operating System",
    y = "Mean RAM (GB)"
  )
# Operating systems like macOS or Linux may correspond to higher screen
# size and RAM specifications.
# Combine type and touchscreen flag into a single axis label, e.g.
# "Gaming (Touchscreen: Yes)". paste0() with explicit spacing avoids the stray
# blank that paste()'s default " " separator inserted before the closing
# parenthesis ("Gaming (Touchscreen: Yes )").
grouped_by_type_touchscreen$Type_Touch <- paste0(
  grouped_by_type_touchscreen$TypeName,
  " (Touchscreen: ",
  grouped_by_type_touchscreen$Touchscreen,
  ")"
)
# Bar chart of the mean price for each type/touchscreen pair; labels are
# rotated 90 degrees because they are long.
ggplot(grouped_by_type_touchscreen, aes(x = Type_Touch, y = Mean_Price)) +
  geom_col(fill = "coral") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Mean Price by Type and Touchscreen Capability",
    x = "Type and Touchscreen",
    y = "Mean Price (Euros)"
  )
# Here touchscreen laptops, especially Ultrabooks, tend to be more
# expensive due to additional features.
# Load required libraries
library(dplyr)
library(ggplot2)
# 1. Pick two categorical variables
cat_var1 <- "Company"
cat_var2 <- "TypeName"
# Both names are used everywhere below, so changing the two variables above
# is enough to analyse a different pair of columns.
combo_vars <- c(cat_var1, cat_var2)
# 2. Build a data frame of all combinations of these two variables.
# stringsAsFactors = FALSE keeps the columns as character vectors
# (expand.grid() converts strings to factors by default), so the join keys
# have the same type as the corresponding columns of df.
all_combinations <- expand.grid(
  lapply(df[combo_vars], unique),
  stringsAsFactors = FALSE
)
# 3. Find combinations that do not exist in the data.
# Distinct pairs that actually occur in the dataset:
actual_combinations <- distinct(df[combo_vars])
# anti_join() keeps the rows of all_combinations with no match in
# actual_combinations, i.e. the pairs that never occur.
missing_combinations <- anti_join(
  all_combinations,
  actual_combinations,
  by = combo_vars
)
# Display missing combinations (if any)
print("Missing combinations:")
## [1] "Missing combinations:"
print(missing_combinations)
## Company TypeName
## 1 Chuwi Ultrabook
## 2 MSI Ultrabook
## 3 Vero Ultrabook
## 4 Mediacom Ultrabook
## 5 Fujitsu Ultrabook
## 6 Apple Notebook
## 7 MSI Notebook
## 8 Microsoft Notebook
## 9 Huawei Notebook
## 10 Razer Notebook
## 11 Google Notebook
## 12 LG Notebook
## 13 Apple Netbook
## 14 Chuwi Netbook
## 15 MSI Netbook
## 16 Microsoft Netbook
## 17 Toshiba Netbook
## 18 Huawei Netbook
## 19 Xiaomi Netbook
## 20 Vero Netbook
## 21 Razer Netbook
## 22 Mediacom Netbook
## 23 Google Netbook
## 24 Fujitsu Netbook
## 25 LG Netbook
## 26 Apple Gaming
## 27 Chuwi Gaming
## 28 Microsoft Gaming
## 29 Toshiba Gaming
## 30 Huawei Gaming
## 31 Xiaomi Gaming
## 32 Vero Gaming
## 33 Mediacom Gaming
## 34 Samsung Gaming
## 35 Google Gaming
## 36 Fujitsu Gaming
## 37 LG Gaming
## 38 Apple 2 in 1 Convertible
## 39 Chuwi 2 in 1 Convertible
## 40 MSI 2 in 1 Convertible
## 41 Microsoft 2 in 1 Convertible
## 42 Toshiba 2 in 1 Convertible
## 43 Huawei 2 in 1 Convertible
## 44 Xiaomi 2 in 1 Convertible
## 45 Vero 2 in 1 Convertible
## 46 Razer 2 in 1 Convertible
## 47 Google 2 in 1 Convertible
## 48 Fujitsu 2 in 1 Convertible
## 49 LG 2 in 1 Convertible
## 50 Apple Workstation
## 51 Acer Workstation
## 52 Asus Workstation
## 53 Chuwi Workstation
## 54 MSI Workstation
## 55 Microsoft Workstation
## 56 Toshiba Workstation
## 57 Huawei Workstation
## 58 Xiaomi Workstation
## 59 Vero Workstation
## 60 Razer Workstation
## 61 Mediacom Workstation
## 62 Samsung Workstation
## 63 Google Workstation
## 64 Fujitsu Workstation
## 65 LG Workstation
# 4. Count the most and least common combinations.
# Tally the rows of df for each Company/TypeName pair and order the pairs
# from most to least frequent. The result stays grouped by both columns.
combination_counts <- arrange(
  count(group_by(df, Company, TypeName)),
  desc(n)
)
# Display the most common combinations (first six rows)
print("Most common combinations:")
## [1] "Most common combinations:"
combination_counts %>%
  head() %>%
  print()
## # A tibble: 6 × 3
## # Groups: Company, TypeName [6]
## Company TypeName n
## <chr> <chr> <int>
## 1 HP Notebook 180
## 2 Lenovo Notebook 174
## 3 Dell Notebook 159
## 4 Acer Notebook 76
## 5 Asus Notebook 63
## 6 Asus Gaming 54
# Display the least common combinations (last six rows of the sorted counts)
print("Least common combinations:")
## [1] "Least common combinations:"
combination_counts %>%
  tail() %>%
  print()
## # A tibble: 6 × 3
## # Groups: Company, TypeName [6]
## Company TypeName n
## <chr> <chr> <int>
## 1 Samsung 2 in 1 Convertible 2
## 2 Xiaomi Notebook 2
## 3 Xiaomi Ultrabook 2
## 4 Mediacom 2 in 1 Convertible 1
## 5 Samsung Netbook 1
## 6 Samsung Notebook 1
# 5. Visualize the combinations.
# Bar chart of the row count for each Company/TypeName pair; interaction()
# merges the two columns into one x-axis category and the labels are rotated
# 90 degrees because there are many of them.
ggplot(
  combination_counts,
  aes(x = interaction(Company, TypeName), y = n)
) +
  geom_col(fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    x = "Company and TypeName Combinations",
    y = "Count",
    title = "Frequency of Company-Type Combinations"
  )
# Build combinations:
# expand.grid() creates a dataframe of all possible
# combinations between Company and TypeName.
#anti_join() is used to find combinations that are in all_combinations but not in actual_combinations.
#group_by() and count() calculate the frequency of each combination in the dataset. arrange(desc(n)) sorts them by count, so you can see the most and least common combinations.
#ggplot() is used to create a bar plot showing the frequency of each combination. interaction() combines the two categorical variables into a single axis label.
# Market strategy: Some combinations may be missing due to strategic decisions by the company. For instance, a company might not offer a specific type of product.
# Demand and production: Certain combinations may not exist due to low market demand or production constraints.