R Project

R Markdown

# Load the project dataset from CSV into a data frame.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where that file exists -- consider a project-relative path.
data <- read.csv("C:/Users/Abdul Qudoos/Downloads/R project.csv")

Structure of the data

# Inspect column names and types as parsed by read.csv(). Note that the price
# header was sanitised to `Price_.`, which is the exact name to use later.
str(data)

## 'data.frame':    28 obs. of  7 variables:
##  $ Observation: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Brand      : chr  "Canon" "Canon" "Canon" "Canon" ...
##  $ Price_.    : int  264 160 240 160 144 160 160 104 104 88 ...
##  $ Megapixels : int  10 12 12 10 12 12 14 10 12 16 ...
##  $ Weight_oz  : int  7 5 7 6 5 7 5 7 5 5 ...
##  $ Score      : int  70 70 69 66 66 65 64 64 63 59 ...
##  $ Brand.1    : int  1 1 1 1 1 1 1 1 1 1 ...

Summary statistics for each variable

# Quartiles, mean and range for every numeric column; Brand is a character
# column, so only its length, class and mode are reported.
summary(data)

##   Observation       Brand              Price_.        Megapixels   
##  Min.   : 1.00   Length:28          Min.   : 64.0   Min.   :10.00  
##  1st Qu.: 7.75   Class :character   1st Qu.: 88.0   1st Qu.:12.00  
##  Median :14.50   Mode  :character   Median :128.0   Median :12.00  
##  Mean   :14.50                      Mean   :140.3   Mean   :12.86  
##  3rd Qu.:21.25                      3rd Qu.:160.0   3rd Qu.:14.00  
##  Max.   :28.00                      Max.   :320.0   Max.   :16.00  
##    Weight_oz         Score          Brand.1      
##  Min.   :4.000   Min.   :46.00   Min.   :0.0000  
##  1st Qu.:5.000   1st Qu.:56.00   1st Qu.:0.0000  
##  Median :6.000   Median :60.50   Median :0.0000  
##  Mean   :5.821   Mean   :60.36   Mean   :0.4643  
##  3rd Qu.:7.000   3rd Qu.:65.25   3rd Qu.:1.0000  
##  Max.   :7.000   Max.   :70.00   Max.   :1.0000

Checking the random sampling

# Fix the RNG seed so the two samples below are reproducible
set.seed(123)

# Draw 10 prices per brand without replacement.
# The price column is named `Price_.`; `data$Price` only reached it through
# `$` partial matching, which silently breaks (returns NULL) on tibbles and
# once another column starting with "Price" exists. Use the exact name.
sample_nikon <- sample(data[["Price_."]][data$Brand == "Nikon"], 10)
sample_canon <- sample(data[["Price_."]][data$Brand == "Canon"], 10)

# Plot the sampled prices to inspect the shape of each distribution
# (a histogram shows shape/skew; it does not by itself demonstrate randomness)
hist(sample_nikon, main = "Histogram of Nikon Prices (Sample)")

hist(sample_canon, main = "Histogram of Canon Prices (Sample)")

# Separate prices for Nikon and Canon.
# Index the column by its exact name (`Price_.`) rather than relying on
# `$` partial matching of `data$Price`.
nikon_prices <- data[["Price_."]][data$Brand == "Nikon"]
canon_prices <- data[["Price_."]][data$Brand == "Canon"]

# Independent two-sample t-test. With the defaults used here t.test() runs the
# two-sided Welch variant, i.e. it does not assume equal variances.
t_test_result <- t.test(nikon_prices, canon_prices)

# Display the result
t_test_result

## 
##  Welch Two Sample t-test
## 
## data:  nikon_prices and canon_prices
## t = 0.08931, df = 25.993, p-value = 0.9295
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -49.67709  54.18991
## sample estimates:
## mean of x mean of y 
##  141.3333  139.0769

# Code Weight_oz into three classes: Light (<= 5), Medium (5, 6], Heavy (> 6).
# The outer breaks are open-ended so that a weight outside the currently
# observed 4-7 oz range is still classified instead of silently becoming NA.
data$Weight_Category <- cut(
  data$Weight_oz,
  breaks = c(-Inf, 5, 6, Inf),
  labels = c("Light", "Medium", "Heavy")
)

# Check the distribution of the new variable
table(data$Weight_Category)

## 
##  Light Medium  Heavy 
##     12      7      9

# Load required packages
library(stats)
library(multcomp)

## Warning: package 'multcomp' was built under R version 4.3.2

## Loading required package: mvtnorm

## Warning: package 'mvtnorm' was built under R version 4.3.2

## Loading required package: survival

## Loading required package: TH.data

## Warning: package 'TH.data' was built under R version 4.3.2

## Loading required package: MASS

## 
## Attaching package: 'TH.data'

## The following object is masked from 'package:MASS':
## 
##     geyser

# Ensure data types
# The CSV price column is named `Price_.`. Writing `data$Price` on the
# right-hand side only resolved to it through `$` partial matching, so this
# assignment does not convert in place: it ADDS a numeric `Price` column
# (which the ANOVA formula further down refers to). Make that explicit by
# reading from the exact source column.
data$Price <- as.numeric(data[["Price_."]])
data$Weight_oz <- as.numeric(data$Weight_oz)

# Check data structure
str(data)

## 'data.frame':    28 obs. of  9 variables:
##  $ Observation    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Brand          : chr  "Canon" "Canon" "Canon" "Canon" ...
##  $ Price_.        : int  264 160 240 160 144 160 160 104 104 88 ...
##  $ Megapixels     : int  10 12 12 10 12 12 14 10 12 16 ...
##  $ Weight_oz      : num  7 5 7 6 5 7 5 7 5 5 ...
##  $ Score          : int  70 70 69 66 66 65 64 64 63 59 ...
##  $ Brand.1        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Weight_Category: Factor w/ 3 levels "Light","Medium",..: 3 1 3 2 1 3 1 3 1 1 ...
##  $ Price          : num  264 160 240 160 144 160 160 104 104 88 ...

# Convert weight to categories: Light (<= 5 oz), Medium (5, 6], Heavy (> 6 oz).
# Open-ended outer breaks keep weights outside the observed 4-7 oz range from
# silently turning into NA (and then being dropped from the ANOVA).
data$Weight_Cat <- cut(
  data$Weight_oz,
  breaks = c(-Inf, 5, 6, Inf),
  labels = c("Light Weight", "Medium Weight", "Heavy Weight")
)

# Print data structure
str(data)

## 'data.frame':    28 obs. of  10 variables:
##  $ Observation    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Brand          : chr  "Canon" "Canon" "Canon" "Canon" ...
##  $ Price_.        : int  264 160 240 160 144 160 160 104 104 88 ...
##  $ Megapixels     : int  10 12 12 10 12 12 14 10 12 16 ...
##  $ Weight_oz      : num  7 5 7 6 5 7 5 7 5 5 ...
##  $ Score          : int  70 70 69 66 66 65 64 64 63 59 ...
##  $ Brand.1        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Weight_Category: Factor w/ 3 levels "Light","Medium",..: 3 1 3 2 1 3 1 3 1 1 ...
##  $ Price          : num  264 160 240 160 144 160 160 104 104 88 ...
##  $ Weight_Cat     : Factor w/ 3 levels "Light Weight",..: 3 1 3 2 1 3 1 3 1 1 ...

# Stop if the data frame has no rows to analyse
if (nrow(data) < 1L) {
  stop("Data frame is empty. Please ensure data is loaded correctly.")
}

# Make sure the grouping column is a factor
# (as.factor() hands an existing factor back unchanged, so this is a no-op
# when the column is already encoded correctly)
data$Weight_Cat <- as.factor(data$Weight_Cat)

# One-way ANOVA: does mean Price differ between the weight categories?
anova_result <- aov(
  formula = Price ~ as.factor(Weight_Cat),
  data = data
)

# Print the ANOVA table (Df, sums of squares, F value and p-value)
print(summary(anova_result))

##                       Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Weight_Cat)  2  13124    6562   1.557   0.23
## Residuals             25 105338    4214

# Install required packages (if not already installed)
if (!requireNamespace("ggplot2", quietly = TRUE)) install.packages("ggplot2")
if (!requireNamespace("MASS", quietly = TRUE)) install.packages("MASS")
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.2

library(MASS)  # For the truehist function

# Draw a histogram and a normal Q-Q plot of one standardised column and run a
# Shapiro-Wilk normality test on it.
#
# Args:
#   variable: The column of `data` to check, as a single column index or a
#             single column name.
#   data:     Data frame that holds the column.
#
# Returns:
#   The `htest` object produced by shapiro.test(), invisibly. It is also
#   printed, preceded by a header line naming the column.
#
# Raises:
#   An error if `variable` is not a single value, if the column is not
#   numeric, or if it has fewer than 3 non-missing values or no variation
#   (shapiro.test() cannot be computed in those cases).
check_normality <- function(variable, data) {
  stopifnot(length(variable) == 1L)

  # Resolve a printable column name whether the caller passed an index or a
  # name (names(data)["x"] would yield NA and a "Histogram of NA" title)
  current_variable <- if (is.character(variable)) {
    variable
  } else {
    names(data)[variable]
  }

  # `[[` always returns the column as a plain vector, for data frames and
  # tibbles alike
  values <- data[[variable]]
  if (!is.numeric(values)) {
    stop("Column '", current_variable, "' is not numeric.", call. = FALSE)
  }

  # Fail with a clear message instead of an obscure one from deep inside
  # truehist()/shapiro.test() when the column cannot be tested
  values <- values[!is.na(values)]
  if (length(values) < 3L || length(unique(values)) < 2L) {
    stop(
      "Cannot check normality of '", current_variable,
      "': need at least 3 non-missing values that are not all identical.",
      call. = FALSE
    )
  }

  # Compute standardized scores (mean 0, sd 1) as a plain numeric vector
  standardized_values <- as.numeric(scale(values))

  # Create the histogram using truehist for better visualization
  truehist(
    standardized_values,
    main = paste0("Histogram of ", current_variable),
    col = "skyblue"
  )

  # Q-Q plot with a reference line through the quartiles
  qqnorm(standardized_values, main = paste0("Q-Q Plot of ", current_variable))
  qqline(standardized_values, col = 2)

  # Perform and print Shapiro-Wilk test; label the result with the column
  # name so the printed "data:" line identifies which variable was tested
  shapiro_test <- shapiro.test(standardized_values)
  shapiro_test$data.name <- current_variable
  cat(paste0("Shapiro-Wilk test for normality of ", current_variable, ":\n"))
  print(shapiro_test)

  invisible(shapiro_test)
}

# Check normality for all numeric variables.
# seq_len() iterates zero times on a data frame without columns (1:ncol(data)
# would visit 1 and 0), and `[[` yields a plain vector for tibbles as well.
# NOTE(review): "all numeric" also covers the row identifier (Observation),
# the 0/1 brand indicator (Brand.1) and the duplicated Price column, for which
# a normality test is not meaningful -- consider restricting the columns.
for (i in seq_len(ncol(data))) {
  if (is.numeric(data[[i]])) {
    check_normality(i, data)
  }
}

## Shapiro-Wilk test for normality of Observation:
## 
##  Shapiro-Wilk normality test
## 
## data:  standardized_values
## W = 0.95784, p-value = 0.3095

## Shapiro-Wilk test for normality of Price_.:
## 
##  Shapiro-Wilk normality test
## 
## data:  standardized_values
## W = 0.89739, p-value = 0.009945

## Shapiro-Wilk test for normality of Megapixels:
## 
##  Shapiro-Wilk normality test
## 
## data:  standardized_values
## W = 0.87756, p-value = 0.003549

## Shapiro-Wilk test for normality of Weight_oz:
## 
##  Shapiro-Wilk normality test
## 
## data:  standardized_values
## W = 0.84974, p-value = 0.0009235

## Shapiro-Wilk test for normality of Score:
## 
##  Shapiro-Wilk normality test
## 
## data:  standardized_values
## W = 0.95719, p-value = 0.2985

## Shapiro-Wilk test for normality of Brand.1:
## 
##  Shapiro-Wilk normality test
## 
## data:  standardized_values
## W = 0.63672, p-value = 4.084e-07

## Shapiro-Wilk test for normality of Price:
## 
##  Shapiro-Wilk normality test
## 
## data:  standardized_values
## W = 0.89739, p-value = 0.009945

# Install required package (if not already installed)
if (!requireNamespace("GGally", quietly = TRUE)) install.packages("GGally")

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(GGally)

## Warning: package 'GGally' was built under R version 4.3.2

# Columns that enter the correlation analysis
variables <- c("Megapixels", "Weight_oz", "Score")

# Abort, naming the offenders, if any requested column is absent from the data
missing_vars <- variables[!variables %in% names(data)]

if (length(missing_vars) > 0) {
  stop(paste(
    "The following variables are missing from the dataset:",
    toString(missing_vars)
  ))
}

# Abort if any of the selected columns contains an NA
missing_values <- any(vapply(data[variables], anyNA, logical(1)))

if (missing_values) {
  stop("There are missing values in the selected variables.")
}

# 1. Create matrix scatter plot with Spearman's correlation
#    - lower triangle: pairwise scatter plots
#    - upper triangle: Spearman correlation (ggally_cor defaults to Pearson,
#      so the method has to be passed explicitly through wrap())
#    - diagonal: histogram of each variable ("barDiag"); `diag` is its own
#      ggpairs() argument and has no effect when nested inside `lower`
ggpairs(
  data,
  columns = variables,
  upper = list(continuous = wrap("cor", method = "spearman")),
  lower = list(continuous = "points"),
  diag = list(continuous = "barDiag")
) +
  theme_bw()

# 2. Rank-based (Spearman) correlation between every pair of selected columns
correlation_matrix <- cor(
  x = data[variables],
  method = "spearman"
)

# Show the resulting symmetric matrix
print(correlation_matrix)

##             Megapixels  Weight_oz       Score
## Megapixels  1.00000000 -0.2092513 -0.04233974
## Weight_oz  -0.20925132  1.0000000  0.21404556
## Score      -0.04233974  0.2140456  1.00000000

R Project

Abdul Qudoos

2023-12-12

R Markdown