# Load necessary libraries (if not already loaded)
options(repos = c(CRAN = "https://cloud.r-project.org/")) # Set CRAN mirror
# install.packages("dplyr") # Uncomment if dplyr is not installed
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the dataset from the Downloads folder
data <- read.csv("C:/Users/pulud/Downloads/WA_Marketing-Campaign.csv")
# Summary statistics for the numeric variables
summary_stats <- data %>%
  select(LocationID, AgeOfStore, Promotion, week, SalesInThousands) %>%
  summarise(
    LocationID_mean = mean(LocationID, na.rm = TRUE),
    LocationID_variance = var(LocationID, na.rm = TRUE),
    LocationID_min = min(LocationID, na.rm = TRUE),
    LocationID_max = max(LocationID, na.rm = TRUE),
    AgeOfStore_mean = mean(AgeOfStore, na.rm = TRUE),
    AgeOfStore_variance = var(AgeOfStore, na.rm = TRUE),
    AgeOfStore_min = min(AgeOfStore, na.rm = TRUE),
    AgeOfStore_max = max(AgeOfStore, na.rm = TRUE),
    Promotion_mean = mean(Promotion, na.rm = TRUE),
    Promotion_variance = var(Promotion, na.rm = TRUE),
    Promotion_min = min(Promotion, na.rm = TRUE),
    Promotion_max = max(Promotion, na.rm = TRUE),
    week_mean = mean(week, na.rm = TRUE),
    week_variance = var(week, na.rm = TRUE),
    week_min = min(week, na.rm = TRUE),
    week_max = max(week, na.rm = TRUE),
    SalesInThousands_mean = mean(SalesInThousands, na.rm = TRUE),
    SalesInThousands_variance = var(SalesInThousands, na.rm = TRUE),
    SalesInThousands_min = min(SalesInThousands, na.rm = TRUE),
    SalesInThousands_max = max(SalesInThousands, na.rm = TRUE)
  )
# View the summary statistics
print(summary_stats)
##   LocationID_mean LocationID_variance LocationID_min LocationID_max
## 1        479.6569            82928.84              1            920
##   AgeOfStore_mean AgeOfStore_variance AgeOfStore_min AgeOfStore_max
## 1         8.50365            44.06763              1             28
##   Promotion_mean Promotion_variance Promotion_min Promotion_max week_mean
## 1       2.029197          0.6572813             1             3       2.5
##   week_variance week_min week_max SalesInThousands_mean
## 1      1.252285        1        4               53.4662
##   SalesInThousands_variance SalesInThousands_min SalesInThousands_max
## 1                  280.7373                17.34                99.65
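The block above spells out each statistic by hand. A more compact equivalent is sketched below using dplyr::across(), which applies a named list of functions to every selected column; with the default naming scheme the result columns come out as <column>_<statistic>, matching the long form (a sketch, assuming the same data object):

# Compact version of the summary above (sketch)
summary_stats <- data %>%
  select(LocationID, AgeOfStore, Promotion, week, SalesInThousands) %>%
  summarise(across(
    everything(),
    list(mean = ~mean(.x, na.rm = TRUE),
         variance = ~var(.x, na.rm = TRUE),
         min = ~min(.x, na.rm = TRUE),
         max = ~max(.x, na.rm = TRUE))
  ))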
# Reload the dataset for the market-size analysis
data <- read.csv("C:/Users/pulud/Downloads/WA_Marketing-Campaign-1.csv")
# Filter the data for Medium and Large markets
filtered_data <- data %>% dplyr::filter(MarketSize %in% c("Medium", "Large"))
# Conduct the t-test
t_test_result <- t.test(SalesInThousands ~ MarketSize, data = filtered_data)
# View the t-test result
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: SalesInThousands by MarketSize
## t = 18.539, df = 217.57, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Large and group Medium is not equal to 0
## 95 percent confidence interval:
## 23.35326 28.90950
## sample estimates:
## mean in group Large mean in group Medium
## 70.11673 43.98534
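Welch's two-sample t-test is used here because it does not assume the two groups share a variance. As a quick check on how different the spreads actually are (a sketch, reusing filtered_data):

# Compare the spread of sales within each market size (sketch)
aggregate(SalesInThousands ~ MarketSize, data = filtered_data, FUN = var)
# A formal F-test of equal variances:
var.test(SalesInThousands ~ MarketSize, data = filtered_data)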
# Load GGally (ggplot2 extensions for pairwise plots; it also loads ggplot2)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from
##   +.gg   ggplot2
library(ggplot2)
# Load and explore the economics dataset
head(economics)
## # A tibble: 6 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
# Calculate correlation between pce and psavert
cor(economics$pce, economics$psavert)
## [1] -0.7928546
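cor() returns only the point estimate. To attach a significance test and confidence interval to the same pair, cor.test() can be used (a sketch):

# Test whether the pce/psavert correlation differs from zero (sketch)
cor.test(economics$pce, economics$psavert)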
# Calculate each part of the correlation manually
xPart <- economics$pce - mean(economics$pce)
yPart <- economics$psavert - mean(economics$psavert)
nMinusOne <- (nrow(economics) - 1)
xSD <- sd(economics$pce)
ySD <- sd(economics$psavert)
# Use the correlation formula
correlation_value <- sum(xPart * yPart) / (nMinusOne * xSD * ySD)
correlation_value
## [1] -0.7928546
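The manual calculation above implements the sample Pearson correlation,

$$ r = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{(n - 1)\, s_x s_y}, $$

where $s_x$ and $s_y$ are the sample standard deviations; as shown, it matches cor() exactly.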
# Correlation matrix of the numeric columns (pce, psavert, uempmed, unemploy), rounded to 2 digits
cormat <- round(cor(economics[, c(2, 4:6)]), 2)
# Reshape the correlation matrix into long format for plotting
library(reshape2)
melted_cormat <- reshape2::melt(cormat)
# Plot the correlation matrix as a heatmap
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile()
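One optional refinement (a sketch) is to print the rounded correlation inside each tile so the heatmap can be read without the legend:

# Annotate each tile with its correlation value (sketch)
ggplot(melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(label = value), size = 3)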
# Dealing with missing data
m <- c(9, 9, NA, 3, NA, 5, 8, 1, 10, 4)
n <- c(2, NA, 1, 6, 6, 4, 1, 1, 6, 7)
p <- c(8, 4, 3, 9, 10, NA, 3, NA, 9, 9)
q <- c(10, 10, 7, 8, 4, 2, 8, 5, 5, 2)
r <- c(1, 9, 7, 6, 5, 6, 2, 7, 9, 10)
# Combine them together
theMat <- cbind(m, n, p, q, r)
# Compute correlations with different methods of handling NA values
cor(theMat, use="everything")
## m n p q r
## m 1 NA NA NA NA
## n NA 1 NA NA NA
## p NA NA 1 NA NA
## q NA NA NA 1.0000000 -0.4242958
## r NA NA NA -0.4242958 1.0000000
cor(theMat, use="complete.obs")
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
cor(theMat, use="na.or.complete")
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
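A fourth option, use = "pairwise.complete.obs", computes each correlation from all rows that are complete for that particular pair of columns, so every cell uses as much of the data as possible (and different cells may be based on different subsets):

cor(theMat, use = "pairwise.complete.obs")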
# Load and explore the 'tips' dataset
data(tips, package="reshape2")
head(tips)
## total_bill tip sex smoker day time size
## 1 16.99 1.01 Female No Sun Dinner 2
## 2 10.34 1.66 Male No Sun Dinner 3
## 3 21.01 3.50 Male No Sun Dinner 3
## 4 23.68 3.31 Male No Sun Dinner 2
## 5 24.59 3.61 Female No Sun Dinner 4
## 6 25.29 4.71 Male No Sun Dinner 4
# T-test to check if mean tip differs from 2.50
t_test_tip <- t.test(tips$tip, alternative="two.sided", mu=2.50)
print(t_test_tip)
##
## One Sample t-test
##
## data: tips$tip
## t = 5.6253, df = 243, p-value = 5.08e-08
## alternative hypothesis: true mean is not equal to 2.5
## 95 percent confidence interval:
## 2.823799 3.172758
## sample estimates:
## mean of x
## 2.998279
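As a cross-check (a sketch), the t statistic can be reproduced by hand from its definition, t = (x̄ − μ₀) / (s / √n):

# Recompute the one-sample t statistic manually (sketch)
tipMean <- mean(tips$tip)
tipSE <- sd(tips$tip) / sqrt(length(tips$tip))
(tipMean - 2.50) / tipSE # should match t = 5.6253 above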
# One-way ANOVA for tip based on day
tipAnova <- aov(tip ~ day, tips)
summary(tipAnova)
## Df Sum Sq Mean Sq F value Pr(>F)
## day 3 9.5 3.175 1.672 0.174
## Residuals 240 455.7 1.899
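With p = 0.174 there is no evidence that mean tips differ by day. Had the ANOVA been significant, pairwise follow-up comparisons would be the natural next step (a sketch):

# Tukey's honest significant difference on the fitted aov object (sketch)
TukeyHSD(tipAnova)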
# Variance of tips by sex
aggregate(tip ~ sex, data=tips, var)
## sex tip
## 1 Female 1.344428
## 2 Male 2.217424
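The variance for men is noticeably larger than for women. A formal test of equal variances (a sketch) is Bartlett's test, though it is itself sensitive to non-normality, which motivates the normality checks below:

# Test homogeneity of tip variance across sexes (sketch)
bartlett.test(tip ~ sex, data = tips)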
# Shapiro-Wilk normality test
shapiro.test(tips$tip)
##
## Shapiro-Wilk normality test
##
## data: tips$tip
## W = 0.89781, p-value = 8.2e-12
shapiro.test(tips$tip[tips$sex == "Female"])
##
## Shapiro-Wilk normality test
##
## data: tips$tip[tips$sex == "Female"]
## W = 0.95678, p-value = 0.005448
shapiro.test(tips$tip[tips$sex == "Male"])
##
## Shapiro-Wilk normality test
##
## data: tips$tip[tips$sex == "Male"]
## W = 0.87587, p-value = 3.708e-10
# Visual inspection of distribution
ggplot(tips, aes(x = tip, fill = sex)) +
  geom_histogram(binwidth = 0.5, alpha = 1/2)
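Both the Shapiro-Wilk tests and the skewed histogram point away from normality, so a nonparametric comparison of tips by sex is a reasonable complement to the t-test machinery (a sketch):

# Wilcoxon rank-sum test: compares the two groups without assuming normality (sketch)
wilcox.test(tip ~ sex, data = tips)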
# ANOVA for tea types
green_tea <- c(0, 2, 3, 5, 8, 10, 12)
no_tea <- c(1, 2, 3, 9, 10, 10, 11)
peppermint_tea <- c(1, 4, 2, 5, 8, 9, 12)
tea_weight <- c(green_tea, no_tea, peppermint_tea)
tea_names <- c(rep("green_tea", 7), rep("no_tea", 7), rep("peppermint_tea", 7))
weight_df <- data.frame(tea_names, tea_weight)
weightAnova <- aov(tea_weight ~ tea_names, weight_df)
summary(weightAnova)
## Df Sum Sq Mean Sq F value Pr(>F)
## tea_names 2 3 1.476 0.082 0.922
## Residuals 18 326 18.111
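For context (a sketch), the group means show why the F statistic is so small: the between-group differences are tiny relative to the within-group spread (residual mean square of about 18.1):

# Mean weight per tea group (sketch)
aggregate(tea_weight ~ tea_names, data = weight_df, FUN = mean)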