# Load necessary libraries (if not already loaded)
# install.packages("dplyr") # Uncomment if dplyr is not installed
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the dataset from the Downloads folder
data <- read.csv("C:/Users/pulud/Downloads/WA_Marketing-Campaign.csv")

# Summary statistics for the numeric variables
summary_stats <- data %>%
  select(LocationID, AgeOfStore, Promotion, week, SalesInThousands) %>%
  summarise(
    LocationID_mean = mean(LocationID, na.rm = TRUE),
    LocationID_variance = var(LocationID, na.rm = TRUE),
    LocationID_min = min(LocationID, na.rm = TRUE),
    LocationID_max = max(LocationID, na.rm = TRUE),
    
    AgeOfStore_mean = mean(AgeOfStore, na.rm = TRUE),
    AgeOfStore_variance = var(AgeOfStore, na.rm = TRUE),
    AgeOfStore_min = min(AgeOfStore, na.rm = TRUE),
    AgeOfStore_max = max(AgeOfStore, na.rm = TRUE),
    
    Promotion_mean = mean(Promotion, na.rm = TRUE),
    Promotion_variance = var(Promotion, na.rm = TRUE),
    Promotion_min = min(Promotion, na.rm = TRUE),
    Promotion_max = max(Promotion, na.rm = TRUE),
    
    week_mean = mean(week, na.rm = TRUE),
    week_variance = var(week, na.rm = TRUE),
    week_min = min(week, na.rm = TRUE),
    week_max = max(week, na.rm = TRUE),
    
    SalesInThousands_mean = mean(SalesInThousands, na.rm = TRUE),
    SalesInThousands_variance = var(SalesInThousands, na.rm = TRUE),
    SalesInThousands_min = min(SalesInThousands, na.rm = TRUE),
    SalesInThousands_max = max(SalesInThousands, na.rm = TRUE)
  )

# View the summary statistics
print(summary_stats)
##   LocationID_mean LocationID_variance LocationID_min LocationID_max
## 1        479.6569            82928.84              1            920
##   AgeOfStore_mean AgeOfStore_variance AgeOfStore_min AgeOfStore_max
## 1         8.50365            44.06763              1             28
##   Promotion_mean Promotion_variance Promotion_min Promotion_max week_mean
## 1       2.029197          0.6572813             1             3       2.5
##   week_variance week_min week_max SalesInThousands_mean
## 1      1.252285        1        4               53.4662
##   SalesInThousands_variance SalesInThousands_min SalesInThousands_max
## 1                  280.7373                17.34                99.65
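
The block above spells out every statistic for every column by hand. An equivalent, more compact form uses dplyr's across(); this is a sketch of the same computation (the object name summary_stats2 is mine), with .names controlling the output column names:

# Same statistics via across(): one lambda per statistic instead of one call per column
summary_stats2 <- data %>%
  summarise(across(c(LocationID, AgeOfStore, Promotion, week, SalesInThousands),
                   list(mean = ~mean(.x, na.rm = TRUE),
                        variance = ~var(.x, na.rm = TRUE),
                        min = ~min(.x, na.rm = TRUE),
                        max = ~max(.x, na.rm = TRUE)),
                   .names = "{.col}_{.fn}"))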

# Set CRAN mirror (only needed if packages are installed while knitting)
options(repos = c(CRAN = "https://cloud.r-project.org/"))

# Reload the dataset for the market-size comparison
data <- read.csv("C:/Users/pulud/Downloads/WA_Marketing-Campaign-1.csv")

# Filter the data for Medium and Large markets
filtered_data <- data %>% dplyr::filter(MarketSize %in% c("Medium", "Large"))

# Conduct the t-test
t_test_result <- t.test(SalesInThousands ~ MarketSize, data = filtered_data)

# View the t-test result
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  SalesInThousands by MarketSize
## t = 18.539, df = 217.57, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Large and group Medium is not equal to 0
## 95 percent confidence interval:
##  23.35326 28.90950
## sample estimates:
##  mean in group Large mean in group Medium 
##             70.11673             43.98534
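
The test shows a highly significant difference; a standardized effect size adds the magnitude. A minimal sketch of Cohen's d with a pooled standard deviation, reusing filtered_data from above (the helper names large, medium, and pooled_sd are mine, not part of the original analysis):

# Cohen's d for Large vs. Medium markets (pooled-SD version; sketch)
large  <- filtered_data$SalesInThousands[filtered_data$MarketSize == "Large"]
medium <- filtered_data$SalesInThousands[filtered_data$MarketSize == "Medium"]
pooled_sd <- sqrt(((length(large) - 1) * var(large) +
                   (length(medium) - 1) * var(medium)) /
                  (length(large) + length(medium) - 2))
(mean(large) - mean(medium)) / pooled_sd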

library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)

# Load and explore the economics dataset
head(economics)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
# Calculate correlation between pce and psavert
cor(economics$pce, economics$psavert)
## [1] -0.7928546
# Calculate each part of the correlation manually
xPart <- economics$pce - mean(economics$pce)
yPart <- economics$psavert - mean(economics$psavert)
nMinusOne <- (nrow(economics) - 1)
xSD <- sd(economics$pce)
ySD <- sd(economics$psavert)

# Apply the correlation formula: r = sum(xPart * yPart) / ((n - 1) * xSD * ySD)
correlation_value <- sum(xPart * yPart) / (nMinusOne * xSD * ySD)
correlation_value
## [1] -0.7928546
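
The same value also follows from the identity r = cov(x, y) / (sd(x) * sd(y)), which gives a quick cross-check:

# Cross-check via the covariance identity; should again print -0.7928546
cov(economics$pce, economics$psavert) / (xSD * ySD)
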
# Correlation matrix for the numeric columns (pce, psavert, uempmed, unemploy)
cormat <- round(cor(economics[, c("pce", "psavert", "uempmed", "unemploy")]), 2)

# Reshape the correlation matrix into long format for plotting
library(reshape2)
melted_cormat <- reshape2::melt(cormat)

# Plot the correlation matrix as a heatmap
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile()
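
Since correlations are bounded in [-1, 1], a diverging fill centred at zero can make the sign easier to read; one option (my suggestion, not in the original) is ggplot2's scale_fill_gradient2():

# Diverging palette centred at 0; limits fix the scale to the full [-1, 1] range
ggplot(melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       midpoint = 0, limits = c(-1, 1))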

# Dealing with missing data
m <- c(9, 9, NA, 3, NA, 5, 8, 1, 10, 4)
n <- c(2, NA, 1, 6, 6, 4, 1, 1, 6, 7)
p <- c(8, 4, 3, 9, 10, NA, 3, NA, 9, 9)
q <- c(10, 10, 7, 8, 4, 2, 8, 5, 5, 2)
r <- c(1, 9, 7, 6, 5, 6, 2, 7, 9, 10)

# Combine them together
theMat <- cbind(m, n, p, q, r)

# Compute correlations with different methods of handling NA values
cor(theMat, use="everything")
##    m  n  p          q          r
## m  1 NA NA         NA         NA
## n NA  1 NA         NA         NA
## p NA NA  1         NA         NA
## q NA NA NA  1.0000000 -0.4242958
## r NA NA NA -0.4242958  1.0000000
cor(theMat, use="complete.obs")
##            m          n          p          q          r
## m  1.0000000 -0.5228840 -0.2893527  0.2974398 -0.3459470
## n -0.5228840  1.0000000  0.8090195 -0.7448453  0.9350718
## p -0.2893527  0.8090195  1.0000000 -0.3613720  0.6221470
## q  0.2974398 -0.7448453 -0.3613720  1.0000000 -0.9059384
## r -0.3459470  0.9350718  0.6221470 -0.9059384  1.0000000
cor(theMat, use="na.or.complete")
##            m          n          p          q          r
## m  1.0000000 -0.5228840 -0.2893527  0.2974398 -0.3459470
## n -0.5228840  1.0000000  0.8090195 -0.7448453  0.9350718
## p -0.2893527  0.8090195  1.0000000 -0.3613720  0.6221470
## q  0.2974398 -0.7448453 -0.3613720  1.0000000 -0.9059384
## r -0.3459470  0.9350718  0.6221470 -0.9059384  1.0000000
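
A third option, use = "pairwise.complete.obs", computes each entry from all rows where that particular pair of variables is observed, so it discards less data than complete.obs, at the cost of a matrix that may not be positive semi-definite:

# Each pair uses every row where both of its variables are non-missing
cor(theMat, use = "pairwise.complete.obs")
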
# Load and explore the 'tips' dataset
data(tips, package="reshape2")
head(tips)
##   total_bill  tip    sex smoker day   time size
## 1      16.99 1.01 Female     No Sun Dinner    2
## 2      10.34 1.66   Male     No Sun Dinner    3
## 3      21.01 3.50   Male     No Sun Dinner    3
## 4      23.68 3.31   Male     No Sun Dinner    2
## 5      24.59 3.61 Female     No Sun Dinner    4
## 6      25.29 4.71   Male     No Sun Dinner    4
# T-test to check if mean tip differs from 2.50
t_test_tip <- t.test(tips$tip, alternative="two.sided", mu=2.50)
print(t_test_tip)
## 
##  One Sample t-test
## 
## data:  tips$tip
## t = 5.6253, df = 243, p-value = 5.08e-08
## alternative hypothesis: true mean is not equal to 2.5
## 95 percent confidence interval:
##  2.823799 3.172758
## sample estimates:
## mean of x 
##  2.998279
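
The reported statistic follows directly from the one-sample formula t = (xbar - mu) / (s / sqrt(n)); a quick hand check (n_tips is my name, chosen to avoid clobbering the earlier vector n):

# Recompute the one-sample t statistic from its formula
n_tips <- length(tips$tip)
(mean(tips$tip) - 2.50) / (sd(tips$tip) / sqrt(n_tips))  # matches t = 5.6253
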
# One-way ANOVA for tip based on day
tipAnova <- aov(tip ~ day, tips)
summary(tipAnova)
##              Df Sum Sq Mean Sq F value Pr(>F)
## day           3    9.5   3.175   1.672  0.174
## Residuals   240  455.7   1.899
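
The F test is not significant, so post-hoc comparisons are not strictly needed here, but if they were wanted, base R's TukeyHSD() gives the pairwise day-to-day differences from the fitted aov object:

# Pairwise differences between days with Tukey's honest significant difference
TukeyHSD(tipAnova)
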
# Variance of tips by sex
aggregate(tip ~ sex, data=tips, var)
##      sex      tip
## 1 Female 1.344428
## 2   Male 2.217424
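
The sample variances differ noticeably by sex. An F test would compare them formally, though it assumes normality, which the Shapiro-Wilk tests below call into question, so treat this as a sketch:

# F test for equality of the two variances (normality-dependent)
var.test(tip ~ sex, data = tips)
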
# Shapiro-Wilk normality test
shapiro.test(tips$tip)
## 
##  Shapiro-Wilk normality test
## 
## data:  tips$tip
## W = 0.89781, p-value = 8.2e-12
shapiro.test(tips$tip[tips$sex == "Female"])
## 
##  Shapiro-Wilk normality test
## 
## data:  tips$tip[tips$sex == "Female"]
## W = 0.95678, p-value = 0.005448
shapiro.test(tips$tip[tips$sex == "Male"])
## 
##  Shapiro-Wilk normality test
## 
## data:  tips$tip[tips$sex == "Male"]
## W = 0.87587, p-value = 3.708e-10
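
All three Shapiro-Wilk tests reject normality, so a rank-based comparison of tips by sex avoids that assumption; a minimal sketch using the Wilcoxon rank-sum (Mann-Whitney) test:

# Nonparametric comparison of tip distributions by sex
wilcox.test(tip ~ sex, data = tips)
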
# Visual inspection of distribution
ggplot(tips, aes(x=tip, fill=sex)) +
  geom_histogram(binwidth=.5, alpha=1/2)

# ANOVA for tea types
green_tea <- c(0, 2, 3, 5, 8, 10, 12)
no_tea <- c(1, 2, 3, 9, 10, 10, 11)
peppermint_tea <- c(1, 4, 2, 5, 8, 9, 12)

tea_weight <- c(green_tea, no_tea, peppermint_tea)
tea_names <- c(rep("green_tea", 7), rep("no_tea", 7), rep("peppermint_tea",7))
weight_df <- data.frame(tea_names, tea_weight)

weightAnova <- aov(tea_weight ~ tea_names, weight_df)
summary(weightAnova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## tea_names    2      3   1.476   0.082  0.922
## Residuals   18    326  18.111
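
With only seven observations per group, the normality assumption behind the F test is hard to verify, so a rank-based check of the same null hypothesis is cheap insurance:

# Kruskal-Wallis test: same question, no normality assumption
kruskal.test(tea_weight ~ tea_names, data = weight_df)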