I am going to try to complete the homework and project by giving you some examples in coding these non-parametric tests.
To keep this assignment simple, we are going to use the built in dataset customer profit that is included with ggplot2.
# Preview the first six rows of the data set.
head(data, n = 6L)
## Ship_Mode Profit Unit_Price Shipping_Cost Customer_Name
## 1 Regular Air 1877 38.94 35.00 Muhammed MacIntyre
## 2 Delivery Truck 1878 208.16 68.02 Barry French
## 3 Regular Air 1879 8.69 2.99 Barry French
## 4 Regular Air 1880 195.99 3.99 Clay Rozendal
## 5 Regular Air 1881 5.28 2.99 Claudia Miner
## 6 Regular Air 1881 39.89 3.04 Neola Schneider
# Count how many rows each customer contributes.
table(data[["Customer_Name"]])
##
## Allen Rosenblatt Barry French Carl Ludwig Carlos Soltero
## 12 24 24 24
## Claudia Miner Clay Rozendal Don Miller Edward Hooks
## 12 12 12 12
## Eugene Barchas Jack Garza Jim Radford Julia West
## 36 12 24 12
## Muhammed MacIntyre Neola Schneider Sylvia Foulston
## 12 12 24
I am going to look at the data by customer name. Let’s ask whether the median profit is equal to the median shipping cost. First I’ll trim down the data frame to just the rows of interest and then compare the two columns.
# Keep only the rows whose Shipping_Cost is "D" or "J", then summarise them.
# NOTE(review): Shipping_Cost prints as numeric values, so this filter matches
# no rows and `df` comes out empty -- confirm which column and levels were
# intended.
df <- data[data$Shipping_Cost %in% c("D", "J"), ]
summary(df)
## Ship_Mode Profit Unit_Price Shipping_Cost
## Length:0 Min. : NA Min. : NA Min. : NA
## Class :character 1st Qu.: NA 1st Qu.: NA 1st Qu.: NA
## Mode :character Median : NA Median : NA Median : NA
## Mean :NaN Mean :NaN Mean :NaN
## 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
## Max. : NA Max. : NA Max. : NA
## Customer_Name
## Length:0
## Class :character
## Mode :character
##
##
##
# Paired Wilcoxon signed-rank test of Profit against Shipping_Cost.
# The vector (x, y) interface has no `data` argument, so the former
# `data = df` was silently ignored; it is dropped here to make it explicit
# that the test runs on the full `data` columns passed in.
wilcox.test(data$Profit, data$Shipping_Cost, paired = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: data$Profit and data$Shipping_Cost
## V = 34980, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Median Profit within each distinct Shipping_Cost value.
by(data = data$Profit, INDICES = data$Shipping_Cost, FUN = median)
## data$Shipping_Cost: 0.5
## [1] 1884
## ------------------------------------------------------------
## data$Shipping_Cost: 0.7
## [1] 1885.5
## ------------------------------------------------------------
## data$Shipping_Cost: 1.39
## [1] 1882
## ------------------------------------------------------------
## data$Shipping_Cost: 1.99
## [1] 1887
## ------------------------------------------------------------
## data$Shipping_Cost: 2.99
## [1] 1880
## ------------------------------------------------------------
## data$Shipping_Cost: 3.04
## [1] 1881
## ------------------------------------------------------------
## data$Shipping_Cost: 3.99
## [1] 1880
## ------------------------------------------------------------
## data$Shipping_Cost: 4.93
## [1] 1886
## ------------------------------------------------------------
## data$Shipping_Cost: 5.26
## [1] 1883
## ------------------------------------------------------------
## data$Shipping_Cost: 8.99
## [1] 1884
## ------------------------------------------------------------
## data$Shipping_Cost: 13.18
## [1] 1885
## ------------------------------------------------------------
## data$Shipping_Cost: 26.22
## [1] 1882
## ------------------------------------------------------------
## data$Shipping_Cost: 35
## [1] 1877
## ------------------------------------------------------------
## data$Shipping_Cost: 68.02
## [1] 1878
## ------------------------------------------------------------
## data$Shipping_Cost: 69
## [1] 1883
## ------------------------------------------------------------
## data$Shipping_Cost: 74.35
## [1] 1933
It is clear here that these are very different! Might be more interesting to compare a price that is closer like F and G
# Try to keep only the rows whose Profit is "F" or "G".
# NOTE(review): Profit prints as numeric values, so this filter presumably
# matches no rows -- confirm which column and levels were intended.
df2 <- data[data$Profit %in% c("F", "G"), ]

# Paired Wilcoxon signed-rank test of Unit_Price against Profit.
# The vector (x, y) interface has no `data` argument, so the former
# `data = df2` was silently ignored; it is dropped here to make it explicit
# that the test runs on the full `data` columns passed in.
wilcox.test(data$Unit_Price, data$Profit, paired = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: data$Unit_Price and data$Profit
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
Here I am able to reject the null hypothesis.
# Show the first six rows again as a reminder of the available columns.
head(data, n = 6L)
## Ship_Mode Profit Unit_Price Shipping_Cost Customer_Name
## 1 Regular Air 1877 38.94 35.00 Muhammed MacIntyre
## 2 Delivery Truck 1878 208.16 68.02 Barry French
## 3 Regular Air 1879 8.69 2.99 Barry French
## 4 Regular Air 1880 195.99 3.99 Clay Rozendal
## 5 Regular Air 1881 5.28 2.99 Claudia Miner
## 6 Regular Air 1881 39.89 3.04 Neola Schneider
I am going to look at the difference between the profit and the shipping cost and test whether that paired difference is centered at zero.
# Try to keep only the rows whose Shipping_Cost is "setosa" or "versicolor".
# NOTE(review): Shipping_Cost prints as numeric values, so this filter
# presumably matches no rows -- confirm which column and levels were intended.
df3 <- data[data$Shipping_Cost %in% c("setosa", "versicolor"), ]

# Per-row difference between Profit and Shipping_Cost.
df3["Customer.Difference"] <- df3$Profit - df3$Shipping_Cost
With that all cleaned up we run the test.
# Paired Wilcoxon signed-rank test of Profit against Shipping_Cost.
# The vector (x, y) interface has no `data` argument, so the former
# `data = df3` was silently ignored; it is dropped here to make it explicit
# that the test runs on the full `data` columns passed in.
wilcox.test(data$Profit, data$Shipping_Cost, paired = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: data$Profit and data$Shipping_Cost
## V = 34980, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
So we are able to reject the null hypothesis.
# Median Profit within each distinct Shipping_Cost value.
by(data = data$Profit, INDICES = data$Shipping_Cost, FUN = median)
## data$Shipping_Cost: 0.5
## [1] 1884
## ------------------------------------------------------------
## data$Shipping_Cost: 0.7
## [1] 1885.5
## ------------------------------------------------------------
## data$Shipping_Cost: 1.39
## [1] 1882
## ------------------------------------------------------------
## data$Shipping_Cost: 1.99
## [1] 1887
## ------------------------------------------------------------
## data$Shipping_Cost: 2.99
## [1] 1880
## ------------------------------------------------------------
## data$Shipping_Cost: 3.04
## [1] 1881
## ------------------------------------------------------------
## data$Shipping_Cost: 3.99
## [1] 1880
## ------------------------------------------------------------
## data$Shipping_Cost: 4.93
## [1] 1886
## ------------------------------------------------------------
## data$Shipping_Cost: 5.26
## [1] 1883
## ------------------------------------------------------------
## data$Shipping_Cost: 8.99
## [1] 1884
## ------------------------------------------------------------
## data$Shipping_Cost: 13.18
## [1] 1885
## ------------------------------------------------------------
## data$Shipping_Cost: 26.22
## [1] 1882
## ------------------------------------------------------------
## data$Shipping_Cost: 35
## [1] 1877
## ------------------------------------------------------------
## data$Shipping_Cost: 68.02
## [1] 1878
## ------------------------------------------------------------
## data$Shipping_Cost: 69
## [1] 1883
## ------------------------------------------------------------
## data$Shipping_Cost: 74.35
## [1] 1933
Visualize the data by boxplot
# Box plots of Profit by shipping mode and of Shipping_Cost by customer.
# The formula + `data` form labels the axes with the plain column names.
boxplot(Profit ~ Ship_Mode, data = data)
boxplot(Shipping_Cost ~ Customer_Name, data = data)

# Replace missing Profit / Unit_Price values with 0 so that every row can be
# compared below.
data$Profit[is.na(data$Profit)] <- 0
data$Unit_Price[is.na(data$Unit_Price)] <- 0

# Label each row by how its Profit compares with its Unit_Price. After the NA
# replacement above every row falls into exactly one of the three categories,
# so a single vectorized classification covers all rows.
data$PROFIT <- ifelse(
  data$Profit < data$Unit_Price, "Less",
  ifelse(data$Profit > data$Unit_Price, "More", "Equal")
)
Looking at the median Profit in each PROFIT group to see whether it stays the same across groups.
# Median Profit within each PROFIT category.
by(data = data$Profit, INDICES = data$PROFIT, FUN = median)
## data$PROFIT: More
## [1] 1931
# Kruskal-Wallis rank sum test of Profit across the distinct Shipping_Cost
# values. Both terms already name their columns in full, so no `data`
# argument is needed to resolve them.
kruskal.test(data$Profit ~ data$Shipping_Cost)
##
## Kruskal-Wallis rank sum test
##
## data: data$Profit by data$Shipping_Cost
## Kruskal-Wallis chi-squared = 47.591, df = 15, p-value = 2.959e-05
Here I am able to reject the null hypothesis.
# Spearman rank correlation test between Profit and Unit_Price.
# The echoed output shows R warning that an exact p-value cannot be computed
# because the data contain ties.
cor.test(data$Profit, data$Unit_Price, method = "spearman")
## Warning in cor.test.default(data$Profit, data$Unit_Price, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: data$Profit and data$Unit_Price
## S = 3054926, p-value = 0.951
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.003800163
With this p-value (0.951) we fail to reject the null hypothesis: there is no evidence of a monotonic association between Profit and Unit_Price.
# Scatter plot of Shipping_Cost against Profit, with the least-squares line
# of Shipping_Cost on Profit overlaid in blue.
plot(x = data$Profit, y = data$Shipping_Cost)
abline(reg = lm(formula = Shipping_Cost ~ Profit, data = data), col = "Blue")
We see that this relationship is not strong; the blue line is the least-squares fit of Shipping_Cost on Profit and shows the overall direction of the trend.