I am going to try to complete the homework and project by giving you some examples in coding these non-parametric tests.

To keep this assignment simple, we are going to use the built in dataset customer profit that is included with ggplot2.

# Preview the first six rows of the data frame.
head(data, n = 6L)
##        Ship_Mode Profit Unit_Price Shipping_Cost      Customer_Name
## 1    Regular Air   1877      38.94         35.00 Muhammed MacIntyre
## 2 Delivery Truck   1878     208.16         68.02       Barry French
## 3    Regular Air   1879       8.69          2.99       Barry French
## 4    Regular Air   1880     195.99          3.99      Clay Rozendal
## 5    Regular Air   1881       5.28          2.99      Claudia Miner
## 6    Regular Air   1881      39.89          3.04    Neola Schneider

Wilcoxon Rank-Sum Test

# Count how many rows are recorded for each customer name.
table(data[["Customer_Name"]])
## 
##   Allen Rosenblatt       Barry French        Carl Ludwig     Carlos Soltero 
##                 12                 24                 24                 24 
##      Claudia Miner      Clay Rozendal         Don Miller       Edward Hooks 
##                 12                 12                 12                 12 
##     Eugene Barchas         Jack Garza        Jim Radford         Julia West 
##                 36                 12                 24                 12 
## Muhammed MacIntyre    Neola Schneider    Sylvia Foulston 
##                 12                 12                 24

I am going to look at customer name. Let’s ask whether the median profit for a customer is equal to the unit price. First I’ll trim down the data frame to contain just those customers’ rows and then compare the values.

# Keep only the rows whose Shipping_Cost is one of the listed values, then
# summarise the result.
# NOTE(review): the summary printed below reports Length:0, i.e. no row
# matches "D" or "J" -- confirm the intended column and values.
df <- data[data$Shipping_Cost %in% c("D", "J"), ]
summary(df)
##   Ship_Mode             Profit      Unit_Price  Shipping_Cost
##  Length:0           Min.   : NA   Min.   : NA   Min.   : NA  
##  Class :character   1st Qu.: NA   1st Qu.: NA   1st Qu.: NA  
##  Mode  :character   Median : NA   Median : NA   Median : NA  
##                     Mean   :NaN   Mean   :NaN   Mean   :NaN  
##                     3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA  
##                     Max.   : NA   Max.   : NA   Max.   : NA  
##  Customer_Name     
##  Length:0          
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Paired Wilcoxon test of Profit against Shipping_Cost over the full data set.
# The former `data = df` argument was removed: x and y are passed as explicit
# vectors, so the default method never used it and the test always ran on
# `data`, not on the subset.
wilcox.test(data$Profit, data$Shipping_Cost, paired = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  data$Profit and data$Shipping_Cost
## V = 34980, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Median Profit within each distinct Shipping_Cost value.
by(data$Profit, INDICES = data$Shipping_Cost, FUN = median)
## data$Shipping_Cost: 0.5
## [1] 1884
## ------------------------------------------------------------ 
## data$Shipping_Cost: 0.7
## [1] 1885.5
## ------------------------------------------------------------ 
## data$Shipping_Cost: 1.39
## [1] 1882
## ------------------------------------------------------------ 
## data$Shipping_Cost: 1.99
## [1] 1887
## ------------------------------------------------------------ 
## data$Shipping_Cost: 2.99
## [1] 1880
## ------------------------------------------------------------ 
## data$Shipping_Cost: 3.04
## [1] 1881
## ------------------------------------------------------------ 
## data$Shipping_Cost: 3.99
## [1] 1880
## ------------------------------------------------------------ 
## data$Shipping_Cost: 4.93
## [1] 1886
## ------------------------------------------------------------ 
## data$Shipping_Cost: 5.26
## [1] 1883
## ------------------------------------------------------------ 
## data$Shipping_Cost: 8.99
## [1] 1884
## ------------------------------------------------------------ 
## data$Shipping_Cost: 13.18
## [1] 1885
## ------------------------------------------------------------ 
## data$Shipping_Cost: 26.22
## [1] 1882
## ------------------------------------------------------------ 
## data$Shipping_Cost: 35
## [1] 1877
## ------------------------------------------------------------ 
## data$Shipping_Cost: 68.02
## [1] 1878
## ------------------------------------------------------------ 
## data$Shipping_Cost: 69
## [1] 1883
## ------------------------------------------------------------ 
## data$Shipping_Cost: 74.35
## [1] 1933

It is clear here that these are very different! It might be more interesting to compare groups that are closer, like F and G.

# Subset of rows whose Profit is one of the listed values.
# NOTE(review): Profit is numeric in the preview above, so matching "F"/"G"
# presumably selects no rows -- confirm the intended column and values.
df2 <- data[which(data$Profit %in% c("F", "G")), ]

# Paired Wilcoxon test of Unit_Price against Profit over the full data set.
# The former `data = df2` argument was removed: x and y are passed as explicit
# vectors, so the default method never used it.
wilcox.test(data$Unit_Price, data$Profit, paired = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  data$Unit_Price and data$Profit
## V = 0, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Here I am able to reject the null hypothesis.

Wilcoxon Signed-Rank Test

# Preview the first six rows of the data frame again.
head(data, n = 6L)
##        Ship_Mode Profit Unit_Price Shipping_Cost      Customer_Name
## 1    Regular Air   1877      38.94         35.00 Muhammed MacIntyre
## 2 Delivery Truck   1878     208.16         68.02       Barry French
## 3    Regular Air   1879       8.69          2.99       Barry French
## 4    Regular Air   1880     195.99          3.99      Clay Rozendal
## 5    Regular Air   1881       5.28          2.99      Claudia Miner
## 6    Regular Air   1881      39.89          3.04    Neola Schneider

I am going to look at the difference between Profit and Shipping_Cost and see if the groups make a difference.

# Keep the rows whose Shipping_Cost is one of the listed values.
# NOTE(review): Shipping_Cost is numeric in the preview above, so matching
# "setosa"/"versicolor" presumably selects no rows -- confirm.
df3 <- data[data$Shipping_Cost %in% c("setosa", "versicolor"), ]

# Per-row difference between Profit and Shipping_Cost.
df3["Customer.Difference"] <- with(df3, Profit - Shipping_Cost)

With that all cleaned up we run the test.

# Paired Wilcoxon test of Profit against Shipping_Cost over the full data set.
# The former `data = df3` argument was removed: x and y are passed as explicit
# vectors, so the default method never used it and the test always ran on
# `data`, not on the subset.
wilcox.test(data$Profit, data$Shipping_Cost, paired = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  data$Profit and data$Shipping_Cost
## V = 34980, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

So we are able to reject the null hypothesis.

# Median Profit within each distinct Shipping_Cost value.
by(data$Profit, INDICES = data$Shipping_Cost, FUN = median)
## data$Shipping_Cost: 0.5
## [1] 1884
## ------------------------------------------------------------ 
## data$Shipping_Cost: 0.7
## [1] 1885.5
## ------------------------------------------------------------ 
## data$Shipping_Cost: 1.39
## [1] 1882
## ------------------------------------------------------------ 
## data$Shipping_Cost: 1.99
## [1] 1887
## ------------------------------------------------------------ 
## data$Shipping_Cost: 2.99
## [1] 1880
## ------------------------------------------------------------ 
## data$Shipping_Cost: 3.04
## [1] 1881
## ------------------------------------------------------------ 
## data$Shipping_Cost: 3.99
## [1] 1880
## ------------------------------------------------------------ 
## data$Shipping_Cost: 4.93
## [1] 1886
## ------------------------------------------------------------ 
## data$Shipping_Cost: 5.26
## [1] 1883
## ------------------------------------------------------------ 
## data$Shipping_Cost: 8.99
## [1] 1884
## ------------------------------------------------------------ 
## data$Shipping_Cost: 13.18
## [1] 1885
## ------------------------------------------------------------ 
## data$Shipping_Cost: 26.22
## [1] 1882
## ------------------------------------------------------------ 
## data$Shipping_Cost: 35
## [1] 1877
## ------------------------------------------------------------ 
## data$Shipping_Cost: 68.02
## [1] 1878
## ------------------------------------------------------------ 
## data$Shipping_Cost: 69
## [1] 1883
## ------------------------------------------------------------ 
## data$Shipping_Cost: 74.35
## [1] 1933

Visualize the data by boxplot

# Box plot of Profit for each Ship_Mode. Columns are named in the formula and
# looked up through `data =`, which also gives plain axis labels instead of
# "data$..." ones.
boxplot(Profit ~ Ship_Mode, data = data)

Kruskal-Wallis

# Box plot of Shipping_Cost for each Customer_Name. Columns are named in the
# formula and looked up through `data =`, which also gives plain axis labels
# instead of "data$..." ones.
boxplot(Shipping_Cost ~ Customer_Name, data = data)

# Treat missing Profit and Unit_Price values as zero before comparing them.
data$Profit <- replace(data$Profit, is.na(data$Profit), 0)
data$Unit_Price <- replace(data$Unit_Price, is.na(data$Unit_Price), 0)

# Label every row by how its Profit compares with its Unit_Price.
data[which(data$Profit < data$Unit_Price), "PROFIT"] <- "Less"
data[which(data$Profit > data$Unit_Price), "PROFIT"] <- "More"
data[which(data$Profit == data$Unit_Price), "PROFIT"] <- "Equal"

Looking at the median and seeing if the Profit stays the same

# Median Profit within each PROFIT label.
by(data$Profit, INDICES = data$PROFIT, FUN = median)
## data$PROFIT: More
## [1] 1931
# Kruskal-Wallis test of Profit across the Shipping_Cost groups. Columns are
# named in the formula and looked up through `data =` rather than repeating
# `data$`; only the "data:" label in the printed result changes.
kruskal.test(Profit ~ Shipping_Cost, data = data)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Profit by Shipping_Cost
## Kruskal-Wallis chi-squared = 47.591, df = 15, p-value = 2.959e-05

Here I am able to reject the null hypothesis.

Spearman

# Spearman rank correlation test between Profit and Unit_Price.
cor.test(data$Profit, data$Unit_Price, method = "spearman")
## Warning in cor.test.default(data$Profit, data$Unit_Price, method = "spearman"):
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  data$Profit and data$Unit_Price
## S = 3054926, p-value = 0.951
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## 0.003800163

With this p-value (0.951) we fail to reject the null hypothesis that rho is equal to 0.

# Scatter plot of Shipping_Cost against Profit, overlaid with the
# least-squares line of Shipping_Cost regressed on Profit.
plot(x = data$Profit, y = data$Shipping_Cost)
abline(reg = lm(Shipping_Cost ~ Profit, data = data), col = "Blue")

We see that this relationship is not strong; the fitted line shows how Shipping_Cost changes as Profit increases.