# WSES Solution

# set working directory as Datasets
# Read the WSES leads file (comma-separated, with a header row); character
# columns are converted to factors on load.
data1 <- read.csv(
  file = "G:\\IIMK DABS\\R\\Datasets\\wsesdata.csv",
  stringsAsFactors = TRUE,
  sep = ",",
  header = TRUE
)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Recode the categorical columns. Product gets an explicit, ordered level
# sequence; Industry and Region keep whatever levels occur in the data.
data1$Product <- factor(
  data1$Product,
  levels = c("GTMSys", "Procsys", "LearnSys", "Finsys", "Lifesys", "Logissys", "ContactSys"),
  ordered = TRUE
)
data1$Industry <- factor(data1$Industry)
data1$Region <- factor(data1$Region)

# Factor copies of two columns, appended under space-separated names.
data1$`Reporting Status` <- factor(data1$Reporting.Status)
data1$`Leads Conversion Class` <- factor(data1$Leads.Conversion.Class)

# Shorten the imported column names in one pass (rename keeps column positions).
data1 <- data1 %>%
  rename(
    Sl.no = Opportunity.No.,
    sales_value = Sales.Value.in.Million,
    Profit.of.customer = Profit.of.Customer.in.Million,
    Relative.Strength = Relative.Strength.in.the.segment,
    WSES.Proportion = WSES.Proportion.in.Joint.Bid,
    Profit = Profit..,
    Reporting.status = Reporting.Status
  )

# Question 1
#1. The marketing team of WSES believes that the average sales value of the leads that they receive is at least 8 million dollars. 
#The marketing team believes that the standard deviation of sale value is about 2 million dollars.
#Use an appropriate hypothesis test to check whether the average sales value in the population is at least 8 million dollars.
# One-sample z-test of the mean sales value against 8 (million), using the
# marketing team's stated population standard deviation of 2 (million).
mean_sales.value <- mean(data1$sales_value)
# The standard error uses the actual number of leads rather than a typed-in
# 1000, so it stays correct if the data set changes size.
std.error <- 2 / sqrt(length(data1$sales_value))
z_statistc <- (mean_sales.value - 8) / std.error
# Lower-tail probability P(Z <= z).
pnorm(z_statistc)

## [1] 0.7576813

# Upper-tail probability P(Z > z). Computed directly with lower.tail = FALSE so
# that no variable named `pnorm` is created to mask stats::pnorm().
pnorm(z_statistc, lower.tail = FALSE)

## [1] 0.2423187

## Question 2
#QUESTION 2
#Jason McCullagh, senior marketing manager at WSES doubted the value of standard deviation provided by the marketing team. 
#Jason argued that there is no way the marketing team could have known the population standard deviation for the sales value, 
#since the population itself is unknown. Do you agree with Jason McCullagh? If yes, perform the test again using an appropriate hypothesis test.

# One-sample t-test of the mean sales value against 8 with a right-tailed
# alternative; the standard deviation is estimated from the sample.
t.test_value <- t.test(data1$sales_value, mu = 8, alternative = "greater")
# Display the result: the assignment alone prints nothing.
t.test_value

#Question 3: Prudy Perkins, the Chief Marketing Officer (CMO), informed the board that they win at least 50% of the sales leads that they receive.
#Use an appropriate hypothesis testing procedure to check whether the proportion of leads won by WSES is more than 50%.
#Ho : p <= 0.50, Ha : p >0.50

# Number of leads in each outcome level of Reporting.status.
summary(data1$Reporting.status)

## Lost  Won 
##  519  481

# One-sample test of proportions, Ha: P(Won) > 0.5.
# The number of wins and the number of leads are read from the data rather
# than typed in by hand, so the test cannot drift out of step with data1.
status_counts <- table(data1$Reporting.status)
prop.test(
  x = status_counts[["Won"]],
  n = sum(status_counts),
  p = 0.5,
  alternative = "greater"
)

## 
##  1-sample proportions test with continuity correction
## 
## data:  c(481) out of 1000, null probability 0.5
## X-squared = 1.369, df = 1, p-value = 0.879
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
##  0.4546 1.0000
## sample estimates:
##     p 
## 0.481

# Question 4
# Question 4: Hendry Jackson, who works in the product line "learnsys",
#claims that the probability of winning a sales lead for the product "learnsys"
#is more than that of "Finsys". Is there statistically significant evidence in favor
#of Hendry's claim?

# Build a data frame holding only the outcome and product of the LearnSys and
# Finsys leads. The filter uses the piped data's own Product column instead of
# reaching back into data1, so it no longer relies on the two objects having
# identical row order.
data3 <- data1 %>%
  select(Reporting.status, Product) %>%
  filter(Product == "LearnSys" | Product == "Finsys")
# Cross-tabulate outcome by product.
table(data3)

##                 Product
## Reporting.status GTMSys Procsys LearnSys Finsys Lifesys Logissys ContactSys
##             Lost      0       0       55     83       0        0          0
##             Won       0       0       71     34       0        0          0

# Column-wise proportions: within each product, the share of leads lost and won.
prop.table(table(data3), margin = 2)

##                 Product
## Reporting.status GTMSys Procsys  LearnSys    Finsys Lifesys Logissys ContactSys
##             Lost                0.4365079 0.7094017                            
##             Won                 0.5634921 0.2905983

# margin is used to calculate either rowwise proportion or columnwise proportion
# margin =1 for rowwise proportion calculation and margin = 2 for columnwise proportion
# Raise the penalty on scientific notation so numbers print in fixed notation
# (this is a session-wide option and stays in effect for the rest of the script).
options(scipen = 100)
# Per-column counts of data3: leads by outcome and by product.
summary(data3)

##  Reporting.status       Product   
##  Lost:138         GTMSys    :  0  
##  Won :105         Procsys   :  0  
##                   LearnSys  :126  
##                   Finsys    :117  
##                   Lifesys   :  0  
##                   Logissys  :  0  
##                   ContactSys:  0

# Two-sample test of proportions, Ha: P(win | LearnSys) > P(win | Finsys).
# The won/lost counts are read from data3 instead of being typed in by hand.
# prop.test() accepts a two-column matrix of (successes, failures), one row
# per group; LearnSys is listed first so it is "prop 1".
q4_counts <- table(data3$Product, data3$Reporting.status)[
  c("LearnSys", "Finsys"), c("Won", "Lost")
]
prop.test(q4_counts, alternative = "greater")

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(71, 34) out of c(126, 117)
## X-squared = 17.316, df = 1, p-value = 0.00001583
## alternative hypothesis: greater
## 95 percent confidence interval:
##  0.1644089 1.0000000
## sample estimates:
##    prop 1    prop 2 
## 0.5634921 0.2905983

# Comparison btw Learnsys and Lifesys
# Outcome and product for the LearnSys and Lifesys leads only. The filter uses
# the piped data's own Product column instead of data1$Product, so it does not
# depend on data1 and the selected data sharing the same row order.
data4 <- data1 %>%
  select(Reporting.status, Product) %>%
  filter(Product == "LearnSys" | Product == "Lifesys")
# Cross-tabulate outcome by product.
table(data4)

##                 Product
## Reporting.status GTMSys Procsys LearnSys Finsys Lifesys Logissys ContactSys
##             Lost      0       0       55      0      58        0          0
##             Won       0       0       71      0      54        0          0

# Column-wise proportions: within each product, the share of leads lost and won.
prop.table(table(data4), margin = 2)

##                 Product
## Reporting.status GTMSys Procsys  LearnSys Finsys   Lifesys Logissys ContactSys
##             Lost                0.4365079        0.5178571                    
##             Won                 0.5634921        0.4821429

# Two-sample test of proportions, Ha: P(win | LearnSys) > P(win | Lifesys).
# Counts come from data4 rather than being transcribed by hand; the matrix
# passed to prop.test() has (Won, Lost) columns with LearnSys as the first row.
learn_life_counts <- table(data4$Product, data4$Reporting.status)[
  c("LearnSys", "Lifesys"), c("Won", "Lost")
]
prop.test(learn_life_counts, alternative = "greater")

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(71, 54) out of c(126, 112)
## X-squared = 1.2642, df = 1, p-value = 0.1304
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.0334461  1.0000000
## sample estimates:
##    prop 1    prop 2 
## 0.5634921 0.4821429

# Comparison btw UK and Other Europe
# Outcome and region for the UK and Other Europe leads only. The filter uses
# the piped data's own Region column instead of data1$Region, so it does not
# depend on data1 and the selected data sharing the same row order.
data5 <- data1 %>%
  select(Reporting.status, Region) %>%
  filter(Region == "UK" | Region == "Other Europe")
# Cross-tabulate outcome by region.
table(data5)

##                 Region
## Reporting.status Africa Americas Canada India Japan Other Europe Singapore
##             Lost      0        0      0     0     0           92         0
##             Won       0        0      0     0     0           66         0
##                 Region
## Reporting.status Spain  UK
##             Lost     0 286
##             Won      0 267

# Per-column counts of data5: leads by outcome and by region.
summary(data5)

##  Reporting.status          Region   
##  Lost:378         UK          :553  
##  Won :333         Other Europe:158  
##                   Africa      :  0  
##                   Americas    :  0  
##                   Canada      :  0  
##                   India       :  0  
##                   (Other)     :  0

# Two-sample test of proportions, Ha: P(win | UK) > P(win | Other Europe).
# Counts come from data5 rather than being transcribed by hand; the matrix
# passed to prop.test() has (Won, Lost) columns with UK as the first row.
uk_europe_counts <- table(data5$Region, data5$Reporting.status)[
  c("UK", "Other Europe"), c("Won", "Lost")
]
prop.test(uk_europe_counts, alternative = "greater")

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(267, 66) out of c(553, 158)
## X-squared = 1.8383, df = 1, p-value = 0.08758
## alternative hypothesis: greater
## 95 percent confidence interval:
##  -0.01236324  1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.4828210 0.4177215

# Question 5:   Hendry Jackson also claims that the average sales value of 
#"learnsys" projects is higher than that of "Finsys" projects. 
#Check whether he is correct at 5% significance.

# Comparing avg sales value of Learnsys and finsys to check if there is a significant difference
# 2 sample T test
# Ho : Avg sales value Learnsys <= avg sales value Finsys
# Ha : Avg sales value Learnsys > avg sales value Finsys
# List the column names of data1 as they stand after the renaming step.
names(data1)

##  [1] "Sl.no"                  "Reporting.status"       "Sales.Outcome"         
##  [4] "Product"                "Industry"               "Region"                
##  [7] "Relative.Strength"      "Profit.of.customer"     "sales_value"           
## [10] "Profit"                 "WSES.Proportion"        "Leads.Conversion.Class"
## [13] "Reporting Status"       "Leads Conversion Class"

# Number of leads per product.
summary(data1$Product)

##     GTMSys    Procsys   LearnSys     Finsys    Lifesys   Logissys ContactSys 
##        463        133        126        117        112         29         20

# Sales values for the LearnSys and Finsys leads only.
data6 <- data1 %>%
  filter(Product == "LearnSys" | Product == "Finsys") %>%
  select(Product, sales_value)
# Pooled-variance two-sample t-test. The difference is taken as first group
# minus second group of Product (LearnSys, then Finsys), so "greater" is
# Ha: mean(LearnSys) > mean(Finsys).
t.test(
  sales_value ~ Product,
  data = data6,
  var.equal = TRUE,
  alternative = "greater"
)

## 
##  Two Sample t-test
## 
## data:  sales_value by Product
## t = 0.93503, df = 241, p-value = 0.1754
## alternative hypothesis: true difference in means between group LearnSys and group Finsys is greater than 0
## 95 percent confidence interval:
##  -0.1728648        Inf
## sample estimates:
## mean in group LearnSys   mean in group Finsys 
##               8.030476               7.804786

#or
# Same test without the formula interface: x holds the LearnSys sales values
# and y the Finsys ones, so "greater" is Ha: mean(LearnSys) > mean(Finsys).
t.test(
  x = data1$sales_value[data1$Product == "LearnSys"],
  y = data1$sales_value[data1$Product == "Finsys"],
  var.equal = TRUE,
  alternative = "greater"
)

## 
##  Two Sample t-test
## 
## data:  data1$sales_value[data1$Product == "LearnSys"] and data1$sales_value[data1$Product == "Finsys"]
## t = 0.93503, df = 241, p-value = 0.1754
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.1728648        Inf
## sample estimates:
## mean of x mean of y 
##  8.030476  7.804786

# Comparing avg sales value of LearnSys and Procsys to check if there is a significant difference
# Ho : avg sales value LearnSys <= avg sales value Procsys
# Ha : avg sales value LearnSys > avg sales value Procsys
# Two-sample t-test
data7 <- data1 %>%
  select(Product, sales_value) %>%
  filter(Product == "LearnSys" | Product == "Procsys")
# With the formula interface the difference is (first group) - (second group)
# of Product. Procsys precedes LearnSys in the factor levels, so the tested
# difference is mean(Procsys) - mean(LearnSys); Ha (LearnSys > Procsys)
# therefore corresponds to alternative = "less". The earlier call used
# "greater", which tested Procsys > LearnSys (one-sided p = 0.223); for the
# stated Ha the one-sided p-value is the complement, about 0.777.
t.test(sales_value ~ Product, data = data7, alternative = "less", var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  sales_value by Product
## t = 0.76326, df = 257, p-value = 0.223
## alternative hypothesis: true difference in means between group Procsys and group LearnSys is greater than 0
## 95 percent confidence interval:
##  -0.2147027        Inf
## sample estimates:
##  mean in group Procsys mean in group LearnSys 
##               8.215113               8.030476

#OR
# Explicit x/y form of the LearnSys vs Procsys test. For Ha: mean(LearnSys) >
# mean(Procsys) with alternative = "greater", LearnSys must be x and Procsys y;
# the earlier call had them the other way round and so tested Procsys > LearnSys.
t.test(
  x = data7$sales_value[data7$Product == "LearnSys"],
  y = data7$sales_value[data7$Product == "Procsys"],
  alternative = "greater",
  var.equal = TRUE
)

## 
##  Two Sample t-test
## 
## data:  data7$sales_value[data7$Product == "Procsys"] and data7$sales_value[data7$Product == "LearnSys"]
## t = 0.76326, df = 257, p-value = 0.223
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.2147027        Inf
## sample estimates:
## mean of x mean of y 
##  8.215113  8.030476

#Question 6: Liz was of the opinion that there is difference in the average profit across
#geographical locations, such as United Kingdom, India and the America.
#Use an appropriate test to verify the same.
#ANOVA Test
# Number of leads per region.
summary(data1$Region)

##       Africa     Americas       Canada        India        Japan Other Europe 
##           93          104            6           35           16          158 
##    Singapore        Spain           UK 
##           23           12          553

# Profit for the three regions named in the question.
data8 <- data1 %>%
  filter(Region %in% c("India", "Americas", "UK")) %>%
  select(Region, Profit)

# One-way ANOVA of Profit across those regions, followed by its F-table.
anova <- aov(Profit ~ Region, data = data8)
summary(anova)

##              Df Sum Sq Mean Sq F value Pr(>F)
## Region        2    297   148.7   1.505  0.223
## Residuals   689  68075    98.8

# Grand mean and per-region mean Profit (with group sizes) for the fitted model.
model.tables(anova, type = "means")

## Tables of means
## Grand mean
##         
## 50.7052 
## 
##  Region 
##     Americas India    UK
##        50.33 53.51  50.6
## rep   104.00 35.00 553.0

# The same Profit ~ Region fit on data8, written as a single expression
# without storing the model object.
summary(aov(Profit ~ Region, data = data8))

##              Df Sum Sq Mean Sq F value Pr(>F)
## Region        2    297   148.7   1.505  0.223
## Residuals   689  68075    98.8

#ANOVA for all regions
# One-way ANOVA of Profit across every region in data1 (not only the three
# regions named in the question), followed by its F-table.
anova1 <- aov(Profit ~ Region, data = data1)
summary(anova1)

##              Df Sum Sq Mean Sq F value Pr(>F)
## Region        8   1171   146.3   1.428   0.18
## Residuals   991 101556   102.5

# Grand mean and per-region mean Profit (with group sizes) for the all-regions model.
model.tables(anova1, type = "means")

## Tables of means
## Grand mean
##        
## 50.703 
## 
##  Region 
##     Africa Americas Canada India Japan Other Europe Singapore Spain    UK
##      50.97    50.33  43.17 53.51 48.12        50.36     54.61 52.75  50.6
## rep  93.00   104.00   6.00 35.00 16.00       158.00     23.00 12.00 553.0

#ANOVA Relative Strength comparison
# Keep just the response and the grouping column.
data9 <- data1[, c("Relative.Strength", "Region")]

# One-way ANOVA of Relative.Strength across regions, followed by its F-table.
anova2 <- aov(Relative.Strength ~ Region, data = data9)
summary(anova2)

##              Df Sum Sq Mean Sq F value              Pr(>F)    
## Region        8  17554  2194.2   24.48 <0.0000000000000002 ***
## Residuals   991  88819    89.6                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Grand mean and per-region mean Relative.Strength (with group sizes).
model.tables(anova2, type = "means")

## Tables of means
## Grand mean
##        
## 49.605 
## 
##  Region 
##     Africa Americas Canada India Japan Other Europe Singapore Spain     UK
##      45.56    46.45  25.83    44 47.44        51.67     66.39    66  49.91
## rep  93.00   104.00   6.00    35 16.00       158.00     23.00    12 553.00

# Tukey HSD: all pairwise differences in mean Relative.Strength between
# regions, with family-wise 95% intervals and adjusted p-values.
TukeyHSD(anova2, which = "Region")

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Relative.Strength ~ Region, data = data9)
## 
## $Region
##                               diff         lwr         upr     p adj
## Americas-Africa          0.8927833  -3.3073387   5.0929053 0.9992043
## Canada-Africa          -19.7258065 -32.1219740  -7.3296389 0.0000312
## India-Africa            -1.5591398  -7.3951552   4.2768756 0.9959439
## Japan-Africa             1.8783602  -6.0868772   9.8435976 0.9983146
## Other Europe-Africa      6.1117463   2.2653522   9.9581404 0.0000323
## Singapore-Africa        20.8321646  13.9786941  27.6856350 0.0000000
## Spain-Africa            20.4408602  11.4137345  29.4679860 0.0000000
## UK-Africa                4.3504443   1.0520759   7.6488127 0.0014710
## Canada-Americas        -20.6185897 -32.9749621  -8.2622174 0.0000092
## India-Americas          -2.4519231  -8.2029268   3.2990806 0.9237556
## Japan-Americas           0.9855769  -6.9175854   8.8887393 0.9999859
## Other Europe-Americas    5.2189630   1.5028204   8.9351056 0.0004745
## Singapore-Americas      19.9393813  13.1581554  26.7206072 0.0000000
## Spain-Americas          19.5480769  10.5756766  28.5204772 0.0000000
## UK-Americas              3.4576610   0.3121559   6.6031661 0.0189272
## India-Canada            18.1666667   5.1628984  31.1704349 0.0005247
## Japan-Canada            21.6041667   7.5157345  35.6925988 0.0000750
## Other Europe-Canada     25.8375527  13.5968967  38.0782088 0.0000000
## Singapore-Canada        40.5579710  27.0669010  54.0490410 0.0000000
## Spain-Canada            40.1666667  25.4517792  54.8815541 0.0000000
## UK-Canada               24.0762508  11.9965923  36.1559092 0.0000000
## Japan-India              3.4375000  -5.4438302  12.3188302 0.9558614
## Other Europe-India       7.6708861   2.1729077  13.1688645 0.0005378
## Singapore-India         22.3913043  14.4917426  30.2908661 0.0000000
## Spain-India             22.0000000  12.1551019  31.8448981 0.0000000
## UK-India                 5.9095841   0.7800371  11.0391311 0.0107559
## Other Europe-Japan       4.2333861  -3.4876046  11.9543768 0.7438818
## Singapore-Japan         18.9538043   9.3731464  28.5344623 0.0000000
## Spain-Japan             18.5625000   7.3238191  29.8011809 0.0000121
## UK-Japan                 2.4720841  -4.9910375   9.9352057 0.9829945
## Singapore-Other Europe  14.7204183   8.1524089  21.2884277 0.0000000
## Spain-Other Europe      14.3291139   5.5167538  23.1414740 0.0000181
## UK-Other Europe         -1.7613020  -4.4160948   0.8934908 0.4999296
## Spain-Singapore         -0.3913043 -10.8714284  10.0888197 1.0000000
## UK-Singapore           -16.4817203 -22.7445658 -10.2188748 0.0000000
## UK-Spain               -16.0904159 -24.6777424  -7.5030894 0.0000003

# 2-Way ANOVA
# Here, two categorical variables and its significance on the numeric variable
# Two-way ANOVA of Relative.Strength on Region, Product and their interaction.
# In an R formula `Region*Product` already expands to both main effects plus
# Region:Product, so the explicit `Region + Product` terms are redundant but
# harmless; the formula is left as written because it is echoed in later output.
anova3 <- aov(Relative.Strength ~ Region + Product + Region*Product, data = data1)
summary(anova3)

##                 Df Sum Sq Mean Sq F value              Pr(>F)    
## Region           8  17554  2194.2   31.30 <0.0000000000000002 ***
## Product          6  10358  1726.3   24.62 <0.0000000000000002 ***
## Region:Product   6   9819  1636.6   23.34 <0.0000000000000002 ***
## Residuals      979  68642    70.1                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Grand mean, per-region means, per-product means and Region-by-Product cell
# means of Relative.Strength, each with its group size.
model.tables(anova3, type = "means")

## Tables of means
## Grand mean
##        
## 49.605 
## 
##  Region 
##     Africa Americas Canada India Japan Other Europe Singapore Spain     UK
##      45.56    46.45  25.83    44 47.44        51.67     66.39    66  49.91
## rep  93.00   104.00   6.00    35 16.00       158.00     23.00    12 553.00
## 
##  Product 
##     GTMSys Procsys LearnSys Finsys Lifesys Logissys ContactSys
##      50.27   48.66     46.8  54.53    47.3    38.21       58.7
## rep 463.00  133.00    126.0 117.00   112.0    29.00       20.0
## 
##  Region:Product 
##               Product
## Region         GTMSys Procsys LearnSys Finsys Lifesys Logissys ContactSys
##   Africa        45.84          45.00                                     
##   rep           62.00   0.00   31.00     0.00   0.00    0.00     0.00    
##   Americas      45.20  55.23                                             
##   rep           91.00  13.00    0.00     0.00   0.00    0.00     0.00    
##   Canada               29.00                   10.00                     
##   rep            0.00   5.00    0.00     0.00   1.00    0.00     0.00    
##   India                44.00                                             
##   rep            0.00  35.00    0.00     0.00   0.00    0.00     0.00    
##   Japan         58.67  33.00                                             
##   rep            9.00   7.00    0.00     0.00   0.00    0.00     0.00    
##   Other Europe  46.50                   55.97  53.64                     
##   rep           56.00   0.00    0.00    38.00  64.00    0.00     0.00    
##   Singapore     66.39                                                    
##   rep           23.00   0.00    0.00     0.00   0.00    0.00     0.00    
##   Spain         66.00                                                    
##   rep           12.00   0.00    0.00     0.00   0.00    0.00     0.00    
##   UK            52.74  47.79   46.37    55.14  42.06   38.52    59.00    
##   rep          210.00  73.00   95.00    79.00  47.00   29.00    20.00

# Tukey HSD pairwise comparisons between regions under the two-way model.
TukeyHSD(anova3, which = "Region")

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Relative.Strength ~ Region + Product + Region * Product, data = data1)
## 
## $Region
##                               diff         lwr        upr     p adj
## Americas-Africa          0.8927833  -2.8222307   4.607797 0.9980708
## Canada-Africa          -19.7258065 -30.6902344  -8.761378 0.0000010
## India-Africa            -1.5591398  -6.7211038   3.602824 0.9906727
## Japan-Africa             1.8783602  -5.1669037   8.923624 0.9959997
## Other Europe-Africa      6.1117463   2.7096052   9.513887 0.0000011
## Singapore-Africa        20.8321646  14.7702602  26.894069 0.0000000
## Spain-Africa            20.4408602  12.4563545  28.425366 0.0000000
## UK-Africa                4.3504443   1.4330327   7.267856 0.0001397
## Canada-Americas        -20.6185897 -31.5478188  -9.689361 0.0000002
## India-Americas          -2.4519231  -7.5386941   2.634848 0.8564112
## Japan-Americas           0.9855769  -6.0047815   7.975935 0.9999637
## Other Europe-Americas    5.2189630   1.9320295   8.505896 0.0000329
## Singapore-Americas      19.9393813  13.9413772  25.937385 0.0000000
## Spain-Americas          19.5480769  11.6119760  27.484178 0.0000000
## UK-Americas              3.4576610   0.6754573   6.239865 0.0037877
## India-Canada            18.1666667   6.6648150  29.668518 0.0000374
## Japan-Canada            21.6041667   9.1429283  34.065405 0.0000032
## Other Europe-Canada     25.8375527  15.0106749  36.664431 0.0000000
## Singapore-Canada        40.5579710  28.6251003  52.490842 0.0000000
## Spain-Canada            40.1666667  27.1513276  53.182006 0.0000000
## UK-Canada               24.0762508  13.3917756  34.760726 0.0000000
## Japan-India              3.4375000  -4.4180492  11.293049 0.9122625
## Other Europe-India       7.6708861   2.8079163  12.533856 0.0000386
## Singapore-India         22.3913043  15.4041307  29.378478 0.0000000
## Spain-India             22.0000000  13.2921735  30.707826 0.0000000
## UK-India                 5.9095841   1.3724924  10.446676 0.0018145
## Other Europe-Japan       4.2333861  -2.5958413  11.062613 0.5949985
## Singapore-Japan         18.9538043  10.4796987  27.427910 0.0000000
## Spain-Japan             18.5625000   8.6218706  28.503129 0.0000003
## UK-Japan                 2.4720841  -4.1290576   9.073226 0.9636698
## Singapore-Other Europe  14.7204183   8.9110045  20.529832 0.0000000
## Spain-Other Europe      14.3291139   6.5345688  22.123659 0.0000005
## UK-Other Europe         -1.7613020  -4.1094700   0.586866 0.3239332
## Spain-Singapore         -0.3913043  -9.6609890   8.878380 1.0000000
## UK-Singapore           -16.4817203 -22.0212161 -10.942224 0.0000000
## UK-Spain               -16.0904159 -23.6859185  -8.494913 0.0000000

# Tukey HSD pairwise comparisons between products under the two-way model.
TukeyHSD(anova3, which = "Product")

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Relative.Strength ~ Region + Product + Region * Product, data = data1)
## 
## $Product
##                            diff        lwr         upr     p adj
## Procsys-GTMSys       -1.6125114  -4.046347   0.8213237 0.4427451
## LearnSys-GTMSys      -3.4763644  -5.962165  -0.9905642 0.0007706
## Finsys-GTMSys         4.2597518   1.699899   6.8196040 0.0000213
## Lifesys-GTMSys       -2.9770925  -5.582159  -0.3720263 0.0134298
## Logissys-GTMSys     -12.0612165 -16.796842  -7.3255905 0.0000000
## ContactSys-GTMSys     8.4215421   2.771491  14.0715937 0.0002379
## LearnSys-Procsys     -1.8638530  -4.939405   1.2116993 0.5548244
## Finsys-Procsys        5.8722632   2.736555   9.0079710 0.0000008
## Lifesys-Procsys      -1.3645811  -4.537307   1.8081448 0.8652576
## Logissys-Procsys    -10.4487051 -15.518811  -5.3785993 0.0000000
## ContactSys-Procsys   10.0340535   4.100850  15.9672569 0.0000143
## Finsys-LearnSys       7.7361162   4.559906  10.9123266 0.0000000
## Lifesys-LearnSys      0.4992719  -2.713490   3.7120338 0.9992976
## Logissys-LearnSys    -8.5848521 -13.680107  -3.4895972 0.0000157
## ContactSys-LearnSys  11.8979066   5.943198  17.8526149 0.0000001
## Lifesys-Finsys       -7.2368442 -10.507239  -3.9664498 0.0000000
## Logissys-Finsys     -16.3209682 -21.452758 -11.1891787 0.0000000
## ContactSys-Finsys     4.1617904  -1.824209  10.1477901 0.3812306
## Logissys-Lifesys     -9.0841240 -14.238616  -3.9296319 0.0000049
## ContactSys-Lifesys   11.3986346   5.393161  17.4041086 0.0000006
## ContactSys-Logissys  20.4827586  13.292105  27.6734124 0.0000000

# Tables of means for the two-way model, printed again ahead of the
# interaction comparison (same call as earlier in this section).
model.tables(anova3, type = "means")

## Tables of means
## Grand mean
##        
## 49.605 
## 
##  Region 
##     Africa Americas Canada India Japan Other Europe Singapore Spain     UK
##      45.56    46.45  25.83    44 47.44        51.67     66.39    66  49.91
## rep  93.00   104.00   6.00    35 16.00       158.00     23.00    12 553.00
## 
##  Product 
##     GTMSys Procsys LearnSys Finsys Lifesys Logissys ContactSys
##      50.27   48.66     46.8  54.53    47.3    38.21       58.7
## rep 463.00  133.00    126.0 117.00   112.0    29.00       20.0
## 
##  Region:Product 
##               Product
## Region         GTMSys Procsys LearnSys Finsys Lifesys Logissys ContactSys
##   Africa        45.84          45.00                                     
##   rep           62.00   0.00   31.00     0.00   0.00    0.00     0.00    
##   Americas      45.20  55.23                                             
##   rep           91.00  13.00    0.00     0.00   0.00    0.00     0.00    
##   Canada               29.00                   10.00                     
##   rep            0.00   5.00    0.00     0.00   1.00    0.00     0.00    
##   India                44.00                                             
##   rep            0.00  35.00    0.00     0.00   0.00    0.00     0.00    
##   Japan         58.67  33.00                                             
##   rep            9.00   7.00    0.00     0.00   0.00    0.00     0.00    
##   Other Europe  46.50                   55.97  53.64                     
##   rep           56.00   0.00    0.00    38.00  64.00    0.00     0.00    
##   Singapore     66.39                                                    
##   rep           23.00   0.00    0.00     0.00   0.00    0.00     0.00    
##   Spain         66.00                                                    
##   rep           12.00   0.00    0.00     0.00   0.00    0.00     0.00    
##   UK            52.74  47.79   46.37    55.14  42.06   38.52    59.00    
##   rep          210.00  73.00   95.00    79.00  47.00   29.00    20.00

# Tukey HSD for the interaction term. `which` must match the model's term
# label exactly, and that label is "Region:Product" with no spaces (as shown
# in the ANOVA table). The spaced form "Region : Product" matched no term and
# produced a NaN warning with an empty `$<NA>` table.
TukeyHSD(anova3, which = "Region:Product")

## Warning in qtukey(conf.level, length(means), x$df.residual): NaNs produced

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Relative.Strength ~ Region + Product + Region * Product, data = data1)
## 
## $<NA>
##      diff lwr upr p adj

# Interaction plot of mean Relative.Strength by Region, one trace per Product.
# `col` should supply one colour per trace. Passing the whole data1$Product
# column gave a row-level vector whose leading entries depend on row order,
# not on the traces, so each Product level is now given its own colour index.
interaction.plot(
  data1$Region, data1$Product, data1$Relative.Strength,
  col = seq_len(nlevels(data1$Product))
)

#Question 7: Jack Williams, the CEO of the company, believed that the sales conversions are different
#for different geographical locations.
#Check the validity of Jack's belief using an appropriate hypothesis test.

#here, we have to check whether sales conversion (reporting status) varies with region. Both are categorical variables.
#so we use chi square test.
#Ho: No association between sales conversion and region, Ha: there is an association
# Region and outcome for every lead, then their cross-tabulation
# (one row per region, one column per outcome).
data10 <- data1[, c("Region", "Reporting.status")]
table(data10)

##               Reporting.status
## Region         Lost Won
##   Africa         38  55
##   Americas       49  55
##   Canada          2   4
##   India          18  17
##   Japan           6  10
##   Other Europe   92  66
##   Singapore      17   6
##   Spain          11   1
##   UK            286 267

# Row-wise proportions: within each region, the share of leads lost and won.
prop.table(table(data10), margin = 1)

##               Reporting.status
## Region               Lost        Won
##   Africa       0.40860215 0.59139785
##   Americas     0.47115385 0.52884615
##   Canada       0.33333333 0.66666667
##   India        0.51428571 0.48571429
##   Japan        0.37500000 0.62500000
##   Other Europe 0.58227848 0.41772152
##   Singapore    0.73913043 0.26086957
##   Spain        0.91666667 0.08333333
##   UK           0.51717902 0.48282098

# Chi-squared test of independence between outcome and region.
# NOTE(review): the "approximation may be incorrect" warning presumably comes
# from the sparsely populated regions (e.g. Canada, Spain) — confirm before
# relying on the p-value.
chisq.test(data10$Reporting.status, data10$Region)

## Warning in chisq.test(data10$Reporting.status, data10$Region): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  data10$Reporting.status and data10$Region
## X-squared = 22.263, df = 8, p-value = 0.004452

# Question 8:   Joe Danby, the chief financial officer believes that 
#the sales conversions depend on the sales value. Use an appropriate hypothesis test to check the validity
# of this claim by making the following 3 groups

# Sales value less than 6 million dollars.
# Sales value between 6 and 8 million (both inclusive) dollars.
# More than 8 million dollars.

# Bucket each lead by sales value: below 6 is "Low", 6 to 8 inclusive is
# "Medium", above 8 is "High". The three conditions are mutually exclusive.
data11 <- data1 %>%
  select(sales_value, Reporting.status) %>%
  mutate(
    sales.value.category = case_when(
      sales_value < 6 ~ "Low",
      sales_value > 8 ~ "High",
      sales_value >= 6 & sales_value <= 8 ~ "Medium"
    )
  )
# Chi-squared test of independence between outcome and sales-value bucket.
chisq.test(data11$Reporting.status, data11$sales.value.category)

## 
##  Pearson's Chi-squared test
## 
## data:  data11$Reporting.status and data11$sales.value.category
## X-squared = 1.377, df = 2, p-value = 0.5023

# Outcome and sales-value bucket only, then their cross-tabulation.
data12 <- data11[, c("Reporting.status", "sales.value.category")]
table(data12)

##                 sales.value.category
## Reporting.status High Low Medium
##             Lost  251  86    182
##             Won   248  69    164

# Column-wise percentages: within each sales-value bucket, the share of leads
# lost and won.
prop.table(table(data12), margin = 2)*100

##                 sales.value.category
## Reporting.status     High      Low   Medium
##             Lost 50.30060 55.48387 52.60116
##             Won  49.69940 44.51613 47.39884