library(BayesFactor)
## Warning: package 'BayesFactor' was built under R version 4.0.2
## Loading required package: coda
## Warning: package 'coda' was built under R version 4.0.2
## Loading required package: Matrix
## ************
## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
##
## Type BFManual() to open the manual.
## ************
set.seed(1000)
indoor<- sample(c("A","B","C","D"),prob=c(5,8,25,6),
size=200,replace=T)
outdoor <-sample(c("A","B","C","D"),prob=c(5,20,3,6),
size=200,replace=T)
tab1 <- table(indoor,outdoor)
tab2 <- table(c(indoor,outdoor),rep(c("I","O"),each=200))
tab1
## outdoor
## indoor A B C D
## A 4 11 3 5
## B 9 23 1 10
## C 19 59 10 19
## D 5 12 2 8
tab2
##
## I O
## A 23 37
## B 43 105
## C 107 16
## D 27 42
chisq.test(tab1)
## Warning in chisq.test(tab1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tab1
## X-squared = 5.0836, df = 9, p-value = 0.827
chisq.test(tab2)
##
## Pearson's Chi-squared test
##
## data: tab2
## X-squared = 99.826, df = 3, p-value < 2.2e-16
contingencyTableBF(tab1 ,sampleType = "indepMulti", fixedMargin = "cols")
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 0.000346898 ±0%
##
## Against denominator:
## Null, independence, a = 1
## ---
## Bayes factor type: BFcontingencyTable, independent multinomial
contingencyTableBF(tab2 ,sampleType = "indepMulti", fixedMargin = "cols")
## Bayes factor analysis
## --------------
## [1] Non-indep. (a=1) : 1.21942e+21 ±0%
##
## Against denominator:
## Null, independence, a = 1
## ---
## Bayes factor type: BFcontingencyTable, independent multinomial
When performing the Chi-squared test for table 1 we get a non significant p-value because responses to indoor preferences do not influence responses to outdoor preferences and there is independency between the two. On the other hand, preferences in indoor and outdoor activities strongly differ with a significant p-value < 2.2e-16. Thus we cannot reject the null hypothesis in the first table but we can confidently reject the null hypothesis for the second case. The bayes factor contingency tables show similar results for both table. Interestingly, the bf for the second case is huge, meaning that we can confidently accept the alternative hypothesis.
For the following example data sets and questions, answer the statistical question using an appropriate test. Whenever possible, run both a NHST and a Bayesian test. If you choose to use a non-parametric test, give a rationale for why. The data here are completely fabricated, so do not use your intuition for what should be true, but rather find out from the data.
The column names are: * age: age of car owner * gender: gender of car owner * type: type of vehicle * origin: location car was manufactured * origin.last: location of previous car’s manufacture * carval: purchase price of vehicle * carval.last purchase price of previous vehicle
library('car')
## Warning: package 'car' was built under R version 4.0.2
## Loading required package: carData
cardat <- read.table(text="age gender type origin origin.last carval carval.last
34 F SUV US US 16400 15800
31 M Truck US Europe 16900 16000
47 M Sedan US US 18800 17100
21 F Sedan Japan Japan 16000 15500
42 M SUV US Japan 16800 16100
43 F SUV US US 17200 16300
60 F Truck Europe Europe 19900 17800
37 M Truck Europe Europe 17100 16200
46 F SUV Japan Japan 16900 16300
27 M Sedan US US 16200 15700
50 M SUV US US 18800 17100
64 F SUV Japan US 50700 31700
33 M SUV Japan Japan 16500 15900
39 M Truck US Europe 17000 16200
58 F Sedan Japan US 19400 17500
53 F SUV US Europe 19200 17400
29 F Sedan US Japan 16300 15700
37 F Sedan US US 17300 16300
37 M SUV US Japan 18200 16700
54 F Sedan Japan Japan 24500 19800
46 F SUV Japan Europe 18000 16700
55 F SUV US Japan 28900 21700
46 F Truck US Europe 16600 16100
57 M SUV Europe Europe 24300 19700
40 M SUV US US 16800 16100
27 M Sedan Japan US 16900 16000
58 M SUV Europe Europe 20300 17900
64 M Truck US US 40600 27100
47 M Truck US Europe 18400 16900
32 M Truck US US 15900 15600
43 F Sedan Japan US 17200 16300
66 M Truck Europe Europe 19100 17500
36 F SUV US Japan 16900 16100
68 M Truck US US 69300 40100
54 F Sedan Japan US 17000 16400
64 M Truck Japan Europe 34900 24600
27 M SUV Japan Europe 15800 15500
51 F Sedan Japan Japan 29000 21700
69 M Sedan US Japan 54400 33400
25 F Sedan Japan Japan 15800 15500",header=T)
# In each case:
# - describe the appropriate test to determine the answer
# - conduct the relevant NHST and Bayesian tests, or appropriate methods for estimating/predicting.
# - Create an appropriate figure illustrating the answer to the question.
# - Give an answer in words, both in terms of the question asked and in terms of the statistical test.
# - Provide the code used to conduct the test.
#1) Is there an impact of gender on the type of car purchased?
gendertype = cardat[,c(2,3)]
df_gendertype = data.frame(table(gendertype))
chisq.test(table(gendertype))
##
## Pearson's Chi-squared test
##
## data: table(gendertype)
## X-squared = 6.2934, df = 2, p-value = 0.04299
plot(df_gendertype$Freq,xaxt = 'n',xlab = 'Cars',ylab = 'Frequency', ylim = c(0,10), type="n")
text(x = 1:6, y = df_gendertype$Freq,labels = as.character(df_gendertype$type), col = c('red','blue'))
legend(4, 6, legend = c('F --> Red','M --> Blue'), box.lty = 0)
# There seems to be an impact of gender on the type of car purchased (x^2 = 6.2934, p = 0.04299) where women tend to prefer buying Sedans and men prefer buying Trucks. Interestingly SUV seems equally liked by both genders.
#2) Is there a difference in amount paid for a car for men versus women?
amountpaid = cardat[,c(2,6,7)]
amountpaid$twocars = amountpaid$carval + amountpaid$carval.last
amountpaid = amountpaid[,c(1,4)]
amountpaid$twocars = amountpaid$twocars[order(amountpaid$gender)]
amountpaid$gender = amountpaid$gender[order(amountpaid$gender)]
female = amountpaid$twocars[1:19]
male = amountpaid$twocars[20:40]
par(mfrow=c(1,2))
qqPlot(female,main = 'F amount paid norm assum')
## [1] 6 18
qqPlot(male,main = 'M amount paid norm assum')
## [1] 18 21
wilcox.test(male,female,paired=F)
## Warning in wilcox.test.default(male, female, paired = F): cannot compute exact
## p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: male and female
## W = 206, p-value = 0.8708
## alternative hypothesis: true location shift is not equal to 0
# there seems to be no significant difference (p=0.8708) between money spent by women and men for the previous and last car purchased. I used a Wilcoxon test because normality assumption was rejected.
#3)Do people tend to buy vehicles from the same origin as their last vehicle (US, europe, japan)?
origin <- table(cardat$origin,cardat$origin.last)
chisq.test(origin)
## Warning in chisq.test(origin): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: origin
## X-squared = 12.772, df = 4, p-value = 0.01245
# People tend to buy vehicles from the same origin as their last vehicle (p = 0.01245) perhaps because they were used to or satisfied with their previous car.
#4)Is there a relationship between driver age and the value of he car?
relationship_age_value = cor(cardat$age, cardat$carval)
r_squared = relationship_age_value ^ 2
relationship_age_value_2cars = cor(cardat$age, cardat$carval + cardat$carval.last)
r_squared_2 = relationship_age_value_2cars ^ 2
plot(cardat$age, cardat$carval, xlab = 'Age', ylab = 'Money',pch = 18, main = '')
legend(25, 60000, legend = c((sprintf('r = %s\nr^2 = %s ',round(relationship_age_value,3),round(r_squared,3)))),box.lty = 0)
cor.test(cardat$age, cardat$carval)
##
## Pearson's product-moment correlation
##
## data: cardat$age and cardat$carval
## t = 5.5385, df = 38, p-value = 2.446e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4506396 0.8109970
## sample estimates:
## cor
## 0.6683299
cor.test(cardat$age, cardat$carval + cardat$carval.last)
##
## Pearson's product-moment correlation
##
## data: cardat$age and cardat$carval + cardat$carval.last
## t = 5.6237, df = 38, p-value = 1.869e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4587712 0.8144767
## sample estimates:
## cor
## 0.6739634
#there seems to be a positive correlation (r = 0.668, p<0.05) between age and value of the last purchased car where variation in age explains up to 44% of variation (r^2 = 0.447) in amount spent for a vehicle, thus we should include other possible variables that could play an important role in this.
#The same is true for age and the sum of the two last vehicles (r = 0.673, r^2 = 0.454, p<0.05) further supporting this relationship.
#5)What is your best estimate for the value of a car driven by a 32, 52, and 62-year-old?
fit = lm(carval~age, cardat)
coeff=coefficients(fit)
eq = paste0("y = ", round(coeff[2],1), "*x ", round(coeff[1],1))
plot(cardat$age, cardat$carval, xlab = 'Age', ylab = 'Money',pch = 18,ylim = c(0,70000),main = eq)
abline(-4494.1 , 592.2)
years = c(32,52,62)
years_df = data.frame(NA,NA,NA)
colnames(years_df) <- c("32","52","62")
colnames(32,52,62)
## NULL
for(i in 1:3){
years_df[1,i] = 592.2 * years[i] - 4494}
#if our data followed a linear trend the value would be 32y = 14456.4, 52 = 26300, 62 = 32222.4
#However I feel like we should either eyeball it or use a nonlinear estimation model to get the real values...
#6)Is there a relationship between how much someone paid for their previous car and how much they paid for their current car?
#7)Did people tend to pay more for their current car than their previous car?
correlation = cor(cardat$carval.last,cardat$carval)
rq = paste0("r = ", round(correlation,4),"\nr^2 = ", round((correlation)^2,4))
plot(cardat$carval.last,cardat$carval, xlab = 'Previous Car', ylab = 'New Car',pch = 18,ylim = c(0,70000),main = rq)
wilcox.test(cardat$carval,cardat$carval.last,paired = T)
## Warning in wilcox.test.default(cardat$carval, cardat$carval.last, paired = T):
## cannot compute exact p-value with ties
##
## Wilcoxon signed rank test with continuity correction
##
## data: cardat$carval and cardat$carval.last
## V = 820, p-value = 3.641e-08
## alternative hypothesis: true location shift is not equal to 0
#Here the correlation is basically 1 which means that they are strongly positively correlated and a linear equation describes the relationship perfectly. I chose to adopt a paired wilcox.test because normality assumption was rejected and the same people were tested (although it is not too clear to me whether we can really claim that to be dependent, no real interventional protocol was provided and how do we know that i.e. the ageing process isn't the main protagonist here instead of the fact that they just preferred to spend more money on a car because of the previous car purchased?). A strong p.value (0.00003) shows us that there is a significant difference between amount paid for the car and the second car.
#8)Did trucks cost more than SUVs?
trucks_SUVs = subset(cardat, type == 'SUV' | type == 'Truck')
ordered_cars = trucks_SUVs[order(trucks_SUVs$type),]
SUVs = c(ordered_cars$carval[1:16], ordered_cars$carval.last[1:16])
Trucks = c(ordered_cars$carval[17:27], ordered_cars$carval.last[17:27])
wilcox.test(Trucks,SUVs, alternative = 'greater')
## Warning in wilcox.test.default(Trucks, SUVs, alternative = "greater"): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: Trucks and SUVs
## W = 401.5, p-value = 0.1941
## alternative hypothesis: true location shift is greater than 0
#Trucks do not statistically cost more than SUVs (p>0.05)
# #install.packages("devtools")
# library(devtools)
# #install_github("easyGgplot2", "kassambara") doesn't work!
# library(ggplot2)
# ggplot(as.data.frame(SUVs), aes(x=SUVs)) +
# geom_histogram(aes(y=..density..), colour="black", fill="white")+
# geom_density(alpha=.2, fill="#FF6666")
#
# ggplot(as.data.frame(Trucks), aes(x=Trucks)) +
# geom_histogram(fill="red", alpha=0.5, position="identity")