Paired T-Test
Field <- c(1,2,3,4,5,6,7)
Fum_A <-c(77,40,11,31,28,50,53)
Fum_B <- c(76,38,10,29,27,48,51)
data <- data.frame(Field,Fum_A,Fum_B)
data
mean_diff <-mean(data$Fum_A) - mean(data$Fum_B)
mean_diff
## [1] 1.571429
sd_diff <- sd(data$Fum_A - data$Fum_B) # SD of the paired differences, not the difference of SDs
sd_diff
## [1] 0.5345225
t_calc <- mean_diff/(sd_diff/sqrt(7))
t_calc
## [1] 7.778175
t.test(Fum_A, Fum_B, paired = TRUE)
##
## Paired t-test
##
## data: Fum_A and Fum_B
## t = 7.7782, df = 6, p-value = 0.0002377
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.077078 2.065779
## sample estimates:
## mean of the differences
## 1.571429
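Equivalently, a paired t-test is a one-sample t-test on the within-pair differences; a quick check (the t, df, and p-value should match the output above):
t.test(data$Fum_A - data$Fum_B, mu = 0)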
Paired T-Test
Cola_A <- c(48,45,50,41,55,57,48,47,50,52)
Cola_B <- c(42,40,44,41,50,54,56,50,43,45)
The two samples are paired since the same person tasted both colas.
paired_ttest <- t.test(Cola_A,Cola_B, paired = TRUE)
paired_ttest
##
## Paired t-test
##
## data: Cola_A and Cola_B
## t = 1.7764, df = 9, p-value = 0.1094
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.7656394 6.3656394
## sample estimates:
## mean of the differences
## 2.8
Since the p-value (0.1094) > 0.05, we fail to reject H0. We conclude the mean taste scores of the two colas are not significantly different.
Assumptions: normality
par(mfrow = c(1,2))
qqnorm(Cola_A, main = "QQ Plot Cola A")
qqline(Cola_A)
qqnorm(Cola_B, main = "QQ Plot Cola B")
qqline(Cola_B)
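The Shapiro-Wilk test gives a formal complement to the QQ plots; a minimal sketch (output not shown):
shapiro.test(Cola_A)
shapiro.test(Cola_B)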
Interaction Plot
life<-c(8.6,7.9,5.4,8.2,8.4,5.7,9.4,8.5,5.8,8.8,8.9,5.9)
brand<-rep(c("A","B"),each=6)
brand
## [1] "A" "A" "A" "A" "A" "A" "B" "B" "B" "B" "B" "B"
device<- rep(c("Radio","Camera","DVD"),4)
device
## [1] "Radio" "Camera" "DVD" "Radio" "Camera" "DVD" "Radio" "Camera"
## [9] "DVD" "Radio" "Camera" "DVD"
data<-data.frame(life,brand,device)
data
interaction.plot(data$brand, data$device, data$life,
                 xlab = "Brand", ylab = "Life", main = "Interaction plot",
                 ylim = c(1,10), trace.label = "Device", type = "b",
                 col = c("red","green","blue"), pch = c(19,17,15), fixed = TRUE)
#using ggplot
library(ggplot2)
interaction_gg <- ggplot(data, aes(brand, life, color = device)) +
  geom_point() +
  stat_summary(fun = mean, geom = "line", aes(group = device)) +
  theme_bw() +
  labs(title = "Interaction plot of brand and device")
interaction_gg
There appears to be no interaction, since the lines are roughly parallel.
#anova to see interaction
model<-aov(life~brand*device,data = data)
anova(model)
Since the p-value for the interaction term is > 0.05, we fail to reject the null hypothesis: there is no significant interaction.
library(emmeans)
#drop the insignificant interaction term
model2<-aov(life~brand+device,data = data)
lsm_bat<-lsmeans(model2,~brand)
ct<- summary(contrast(lsm_bat,list("A-B"=c(1,-1))),infer=c(T,T),level=0.95,side ="two-sided")
ct
Since the model does not involve an interaction effect, we recommend Brand B, which has a longer battery life than Brand A; the 95% CI for the difference of main effects (A − B) is (−0.88, −0.153).
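The same machinery gives all pairwise device comparisons; a sketch assuming model2 and the emmeans package from above (output not shown):
lsm_dev <- lsmeans(model2, ~device)
pairs(lsm_dev, adjust = "tukey")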
fit <- lm(life ~ device, data = data) # Fit the model
summary(fit)
##
## Call:
## lm(formula = life ~ device, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5500 -0.1875 0.0250 0.1250 0.6500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.4250 0.1970 42.761 1.04e-11 ***
## deviceDVD -2.7250 0.2786 -9.780 4.31e-06 ***
## deviceRadio 0.3250 0.2786 1.166 0.273
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3941 on 9 degrees of freedom
## Multiple R-squared: 0.9414, Adjusted R-squared: 0.9284
## F-statistic: 72.27 on 2 and 9 DF, p-value: 2.858e-06
par(mfrow = c(2, 2)) # Split the plotting panel into a 2 x 2 grid
plot(fit) # Plot the model information
Logistic Regression
flight_data <- read.csv("flightdata.csv")
head(flight_data)
str(flight_data)
## 'data.frame': 5196 obs. of 6 variables:
## $ origin : Factor w/ 1 level "JFK": 1 1 1 1 1 1 1 1 1 1 ...
## $ dest : Factor w/ 46 levels "ABQ","ATL","AUS",..: 30 5 5 27 21 40 13 2 46 34 ...
## $ delay : int 1 1 0 0 0 0 0 0 0 1 ...
## $ carrier : int 0 0 0 0 0 0 0 1 0 0 ...
## $ air_time: num 3.25 3.1 3.15 2.25 2.23 ...
## $ weekend : int 1 1 1 1 1 1 1 1 1 1 ...
logistic_reg <- glm(delay ~ air_time + carrier + weekend, family = binomial, data = flight_data)
summary(logistic_reg)
##
## Call:
## glm(formula = delay ~ air_time + carrier + weekend, family = binomial,
## data = flight_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3225 -1.1837 -0.9884 1.1706 1.3798
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.013021 0.060516 0.215 0.830
## air_time 0.001215 0.016762 0.072 0.942
## carrier -0.477806 0.064082 -7.456 8.91e-14 ***
## weekend 0.313711 0.061875 5.070 3.98e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7200.5 on 5195 degrees of freedom
## Residual deviance: 7112.0 on 5192 degrees of freedom
## AIC: 7120
##
## Number of Fisher Scoring iterations: 4
\[\log\left(\frac{\hat{p}}{1-\hat{p}}\right) = 0.013 + 0.001(\text{air\_time}) - 0.478(\text{carrier}) + 0.314(\text{weekend})\]
Since the coefficient for carrier is negative, Delta flights have a lower estimated probability of being delayed than JetBlue flights.
Since the coefficient for weekend is positive, flights departing during the weekend (Sat or Sun) have a higher estimated probability of being delayed than flights departing during the week (Mon-Fri).
The coefficient for air time is not significantly different than zero (p-value = 0.94); this suggests that air time is not a useful predictor of flight delays and should be removed from the model.
#remove air_time
logistic_reg1 <- glm(delay ~ carrier + weekend, family = binomial, data = flight_data)
summary(logistic_reg1)
##
## Call:
## glm(formula = delay ~ carrier + weekend, family = binomial, data = flight_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3204 -1.1844 -0.9894 1.1704 1.3779
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.01643 0.03805 0.432 0.666
## carrier -0.47622 0.06024 -7.905 2.68e-15 ***
## weekend 0.31368 0.06187 5.070 3.99e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7200.5 on 5195 degrees of freedom
## Residual deviance: 7112.0 on 5193 degrees of freedom
## AIC: 7118
##
## Number of Fisher Scoring iterations: 4
\[\log\left(\frac{\hat{p}}{1-\hat{p}}\right) = 0.016 - 0.476(\text{carrier}) + 0.314(\text{weekend})\]
The estimated probability that a Delta flight departing JFK during the weekend is delayed:
new_data <- data.frame(carrier = 1, weekend = 1)
new_data
predict(logistic_reg1, newdata=new_data, type="response")
## 1
## 0.4635359
The estimated probability that a JetBlue flight departing JFK during the weekend is delayed:
new_x <- data.frame(carrier = 0, weekend = 1)
predict(logistic_reg1, newdata=new_x, type="response")
## 1
## 0.5817854
We recommend Delta, because it has the lower estimated probability of being delayed (0.46 vs. 0.58).
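Exponentiating the coefficients expresses the effects as odds ratios; a quick check (e.g., exp(-0.476) ≈ 0.62, so a Delta flight's odds of delay are about 62% of a JetBlue flight's):
exp(coef(logistic_reg1)) # odds ratios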
n <-nrow(flight_data)
flight_data$SFO <- rep(0,n)
flight_data$SFO[flight_data$dest == "SFO"] <- 1
logistic_reg3 <- glm(delay ~ carrier + weekend + SFO, family = binomial, data = flight_data)
summary(logistic_reg3)
##
## Call:
## glm(formula = delay ~ carrier + weekend + SFO, family = binomial,
## data = flight_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3238 -1.1880 -0.8713 1.1669 1.5182
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.02480 0.03820 0.649 0.516
## carrier -0.45080 0.06104 -7.385 1.52e-13 ***
## weekend 0.31295 0.06191 5.055 4.31e-07 ***
## SFO -0.34678 0.13645 -2.541 0.011 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7200.5 on 5195 degrees of freedom
## Residual deviance: 7105.5 on 5192 degrees of freedom
## AIC: 7113.5
##
## Number of Fisher Scoring iterations: 4
GSS data
gss <- read.csv("gssdata.csv")
head(gss)
dim(gss)
## [1] 1487 2
#contingency table
addmargins(table(gss)) #add margins for sum
## spend_educ
## gender About right Too little Too much Sum
## Female 142 658 42 842
## Male 139 453 53 645
## Sum 281 1111 95 1487
658/1111 # proportion of "too little" responses that came from females
## [1] 0.5922592
42/95 # proportion of "too much" responses that came from females
## [1] 0.4421053
Chi-square test of independence:
Null hypothesis, H0: the variables gender and spend_educ are independent. HA: the variables are dependent.
test1 <- chisq.test(gss$gender, gss$spend_educ)
test1
##
## Pearson's Chi-squared test
##
## data: gss$gender and gss$spend_educ
## X-squared = 13.266, df = 2, p-value = 0.001316
test1$p.value
## [1] 0.001316225
test1$expected
## gss$spend_educ
## gss$gender About right Too little Too much
## Female 159.1137 629.0935 53.79287
## Male 121.8863 481.9065 41.20713
Since the p-value is less than 0.01, we reject the null hypothesis. The data provide evidence that gender and spend_educ are dependent.
We may be committing a Type I error: rejecting H0 when it is actually true. In this context, that would mean concluding gender and spend_educ are dependent when they are actually independent.
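A mosaic plot makes the association visible; a minimal sketch with base graphics:
mosaicplot(table(gss), main = "Gender vs spend_educ")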
Quakes data
head(quakes)
plot(quakes$depth, quakes$mag)
library(tidyverse)
## -- Attaching packages --------------------------- tidyverse 1.3.0 --
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.3
## -- Conflicts ------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
ggplot(quakes, aes( x = depth, y = mag)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
There is no clear linear relationship; the smooth looks essentially flat.
Using dplyr:
quakes_greater <- quakes %>% filter(depth>400)
quakes_greater %>% ggplot(aes(y=mag, x= depth))+geom_point()+ geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
quakes_smaller <- quakes %>% filter(depth<400)
quakes_smaller %>% ggplot(aes(y=mag, x= depth))+geom_point()+ geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
model <- lm(mag~depth, data = quakes)
summary(model)
##
## Call:
## lm(formula = mag ~ depth, data = quakes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72012 -0.29642 -0.03694 0.19818 1.70014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.755e+00 2.179e-02 218.168 < 2e-16 ***
## depth -4.310e-04 5.756e-05 -7.488 1.54e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3921 on 998 degrees of freedom
## Multiple R-squared: 0.05319, Adjusted R-squared: 0.05225
## F-statistic: 56.07 on 1 and 998 DF, p-value: 1.535e-13
The slope is statistically significant (p = 1.5e-13), but R-squared is only 0.053, so depth explains very little of the variation in magnitude.
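A 95% confidence interval for the slope makes the size of the effect explicit; a quick check:
confint(model, "depth", level = 0.95)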
plot(quakes$long, quakes$lat)
quakes %>% ggplot(aes(y = lat, x= long)) + geom_point()
Map
head(map_data("world"))
map_data("world" ) %>% ggplot(aes(x=long, y = lat)) + geom_path(aes(group=group),color = "red", size = 0.1)
long_lat <- quakes %>% select(lat, long) %>% apply(MARGIN = 2, range) %>% as.data.frame()
head(long_lat)
base_map <- map_data("world") %>% ggplot(aes(x=long, y=lat)) + geom_path(aes(group = group), color = "green", size = 0.1) + lims(x =long_lat$long, y = long_lat$lat)
base_map
## Warning: Removed 98515 row(s) containing missing values (geom_path).
base_map + geom_point(data = quakes, aes(color = mag, alpha = 0.01, size = mag)) + theme(legend.position = "none")
## Warning: Removed 98515 row(s) containing missing values (geom_path).
Gasoline data - ANOVA
gas_add <- read.csv('gasoline_additives.csv')
gas_add
One-way ANOVA. Null hypothesis: all group means are equal. Alternative hypothesis: not all group means are equal. alpha = 0.05.
car_group <- rep(c("W","X","Y","Z"), each=6)
mileage <- c(gas_add$W, gas_add$X, gas_add$Y, gas_add$Z)
gas_add1 <- data.frame(car_group, mileage)
gas_add1
plot(mileage~car_group, data = gas_add1)
anova_model <- aov(mileage~car_group, data = gas_add1)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## car_group 3 213.88 71.29 127.2 3.39e-13 ***
## Residuals 20 11.21 0.56
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the p-value < 0.05, we reject H0: at least one group mean differs from the others.
Assumptions: normality, independence, equal variance
par(mfrow = c(2,2))
plot(anova_model)
attributes(anova_model)
## $names
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "contrasts" "xlevels" "call" "terms"
## [13] "model"
##
## $class
## [1] "aov" "lm"
Interval Plot
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
plotmeans(mileage~car_group, data = gas_add1, mean.labels = T, main = "Interval plot of W,X,Y and Z")
Tukey Simultaneous Tests for Differences of Means
tukey.test <- TukeyHSD(anova_model)
tukey.test
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = mileage ~ car_group, data = gas_add1)
##
## $car_group
## diff lwr upr p adj
## X-W -3.4000000 -4.6097284 -2.190272 0.0000009
## Y-W 3.5666667 2.3569382 4.776395 0.0000004
## Z-W 3.9500000 2.7402716 5.159728 0.0000001
## Y-X 6.9666667 5.7569382 8.176395 0.0000000
## Z-X 7.3500000 6.1402716 8.559728 0.0000000
## Z-Y 0.3833333 -0.8263951 1.593062 0.8116208
plot(tukey.test)
If an interval does not contain zero, the corresponding means are significantly different.
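The significant pairs can also be extracted programmatically; a sketch assuming tukey.test from above:
tuk <- as.data.frame(tukey.test$car_group)
tuk[tuk$`p adj` < 0.05, ] # every pair except Z-Y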
Alternative - Grouping Information Using the Tukey Method and 95% Confidence
library(agricolae)
tukey.test2 <- HSD.test(anova_model, trt = "car_group")
tukey.test2
## $statistics
## MSerror Df Mean CV MSD
## 0.5604167 20 32.39583 2.310821 1.209728
##
## $parameters
## test name.t ntr StudentizedRange alpha
## Tukey car_group 4 3.958293 0.05
##
## $means
## mileage std r Min Max Q25 Q50 Q75
## W 31.36667 0.8824209 6 30.1 32.6 30.900 31.35 31.875
## X 27.96667 0.5501515 6 27.4 28.7 27.525 27.85 28.400
## Y 34.93333 0.8869423 6 33.9 36.1 34.200 34.95 35.550
## Z 35.31667 0.6112828 6 34.5 36.2 34.975 35.25 35.675
##
## $comparison
## NULL
##
## $groups
## mileage groups
## Z 35.31667 a
## Y 34.93333 a
## W 31.36667 b
## X 27.96667 c
##
## attr(,"class")
## [1] "group"
Means that do not share a letter are significantly different. We recommend additives Y or Z.
Preservative Problem
Researchers from the Department of Fruit Crops at a university compared two different preservatives to be used in freezing strawberries. A random sample of 16 bags of strawberries was randomly divided into 2 groups. The bags in Group I served as the control group (current preservative), while those in Group II were assigned to a newly developed preservative. After all 16 bags of strawberries were prepared, they were stored at 0 °C for 6 months. At the end of this time, the contents of each bag were allowed to thaw and then rated on a scale of 1 to 10 points for discoloration (a low score means little discoloration).
scores<-c(3.70,3.92,3.04,3.71,3.41,3.22,3.50,3.79,1.75,2.06,1.90,2.57,2.20,1.50,
1.89,1.52)
preservative_group <- rep(c("I","II"), each = 8)
df <- data.frame(preservative_group, scores)
df
boxplot(scores~preservative_group)
As can be seen in the box plots, the center of Group I is higher than that of Group II, and their spreads are similar. In addition, the two box plots are completely separate and do not overlap, so the difference appears significant.
Let mu be the mean discoloration score, with indices 1 and 2 for Groups I and II, respectively. H0: mu1 = mu2 vs. Ha: mu1 > mu2.
test_ind <- t.test(scores ~ preservative_group, mu = 0, alternative = "greater", conf.level = 0.95, var.equal = TRUE)
test_ind
##
## Two Sample t-test
##
## data: scores by preservative_group
## t = 9.7902, df = 14, p-value = 6.07e-08
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 1.322401 Inf
## sample estimates:
## mean in group I mean in group II
## 3.53625 1.92375
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
#to see if population variance are equal
leveneTest(scores ~ as.factor(preservative_group))
Since the p-value > 0.05, we fail to reject H0: the population variances can be assumed equal.
Non-parametric test
Let m be the median discoloration score, with indices 1 and 2 for Groups I and II, respectively. H0: m1 = m2 vs. Ha: m1 ≠ m2.
wilcox.test(scores~preservative_group)
##
## Wilcoxon rank sum test
##
## data: scores by preservative_group
## W = 64, p-value = 0.0001554
## alternative hypothesis: true location shift is not equal to 0
Again, because of the small p-value, we conclude the two preservatives yield different distributions of discoloration scores.
What are the assumptions of your test in (b)? Assess these assumptions by graphs and tests. Be sure to include test statistics, p values and test conclusions.
#check normality of discolaration scales
two_groups <- split(scores,preservative_group)
two_groups
## $I
## [1] 3.70 3.92 3.04 3.71 3.41 3.22 3.50 3.79
##
## $II
## [1] 1.75 2.06 1.90 2.57 2.20 1.50 1.89 1.52
par(mfrow = c(1,2))
qqnorm(two_groups$I, main = "QQ plot for group I")
qqline(two_groups$I)
qqnorm(two_groups$II, main = "QQ plot for group II")
qqline(two_groups$II)
#Shapiro-Wilk Test for normality
shapiro.test(two_groups$I)
##
## Shapiro-Wilk normality test
##
## data: two_groups$I
## W = 0.95394, p-value = 0.7508
shapiro.test(two_groups$II)
##
## Shapiro-Wilk normality test
##
## data: two_groups$II
## W = 0.94875, p-value = 0.6986
As the p-values 0.7508 and 0.6986 are both large and the normal Q-Q plots both show a linear trend, it is reasonable to assume normality of the discoloration scores for both groups.
#Equal variance
var.test(scores~preservative_group)
##
## F test to compare two variances
##
## data: scores by preservative_group
## F = 0.71274, num df = 7, denom df = 7, p-value = 0.6662
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.1426924 3.5600492
## sample estimates:
## ratio of variances
## 0.7127355
As seen in the box plots in (a), the spread of the scores stays almost the same for the two groups. In addition, the p-value of the F test is 0.6662, very large. Therefore, we can assume equal variances of the discoloration scores for the two groups.
Combination and Permutation
n<- 10
r <- 2
#combination
choose(n,r)
## [1] 45
#permutation
choose(n,r) * factorial(r)
## [1] 90
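Equivalently, nPr = n!/(n-r)!; a quick check:
factorial(n)/factorial(n - r) # also 90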
Binomial Distribution
# Cumulative Binomial
n<-12 #size
p<-0.4 #prob of success
pbinom(5, 12, 0.4) # cumulative binomial dist, at most 5, P(X<=5)
## [1] 0.6652086
dbinom(7, 12, 0.4) # prob P(x=7)
## [1] 0.1009024
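Upper-tail probabilities follow from the complement; for example, P(X >= 6):
1 - pbinom(5, 12, 0.4) # P(X >= 6)
pbinom(5, 12, 0.4, lower.tail = FALSE) # same value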
Exponential Distribution
pexp(31, 1/44) # cumulative, P(X <= 31); rate lambda = 1/44, theta (mean) = 44
## [1] 0.5056668
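The upper tail and quantiles work the same way; a quick sketch with the same rate:
pexp(31, 1/44, lower.tail = FALSE) # P(X > 31) = 1 - 0.5057
qexp(0.5, 1/44) # median, 44*log(2), about 30.5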
Two Way ANOVA
ob_prob1 <- read.csv("ob_prob1.csv")
ob_prob1
machine <- rep(c("A","B","C","D", "E"), each= 16 )
machine
## [1] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "B"
## [20] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "C" "C" "C" "C" "C" "C"
## [39] "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "D" "D" "D" "D" "D" "D" "D" "D" "D"
## [58] "D" "D" "D" "D" "D" "D" "D" "E" "E" "E" "E" "E" "E" "E" "E" "E" "E" "E" "E"
## [77] "E" "E" "E" "E"
machine_head <- rep(c("1","2","3","4"), each = 4) # length 16; recycled to 80 in data.frame()
machine_head
## [1] "1" "1" "1" "1" "2" "2" "2" "2" "3" "3" "3" "3" "4" "4" "4" "4"
stress_data <- c(6,13,2,7,2,3,10,4,1,9,5,7,8,8,6,9,10,2,4,0,9,1,1,3,7,1,7,4,12,10,9,1,1,10,8,7,1,11,5,2,5,6,0,5,5,7,7,4,11,5,1,3,1,10,8,8,6,8,9,6,4,3,4,5,2,6,3,3,4,6,7,1,7,1,4,2,9,3,1,2)
df1 <- data.frame(machine_head,machine, stress_data)
df1
#two-way anova
mod_anova <- aov(stress_data~machine_head+machine, data= df1)
summary(mod_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## machine_head 3 11.7 3.883 0.361 0.781
## machine 4 53.6 13.394 1.245 0.300
## Residuals 72 774.7 10.760
Null hypothesis: tau1 = tau2 = ... = 0 (no machine effect). HA: at least one tau is not 0. F = 1.245 with p-value = 0.300 > 0.05, so we fail to reject H0: there does not appear to be a difference among the 5 machines.
For the machine heads: H0: σβ² = 0 vs. Ha: σβ² > 0. With p-value = 0.781 > 0.05, we fail to reject H0.
SAT Data
sat_data <-read.delim("sat_data.txt") #read text file
head(sat_data)
plot(sat_data$expend,sat_data$sat)
#exploratory data analysis
library(ggplot2)
sat_data%>% ggplot(aes(x=expend,y=sat))+geom_point() + geom_text(aes(label=state)) + geom_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'
summary(sat_data$sat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 844.0 897.2 945.5 965.9 1032.0 1107.0
summary(sat_data$expend)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.656 4.882 5.768 5.905 6.434 9.774
cor( sat_data$expend,sat_data$sat)
## [1] -0.380537
cor.test(sat_data$sat, sat_data$expend)
##
## Pearson's product-moment correlation
##
## data: sat_data$sat and sat_data$expend
## t = -2.8509, df = 48, p-value = 0.006408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5957789 -0.1142957
## sample estimates:
## cor
## -0.380537
From the scatter plot there appears to be a negative relationship between expenditure and SAT scores. This is also confirmed by the correlation coefficient of -0.38 (p-value = 0.006). That is, the states that score highest on the SAT are the states that, on average, spend less money per student. For example, high spending states such as New Jersey, New York, Alaska and Connecticut, all post aggregate scores well below the national average combined score of 966.
linear_model <- lm(sat~expend, data = sat_data)
summary(linear_model)
##
## Call:
## lm(formula = sat ~ expend, data = sat_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -145.074 -46.821 4.087 40.034 128.489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1089.294 44.390 24.539 < 2e-16 ***
## expend -20.892 7.328 -2.851 0.00641 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.91 on 48 degrees of freedom
## Multiple R-squared: 0.1448, Adjusted R-squared: 0.127
## F-statistic: 8.128 on 1 and 48 DF, p-value: 0.006408
#mlr
linear_model2 <- lm(sat~expend + ratio + salary + frac, data=sat_data)
summary(linear_model2)
##
## Call:
## lm(formula = sat ~ expend + ratio + salary + frac, data = sat_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.531 -20.855 -1.746 15.979 66.571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1045.9715 52.8698 19.784 < 2e-16 ***
## expend 4.4626 10.5465 0.423 0.674
## ratio -3.6242 3.2154 -1.127 0.266
## salary 1.6379 2.3872 0.686 0.496
## frac -2.9045 0.2313 -12.559 2.61e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.7 on 45 degrees of freedom
## Multiple R-squared: 0.8246, Adjusted R-squared: 0.809
## F-statistic: 52.88 on 4 and 45 DF, p-value: < 2.2e-16
Selection method with the AIC
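The selection itself can be run with base R's step(), which drops terms to minimize the AIC; a minimal sketch, assuming linear_model2 above as the full model (step output not shown). The reduced model is then fit directly below.
sat.step <- step(linear_model2, direction = "backward") # prints each AIC step
summary(sat.step)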
sat.fit <-lm(sat~expend+frac, data = sat_data)
summary(sat.fit)
##
## Call:
## lm(formula = sat ~ expend + frac, data = sat_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88.400 -22.884 1.968 19.142 68.755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 993.8317 21.8332 45.519 < 2e-16 ***
## expend 12.2865 4.2243 2.909 0.00553 **
## frac -2.8509 0.2151 -13.253 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.46 on 47 degrees of freedom
## Multiple R-squared: 0.8195, Adjusted R-squared: 0.8118
## F-statistic: 106.7 on 2 and 47 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(sat.fit)
From the residuals-vs-fitted plot, we can assume linearity and equal variances. The Q-Q plot looks okay too, so we can assume normality. The residuals-vs-leverage plot shows no influential outliers in the data set.
Contingency Table with a Matrix
cases <- matrix(c(8,22,14,21,42,12,71,90,19),ncol = 3, byrow = TRUE)
rownames(cases) <- c("age less than or equal 21","between 22 and 26","greater than or equal 27")
colnames(cases) <- c("0 accidents","1 or 2", "3 or more")
cases<-as.table(cases)
cases
## 0 accidents 1 or 2 3 or more
## age less than or equal 21 8 22 14
## between 22 and 26 21 42 12
## greater than or equal 27 71 90 19
margin.table(cases) # totals
## [1] 299
margin.table(cases, 1) # row totals
## age less than or equal 21 between 22 and 26 greater than or equal 27
## 44 75 180
margin.table(cases,2) #column totals
## 0 accidents 1 or 2 3 or more
## 100 154 45
round(prop.table(cases),2) # table proportion
## 0 accidents 1 or 2 3 or more
## age less than or equal 21 0.03 0.07 0.05
## between 22 and 26 0.07 0.14 0.04
## greater than or equal 27 0.24 0.30 0.06
margin.table(prop.table(cases)) # total prop
## [1] 1
prop.table(cases,1) #row proportions
## 0 accidents 1 or 2 3 or more
## age less than or equal 21 0.1818182 0.5000000 0.3181818
## between 22 and 26 0.2800000 0.5600000 0.1600000
## greater than or equal 27 0.3944444 0.5000000 0.1055556
prop.table(cases,2) #column proportions
## 0 accidents 1 or 2 3 or more
## age less than or equal 21 0.0800000 0.1428571 0.3111111
## between 22 and 26 0.2100000 0.2727273 0.2666667
## greater than or equal 27 0.7100000 0.5844156 0.4222222
margin.table(prop.table(cases,2),2)
## 0 accidents 1 or 2 3 or more
## 1 1 1
#clusterd bargraph
barplot(cases, legend=T, beside = T, main = "Accidents")
#stacked bargraph
barplot(cases, legend=T, beside = F, main = "Accidents")
plot(cases, main = "Accidents")
#Expected counts
Expected <- as.array(margin.table(cases,1)) %*% t(as.array(margin.table(cases,2))) /
margin.table(cases)
Expected <- as.table(Expected)
Expected
## 0 accidents 1 or 2 3 or more
## age less than or equal 21 14.715719 22.662207 6.622074
## between 22 and 26 25.083612 38.628763 11.287625
## greater than or equal 27 60.200669 92.709030 27.090301
barplot(Expected, legend = T, beside = T, main = "Expected")
summary(cases)
## Number of cases in table: 299
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 16.741, df = 4, p-value = 0.00217
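The same test can be run directly on the table; a quick check (the statistic should match the summary above):
chisq.test(cases)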
Injury Data
injury_data <- read.csv("INJURY9.csv")
injury_data
plot(INJURY~CARCLASS, data = injury_data)
summary(injury_data)
## CARCLASS INJURY
## CLASS5 :23 Min. : 52.00
## CLASS1 :20 1st Qu.: 78.00
## CLASS4 :18 Median : 96.50
## CLASS2 :13 Mean : 99.65
## CLASS8 :13 3rd Qu.:118.25
## CLASS9 : 8 Max. :177.00
## (Other):17
anova_injury <- aov(INJURY~CARCLASS, data = injury_data)
summary(anova_injury)
## Df Sum Sq Mean Sq F value Pr(>F)
## CARCLASS 8 54762 6845 22.8 <2e-16 ***
## Residuals 103 30917 300
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the p-value < 0.05, we reject H0: at least one car-class mean differs from the others.
Assumptions: normality, independence, equal variance
par(mfrow = c(1,2))
plot(anova_injury, which = c(1,2))
attributes(anova_injury)
## $names
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "contrasts" "xlevels" "call" "terms"
## [13] "model"
##
## $class
## [1] "aov" "lm"
Design of Experiment
# observational study
data2<- rbind(c(191,304), c(300,641))
data2
## [,1] [,2]
## [1,] 191 304
## [2,] 300 641
fisher.test(data2, alternative = "greater")
##
## Fisher's Exact Test for Count Data
##
## data: data2
## p-value = 0.006593
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 1.101837 Inf
## sample estimates:
## odds ratio
## 1.342128
# qqPlot with a point-wise 95% confidence envelope; points within the envelope are consistent with normality
library(car)
qqPlot(data2)
## [1] 4 1
acids <- c(1.697,1.601,1.830,2.032,2.017,2.409,2.211,1.673,1.973,2.091,2.255,2.987)
R50 <- rep(c("no","yes","no","yes"), each = 3)
R21 <- rep(c("no", "no","yes","yes"), each = 3)
cheddar <- data.frame(R50, R21, acids)
cheddar
library(phia)
modelAB<-aov(acids~R50+R21+R50:R21, data = cheddar)
summary(modelAB)
## Df Sum Sq Mean Sq F value Pr(>F)
## R50 1 0.6561 0.6561 7.234 0.0275 *
## R21 1 0.2144 0.2144 2.364 0.1627
## R50:R21 1 0.0018 0.0018 0.020 0.8922
## Residuals 8 0.7257 0.0907
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The large p-value for the interaction (0.89) indicates there is likely no interaction effect in this model.
IM <- interactionMeans(modelAB)
plot(IM)
dummy.coef(modelAB)
## Full coefficients are
##
## (Intercept): 1.709333
## R50: no yes
## 0.0000000 0.4433333
## R21: no yes
## 0.000 0.243
## R50:R21: no:no yes:no no:yes yes:yes
## 0.00000000 0.00000000 0.00000000 0.04866667
#model without interaction
mod_2 <-aov(acids~R50+R21, data = cheddar)
summary(mod_2)
## Df Sum Sq Mean Sq F value Pr(>F)
## R50 1 0.6561 0.6561 8.118 0.0191 *
## R21 1 0.2144 0.2144 2.653 0.1378
## Residuals 9 0.7274 0.0808
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
R50 has the larger effect on cheese quality.
library(ACSWR)
data("battery")
battery$Material <- as.factor(battery$Material)
battery$Temperature <- as.factor(battery$Temperature)
battery
#ANOVA
mod_3 <- aov(Life ~ Material * Temperature, data = battery) # includes both main effects and the interaction
summary(mod_3)
## Df Sum Sq Mean Sq F value Pr(>F)
## Material 2 10684 5342 7.911 0.00198 **
## Temperature 2 39119 19559 28.968 1.91e-07 ***
## Material:Temperature 4 9614 2403 3.560 0.01861 *
## Residuals 27 18231 675
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the p-value for the interaction (0.019) is < 0.05, we reject H0 and conclude there is an interaction between material and temperature.
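With a significant interaction, material comparisons should be made within each temperature level; a sketch assuming the emmeans package loaded earlier (output not shown):
emmeans(mod_3, pairwise ~ Material | Temperature)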
#check model assumptions
par(mfrow= c(2,2))
plot(mod_3)
plot(mod_3, which = 2)
shapiro.test(mod_3$residuals)
##
## Shapiro-Wilk normality test
##
## data: mod_3$residuals
## W = 0.97606, p-value = 0.6117
#boxplot
boxplot(Life~Material*Temperature, data = battery)
library(car)
leveneTest(Life~Material*Temperature, data = battery)